Merge "Perform a permission check on the title when changing the page language"
[lhc/web/wiklou.git] / includes / libs / mime / XmlTypeCheck.php
1 <?php
2 /**
3 * XML syntax and type checker.
4 *
5 * Since 1.24.2, it uses XMLReader instead of xml_parse, which gives us
6 * more control over the expansion of XML entities. When passed to the
7 * callback, entities will be fully expanded, but the checker may report the XML as
8 * invalid if expanding the entities is likely to cause a DoS.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 * http://www.gnu.org/copyleft/gpl.html
24 *
25 * @file
26 */
27
class XmlTypeCheck {
	/**
	 * Will be set to true or false to indicate whether the file is
	 * well-formed XML. Note that this doesn't check schema validity.
	 * Stays null until a document has been checked.
	 * @var bool|null
	 */
	public $wellFormed = null;

	/**
	 * Will be set to true if the optional element filter (or one of the
	 * optional processing-instruction / DTD callbacks) returned a match
	 * at some point.
	 * @var bool
	 */
	public $filterMatch = false;

	/**
	 * Will contain the type of filter hit if the optional element filter returned
	 * a match at some point.
	 * @var mixed
	 */
	public $filterMatchType = false;

	/**
	 * Name of the document's root element, including any namespace
	 * as an expanded URL.
	 * @var string
	 */
	public $rootElement = '';

	/**
	 * A stack of strings containing the data of each xml element as it's processed. Append
	 * data to the top string of the stack, then pop off the string and process it when the
	 * element is closed.
	 * @var string[]
	 */
	protected $elementData = [];

	/**
	 * A stack of element names and attributes, as we process them.
	 * Each entry is a two-item array: [ expanded element name, attribute array ].
	 * @var array[]
	 */
	protected $elementDataContext = [];

	/**
	 * Current depth of the data stack.
	 * @var int
	 */
	protected $stackDepth = 0;

	/**
	 * Optional element filter passed to the constructor. Declared explicitly
	 * so that assigning it does not create a dynamic property.
	 * @var callable|null
	 */
	protected $filterCallback;

	/**
	 * Additional parsing options
	 * @var array
	 */
	private $parserOptions = [
		'processing_instruction_handler' => '',
		'external_dtd_handler' => '',
		'dtd_handler' => '',
		'require_safe_dtd' => true
	];

	/**
	 * Allow filtering an XML file.
	 *
	 * Filters should return either true or a string to indicate something
	 * is wrong with the file. $this->filterMatch will store if the
	 * file failed validation (true = failed validation).
	 * $this->filterMatchType will contain the validation error.
	 * $this->wellFormed will contain whether the xml file is well-formed.
	 *
	 * @note If multiple filters are hit, only one of them will have the
	 *  result stored in $this->filterMatchType.
	 *
	 * @param string $input a filename or string containing the XML element
	 * @param callable $filterCallback (optional)
	 *        Function to call to do additional custom validity checks from the
	 *        SAX element handler event. This gives you access to the element
	 *        namespace, name, attributes, and text contents.
	 *        Filter should return a truthy value describing the error.
	 * @param bool $isFile (optional) indicates if the first parameter is a
	 *        filename (default, true) or if it is a string (false)
	 * @param array $options list of additional parsing options:
	 *        processing_instruction_handler: Callback invoked with the target and
	 *            data of each processing instruction
	 *        external_dtd_handler: Callback for the url of external dtd subset
	 *        dtd_handler: Callback given the full text of the <!DOCTYPE declaration.
	 *        require_safe_dtd: Only allow non-recursive entities in internal dtd (default true)
	 */
	public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
		$this->filterCallback = $filterCallback;
		$this->parserOptions = array_merge( $this->parserOptions, $options );
		$this->validateFromInput( $input, $isFile );
	}

	/**
	 * Alternative constructor: from filename
	 *
	 * @param string $fname the filename of an XML document
	 * @param callable $filterCallback (optional)
	 *        Function to call to do additional custom validity checks from the
	 *        SAX element handler event. This gives you access to the element
	 *        namespace, name, attributes, and text contents.
	 *        Filter should return a truthy value to toggle on $this->filterMatch
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: from string
	 *
	 * @param string $string a string containing an XML element
	 * @param callable $filterCallback (optional)
	 *        Function to call to do additional custom validity checks from the
	 *        SAX element handler event. This gives you access to the element
	 *        namespace, name, attributes, and text contents.
	 *        Filter should return a truthy value to toggle on $this->filterMatch
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * Get the root element. Simple accessor to $rootElement
	 *
	 * @return string
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass over it.
	 *
	 * @param string $xml a filename, or the XML text itself
	 * @param bool $isFile true if $xml is a filename, false if it is XML text
	 * @throws Exception rethrown from validate() (e.g. from a user callback),
	 *  after the reader has been closed and the entity loader state restored
	 */
	private function validateFromInput( $xml, $isFile ) {
		$reader = new XMLReader();
		if ( $isFile ) {
			$s = $reader->open( $xml, null, LIBXML_NOERROR | LIBXML_NOWARNING );
		} else {
			$s = $reader->XML( $xml, null, LIBXML_NOERROR | LIBXML_NOWARNING );
		}
		if ( $s !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
		} else {
			// The entity loader is switched off only after the input has been
			// opened, and the previous setting is restored once we are done.
			$oldDisable = libxml_disable_entity_loader( true );
			$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
			try {
				$this->validate( $reader );
			} catch ( Exception $e ) {
				// Calling this malformed, because we didn't parse the whole
				// thing. Maybe just an external entity reference.
				$this->wellFormed = false;
				$reader->close();
				libxml_disable_entity_loader( $oldDisable );
				throw $e;
			}
			$reader->close();
			libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node, routing any PHP error raised while
	 * reading to XmlErrorHandler() so that it marks the document malformed.
	 *
	 * @param XMLReader $reader
	 * @return bool result of XMLReader::read()
	 */
	private function readNext( XMLReader $reader ) {
		set_error_handler( [ $this, 'XmlErrorHandler' ] );
		$ret = $reader->read();
		restore_error_handler();
		return $ret;
	}

	/**
	 * Error handler installed around XMLReader::read(); any error reported
	 * there means the document is not well-formed.
	 *
	 * @param int $errno
	 * @param string $errstr
	 */
	public function XmlErrorHandler( $errno, $errstr ) {
		$this->wellFormed = false;
	}

	/**
	 * Walk the whole document, dispatching each node to the matching handler
	 * and setting $this->wellFormed once the end has been reached.
	 *
	 * @param XMLReader $reader
	 */
	private function validate( $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->DTDHandler( $reader );
			}
		} while ( $reader->nodeType != XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Must be read before getAttributesArray(), which moves
					// the reader off the element node and onto its attributes.
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Some element was never closed
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing flagged a problem along the way
			$this->wellFormed = true;
		}
	}

	/**
	 * Get all of the attributes for an XMLReader's current node
	 *
	 * @param XMLReader $r
	 * @return array of attributes, keyed by namespace-expanded attribute name
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * @param string $name element or attribute name, maybe with a full or short prefix
	 * @param string $namespaceURI the namespaceURI
	 * @return string the name prefixed with namespaceURI
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			// Keep only the local part, i.e. whatever follows the last colon
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the context and data stacks.
	 *
	 * @param string $name namespace-expanded element name
	 * @param array $attribs attributes of the element, keyed by expanded name
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost open element off the stacks and hand its name,
	 * attributes and collected text to the filter callback, if there is one.
	 */
	private function elementClose() {
		list( $name, $attribs ) = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;
		$callbackReturn = false;

		if ( is_callable( $this->filterCallback ) ) {
			$callbackReturn = call_user_func(
				$this->filterCallback,
				$name,
				$attribs,
				$data
			);
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * @param string $data text to append (after trimming) to the innermost open element
	 */
	private function elementData( $data ) {
		// Collect any data here, and we'll run the callback in elementClose
		$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
	}

	/**
	 * Pass a processing instruction to the optional
	 * processing_instruction_handler callback and record a filter hit if
	 * it returns a truthy value.
	 *
	 * @param string $target
	 * @param string $data
	 */
	private function processingInstructionHandler( $target, $data ) {
		$callbackReturn = false;
		if ( $this->parserOptions['processing_instruction_handler'] ) {
			$callbackReturn = call_user_func(
				$this->parserOptions['processing_instruction_handler'],
				$target,
				$data
			);
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * Handle coming across a <!DOCTYPE declaration.
	 *
	 * Runs the optional dtd_handler and external_dtd_handler callbacks and,
	 * if require_safe_dtd is set, marks the document as not well-formed when
	 * the internal subset is not considered safe.
	 *
	 * @param XMLReader $reader Reader currently pointing at DOCTYPE node.
	 */
	private function DTDHandler( XMLReader $reader ) {
		$externalCallback = $this->parserOptions['external_dtd_handler'];
		$generalCallback = $this->parserOptions['dtd_handler'];
		$checkIfSafe = $this->parserOptions['require_safe_dtd'];
		if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
			// Nobody is interested in the doctype
			return;
		}
		$dtd = $reader->readOuterXML();
		$callbackReturn = false;

		if ( $generalCallback ) {
			$callbackReturn = call_user_func( $generalCallback, $dtd );
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
			$callbackReturn = false;
		}

		$parsedDTD = $this->parseDTD( $dtd );
		if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
			$callbackReturn = call_user_func(
				$externalCallback,
				$parsedDTD['type'],
				isset( $parsedDTD['publicid'] ) ? $parsedDTD['publicid'] : null,
				isset( $parsedDTD['systemid'] ) ? $parsedDTD['systemid'] : null
			);
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
			$callbackReturn = false;
		}

		if ( $checkIfSafe && isset( $parsedDTD['internal'] ) ) {
			if ( !$this->checkDTDIsSafe( $parsedDTD['internal'] ) ) {
				$this->wellFormed = false;
			}
		}
	}

	/**
	 * Check if the internal subset of the DTD is safe.
	 *
	 * We whitelist an extremely restricted subset of DTD features.
	 *
	 * Safe is defined as:
	 *  * Only contains entity definitions (e.g. No <!ATTLIST )
	 *  * Entity definitions are not "system" entities
	 *  * Entity definitions are not "parameter" (i.e. %) entities
	 *  * Entity definitions do not reference other entities except &amp;
	 *    and quotes. Entity aliases (where the entity contains only
	 *    another entity are allowed)
	 *  * Entity references aren't overly long (>255 bytes).
	 *  * <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">
	 *    allowed if matched exactly for compatibility with graphviz
	 *  * Comments.
	 *
	 * @param string $internalSubset The internal subset of the DTD
	 * @return bool true if safe.
	 */
	private function checkDTDIsSafe( $internalSubset ) {
		$res = preg_match(
			'/^(?:\s*<!ENTITY\s+\S+\s+' .
				'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
				'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
				'|\s*<!--(?:[^-]|-[^-])*-->' .
				'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
				// Dots are escaped so the namespace URL really is matched exactly
				'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
			$internalSubset
		);

		return (bool)$res;
	}

	/**
	 * Parse DTD into parts.
	 *
	 * If there is an error parsing the dtd, sets wellFormed to false.
	 *
	 * @param string $dtd
	 * @return array Possibly containing keys publicid, systemid, type and internal.
	 */
	private function parseDTD( $dtd ) {
		$m = [];
		$res = preg_match(
			'/^<!DOCTYPE\s*\S+\s*' .
			'(?:(?P<typepublic>PUBLIC)\s*' .
				'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
				// The system identifier alternatives must be grouped; otherwise
				// the apostrophe-quoted form becomes a top-level alternative and
				// PUBLIC "..." '...' never matches.
				'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
			'|(?P<typesystem>SYSTEM)\s*' .
				'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
			')?\s*' .
			'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
			$dtd,
			$m
		);
		if ( !$res ) {
			$this->wellFormed = false;
			return [];
		}
		$parsed = [];
		foreach ( $m as $field => $value ) {
			// Skip groups that did not participate in the match, and the
			// numeric duplicates preg_match adds for every named group.
			if ( $value === '' || is_numeric( $field ) ) {
				continue;
			}
			switch ( $field ) {
				case 'typepublic':
				case 'typesystem':
					$parsed['type'] = $value;
					break;
				case 'pubquote':
				case 'pubapos':
					$parsed['publicid'] = $value;
					break;
				case 'pubsysquote':
				case 'pubsysapos':
				case 'sysquote':
				case 'sysapos':
					$parsed['systemid'] = $value;
					break;
				case 'internal':
					$parsed['internal'] = $value;
					break;
			}
		}
		return $parsed;
	}
}