3 * XML syntax and type checker.
5 * Since 1.24.2, it uses XMLReader instead of xml_parse, which gives us
6 * more control over the expansion of XML entities. When passed to the
7 * callback, entities will be fully expanded, but may report the XML is
8 * invalid if expanding the entities are likely to cause a DoS.
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 * http://www.gnu.org/copyleft/gpl.html
/**
 * @var bool|null Will be set to true or false to indicate whether the file is
 * well-formed XML; stays null until validation has reached a verdict.
 * Note that this doesn't check schema validity.
 */
public $wellFormed = null;

/**
 * @var bool Will be set to true if the optional element filter returned
 * a match at some point.
 */
public $filterMatch = false;

/**
 * Will contain the type of filter hit if the optional element filter returned
 * a match at some point.
 * @var mixed False until a filter matches, then that filter's (truthy) return value.
 */
public $filterMatchType = false;

/**
 * @var string Name of the document's root element, including any namespace
 * (the namespace prefix is replaced by the namespace URI).
 */
public $rootElement = '';

/**
 * @var string[] A stack of strings containing the data of each xml element as it's processed.
 * Append data to the top string of the stack, then pop off the string and process it when the
 * element is closed.
 */
protected $elementData = [];

/**
 * @var array A stack of [ name, attributes ] pairs for the elements currently open.
 */
protected $elementDataContext = [];

/**
 * @var int Current depth of the data stack.
 */
protected $stackDepth = 0;

/** @var callable|null */
protected $filterCallback;
/**
 * @var array Additional parsing options. Every option that is read later
 * must have a default here, because caller-supplied options are merged
 * over this array in the constructor.
 */
private $parserOptions = [
	'processing_instruction_handler' => null,
	'external_dtd_handler' => '',
	// Read unconditionally when a DOCTYPE is handled, so it needs a default too.
	'dtd_handler' => '',
	'require_safe_dtd' => true
];
/**
 * Allow filtering an XML file.
 *
 * Filters should return either true or a string to indicate something
 * is wrong with the file. $this->filterMatch will store if the
 * file failed validation (true = failed validation).
 * $this->filterMatchType will contain the validation error.
 * $this->wellFormed will contain whether the xml file is well-formed.
 *
 * @note If multiple filters are hit, only one of them will have the
 *  result stored in $this->filterMatchType.
 *
 * @param string $input a filename or string containing the XML element
 * @param callable|null $filterCallback (optional)
 *        Function to call to do additional custom validity checks from the
 *        SAX element handler event. This gives you access to the element
 *        namespace, name, attributes, and text contents.
 *        Filter should return a truthy value describing the error.
 * @param bool $isFile (optional) indicates if the first parameter is a
 *        filename (default, true) or if it is a string (false)
 * @param array $options list of additional parsing options:
 *        processing_instruction_handler: Callback for xml_set_processing_instruction_handler
 *        external_dtd_handler: Callback for the url of external dtd subset
 *        dtd_handler: Callback given the full text of the <!DOCTYPE declaration.
 *        require_safe_dtd: Only allow non-recursive entities in internal dtd (default true)
 */
public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
	$this->filterCallback = $filterCallback;
	// Caller-supplied options override the defaults key by key.
	$this->parserOptions = array_merge( $this->parserOptions, $options );
	// Validation runs immediately; results are exposed through the public properties.
	$this->validateFromInput( $input, $isFile );
}
/**
 * Alternative constructor: from filename
 *
 * @param string $fname the filename of an XML document
 * @param callable|null $filterCallback (optional)
 *        Function to call to do additional custom validity checks from the
 *        SAX element handler event. This gives you access to the element
 *        namespace, name, attributes, and text contents.
 *        Filter should return a truthy value to toggle on $this->filterMatch
 * @return XmlTypeCheck
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	return new self( $fname, $filterCallback, true );
}
/**
 * Alternative constructor: from string
 *
 * @param string $string a string containing an XML element
 * @param callable|null $filterCallback (optional)
 *        Function to call to do additional custom validity checks from the
 *        SAX element handler event. This gives you access to the element
 *        namespace, name, attributes, and text contents.
 *        Filter should return a truthy value to toggle on $this->filterMatch
 * @return XmlTypeCheck
 */
public static function newFromString( $string, $filterCallback = null ) {
	return new self( $string, $filterCallback, false );
}
/**
 * Get the root element. Simple accessor to $rootElement
 *
 * @return string Empty string if no root element has been seen.
 */
public function getRootElement() {
	return $this->rootElement;
}
/**
 * Open the input with XMLReader and run the validation pass over it.
 *
 * @param string $xml A filename or a string of XML, depending on $isFile
 * @param bool $isFile True if $xml is a filename, false if it is the XML itself
 * @throws Exception Anything thrown while validating is rethrown after
 *  the document has been marked as not well-formed.
 */
private function validateFromInput( $xml, $isFile ) {
	$reader = new XMLReader();
	$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
	$opened = $isFile
		? $reader->open( $xml, null, $flags )
		: $reader->XML( $xml, null, $flags );

	if ( $opened !== true ) {
		// Couldn't open the XML
		$this->wellFormed = false;
		return;
	}

	$oldDisable = libxml_disable_entity_loader( true );
	try {
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		$this->validate( $reader );
	} catch ( Exception $e ) {
		// Calling this malformed, because we didn't parse the whole
		// thing. Maybe just an external entity reference.
		$this->wellFormed = false;
		throw $e;
	} finally {
		// Single cleanup path: runs on success, on Exception and on any
		// other Throwable, so the entity loader setting is always restored.
		$reader->close();
		libxml_disable_entity_loader( $oldDisable );
	}
}
/**
 * Advance the reader by one node while routing PHP errors raised by the
 * read to XmlErrorHandler().
 *
 * @param XMLReader $reader
 * @return bool Result of XMLReader::read(): false once there is nothing more to read.
 */
private function readNext( XMLReader $reader ) {
	set_error_handler( [ $this, 'XmlErrorHandler' ] );
	try {
		return $reader->read();
	} finally {
		// Always pop our handler again, even if read() throws.
		restore_error_handler();
	}
}
/**
 * Error handler installed around XMLReader::read() by readNext(): any PHP
 * error raised while reading marks the document as not well-formed.
 * Public only because it is registered as an array callable.
 *
 * @param int $errno Unused
 * @param string $errstr Unused
 */
public function XmlErrorHandler( $errno, $errstr ) {
	$this->wellFormed = false;
}
/**
 * Walk the document node by node and dispatch each node to its handler.
 *
 * Sets $this->wellFormed to false if the document ends before any element,
 * if a DOCTYPE shows up after the first element, or if elements are still
 * open at the end; sets it to true only if nothing else flagged a problem.
 *
 * @param XMLReader $reader Reader positioned before the first node
 */
private function validate( XMLReader $reader ) {
	// First, move through anything that isn't an element, and
	// handle any processing instructions with the callback
	do {
		if ( !$this->readNext( $reader ) ) {
			// Hit the end of the document before any elements
			$this->wellFormed = false;
			return;
		}
		if ( $reader->nodeType === XMLReader::PI ) {
			$this->processingInstructionHandler( $reader->name, $reader->value );
		}
		if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
			$this->DTDHandler( $reader );
		}
	} while ( $reader->nodeType !== XMLReader::ELEMENT );

	// Process the rest of the document
	do {
		switch ( $reader->nodeType ) {
			case XMLReader::ELEMENT:
				$name = $this->expandNS(
					$reader->name,
					$reader->namespaceURI
				);
				if ( $this->rootElement === '' ) {
					$this->rootElement = $name;
				}
				// Read this before walking the attributes, which moves the cursor.
				$empty = $reader->isEmptyElement;
				$attrs = $this->getAttributesArray( $reader );
				$this->elementOpen( $name, $attrs );
				if ( $empty ) {
					// Self-closing elements get no END_ELEMENT node.
					$this->elementClose();
				}
				break;

			case XMLReader::END_ELEMENT:
				$this->elementClose();
				break;

			case XMLReader::WHITESPACE:
			case XMLReader::SIGNIFICANT_WHITESPACE:
			case XMLReader::CDATA:
			case XMLReader::TEXT:
				$this->elementData( $reader->value );
				break;

			case XMLReader::ENTITY_REF:
				// Unexpanded entity (maybe external?),
				// don't send to the filter (xml_parse didn't)
				break;

			case XMLReader::COMMENT:
				// Don't send to the filter (xml_parse didn't)
				break;

			case XMLReader::PI:
				// Processing instructions can happen after the header too
				$this->processingInstructionHandler(
					$reader->name,
					$reader->value
				);
				break;

			case XMLReader::DOC_TYPE:
				// We should never see a doctype after first
				// element.
				$this->wellFormed = false;
				break;

			default:
				// One of DOC, ENTITY, END_ENTITY,
				// NOTATION, or XML_DECLARATION
				// xml_parse didn't send these to the filter, so we won't.
		}
	} while ( $this->readNext( $reader ) );

	if ( $this->stackDepth !== 0 ) {
		// Some element was opened but never closed.
		$this->wellFormed = false;
	} elseif ( $this->wellFormed === null ) {
		// Nothing flagged a problem along the way.
		$this->wellFormed = true;
	}
}
/**
 * Get all of the attributes for an XMLReader's current node
 *
 * Leaves the reader positioned on the last attribute it visited.
 *
 * @param XMLReader $r
 * @return array of attributes, keyed by namespace-expanded attribute name
 */
private function getAttributesArray( XMLReader $r ) {
	$attrs = [];
	while ( $r->moveToNextAttribute() ) {
		if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
			// XMLReader treats xmlns attributes as normal
			// attributes, while xml_parse doesn't
			continue;
		}
		$name = $this->expandNS( $r->name, $r->namespaceURI );
		$attrs[$name] = $r->value;
	}
	return $attrs;
}
/**
 * Replace a name's namespace prefix (if any) with the full namespace URI.
 *
 * @param string $name element or attribute name, maybe with a full or short prefix
 * @param string $namespaceURI
 * @return string the name prefixed with namespaceURI, or $name unchanged
 *  when $namespaceURI is empty
 */
private function expandNS( $name, $namespaceURI ) {
	if ( $namespaceURI ) {
		// Keep only what follows the last colon, i.e. drop any prefix.
		$parts = explode( ':', $name );
		$localname = array_pop( $parts );
		return "$namespaceURI:$localname";
	}
	return $name;
}
/**
 * Record the start of an element: push its context and an empty text
 * buffer, and deepen the stack.
 *
 * @param string $name Namespace-expanded element name
 * @param array $attribs Attributes of the element
 */
private function elementOpen( $name, $attribs ) {
	$this->elementDataContext[] = [ $name, $attribs ];
	$this->elementData[] = '';
	// elementData() addresses the top buffer as index stackDepth - 1.
	$this->stackDepth++;
}
/**
 * Record the end of an element: pop its context and collected text, and
 * run the element filter (if any) on them.
 *
 * A truthy filter result sets $this->filterMatch and is stored in
 * $this->filterMatchType.
 */
private function elementClose() {
	list( $name, $attribs ) = array_pop( $this->elementDataContext );
	$data = array_pop( $this->elementData );
	$this->stackDepth--;
	$callbackReturn = false;

	if ( is_callable( $this->filterCallback ) ) {
		$callbackReturn = call_user_func(
			$this->filterCallback,
			$name,
			$attribs,
			$data
		);
	}
	if ( $callbackReturn ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $callbackReturn;
	}
}
/**
 * Append character data to the text buffer of the innermost open element.
 * Each chunk is trimmed before it is appended.
 *
 * @param string $data
 */
private function elementData( $data ) {
	// Collect any data here, and we'll run the callback in elementClose
	$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
}
/**
 * Pass a processing instruction to the 'processing_instruction_handler'
 * option, if one was supplied. A truthy result counts as a filter hit.
 *
 * @param string $target
 * @param string $data
 */
private function processingInstructionHandler( $target, $data ) {
	$callbackReturn = false;
	if ( $this->parserOptions['processing_instruction_handler'] ) {
		$callbackReturn = call_user_func(
			$this->parserOptions['processing_instruction_handler'],
			$target,
			$data
		);
	}
	if ( $callbackReturn ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $callbackReturn;
	}
}
/**
 * Handle coming across a <!DOCTYPE declaration.
 *
 * Runs, in order: the general 'dtd_handler' callback on the raw DOCTYPE
 * text, the 'external_dtd_handler' callback when the DOCTYPE names an
 * external subset, and the internal-subset safety check when
 * 'require_safe_dtd' is on. A truthy callback result counts as a filter
 * hit; an unsafe internal subset marks the document as not well-formed.
 *
 * @param XMLReader $reader Reader currently pointing at DOCTYPE node.
 */
private function DTDHandler( XMLReader $reader ) {
	// Read defensively so a missing option behaves like "not set".
	$externalCallback = $this->parserOptions['external_dtd_handler'] ?? '';
	$generalCallback = $this->parserOptions['dtd_handler'] ?? '';
	$checkIfSafe = $this->parserOptions['require_safe_dtd'] ?? true;
	if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
		// Nobody is interested in the DTD.
		return;
	}
	$dtd = $reader->readOuterXml();
	$callbackReturn = false;

	if ( $generalCallback ) {
		$callbackReturn = call_user_func( $generalCallback, $dtd );
	}
	if ( $callbackReturn ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $callbackReturn;
		$callbackReturn = false;
	}

	$parsedDTD = $this->parseDTD( $dtd );
	if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
		$callbackReturn = call_user_func(
			$externalCallback,
			$parsedDTD['type'],
			$parsedDTD['publicid'] ?? null,
			$parsedDTD['systemid'] ?? null
		);
	}
	if ( $callbackReturn ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $callbackReturn;
		$callbackReturn = false;
	}

	if ( $checkIfSafe && isset( $parsedDTD['internal'] ) &&
		!$this->checkDTDIsSafe( $parsedDTD['internal'] )
	) {
		$this->wellFormed = false;
	}
}
/**
 * Check if the internal subset of the DTD is safe.
 *
 * We whitelist an extremely restricted subset of DTD features.
 *
 * Safe is defined as:
 *  * Only contains entity definitions (e.g. No <!ATTLIST )
 *  * Entity definitions are not "system" entities
 *  * Entity definitions are not "parameter" (i.e. %) entities
 *  * Entity definitions do not reference other entities except &amp;
 *    and quotes. Entity aliases (where the entity contains only
 *    another entity are allowed)
 *  * Entity references aren't overly long (>255 bytes).
 *  * <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">
 *    allowed if matched exactly for compatibility with graphviz
 *  * Comments.
 *
 * @param string $internalSubset The internal subset of the DTD
 * @return bool true if safe.
 */
private function checkDTDIsSafe( $internalSubset ) {
	$res = preg_match(
		'/^(?:\s*<!ENTITY\s+\S+\s+' .
			// Double-quoted value: a lone entity alias, or up to 255 units of
			// plain text where the only references allowed are the escaped
			// ampersand and the escaped quote character.
			'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
			// Same for a single-quoted value.
			'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
			'|\s*<!--(?:[^-]|-[^-])*-->' .
			'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
			// Dots are escaped so that only the exact URL matches.
			'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
		$internalSubset
	);

	// preg_match() gives 1 on match, 0 on no match and false on error;
	// only a positive match counts as safe.
	return (bool)$res;
}
/**
 * Parse DTD into parts.
 *
 * If there is an error parsing the dtd, sets wellFormed to false.
 *
 * @param string $dtd The full text of the <!DOCTYPE declaration
 * @return array Possibly containing keys publicid, systemid, type and internal.
 *  Empty if the declaration could not be parsed.
 */
private function parseDTD( $dtd ) {
	$m = [];
	$res = preg_match(
		'/^<!DOCTYPE\s*\S+\s*' .
		'(?:(?P<typepublic>PUBLIC)\s*' .
			'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
			// The two quoting styles must be grouped; otherwise the "|"
			// splits the whole external-ID group instead of just this part.
			'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
		'|(?P<typesystem>SYSTEM)\s*' .
			'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
		')?\s*' .
		'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
		$dtd,
		$m
	);
	if ( !$res ) {
		$this->wellFormed = false;
		return [];
	}

	// Which result key each named capture group feeds.
	$fieldToKey = [
		'typepublic' => 'type',
		'typesystem' => 'type',
		'pubquote' => 'publicid',
		'pubapos' => 'publicid',
		'pubsysquote' => 'systemid',
		'pubsysapos' => 'systemid',
		'sysquote' => 'systemid',
		'sysapos' => 'systemid',
		'internal' => 'internal',
	];

	$parsed = [];
	foreach ( $fieldToKey as $field => $key ) {
		// Groups that did not take part in the match are absent or empty.
		if ( isset( $m[$field] ) && $m[$field] !== '' ) {
			$parsed[$key] = $m[$field];
		}
	}
	return $parsed;
}