More file type checks...
author: Brion Vibber <brion@users.mediawiki.org>
Wed, 6 Feb 2008 01:23:12 +0000 (01:23 +0000)
committer: Brion Vibber <brion@users.mediawiki.org>
Wed, 6 Feb 2008 01:23:12 +0000 (01:23 +0000)
* Switch XML type detection/validity check from sniffing for XML processing instructions, doctypes, or subtags to just trying to parse the file and checking the root element's name and namespace. This lets us properly handle SVG files which specify a namespace but no doctype, as well as reject files that aren't well-formed. (See http://meta.wikimedia.org/wiki/SVG_validity_checks for some samples of bad files I encountered.) Non-XML files will abort parsing pretty quickly, so this shouldn't be a big burden on other types that didn't hit a magic check.
* Fix Unicode unix script checks (er.... is that even right? :D), remove the iconv dependency

includes/AutoLoader.php
includes/MimeMagic.php
includes/XmlTypeCheck.php [new file with mode: 0644]

index 34c6704..2e2083b 100644 (file)
@@ -271,6 +271,7 @@ function __autoload($className) {
                'WikiErrorMsg' => 'includes/WikiError.php',
                'WikiXmlError' => 'includes/WikiError.php',
                'Xml' => 'includes/Xml.php',
+               'XmlTypeCheck' => 'includes/XmlTypeCheck.php',
                'ZhClient' => 'includes/ZhClient.php',
                'memcached' => 'includes/memcached-client.php',
                'EmaillingJob' => 'includes/JobQueue.php',
index 77a3062..75cfb6e 100644 (file)
@@ -455,71 +455,20 @@ class MimeMagic {
                /*
                 * look for XML formats (XHTML and SVG)
                 */
-               $xml_type = NULL;
-               if ( substr( $head, 0, 5 ) == "<?xml" ) {
-                       $xml_type = "ASCII";
-               } elseif ( substr( $head, 0, 8 ) == "\xef\xbb\xbf<?xml") {
-                       $xml_type = "UTF-8";
-               } elseif ( substr( $head, 0, 12 ) == "\xfe\xff\x00<\x00?\x00x\x00m\x00l" ) {
-                       $xml_type = "UTF-16BE";
-               } elseif ( substr( $head, 0, 12 ) == "\xff\xfe<\x00?\x00x\x00m\x00l\x00") {
-                       $xml_type = "UTF-16LE";
-               } else {
-                       /*
-                       echo "WARNING: Undetected xml_type ...\n";
-                       for( $i = 0; $i < 10; $i++ ) {
-                               $c = ord( $head{$i} );
-                               if( $c < 32 || $c > 126 ) {
-                                       printf( "\\x%02x", $c );
-                               } else {
-                                       print $head{$i};
-                               }
-                       }
-                       echo "\n";
-                       */
-               }
-
-               if( $xml_type == 'UTF-16BE' || $xml_type == 'UTF-16LE' ) {
-                       // Quick and dirty fold down to ASCII!
-                       $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
-                       $chars = unpack( $pack[$xml_type], substr( $head, 2 ) );
-                       $head = '';
-                       foreach( $chars as $codepoint ) {
-                               if( $codepoint < 128 ) {
-                                       $head .= chr( $codepoint );
-                               } else {
-                                       $head .= '?';
-                               }
-                       }
-               }
-
-               $match = array();
-               $doctype = "";
-               $tag = "";
-
-               if ( preg_match( '%<!DOCTYPE\s+[\w-]+\s+PUBLIC\s+["'."'".'"](.*?)["'."'".'"].*>%siD', 
-                       $head, $match ) ) {
-                               $doctype = $match[1];
-                       }
-               
-               if( $xml_type || $doctype ) {
-                       if ( preg_match( '%<(\w+)\b%si', $head, $match ) ) {
-                               $tag = $match[1];
-                       }
-
-                       #print "<br>ANALYSING $file: doctype= $doctype; tag= $tag<br>";
-
-                       if ( strpos( $doctype, "-//W3C//DTD SVG" ) === 0 ) {
-                               return "image/svg+xml";
-                       } elseif ( $tag === "svg" ) {
-                               return "image/svg+xml";
-                       } elseif ( strpos( $doctype, "-//W3C//DTD XHTML" ) === 0 ) {
-                               return "text/html";
-                       } elseif ( $tag === "html" ) {
-                               return "text/html";
+               $xml = new XmlTypeCheck( $file );
+               if( $xml->wellFormed ) {
+                       $types = array(
+                               'http://www.w3.org/2000/svg:svg'    => 'image/svg+xml',
+                               'svg'                               => 'image/svg+xml',
+                               'http://www.w3.org/1999/xhtml:html' => 'text/html', // application/xhtml+xml?
+                               'html'                              => 'text/html', // application/xhtml+xml?
+                       );
+                       if( isset( $types[$xml->rootElement] ) ) {
+                               $mime = $types[$xml->rootElement];
+                               return $mime;
                        } else {
                                /// Fixme -- this would be the place to allow additional XML type checks
-                               return "application/xml";
+                               return 'application/xml';
                        }
                }
 
@@ -541,7 +490,17 @@ class MimeMagic {
 
                if ( $script_type ) {
                        if ( $script_type !== "UTF-8" && $script_type !== "ASCII") {
-                               $head = iconv( $script_type, "ASCII//IGNORE", $head);
+                               // Quick and dirty fold down to ASCII!
+                               $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
+                               $chars = unpack( $pack[$script_type], substr( $head, 2 ) );
+                               $head = '';
+                               foreach( $chars as $codepoint ) {
+                                       if( $codepoint < 128 ) {
+                                               $head .= chr( $codepoint );
+                                       } else {
+                                               $head .= '?';
+                                       }
+                               }
                        }
 
                        $match = array();
diff --git a/includes/XmlTypeCheck.php b/includes/XmlTypeCheck.php
new file mode 100644 (file)
index 0000000..639d1f8
--- /dev/null
@@ -0,0 +1,93 @@
+<?php
+
+class XmlTypeCheck {
+       /**
+        * Will be set to true or false to indicate whether the file is
+        * well-formed XML. Note that this doesn't check schema validity.
+        */
+       public $wellFormed = false;
+
+       /**
+        * Name of the document's root element, including any namespace
+        * as an expanded URL.
+        */
+       public $rootElement = '';
+
+       private $softNamespaces; // bool, see __construct()
+       private $namespaces = array(); // prefix => URI, only filled in soft mode
+
+       /**
+        * @param $file string filename
+        * @param $softNamespaces bool
+        *        If set to true, use of undeclared XML namespaces will be ignored.
+        *        This matches the behavior of rsvg, but more compliant consumers
+        *        such as Firefox will reject such files.
+        *        Leave off for the default, stricter checks.
+        */
+       function __construct( $file, $softNamespaces=false ) {
+               $this->softNamespaces = $softNamespaces;
+               $this->run( $file );
+       }
+
+       /**
+        * Parse the file in chunks and set $wellFormed if all of it parses.
+        * An unreadable file or a parse error leaves $wellFormed as false.
+        */
+       private function run( $fname ) {
+               // Bail out early on an unreadable file instead of handing a
+               // failed handle to fread()/feof() in the loop below.
+               $file = fopen( $fname, "rb" );
+               if( !$file ) {
+                       return;
+               }
+
+               // Soft mode resolves namespaces itself, in elementOpen()
+               $parser = $this->softNamespaces
+                       ? xml_parser_create( 'UTF-8' )
+                       : xml_parser_create_ns( 'UTF-8' );
+
+               // case folding violates XML standard, turn it off
+               xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
+               xml_set_element_handler( $parser, array( $this, 'elementOpen' ), false );
+
+               do {
+                       $chunk = fread( $file, 32768 );
+                       // A failed read or a zero parse result means: not well-formed
+                       $ok = ( $chunk !== false ) && xml_parse( $parser, $chunk, feof( $file ) );
+               } while( $ok && !feof( $file ) );
+
+               $this->wellFormed = $ok;
+               fclose( $file );
+               xml_parser_free( $parser );
+       }
+
+       /**
+        * Start-tag handler registered by run(): stores the root element's name
+        * (with its namespace URI where known), then unregisters itself.
+        */
+       private function elementOpen( $parser, $name, $attribs ) {
+               if( $this->softNamespaces ) {
+                       // Check namespaces manually, so expat doesn't throw
+                       // errors on use of undeclared namespaces.
+                       foreach( $attribs as $attrib => $val ) {
+                               if( $attrib === 'xmlns' ) {
+                                       $this->namespaces[''] = $val;
+                               } elseif( strncmp( $attrib, 'xmlns:', 6 ) === 0 ) {
+                                       $this->namespaces[substr( $attrib, 6 )] = $val;
+                               }
+                       }
+
+                       $parts = explode( ':', $name, 2 );
+                       $ns = isset( $parts[1] ) ? $parts[0] : '';
+                       // $ns is the prefix ('' if none), end( $parts ) the local name. An
+                       // undeclared prefix is invalid per XML Namespaces; soft mode lets it slide.
+                       if( isset( $this->namespaces[$ns] ) ) {
+                               $name = $this->namespaces[$ns] . ':' . end( $parts );
+                       }
+               }
+
+               // We only need the first open element
+               $this->rootElement = $name;
+               xml_set_element_handler( $parser, false, false );
+       }
+}