Spaces to tabs
[lhc/web/wiklou.git] / includes / MimeMagic.php
index fd42499..018f601 100644 (file)
@@ -1,45 +1,98 @@
 <?php
-/** Module defining helper functions for detecting and dealing with mime types.
+/**
+ * Module defining helper functions for detecting and dealing with mime types.
  *
+ * @file
  */
 
- /** Defines a set of well known mime types
+/**
+ * Defines a set of well known mime types
  * This is used as a fallback to mime.types files.
  * An extensive list of well known mime types is provided by
  * the file mime.types in the includes directory.
+ * 
+ * This list concatenated with mime.types is used to create a mime <-> ext
+ * map. Each line contains a mime type followed by a space separated list of
+ * extensions. If multiple extensions for a single mime type exist or if 
+ * multiple mime types exist for a single extension then in most cases
+ * MediaWiki assumes that the first extension following the mime type is the
+ * canonical extension, and the first time a mime type appears for a certain
+ * extension is considered the canonical mime type.
+ * 
+ * (Note that appending $wgMimeTypeFile to the end of MM_WELL_KNOWN_MIME_TYPES
+ * sucks because you can't redefine canonical types. This could be fixed by 
+ * appending MM_WELL_KNOWN_MIME_TYPES behind $wgMimeTypeFile, but who knows
+ * what will break? In practice this probably isn't a problem anyway -- Bryan)
  */
 define('MM_WELL_KNOWN_MIME_TYPES',<<<END_STRING
-application/ogg ogg ogm
+application/ogg ogx ogg ogm ogv oga spx
 application/pdf pdf
+application/vnd.oasis.opendocument.chart odc
+application/vnd.oasis.opendocument.chart-template otc
+application/vnd.oasis.opendocument.formula odf
+application/vnd.oasis.opendocument.formula-template otf
+application/vnd.oasis.opendocument.graphics odg
+application/vnd.oasis.opendocument.graphics-template otg
+application/vnd.oasis.opendocument.image odi
+application/vnd.oasis.opendocument.image-template oti
+application/vnd.oasis.opendocument.presentation odp
+application/vnd.oasis.opendocument.presentation-template otp
+application/vnd.oasis.opendocument.spreadsheet ods
+application/vnd.oasis.opendocument.spreadsheet-template ots
+application/vnd.oasis.opendocument.text odt
+application/vnd.oasis.opendocument.text-template ott
+application/vnd.oasis.opendocument.text-master otm
+application/vnd.oasis.opendocument.text-web oth
 application/x-javascript js
 application/x-shockwave-flash swf
 audio/midi mid midi kar
 audio/mpeg mpga mpa mp2 mp3
 audio/x-aiff aif aiff aifc
 audio/x-wav wav
-audio/ogg ogg
+audio/ogg oga spx ogg
 image/x-bmp bmp
 image/gif gif
 image/jpeg jpeg jpg jpe
 image/png png
-image/svg+xml image/svg svg
+image/svg+xml svg 
+image/svg svg
 image/tiff tiff tif
-image/vnd.djvu image/x.djvu image/x-djvu djvu
+image/vnd.djvu djvu
+image/x.djvu djvu
+image/x-djvu djvu
 image/x-portable-pixmap ppm
+image/x-xcf xcf
 text/plain txt
 text/html html htm
-video/ogg ogm ogg
+video/ogg ogv ogm ogg
 video/mpeg mpg mpeg
 END_STRING
 );
 
- /** Defines a set of well known mime info entries
+/**
+ * Defines a set of well known mime info entries
  * This is used as a fallback to mime.info files.
  * An extensive list of well known mime types is provided by
  * the file mime.info in the includes directory.
  */
 define('MM_WELL_KNOWN_MIME_INFO', <<<END_STRING
 application/pdf [OFFICE]
+application/vnd.oasis.opendocument.chart [OFFICE]
+application/vnd.oasis.opendocument.chart-template [OFFICE]
+application/vnd.oasis.opendocument.formula [OFFICE]
+application/vnd.oasis.opendocument.formula-template [OFFICE]
+application/vnd.oasis.opendocument.graphics [OFFICE]
+application/vnd.oasis.opendocument.graphics-template [OFFICE]
+application/vnd.oasis.opendocument.image [OFFICE]
+application/vnd.oasis.opendocument.image-template [OFFICE]
+application/vnd.oasis.opendocument.presentation [OFFICE]
+application/vnd.oasis.opendocument.presentation-template [OFFICE]
+application/vnd.oasis.opendocument.spreadsheet [OFFICE]
+application/vnd.oasis.opendocument.spreadsheet-template [OFFICE]
+application/vnd.oasis.opendocument.text [OFFICE]
+application/vnd.oasis.opendocument.text-template [OFFICE]
+application/vnd.oasis.opendocument.text-master [OFFICE]
+application/vnd.oasis.opendocument.text-web [OFFICE]
 text/javascript application/x-javascript [EXECUTABLE]
 application/x-shockwave-flash [MULTIMEDIA]
 audio/midi [AUDIO]
@@ -47,13 +100,14 @@ audio/x-aiff [AUDIO]
 audio/x-wav [AUDIO]
 audio/mp3 audio/mpeg [AUDIO]
 application/ogg audio/ogg video/ogg [MULTIMEDIA]
-image/x-bmp image/bmp [BITMAP]
+image/x-bmp image/x-ms-bmp image/bmp [BITMAP]
 image/gif [BITMAP]
 image/jpeg [BITMAP]
 image/png [BITMAP]
 image/svg+xml [DRAWING]
 image/tiff [BITMAP]
 image/vnd.djvu [BITMAP]
+image/x-xcf [BITMAP]
 image/x-portable-pixmap [BITMAP]
 text/plain [TEXT]
 text/html [TEXT]
@@ -68,10 +122,10 @@ END_STRING
 global $wgLoadFileinfoExtension;
 
 if ($wgLoadFileinfoExtension) {
-       if(!extension_loaded('fileinfo')) dl('fileinfo.' . PHP_SHLIB_SUFFIX);
+       wfDl( 'fileinfo' );
 }
 
-/** 
+/**
  * Implements functions related to mime types such as detection and mapping to
  * file extension.
  *
@@ -84,19 +138,23 @@ class MimeMagic {
        * Mapping of media types to arrays of mime types.
        * This is used by findMediaType and getMediaType, respectively
        */
-       var $mMediaTypes= NULL;
+       var $mMediaTypes= null;
 
        /** Map of mime type aliases
        */
-       var $mMimeTypeAliases= NULL;
+       var $mMimeTypeAliases= null;
 
        /** map of mime types to file extensions (as a space seprarated list)
        */
-       var $mMimeToExt= NULL;
+       var $mMimeToExt= null;
 
        /** map of file extensions types to mime types (as a space seprarated list)
        */
-       var $mExtToMime= NULL;
+       var $mExtToMime= null;
+
+       /** IEContentAnalyzer instance
+        */
+       var $mIEAnalyzer;
 
        /** The singleton instance
         */
@@ -118,7 +176,7 @@ class MimeMagic {
                if ( $wgMimeTypeFile == 'includes/mime.types' ) {
                        $wgMimeTypeFile = "$IP/$wgMimeTypeFile";
                }
-               
+
                if ( $wgMimeTypeFile ) {
                        if ( is_file( $wgMimeTypeFile ) and is_readable( $wgMimeTypeFile ) ) {
                                wfDebug( __METHOD__.": loading mime types from $wgMimeTypeFile\n" );
@@ -290,7 +348,7 @@ class MimeMagic {
        */
        function guessTypesForExtension( $ext ) {
                $m = $this->getTypesForExtension( $ext );
-               if ( is_null( $m ) ) return NULL;
+               if ( is_null( $m ) ) return null;
 
                $m = trim( $m );
                $m = preg_replace( '/\s.*$/', '', $m );
@@ -307,7 +365,7 @@ class MimeMagic {
                $ext = $this->getExtensionsForType( $mime );
 
                if ( !$ext ) {
-                       return NULL;  //unknown
+                       return null;  //unknown
                }
 
                $ext = explode( ' ', $ext );
@@ -351,179 +409,394 @@ class MimeMagic {
         */
        function isRecognizableExtension( $extension ) {
                static $types = array(
+                       // Types recognized by getimagesize()
                        'gif', 'jpeg', 'jpg', 'png', 'swf', 'psd',
                        'bmp', 'tiff', 'tif', 'jpc', 'jp2',
                        'jpx', 'jb2', 'swc', 'iff', 'wbmp',
-                       'xbm', 'djvu'
+                       'xbm',
+
+                       // Formats we recognize magic numbers for
+                       'djvu', 'ogx', 'ogg', 'ogv', 'oga', 'spx',
+                       'mid', 'pdf', 'wmf', 'xcf', 'webm', 'mkv', 'mka',
+                       'webp',
+
+                       // XML formats we sure hope we recognize reliably
+                       'svg',
                );
                return in_array( strtolower( $extension ), $types );
        }
 
+       /** improves a mime type using the file extension. Some file formats are very generic,
+       * so their mime type is not very meaningful. A more useful mime type can be derived 
+       * by looking at the file extension. Typically, this method would be called on the 
+       * result of guessMimeType().
+       * 
+       * Currently, this method does the following:
+       *
+       * If $mime is "unknown/unknown" and isRecognizableExtension( $ext ) returns false,
+       * return the result of guessTypesForExtension($ext). 
+       *
+       * If $mime is "application/x-opc+zip" and isMatchingExtension( $ext, $mime )
+       * gives true, return the result of guessTypesForExtension($ext). 
+       *
+       * @param $mime String: the mime type, typically guessed from a file's content.
+       * @param $ext String: the file extension, as taken from the file name
+       *
+       * @return string the mime type
+       */
+       function improveTypeFromExtension( $mime, $ext ) {
+               if ( $mime === "unknown/unknown" ) {
+                       if( $this->isRecognizableExtension( $ext ) ) {
+                               wfDebug( __METHOD__. ": refusing to guess mime type for .$ext file, " .
+                                       "we should have recognized it\n" );
+                       } else {
+                               /* Not something we can detect, so simply 
+                               * trust the file extension */
+                               $mime = $this->guessTypesForExtension( $ext );
+                       }
+               }
+               else if ( $mime === "application/x-opc+zip" ) {
+                       if ( $this->isMatchingExtension( $ext, $mime ) ) {
+                               /* A known file extension for an OPC file,
+                               * find the proper mime type for that file extension */
+                               $mime = $this->guessTypesForExtension( $ext );
+                       } else {
+                               wfDebug( __METHOD__. ": refusing to guess better type for $mime file, " . 
+                                       ".$ext is not a known OPC extension.\n" );
+                               $mime = "application/zip";
+                       }
+               }
+
+               if ( isset( $this->mMimeTypeAliases[$mime] ) ) {
+                       $mime = $this->mMimeTypeAliases[$mime];
+               }
+
+               wfDebug(__METHOD__.": improved mime type for .$ext: $mime\n");
+               return $mime;
+       }
 
        /** mime type detection. This uses detectMimeType to detect the mime type of the file,
        * but applies additional checks to determine some well known file formats that may be missed
-       * or misinterpreter by the default mime detection (namely xml based formats like XHTML or SVG).
+       * or misinterpreted by the default mime detection (namely XML based formats like XHTML or SVG,
+       * as well as ZIP based formats like OPC/ODF files).
        *
-       * @param string $file The file to check
-       * @param mixed $ext The file extension, or true to extract it from the filename. 
-       *                   Set it to false to ignore the extension.
+       * @param $file String: the file to check
+       * @param $ext Mixed: the file extension, or true (default) to extract it from the filename.
+       *             Set it to false to ignore the extension. DEPRECATED! Set to false, use 
+       *             improveTypeFromExtension($mime, $ext) later to improve mime type.
        *
        * @return string the mime type of $file
        */
        function guessMimeType( $file, $ext = true ) {
-               $mime = $this->detectMimeType( $file, $ext );
+               if( $ext ) { # TODO: make $ext default to false. Or better, remove it.
+                       wfDebug( __METHOD__.": WARNING: use of the \$ext parameter is deprecated. " .
+                               "Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
+               }
+
+               $mime = $this->doGuessMimeType( $file, $ext );
+
+               if( !$mime ) {
+                       wfDebug( __METHOD__.": internal type detection failed for $file (.$ext)...\n" );
+                       $mime = $this->detectMimeType( $file, $ext );
+               }
 
+               if ( isset( $this->mMimeTypeAliases[$mime] ) ) {
+                       $mime = $this->mMimeTypeAliases[$mime];
+               }
+
+               wfDebug(__METHOD__.": guessed mime type of $file: $mime\n");
+               return $mime;
+       }
+
+       private function doGuessMimeType( $file, $ext ) { # TODO: remove $ext param
                // Read a chunk of the file
                wfSuppressWarnings();
                $f = fopen( $file, "rt" );
                wfRestoreWarnings();
                if( !$f ) return "unknown/unknown";
                $head = fread( $f, 1024 );
+               fseek( $f, -65558, SEEK_END );
+               $tail = fread( $f, 65558 ); // 65558 = maximum size of a zip EOCDR
                fclose( $f );
 
-               $sub4 =  substr( $head, 0, 4 );
-               if ( $sub4 == "\x01\x00\x09\x00" || $sub4 == "\xd7\xcd\xc6\x9a" ) {
-                       // WMF kill kill kill
-                       // Note that WMF may have a bare header, no magic number.
-                       // The former of the above two checks is theoretically prone to false positives
-                       $mime = "application/x-msmetafile";
-               }
-
-               if ( strpos( $mime, "text/" ) === 0 || $mime === "application/xml" ) {
-
-                       $xml_type = NULL;
-                       $script_type = NULL;
-
-                       /*
-                       * look for XML formats (XHTML and SVG)
-                       */
-                       if ($mime === "text/sgml" ||
-                           $mime === "text/plain" ||
-                           $mime === "text/html" ||
-                           $mime === "text/xml" ||
-                           $mime === "application/xml") {
-
-                               if ( substr( $head, 0, 5 ) == "<?xml" ) {
-                                       $xml_type = "ASCII";
-                               } elseif ( substr( $head, 0, 8 ) == "\xef\xbb\xbf<?xml") {
-                                       $xml_type = "UTF-8";
-                               } elseif ( substr( $head, 0, 10 ) == "\xfe\xff\x00<\x00?\x00x\x00m\x00l" ) {
-                                       $xml_type = "UTF-16BE";
-                               } elseif ( substr( $head, 0, 10 ) == "\xff\xfe<\x00?\x00x\x00m\x00l\x00") {
-                                       $xml_type = "UTF-16LE";
-                               }
-
-                               if ( $xml_type ) {
-                                       if ( $xml_type !== "UTF-8" && $xml_type !== "ASCII" ) {
-                                               $head = iconv( $xml_type, "ASCII//IGNORE", $head );
-                                       }
+               wfDebug( __METHOD__ . ": analyzing head and tail of $file for magic numbers.\n" );
 
-                                       $match = array();
-                                       $doctype = "";
-                                       $tag = "";
+               // Hardcode a few magic number checks...
+               $headers = array(
+                       // Multimedia...
+                       'MThd'             => 'audio/midi',
+                       'OggS'             => 'application/ogg',
 
-                                       if ( preg_match( '%<!DOCTYPE\s+[\w-]+\s+PUBLIC\s+["'."'".'"](.*?)["'."'".'"].*>%sim', 
-                                               $head, $match ) ) {
-                                                       $doctype = $match[1];
-                                               }
-                                       if ( preg_match( '%<(\w+).*>%sim', $head, $match ) ) {
-                                               $tag = $match[1];
-                                       }
+                       // Image formats...
+                       // Note that WMF may have a bare header, no magic number.
+                       "\x01\x00\x09\x00" => 'application/x-msmetafile', // Possibly prone to false positives?
+                       "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
+                       '%PDF'             => 'application/pdf',
+                       'gimp xcf'         => 'image/x-xcf',
+
+                       // Some forbidden fruit...
+                       'MZ'               => 'application/octet-stream', // DOS/Windows executable
+                       "\xca\xfe\xba\xbe" => 'application/octet-stream', // Mach-O binary
+                       "\x7fELF"          => 'application/octet-stream', // ELF binary
+               );
 
-                                       #print "<br>ANALYSING $file ($mime): doctype= $doctype; tag= $tag<br>";
+               foreach( $headers as $magic => $candidate ) {
+                       if( strncmp( $head, $magic, strlen( $magic ) ) == 0 ) {
+                               wfDebug( __METHOD__ . ": magic header in $file recognized as $candidate\n" );
+                               return $candidate;
+                       }
+               }
 
-                                       if ( strpos( $doctype, "-//W3C//DTD SVG" ) === 0 ) {
-                                               $mime = "image/svg+xml";
-                                       } elseif ( $tag === "svg" ) {
-                                               $mime = "image/svg+xml";
-                                       } elseif ( strpos( $doctype, "-//W3C//DTD XHTML" ) === 0 ) {
-                                               $mime = "text/html";
-                                       } elseif ( $tag === "html" ) {
-                                               $mime = "text/html";
-                                       }
+               /* Look for WebM and Matroska files */
+               if( strncmp( $head, pack( "C4", 0x1a, 0x45, 0xdf, 0xa3 ), 4 ) == 0 ) {
+                       $doctype = strpos( $head, "\x42\x82" );
+                       if( $doctype ) {
+                               // Next byte is datasize, then data (sizes larger than 1 byte are very stupid muxers)
+                               $data = substr($head, $doctype+3, 8);
+                               if( strncmp( $data, "matroska", 8 ) == 0 ) {
+                                       wfDebug( __METHOD__ . ": recognized file as video/x-matroska\n" );
+                                       return "video/x-matroska";
+                               } else if ( strncmp( $data, "webm", 4 ) == 0 ) {
+                                       wfDebug( __METHOD__ . ": recognized file as video/webm\n" );
+                                       return "video/webm";
                                }
                        }
+                       wfDebug( __METHOD__ . ": unknown EBML file\n" );
+                       return "unknown/unknown";
+               }
 
-                       /*
-                       * look for shell scripts
-                       */
-                       if ( !$xml_type ) {
-                               $script_type = NULL;
-
-                               # detect by shebang
-                               if ( substr( $head, 0, 2) == "#!" ) {
-                                       $script_type = "ASCII";
-                               } elseif ( substr( $head, 0, 5) == "\xef\xbb\xbf#!" ) {
-                                       $script_type = "UTF-8";
-                               } elseif ( substr( $head, 0, 7) == "\xfe\xff\x00#\x00!" ) {
-                                       $script_type = "UTF-16BE";
-                               } elseif ( substr( $head, 0, 7 ) == "\xff\xfe#\x00!" ) {
-                                       $script_type= "UTF-16LE";
-                               }
+               /* Look for WebP */
+               if( strncmp( $head, "RIFF", 4 ) == 0 && strncmp( substr( $head, 8, 8), "WEBPVP8 ", 8 ) == 0 ) {
+                       wfDebug( __METHOD__ . ": recognized file as image/webp\n" );
+                       return "image/webp";
+               }
 
-                               if ( $script_type ) {
-                                       if ( $script_type !== "UTF-8" && $script_type !== "ASCII") {
-                                               $head = iconv( $script_type, "ASCII//IGNORE", $head);
-                                       }
+               /*
+                * Look for PHP.  Check for this before HTML/XML...  Warning: this is a
+                * heuristic, and won't match a file with a lot of non-PHP before.  It
+                * will also match text files which could be PHP. :)
+                *
+                * FIXME: For this reason, the check is probably useless -- an attacker
+                * could almost certainly just pad the file with a lot of nonsense to
+                * circumvent the check in any case where it would be a security
+                * problem.  On the other hand, it causes harmful false positives (bug
+                * 16583).  The heuristic has been cut down to exclude three-character
+                * strings like "<? ", but should it be axed completely?
+                */
+               if( ( strpos( $head, '<?php' ) !== false ) ||
+
+                   ( strpos( $head, "<\x00?\x00p\x00h\x00p" ) !== false ) ||
+                   ( strpos( $head, "<\x00?\x00 " ) !== false ) ||
+                   ( strpos( $head, "<\x00?\x00\n" ) !== false ) ||
+                   ( strpos( $head, "<\x00?\x00\t" ) !== false ) ||
+                   ( strpos( $head, "<\x00?\x00=" ) !== false ) ) {
+
+                       wfDebug( __METHOD__ . ": recognized $file as application/x-php\n" );
+                       return "application/x-php";
+               }
+
+               /*
+                * look for XML formats (XHTML and SVG)
+                */
+               $xml = new XmlTypeCheck( $file );
+               if( $xml->wellFormed ) {
+                       global $wgXMLMimeTypes;
+                       if( isset( $wgXMLMimeTypes[$xml->getRootElement()] ) ) {
+                               return $wgXMLMimeTypes[$xml->getRootElement()];
+                       } else {
+                               return 'application/xml';
+                       }
+               }
 
-                                       $match = array();
+               /*
+                * look for shell scripts
+                */
+               $script_type = null;
+
+               # detect by shebang
+               if ( substr( $head, 0, 2) == "#!" ) {
+                       $script_type = "ASCII";
+               } elseif ( substr( $head, 0, 5) == "\xef\xbb\xbf#!" ) {
+                       $script_type = "UTF-8";
+               } elseif ( substr( $head, 0, 7) == "\xfe\xff\x00#\x00!" ) {
+                       $script_type = "UTF-16BE";
+               } elseif ( substr( $head, 0, 7 ) == "\xff\xfe#\x00!" ) {
+                       $script_type= "UTF-16LE";
+               }
 
-                                       if ( preg_match( '%/?([^\s]+/)(\w+)%', $head, $match ) ) {
-                                               $mime = "application/x-{$match[2]}";
+               if ( $script_type ) {
+                       if ( $script_type !== "UTF-8" && $script_type !== "ASCII") {
+                               // Quick and dirty fold down to ASCII!
+                               $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
+                               $chars = unpack( $pack[$script_type], substr( $head, 2 ) );
+                               $head = '';
+                               foreach( $chars as $codepoint ) {
+                                       if( $codepoint < 128 ) {
+                                               $head .= chr( $codepoint );
+                                       } else {
+                                               $head .= '?';
                                        }
                                }
                        }
 
-                       /*
-                       * look for PHP
-                       */
-                       if( !$xml_type && !$script_type ) {
+                       $match = array();
 
-                               if( ( strpos( $head, '<?php' ) !== false ) ||
-                                   ( strpos( $head, '<? ' ) !== false ) ||
-                                   ( strpos( $head, "<?\n" ) !== false ) ||
-                                   ( strpos( $head, "<?\t" ) !== false ) ||
-                                   ( strpos( $head, "<?=" ) !== false ) ||
+                       if ( preg_match( '%/?([^\s]+/)(\w+)%', $head, $match ) ) {
+                               $mime = "application/x-{$match[2]}";
+                               wfDebug( __METHOD__.": shell script recognized as $mime\n" );
+                               return $mime;
+                       }
+               }
 
-                                   ( strpos( $head, "<\x00?\x00p\x00h\x00p" ) !== false ) ||
-                                   ( strpos( $head, "<\x00?\x00 " ) !== false ) ||
-                                   ( strpos( $head, "<\x00?\x00\n" ) !== false ) ||
-                                   ( strpos( $head, "<\x00?\x00\t" ) !== false ) ||
-                                   ( strpos( $head, "<\x00?\x00=" ) !== false ) ) {
+               // Check for ZIP variants (before getimagesize)
+               if ( strpos( $tail, "PK\x05\x06" ) !== false ) {
+                       wfDebug( __METHOD__.": ZIP header present in $file\n" );
+                       return $this->detectZipType( $head, $tail, $ext );
+               }
 
-                                       $mime = "application/x-php";
-                               }
-                       }
+               wfSuppressWarnings();
+               $gis = getimagesize( $file );
+               wfRestoreWarnings();
 
+               if( $gis && isset( $gis['mime'] ) ) {
+                       $mime = $gis['mime'];
+                       wfDebug( __METHOD__.": getimagesize detected $file as $mime\n" );
+                       return $mime;
                }
 
-               if ( isset( $this->mMimeTypeAliases[$mime] ) ) {
-                       $mime = $this->mMimeTypeAliases[$mime];
+               // Also test DjVu
+               $deja = new DjVuImage( $file );
+               if( $deja->isValid() ) {
+                       wfDebug( __METHOD__.": detected $file as image/vnd.djvu\n" );
+                       return 'image/vnd.djvu';
                }
 
-               wfDebug(__METHOD__.": final mime type of $file: $mime\n");
+               return false;
+       }
+       
+       /**
+        * Detect application-specific file type of a given ZIP file from its
+        * header data.  Currently works for OpenDocument and OpenXML types...
+        * If it can't tell, returns 'application/zip'.
+        *
+        * @param $header String: some reasonably-sized chunk of file header
+        * @param $tail   String: the tail of the file
+        * @param $ext Mixed: the file extension, or true to extract it from the filename.
+        *             Set it to false (default) to ignore the extension. DEPRECATED! Set to false, 
+        *             use improveTypeFromExtension($mime, $ext) later to improve mime type.
+        *
+        * @return string
+        */
+       function detectZipType( $header, $tail = null, $ext = false ) {
+               if( $ext ) { # TODO: remove $ext param
+                       wfDebug( __METHOD__.": WARNING: use of the \$ext parameter is deprecated. " .
+                               "Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
+               }
+
+               $mime = 'application/zip';
+               $opendocTypes = array(
+                       'chart-template',
+                       'chart',
+                       'formula-template',
+                       'formula',
+                       'graphics-template',
+                       'graphics',
+                       'image-template',
+                       'image',
+                       'presentation-template',
+                       'presentation',
+                       'spreadsheet-template',
+                       'spreadsheet',
+                       'text-template',
+                       'text-master',
+                       'text-web',
+                       'text' );
+
+               // http://lists.oasis-open.org/archives/office/200505/msg00006.html
+               $types = '(?:' . implode( '|', $opendocTypes ) . ')';
+               $opendocRegex = "/^mimetype(application\/vnd\.oasis\.opendocument\.$types)/";
+
+               $openxmlRegex = "/^\[Content_Types\].xml/";
+
+               if( preg_match( $opendocRegex, substr( $header, 30 ), $matches ) ) {
+                       $mime = $matches[1];
+                       wfDebug( __METHOD__.": detected $mime from ZIP archive\n" );
+               } elseif( preg_match( $openxmlRegex, substr( $header, 30 ) ) ) {
+                       $mime = "application/x-opc+zip";
+                       # TODO: remove the block below, as soon as improveTypeFromExtension is used everywhere 
+                       if( $ext !== true && $ext !== false ) { 
+                               /** This is the mode used by getPropsFromPath
+                               * These mime's are stored in the database, where we don't really want
+                               * x-opc+zip, because we use it only for internal purposes
+                               */
+                               if( $this->isMatchingExtension( $ext, $mime) ) {
+                                       /* A known file extension for an OPC file,
+                                       * find the proper mime type for that file extension */
+                                       $mime = $this->guessTypesForExtension( $ext );
+                               } else {
+                                       $mime = "application/zip";
+                               }
+                       }
+                       wfDebug( __METHOD__.": detected an Open Packaging Conventions archive: $mime\n" );
+               } else if( substr( $header, 0, 8 ) == "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" && 
+                               ($headerpos = strpos( $tail, "PK\x03\x04" ) ) !== false &&
+                               preg_match( $openxmlRegex, substr( $tail, $headerpos + 30 ) ) ) {
+                       if( substr( $header, 512, 4) == "\xEC\xA5\xC1\x00" ) {
+                               $mime = "application/msword";
+                       } 
+                       switch( substr( $header, 512, 6) ) {
+                               case "\xEC\xA5\xC1\x00\x0E\x00":
+                               case "\xEC\xA5\xC1\x00\x1C\x00":
+                               case "\xEC\xA5\xC1\x00\x43\x00":
+                                       $mime = "application/vnd.ms-powerpoint";
+                                       break;
+                               case "\xFD\xFF\xFF\xFF\x10\x00":
+                               case "\xFD\xFF\xFF\xFF\x1F\x00":
+                               case "\xFD\xFF\xFF\xFF\x22\x00":
+                               case "\xFD\xFF\xFF\xFF\x23\x00":
+                               case "\xFD\xFF\xFF\xFF\x28\x00":
+                               case "\xFD\xFF\xFF\xFF\x29\x00":
+                               case "\xFD\xFF\xFF\xFF\x10\x02":
+                               case "\xFD\xFF\xFF\xFF\x1F\x02":
+                               case "\xFD\xFF\xFF\xFF\x22\x02":
+                               case "\xFD\xFF\xFF\xFF\x23\x02":
+                               case "\xFD\xFF\xFF\xFF\x28\x02":
+                               case "\xFD\xFF\xFF\xFF\x29\x02":
+                                       $mime = "application/vnd.msexcel";
+                                       break;
+                       }
+
+                       wfDebug( __METHOD__.": detected a MS Office document with OPC trailer\n");
+               } else {
+                       wfDebug( __METHOD__.": unable to identify type of ZIP archive\n" );
+               }
                return $mime;
        }
 
        /** Internal mime type detection, please use guessMimeType() for application code instead.
        * Detection is done using an external program, if $wgMimeDetectorCommand is set.
        * Otherwise, the fileinfo extension and mime_content_type are tried (in this order), if they are available.
-       * If the dections fails and $ext is not false, the mime type is guessed from the file extension, using 
+       * If the detection fails and $ext is not false, the mime type is guessed from the file extension, using
        * guessTypesForExtension.
        * If the mime type is still unknown, getimagesize is used to detect the mime type if the file is an image.
        * If no mime type can be determined, this function returns "unknown/unknown".
        *
-       * @param string $file The file to check
-       * @param mixed $ext The file extension, or true to extract it from the filename. 
-       *                   Set it to false to ignore the extension.
+       * @param $file String: the file to check
+       * @param $ext Mixed: the file extension, or true (default) to extract it from the filename.
+       *             Set it to false to ignore the extension. DEPRECATED! Set to false, use 
+       *             improveTypeFromExtension($mime, $ext) later to improve mime type.
        *
        * @return string the mime type of $file
        * @access private
        */
-       function detectMimeType( $file, $ext = true ) {
+       private function detectMimeType( $file, $ext = true ) {
                global $wgMimeDetectorCommand;
 
-               $m = NULL;
+               if( $ext ) { # TODO:  make $ext default to false. Or better, remove it.
+                       wfDebug( __METHOD__.": WARNING: use of the \$ext parameter is deprecated. Use improveTypeFromExtension(\$mime, \$ext) instead.\n" );
+               }
+
+               $m = null;
                if ( $wgMimeDetectorCommand ) {
                        $fn = wfEscapeShellArg( $file );
                        $m = `$wgMimeDetectorCommand $fn`;
@@ -559,15 +832,6 @@ class MimeMagic {
                        # see http://www.php.net/manual/en/ref.mime-magic.php for details.
 
                        $m = mime_content_type($file);
-
-                       if ( $m == 'text/plain' ) {
-                               // mime_content_type sometimes considers DJVU files to be text/plain.
-                               $deja = new DjVuImage( $file );
-                               if( $deja->isValid() ) {
-                                       wfDebug( __METHOD__.": (re)detected $file as image/vnd.djvu\n" );
-                                       $m = 'image/vnd.djvu';
-                               }
-                       }
                } else {
                        wfDebug( __METHOD__.": no magic mime detector found!\n" );
                }
@@ -579,73 +843,27 @@ class MimeMagic {
                        $m = strtolower( $m );
 
                        if ( strpos( $m, 'unknown' ) !== false ) {
-                               $m = NULL;
+                               $m = null;
                        } else {
                                wfDebug( __METHOD__.": magic mime type of $file: $m\n" );
                                return $m;
                        }
                }
 
-               # if still not known, use getimagesize to find out the type of image
-               # TODO: skip things that do not have a well-known image extension? Would that be safe?
-               wfSuppressWarnings();
-               $gis = getimagesize( $file );
-               wfRestoreWarnings();
-
-               $notAnImage = false;
-
-               if ( $gis && is_array($gis) && $gis[2] ) {
-                       
-                       switch ( $gis[2] ) {
-                               case IMAGETYPE_GIF: $m = "image/gif"; break;
-                               case IMAGETYPE_JPEG: $m = "image/jpeg"; break;
-                               case IMAGETYPE_PNG: $m = "image/png"; break;
-                               case IMAGETYPE_SWF: $m = "application/x-shockwave-flash"; break;
-                               case IMAGETYPE_PSD: $m = "application/photoshop"; break;
-                               case IMAGETYPE_BMP: $m = "image/bmp"; break;
-                               case IMAGETYPE_TIFF_II: $m = "image/tiff"; break;
-                               case IMAGETYPE_TIFF_MM: $m = "image/tiff"; break;
-                               case IMAGETYPE_JPC: $m = "image"; break;
-                               case IMAGETYPE_JP2: $m = "image/jpeg2000"; break;
-                               case IMAGETYPE_JPX: $m = "image/jpeg2000"; break;
-                               case IMAGETYPE_JB2: $m = "image"; break;
-                               case IMAGETYPE_SWC: $m = "application/x-shockwave-flash"; break;
-                               case IMAGETYPE_IFF: $m = "image/vnd.xiff"; break;
-                               case IMAGETYPE_WBMP: $m = "image/vnd.wap.wbmp"; break;
-                               case IMAGETYPE_XBM: $m = "image/x-xbitmap"; break;
-                       }
-
-                       if ( $m ) {
-                               wfDebug( __METHOD__.": image mime type of $file: $m\n" );
-                               return $m;
-                       }
-                       else {
-                               $notAnImage = true;
-                       }
-               } else {
-                       // Also test DjVu
-                       $deja = new DjVuImage( $file );
-                       if( $deja->isValid() ) {
-                               wfDebug( __METHOD__.": detected $file as image/vnd.djvu\n" );
-                               return 'image/vnd.djvu';
-                       }
-               }
-
                # if desired, look at extension as a fallback.
                if ( $ext === true ) {
                        $i = strrpos( $file, '.' );
                        $ext = strtolower( $i ? substr( $file, $i + 1 ) : '' );
                }
                if ( $ext ) {
-                       $m = $this->guessTypesForExtension( $ext );
-
-                       # TODO: if $notAnImage is set, do not trust the file extension if
-                       # the results is one of the image types that should have been recognized
-                       # by getimagesize
-
-                       if ( $m ) {
-                               wfDebug( __METHOD__.": extension mime type of $file: $m\n" );
-                               return $m;
+                       if( $this->isRecognizableExtension( $ext ) ) {
+                               wfDebug( __METHOD__. ": refusing to guess mime type for .$ext file, we should have recognized it\n" );
+                       } else {
+                               $m = $this->guessTypesForExtension( $ext );
+                               if ( $m ) {
+                                       wfDebug( __METHOD__.": extension mime type of $file: $m\n" );
+                                       return $m;
+                               }
                        }
                }
 
@@ -664,13 +882,13 @@ class MimeMagic {
        * @todo analyse file if need be
        * @todo look at multiple extension, separately and together.
        *
-       * @param string $path full path to the image file, in case we have to look at the contents
+       * @param $path String: full path to the image file, in case we have to look at the contents
        *        (if null, only the mime type is used to determine the media type code).
-       * @param string $mime mime type. If null it will be guessed using guessMimeType.
+       * @param $mime String: mime type. If null it will be guessed using guessMimeType.
        *
        * @return (int?string?) a value to be used with the MEDIATYPE_xxx constants.
        */
-       function getMediaType( $path = NULL, $mime = NULL ) {
+       function getMediaType( $path = null, $mime = null ) {
                if( !$mime && !$path ) return MEDIATYPE_UNKNOWN;
 
                # If mime type is unknown, guess it
@@ -703,7 +921,6 @@ class MimeMagic {
                }
 
                # Check for entry for file extension
-               $e = NULL;
                if ( $path ) {
                        $i = strrpos( $path, '.' );
                        $e = strtolower( $i ? substr( $path, $i + 1 ) : '' );
@@ -742,7 +959,7 @@ class MimeMagic {
                        if ( !$m ) return MEDIATYPE_UNKNOWN;
 
                        $m = explode( ' ', $m );
-               } else { 
+               } else {
                        # Normalize mime type
                        if ( isset( $this->mMimeTypeAliases[$extMime] ) ) {
                                $extMime = $this->mMimeTypeAliases[$extMime];
@@ -761,6 +978,27 @@ class MimeMagic {
 
                return MEDIATYPE_UNKNOWN;
        }
-}
 
+       /**
+        * Get the MIME types that various versions of Internet Explorer would
+        * detect from a chunk of the content. Delegates to getRealMimesFromData()
+        * on the analyzer from getIEContentAnalyzer() and returns its result as is.
+        * @param $fileName String: the file name (unused at present)
+        * @param $chunk String: the first 256 bytes of the file
+        * @param $proposed String: the MIME type proposed by the server
+        */
+       public function getIEMimeTypes( $fileName, $chunk, $proposed ) {
+               $ca = $this->getIEContentAnalyzer();
+               return $ca->getRealMimesFromData( $fileName, $chunk, $proposed );
+       }
 
+       /**
+        * Get the IEContentAnalyzer held in $this->mIEAnalyzer, creating it on first use
+        */
+       protected function getIEContentAnalyzer() {
+               if ( is_null( $this->mIEAnalyzer ) ) {
+                       $this->mIEAnalyzer = new IEContentAnalyzer;
+               }
+               return $this->mIEAnalyzer;
+       }
+}