Better detection for old MS Office files
authorTim Starling <tstarling@wikimedia.org>
Tue, 15 Jan 2019 03:15:18 +0000 (14:15 +1100)
committerTim Starling <tstarling@wikimedia.org>
Tue, 22 Jan 2019 22:23:17 +0000 (09:23 +1100)
* Introduce MSCompoundFileReader, which reads the CFB directory and
  detects the file type from well-known names in the root directory
* Do not detect a ZIP file if the EOCDR is not at the end. Other
  containers, especially CFB files, may contain ZIP files embedded
  within them in the last 64KB, but this is not a security concern
  unless the EOCDR is exactly at the end of the file.

Bug: T40432
Change-Id: Id5b1a258ccf3c3c8951e32f6b7a5b1bafe941082

18 files changed:
autoload.php
includes/libs/mime/MSCompoundFileReader.php [new file with mode: 0644]
includes/libs/mime/MimeAnalyzer.php
includes/utils/ZipDirectoryReader.php
tests/phpunit/data/MSCompoundFileReader/calc.xls [new file with mode: 0644]
tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls [new file with mode: 0644]
tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls [new file with mode: 0755]
tests/phpunit/data/MSCompoundFileReader/fat-loop.xls [new file with mode: 0644]
tests/phpunit/data/MSCompoundFileReader/gnumeric.xls [new file with mode: 0644]
tests/phpunit/data/MSCompoundFileReader/impress.ppt [new file with mode: 0644]
tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls [new file with mode: 0644]
tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt [new file with mode: 0755]
tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc [new file with mode: 0755]
tests/phpunit/data/MSCompoundFileReader/writer.doc [new file with mode: 0644]
tests/phpunit/data/media/zip-in-doc.doc [new file with mode: 0644]
tests/phpunit/includes/libs/mime/MSCompoundFileReaderTest.php [new file with mode: 0644]
tests/phpunit/includes/libs/mime/MimeAnalyzerTest.php
tests/phpunit/includes/utils/ZipDirectoryReaderTest.php

index 91be2e7..4481cd7 100644 (file)
@@ -816,6 +816,7 @@ $wgAutoloadLocalClasses = [
        'LonelyPagesPage' => __DIR__ . '/includes/specials/SpecialLonelypages.php',
        'LongPagesPage' => __DIR__ . '/includes/specials/SpecialLongpages.php',
        'MIMEsearchPage' => __DIR__ . '/includes/specials/SpecialMIMEsearch.php',
+       'MSCompoundFileReader' => __DIR__ . '/includes/libs/mime/MSCompoundFileReader.php',
        'MWCallableUpdate' => __DIR__ . '/includes/deferred/MWCallableUpdate.php',
        'MWContentSerializationException' => __DIR__ . '/includes/exception/MWContentSerializationException.php',
        'MWCryptHKDF' => __DIR__ . '/includes/utils/MWCryptHKDF.php',
diff --git a/includes/libs/mime/MSCompoundFileReader.php b/includes/libs/mime/MSCompoundFileReader.php
new file mode 100644 (file)
index 0000000..51407a7
--- /dev/null
@@ -0,0 +1,355 @@
+<?php
+/*
+ * Copyright 2019 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ * OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+// strlen() is actually pretty fast compared to just about any loop body
+// phpcs:disable Generic.CodeAnalysis.ForLoopWithTestFunctionCall.NotAllowed
+
+/**
+ * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
+ * file, and detect the MIME type.
+ *
+ * References:
+ *  - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
+ *  - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
+ *  - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
+ *  - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
+ *  - Python olefile https://github.com/decalage2/olefile
+ *  - OpenOffice.org's Documentation of the Microsoft Compound Document
+ *    File Format https://www.openoffice.org/sc/compdocfileformat.pdf
+ *
+ * @since 1.33
+ */
+class MSCompoundFileReader {
+       private $file;
+       private $header;
+       private $mime;
+       private $mimeFromClsid;
+       private $error;
+       private $errorCode;
+       private $valid = false;
+
+       private $sectorLength;
+       private $difat;
+       private $fat = [];
+       private $fileLength;
+
+       const TYPE_UNALLOCATED = 0;
+       const TYPE_STORAGE = 1;
+       const TYPE_STREAM = 2;
+       const TYPE_ROOT = 5;
+
+       const ERROR_FILE_OPEN = 1;
+       const ERROR_SEEK = 2;
+       const ERROR_READ = 3;
+       const ERROR_INVALID_SIGNATURE = 4;
+       const ERROR_READ_PAST_END = 5;
+       const ERROR_INVALID_FORMAT = 6;
+
+       private static $mimesByClsid = [
+               // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
+               '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
+               '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
+               '00020906-0000-0000-C000-000000000046' => 'application/msword',
+               '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
+       ];
+
+       /**
+        * Read a file by name
+        *
+        * @param string $fileName The full path to the file
+        * @return array An associative array of information about the file:
+        *    - valid: true if the file is valid, false otherwise
+        *    - error: An error message in English, should be present if valid=false
+        *    - errorCode: One of the self::ERROR_* constants
+        *    - mime: The MIME type detected from the directory contents
+        *    - mimeFromClsid: The MIME type detected from the CLSID on the root
+        *      directory entry
+        */
+       public static function readFile( $fileName ) {
+               $handle = fopen( $fileName, 'r' );
+               if ( $handle === false ) {
+                       return [
+                               'valid' => false,
+                               'error' => 'file does not exist',
+                               'errorCode' => self::ERROR_FILE_OPEN
+                       ];
+               }
+               return self::readHandle( $handle );
+       }
+
+       /**
+        * Read from an open seekable handle
+        *
+        * @param resource $fileHandle The file handle
+        * @return array An associative array of information about the file:
+        *    - valid: true if the file is valid, false otherwise
+        *    - error: An error message in English, should be present if valid=false
+        *    - errorCode: One of the self::ERROR_* constants
+        *    - mime: The MIME type detected from the directory contents
+        *    - mimeFromClsid: The MIME type detected from the CLSID on the root
+        *      directory entry
+        */
+       public static function readHandle( $fileHandle ) {
+               $reader = new self( $fileHandle );
+               $info = [
+                       'valid' => $reader->valid,
+                       'mime' => $reader->mime,
+                       'mimeFromClsid' => $reader->mimeFromClsid
+               ];
+               if ( $reader->error ) {
+                       $info['error'] = $reader->error;
+                       $info['errorCode'] = $reader->errorCode;
+               }
+               return $info;
+       }
+
+       private function __construct( $fileHandle ) {
+               $this->file = $fileHandle;
+               try {
+                       $this->init();
+               } catch ( RuntimeException $e ) {
+                       $this->valid = false;
+                       $this->error = $e->getMessage();
+                       $this->errorCode = $e->getCode();
+               }
+       }
+
+       private function init() {
+               $this->header = $this->unpackOffset( 0, [
+                       'header_signature' => 8,
+                       'header_clsid' => 16,
+                       'minor_version' => 2,
+                       'major_version' => 2,
+                       'byte_order' => 2,
+                       'sector_shift' => 2,
+                       'mini_sector_shift' => 2,
+                       'reserved' => 6,
+                       'num_dir_sectors' => 4,
+                       'num_fat_sectors' => 4,
+                       'first_dir_sector' => 4,
+                       'transaction_signature_number' => 4,
+                       'mini_stream_cutoff_size' => 4,
+                       'first_mini_fat_sector' => 4,
+                       'num_mini_fat_sectors' => 4,
+                       'first_difat_sector' => 4,
+                       'num_difat_sectors' => 4,
+                       'difat' => 436,
+               ] );
+               if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
+                       $this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
+                               self::ERROR_INVALID_SIGNATURE );
+               }
+               $this->sectorLength = 1 << $this->header['sector_shift'];
+               $this->readDifat();
+               $this->readDirectory();
+
+               $this->valid = true;
+       }
+
+       private function sectorOffset( $sectorId ) {
+               return $this->sectorLength * ( $sectorId + 1 );
+       }
+
+       private function decodeClsid( $binaryClsid ) {
+               $parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
+               return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X",
+                       $parts['a'],
+                       $parts['b'],
+                       $parts['c'],
+                       $parts['d1'],
+                       $parts['d2'],
+                       $parts['d3'],
+                       $parts['d4'],
+                       $parts['d5'],
+                       $parts['d6'],
+                       $parts['d7'],
+                       $parts['d8']
+               );
+       }
+
+       private function unpackOffset( $offset, $struct ) {
+               $block = $this->readOffset( $offset, array_sum( $struct ) );
+               return $this->unpack( $block, 0, $struct );
+       }
+
+       private function unpackSector( $sectorNumber, $struct ) {
+               $offset = $this->sectorOffset( $sectorNumber );
+               return $this->unpackOffset( $offset, array_sum( $struct ) );
+       }
+
+       private function unpack( $block, $offset, $struct ) {
+               $data = [];
+               foreach ( $struct as $key => $length ) {
+                       if ( $length > 4 ) {
+                               $data[$key] = substr( $block, $offset, $length );
+                       } else {
+                               $data[$key] = $this->bin2dec( $block, $offset, $length );
+                       }
+                       $offset += $length;
+               }
+               return $data;
+       }
+
+       private function bin2dec( $str, $offset, $length ) {
+               $value = 0;
+               for ( $i = $length - 1; $i >= 0; $i-- ) {
+                       $value *= 256;
+                       $value += ord( $str[$offset + $i] );
+               }
+               return $value;
+       }
+
+       private function readOffset( $offset, $length ) {
+               $this->fseek( $offset );
+               Wikimedia\suppressWarnings();
+               $block = fread( $this->file, $length );
+               Wikimedia\restoreWarnings();
+               if ( $block === false ) {
+                       $this->error( 'error reading from file', self::ERROR_READ );
+               }
+               if ( strlen( $block ) !== $length ) {
+                       $this->error( 'unable to read the required number of bytes from the file',
+                               self::ERROR_READ_PAST_END );
+               }
+               return $block;
+       }
+
+       private function readSector( $sectorId ) {
+               return $this->readOffset( $this->sectorOffset( $sectorId ), 1 << $this->header['sector_shift'] );
+       }
+
+       private function error( $message, $code ) {
+               throw new RuntimeException( $message, $code );
+       }
+
+       private function fseek( $offset ) {
+               Wikimedia\suppressWarnings();
+               $result = fseek( $this->file, $offset );
+               Wikimedia\restoreWarnings();
+               if ( $result !== 0 ) {
+                       $this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
+               }
+       }
+
+       private function readDifat() {
+               $binaryDifat = $this->header['difat'];
+               $nextDifatSector = $this->header['first_difat_sector'];
+               for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
+                       $block = $this->readSector( $nextDifatSector );
+                       $binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
+                       $nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
+                       if ( $nextDifatSector == 0xFFFFFFFE ) {
+                               break;
+                       }
+               }
+
+               $this->difat = [];
+               for ( $pos = 0; $pos < strlen( $binaryDifat ); $pos += 4 ) {
+                       $fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
+                       if ( $fatSector < 0xFFFFFFFC ) {
+                               $this->difat[] = $fatSector;
+                       } else {
+                               break;
+                       }
+               }
+       }
+
+       private function getNextSectorIdFromFat( $sectorId ) {
+               $entriesPerSector = intdiv( $this->sectorLength, 4 );
+               $fatSectorId = intdiv( $sectorId, $entriesPerSector );
+               $fatSectorArray = $this->getFatSector( $fatSectorId );
+               return $fatSectorArray[$sectorId % $entriesPerSector];
+       }
+
+       private function getFatSector( $fatSectorId ) {
+               if ( !isset( $this->fat[$fatSectorId] ) ) {
+                       $fat = [];
+                       if ( !isset( $this->difat[$fatSectorId] ) ) {
+                               $this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
+                       }
+                       $absoluteSectorId = $this->difat[$fatSectorId];
+                       $block = $this->readSector( $absoluteSectorId );
+                       for ( $pos = 0; $pos < strlen( $block ); $pos += 4 ) {
+                               $fat[] = $this->bin2dec( $block, $pos, 4 );
+                       }
+                       $this->fat[$fatSectorId] = $fat;
+               }
+               return $this->fat[$fatSectorId];
+       }
+
+       private function readDirectory() {
+               $dirSectorId = $this->header['first_dir_sector'];
+               $binaryDir = '';
+               $seenSectorIds = [];
+               while ( $dirSectorId !== 0xFFFFFFFE ) {
+                       if ( isset( $seenSectorIds[$dirSectorId] ) ) {
+                               $this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
+                       }
+                       $seenSectorIds[$dirSectorId] = true;
+
+                       $binaryDir .= $this->readSector( $dirSectorId );
+                       $dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
+               }
+
+               $struct = [
+                       'name_raw' => 64,
+                       'name_length' => 2,
+                       'object_type' => 1,
+                       'color' => 1,
+                       'sid_left' => 4,
+                       'sid_right' => 4,
+                       'sid_child' => 4,
+                       'clsid' => 16,
+                       'state_bits' => 4,
+                       'create_time_low' => 4,
+                       'create_time_high' => 4,
+                       'modify_time_low' => 4,
+                       'modify_time_high' => 4,
+                       'first_sector' => 4,
+                       'size_low' => 4,
+                       'size_high' => 4,
+               ];
+               $entryLength = array_sum( $struct );
+
+               for ( $pos = 0; $pos < strlen( $binaryDir ); $pos += $entryLength ) {
+                       $entry = $this->unpack( $binaryDir, $pos, $struct );
+
+                       // According to [MS-CFB] size_high may contain garbage due to a
+                       // bug in a writer, it's best to pretend it is zero
+                       $entry['size_high'] = 0;
+
+                       $type = $entry['object_type'];
+                       if ( $type == self::TYPE_UNALLOCATED ) {
+                               continue;
+                       }
+
+                       $name = iconv( 'UTF-16', 'UTF-8', substr( $entry['name_raw'], 0, $entry['name_length'] - 2 ) );
+
+                       $clsid = $this->decodeClsid( $entry['clsid'] );
+                       if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) {
+                               $this->mimeFromClsid = self::$mimesByClsid[$clsid];
+                       }
+
+                       if ( $name === 'Workbook' ) {
+                               $this->mime = 'application/vnd.ms-excel';
+                       } elseif ( $name === 'WordDocument' ) {
+                               $this->mime = 'application/msword';
+                       } elseif ( $name === 'PowerPoint Document' ) {
+                               $this->mime = 'application/vnd.ms-powerpoint';
+                       }
+               }
+       }
+}
index b93084a..a2075dc 100644 (file)
@@ -653,7 +653,6 @@ EOT;
                                "Seeking $tailLength bytes from EOF failed in " . __METHOD__ );
                }
                $tail = $tailLength ? fread( $f, $tailLength ) : '';
-               fclose( $f );
 
                $this->logger->info( __METHOD__ .
                        ": analyzing head and tail of $file for magic numbers.\n" );
@@ -724,6 +723,12 @@ EOT;
                        return "image/webp";
                }
 
+               /* Look for MS Compound Binary (OLE) files */
+               if ( strncmp( $head, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8 ) == 0 ) {
+                       $this->logger->info( __METHOD__ . ': recognized MS CFB (OLE) file' );
+                       return $this->detectMicrosoftBinaryType( $f );
+               }
+
                /**
                 * Look for PHP.  Check for this before HTML/XML...  Warning: this is a
                 * heuristic, and won't match a file with a lot of non-PHP before.  It
@@ -797,9 +802,16 @@ EOT;
                }
 
                // Check for ZIP variants (before getimagesize)
-               if ( strpos( $tail, "PK\x05\x06" ) !== false ) {
-                       $this->logger->info( __METHOD__ . ": ZIP header present in $file\n" );
-                       return $this->detectZipType( $head, $tail, $ext );
+               $eocdrPos = strpos( $tail, "PK\x05\x06" );
+               if ( $eocdrPos !== false ) {
+                       $this->logger->info( __METHOD__ . ": ZIP signature present in $file\n" );
+                       // Check if it really is a ZIP file, make sure the EOCDR is at the end (T40432)
+                       $commentLength = unpack( "n", $tail, $eocdrPos + 20 )[0];
+                       if ( $eocdrPos + 22 + $commentLength !== strlen( $tail ) ) {
+                               $this->logger->info( __METHOD__ . ": ZIP EOCDR not at end. Not a ZIP file." );
+                       } else {
+                               return $this->detectZipType( $head, $tail, $ext );
+                       }
                }
 
                // Check for STL (3D) files
@@ -945,6 +957,26 @@ EOT;
                return $mime;
        }
 
+       /**
+        * Detect the type of a Microsoft Compound Binary a.k.a. OLE file.
+        * These are old style pre-ODF files such as .doc and .xls
+        *
+        * @param resource $handle An opened seekable file handle
+        * @return string The detected MIME type
+        */
+       function detectMicrosoftBinaryType( $handle ) {
+               $info = MSCompoundFileReader::readHandle( $handle );
+               if ( !$info['valid'] ) {
+                       $this->logger->info( __METHOD__ . ': invalid file format' );
+                       return 'unknown/unknown';
+               }
+               if ( !$info['mime'] ) {
+                       $this->logger->info( __METHOD__ . ": unrecognised document subtype" );
+                       return 'unknown/unknown';
+               }
+               return $info['mime'];
+       }
+
        /**
         * Internal MIME type detection. Detection is done using the fileinfo
         * extension if it is available. It can be overriden by callback, which could
index 46f1aaa..e7846f4 100644 (file)
@@ -228,7 +228,9 @@ class ZipDirectoryReader {
                $this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length'];
 
                if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) {
-                       $this->error( 'zip-bad', 'trailing bytes after the end of the file comment' );
+                       // T40432: MS binary documents frequently embed ZIP files
+                       $this->error( 'zip-wrong-format', 'there is a ZIP signature but it is not at ' .
+                               'the end of the file. It could be an OLE file with a ZIP file embedded.' );
                }
                if ( $this->eocdr['disk'] !== 0
                        || $this->eocdr['CD start disk'] !== 0
diff --git a/tests/phpunit/data/MSCompoundFileReader/calc.xls b/tests/phpunit/data/MSCompoundFileReader/calc.xls
new file mode 100644 (file)
index 0000000..f15e2c1
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/calc.xls differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls b/tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls
new file mode 100644 (file)
index 0000000..b2aebf4
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls b/tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls
new file mode 100755 (executable)
index 0000000..8895407
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/fat-loop.xls b/tests/phpunit/data/MSCompoundFileReader/fat-loop.xls
new file mode 100644 (file)
index 0000000..ef8e1ad
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/fat-loop.xls differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/gnumeric.xls b/tests/phpunit/data/MSCompoundFileReader/gnumeric.xls
new file mode 100644 (file)
index 0000000..4e4995d
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/gnumeric.xls differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/impress.ppt b/tests/phpunit/data/MSCompoundFileReader/impress.ppt
new file mode 100644 (file)
index 0000000..8e39c4b
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/impress.ppt differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls b/tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls
new file mode 100644 (file)
index 0000000..b8d9c2c
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt b/tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt
new file mode 100755 (executable)
index 0000000..1fcafbb
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc b/tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc
new file mode 100755 (executable)
index 0000000..edd25e9
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc differ
diff --git a/tests/phpunit/data/MSCompoundFileReader/writer.doc b/tests/phpunit/data/MSCompoundFileReader/writer.doc
new file mode 100644 (file)
index 0000000..b51df91
Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/writer.doc differ
diff --git a/tests/phpunit/data/media/zip-in-doc.doc b/tests/phpunit/data/media/zip-in-doc.doc
new file mode 100644 (file)
index 0000000..0b4c574
Binary files /dev/null and b/tests/phpunit/data/media/zip-in-doc.doc differ
diff --git a/tests/phpunit/includes/libs/mime/MSCompoundFileReaderTest.php b/tests/phpunit/includes/libs/mime/MSCompoundFileReaderTest.php
new file mode 100644 (file)
index 0000000..4509a61
--- /dev/null
@@ -0,0 +1,60 @@
+<?php
+/*
+ * Copyright 2019 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ * OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+/**
+ * @group Media
+ * @covers MSCompoundFileReader
+ */
+class MSCompoundFileReaderTest extends PHPUnit\Framework\TestCase {
+       public static function provideValid() {
+               return [
+                       [ 'calc.xls', 'application/vnd.ms-excel' ],
+                       [ 'excel2016-compat97.xls', 'application/vnd.ms-excel' ],
+                       [ 'gnumeric.xls', 'application/vnd.ms-excel' ],
+                       [ 'impress.ppt', 'application/vnd.ms-powerpoint' ],
+                       [ 'powerpoint2016-compat97.ppt', 'application/vnd.ms-powerpoint' ],
+                       [ 'word2016-compat97.doc', 'application/msword' ],
+                       [ 'writer.doc', 'application/msword' ],
+               ];
+       }
+
+       /** @dataProvider provideValid */
+       public function testReadFile( $fileName, $expectedMime ) {
+               global $IP;
+
+               $info = MSCompoundFileReader::readFile( "$IP/tests/phpunit/data/MSCompoundFileReader/$fileName" );
+               $this->assertTrue( $info['valid'] );
+               $this->assertSame( $expectedMime, $info['mime'] );
+       }
+
+       public static function provideInvalid() {
+               return [
+                       [ 'dir-beyond-end.xls', 'ERROR_READ_PAST_END' ],
+                       [ 'fat-loop.xls', 'ERROR_INVALID_FORMAT' ],
+                       [ 'invalid-signature.xls', 'ERROR_INVALID_SIGNATURE' ],
+               ];
+       }
+
+       /** @dataProvider provideInvalid */
+       public function testReadFileInvalid( $fileName, $expectedError ) {
+               global $IP;
+
+               $info = MSCompoundFileReader::readFile( "$IP/tests/phpunit/data/MSCompoundFileReader/$fileName" );
+               $this->assertFalse( $info['valid'] );
+               $this->assertSame( constant( MSCompoundFileReader::class . '::' . $expectedError ),
+                       $info['errorCode'] );
+       }
+}
index fbe5a2b..1947812 100644 (file)
@@ -128,4 +128,13 @@ class MimeAnalyzerTest extends PHPUnit\Framework\TestCase {
                $actualType = $this->doGuessMimeType( [ $file, 'mp3' ] );
                $this->assertEquals( 'audio/mpeg', $actualType );
        }
+
+       /**
+        * A ZIP file embedded in the middle of a .doc file is still a Word Document.
+        */
+       function testZipInDoc() {
+               $file = __DIR__ . '/../../../data/media/zip-in-doc.doc';
+               $actualType = $this->doGuessMimeType( [ $file, 'doc' ] );
+               $this->assertEquals( 'application/msword', $actualType );
+       }
 }
index 9f18e5a..a1a3fd7 100644 (file)
@@ -61,7 +61,8 @@ class ZipDirectoryReaderTest extends PHPUnit\Framework\TestCase {
        }
 
        public function testTrailingBytes() {
-               $this->readZipAssertError( 'trail.zip', 'zip-bad',
+               // Due to T40432 this is now zip-wrong-format instead of zip-bad
+               $this->readZipAssertError( 'trail.zip', 'zip-wrong-format',
                        'Trailing bytes error' );
        }