51407a76e0efaf814857aba25e8d713eb121af76
[lhc/web/wiklou.git] / includes / libs / mime / MSCompoundFileReader.php
1 <?php
2 /*
3 * Copyright 2019 Wikimedia Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software distributed
12 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13 * OF ANY KIND, either express or implied. See the License for the
14 * specific language governing permissions and limitations under the License.
15 */
16
17 // strlen() is actually pretty fast compared to just about any loop body
18 // phpcs:disable Generic.CodeAnalysis.ForLoopWithTestFunctionCall.NotAllowed
19
/**
 * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
 * file, and detect the MIME type.
 *
 * The input is treated as untrusted: every structural problem is reported
 * through the 'error'/'errorCode' members of the result array instead of
 * being allowed to escape as a PHP error.
 *
 * References:
 *  - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
 *  - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
 *  - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
 *  - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
 *  - Python olefile https://github.com/decalage2/olefile
 *  - OpenOffice.org's Documentation of the Microsoft Compound Document
 *    File Format https://www.openoffice.org/sc/compdocfileformat.pdf
 *
 * @since 1.33
 */
class MSCompoundFileReader {
	/** @var resource Open, seekable handle of the file being parsed */
	private $file;
	/** @var array Decoded header fields, keyed by field name */
	private $header;
	/** @var string|null MIME type detected from well-known stream names */
	private $mime;
	/** @var string|null MIME type detected from the CLSID of the root entry */
	private $mimeFromClsid;
	/** @var string|null Error message, set when parsing failed */
	private $error;
	/** @var int|null One of the self::ERROR_* constants, set when parsing failed */
	private $errorCode;
	/** @var bool Whether the header and directory were parsed without error */
	private $valid = false;

	/** @var int Sector size in bytes, 1 << sector_shift */
	private $sectorLength;
	/** @var int[] Sector IDs of the FAT sectors, in FAT order */
	private $difat;
	/** @var int[][] Cache of decoded FAT sectors, indexed by position in the DIFAT */
	private $fat = [];

	const TYPE_UNALLOCATED = 0;
	const TYPE_STORAGE = 1;
	const TYPE_STREAM = 2;
	const TYPE_ROOT = 5;

	const ERROR_FILE_OPEN = 1;
	const ERROR_SEEK = 2;
	const ERROR_READ = 3;
	const ERROR_INVALID_SIGNATURE = 4;
	const ERROR_READ_PAST_END = 5;
	const ERROR_INVALID_FORMAT = 6;

	private static $mimesByClsid = [
		// From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
		'00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
		'00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
		'00020906-0000-0000-C000-000000000046' => 'application/msword',
		'64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
	];

	/** @var string[] MIME types implied by the presence of a directory entry name */
	private static $mimesByStreamName = [
		'Workbook' => 'application/vnd.ms-excel',
		'WordDocument' => 'application/msword',
		'PowerPoint Document' => 'application/vnd.ms-powerpoint',
	];

	/**
	 * Read a file by name
	 *
	 * The file is opened in binary mode and is always closed again before
	 * this method returns.
	 *
	 * @param string $fileName The full path to the file
	 * @return array An associative array of information about the file:
	 *   - valid: true if the file is valid, false otherwise
	 *   - error: An error message in English, should be present if valid=false
	 *   - errorCode: One of the self::ERROR_* constants
	 *   - mime: The MIME type detected from the directory contents
	 *   - mimeFromClsid: The MIME type detected from the CLSID on the root
	 *     directory entry
	 */
	public static function readFile( $fileName ) {
		$handle = fopen( $fileName, 'rb' );
		if ( $handle === false ) {
			return [
				'valid' => false,
				'error' => 'file does not exist',
				'errorCode' => self::ERROR_FILE_OPEN
			];
		}
		try {
			return self::readHandle( $handle );
		} finally {
			// We opened the handle, so we are responsible for releasing it
			fclose( $handle );
		}
	}

	/**
	 * Read from an open seekable handle
	 *
	 * The handle is not closed; that remains the caller's responsibility.
	 *
	 * @param resource $fileHandle The file handle
	 * @return array An associative array of information about the file:
	 *   - valid: true if the file is valid, false otherwise
	 *   - error: An error message in English, should be present if valid=false
	 *   - errorCode: One of the self::ERROR_* constants
	 *   - mime: The MIME type detected from the directory contents
	 *   - mimeFromClsid: The MIME type detected from the CLSID on the root
	 *     directory entry
	 */
	public static function readHandle( $fileHandle ) {
		$reader = new self( $fileHandle );
		$info = [
			'valid' => $reader->valid,
			'mime' => $reader->mime,
			'mimeFromClsid' => $reader->mimeFromClsid
		];
		if ( $reader->error ) {
			$info['error'] = $reader->error;
			$info['errorCode'] = $reader->errorCode;
		}
		return $info;
	}

	/**
	 * Parse the given handle, recording any failure in $error / $errorCode.
	 *
	 * @param resource $fileHandle
	 */
	private function __construct( $fileHandle ) {
		$this->file = $fileHandle;
		try {
			$this->init();
		} catch ( RuntimeException $e ) {
			$this->valid = false;
			$this->error = $e->getMessage();
			$this->errorCode = $e->getCode();
		}
	}

	/**
	 * Read and validate the header, then load the DIFAT and the directory.
	 *
	 * @throws RuntimeException On any I/O or format problem
	 */
	private function init() {
		$this->header = $this->unpackOffset( 0, [
			'header_signature' => 8,
			'header_clsid' => 16,
			'minor_version' => 2,
			'major_version' => 2,
			'byte_order' => 2,
			'sector_shift' => 2,
			'mini_sector_shift' => 2,
			'reserved' => 6,
			'num_dir_sectors' => 4,
			'num_fat_sectors' => 4,
			'first_dir_sector' => 4,
			'transaction_signature_number' => 4,
			'mini_stream_cutoff_size' => 4,
			'first_mini_fat_sector' => 4,
			'num_mini_fat_sectors' => 4,
			'first_difat_sector' => 4,
			'num_difat_sectors' => 4,
			'difat' => 436,
		] );
		if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
			$this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
				self::ERROR_INVALID_SIGNATURE );
		}

		// MS-CFB only defines shifts of 9 and 12. Be lenient, but refuse values
		// that would break the parser: a sector must hold at least one whole
		// 128-byte directory entry (and more than zero 4-byte FAT entries,
		// otherwise the FAT arithmetic divides by zero), and a single sector
		// read must not be able to request an absurd amount of memory.
		$sectorShift = $this->header['sector_shift'];
		if ( $sectorShift < 7 || $sectorShift > 20 ) {
			$this->error( "invalid sector shift: $sectorShift", self::ERROR_INVALID_FORMAT );
		}
		$this->sectorLength = 1 << $sectorShift;

		$this->readDifat();
		$this->readDirectory();

		$this->valid = true;
	}

	/**
	 * Get the byte offset of a sector. Sector 0 starts one sector length into
	 * the file, directly after the (padded) header.
	 *
	 * @param int $sectorId
	 * @return int
	 */
	private function sectorOffset( $sectorId ) {
		return $this->sectorLength * ( $sectorId + 1 );
	}

	/**
	 * Convert a 16-byte binary CLSID to its canonical upper-case text form.
	 *
	 * @param string $binaryClsid Exactly 16 bytes
	 * @return string E.g. 00020820-0000-0000-C000-000000000046
	 */
	private function decodeClsid( $binaryClsid ) {
		// The first three fields are little-endian integers, the rest are
		// plain bytes. unpack() yields them in order: a, b, c, d1..d8.
		$parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
		return vsprintf(
			'%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X',
			array_values( $parts )
		);
	}

	/**
	 * Read a structure located at an absolute file offset.
	 *
	 * @param int $offset Byte offset in the file
	 * @param int[] $struct Field lengths in bytes, keyed by field name
	 * @return array Decoded fields, see unpack()
	 */
	private function unpackOffset( $offset, $struct ) {
		$block = $this->readOffset( $offset, array_sum( $struct ) );
		return $this->unpack( $block, 0, $struct );
	}

	/**
	 * Read a structure located at the start of a sector.
	 *
	 * @param int $sectorNumber
	 * @param int[] $struct Field lengths in bytes, keyed by field name
	 * @return array Decoded fields, see unpack()
	 */
	private function unpackSector( $sectorNumber, $struct ) {
		$offset = $this->sectorOffset( $sectorNumber );
		// Pass the struct itself: unpackOffset() computes the length
		return $this->unpackOffset( $offset, $struct );
	}

	/**
	 * Decode consecutive fields from a binary string.
	 *
	 * Fields of up to four bytes are decoded as little-endian unsigned
	 * integers; longer fields are returned as raw binary strings.
	 *
	 * @param string $block Binary data
	 * @param int $offset Offset of the first field within $block
	 * @param int[] $struct Field lengths in bytes, keyed by field name
	 * @return array Decoded values, keyed by field name
	 */
	private function unpack( $block, $offset, $struct ) {
		$data = [];
		foreach ( $struct as $key => $length ) {
			if ( $length > 4 ) {
				$data[$key] = substr( $block, $offset, $length );
			} else {
				$data[$key] = $this->bin2dec( $block, $offset, $length );
			}
			$offset += $length;
		}
		return $data;
	}

	/**
	 * Decode a little-endian unsigned integer.
	 *
	 * @param string $str Binary data
	 * @param int $offset Offset of the least significant byte
	 * @param int $length Number of bytes
	 * @return int
	 */
	private function bin2dec( $str, $offset, $length ) {
		$value = 0;
		// Walk from the most significant (last) byte down to the first
		for ( $i = $length - 1; $i >= 0; $i-- ) {
			$value *= 256;
			$value += ord( $str[$offset + $i] );
		}
		return $value;
	}

	/**
	 * Read an exact number of bytes from an absolute file offset.
	 *
	 * @param int $offset
	 * @param int $length
	 * @return string Exactly $length bytes
	 * @throws RuntimeException If seeking or reading fails, or the file is
	 *   too short to supply $length bytes
	 */
	private function readOffset( $offset, $length ) {
		$this->fseek( $offset );
		Wikimedia\suppressWarnings();
		$block = fread( $this->file, $length );
		Wikimedia\restoreWarnings();
		if ( $block === false ) {
			$this->error( 'error reading from file', self::ERROR_READ );
		}
		if ( strlen( $block ) !== $length ) {
			$this->error( 'unable to read the required number of bytes from the file',
				self::ERROR_READ_PAST_END );
		}
		return $block;
	}

	/**
	 * Read the full contents of a sector.
	 *
	 * @param int $sectorId
	 * @return string Exactly one sector length of binary data
	 */
	private function readSector( $sectorId ) {
		return $this->readOffset( $this->sectorOffset( $sectorId ), $this->sectorLength );
	}

	/**
	 * Abort parsing. The exception is caught by the constructor.
	 *
	 * @param string $message
	 * @param int $code One of the self::ERROR_* constants
	 * @throws RuntimeException Always
	 */
	private function error( $message, $code ) {
		throw new RuntimeException( $message, $code );
	}

	/**
	 * Seek to an absolute offset in the file.
	 *
	 * @param int $offset
	 * @throws RuntimeException If the seek fails
	 */
	private function fseek( $offset ) {
		Wikimedia\suppressWarnings();
		$result = fseek( $this->file, $offset );
		Wikimedia\restoreWarnings();
		if ( $result !== 0 ) {
			$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
		}
	}

	/**
	 * Build the list of FAT sector IDs from the DIFAT entries in the header
	 * and from the chain of additional DIFAT sectors, if any.
	 *
	 * @throws RuntimeException On I/O failure or if the DIFAT chain loops
	 */
	private function readDifat() {
		$binaryDifat = $this->header['difat'];
		$nextDifatSector = $this->header['first_difat_sector'];
		$seenSectorIds = [];
		for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
			// num_difat_sectors comes from the file and may be huge, so a
			// cyclic chain would otherwise be followed (and accumulated in
			// memory) for billions of iterations.
			if ( isset( $seenSectorIds[$nextDifatSector] ) ) {
				$this->error( 'DIFAT loop detected', self::ERROR_INVALID_FORMAT );
			}
			$seenSectorIds[$nextDifatSector] = true;

			$block = $this->readSector( $nextDifatSector );
			// The last four bytes of a DIFAT sector point to the next one
			$binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
			$nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
			if ( $nextDifatSector == 0xFFFFFFFE ) {
				// End of chain
				break;
			}
		}

		$this->difat = [];
		$difatLength = strlen( $binaryDifat );
		for ( $pos = 0; $pos < $difatLength; $pos += 4 ) {
			$fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
			if ( $fatSector >= 0xFFFFFFFC ) {
				// Reserved marker values (free sector etc.) end the list
				break;
			}
			$this->difat[] = $fatSector;
		}
	}

	/**
	 * Look up the sector following the given one in its FAT chain.
	 *
	 * @param int $sectorId
	 * @return int The next sector ID, or a reserved marker such as
	 *   0xFFFFFFFE (end of chain)
	 */
	private function getNextSectorIdFromFat( $sectorId ) {
		$entriesPerSector = intdiv( $this->sectorLength, 4 );
		$fatSectorId = intdiv( $sectorId, $entriesPerSector );
		$fatSectorArray = $this->getFatSector( $fatSectorId );
		return $fatSectorArray[$sectorId % $entriesPerSector];
	}

	/**
	 * Get the decoded entries of one FAT sector, reading it on first use.
	 *
	 * @param int $fatSectorId Index into the DIFAT (not an absolute sector ID)
	 * @return int[]
	 * @throws RuntimeException If the index is not present in the DIFAT
	 */
	private function getFatSector( $fatSectorId ) {
		if ( !isset( $this->fat[$fatSectorId] ) ) {
			if ( !isset( $this->difat[$fatSectorId] ) ) {
				$this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
			}
			$block = $this->readSector( $this->difat[$fatSectorId] );
			$fat = [];
			$blockLength = strlen( $block );
			for ( $pos = 0; $pos < $blockLength; $pos += 4 ) {
				$fat[] = $this->bin2dec( $block, $pos, 4 );
			}
			$this->fat[$fatSectorId] = $fat;
		}
		return $this->fat[$fatSectorId];
	}

	/**
	 * Read the whole directory by following its FAT chain, and set $mime and
	 * $mimeFromClsid from the entries found in it.
	 *
	 * @throws RuntimeException On I/O failure or if the FAT chain loops
	 */
	private function readDirectory() {
		$dirSectorId = $this->header['first_dir_sector'];
		$binaryDir = '';
		$seenSectorIds = [];
		while ( $dirSectorId !== 0xFFFFFFFE ) {
			if ( isset( $seenSectorIds[$dirSectorId] ) ) {
				$this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
			}
			$seenSectorIds[$dirSectorId] = true;

			$binaryDir .= $this->readSector( $dirSectorId );
			$dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
		}

		$struct = [
			'name_raw' => 64,
			'name_length' => 2,
			'object_type' => 1,
			'color' => 1,
			'sid_left' => 4,
			'sid_right' => 4,
			'sid_child' => 4,
			'clsid' => 16,
			'state_bits' => 4,
			'create_time_low' => 4,
			'create_time_high' => 4,
			'modify_time_low' => 4,
			'modify_time_high' => 4,
			'first_sector' => 4,
			'size_low' => 4,
			'size_high' => 4,
		];
		$entryLength = array_sum( $struct );
		$nameFieldLength = $struct['name_raw'];

		$dirLength = strlen( $binaryDir );
		for ( $pos = 0; $pos < $dirLength; $pos += $entryLength ) {
			$entry = $this->unpack( $binaryDir, $pos, $struct );

			// According to [MS-CFB] size_high may contain garbage due to a
			// bug in a writer, it's best to pretend it is zero
			$entry['size_high'] = 0;

			$type = $entry['object_type'];
			if ( $type == self::TYPE_UNALLOCATED ) {
				continue;
			}

			// name_length counts bytes including the two-byte terminator.
			// It is untrusted: keep it inside the name field, drop the
			// terminator, never go negative, and keep whole UTF-16 code units.
			$nameBytes = max( 0, min( $entry['name_length'], $nameFieldLength ) - 2 );
			$nameBytes -= $nameBytes % 2;

			// Names are stored as little-endian UTF-16 without a BOM. Plain
			// "UTF-16" lets the iconv implementation guess the byte order.
			Wikimedia\suppressWarnings();
			$name = iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $nameBytes ) );
			Wikimedia\restoreWarnings();

			$clsid = $this->decodeClsid( $entry['clsid'] );
			if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) {
				$this->mimeFromClsid = self::$mimesByClsid[$clsid];
			}

			// iconv() returns false for undecodable names; those match nothing
			if ( is_string( $name ) && isset( self::$mimesByStreamName[$name] ) ) {
				$this->mime = self::$mimesByStreamName[$name];
			}
		}
	}
}