Merge "Refactor Watchlist code so mobile can be more consistent"
[lhc/web/wiklou.git] / includes / utils / ZipDirectoryReader.php
1 <?php
2 /**
3 * ZIP file directories reader, for the purposes of upload verification.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 */
22
/**
 * A class for reading ZIP file directories, for the purposes of upload
 * verification.
 *
 * Only a functional interface is provided: ZipDirectoryReader::read(). No
 * access is given to object instances.
 */
class ZipDirectoryReader {
	/**
	 * Read a ZIP file and call a function for each file discovered in it.
	 *
	 * Because this class is aimed at verification, an error is raised on
	 * suspicious or ambiguous input, instead of emulating some standard
	 * behavior.
	 *
	 * @param string $fileName The archive file name
	 * @param callable $callback The callback function. It will be called for each file
	 *   with a single associative array each time, with members:
	 *
	 *      - name: The file name. Directories conventionally have a trailing
	 *        slash.
	 *
	 *      - mtime: The file modification time, in MediaWiki 14-char format
	 *
	 *      - size: The uncompressed file size
	 *
	 * @param array $options An associative array of read options, with the option
	 *    name in the key. This may currently contain:
	 *
	 *      - zip64: If this is set to true, then we will emulate a
	 *        library with ZIP64 support, like OpenJDK 7. If it is set to
	 *        false, then we will emulate a library with no knowledge of
	 *        ZIP64.
	 *
	 *        NOTE: The ZIP64 code is untested and probably doesn't work. It
	 *        turned out to be easier to just reject ZIP64 archive uploads,
	 *        since they are likely to be very rare. Confirming safety of a
	 *        ZIP64 file is fairly complex. What do you do with a file that is
	 *        ambiguous and broken when read with a non-ZIP64 reader, but valid
	 *        when read with a ZIP64 reader? This situation is normal for a
	 *        valid ZIP64 file, and working out what non-ZIP64 readers will make
	 *        of such a file is not trivial.
	 *
	 * @return Status A Status object. The following fatal errors are defined:
	 *
	 *      - zip-file-open-error: The file could not be opened.
	 *
	 *      - zip-wrong-format: The file does not appear to be a ZIP file
	 *        (this includes an empty file).
	 *
	 *      - zip-bad: There was something wrong or ambiguous about the file
	 *        data.
	 *
	 *      - zip-unsupported: The ZIP file uses features which
	 *        ZipDirectoryReader does not support.
	 *
	 * The default messages for those fatal errors are written in a way that
	 * makes sense for upload verification.
	 *
	 * If a fatal error is returned, more information about the error will be
	 * available in the debug log.
	 *
	 * Note that the callback function may be called any number of times before
	 * a fatal error is returned. If this occurs, the data sent to the callback
	 * function should be discarded.
	 */
	public static function read( $fileName, $callback, $options = array() ) {
		$zdr = new self( $fileName, $callback, $options );

		return $zdr->execute();
	}

	/** The file name */
	protected $fileName;

	/** The opened file resource */
	protected $file;

	/** The cached length of the file, or null if it has not been loaded yet. */
	protected $fileLength;

	/** A segmented cache of the file contents, keyed by segment index */
	protected $buffer = array();

	/** The file data callback */
	protected $callback;

	/** The ZIP64 mode */
	protected $zip64 = false;

	/** Stored headers */
	protected $eocdr, $eocdr64, $eocdr64Locator;

	protected $data;

	/** The "extra field" ID for ZIP64 central directory entries */
	const ZIP64_EXTRA_HEADER = 0x0001;

	/** The segment size for the file contents cache */
	const SEGSIZE = 16384;

	/** The index of the "general field" bit for UTF-8 file names */
	const GENERAL_UTF8 = 11;

	/** The index of the "general field" bit for central directory encryption */
	const GENERAL_CD_ENCRYPTED = 13;

	/**
	 * Non-public constructor; instances are only created by read().
	 *
	 * @param string $fileName The archive file name
	 * @param callable $callback The per-file callback
	 * @param array $options Read options; only the "zip64" key is consulted
	 */
	protected function __construct( $fileName, $callback, $options ) {
		$this->fileName = $fileName;
		$this->callback = $callback;

		if ( isset( $options['zip64'] ) ) {
			$this->zip64 = $options['zip64'];
		}
	}

	/**
	 * Read the directory according to settings in $this.
	 *
	 * The file is opened here and is closed again on every exit path,
	 * including when an exception other than ZipDirectoryReaderError
	 * propagates to the caller.
	 *
	 * @return Status
	 */
	public function execute() {
		$this->file = fopen( $this->fileName, 'r' );
		$this->data = array();
		if ( !$this->file ) {
			return Status::newFatal( 'zip-file-open-error' );
		}

		$status = Status::newGood();
		try {
			$this->readEndOfCentralDirectoryRecord();
			if ( $this->zip64 ) {
				list( $offset, $size ) = $this->findZip64CentralDirectory();
				$this->readCentralDirectory( $offset, $size );
			} else {
				if ( $this->eocdr['CD size'] == 0xffffffff
					|| $this->eocdr['CD offset'] == 0xffffffff
					|| $this->eocdr['CD entries total'] == 0xffff
				) {
					$this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' .
						'but we are in legacy mode. Rejecting this upload is necessary to avoid ' .
						'opening vulnerabilities on clients using OpenJDK 7 or later.' );
				}

				list( $offset, $size ) = $this->findOldCentralDirectory();
				$this->readCentralDirectory( $offset, $size );
			}
		} catch ( ZipDirectoryReaderError $e ) {
			// Our own errors become a fatal Status rather than an exception.
			$status->fatal( $e->getErrorCode() );
		} catch ( Exception $e ) {
			// Not ours to handle (e.g. thrown by the callback or by unpack()),
			// but the file handle must not be leaked while it propagates.
			fclose( $this->file );
			throw $e;
		}

		fclose( $this->file );

		return $status;
	}

	/**
	 * Throw an error, and log a debug message
	 *
	 * @param mixed $code Error code carried by the thrown exception
	 * @param string $debugMessage Detail written to the debug log only
	 * @throws ZipDirectoryReaderError Always
	 */
	public function error( $code, $debugMessage ) {
		wfDebug( __CLASS__ . ": Fatal error: $debugMessage\n" );
		throw new ZipDirectoryReaderError( $code );
	}

	/**
	 * Read the header which is at the end of the central directory,
	 * unimaginatively called the "end of central directory record" by the ZIP
	 * spec.
	 *
	 * The result is stored in $this->eocdr, with the extra members
	 * "EOCDR size", "file comment" and "position" added to the raw fields.
	 *
	 * @throws ZipDirectoryReaderError
	 */
	public function readEndOfCentralDirectoryRecord() {
		$info = array(
			'signature' => 4,
			'disk' => 2,
			'CD start disk' => 2,
			'CD entries this disk' => 2,
			'CD entries total' => 2,
			'CD size' => 4,
			'CD offset' => 4,
			'file comment length' => 2,
		);
		$structSize = $this->getStructSize( $info );

		// An empty file is simply not a ZIP file. Without this check the
		// getBlock() call below would report it as 'zip-bad' instead.
		if ( $this->getFileLength() === 0 ) {
			$this->error( 'zip-wrong-format', 'The file is empty.' );
		}

		// Search window: the fixed-size record plus 65536 bytes of room for
		// the trailing comment, clamped to the start of the file.
		$startPos = $this->getFileLength() - 65536 - $structSize;
		if ( $startPos < 0 ) {
			$startPos = 0;
		}

		$block = $this->getBlock( $startPos );
		$sigPos = strrpos( $block, "PK\x05\x06" );
		if ( $sigPos === false ) {
			$this->error( 'zip-wrong-format',
				"zip file lacks EOCDR signature. It probably isn't a zip file." );
		}

		$this->eocdr = $this->unpack( substr( $block, $sigPos ), $info );
		$this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length'];

		if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) {
			$this->error( 'zip-bad', 'trailing bytes after the end of the file comment' );
		}
		if ( $this->eocdr['disk'] !== 0
			|| $this->eocdr['CD start disk'] !== 0
		) {
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' );
		}
		$this->eocdr += $this->unpack(
			$block,
			array( 'file comment' => array( 'string', $this->eocdr['file comment length'] ) ),
			$sigPos + $structSize );
		$this->eocdr['position'] = $startPos + $sigPos;
	}

	/**
	 * Read the header called the "ZIP64 end of central directory locator". An
	 * error will be raised if it does not exist.
	 *
	 * The locator is expected immediately before the EOCDR, and the result is
	 * stored in $this->eocdr64Locator.
	 *
	 * @throws ZipDirectoryReaderError
	 */
	public function readZip64EndOfCentralDirectoryLocator() {
		$info = array(
			'signature' => array( 'string', 4 ),
			'eocdr64 start disk' => 4,
			'eocdr64 offset' => 8,
			'number of disks' => 4,
		);
		$structSize = $this->getStructSize( $info );

		$start = $this->getFileLength() - $this->eocdr['EOCDR size'] - $structSize;
		$block = $this->getBlock( $start, $structSize );
		$this->eocdr64Locator = $data = $this->unpack( $block, $info );

		if ( $data['signature'] !== "PK\x06\x07" ) {
			// Note: Java will allow this and continue to read the
			// EOCDR64, so we have to reject the upload, we can't
			// just use the EOCDR header instead.
			$this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' );
		}
	}

	/**
	 * Read the header called the "ZIP64 end of central directory record". It
	 * may replace the regular "end of central directory record" in ZIP64 files.
	 *
	 * Requires $this->eocdr64Locator to have been loaded; the result is
	 * stored in $this->eocdr64.
	 *
	 * @throws ZipDirectoryReaderError
	 */
	public function readZip64EndOfCentralDirectoryRecord() {
		// NOTE(review): a single-disk archive may legitimately report a
		// "number of disks" of 1 rather than 0 - confirm against the ZIP
		// specification before relying on ZIP64 mode.
		if ( $this->eocdr64Locator['eocdr64 start disk'] != 0
			|| $this->eocdr64Locator['number of disks'] != 0
		) {
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' );
		}

		$info = array(
			'signature' => array( 'string', 4 ),
			'EOCDR64 size' => 8,
			'version made by' => 2,
			'version needed' => 2,
			'disk' => 4,
			'CD start disk' => 4,
			'CD entries this disk' => 8,
			'CD entries total' => 8,
			'CD size' => 8,
			'CD offset' => 8
		);
		$structSize = $this->getStructSize( $info );
		$block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize );
		$this->eocdr64 = $data = $this->unpack( $block, $info );
		if ( $data['signature'] !== "PK\x06\x06" ) {
			$this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' );
		}
		if ( $data['disk'] !== 0
			|| $data['CD start disk'] !== 0
		) {
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' );
		}
	}

	/**
	 * Find the location of the central directory, as would be seen by a
	 * non-ZIP64 reader.
	 *
	 * @return array List containing offset and size.
	 * @throws ZipDirectoryReaderError
	 */
	public function findOldCentralDirectory() {
		$size = $this->eocdr['CD size'];
		$offset = $this->eocdr['CD offset'];
		$endPos = $this->eocdr['position'];

		// Some readers use the EOCDR position instead of the offset field
		// to find the directory, so to be safe, we check if they both agree.
		if ( $offset + $size != $endPos ) {
			$this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
				'of central directory record' );
		}

		return array( $offset, $size );
	}

	/**
	 * Find the location of the central directory, as would be seen by a
	 * ZIP64-compliant reader.
	 *
	 * @return array List containing offset and size.
	 * @throws ZipDirectoryReaderError
	 */
	public function findZip64CentralDirectory() {
		// The spec is ambiguous about the exact rules of precedence between the
		// ZIP64 headers and the original headers. Here we follow zip_util.c
		// from OpenJDK 7.
		$size = $this->eocdr['CD size'];
		$offset = $this->eocdr['CD offset'];
		$numEntries = $this->eocdr['CD entries total'];
		$endPos = $this->eocdr['position'];
		if ( $size == 0xffffffff
			|| $offset == 0xffffffff
			|| $numEntries == 0xffff
		) {
			$this->readZip64EndOfCentralDirectoryLocator();

			if ( isset( $this->eocdr64Locator['eocdr64 offset'] ) ) {
				$this->readZip64EndOfCentralDirectoryRecord();
				if ( isset( $this->eocdr64['CD offset'] ) ) {
					$size = $this->eocdr64['CD size'];
					$offset = $this->eocdr64['CD offset'];
					$endPos = $this->eocdr64Locator['eocdr64 offset'];
				}
			}
		}
		// Some readers use the EOCDR position instead of the offset field
		// to find the directory, so to be safe, we check if they both agree.
		if ( $offset + $size != $endPos ) {
			$this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
				'of central directory record' );
		}

		return array( $offset, $size );
	}

	/**
	 * Read the central directory at the given location, invoking the
	 * callback once for every entry found.
	 *
	 * @param int|float $offset Byte offset of the directory within the file
	 * @param int|float $size Size of the directory in bytes
	 * @throws ZipDirectoryReaderError
	 */
	public function readCentralDirectory( $offset, $size ) {
		$block = $this->getBlock( $offset, $size );

		$fixedInfo = array(
			'signature' => array( 'string', 4 ),
			'version made by' => 2,
			'version needed' => 2,
			'general bits' => 2,
			'compression method' => 2,
			'mod time' => 2,
			'mod date' => 2,
			'crc-32' => 4,
			'compressed size' => 4,
			'uncompressed size' => 4,
			'name length' => 2,
			'extra field length' => 2,
			'comment length' => 2,
			'disk number start' => 2,
			'internal attrs' => 2,
			'external attrs' => 4,
			'local header offset' => 4,
		);
		$fixedSize = $this->getStructSize( $fixedInfo );

		$pos = 0;
		while ( $pos < $size ) {
			$data = $this->unpack( $block, $fixedInfo, $pos );
			$pos += $fixedSize;

			if ( $data['signature'] !== "PK\x01\x02" ) {
				$this->error( 'zip-bad', 'Invalid signature found in directory entry' );
			}

			// The variable-length tail follows the fixed part; its field
			// lengths come from the fixed part just read.
			$variableInfo = array(
				'name' => array( 'string', $data['name length'] ),
				'extra field' => array( 'string', $data['extra field length'] ),
				'comment' => array( 'string', $data['comment length'] ),
			);
			$data += $this->unpack( $block, $variableInfo, $pos );
			$pos += $this->getStructSize( $variableInfo );

			if ( $this->zip64 && (
				$data['compressed size'] == 0xffffffff
				|| $data['uncompressed size'] == 0xffffffff
				|| $data['local header offset'] == 0xffffffff )
			) {
				$zip64Data = $this->unpackZip64Extra( $data['extra field'] );
				if ( $zip64Data ) {
					// Left operand wins in array union, so the ZIP64 values
					// override the 32-bit placeholders.
					$data = $zip64Data + $data;
				}
			}

			if ( $this->testBit( $data['general bits'], self::GENERAL_CD_ENCRYPTED ) ) {
				$this->error( 'zip-unsupported', 'central directory encryption is not supported' );
			}

			// Convert the timestamp into MediaWiki format
			// For the format, please see the MS-DOS 2.0 Programmer's Reference,
			// pages 3-5 and 3-6.
			$time = $data['mod time'];
			$date = $data['mod date'];

			$year = 1980 + ( $date >> 9 );
			$month = ( $date >> 5 ) & 15;
			$day = $date & 31;
			$hour = ( $time >> 11 ) & 31;
			$minute = ( $time >> 5 ) & 63;
			$second = ( $time & 31 ) * 2;
			$timestamp = sprintf( "%04d%02d%02d%02d%02d%02d",
				$year, $month, $day, $hour, $minute, $second );

			// Convert the character set in the file name
			if ( $this->testBit( $data['general bits'], self::GENERAL_UTF8 ) ) {
				$name = $data['name'];
			} else {
				$name = iconv( 'CP437', 'UTF-8', $data['name'] );
			}

			// Compile a data array for the user, with a sensible format
			$userData = array(
				'name' => $name,
				'mtime' => $timestamp,
				'size' => $data['uncompressed size'],
			);
			call_user_func( $this->callback, $userData );
		}
	}

	/**
	 * Interpret ZIP64 "extra field" data and return an associative array.
	 *
	 * @param string $extraField The raw extra field of a directory entry
	 * @return array|bool The unpacked ZIP64 record, or false if the extra
	 *   field contains no record with the ZIP64 header ID
	 * @throws ZipDirectoryReaderError
	 */
	public function unpackZip64Extra( $extraField ) {
		$extraHeaderInfo = array(
			'id' => 2,
			'size' => 2,
		);
		$extraHeaderSize = $this->getStructSize( $extraHeaderInfo );

		$zip64ExtraInfo = array(
			'uncompressed size' => 8,
			'compressed size' => 8,
			'local header offset' => 8,
			'disk number start' => 4,
		);

		// The extra field is a sequence of (id, size, data) records; walk
		// them until the ZIP64 one is found.
		$extraPos = 0;
		while ( $extraPos < strlen( $extraField ) ) {
			$extra = $this->unpack( $extraField, $extraHeaderInfo, $extraPos );
			$extraPos += $extraHeaderSize;
			$extra += $this->unpack( $extraField,
				array( 'data' => array( 'string', $extra['size'] ) ),
				$extraPos );
			$extraPos += $extra['size'];

			if ( $extra['id'] == self::ZIP64_EXTRA_HEADER ) {
				return $this->unpack( $extra['data'], $zip64ExtraInfo );
			}
		}

		return false;
	}

	/**
	 * Get the length of the file. The value is obtained from fstat() on
	 * first use and cached afterwards.
	 *
	 * @return int
	 */
	public function getFileLength() {
		if ( $this->fileLength === null ) {
			$stat = fstat( $this->file );
			$this->fileLength = $stat['size'];
		}

		return $this->fileLength;
	}

	/**
	 * Get the file contents from a given offset. If there are not enough bytes
	 * in the file to satisfy the request, an exception will be thrown.
	 *
	 * @param int $start The byte offset of the start of the block.
	 * @param int $length The number of bytes to return. If omitted, the remainder
	 *    of the file will be returned.
	 *
	 * @return string
	 * @throws ZipDirectoryReaderError
	 */
	public function getBlock( $start, $length = null ) {
		$fileLength = $this->getFileLength();
		if ( $start >= $fileLength ) {
			$this->error( 'zip-bad', "getBlock() requested position $start, " .
				"file length is $fileLength" );
		}
		if ( $length === null ) {
			$length = $fileLength - $start;
		}
		$end = $start + $length;
		if ( $end > $fileLength ) {
			$this->error( 'zip-bad', "getBlock() requested end position $end, " .
				"file length is $fileLength" );
		}

		// floor()/ceil() return floats; cast so that the segment cache is
		// always keyed by plain integers.
		$startSeg = (int)floor( $start / self::SEGSIZE );
		$endSeg = (int)ceil( $end / self::SEGSIZE );

		// The bound is inclusive, so the segment following the requested
		// range is fetched as well; substr() below trims the surplus.
		$block = '';
		for ( $segIndex = $startSeg; $segIndex <= $endSeg; $segIndex++ ) {
			$block .= $this->getSegment( $segIndex );
		}

		$block = substr( $block,
			$start - $startSeg * self::SEGSIZE,
			$length );

		if ( strlen( $block ) < $length ) {
			$this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' );
		}

		return $block;
	}

	/**
	 * Get a section of the file starting at position $segIndex * self::SEGSIZE,
	 * of length self::SEGSIZE. The result is cached. This is a helper function
	 * for getBlock().
	 *
	 * If there are not enough bytes in the file to satisfy the request, the
	 * return value will be truncated. If a request is made for a segment beyond
	 * the end of the file, an empty string will be returned.
	 *
	 * @param int $segIndex
	 *
	 * @return string
	 * @throws ZipDirectoryReaderError If seeking or reading fails
	 */
	public function getSegment( $segIndex ) {
		if ( !isset( $this->buffer[$segIndex] ) ) {
			$bytePos = $segIndex * self::SEGSIZE;
			if ( $bytePos >= $this->getFileLength() ) {
				$this->buffer[$segIndex] = '';

				return '';
			}
			// fseek() returns 0 on success, so any truthy value is a failure.
			if ( fseek( $this->file, $bytePos ) ) {
				$this->error( 'zip-bad', "seek to $bytePos failed" );
			}
			$seg = fread( $this->file, self::SEGSIZE );
			if ( $seg === false ) {
				$this->error( 'zip-bad', "read from $bytePos failed" );
			}
			$this->buffer[$segIndex] = $seg;
		}

		return $this->buffer[$segIndex];
	}

	/**
	 * Get the size of a structure in bytes. See unpack() for the format of $struct.
	 *
	 * @param array $struct Structure definition
	 * @return int
	 */
	public function getStructSize( $struct ) {
		$size = 0;
		foreach ( $struct as $type ) {
			if ( is_array( $type ) ) {
				// array( typeName, length ): only the length matters here
				list( , $fieldSize ) = $type;
				$size += $fieldSize;
			} else {
				$size += $type;
			}
		}

		return $size;
	}

	/**
	 * Unpack a binary structure. This is like the built-in unpack() function
	 * except nicer.
	 *
	 * @param string $string The binary data input
	 *
	 * @param array $struct An associative array giving structure members and their
	 *    types. In the key is the field name. The value may be either an
	 *    integer, in which case the field is a little-endian unsigned integer
	 *    encoded in the given number of bytes, or an array, in which case the
	 *    first element of the array is the type name, and the subsequent
	 *    elements are type-dependent parameters. Only one such type is defined:
	 *       - "string": The second array element gives the length of string.
	 *          Not null terminated.
	 *
	 * @param int $offset The offset into the string at which to start unpacking.
	 *
	 * @throws MWException If $struct names an unknown type
	 * @throws ZipDirectoryReaderError If the input is too short or a value is too large
	 * @return array Unpacked associative array. Note that large integers in the input
	 *    may be represented as floating point numbers in the return value, so
	 *    the use of weak comparison is advised.
	 */
	public function unpack( $string, $struct, $offset = 0 ) {
		$size = $this->getStructSize( $struct );
		if ( $offset + $size > strlen( $string ) ) {
			$this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' );
		}

		$data = array();
		$pos = $offset;
		foreach ( $struct as $key => $type ) {
			if ( is_array( $type ) ) {
				list( $typeName, $fieldSize ) = $type;
				switch ( $typeName ) {
					case 'string':
						$data[$key] = substr( $string, $pos, $fieldSize );
						$pos += $fieldSize;
						break;
					default:
						throw new MWException( __METHOD__ . ": invalid type \"$typeName\"" );
				}
			} else {
				// Unsigned little-endian integer
				$length = intval( $type );

				// Calculate the value. Use an algorithm which automatically
				// upgrades the value to floating point if necessary.
				// Bytes are consumed from most to least significant.
				$value = 0;
				for ( $i = $length - 1; $i >= 0; $i-- ) {
					$value *= 256;
					$value += ord( $string[$pos + $i] );
				}

				// Throw an exception if there was loss of precision
				if ( $value > pow( 2, 52 ) ) {
					$this->error( 'zip-unsupported', 'number too large to be stored in a double. ' .
						'This could happen if we tried to unpack a 64-bit structure ' .
						'at an invalid location.' );
				}
				$data[$key] = $value;
				$pos += $length;
			}
		}

		return $data;
	}

	/**
	 * Returns a bit from a given position in an integer value, converted to
	 * boolean.
	 *
	 * @param int $value
	 * @param int $bitIndex The index of the bit, where 0 is the LSB.
	 * @return bool
	 */
	public function testBit( $value, $bitIndex ) {
		return (bool)( ( $value >> $bitIndex ) & 1 );
	}

	/**
	 * Debugging helper function which dumps a string in hexdump -C format.
	 * Output is printed directly; every row has the same width, with the
	 * last row padded out when the input is not a multiple of 16 bytes.
	 *
	 * @param string $s
	 */
	public function hexDump( $s ) {
		$n = strlen( $s );
		for ( $i = 0; $i < $n; $i += 16 ) {
			printf( "%08X ", $i );
			for ( $j = 0; $j < 16; $j++ ) {
				print " ";
				if ( $j == 8 ) {
					// Extra gap between the two groups of eight bytes
					print " ";
				}
				if ( $i + $j >= $n ) {
					// Two spaces: same width as the "%02X" of a real byte,
					// so the text column stays aligned on a short row.
					print "  ";
				} else {
					printf( "%02X", ord( $s[$i + $j] ) );
				}
			}

			print "  |";
			for ( $j = 0; $j < 16; $j++ ) {
				if ( $i + $j >= $n ) {
					print " ";
				} elseif ( ctype_print( $s[$i + $j] ) ) {
					print $s[$i + $j];
				} else {
					print '.';
				}
			}
			print "|\n";
		}
	}
}
707
/**
 * Exception type used internally by ZipDirectoryReader. It carries the
 * error code that was supplied when the error was raised.
 */
class ZipDirectoryReaderError extends Exception {
	/** The error code given to the constructor */
	protected $errorCode;

	/**
	 * @param mixed $code Error code; it is also embedded in the exception message
	 */
	function __construct( $code ) {
		parent::__construct( 'ZipDirectoryReader error: ' . $code );
		$this->errorCode = $code;
	}

	/**
	 * Get the error code this exception was created with.
	 *
	 * @return mixed
	 */
	function getErrorCode() {
		return $this->errorCode;
	}
}