(bug 19195) Make user IDs more readily available with the API
[lhc/web/wiklou.git] / includes / ZipDirectoryReader.php
1 <?php
2
3 /**
4 * A class for reading ZIP file directories, for the purposes of upload
5 * verification.
6 *
7 * Only a functional interface is provided: ZipDirectoryReader::read(). No
8 * access is given to object instances.
9 *
10 */
class ZipDirectoryReader {
	/**
	 * Read a ZIP file and call a function for each file discovered in it.
	 *
	 * Because this class is aimed at verification, an error is raised on
	 * suspicious or ambiguous input, instead of emulating some standard
	 * behaviour.
	 *
	 * @param $fileName string The archive file name
	 * @param $callback Array The callback function. It will be called for each file
	 *   with a single associative array each time, with members:
	 *
	 *      - name: The file name. Directories conventionally have a trailing
	 *        slash.
	 *
	 *      - mtime: The file modification time, in MediaWiki 14-char format
	 *
	 *      - size: The uncompressed file size
	 *
	 * @param $options Array An associative array of read options, with the option
	 *    name in the key. This may currently contain:
	 *
	 *      - zip64: If this is set to true, then we will emulate a
	 *        library with ZIP64 support, like OpenJDK 7. If it is set to
	 *        false, then we will emulate a library with no knowledge of
	 *        ZIP64.
	 *
	 *        NOTE: The ZIP64 code is untested and probably doesn't work. It
	 *        turned out to be easier to just reject ZIP64 archive uploads,
	 *        since they are likely to be very rare. Confirming safety of a
	 *        ZIP64 file is fairly complex. What do you do with a file that is
	 *        ambiguous and broken when read with a non-ZIP64 reader, but valid
	 *        when read with a ZIP64 reader? This situation is normal for a
	 *        valid ZIP64 file, and working out what non-ZIP64 readers will make
	 *        of such a file is not trivial.
	 *
	 * @return Status object. The following fatal errors are defined:
	 *
	 *      - zip-file-open-error: The file could not be opened.
	 *
	 *      - zip-wrong-format: The file does not appear to be a ZIP file.
	 *
	 *      - zip-bad: There was something wrong or ambiguous about the file
	 *        data.
	 *
	 *      - zip-unsupported: The ZIP file uses features which
	 *        ZipDirectoryReader does not support.
	 *
	 * The default messages for those fatal errors are written in a way that
	 * makes sense for upload verification.
	 *
	 * If a fatal error is returned, more information about the error will be
	 * available in the debug log.
	 *
	 * Note that the callback function may be called any number of times before
	 * a fatal error is returned. If this occurs, the data sent to the callback
	 * function should be discarded.
	 */
	public static function read( $fileName, $callback, $options = array() ) {
		$zdr = new self( $fileName, $callback, $options );
		return $zdr->execute();
	}

	/** The file name */
	public $fileName;

	/** The opened file resource */
	public $file;

	/** The cached length of the file, or null if it has not been loaded yet. */
	public $fileLength;

	/** A segmented cache of the file contents, keyed by segment index */
	public $buffer = array();

	/** The file data callback */
	public $callback;

	/** The ZIP64 mode */
	public $zip64 = false;

	/** Stored headers */
	public $eocdr, $eocdr64, $eocdr64Locator;

	/** Reset to an empty array by execute(); not otherwise used by this class. */
	public $data;

	/** The "extra field" ID for ZIP64 central directory entries */
	const ZIP64_EXTRA_HEADER = 0x0001;

	/** The segment size for the file contents cache */
	const SEGSIZE = 16384;

	/** The index of the "general field" bit for UTF-8 file names */
	const GENERAL_UTF8 = 11;

	/** The index of the "general field" bit for central directory encryption */
	const GENERAL_CD_ENCRYPTED = 13;

	/**
	 * Private constructor. Use ZipDirectoryReader::read() instead.
	 *
	 * @param $fileName string The archive file name
	 * @param $callback Array The per-file callback, see read()
	 * @param $options Array Read options, see read(). Only "zip64" is examined.
	 */
	protected function __construct( $fileName, $callback, $options ) {
		$this->fileName = $fileName;
		$this->callback = $callback;

		if ( isset( $options['zip64'] ) ) {
			$this->zip64 = $options['zip64'];
		}
	}

	/**
	 * Read the directory according to settings in $this.
	 *
	 * The file is opened here and is closed again before this method returns
	 * or lets an exception propagate.
	 *
	 * @return Status
	 */
	public function execute() {
		// Open in binary mode: the archive is parsed byte by byte.
		$this->file = fopen( $this->fileName, 'rb' );
		$this->data = array();
		if ( !$this->file ) {
			return Status::newFatal( 'zip-file-open-error' );
		}

		$status = Status::newGood();
		try {
			$this->readEndOfCentralDirectoryRecord();
			if ( $this->zip64 ) {
				list( $offset, $size ) = $this->findZip64CentralDirectory();
				$this->readCentralDirectory( $offset, $size );
			} else {
				if ( $this->eocdr['CD size'] == 0xffffffff
					|| $this->eocdr['CD offset'] == 0xffffffff
					|| $this->eocdr['CD entries total'] == 0xffff )
				{
					$this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' .
						'but we are in legacy mode. Rejecting this upload is necessary to avoid '.
						'opening vulnerabilities on clients using OpenJDK 7 or later.' );
				}

				list( $offset, $size ) = $this->findOldCentralDirectory();
				$this->readCentralDirectory( $offset, $size );
			}
		} catch ( ZipDirectoryReaderError $e ) {
			$status->fatal( $e->getErrorCode() );
		} catch ( Exception $e ) {
			// Not one of our own errors (for example, something thrown by the
			// callback). Release the file handle before passing it on.
			fclose( $this->file );
			throw $e;
		}

		fclose( $this->file );
		return $status;
	}

	/**
	 * Throw an error, and log a debug message
	 *
	 * @param $code string The error code, used as the fatal Status message key
	 * @param $debugMessage string Detail written to the debug log only
	 * @throws ZipDirectoryReaderError always
	 */
	public function error( $code, $debugMessage ) {
		wfDebug( __CLASS__.": Fatal error: $debugMessage\n" );
		throw new ZipDirectoryReaderError( $code );
	}

	/**
	 * Read the header which is at the end of the central directory,
	 * unimaginatively called the "end of central directory record" by the ZIP
	 * spec.
	 *
	 * The result is stored in $this->eocdr, together with the derived members
	 * "EOCDR size", "file comment" and "position" (the file offset of the
	 * record's signature).
	 */
	public function readEndOfCentralDirectoryRecord() {
		$info = array(
			'signature' => 4,
			'disk' => 2,
			'CD start disk' => 2,
			'CD entries this disk' => 2,
			'CD entries total' => 2,
			'CD size' => 4,
			'CD offset' => 4,
			'file comment length' => 2,
		);
		$structSize = $this->getStructSize( $info );

		// An empty file is simply not a ZIP file. Without this check,
		// getBlock() would report it as "zip-bad" instead.
		if ( $this->getFileLength() == 0 ) {
			$this->error( 'zip-wrong-format', 'The file is empty.' );
		}

		// The record is followed by a comment of at most 65535 bytes, so it
		// has to start somewhere in the last 64KB (plus header) of the file.
		$startPos = $this->getFileLength() - 65536 - $structSize;
		if ( $startPos < 0 ) {
			$startPos = 0;
		}

		$block = $this->getBlock( $startPos );
		$sigPos = strrpos( $block, "PK\x05\x06" );
		if ( $sigPos === false ) {
			$this->error( 'zip-wrong-format',
				"zip file lacks EOCDR signature. It probably isn't a zip file." );
		}

		$this->eocdr = $this->unpack( substr( $block, $sigPos ), $info );
		$this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length'];

		// The record, including its comment, must end exactly at end of file
		if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) {
			$this->error( 'zip-bad', 'trailing bytes after the end of the file comment' );
		}
		if ( $this->eocdr['disk'] !== 0
			|| $this->eocdr['CD start disk'] !== 0 )
		{
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' );
		}
		$this->eocdr += $this->unpack(
			$block,
			array( 'file comment' => array( 'string', $this->eocdr['file comment length'] ) ),
			$sigPos + $structSize );
		$this->eocdr['position'] = $startPos + $sigPos;
	}

	/**
	 * Read the header called the "ZIP64 end of central directory locator". An
	 * error will be raised if it does not exist.
	 *
	 * The locator is expected immediately before the end of central directory
	 * record. The result is stored in $this->eocdr64Locator.
	 */
	public function readZip64EndOfCentralDirectoryLocator() {
		$info = array(
			'signature' => array( 'string', 4 ),
			'eocdr64 start disk' => 4,
			'eocdr64 offset' => 8,
			'number of disks' => 4,
		);
		$structSize = $this->getStructSize( $info );

		$block = $this->getBlock( $this->getFileLength() - $this->eocdr['EOCDR size']
			- $structSize, $structSize );
		$this->eocdr64Locator = $data = $this->unpack( $block, $info );

		if ( $data['signature'] !== "PK\x06\x07" ) {
			// Note: Java will allow this and continue to read the
			// EOCDR64, so we have to reject the upload, we can't
			// just use the EOCDR header instead.
			$this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' );
		}
	}

	/**
	 * Read the header called the "ZIP64 end of central directory record". It
	 * may replace the regular "end of central directory record" in ZIP64 files.
	 *
	 * Requires $this->eocdr64Locator to have been read. The result is stored
	 * in $this->eocdr64.
	 */
	public function readZip64EndOfCentralDirectoryRecord() {
		// NOTE(review): ZIP writers commonly appear to store 1 in "number of
		// disks" for a single-disk archive, in which case this check would
		// reject them. Confirm against the ZIP specification before relying
		// on ZIP64 mode.
		if ( $this->eocdr64Locator['eocdr64 start disk'] != 0
			|| $this->eocdr64Locator['number of disks'] != 0 )
		{
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' );
		}

		$info = array(
			'signature' => array( 'string', 4 ),
			'EOCDR64 size' => 8,
			'version made by' => 2,
			'version needed' => 2,
			'disk' => 4,
			'CD start disk' => 4,
			'CD entries this disk' => 8,
			'CD entries total' => 8,
			'CD size' => 8,
			'CD offset' => 8
		);
		$structSize = $this->getStructSize( $info );
		$block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize );
		$this->eocdr64 = $data = $this->unpack( $block, $info );
		if ( $data['signature'] !== "PK\x06\x06" ) {
			$this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' );
		}
		if ( $data['disk'] !== 0
			|| $data['CD start disk'] !== 0 )
		{
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' );
		}
	}

	/**
	 * Find the location of the central directory, as would be seen by a
	 * non-ZIP64 reader.
	 *
	 * @return array List containing the offset and the size of the central
	 *   directory, in that order.
	 */
	public function findOldCentralDirectory() {
		$size = $this->eocdr['CD size'];
		$offset = $this->eocdr['CD offset'];
		$endPos = $this->eocdr['position'];

		// Some readers use the EOCDR position instead of the offset field
		// to find the directory, so to be safe, we check if they both agree.
		if ( $offset + $size != $endPos ) {
			$this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
				'of central directory record' );
		}
		return array( $offset, $size );
	}

	/**
	 * Find the location of the central directory, as would be seen by a
	 * ZIP64-compliant reader.
	 *
	 * @return array List containing the offset and the size of the central
	 *   directory, in that order.
	 */
	public function findZip64CentralDirectory() {
		// The spec is ambiguous about the exact rules of precedence between the
		// ZIP64 headers and the original headers. Here we follow zip_util.c
		// from OpenJDK 7.
		$size = $this->eocdr['CD size'];
		$offset = $this->eocdr['CD offset'];
		$numEntries = $this->eocdr['CD entries total'];
		$endPos = $this->eocdr['position'];
		if ( $size == 0xffffffff
			|| $offset == 0xffffffff
			|| $numEntries == 0xffff )
		{
			$this->readZip64EndOfCentralDirectoryLocator();

			if ( isset( $this->eocdr64Locator['eocdr64 offset'] ) ) {
				$this->readZip64EndOfCentralDirectoryRecord();
				if ( isset( $this->eocdr64['CD offset'] ) ) {
					$size = $this->eocdr64['CD size'];
					$offset = $this->eocdr64['CD offset'];
					$endPos = $this->eocdr64Locator['eocdr64 offset'];
				}
			}
		}
		// Some readers use the EOCDR position instead of the offset field
		// to find the directory, so to be safe, we check if they both agree.
		if ( $offset + $size != $endPos ) {
			$this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
				'of central directory record' );
		}
		return array( $offset, $size );
	}

	/**
	 * Read the central directory at the given location, and invoke the
	 * callback once for each entry found in it.
	 *
	 * @param $offset int The file offset of the first directory entry
	 * @param $size int The total size of the directory in bytes
	 */
	public function readCentralDirectory( $offset, $size ) {
		$block = $this->getBlock( $offset, $size );

		$fixedInfo = array(
			'signature' => array( 'string', 4 ),
			'version made by' => 2,
			'version needed' => 2,
			'general bits' => 2,
			'compression method' => 2,
			'mod time' => 2,
			'mod date' => 2,
			'crc-32' => 4,
			'compressed size' => 4,
			'uncompressed size' => 4,
			'name length' => 2,
			'extra field length' => 2,
			'comment length' => 2,
			'disk number start' => 2,
			'internal attrs' => 2,
			'external attrs' => 4,
			'local header offset' => 4,
		);
		$fixedSize = $this->getStructSize( $fixedInfo );

		$pos = 0;
		while ( $pos < $size ) {
			$data = $this->unpack( $block, $fixedInfo, $pos );
			$pos += $fixedSize;

			if ( $data['signature'] !== "PK\x01\x02" ) {
				$this->error( 'zip-bad', 'Invalid signature found in directory entry' );
			}

			// The fixed part is followed by three variable-length strings
			$variableInfo = array(
				'name' => array( 'string', $data['name length'] ),
				'extra field' => array( 'string', $data['extra field length'] ),
				'comment' => array( 'string', $data['comment length'] ),
			);
			$data += $this->unpack( $block, $variableInfo, $pos );
			$pos += $this->getStructSize( $variableInfo );

			if ( $this->zip64 && (
				   $data['compressed size'] == 0xffffffff
				|| $data['uncompressed size'] == 0xffffffff
				|| $data['local header offset'] == 0xffffffff ) )
			{
				$zip64Data = $this->unpackZip64Extra( $data['extra field'] );
				if ( $zip64Data ) {
					// Members of the ZIP64 extra field take precedence
					$data = $zip64Data + $data;
				}
			}

			if ( $this->testBit( $data['general bits'], self::GENERAL_CD_ENCRYPTED ) ) {
				$this->error( 'zip-unsupported', 'central directory encryption is not supported' );
			}

			// Convert the timestamp into MediaWiki format
			// For the format, please see the MS-DOS 2.0 Programmer's Reference,
			// pages 3-5 and 3-6.
			$time = $data['mod time'];
			$date = $data['mod date'];

			$year = 1980 + ( $date >> 9 );
			$month = ( $date >> 5 ) & 15;
			$day = $date & 31;
			$hour = ( $time >> 11 ) & 31;
			$minute = ( $time >> 5 ) & 63;
			$second = ( $time & 31 ) * 2;
			$timestamp = sprintf( "%04d%02d%02d%02d%02d%02d",
				$year, $month, $day, $hour, $minute, $second );

			// Convert the character set in the file name
			if ( !function_exists( 'iconv' )
				|| $this->testBit( $data['general bits'], self::GENERAL_UTF8 ) )
			{
				$name = $data['name'];
			} else {
				$name = iconv( 'CP437', 'UTF-8', $data['name'] );
			}

			// Compile a data array for the user, with a sensible format
			$userData = array(
				'name' => $name,
				'mtime' => $timestamp,
				'size' => $data['uncompressed size'],
			);
			call_user_func( $this->callback, $userData );
		}
	}

	/**
	 * Interpret ZIP64 "extra field" data and return an associative array.
	 *
	 * @param $extraField string The raw "extra field" of a directory entry
	 * @return array|bool The unpacked ZIP64 members, or false if the extra
	 *   field contains no ZIP64 block.
	 */
	public function unpackZip64Extra( $extraField ) {
		$extraHeaderInfo = array(
			'id' => 2,
			'size' => 2,
		);
		$extraHeaderSize = $this->getStructSize( $extraHeaderInfo );

		$zip64ExtraInfo = array(
			'uncompressed size' => 8,
			'compressed size' => 8,
			'local header offset' => 8,
			'disk number start' => 4,
		);

		// The extra field is a sequence of (id, size, data) blocks
		$extraPos = 0;
		while ( $extraPos < strlen( $extraField ) ) {
			$extra = $this->unpack( $extraField, $extraHeaderInfo, $extraPos );
			$extraPos += $extraHeaderSize;
			$extra += $this->unpack( $extraField,
				array( 'data' => array( 'string', $extra['size'] ) ),
				$extraPos );
			$extraPos += $extra['size'];

			if ( $extra['id'] == self::ZIP64_EXTRA_HEADER ) {
				return $this->unpack( $extra['data'], $zip64ExtraInfo );
			}
		}

		return false;
	}

	/**
	 * Get the length of the file. The value is cached after the first call.
	 *
	 * @return int
	 */
	public function getFileLength() {
		if ( $this->fileLength === null ) {
			$stat = fstat( $this->file );
			$this->fileLength = $stat['size'];
		}
		return $this->fileLength;
	}

	/**
	 * Get the file contents from a given offset. If there are not enough bytes
	 * in the file to satisfy the request, an exception will be thrown.
	 *
	 * @param $start int The byte offset of the start of the block.
	 * @param $length int The number of bytes to return. If omitted, the remainder
	 *    of the file will be returned.
	 *
	 * @return string
	 */
	public function getBlock( $start, $length = null ) {
		$fileLength = $this->getFileLength();
		if ( $start < 0 || $start >= $fileLength ) {
			$this->error( 'zip-bad', "getBlock() requested position $start, " .
				"file length is $fileLength" );
		}
		if ( $length === null ) {
			$length = $fileLength - $start;
		}
		$end = $start + $length;
		if ( $end > $fileLength ) {
			$this->error( 'zip-bad', "getBlock() requested end position $end, " .
				"file length is $fileLength" );
		}
		if ( $length == 0 ) {
			// Nothing to read
			return '';
		}

		// $endSeg is the index of the first segment which is NOT needed, so
		// the loop below stops before it rather than reading a spare segment.
		$startSeg = (int)floor( $start / self::SEGSIZE );
		$endSeg = (int)ceil( $end / self::SEGSIZE );

		$block = '';
		for ( $segIndex = $startSeg; $segIndex < $endSeg; $segIndex++ ) {
			$block .= $this->getSegment( $segIndex );
		}

		$block = substr( $block,
			$start - $startSeg * self::SEGSIZE,
			$length );

		if ( strlen( $block ) < $length ) {
			$this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' );
		}

		return $block;
	}

	/**
	 * Get a section of the file starting at position $segIndex * self::SEGSIZE,
	 * of length self::SEGSIZE. The result is cached. This is a helper function
	 * for getBlock().
	 *
	 * If there are not enough bytes in the file to satisfy the request, the
	 * return value will be truncated. If a request is made for a segment beyond
	 * the end of the file, an empty string will be returned.
	 *
	 * @param $segIndex int The zero-based segment index
	 * @return string
	 */
	public function getSegment( $segIndex ) {
		if ( !isset( $this->buffer[$segIndex] ) ) {
			$bytePos = $segIndex * self::SEGSIZE;
			if ( $bytePos >= $this->getFileLength() ) {
				$this->buffer[$segIndex] = '';
				return '';
			}
			if ( fseek( $this->file, $bytePos ) ) {
				$this->error( 'zip-bad', "seek to $bytePos failed" );
			}
			$seg = fread( $this->file, self::SEGSIZE );
			if ( $seg === false ) {
				$this->error( 'zip-bad', "read from $bytePos failed" );
			}
			$this->buffer[$segIndex] = $seg;
		}
		return $this->buffer[$segIndex];
	}

	/**
	 * Get the size of a structure in bytes. See unpack() for the format of $struct.
	 *
	 * @param $struct array The structure definition
	 * @return int
	 */
	public function getStructSize( $struct ) {
		$size = 0;
		foreach ( $struct as $type ) {
			if ( is_array( $type ) ) {
				// array( type name, field size )
				$size += $type[1];
			} else {
				$size += $type;
			}
		}
		return $size;
	}

	/**
	 * Unpack a binary structure. This is like the built-in unpack() function
	 * except nicer.
	 *
	 * @param $string string The binary data input
	 *
	 * @param $struct array An associative array giving structure members and their
	 *    types. In the key is the field name. The value may be either an
	 *    integer, in which case the field is a little-endian unsigned integer
	 *    encoded in the given number of bytes, or an array, in which case the
	 *    first element of the array is the type name, and the subsequent
	 *    elements are type-dependent parameters. Only one such type is defined:
	 *       - "string": The second array element gives the length of string.
	 *          Not null terminated.
	 *
	 * @param $offset int The offset into the string at which to start unpacking.
	 *
	 * @return array Unpacked associative array. Note that large integers in the input
	 *    may be represented as floating point numbers in the return value, so
	 *    the use of weak comparison is advised.
	 */
	public function unpack( $string, $struct, $offset = 0 ) {
		$size = $this->getStructSize( $struct );
		if ( $offset + $size > strlen( $string ) ) {
			$this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' );
		}

		$data = array();
		$pos = $offset;
		foreach ( $struct as $key => $type ) {
			if ( is_array( $type ) ) {
				list( $typeName, $fieldSize ) = $type;
				switch ( $typeName ) {
				case 'string':
					$data[$key] = substr( $string, $pos, $fieldSize );
					$pos += $fieldSize;
					break;
				default:
					throw new MWException( __METHOD__.": invalid type \"$typeName\"" );
				}
			} else {
				// Unsigned little-endian integer
				$length = intval( $type );

				// Calculate the value, most significant byte first. Use an
				// algorithm which automatically upgrades the value to
				// floating point if necessary.
				$value = 0;
				for ( $i = $length - 1; $i >= 0; $i-- ) {
					$value *= 256;
					$value += ord( $string[$pos + $i] );
				}

				// Throw an exception if there was loss of precision
				if ( $value > pow( 2, 52 ) ) {
					$this->error( 'zip-unsupported', 'number too large to be stored in a double. ' .
						'This could happen if we tried to unpack a 64-bit structure ' .
						'at an invalid location.' );
				}
				$data[$key] = $value;
				$pos += $length;
			}
		}

		return $data;
	}

	/**
	 * Returns a bit from a given position in an integer value, converted to
	 * boolean.
	 *
	 * @param $value integer
	 * @param $bitIndex int The index of the bit, where 0 is the LSB.
	 * @return bool
	 */
	public function testBit( $value, $bitIndex ) {
		return (bool)( ( $value >> $bitIndex ) & 1 );
	}

	/**
	 * Debugging helper function which dumps a string in hexdump -C format.
	 *
	 * Each output row shows an 8-digit offset, up to 16 bytes in hexadecimal
	 * and the same bytes as printable characters. The output is printed.
	 *
	 * @param $s string The data to dump
	 */
	public function hexDump( $s ) {
		$n = strlen( $s );
		for ( $i = 0; $i < $n; $i += 16 ) {
			printf( "%08X ", $i );
			for ( $j = 0; $j < 16; $j++ ) {
				print " ";
				if ( $j == 8 ) {
					// Extra gap between the two groups of eight bytes
					print " ";
				}
				if ( $i + $j >= $n ) {
					// Pad to the width of a two-digit byte so that the
					// character column of a short last row stays aligned
					print "  ";
				} else {
					printf( "%02X", ord( $s[$i + $j] ) );
				}
			}

			print "  |";
			for ( $j = 0; $j < 16; $j++ ) {
				if ( $i + $j >= $n ) {
					print " ";
				} elseif ( ctype_print( $s[$i + $j] ) ) {
					print $s[$i + $j];
				} else {
					print '.';
				}
			}
			print "|\n";
		}
	}
}
674
675 /**
676 * Internal exception class. Will be caught by private code.
677 */
class ZipDirectoryReaderError extends Exception {
	/** The error code given to the constructor, e.g. "zip-bad" */
	public $code;

	/**
	 * @param $code mixed The error code; it is also embedded in the exception
	 *   message.
	 */
	public function __construct( $code ) {
		$this->code = $code;
		parent::__construct( 'ZipDirectoryReader error: ' . $code );
	}

	/**
	 * Get the error code this exception was constructed with.
	 *
	 * @return mixed
	 */
	public function getErrorCode() {
		return $this->code;
	}
}