Merge "Make DBAccessBase use DBConnRef, rename $wiki, and hide getLoadBalancer()"
[lhc/web/wiklou.git] / includes / utils / ZipDirectoryReader.php
1 <?php
2 /**
3 * ZIP file directories reader, for the purposes of upload verification.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 */
22
23 /**
24 * A class for reading ZIP file directories, for the purposes of upload
25 * verification.
26 *
27 * Only a functional interface is provided: ZipDirectoryReader::read(). No access is
28 * given to object instances.
29 */
30 class ZipDirectoryReader {
/**
 * Walk the central directory of a ZIP archive and report every entry found
 * in it to a caller-supplied function.
 *
 * This reader exists for verification, so suspicious or ambiguous input
 * produces an error; no attempt is made to emulate what some standard
 * unzip implementation would do with it.
 *
 * @param string $fileName Name of the archive file to inspect
 * @param callable $callback Invoked once per directory entry, each time with
 *   a single associative array holding:
 *
 *    - name: The file name. Directories conventionally have a trailing
 *      slash.
 *
 *    - mtime: The file modification time, in MediaWiki 14-char format
 *
 *    - size: The uncompressed file size
 *
 * @param array $options Read options, keyed by option name. Currently
 *   recognised:
 *
 *    - zip64: When true, behave like a library with ZIP64 support, such as
 *      OpenJDK 7. When false, behave like a library that knows nothing
 *      about ZIP64.
 *
 *      NOTE: The ZIP64 code is untested and probably doesn't work. It
 *      turned out to be easier to just reject ZIP64 archive uploads, since
 *      they are likely to be very rare. Confirming the safety of a ZIP64
 *      file is fairly complex: a valid ZIP64 file normally looks ambiguous
 *      and broken to a non-ZIP64 reader, and working out what such readers
 *      will make of it is not trivial.
 *
 * @return Status A Status object. These fatal errors are defined:
 *
 *    - zip-file-open-error: The file could not be opened.
 *
 *    - zip-wrong-format: The file does not appear to be a ZIP file.
 *
 *    - zip-bad: There was something wrong or ambiguous about the file
 *      data.
 *
 *    - zip-unsupported: The ZIP file uses features which
 *      ZipDirectoryReader does not support.
 *
 * The default messages for those fatal errors are worded for the upload
 * verification use case. When a fatal error is returned, further detail is
 * written to the debug log.
 *
 * The callback may already have been called any number of times before a
 * fatal error is returned; in that case whatever it received should be
 * discarded.
 */
public static function read( $fileName, $callback, $options = [] ) {
    $reader = new self( $fileName, $callback, $options );
    $status = $reader->execute();

    return $status;
}
93
/** Name of the archive file being read */
protected $fileName;

/** File resource for the opened archive */
protected $file;

/** Length of the file once it has been looked up; null before that */
protected $fileLength;

/** Cache of the file contents, held as fixed-size segments */
protected $buffer;

/** Callback that receives the data for each file in the archive */
protected $callback;

/** Whether ZIP64 mode is enabled */
protected $zip64 = false;

/** Stored "end of central directory record" header */
protected $eocdr;

/** Stored "ZIP64 end of central directory record" header */
protected $eocdr64;

/** Stored "ZIP64 end of central directory locator" header */
protected $eocdr64Locator;

/** Initialised to an empty array by execute() */
protected $data;

/** "Extra field" ID that marks ZIP64 data in a central directory entry */
const ZIP64_EXTRA_HEADER = 0x0001;

/** Size of one segment of the file contents cache */
const SEGSIZE = 16384;

/** Bit index, within the "general field", that flags UTF-8 file names */
const GENERAL_UTF8 = 11;

/** Bit index, within the "general field", that flags central directory encryption */
const GENERAL_CD_ENCRYPTED = 13;
128
/**
 * Remember the read parameters. Nothing is opened or read here; that is
 * left to execute().
 *
 * @param string $fileName
 * @param callable $callback
 * @param array $options Only the "zip64" key is consulted
 */
protected function __construct( $fileName, $callback, $options ) {
    // Keep the declared default unless the caller supplied a value.
    $this->zip64 = isset( $options['zip64'] ) ? $options['zip64'] : $this->zip64;

    $this->callback = $callback;
    $this->fileName = $fileName;
}
142
/**
 * Read the directory according to settings in $this.
 *
 * Opens the file, locates the end of central directory record, then walks
 * the central directory, which invokes the callback for each entry. The
 * file handle is always closed before returning or propagating an
 * exception.
 *
 * @return Status Good on success. Fatal with 'zip-file-open-error' when the
 *   file cannot be opened, or with the code carried by a
 *   ZipDirectoryReaderError raised while reading.
 */
function execute() {
    // Explicit binary mode: the archive is parsed byte by byte.
    $this->file = fopen( $this->fileName, 'rb' );
    $this->data = [];
    if ( !$this->file ) {
        return Status::newFatal( 'zip-file-open-error' );
    }

    $status = Status::newGood();
    try {
        $this->readEndOfCentralDirectoryRecord();
        if ( $this->zip64 ) {
            list( $offset, $size ) = $this->findZip64CentralDirectory();
            $this->readCentralDirectory( $offset, $size );
        } else {
            if ( $this->eocdr['CD size'] == 0xffffffff
                || $this->eocdr['CD offset'] == 0xffffffff
                || $this->eocdr['CD entries total'] == 0xffff
            ) {
                $this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' .
                    'but we are in legacy mode. Rejecting this upload is necessary to avoid ' .
                    'opening vulnerabilities on clients using OpenJDK 7 or later.' );
            }

            list( $offset, $size ) = $this->findOldCentralDirectory();
            $this->readCentralDirectory( $offset, $size );
        }
    } catch ( ZipDirectoryReaderError $e ) {
        $status->fatal( $e->getErrorCode() );
    } finally {
        // Runs for every exit path, including exceptions that are not
        // ZipDirectoryReaderError (for example one thrown by the callback),
        // so the handle is never leaked.
        fclose( $this->file );
    }

    return $status;
}
182
/**
 * Write a message to the debug log, then abort by throwing.
 *
 * @param mixed $code Error code handed to the thrown exception
 * @param string $debugMessage Detail that only goes to the debug log
 * @throws ZipDirectoryReaderError Always
 */
function error( $code, $debugMessage ) {
    $logLine = __CLASS__ . ": Fatal error: $debugMessage\n";
    wfDebug( $logLine );

    throw new ZipDirectoryReaderError( $code );
}
193
/**
 * Locate and parse the "end of central directory record" (EOCDR), the
 * header the ZIP spec places after the central directory, and store it in
 * $this->eocdr.
 *
 * Besides the unpacked fields, $this->eocdr receives:
 *  - 'EOCDR size': fixed header size plus the file comment length
 *  - 'file comment': the comment bytes
 *  - 'position': offset of the record's signature within the file
 *
 * An error is raised when the file is empty, when no signature is found,
 * when the record does not end exactly at the end of the file, or when the
 * disk fields are non-zero.
 */
function readEndOfCentralDirectoryRecord() {
    $layout = [
        'signature' => 4,
        'disk' => 2,
        'CD start disk' => 2,
        'CD entries this disk' => 2,
        'CD entries total' => 2,
        'CD size' => 4,
        'CD offset' => 4,
        'file comment length' => 2,
    ];
    $fixedLength = $this->getStructSize( $layout );
    $fileLength = $this->getFileLength();

    if ( $fileLength === 0 ) {
        $this->error( 'zip-wrong-format', "The file is empty." );
    }

    // Search the last 65536 bytes plus one fixed-size header, clamped to
    // the start of the file.
    $searchStart = max( 0, $fileLength - 65536 - $fixedLength );

    $tail = $this->getBlock( $searchStart );
    $sigOffset = strrpos( $tail, "PK\x05\x06" );
    if ( $sigOffset === false ) {
        $this->error( 'zip-wrong-format',
            "zip file lacks EOCDR signature. It probably isn't a zip file." );
    }

    $this->eocdr = $this->unpack( substr( $tail, $sigOffset ), $layout );
    $recordLength = $fixedLength + $this->eocdr['file comment length'];
    $this->eocdr['EOCDR size'] = $recordLength;

    if ( strlen( $tail ) - $sigOffset != $recordLength ) {
        // T40432: MS binary documents frequently embed ZIP files
        $this->error( 'zip-wrong-format', 'there is a ZIP signature but it is not at ' .
            'the end of the file. It could be an OLE file with a ZIP file embedded.' );
    }

    $singleDisk = $this->eocdr['disk'] === 0 && $this->eocdr['CD start disk'] === 0;
    if ( !$singleDisk ) {
        $this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' );
    }

    $commentLayout = [
        'file comment' => [ 'string', $this->eocdr['file comment length'] ],
    ];
    $this->eocdr += $this->unpack( $tail, $commentLayout, $sigOffset + $fixedLength );
    $this->eocdr['position'] = $searchStart + $sigOffset;
}
246
/**
 * Read the header called the "ZIP64 end of central directory locator" and
 * store it in $this->eocdr64Locator. An error will be raised if it does not
 * exist.
 *
 * The locator is expected directly before the end of central directory
 * record, so $this->eocdr must already have been populated.
 */
function readZip64EndOfCentralDirectoryLocator() {
    $info = [
        'signature' => [ 'string', 4 ],
        'eocdr64 start disk' => 4,
        'eocdr64 offset' => 8,
        'number of disks' => 4,
    ];
    $structSize = $this->getStructSize( $info );

    $start = $this->getFileLength() - $this->eocdr['EOCDR size'] - $structSize;
    if ( $start < 0 ) {
        // The file is too short to hold a locator in front of the EOCDR.
        // Reject it here instead of handing a negative offset to getBlock().
        $this->error( 'zip-bad', 'file is too short to contain a ' .
            'Zip64 end of central directory locator' );
    }
    $block = $this->getBlock( $start, $structSize );
    $this->eocdr64Locator = $data = $this->unpack( $block, $info );

    if ( $data['signature'] !== "PK\x06\x07" ) {
        // Note: Java will allow this and continue to read the
        // EOCDR64, so we have to reject the upload, we can't
        // just use the EOCDR header instead.
        $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' );
    }
}
271
/**
 * Read the header called the "ZIP64 end of central directory record" and
 * store it in $this->eocdr64. It may replace the regular "end of central
 * directory record" in ZIP64 files.
 *
 * The record is read from the offset given by $this->eocdr64Locator, which
 * must already have been populated. An error is raised for multi-disk
 * archives or a wrong signature.
 */
function readZip64EndOfCentralDirectoryRecord() {
    // The locator's last field is the TOTAL number of disks, which is 1 for
    // an ordinary single-disk archive. Only a larger count indicates a
    // spanned archive; 0 is tolerated as well.
    if ( $this->eocdr64Locator['eocdr64 start disk'] != 0
        || $this->eocdr64Locator['number of disks'] > 1
    ) {
        $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' );
    }

    $info = [
        'signature' => [ 'string', 4 ],
        'EOCDR64 size' => 8,
        'version made by' => 2,
        'version needed' => 2,
        'disk' => 4,
        'CD start disk' => 4,
        'CD entries this disk' => 8,
        'CD entries total' => 8,
        'CD size' => 8,
        'CD offset' => 8
    ];
    $structSize = $this->getStructSize( $info );
    $block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize );
    $this->eocdr64 = $data = $this->unpack( $block, $info );
    if ( $data['signature'] !== "PK\x06\x06" ) {
        $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' );
    }
    // These are disk indexes (not counts), so a single-disk archive has 0.
    if ( $data['disk'] !== 0
        || $data['CD start disk'] !== 0
    ) {
        $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' );
    }
}
307
/**
 * Work out where the central directory is, the way a reader without ZIP64
 * support would see it.
 *
 * @return array Two-element list: the directory offset, then its size.
 */
function findOldCentralDirectory() {
    $cdOffset = $this->eocdr['CD offset'];
    $cdSize = $this->eocdr['CD size'];
    $eocdrPosition = $this->eocdr['position'];

    // Some readers locate the directory from the EOCDR position and not
    // from the offset field, so insist that the two agree.
    $cdEnd = $cdOffset + $cdSize;
    if ( $cdEnd != $eocdrPosition ) {
        $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
            'of central directory record' );
    }

    return [ $cdOffset, $cdSize ];
}
328
/**
 * Work out where the central directory is, the way a ZIP64-compliant
 * reader would see it.
 *
 * @return array Two-element list: the directory offset, then its size.
 */
function findZip64CentralDirectory() {
    // The spec is ambiguous about the exact rules of precedence between the
    // ZIP64 headers and the original headers. Here we follow zip_util.c
    // from OpenJDK 7.
    $cdSize = $this->eocdr['CD size'];
    $cdOffset = $this->eocdr['CD offset'];
    $entryCount = $this->eocdr['CD entries total'];
    $cdEnd = $this->eocdr['position'];

    // An all-ones value in any of these fields sends us to the ZIP64 headers.
    $pointsToZip64 = $cdSize == 0xffffffff
        || $cdOffset == 0xffffffff
        || $entryCount == 0xffff;

    if ( $pointsToZip64 ) {
        $this->readZip64EndOfCentralDirectoryLocator();

        if ( isset( $this->eocdr64Locator['eocdr64 offset'] ) ) {
            $this->readZip64EndOfCentralDirectoryRecord();

            if ( isset( $this->eocdr64['CD offset'] ) ) {
                $cdSize = $this->eocdr64['CD size'];
                $cdOffset = $this->eocdr64['CD offset'];
                $cdEnd = $this->eocdr64Locator['eocdr64 offset'];
            }
        }
    }

    // Some readers locate the directory from the EOCDR position and not
    // from the offset field, so insist that the two agree.
    if ( $cdOffset + $cdSize != $cdEnd ) {
        $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
            'of central directory record' );
    }

    return [ $cdOffset, $cdSize ];
}
367
/**
 * Parse every entry of the central directory stored at the given location
 * and pass a summary of each one (name, mtime, size) to the callback.
 *
 * @param int $offset Offset of the central directory within the file
 * @param int $size Size of the central directory in bytes
 * @suppress PhanTypeInvalidLeftOperandOfIntegerOp
 */
function readCentralDirectory( $offset, $size ) {
    $directory = $this->getBlock( $offset, $size );

    $headerLayout = [
        'signature' => [ 'string', 4 ],
        'version made by' => 2,
        'version needed' => 2,
        'general bits' => 2,
        'compression method' => 2,
        'mod time' => 2,
        'mod date' => 2,
        'crc-32' => 4,
        'compressed size' => 4,
        'uncompressed size' => 4,
        'name length' => 2,
        'extra field length' => 2,
        'comment length' => 2,
        'disk number start' => 2,
        'internal attrs' => 2,
        'external attrs' => 4,
        'local header offset' => 4,
    ];
    $headerLength = $this->getStructSize( $headerLayout );

    for ( $cursor = 0; $cursor < $size; ) {
        // Fixed-size part of the entry
        $entry = $this->unpack( $directory, $headerLayout, $cursor );
        $cursor += $headerLength;

        if ( $entry['signature'] !== "PK\x01\x02" ) {
            $this->error( 'zip-bad', 'Invalid signature found in directory entry' );
        }

        // Variable-size part, with lengths taken from the fixed part
        $tailLayout = [
            'name' => [ 'string', $entry['name length'] ],
            'extra field' => [ 'string', $entry['extra field length'] ],
            'comment' => [ 'string', $entry['comment length'] ],
        ];
        $entry += $this->unpack( $directory, $tailLayout, $cursor );
        $cursor += $this->getStructSize( $tailLayout );

        // In ZIP64 mode an all-ones field means the value may be found in
        // the ZIP64 extra field; values found there take precedence.
        $wantsZip64Extra = $this->zip64 && (
            $entry['compressed size'] == 0xffffffff
            || $entry['uncompressed size'] == 0xffffffff
            || $entry['local header offset'] == 0xffffffff
        );
        if ( $wantsZip64Extra ) {
            $overrides = $this->unpackZip64Extra( $entry['extra field'] );
            if ( $overrides ) {
                $entry = $overrides + $entry;
            }
        }

        if ( $this->testBit( $entry['general bits'], self::GENERAL_CD_ENCRYPTED ) ) {
            $this->error( 'zip-unsupported', 'central directory encryption is not supported' );
        }

        // Convert the timestamp into MediaWiki format
        // For the format, please see the MS-DOS 2.0 Programmer's Reference,
        // pages 3-5 and 3-6.
        $dosDate = $entry['mod date'];
        $dosTime = $entry['mod time'];
        $timestamp = sprintf( "%04d%02d%02d%02d%02d%02d",
            1980 + ( $dosDate >> 9 ),
            ( $dosDate >> 5 ) & 15,
            $dosDate & 31,
            ( $dosTime >> 11 ) & 31,
            ( $dosTime >> 5 ) & 63,
            ( $dosTime & 31 ) * 2
        );

        // Names flagged as UTF-8 are passed through; all others are
        // converted from CP437.
        $name = $this->testBit( $entry['general bits'], self::GENERAL_UTF8 )
            ? $entry['name']
            : iconv( 'CP437', 'UTF-8', $entry['name'] );

        // Hand the caller a small array with a sensible format
        call_user_func( $this->callback, [
            'name' => $name,
            'mtime' => $timestamp,
            'size' => $entry['uncompressed size'],
        ] );
    }
}
461
/**
 * Scan "extra field" data for the ZIP64 record and decode it.
 *
 * The extra field is treated as a sequence of (id, size, data) records. The
 * first record whose id equals ZIP64_EXTRA_HEADER is unpacked and returned.
 *
 * @param string $extraField Raw extra field bytes of a directory entry
 * @return array|bool Associative array of the ZIP64 values, or false when
 *   no ZIP64 record is present
 */
function unpackZip64Extra( $extraField ) {
    $recordHeaderLayout = [
        'id' => 2,
        'size' => 2,
    ];
    $recordHeaderLength = $this->getStructSize( $recordHeaderLayout );

    $zip64Layout = [
        'uncompressed size' => 8,
        'compressed size' => 8,
        'local header offset' => 8,
        'disk number start' => 4,
    ];

    $fieldLength = strlen( $extraField );
    $cursor = 0;
    while ( $cursor < $fieldLength ) {
        $record = $this->unpack( $extraField, $recordHeaderLayout, $cursor );
        $cursor += $recordHeaderLength;

        $payloadLayout = [ 'data' => [ 'string', $record['size'] ] ];
        $payload = $this->unpack( $extraField, $payloadLayout, $cursor );
        $cursor += $record['size'];

        if ( $record['id'] == self::ZIP64_EXTRA_HEADER ) {
            return $this->unpack( $payload['data'], $zip64Layout );
        }
    }

    return false;
}
497
/**
 * Return the length of the opened file, looking it up with fstat() on the
 * first call and reusing the stored value afterwards.
 *
 * @return int
 */
function getFileLength() {
    if ( $this->fileLength !== null ) {
        return $this->fileLength;
    }

    $info = fstat( $this->file );
    $this->fileLength = $info['size'];

    return $this->fileLength;
}
510
/**
 * Get the file contents from a given offset. If there are not enough bytes
 * in the file to satisfy the request, an exception will be thrown.
 *
 * The data is assembled from the cached segments returned by getSegment(),
 * so only the segments that overlap the requested range are read.
 *
 * @param int $start The byte offset of the start of the block.
 * @param int|null $length The number of bytes to return. If omitted, the remainder
 *   of the file will be returned.
 *
 * @throws ZipDirectoryReaderError With code 'zip-bad' when the requested
 *   range is not entirely inside the file
 * @return string
 */
function getBlock( $start, $length = null ) {
    $fileLength = $this->getFileLength();
    // A negative start can arise from offset arithmetic on a short file;
    // reject it here instead of relying on a later seek failure.
    if ( $start < 0 || $start >= $fileLength ) {
        $this->error( 'zip-bad', "getBlock() requested position $start, " .
            "file length is $fileLength" );
    }
    if ( $length === null ) {
        $length = $fileLength - $start;
    }
    $end = $start + $length;
    if ( $end > $fileLength ) {
        $this->error( 'zip-bad', "getBlock() requested end position $end, " .
            "file length is $fileLength" );
    }

    // Integer segment indexes: $startSeg is the first segment needed and
    // $endSeg is one past the last segment needed (ceil() already rounds up
    // to an exclusive bound).
    $startSeg = (int)floor( $start / self::SEGSIZE );
    $endSeg = (int)ceil( $end / self::SEGSIZE );

    $block = '';
    for ( $segIndex = $startSeg; $segIndex < $endSeg; $segIndex++ ) {
        $block .= $this->getSegment( $segIndex );
    }

    // Trim the leading bytes of the first segment and anything past $length.
    $block = substr( $block,
        $start - $startSeg * self::SEGSIZE,
        $length );

    if ( strlen( $block ) < $length ) {
        $this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' );
    }

    return $block;
}
553
/**
 * Return the part of the file that starts at $segIndex * self::SEGSIZE and
 * is at most self::SEGSIZE bytes long. Results are cached in $this->buffer.
 * This is a helper for getBlock().
 *
 * When the file does not hold a full segment at that position the result
 * is shorter than self::SEGSIZE, and a segment that starts at or beyond the
 * end of the file yields an empty string.
 *
 * @param int $segIndex
 *
 * @return string
 */
function getSegment( $segIndex ) {
    if ( isset( $this->buffer[$segIndex] ) ) {
        return $this->buffer[$segIndex];
    }

    $bytePos = $segIndex * self::SEGSIZE;
    if ( $bytePos >= $this->getFileLength() ) {
        // Nothing to read there; remember that too.
        $this->buffer[$segIndex] = '';

        return '';
    }

    if ( fseek( $this->file, $bytePos ) !== 0 ) {
        $this->error( 'zip-bad', "seek to $bytePos failed" );
    }

    $seg = fread( $this->file, self::SEGSIZE );
    if ( $seg === false ) {
        $this->error( 'zip-bad', "read from $bytePos failed" );
    }

    $this->buffer[$segIndex] = $seg;

    return $seg;
}
587
/**
 * Add up the byte sizes of all members of a structure description. The
 * format of $struct is the one accepted by unpack().
 *
 * @param array $struct
 * @return int
 */
function getStructSize( $struct ) {
    $total = 0;
    foreach ( $struct as $fieldType ) {
        // Array members carry their size as the second element; plain
        // members are the size itself.
        $total += is_array( $fieldType ) ? $fieldType[1] : $fieldType;
    }

    return $total;
}
606
/**
 * Decode a binary structure into an associative array. This is like the
 * built-in unpack() function except nicer.
 *
 * @param string $string The binary data input
 *
 * @param array $struct Structure members and their types, keyed by field
 *   name. An integer value means a little-endian unsigned integer encoded
 *   in that many bytes. An array value gives a type name as its first
 *   element followed by type-dependent parameters. Only one such type is
 *   defined:
 *    - "string": The second array element gives the length of string.
 *      Not null terminated.
 *
 * @param int $offset The offset into the string at which to start unpacking.
 *
 * @throws MWException When $struct names an unknown type
 * @throws ZipDirectoryReaderError When the structure does not fit into
 *   $string, or an integer is too large to be represented exactly
 * @return array Unpacked associative array. Note that large integers in the input
 *   may be represented as floating point numbers in the return value, so
 *   the use of weak comparison is advised.
 */
function unpack( $string, $struct, $offset = 0 ) {
    $needed = $this->getStructSize( $struct );
    if ( strlen( $string ) < $offset + $needed ) {
        $this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' );
    }

    $result = [];
    $cursor = $offset;
    foreach ( $struct as $field => $spec ) {
        if ( !is_array( $spec ) ) {
            // Unsigned little-endian integer
            $width = intval( $spec );

            // Accumulate from the most significant byte downwards. The
            // arithmetic upgrades to floating point by itself once the
            // value no longer fits into an integer.
            $number = 0;
            $index = $cursor + $width;
            while ( $index > $cursor ) {
                $index--;
                $number = $number * 256 + ord( $string[$index] );
            }

            // Refuse values that may have lost precision
            if ( $number > 2 ** 52 ) {
                $this->error( 'zip-unsupported', 'number too large to be stored in a double. ' .
                    'This could happen if we tried to unpack a 64-bit structure ' .
                    'at an invalid location.' );
            }

            $result[$field] = $number;
            $cursor += $width;
            continue;
        }

        list( $typeName, $width ) = $spec;
        if ( $typeName != 'string' ) {
            throw new MWException( __METHOD__ . ": invalid type \"$typeName\"" );
        }
        $result[$field] = substr( $string, $cursor, $width );
        $cursor += $width;
    }

    return $result;
}
673
/**
 * Report whether one particular bit of an integer value is set.
 *
 * @param int $value
 * @param int $bitIndex The index of the bit, where 0 is the LSB.
 * @return bool
 */
function testBit( $value, $bitIndex ) {
    // Move the wanted bit into the lowest position, then isolate it.
    $shifted = $value >> $bitIndex;

    return ( $shifted & 1 ) === 1;
}
685
/**
 * Debugging helper which prints a string in a layout modelled on
 * hexdump -C: per 16 input bytes one row with the offset, the bytes in
 * hexadecimal, and the printable characters between "|" marks (other bytes
 * are shown as ".").
 *
 * @param string $s
 */
function hexDump( $s ) {
    $total = strlen( $s );
    for ( $rowStart = 0; $rowStart < $total; $rowStart += 16 ) {
        $hexColumns = '';
        $textColumn = '';

        for ( $col = 0; $col < 16; $col++ ) {
            $hexColumns .= " ";
            if ( $col == 8 ) {
                // Extra gap between the two groups of eight bytes
                $hexColumns .= " ";
            }

            $index = $rowStart + $col;
            if ( $index >= $total ) {
                // Past the end of the input: pad both columns
                $hexColumns .= " ";
                $textColumn .= " ";
                continue;
            }

            $char = $s[$index];
            $hexColumns .= sprintf( "%02X", ord( $char ) );
            $textColumn .= ctype_print( $char ) ? $char : '.';
        }

        print sprintf( "%08X ", $rowStart ) . $hexColumns . " |" . $textColumn . "|\n";
    }
}
719 }