Update the Chinese conversion tables.
[lhc/web/wiklou.git] / includes / IEContentAnalyzer.php
1 <?php
2
/**
 * This class simulates Microsoft Internet Explorer's terribly broken and
 * insecure MIME type detection algorithm. It can be used to check web uploads
 * with an apparently safe type, to see if IE will reinterpret them to produce
 * something dangerous.
 *
 * It is full of bugs and strange design choices and should not under any
 * circumstances be used to determine a MIME type to present to a user or
 * client. (Apple Safari developers, this means you too.)
 *
 * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have
 * attempted to ensure that this code works in exactly the same way as Internet
 * Explorer, it does not share any source code, or creative choices such as
 * variable names, thus I (Tim Starling) claim copyright on it.
 *
 * It may be redistributed without restriction. To aid reuse, this class does
 * not depend on any MediaWiki module.
 */
class IEContentAnalyzer {
	/**
	 * Relevant data taken from the type table in IE 5.
	 *
	 * Maps a "data format" name to the list of MIME types IE files under it.
	 * The numbers in the inline comments are the numeric format codes used
	 * by IE itself.
	 */
	protected $baseTypeTable = array(
		'ambiguous' /*1*/ => array(
			'text/plain',
			'application/octet-stream',
			'application/x-netcdf', // [sic]
		),
		'text' /*3*/ => array(
			'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
			'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
		),
		'binary' /*4*/ => array(
			'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
			'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
			'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
			'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
			'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
			'application/x-msdownload'
		),
		'html' /*5*/ => array( 'text/html' ),
	);

	/**
	 * Changes to the type table in later versions of IE.
	 *
	 * Keyed by the version name that introduced the change; the additions
	 * carry forward to every later entry of $versions.
	 */
	protected $addedTypes = array(
		'ie07' => array(
			'text' => array( 'text/xml', 'application/xml' )
		),
	);

	/**
	 * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
	 * typical Windows installation.
	 *
	 * Used for extension to MIME type mapping if detection fails. Keys are
	 * matched case-insensitively (see lookupRegistryType()), the same way
	 * Windows treats registry key names.
	 */
	protected $registry = array(
		'.323' => 'text/h323',
		'.3g2' => 'video/3gpp2',
		'.3gp' => 'video/3gpp',
		'.3gp2' => 'video/3gpp2',
		'.3gpp' => 'video/3gpp',
		'.aac' => 'audio/aac',
		'.ac3' => 'audio/ac3',
		'.accda' => 'application/msaccess',
		'.accdb' => 'application/msaccess',
		'.accdc' => 'application/msaccess',
		'.accde' => 'application/msaccess',
		'.accdr' => 'application/msaccess',
		'.accdt' => 'application/msaccess',
		'.ade' => 'application/msaccess',
		'.adp' => 'application/msaccess',
		'.adts' => 'audio/aac',
		'.ai' => 'application/postscript',
		'.aif' => 'audio/aiff',
		'.aifc' => 'audio/aiff',
		'.aiff' => 'audio/aiff',
		'.amc' => 'application/x-mpeg',
		'.application' => 'application/x-ms-application',
		'.asf' => 'video/x-ms-asf',
		'.asx' => 'video/x-ms-asf',
		'.au' => 'audio/basic',
		'.avi' => 'video/avi',
		'.bmp' => 'image/bmp',
		'.caf' => 'audio/x-caf',
		'.cat' => 'application/vnd.ms-pki.seccat',
		'.cbo' => 'application/sha',
		'.cdda' => 'audio/aiff',
		'.cer' => 'application/x-x509-ca-cert',
		'.conf' => 'text/plain',
		'.crl' => 'application/pkix-crl',
		'.crt' => 'application/x-x509-ca-cert',
		'.css' => 'text/css',
		'.csv' => 'application/vnd.ms-excel',
		'.der' => 'application/x-x509-ca-cert',
		'.dib' => 'image/bmp',
		'.dif' => 'video/x-dv',
		'.dll' => 'application/x-msdownload',
		'.doc' => 'application/msword',
		'.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
		'.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
		'.dot' => 'application/msword',
		'.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
		'.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
		'.dv' => 'video/x-dv',
		'.dwfx' => 'model/vnd.dwfx+xps',
		'.edn' => 'application/vnd.adobe.edn',
		'.eml' => 'message/rfc822',
		'.eps' => 'application/postscript',
		'.etd' => 'application/x-ebx',
		'.exe' => 'application/x-msdownload',
		'.fdf' => 'application/vnd.fdf',
		'.fif' => 'application/fractals',
		'.gif' => 'image/gif',
		'.gsm' => 'audio/x-gsm',
		'.hqx' => 'application/mac-binhex40',
		'.hta' => 'application/hta',
		'.htc' => 'text/x-component',
		'.htm' => 'text/html',
		'.html' => 'text/html',
		'.htt' => 'text/webviewhtml',
		'.hxa' => 'application/xml',
		'.hxc' => 'application/xml',
		'.hxd' => 'application/octet-stream',
		'.hxe' => 'application/xml',
		'.hxf' => 'application/xml',
		'.hxh' => 'application/octet-stream',
		'.hxi' => 'application/octet-stream',
		'.hxk' => 'application/xml',
		'.hxq' => 'application/octet-stream',
		'.hxr' => 'application/octet-stream',
		'.hxs' => 'application/octet-stream',
		'.hxt' => 'application/xml',
		'.hxv' => 'application/xml',
		'.hxw' => 'application/octet-stream',
		'.ico' => 'image/x-icon',
		'.iii' => 'application/x-iphone',
		'.ins' => 'application/x-internet-signup',
		'.iqy' => 'text/x-ms-iqy',
		'.isp' => 'application/x-internet-signup',
		'.jfif' => 'image/jpeg',
		'.jnlp' => 'application/x-java-jnlp-file',
		'.jpe' => 'image/jpeg',
		'.jpeg' => 'image/jpeg',
		'.jpg' => 'image/jpeg',
		'.jtx' => 'application/x-jtx+xps',
		'.latex' => 'application/x-latex',
		'.log' => 'text/plain',
		'.m1v' => 'video/mpeg',
		'.m2v' => 'video/mpeg',
		'.m3u' => 'audio/x-mpegurl',
		'.mac' => 'image/x-macpaint',
		'.man' => 'application/x-troff-man',
		'.mda' => 'application/msaccess',
		'.mdb' => 'application/msaccess',
		'.mde' => 'application/msaccess',
		'.mfp' => 'application/x-shockwave-flash',
		'.mht' => 'message/rfc822',
		'.mhtml' => 'message/rfc822',
		'.mid' => 'audio/mid',
		'.midi' => 'audio/mid',
		'.mod' => 'video/mpeg',
		'.mov' => 'video/quicktime',
		'.mp2' => 'video/mpeg',
		'.mp2v' => 'video/mpeg',
		'.mp3' => 'audio/mpeg',
		'.mp4' => 'video/mp4',
		'.mpa' => 'video/mpeg',
		'.mpe' => 'video/mpeg',
		'.mpeg' => 'video/mpeg',
		'.mpf' => 'application/vnd.ms-mediapackage',
		'.mpg' => 'video/mpeg',
		'.mpv2' => 'video/mpeg',
		'.mqv' => 'video/quicktime',
		'.NMW' => 'application/nmwb',
		'.nws' => 'message/rfc822',
		'.odc' => 'text/x-ms-odc',
		'.ols' => 'application/vnd.ms-publisher',
		'.p10' => 'application/pkcs10',
		'.p12' => 'application/x-pkcs12',
		'.p7b' => 'application/x-pkcs7-certificates',
		'.p7c' => 'application/pkcs7-mime',
		'.p7m' => 'application/pkcs7-mime',
		'.p7r' => 'application/x-pkcs7-certreqresp',
		'.p7s' => 'application/pkcs7-signature',
		'.pct' => 'image/pict',
		'.pdf' => 'application/pdf',
		'.pdx' => 'application/vnd.adobe.pdx',
		'.pfx' => 'application/x-pkcs12',
		'.pic' => 'image/pict',
		'.pict' => 'image/pict',
		'.pinstall' => 'application/x-picasa-detect',
		'.pko' => 'application/vnd.ms-pki.pko',
		'.png' => 'image/png',
		'.pnt' => 'image/x-macpaint',
		'.pntg' => 'image/x-macpaint',
		'.pot' => 'application/vnd.ms-powerpoint',
		'.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
		'.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
		'.ppa' => 'application/vnd.ms-powerpoint',
		'.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
		'.pps' => 'application/vnd.ms-powerpoint',
		'.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
		'.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
		'.ppt' => 'application/vnd.ms-powerpoint',
		'.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
		'.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
		'.prf' => 'application/pics-rules',
		'.ps' => 'application/postscript',
		'.pub' => 'application/vnd.ms-publisher',
		'.pwz' => 'application/vnd.ms-powerpoint',
		'.py' => 'text/plain',
		'.pyw' => 'text/plain',
		'.qht' => 'text/x-html-insertion',
		'.qhtm' => 'text/x-html-insertion',
		'.qt' => 'video/quicktime',
		'.qti' => 'image/x-quicktime',
		'.qtif' => 'image/x-quicktime',
		'.qtl' => 'application/x-quicktimeplayer',
		'.rat' => 'application/rat-file',
		'.rmf' => 'application/vnd.adobe.rmf',
		'.rmi' => 'audio/mid',
		'.rqy' => 'text/x-ms-rqy',
		'.rtf' => 'application/msword',
		'.sct' => 'text/scriptlet',
		'.sd2' => 'audio/x-sd2',
		'.sdp' => 'application/sdp',
		'.shtml' => 'text/html',
		'.sit' => 'application/x-stuffit',
		'.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
		'.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
		'.slk' => 'application/vnd.ms-excel',
		'.snd' => 'audio/basic',
		'.so' => 'application/x-apachemodule',
		'.sol' => 'text/plain',
		'.sor' => 'text/plain',
		'.spc' => 'application/x-pkcs7-certificates',
		'.spl' => 'application/futuresplash',
		'.sst' => 'application/vnd.ms-pki.certstore',
		'.stl' => 'application/vnd.ms-pki.stl',
		'.swf' => 'application/x-shockwave-flash',
		'.thmx' => 'application/vnd.ms-officetheme',
		'.tif' => 'image/tiff',
		'.tiff' => 'image/tiff',
		'.txt' => 'text/plain',
		'.uls' => 'text/iuls',
		'.vcf' => 'text/x-vcard',
		'.vdx' => 'application/vnd.ms-visio.viewer',
		'.vsd' => 'application/vnd.ms-visio.viewer',
		'.vss' => 'application/vnd.ms-visio.viewer',
		'.vst' => 'application/vnd.ms-visio.viewer',
		'.vsx' => 'application/vnd.ms-visio.viewer',
		'.vtx' => 'application/vnd.ms-visio.viewer',
		'.wav' => 'audio/wav',
		'.wax' => 'audio/x-ms-wax',
		'.wbk' => 'application/msword',
		'.wdp' => 'image/vnd.ms-photo',
		'.wiz' => 'application/msword',
		'.wm' => 'video/x-ms-wm',
		'.wma' => 'audio/x-ms-wma',
		'.wmd' => 'application/x-ms-wmd',
		'.wmv' => 'video/x-ms-wmv',
		'.wmx' => 'video/x-ms-wmx',
		'.wmz' => 'application/x-ms-wmz',
		'.wpl' => 'application/vnd.ms-wpl',
		'.wsc' => 'text/scriptlet',
		'.wvx' => 'video/x-ms-wvx',
		'.xaml' => 'application/xaml+xml',
		'.xbap' => 'application/x-ms-xbap',
		'.xdp' => 'application/vnd.adobe.xdp+xml',
		'.xfdf' => 'application/vnd.adobe.xfdf',
		'.xht' => 'application/xhtml+xml',
		'.xhtml' => 'application/xhtml+xml',
		'.xla' => 'application/vnd.ms-excel',
		'.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
		'.xlk' => 'application/vnd.ms-excel',
		'.xll' => 'application/vnd.ms-excel',
		'.xlm' => 'application/vnd.ms-excel',
		'.xls' => 'application/vnd.ms-excel',
		'.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
		'.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
		'.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
		'.xlt' => 'application/vnd.ms-excel',
		'.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
		'.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
		'.xlw' => 'application/vnd.ms-excel',
		'.xml' => 'text/xml',
		'.xps' => 'application/vnd.ms-xpsdocument',
		'.xsl' => 'text/xml',
	);

	/**
	 * IE versions which have been analysed to bring you this class, and for
	 * which some substantive difference exists. These will appear as keys
	 * in the return value of getRealMimesFromData(). The names are chosen to
	 * sort correctly: the code compares them as plain strings
	 * (e.g. $version >= 'ie07').
	 */
	protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );

	/**
	 * Type table with versions expanded: version name => format => MIME list.
	 * Filled in by the constructor.
	 */
	protected $typeTable = array();

	/**
	 * Copy of $registry with lower-cased keys, built on first use by
	 * lookupRegistryType(). Null until then.
	 */
	private $registryLower = null;

	/**
	 * Constructor. Builds $typeTable by starting from $baseTypeTable and
	 * applying the $addedTypes of each version cumulatively, in the order
	 * given by $versions.
	 */
	public function __construct() {
		// Construct versioned type arrays from the base type array plus additions
		$types = $this->baseTypeTable;
		foreach ( $this->versions as $version ) {
			if ( isset( $this->addedTypes[$version] ) ) {
				foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
					$types[$format] = array_merge( $types[$format], $addedTypes );
				}
			}
			// $types keeps accumulating, so later versions inherit earlier additions
			$this->typeTable[$version] = $types;
		}
	}

	/**
	 * Get the MIME types from getMimesFromData(), but convert the result from IE's
	 * idiosyncratic private types into something other apps will understand.
	 *
	 * @param $fileName String: the file name; its extension is used for the
	 *   registry fallback when content detection is inconclusive
	 * @param $chunk String: the start of the file; only the first 255 bytes
	 *   are examined
	 * @param $proposed String: the MIME type proposed by the server
	 *
	 * @return Array: map of IE version to detected mime type
	 */
	public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
		$types = $this->getMimesFromData( $fileName, $chunk, $proposed );
		return array_map( array( $this, 'translateMimeType' ), $types );
	}

	/**
	 * Translate a MIME type from IE's idiosyncratic private types into
	 * more commonly understood type strings.
	 *
	 * @param $type String: MIME type as reported by the IE simulation
	 * @return String: the translated type, or $type unchanged if there is
	 *   no translation for it
	 */
	public function translateMimeType( $type ) {
		static $table = array(
			'image/pjpeg' => 'image/jpeg',
			'image/x-png' => 'image/png',
			'image/x-wmf' => 'application/x-msmetafile',
			'image/bmp' => 'image/x-bmp',
			'application/x-zip-compressed' => 'application/zip',
			'application/x-compressed' => 'application/x-compress',
			'application/x-gzip-compressed' => 'application/x-gzip',
			'audio/mid' => 'audio/midi',
		);
		return isset( $table[$type] ) ? $table[$type] : $type;
	}

	/**
	 * Get the untranslated MIME types for all known versions.
	 *
	 * @param $fileName String: the file name; its extension is used for the
	 *   registry fallback when content detection is inconclusive
	 * @param $chunk String: the start of the file; only the first 255 bytes
	 *   are examined
	 * @param $proposed String: the MIME type proposed by the server
	 *
	 * @return Array: map of IE version to detected mime type
	 */
	public function getMimesFromData( $fileName, $chunk, $proposed ) {
		$types = array();
		foreach ( $this->versions as $version ) {
			$types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
		}
		return $types;
	}

	/**
	 * Get the MIME type for a given named version.
	 *
	 * @param $version String: one of the names in $versions
	 * @param $fileName String: the file name, used only for the extension
	 *   lookup in $registry when everything else is inconclusive
	 * @param $chunk String: the start of the file (truncated to 255 bytes here)
	 * @param $proposed String: the MIME type proposed by the server;
	 *   anything from the first semicolon onwards is ignored
	 *
	 * @return String: the (untranslated) MIME type this version would settle on
	 */
	protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
		// Strip text after a semicolon
		$semiPos = strpos( $proposed, ';' );
		if ( $semiPos !== false ) {
			$proposed = substr( $proposed, 0, $semiPos );
		}

		// Types that are not in the type table are trusted as-is, except for
		// the two multipart types which are still sniffed.
		$proposedFormat = $this->getDataFormat( $version, $proposed );
		if ( $proposedFormat === 'unknown'
			&& $proposed !== 'multipart/mixed'
			&& $proposed !== 'multipart/x-mixed-replace' )
		{
			return $proposed;
		}
		// Nothing to sniff
		if ( strval( $chunk ) === '' ) {
			return $proposed;
		}

		// Truncate chunk at 255 bytes
		$chunk = substr( $chunk, 0, 255 );

		// IE does the Check*Headers() calls last, and instead does the following image
		// type checks by directly looking for the magic numbers. What I do here should
		// have the same effect since the magic number checks are identical in both cases.
		$result = $this->sampleData( $version, $chunk );
		$sampleFound = $result['found'];
		$counters = $result['counters'];
		$binaryType = $this->checkBinaryHeaders( $version, $chunk );
		$textType = $this->checkTextHeaders( $version, $chunk );

		if ( $proposed === 'text/html' && isset( $sampleFound['html'] ) ) {
			return 'text/html';
		}
		if ( $proposed === 'image/gif' && $binaryType === 'image/gif' ) {
			return 'image/gif';
		}
		if ( ( $proposed === 'image/pjpeg' || $proposed === 'image/jpeg' )
			&& $binaryType === 'image/pjpeg' )
		{
			return $proposed;
		}
		// PNG check added in IE 7
		// (version names are built to compare correctly as strings)
		if ( $version >= 'ie07'
			&& ( $proposed === 'image/x-png' || $proposed === 'image/png' )
			&& $binaryType === 'image/x-png' )
		{
			return $proposed;
		}

		// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
		if ( isset( $sampleFound['cdf'] ) ) {
			return 'application/x-cdf';
		}

		// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
		// previous versions
		if ( isset( $sampleFound['rss'] ) ) {
			return 'application/rss+xml';
		}
		// RSS 1.0: needs the <rdf:RDF tag and both namespace URLs
		if ( isset( $sampleFound['rdf-tag'] )
			&& isset( $sampleFound['rdf-url'] )
			&& isset( $sampleFound['rdf-purl'] ) )
		{
			return 'application/rss+xml';
		}
		if ( isset( $sampleFound['atom'] ) ) {
			return 'application/atom+xml';
		}

		if ( isset( $sampleFound['xml'] ) ) {
			// TODO: I'm not sure under what circumstances this flag is enabled
			if ( strpos( $version, 'strict' ) !== false ) {
				if ( $proposed === 'text/html' || $proposed === 'text/xml' ) {
					return 'text/xml';
				}
			} else {
				return 'text/xml';
			}
		}
		if ( isset( $sampleFound['html'] ) ) {
			// TODO: I'm not sure under what circumstances this flag is enabled
			if ( strpos( $version, 'nohtml' ) !== false ) {
				if ( $proposed === 'text/plain' ) {
					return 'text/html';
				}
			} else {
				return 'text/html';
			}
		}
		if ( isset( $sampleFound['xbm'] ) ) {
			return 'image/x-bitmap';
		}
		if ( isset( $sampleFound['binhex'] ) ) {
			return 'application/macbinhex40';
		}
		if ( isset( $sampleFound['scriptlet'] ) ) {
			if ( strpos( $version, 'strict' ) !== false ) {
				if ( $proposed === 'text/plain' || $proposed === 'text/scriptlet' ) {
					return 'text/scriptlet';
				}
			} else {
				return 'text/scriptlet';
			}
		}

		// Freaky heuristics to determine if the data is text or binary
		// The heuristic is of course broken for non-ASCII text
		if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
			< ( $counters['ctrl'] + $counters['high'] ) * 16 )
		{
			$kindOfBinary = true;
			$type = $binaryType ? $binaryType : $textType;
			if ( $type === false ) {
				$type = 'application/octet-stream';
			}
		} else {
			$kindOfBinary = false;
			$type = $textType ? $textType : $binaryType;
			if ( $type === false ) {
				$type = 'text/plain';
			}
		}

		// Check if the output format is ambiguous
		// This generally means that detection failed, real types aren't ambiguous
		$detectedFormat = $this->getDataFormat( $version, $type );
		if ( $detectedFormat !== 'ambiguous' ) {
			return $type;
		}

		if ( $proposedFormat !== 'ambiguous' ) {
			// FormatAgreesWithData()
			if ( $proposedFormat === 'text' && !$kindOfBinary ) {
				return $proposed;
			}
			if ( $proposedFormat === 'binary' && $kindOfBinary ) {
				return $proposed;
			}
			if ( $proposedFormat === 'html' ) {
				return $proposed;
			}
		}

		// Find a MIME type by searching the registry for the file extension.
		$dotPos = strrpos( $fileName, '.' );
		if ( $dotPos === false ) {
			return $type;
		}
		$registryType = $this->lookupRegistryType( substr( $fileName, $dotPos ) );
		if ( $registryType !== false ) {
			return $registryType;
		}

		// TODO: If the extension has an application registered to it, IE will return
		// application/octet-stream. We'll skip that, so we could erroneously
		// return text/plain or application/x-netcdf where application/octet-stream
		// would be correct.

		return $type;
	}

	/**
	 * Look up a file extension in $registry, ignoring case.
	 *
	 * Windows registry key names are not case sensitive, so "FOO.HTML" must
	 * find the ".html" entry and "foo.nmw" must find ".NMW".
	 *
	 * @param $ext String: the extension including the leading dot
	 * @return String|false: the registered MIME type, or false if none
	 */
	private function lookupRegistryType( $ext ) {
		if ( $this->registryLower === null ) {
			$this->registryLower = array_change_key_case( $this->registry, CASE_LOWER );
		}
		$key = strtolower( $ext );
		return isset( $this->registryLower[$key] ) ? $this->registryLower[$key] : false;
	}

	/**
	 * Check for text headers at the start of the chunk
	 * Confirmed same in 5 and 7.
	 *
	 * @param $version String: IE version name (not used by the checks)
	 * @param $chunk String: the data sample
	 * @return String|false: the detected MIME type, or false if no header matched
	 */
	private function checkTextHeaders( $version, $chunk ) {
		$chunk2 = substr( $chunk, 0, 2 );
		$chunk4 = substr( $chunk, 0, 4 );
		$chunk5 = substr( $chunk, 0, 5 );
		if ( $chunk4 === '%PDF' ) {
			return 'application/pdf';
		}
		if ( $chunk2 === '%!' ) {
			return 'application/postscript';
		}
		if ( $chunk5 === '{\\rtf' ) {
			return 'text/richtext';
		}
		if ( $chunk5 === 'begin' ) {
			return 'application/base64';
		}
		return false;
	}

	/**
	 * Check for binary headers at the start of the chunk
	 * Confirmed same in 5 and 7.
	 *
	 * The order of the checks matters: the first matching magic number wins.
	 *
	 * @param $version String: IE version name (not used by the checks)
	 * @param $chunk String: the data sample
	 * @return String|false: the detected (IE-private) MIME type, or false
	 *   if no header matched
	 */
	private function checkBinaryHeaders( $version, $chunk ) {
		$chunk2 = substr( $chunk, 0, 2 );
		$chunk3 = substr( $chunk, 0, 3 );
		$chunk4 = substr( $chunk, 0, 4 );
		$chunk5 = substr( $chunk, 0, 5 );
		$chunk5uc = strtoupper( $chunk5 );
		$chunk8 = substr( $chunk, 0, 8 );
		if ( $chunk5uc === 'GIF87' || $chunk5uc === 'GIF89' ) {
			return 'image/gif';
		}
		if ( $chunk2 === "\xff\xd8" ) {
			return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
		}

		if ( $chunk2 === 'BM'
			&& substr( $chunk, 6, 2 ) === "\000\000"
			&& substr( $chunk, 8, 2 ) === "\000\000" )
		{
			return 'image/bmp'; // another non-standard MIME
		}
		if ( $chunk4 === 'RIFF'
			&& substr( $chunk, 8, 4 ) === 'WAVE' )
		{
			return 'audio/wav';
		}
		// These were integer literals in IE
		// Perhaps the author was not sure what the target endianness was
		if ( $chunk4 === ".sd\000"
			|| $chunk4 === ".snd"
			|| $chunk4 === "\000ds."
			|| $chunk4 === "dns." )
		{
			return 'audio/basic';
		}
		if ( $chunk3 === "MM\000" ) {
			return 'image/tiff';
		}
		if ( $chunk2 === 'MZ' ) {
			return 'application/x-msdownload';
		}
		if ( $chunk8 === "\x89PNG\x0d\x0a\x1a\x0a" ) {
			return 'image/x-png'; // [sic]
		}
		if ( strlen( $chunk ) >= 5 ) {
			$byte2 = ord( $chunk[2] );
			$byte4 = ord( $chunk[4] );
			if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 === 'JG' ) {
				return 'image/x-jg';
			}
		}
		// More endian confusion?
		if ( $chunk4 === 'MROF' ) {
			return 'audio/x-aiff';
		}
		// Bytes 8..11, the form type of IFF/RIFF containers
		$chunk4_8 = substr( $chunk, 8, 4 );
		if ( $chunk4 === 'FORM' && ( $chunk4_8 === 'AIFF' || $chunk4_8 === 'AIFC' ) ) {
			return 'audio/x-aiff';
		}
		if ( $chunk4 === 'RIFF' && $chunk4_8 === 'AVI ' ) {
			return 'video/avi';
		}
		if ( $chunk4 === "\x00\x00\x01\xb3" || $chunk4 === "\x00\x00\x01\xba" ) {
			return 'video/mpeg';
		}
		if ( $chunk4 === "\001\000\000\000"
			&& substr( $chunk, 40, 4 ) === ' EMF' )
		{
			return 'image/x-emf';
		}
		if ( $chunk4 === "\xd7\xcd\xc6\x9a" ) {
			return 'image/x-wmf';
		}
		if ( $chunk4 === "\xca\xfe\xba\xbe" ) {
			return 'application/java';
		}
		if ( $chunk2 === 'PK' ) {
			return 'application/x-zip-compressed';
		}
		if ( $chunk2 === "\x1f\x9d" ) {
			return 'application/x-compressed';
		}
		if ( $chunk2 === "\x1f\x8b" ) {
			return 'application/x-gzip-compressed';
		}
		// Skip redundant check for ZIP
		if ( $chunk5 === "MThd\000" ) {
			return 'audio/mid';
		}
		if ( $chunk4 === '%PDF' ) {
			return 'application/pdf';
		}
		return false;
	}

	/**
	 * Do heuristic checks on the bulk of the data sample.
	 * Search for HTML tags.
	 *
	 * Walks the sample byte by byte, counting character classes and looking
	 * for tag names and magic strings at each position.
	 *
	 * @param $version String: one of the names in $versions; selects the
	 *   pre-IE7 (CDF) or IE7+ (RSS/RDF/Atom) tag checks
	 * @param $chunk String: the data sample
	 *
	 * @return Array with two keys:
	 *   - 'found': map of marker name => true for every marker seen
	 *     (xml, scriptlet, html, cdf, rss, rdf-tag, rdf-url, rdf-purl, atom,
	 *     xbm1, xbm2, xbm, binhex)
	 *   - 'counters': map of character class (ctrl, high, low, lf, cr, ff)
	 *     to the number of bytes counted in that class
	 */
	protected function sampleData( $version, $chunk ) {
		$found = array();
		$counters = array(
			'ctrl' => 0,
			'high' => 0,
			'low' => 0,
			'lf' => 0,
			'cr' => 0,
			'ff' => 0
		);
		$htmlTags = array(
			'html',
			'head',
			'title',
			'body',
			'script',
			'a href',
			'pre',
			'img',
			'plaintext',
			'table'
		);
		$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
		$rdfPurl = 'http://purl.org/rss/1.0/';
		$xbmMagic1 = '#define';
		$xbmMagic2 = '_width';
		$xbmMagic3 = '_bits';
		$binhexMagic = 'converted with BinHex';

		$chunkLength = strlen( $chunk );
		for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
			$curChar = $chunk[$offset];
			if ( $curChar == "\x0a" ) {
				$counters['lf']++;
				continue;
			} elseif ( $curChar == "\x0d" ) {
				$counters['cr']++;
				continue;
			} elseif ( $curChar == "\x0c" ) {
				$counters['ff']++;
				continue;
			} elseif ( $curChar == "\t" ) {
				$counters['low']++;
				continue;
			} elseif ( ord( $curChar ) < 32 ) {
				$counters['ctrl']++;
				continue;
			} elseif ( ord( $curChar ) >= 128 ) {
				$counters['high']++;
				continue;
			}

			$counters['low']++;
			if ( $curChar == '<' ) {
				// Everything after the "<"
				$remainder = (string)substr( $chunk, $offset + 1 );
				// XML
				if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
					$nextChar = substr( $chunk, $offset + 5, 1 );
					if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
						$found['xml'] = true;
					}
				}
				// Scriptlet (JSP)
				if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
					$found['scriptlet'] = true;
					break;
				}
				// HTML
				foreach ( $htmlTags as $tag ) {
					if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
						$found['html'] = true;
						break;
					}
				}
				// Skip broken check for additional tags (HR etc.)

				// CHANNEL replaced by RSS, RDF and FEED in IE 7
				if ( $version < 'ie07' ) {
					if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
						$found['cdf'] = true;
					}
				} else {
					// RSS
					if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
						$found['rss'] = true;
						break; // return from SampleData
					}
					if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
						$found['rdf-tag'] = true;
						// no break
					}
					if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
						$found['atom'] = true;
						break;
					}
				}
				continue;
			}
			// Skip broken check for -->

			// RSS URL checks
			// For some reason both URLs must appear before it is recognised
			$remainder = substr( $chunk, $offset );
			if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
				$found['rdf-url'] = true;
				if ( isset( $found['rdf-tag'] )
					&& isset( $found['rdf-purl'] ) ) // [sic]
				{
					break;
				}
				continue;
			}

			if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
				// Record the match. Without this the RSS 1.0 check in
				// getMimeTypeForVersion(), which needs rdf-tag, rdf-url and
				// rdf-purl, could never succeed.
				$found['rdf-purl'] = true;
				if ( isset( $found['rdf-tag'] )
					&& isset( $found['rdf-url'] ) ) // [sic]
				{
					break;
				}
				continue;
			}

			// XBM checks: "#define", then "_width", then "_bits", in that order
			if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
				$found['xbm1'] = true;
				continue;
			}
			if ( $curChar == '_' ) {
				if ( isset( $found['xbm2'] ) ) {
					if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
						$found['xbm'] = true;
						break;
					}
				} elseif ( isset( $found['xbm1'] ) ) {
					if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
						$found['xbm2'] = true;
					}
				}
			}

			// BinHex (case-sensitive, unlike the other checks)
			if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
				$found['binhex'] = true;
			}
		}
		return array( 'found' => $found, 'counters' => $counters );
	}

	/**
	 * Classify a MIME type into one of IE's data formats for a given version.
	 *
	 * @param $version String: one of the names in $versions
	 * @param $type String: the MIME type to classify
	 *
	 * @return String: 'ambiguous' for an empty type or the literal "(null)",
	 *   otherwise the format key of $typeTable whose list contains $type,
	 *   or 'unknown' if no list contains it
	 */
	protected function getDataFormat( $version, $type ) {
		$types = $this->typeTable[$version];
		$type = strval( $type );
		if ( $type === '(null)' || $type === '' ) {
			return 'ambiguous';
		}
		foreach ( $types as $format => $list ) {
			// Strict comparison: only an exact string match counts
			if ( in_array( $type, $list, true ) ) {
				return $format;
			}
		}
		return 'unknown';
	}
}
824