Fixes to address MaxSem's comment on r106480 regarding DB2 tables.sql syntax, indenta...
[lhc/web/wiklou.git] / includes / media / IPTC.php
1 <?php
/**
 * Helper class for reading IPTC/IIM metadata out of image files.
 *
 * All methods are static; the public entry point is IPTC::parse().
 *
 * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
 */
class IPTC {

    /**
     * IPTC tags whose (charset-converted) values are copied unchanged into
     * one metadata field. Maps the key used by iptcparse() to the name of
     * the field in the array returned by parse().
     */
    private static $simpleTags = array(
        '2#120' => 'ImageDescription',        // IPTC caption. Mapped with exif ImageDescription
        '2#116' => 'Copyright',               // copyright. Mapped with exif Copyright
        '2#025' => 'Keywords',                // keywords
        '2#101' => 'CountryDest',             // country (shown)
        '2#095' => 'ProvinceOrStateDest',     // state/province (shown)
        '2#090' => 'CityDest',                // city (shown)
        '2#092' => 'SublocationDest',         // sublocation (shown)
        '2#005' => 'ObjectName',              // object name/title
        '2#040' => 'SpecialInstructions',     // special instructions
        '2#105' => 'Headline',                // headline
        // credit: "Identifies the provider of the objectdata,
        // not necessarily the owner/creator".
        '2#110' => 'Credit',
        // source: "Identifies the original owner of the intellectual content
        // of the objectdata. This could be an agency, a member of an agency
        // or an individual."
        '2#115' => 'Source',
        '2#007' => 'EditStatus',              // edit status (lead, correction, etc)
        '2#015' => 'iimCategory',             // category. Deprecated. Max 3 letters in theory, often more
        '2#020' => 'iimSupplementalCategory', // supplemental category. Deprecated.
        '2#010' => 'Urgency',                 // urgency (1-8. 1 most, 5 normal, 8 low priority)
        // "Identifies objectdata that recurs often and predictably...
        // Example: Euroweather"
        '2#022' => 'FixtureIdentifier',
        // Content location code (iso 3166 + some custom things),
        // ex: TUR (for turkey), XUN (for UN), XSP (outer space).
        // See wikipedia article on iso 3166 and appendix D of iim std.
        '2#026' => 'LocationDestCode',
        // Content location name. Full printable name of location of photo.
        '2#027' => 'LocationDest',
        // Object cycle. a for morning (am), p for evening, b for both
        '2#075' => 'ObjectCycle',
        // Country/Primary location code. "Indicates the code of the
        // country/primary location where the intellectual property of the
        // objectdata was created". Unclear how this differs from 2#026.
        '2#100' => 'CountryCodeDest',
        // Original transmission ref. "A code representing the location of
        // original transmission according to practises of the provider."
        '2#103' => 'OriginalTransmissionRef',
        '2#118' => 'Contact',                 // contact
        // Writer/Editor. "Identification of the name of the person involved
        // in the writing, editing or correcting the objectdata or
        // caption/abstract."
        '2#122' => 'Writer',
        '2#135' => 'LanguageCode',            // lang code
    );

    /**
     * IPTC date tags. Maps the date tag to a pair of
     * ( companion time tag, name of the field in the array returned by parse() ).
     *
     * Incomplete dates are not accepted even though they are valid according
     * to the spec. Should potentially store the timezone as well.
     */
    private static $dateTags = array(
        // Date created (not date digitized). Maps to exif DateTimeOriginal.
        '2#055' => array( '2#060', 'DateTimeOriginal' ),
        // Date converted to digital representation. Maps to exif DateTimeDigitized.
        '2#062' => array( '2#063', 'DateTimeDigitized' ),
        // Date released.
        '2#030' => array( '2#035', 'DateTimeReleased' ),
        // Date expires.
        '2#037' => array( '2#038', 'DateTimeExpires' ),
    );

    /**
     * This takes the results of iptcparse() and puts it into a
     * form that can be handled by mediawiki. Generally called from
     * BitmapMetadataHandler::doApp13.
     *
     * Tags whose first value is only whitespace are skipped, and tags this
     * method does not know about are logged and ignored.
     *
     * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
     *
     * @param $rawData String app13 block from jpeg containing iptc/iim data
     * @return Array iptc metadata array. Empty if iptcparse() finds nothing
     *   or if the 1:90 tag names a charset that getCharset() does not know.
     */
    public static function parse( $rawData ) {
        $parsed = iptcparse( $rawData );
        $data = array();
        if ( !is_array( $parsed ) ) {
            return $data;
        }

        // Empty string means "no charset specified"; convIPTCHelper() then
        // guesses between utf-8 and windows-1252.
        $c = '';
        // Charset info is contained in tag 1:90.
        if ( isset( $parsed['1#090'] ) && isset( $parsed['1#090'][0] ) ) {
            $c = self::getCharset( $parsed['1#090'][0] );
            if ( $c === false ) {
                // Unknown charset: refuse to parse.
                // Note: there is a difference between an unknown
                // charset and no charset being specified.
                return array();
            }
            unset( $parsed['1#090'] );
        }

        foreach ( $parsed as $tag => $val ) {
            if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
                wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
                continue;
            }

            // Plain one-to-one tags.
            if ( isset( self::$simpleTags[$tag] ) ) {
                $data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
                continue;
            }

            // Date tags, each combined with its companion time tag if present.
            if ( isset( self::$dateTags[$tag] ) ) {
                list( $timeTag, $field ) = self::$dateTags[$tag];
                $time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : array();
                $timestamp = self::timeHelper( $val, $time, $c );
                if ( $timestamp ) {
                    $data[$field] = $timestamp;
                }
                continue;
            }

            switch ( $tag ) {
                case '2#080':
                    /* Byline. Mapped with exif Artist.
                     * Merged with byline title (2:85) like how exif does it
                     * with "Title, person". Not sure if this is the best
                     * approach since we no longer have the two fields
                     * separate. Each byline title entry corresponds to a
                     * specific byline. */
                    $bylines = self::convIPTC( $val, $c );
                    $titles = isset( $parsed['2#085'] )
                        ? self::convIPTC( $parsed['2#085'], $c )
                        : array();

                    foreach ( $titles as $i => $title ) {
                        // Theoretically the byline should always be set,
                        // but it doesn't hurt to be careful.
                        if ( isset( $bylines[$i] ) ) {
                            $bylines[$i] = $title . ', ' . $bylines[$i];
                        }
                    }
                    $data['Artist'] = $bylines;
                    break;

                case '2#065':
                    /* Originating program.
                     * Combined with program version (2:70) if present. */
                    $software = self::convIPTC( $val, $c );

                    if ( count( $software ) !== 1 ) {
                        // According to the iim standard this cannot have
                        // multiple values, so if there is more than one,
                        // something weird is happening and we skip it.
                        wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
                        break;
                    }

                    if ( isset( $parsed['2#070'][0] ) ) {
                        // A version is set for the software. The 2:70 tag
                        // itself is skipped by the "handled elsewhere" case.
                        $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
                        $data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
                    } else {
                        $data['Software'] = $software;
                    }
                    break;

                case '2#000':
                    /* iim version.
                     * Unlike other tags, this is a 2-byte big-endian binary
                     * number. Technically it is required if there is iptc
                     * data, but in practise it isn't always there. */
                    if ( isset( $val[0] ) && strlen( $val[0] ) == 2 ) {
                        // The length check is just to be paranoid.
                        $data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
                    }
                    break;

                case '2#004':
                    /* Intellectual genre.
                     * The first 4 characters are an id code that we're not
                     * really interested in.
                     *
                     * This prop is weird, since it's allowed to have multiple
                     * values in iim 4.1, but not in the XMP stuff. We just
                     * extract the first value. */
                    $con = self::convIPTC( $val, $c );
                    if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
                        wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
                        break;
                    }
                    $data['IntellectualGenre'] = substr( $con[0], 4 );
                    break;

                case '2#012':
                    /* Subject news code - this is a compound field.
                     * At the moment we only extract the subject news code,
                     * which is an 8 digit (ascii) number describing the
                     * subject matter of the content.
                     *
                     * NOTE(review): every valid entry overwrites the previous
                     * one, and the first invalid entry stops the scan. */
                    $codes = self::convIPTC( $val, $c );
                    foreach ( $codes as $ic ) {
                        $fields = explode( ':', $ic, 3 );

                        if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
                            wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
                            break;
                        }
                        $data['SubjectNewsCode'] = $fields[1];
                    }
                    break;

                // Purposely does not do 2:125, 2:130, 2:131,
                // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
                // 2:200, 2:201, 2:202
                // or the audio stuff (2:150 to 2:154)

                case '2#070':
                case '2#060':
                case '2#063':
                case '2#085':
                case '2#038':
                case '2#035':
                    // Ignore. Handled together with their primary tag above.
                    break;

                default:
                    wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
                    break;
            }
        }

        return $data;
    }

    /**
     * Convert an iptc date and time tags into the exif format
     *
     * If there is not exactly one time value, only the date part of the
     * result is returned.
     *
     * @todo Potentially this should also capture the timezone offset.
     * @param Array $date The date tag
     * @param Array $time The time tag
     * @param $c String The charset passed on to convIPTC()
     * @return String|null Date in exif format, or null if there is not
     *   exactly one date value or the values cannot be converted.
     */
    private static function timeHelper( $date, $time, $c ) {
        if ( count( $date ) !== 1 ) {
            // The standard says this should always be 1; just double checking.
            return null;
        }
        list( $date ) = self::convIPTC( $date, $c );

        if ( count( $time ) === 1 ) {
            list( $time ) = self::convIPTC( $time, $c );
            $dateOnly = false;
        } else {
            $time = '000000+0000'; // placeholder
            $dateOnly = true;
        }

        // Both patterns are anchored at the start because the substr()
        // calls below assume the digits begin at offset 0.
        if ( !( preg_match( '/^\d\d\d\d\d\d[-+]\d\d\d\d/', $time )
            && preg_match( '/^\d\d\d\d\d\d\d\d/', $date )
            && substr( $date, 0, 4 ) !== '0000'
            && substr( $date, 4, 2 ) !== '00'
            && substr( $date, 6, 2 ) !== '00'
        ) ) {
            // Something wrong.
            // Note, this rejects some valid dates according to iptc spec,
            // for example: the date 00000400 means the photo was taken in
            // April, but the year and day is unknown. We don't process these
            // types of incomplete dates atm.
            wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );
            return null;
        }

        $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
        if ( $unixTS === false ) {
            wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
            return null;
        }

        // Offset in seconds, taken from the +HHMM / -HHMM suffix.
        $tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
            + ( intval( substr( $time, 9, 2 ) ) * 60 );

        if ( substr( $time, 6, 1 ) === '-' ) {
            $tz = -$tz;
        }

        // NOTE(review): the offset is added to the parsed time here;
        // confirm that this is the intended direction.
        $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
        if ( $finalTimestamp === false ) {
            wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
            return null;
        }

        if ( $dateOnly ) {
            // Return the date only (first 10 characters).
            return substr( $finalTimestamp, 0, 10 );
        }

        return $finalTimestamp;
    }

    /**
     * Helper function to convert charset for iptc values.
     *
     * @param $data Mixed String or Array: The iptc string(s)
     * @param $charset String: The charset
     *
     * @return Mixed The converted string, or an array with every element
     *   converted (keys preserved) if an array was passed in.
     */
    private static function convIPTC( $data, $charset ) {
        if ( is_array( $data ) ) {
            foreach ( $data as &$val ) {
                $val = self::convIPTCHelper( $val, $charset );
            }
            // Break the reference so later writes to $val cannot touch $data.
            unset( $val );
        } else {
            $data = self::convIPTCHelper( $data, $charset );
        }

        return $data;
    }

    /**
     * Helper function of a helper function to convert charset for iptc values.
     *
     * With a charset, the value is converted to utf-8 with iconv() and
     * trimmed; a failed conversion is logged and yields an empty string.
     * Without a charset, the value is returned as is if it is already valid
     * utf-8, and is otherwise converted as if it were Windows-1252.
     *
     * @param $data String: The iptc string
     * @param $charset String: The charset, or an empty value if unknown
     *
     * @return string
     */
    private static function convIPTCHelper( $data, $charset ) {
        if ( $charset ) {
            wfSuppressWarnings();
            $data = iconv( $charset, "UTF-8//IGNORE", $data );
            wfRestoreWarnings();
            if ( $data === false ) {
                $data = "";
                wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
            }
        } else {
            // Treat as utf-8 if it is valid utf-8, otherwise pretend it's
            // windows-1252. Most of the time if there is no 1:90 tag, it is
            // either ascii, latin1, or utf-8.
            $oldData = $data;
            UtfNormal::quickIsNFCVerify( $data ); // make $data valid utf-8
            if ( $data === $oldData ) {
                // Validation didn't change $data.
                // NOTE(review): this is the only path that does not trim().
                return $data;
            }
            return self::convIPTCHelper( $oldData, 'Windows-1252' );
        }

        return trim( $data );
    }

    /**
     * Take the value of the 1:90 tag and return a charset name.
     *
     * Warning, this function does not (and is not intended to) detect
     * all iso 2022 escape codes. In practise, the code for utf-8 is the
     * only code that seems to have wide use. It does detect that code.
     *
     * @param String $tag 1:90 tag.
     * @return String|bool Charset name, or false if the escape sequence is
     *   not recognized (which is also logged).
     */
    public static function getCharset( $tag ) {
        // According to the iim standard, the charset is defined by the tag
        // 1:90, in which there are iso 2022 escape sequences to specify the
        // character set. The iim standard seems to encourage that all
        // necessary escape sequences are in the 1:90 tag, but says it
        // doesn't have to be.

        // This is in need of more testing probably. This is definitely not
        // complete. However, reading the docs of some other iptc software,
        // it appears that most iptc software only recognizes utf-8. If the
        // 1:90 tag is not present, content is usually ascii or iso-8859-1
        // (and sometimes utf-8), but no guarantee.

        // This also won't work if there is more than one escape sequence in
        // the 1:90 tag or if something is put in the G2 or G3 charsets, etc.
        // It will only reliably recognize utf-8.

        // This is just going through the charsets mentioned in appendix C
        // of the iim standard. \x1b = ESC.
        static $charsets = array(
            "\x1b%G" => 'UTF-8', // utf-8
            // Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
            "\x1b(B" => 'UTF-8', // ascii
            "\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
            "\x1b(A" => 'ISO646-GB', // like ascii, but british.
            "\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finland encoding
            "\x1b(D" => 'ISO-IR-8-2',
            "\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norway encoding
            "\x1b(F" => 'ISO-IR-9-2',
            "\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
            "\x1b(I" => 'ISO646-IT',
            "\x1b(L" => 'ISO646-PT',
            "\x1b(Z" => 'ISO646-ES',
            "\x1b([" => 'GREEK7-OLD',
            "\x1b(K" => 'ISO646-DE',
            "\x1b(N" => 'ISO_5427', // cyrillic
            "\x1b(`" => 'NS_4551-1', // iso646-NO
            "\x1b(f" => 'NF_Z_62-010', // iso646-FR
            "\x1b(g" => 'PT2', // iso646-PT2
            "\x1b(h" => 'ES2',
            "\x1b(i" => 'MSZ_7795.3', // iso646-HU
            "\x1b(w" => 'CSA_Z243.4-1985-1',
            "\x1b(x" => 'CSA_Z243.4-1985-2',
            "\x1b\$(B" => 'JIS_C6226-1983',
            "\x1b\$B" => 'JIS_C6226-1983',
            "\x1b&@\x1b\$B" => 'JIS_C6226-1983',
            "\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
            // iso-8859-x, at least for the high code characters.
            "\x1b-A" => 'ISO-8859-1',
            "\x1b(@\x1b-A" => 'ISO-8859-1',
            "\x1b(B\x1b-A" => 'ISO-8859-1',
            "\x1b-B" => 'ISO-8859-2',
            "\x1b-C" => 'ISO-8859-3',
            "\x1b-D" => 'ISO-8859-4',
            "\x1b-E" => 'ISO-8859-5',
            "\x1b-F" => 'ISO-8859-6',
            "\x1b-G" => 'ISO-8859-7',
            "\x1b-H" => 'ISO-8859-8',
            "\x1b-I" => 'CSN_369103', // at least for the high code characters.
        );

        if ( isset( $charsets[$tag] ) ) {
            return $charsets[$tag];
        }

        wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
        // At this point just give up and refuse to parse iptc?
        return false;
    }
}