Merge "Add .pipeline/ with dev image variant"
[lhc/web/wiklou.git] / includes / media / IPTC.php
1 <?php
2 /**
3 * Class for some IPTC functions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Media
22 */
23
/**
 * Converts the IPTC/IIM records found in a JPEG APP13 block into the
 * exif-style metadata array used by the rest of the media code.
 *
 * @ingroup Media
 */
class IPTC {
	/**
	 * IIM tags whose values only need charset conversion before being stored.
	 * Maps the "record#dataset" key produced by iptcparse() to the output key.
	 *
	 * @var string[]
	 */
	private static $simpleTags = [
		'2#120' => 'ImageDescription', // caption; mapped with exif ImageDescription
		'2#116' => 'Copyright', // mapped with exif Copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata, not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the objectdata.
		// This could be an agency, a member of an agency or an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // lead, correction, etc.
		'2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // deprecated
		'2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
		// "Identifies objectdata that recurs often and predictably... Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things),
		// e.g. TUR (Turkey), XUN (UN), XSP (outer space). See appendix D of the iim std.
		'2#026' => 'LocationDestCode',
		'2#027' => 'LocationDest', // full printable name of the location of the photo
		'2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
		// "Indicates the code of the country/primary location where the intellectual
		// property of the objectdata was created". Unclear how this differs from 2#026.
		'2#100' => 'CountryCodeDest',
		// "A code representing the location of original transmission according to
		// practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// "Identification of the name of the person involved in the writing, editing
		// or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode',
	];

	/**
	 * Date tags, each mapped to [ matching time tag, output key ].
	 *
	 * @var array[]
	 */
	private static $dateTags = [
		// Date created (not date digitized). Maps to exif DateTimeOriginal.
		'2#055' => [ '2#060', 'DateTimeOriginal' ],
		// Date converted to digital representation. Maps to exif DateTimeDigitized.
		'2#062' => [ '2#063', 'DateTimeDigitized' ],
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	/**
	 * ISO 2022 escape sequences (\x1b = ESC) that may appear in tag 1:90, mapped to
	 * the charset name handed to iconv(). This follows the charsets mentioned in
	 * appendix C of the iim standard.
	 *
	 * @var string[]
	 */
	private static $charsets = [
		"\x1b%G" => 'UTF-8',
		// Things that are compatible with utf-8 are also called utf-8.
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ differs in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british
		"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finnish encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norwegian encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => 'ISO646-IT',
		"\x1b(L" => 'ISO646-PT',
		"\x1b(Z" => 'ISO646-ES',
		"\x1b([" => 'GREEK7-OLD',
		"\x1b(K" => 'ISO646-DE',
		"\x1b(N" => 'ISO_5427', // cyrillic
		"\x1b(`" => 'NS_4551-1', // iso646-NO
		"\x1b(f" => 'NF_Z_62-010', // iso646-FR
		"\x1b(g" => 'PT2', // iso646-PT2
		"\x1b(h" => 'ES2',
		"\x1b(i" => 'MSZ_7795.3', // iso646-HU
		"\x1b(w" => 'CSA_Z243.4-1985-1',
		"\x1b(x" => 'CSA_Z243.4-1985-2',
		"\x1b\$(B" => 'JIS_C6226-1983',
		"\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
		// The iso-8859-x entries apply at least to the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103',
	];

	/**
	 * This takes the results of iptcparse() and puts it into a
	 * form that can be handled by mediawiki. Generally called from
	 * BitmapMetadataHandler::doApp13.
	 *
	 * Returns an empty array when iptcparse() finds nothing, or when tag 1:90
	 * names a charset that getCharset() does not recognise.
	 *
	 * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
	 *
	 * @param string $rawData The app13 block from jpeg containing iptc/iim data
	 * @return array IPTC metadata array
	 * @suppress PhanTypeArraySuspicious
	 */
	public static function parse( $rawData ) {
		$parsed = iptcparse( $rawData );
		if ( !is_array( $parsed ) ) {
			return [];
		}

		$data = [];
		$c = '';
		// charset info contained in tag 1:90.
		if ( isset( $parsed['1#090'][0] ) ) {
			$c = self::getCharset( $parsed['1#090'][0] );
			if ( $c === false ) {
				// Unknown charset: refuse to parse.
				// Note: there is a difference between an unknown
				// charset and no charset being specified.
				return [];
			}
			unset( $parsed['1#090'] );
		}

		foreach ( $parsed as $tag => $val ) {
			if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
				wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
				continue;
			}

			if ( isset( self::$simpleTags[$tag] ) ) {
				$data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
				continue;
			}

			if ( isset( self::$dateTags[$tag] ) ) {
				// Incomplete dates are not accepted even though they are valid
				// according to the spec. Should potentially store timezone as well.
				list( $timeTag, $key ) = self::$dateTags[$tag];
				$timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
				if ( $timestamp ) {
					$data[$key] = $timestamp;
				}
				continue;
			}

			switch ( $tag ) {
				case '2#080':
					// Byline. Mapped with exif Artist.
					// Merged with byline title (2:85), like how exif does it with
					// "Title, person". Not sure if this is the best approach since
					// the two fields are no longer separate. Each byline title
					// entry corresponds to a specific byline.
					$bylines = self::convIPTC( $val, $c );
					$titles = isset( $parsed['2#085'] )
						? self::convIPTC( $parsed['2#085'], $c )
						: [];

					$titleCount = count( $titles );
					for ( $i = 0; $i < $titleCount; $i++ ) {
						// Theoretically this should always be set,
						// but it doesn't hurt to be careful.
						if ( isset( $bylines[$i] ) ) {
							$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
						}
					}
					$data['Artist'] = $bylines;
					break;

				case '2#065':
					// Originating program.
					// Combined with program version (2:70) if present.
					$software = self::convIPTC( $val, $c );

					if ( count( $software ) !== 1 ) {
						// According to the iim standard this cannot have multiple
						// values, so if there is more than one, something weird
						// is happening, and we skip it.
						wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
						break;
					}

					if ( isset( $parsed['2#070'] ) ) {
						// A version is set for the software.
						$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
						$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
					} else {
						$data['Software'] = $software;
					}
					break;

				case '2#000':
					// iim version.
					// Unlike other tags, this is a 2-byte binary number.
					// Technically this is required if there is iptc data,
					// but in practise it isn't always there.
					if ( strlen( $val[0] ) == 2 ) {
						// The length check is just to be paranoid.
						$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
					}
					break;

				case '2#004':
					// Intellectual genre.
					// The first 4 characters are an id code
					// that we're not really interested in.

					// This prop is weird, since it's allowed to have multiple
					// values in iim 4.1, but not in the XMP stuff.
					// Only the first value is extracted.
					$con = self::convIPTC( $val, $c );
					if ( strlen( $con[0] ) < 5 ) {
						wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
						break;
					}
					$data['IntellectualGenre'] = substr( $con[0], 4 );
					break;

				case '2#012':
					// Subject News code - this is a compound field.
					// At the moment only the subject news code is extracted,
					// which is an 8 digit (ascii) number describing the
					// subject matter of the content.
					$codes = self::convIPTC( $val, $c );
					foreach ( $codes as $ic ) {
						$fields = explode( ':', $ic, 3 );

						if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
							wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
							// Stops at the first invalid entry; later codes are not read.
							break;
						}
						// Each valid entry overwrites the previous one.
						$data['SubjectNewsCode'] = $fields[1];
					}
					break;

				// purposely does not do 2:125, 2:130, 2:131,
				// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
				// 2:200, 2:201, 2:202
				// or the audio stuff (2:150 to 2:154)

				case '2#070':
				case '2#060':
				case '2#063':
				case '2#085':
				case '2#038':
				case '2#035':
					// Ignore. Handled together with the tag they belong to.
					break;

				default:
					wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
					break;
			}
		}

		return $data;
	}

	/**
	 * Convert an iptc date and time tags into the exif format
	 *
	 * When no usable time tag is given, only the date part (the first ten
	 * characters of the exif timestamp) is returned.
	 *
	 * @todo Potentially this should also capture the timezone offset.
	 * @param array $date The date tag
	 * @param array $time The time tag
	 * @param string $charset
	 * @return string|null Date in EXIF format, or null if the date/time
	 *  is missing, malformed or cannot be converted.
	 */
	private static function timeHelper( $date, $time, $charset ) {
		if ( count( $date ) !== 1 ) {
			// The standard says this should always be 1; just double checking.
			return null;
		}
		list( $date ) = self::convIPTC( $date, $charset );

		if ( count( $time ) === 1 ) {
			list( $time ) = self::convIPTC( $time, $charset );
			$dateOnly = false;
		} else {
			$time = '000000+0000'; // placeholder
			$dateOnly = true;
		}

		// Both patterns are anchored at the start because the substr() calls
		// below read fixed offsets from the beginning of the strings.
		if ( !( preg_match( '/^\d{6}[-+]\d{4}/', $time )
			&& preg_match( '/^\d{8}/', $date )
			&& substr( $date, 0, 4 ) !== '0000'
			&& substr( $date, 4, 2 ) !== '00'
			&& substr( $date, 6, 2 ) !== '00'
		) ) {
			// Something is wrong.
			// Note, this rejects some valid dates according to the iptc spec.
			// For example: the date 00000400 means the photo was taken in
			// April, but the year and day are unknown. These types of
			// incomplete dates are not processed at the moment.
			wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

			return null;
		}

		$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
		if ( $unixTS === false ) {
			wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

			return null;
		}

		// Offset in seconds, from the "+HHMM" / "-HHMM" suffix of the time tag.
		$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
			+ ( intval( substr( $time, 9, 2 ) ) * 60 );

		if ( substr( $time, 6, 1 ) === '-' ) {
			$tz = -$tz;
		}

		// NOTE(review): the offset is added to the parsed time. If the intent is
		// to turn a local time into UTC it would need to be subtracted - confirm.
		$adjusted = (int)$unixTS + $tz;

		$finalTimestamp = wfTimestamp( TS_EXIF, $adjusted );
		if ( $finalTimestamp === false ) {
			wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . $adjusted );

			return null;
		}

		return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
	}

	/**
	 * Helper function to convert charset for iptc values.
	 *
	 * An array is converted element by element with its keys preserved.
	 *
	 * @param string|array $data The iptc string
	 * @param string $charset
	 *
	 * @return string|array
	 */
	private static function convIPTC( $data, $charset ) {
		if ( !is_array( $data ) ) {
			return self::convIPTCHelper( $data, $charset );
		}

		// Iterate by value and write back by key, so no reference
		// to the last element is left behind.
		foreach ( $data as $key => $val ) {
			$data[$key] = self::convIPTCHelper( $val, $charset );
		}

		return $data;
	}

	/**
	 * Helper function of a helper function to convert charset for iptc values.
	 *
	 * With a charset, the value is run through iconv() and trimmed; a failed
	 * conversion yields an empty string. Without a charset, a value that is
	 * already valid utf-8 is returned as is (untrimmed), anything else is
	 * converted as Windows-1252.
	 *
	 * @param string $data The IPTC string
	 * @param string $charset
	 *
	 * @return string
	 */
	private static function convIPTCHelper( $data, $charset ) {
		if ( !$charset ) {
			// Treat as utf-8 if it is valid utf-8, otherwise pretend it is windows-1252.
			// Most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8.
			$oldData = $data;
			UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
			if ( $data === $oldData ) {
				// Validation didn't change $data.
				return $data;
			}

			return self::convIPTCHelper( $oldData, 'Windows-1252' );
		}

		Wikimedia\suppressWarnings();
		$data = iconv( $charset, "UTF-8//IGNORE", $data );
		Wikimedia\restoreWarnings();
		if ( $data === false ) {
			wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );

			return '';
		}

		return trim( $data );
	}

	/**
	 * take the value of 1:90 tag and returns a charset
	 *
	 * Warning, this function does not (and is not intended to) detect
	 * all iso 2022 escape codes. In practise, the code for utf-8 is the
	 * only code that seems to have wide use. It does detect that code.
	 *
	 * @param string $tag 1:90 tag.
	 * @return string|false Charset name, or false if the escape sequence is not recognised
	 */
	public static function getCharset( $tag ) {
		// According to the iim standard, the charset is defined by the tag 1:90,
		// in which there are iso 2022 escape sequences to specify the character set.
		// The iim standard seems to encourage that all necessary escape sequences are
		// in the 1:90 tag, but says it doesn't have to be.

		// This is in need of more testing probably. This is definitely not complete.
		// However, reading the docs of some other iptc software, it appears that most
		// iptc software only recognizes utf-8. If the 1:90 tag is not present, content is
		// usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.

		// This also won't work if there is more than one escape sequence in the 1:90 tag
		// or if something is put in the G2, or G3 charsets, etc. It will only reliably
		// recognize utf-8.
		if ( isset( self::$charsets[$tag] ) ) {
			return self::$charsets[$tag];
		}

		wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

		// At this point just give up; the caller refuses to parse the iptc data.
		return false;
	}
}