(Follow-up r86169) Needed to suppress warnings around iconv
[lhc/web/wiklou.git] / includes / media / JpegMetadataExtractor.php
1 <?php
2 /**
3 * Class for reading jpegs and extracting metadata.
4 * see also BitmapMetadataHandler.
5 *
6 * Based somewhat on GIFMetadataExtractor.
7 */
class JpegMetadataExtractor {
	/**
	 * Upper bound on the number of segments examined in one file.
	 *
	 * This is purely a sanity check: a jpeg file should never even remotely
	 * have that many segments. Your average file has about 10.
	 */
	const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Extract the metadata segments of interest from a jpeg file.
	 * Based on GIFMetadataExtractor.
	 *
	 * We can almost use getimagesize() to do this, but it doesn't support
	 * having multiple APP1 segments, and so can't extract XMP from files
	 * containing both Exif and XMP data.
	 *
	 * The returned array always has the keys 'XMP_ext' and 'COM' (both lists
	 * of strings, possibly empty). 'XMP' and 'PSIR' (strings) are only set
	 * when the corresponding segment was found. APP1 segments are only
	 * inspected when xml_parser_create_ns() is available; otherwise they are
	 * skipped like any other uninteresting segment.
	 *
	 * @param String $filename name of jpeg file
	 * @return Array of interesting segments.
	 * @throws MWException if given invalid file.
	 */
	public static function segmentSplitter( $filename ) {
		$showXMP = function_exists( 'xml_parser_create_ns' );

		$segmentCount = 0;

		$segments = array( 'XMP_ext' => array(), 'COM' => array() );

		if ( !$filename ) {
			throw new MWException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new MWException( "Could not open file $filename" );
		}

		// Set once EOI or SOS is seen, i.e. once we are past every segment
		// that could hold metadata.
		$reachedImageData = false;

		// try/catch + rethrow (rather than finally) so that the handle is
		// closed on every path without requiring a newer PHP version.
		try {
			$buffer = fread( $fh, 2 );
			if ( $buffer !== "\xFF\xD8" ) {
				throw new MWException( "Not a jpeg, no SOI" );
			}

			while ( !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					// this is just a sanity check
					throw new MWException( 'Too many jpeg segments. Aborting' );
				}
				if ( $buffer !== "\xFF" ) {
					throw new MWException( "Error reading jpeg file marker" );
				}

				$buffer = fread( $fh, 1 );
				// A marker may be preceded by any number of 0xFF fill bytes;
				// skip them so the byte we dispatch on is the marker code.
				// fread() returns "" at end of file, which ends this loop.
				while ( $buffer === "\xFF" ) {
					$buffer = fread( $fh, 1 );
				}

				if ( $buffer === "\xFE" ) {
					// COM section -- file comment
					$comment = self::decodeComment( self::jpegExtractMarker( $fh ) );
					if ( $comment !== null ) {
						$segments["COM"][] = $comment;
					} else {
						wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
					}
				} elseif ( $buffer === "\xE1" && $showXMP ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// only extract if XMP is enabled.
					$temp = self::jpegExtractMarker( $fh );

					// check what type of app segment this is.
					if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
						$segments["XMP"] = substr( $temp, 29 );
					} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" ) {
						$segments["XMP_ext"][] = substr( $temp, 35 );
					} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" ) {
						// Some images (especially flickr images) seem to have this.
						// I really have no idea what the deal is with them, but
						// whatever...
						$segments["XMP"] = substr( $temp, 29 );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways.\n" );
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
						$segments["PSIR"] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan.
					// either way we're past any interesting segments
					$reachedImageData = true;
					break;
				} else {
					// segment we don't care about, so skip its payload
					// (the stored size includes the two size bytes themselves)
					fseek( $fh, self::readSegmentSize( $fh ) - 2, SEEK_CUR );
				}
			}
		} catch ( Exception $e ) {
			fclose( $fh );
			throw $e;
		}
		fclose( $fh );

		if ( !$reachedImageData ) {
			// Ran out of file before seeing EOI or SOS.
			throw new MWException( "Reached end of jpeg file unexpectedly" );
		}

		return $segments;
	}

	/**
	 * Turn the raw content of a COM segment into a UTF-8 string.
	 *
	 * First see if the comment is valid UTF-8; if not, try to convert it
	 * from windows-1252. If the result is still not a valid string it is
	 * probably binary junk or some really weird encoding, so it is rejected.
	 *
	 * @param String $raw content of the COM segment
	 * @return String|null the comment, or null if it should be ignored.
	 */
	private static function decodeComment( $raw ) {
		$com = $oldCom = trim( $raw );
		// quickIsNFCVerify turns $com into valid UTF-8, thus if there is
		// no change it was UTF-8 already, otherwise it is something else.
		UtfNormal::quickIsNFCVerify( $com );
		if ( $com !== $oldCom ) {
			wfSuppressWarnings();
			$converted = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
			wfRestoreWarnings();
			if ( $converted === false ) {
				// iconv could not convert it at all; don't store `false`
				// as if it were a comment.
				return null;
			}
			$com = $oldCom = $converted;
		}
		// Try it again on the (possibly converted) string.
		UtfNormal::quickIsNFCVerify( $com );
		return ( $com === $oldCom ) ? $oldCom : null;
	}

	/**
	 * Read the two byte big-endian size field at the start of a segment.
	 *
	 * @param $fh FileHandle for jpeg file, positioned just after a marker
	 * @return int segment size, including the two size bytes (always > 2).
	 * @throws MWException if the size cannot be read or is not above 2.
	 */
	private static function readSegmentSize( $fh ) {
		$raw = fread( $fh, 2 );
		// A short read (end of file) would otherwise be handed to unpack().
		if ( $raw === false || strlen( $raw ) !== 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		$size = unpack( "nint", $raw );
		if ( $size['int'] <= 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		return $size['int'];
	}

	/**
	 * Helper function for segmentSplitter
	 * @param &$fh FileHandle for jpeg file
	 * @return data content of segment.
	 * @throws MWException if the segment size is invalid.
	 */
	private static function jpegExtractMarker( &$fh ) {
		$size = self::readSegmentSize( $fh );
		return fread( $fh, $size - 2 );
	}

	/**
	 * This reads the photoshop image resource.
	 * Currently it only compares the iptc/iim hash
	 * with the stored hash, which is used to determine the precedence
	 * of the iptc data. In future it may extract some other info, like
	 * url of copyright license.
	 *
	 * This should generally be called by BitmapMetadataHandler::doApp13()
	 *
	 * @param String $app13 photoshop psir app13 block from jpg.
	 * @return String|null one of 'iptc-no-hash', 'iptc-good-hash' or
	 *   'iptc-bad-hash'; nothing (null) if $app13 is empty.
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			return;
		}
		// First compare hash with real thing
		// 0x404 contains IPTC, 0x425 has hash
		// This is used to determine if the iptc is newer than
		// the xmp data, as xmp programs update the hash,
		// where non-xmp programs don't.

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		while ( $offset + 12 <= $appLen ) {
			// Each record is supposed to start with 8BIM, but apparently
			// sometimes doesn't, esp. in really old jpg's. Such records are
			// still stepped over, but their content is not used.
			$valid = ( substr( $app13, $offset, 4 ) === '8BIM' );
			$offset += 4;

			// id is a 2 byte id number which identifies
			// the piece of info this record contains.
			$id = substr( $app13, $offset, 2 );
			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.
			// we never use the name so skip it. +1 for length byte
			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			if ( $lenName % 2 == 1 ) {
				$lenName++; // pad to even.
			}
			$offset += $lenName;

			// A long name can carry us past the end of the block, in which
			// case there is no complete length field left to read.
			if ( $offset + 4 > $appLen ) {
				wfDebug( __METHOD__ . " PSIR name runs past end of block.\n" );
				return 'iptc-no-hash';
			}

			// now length of data (unsigned long big endian)
			$lenData = unpack( 'Nlen', substr( $app13, $offset, 4 ) );
			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				wfDebug( __METHOD__ . " PSIR data too long.\n" );
				return 'iptc-no-hash';
			}

			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// recorded md5 of the IPTC block
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 == 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		} elseif ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		} else { /* $realHash !== $recordedHash */
			return 'iptc-bad-hash';
		}
	}
}