Follow-up to r111091. Don't paste md5 in the code.
[lhc/web/wiklou.git] / includes / media / JpegMetadataExtractor.php
1 <?php
/**
 * Class for reading jpegs and extracting metadata.
 * see also BitmapMetadataHandler.
 *
 * Based somewhat on GIFMetadataExtractor.
 */
class JpegMetadataExtractor {

	/**
	 * Upper bound on the number of segments read from a single file.
	 * This is only a sanity check: a jpeg file should never even remotely
	 * have that many segments. Your average file has about 10.
	 */
	const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Extract the metadata segments of interest from a jpeg file.
	 * Based on GIFMetadataExtractor.
	 *
	 * We can almost use getimagesize to do this, but gis doesn't support
	 * having multiple app1 segments, and so can't extract xmp on files
	 * containing both exif and xmp data.
	 *
	 * The returned array always has the keys 'XMP_ext', 'COM' and 'PSIR'
	 * (each a possibly empty list of strings). 'XMP' (string) and
	 * 'byteOrder' ('BE' or 'LE') are only set when the file contains the
	 * corresponding data. XMP segments are only collected when the
	 * xml_parser_create_ns() function is available.
	 *
	 * @param String $filename name of jpeg file
	 * @return Array of interesting segments.
	 * @throws MWException if given invalid file.
	 */
	static function segmentSplitter ( $filename ) {
		$showXMP = function_exists( 'xml_parser_create_ns' );

		if ( !$filename ) {
			throw new MWException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new MWException( "Could not open file $filename" );
		}

		$segments = array(
			'XMP_ext' => array(),
			'COM' => array(),
			'PSIR' => array(),
		);
		$segmentCount = 0;
		// Set once EOI or SOS is seen, i.e. once the header has been fully read.
		$done = false;

		// Everything that touches the handle lives inside this try block so
		// that the handle is closed again whether parsing succeeds or fails.
		// (A catch-and-rethrow is used rather than try/finally, since the
		// latter is not available on older PHP versions.)
		try {
			$buffer = fread( $fh, 2 );
			if ( $buffer !== "\xFF\xD8" ) {
				throw new MWException( "Not a jpeg, no SOI" );
			}

			while ( !$done && !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					// this is just a sanity check
					throw new MWException( 'Too many jpeg segments. Aborting' );
				}
				if ( $buffer === '' || $buffer === false ) {
					// Nothing left to read (e.g. the previous segment claimed
					// to be longer than the file). Fall through to the
					// "unexpected end" error below.
					break;
				}
				if ( $buffer !== "\xFF" ) {
					throw new MWException( "Error reading jpeg file marker. Expected 0xFF but got " . bin2hex( $buffer ) );
				}

				$buffer = fread( $fh, 1 );
				while ( $buffer === "\xFF" && !feof( $fh ) ) {
					// Skip through any 0xFF padding bytes.
					$buffer = fread( $fh, 1 );
				}
				if ( $buffer === '' || $buffer === false ) {
					// File ended in the middle of a marker.
					break;
				}

				if ( $buffer === "\xFE" ) {
					// COM section -- file comment
					$com = self::decodeComment( self::jpegExtractMarker( $fh ) );
					if ( $com !== null ) {
						$segments["COM"][] = $com;
					} else {
						wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
					}
				} elseif ( $buffer === "\xE1" ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// only extract XMP if XMP is enabled.
					$temp = self::jpegExtractMarker( $fh );
					// check what type of app segment this is.
					if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						$segments["XMP"] = substr( $temp, 29 );
					} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
						$segments["XMP_ext"][] = substr( $temp, 35 );
					} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// Some images (especially flickr images) seem to have this.
						// I really have no idea what the deal is with them, but
						// whatever...
						$segments["XMP"] = substr( $temp, 29 );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways.\n" );
					} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
						// Just need to find out what the byte order is.
						// This is a II for little Endian, MM for big. Not a unicode BOM.
						$byteOrderMarker = substr( $temp, 6, 2 );
						if ( $byteOrderMarker === 'MM' ) {
							$segments['byteOrder'] = 'BE';
						} elseif ( $byteOrderMarker === 'II' ) {
							$segments['byteOrder'] = 'LE';
						} else {
							wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
						}
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
						$segments["PSIR"][] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan.
					// Either way we're past any interesting segments.
					$done = true;
				} else {
					// Segment we don't care about, so skip its payload.
					// Seeking past the end of a truncated file is not reported
					// here; it is caught by the empty read on the next pass.
					fseek( $fh, self::readSegmentSize( $fh ) - 2, SEEK_CUR );
				}
			}

			if ( !$done ) {
				throw new MWException( "Reached end of jpeg file unexpectedly" );
			}
		} catch ( Exception $e ) {
			fclose( $fh );
			throw $e;
		}

		fclose( $fh );
		return $segments;
	}

	/**
	 * Turn the raw payload of a COM segment into a UTF-8 string.
	 *
	 * First see if the comment is valid utf-8; if not, try to convert it
	 * from windows-1252. If the result still is not a valid string it is
	 * probably binary junk or some really weird encoding, so it is rejected.
	 *
	 * @param String $raw payload of the COM segment
	 * @return String|null the comment, or null if it should be ignored.
	 */
	private static function decodeComment( $raw ) {
		$original = trim( $raw );

		// quickIsNFCVerify() turns its argument into valid utf-8, thus if
		// there is no change it was utf-8 already, otherwise it is something else.
		$verified = $original;
		UtfNormal::quickIsNFCVerify( $verified );
		if ( $verified === $original ) {
			return $original;
		}

		wfSuppressWarnings();
		$converted = iconv( 'windows-1252', 'UTF-8//IGNORE', $original );
		wfRestoreWarnings();
		if ( $converted === false ) {
			// iconv() gave up; never hand a boolean back as a comment.
			return null;
		}

		$verified = $converted;
		UtfNormal::quickIsNFCVerify( $verified );
		return ( $verified === $converted ) ? $converted : null;
	}

	/**
	 * Read the two byte big endian length field that follows a marker.
	 *
	 * The length counts the two bytes of the field itself, so the smallest
	 * legal value is 2 (a segment with an empty payload).
	 *
	 * @param $fh FileHandle for jpeg file, positioned just after the marker
	 * @return int length of the segment, including the length field.
	 * @throws MWException if the length is impossible.
	 */
	private static function readSegmentSize( $fh ) {
		$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
		if ( $size['int'] < 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		return $size['int'];
	}

	/**
	 * Helper function for segmentSplitter. Reads the payload of the segment
	 * whose marker has just been consumed.
	 *
	 * @param &$fh FileHandle for jpeg file
	 * @return string data content of segment.
	 * @throws MWException if the segment is malformed or truncated.
	 */
	private static function jpegExtractMarker( &$fh ) {
		$length = self::readSegmentSize( $fh ) - 2;
		if ( $length === 0 ) {
			// Empty payload. fread() does not accept a length of 0.
			return '';
		}
		$segment = fread( $fh, $length );
		if ( strlen( $segment ) !== $length ) {
			throw new MWException( "Segment shorter than expected" );
		}
		return $segment;
	}

	/**
	 * This reads the photoshop image resource.
	 * Currently it only compares the iptc/iim hash
	 * with the stored hash, which is used to determine the precedence
	 * of the iptc data. In future it may extract some other info, like
	 * url of copyright license.
	 *
	 * This should generally be called by BitmapMetadataHandler::doApp13()
	 *
	 * @param String $app13 photoshop psir app13 block from jpg.
	 * @throws MWException (It gets caught next level up though)
	 * @return String one of 'iptc-no-hash', 'iptc-good-hash' or
	 *   'iptc-bad-hash', saying if the iptc hash is good or not.
	 */
	public static function doPSIR ( $app13 ) {
		if ( !$app13 ) {
			throw new MWException( "No App13 segment given" );
		}
		// First compare hash with real thing
		// 0x404 contains IPTC, 0x425 has hash
		// This is used to determine if the iptc is newer than
		// the xmp data, as xmp programs update the hash,
		// where non-xmp programs don't.

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item:
		// 4 (signature) + 2 (id) + 2 (empty name) + 4 (data length).
		while ( $offset + 12 <= $appLen ) {
			// The signature is supposed to be 8BIM but apparently sometimes
			// isn't, esp. in really old jpg's. Such records are still
			// stepped over, but their content is not used.
			$valid = ( substr( $app13, $offset, 4 ) === '8BIM' );
			$offset += 4;

			// id is a 2 byte id number which identifies
			// the piece of info this record contains.
			$id = substr( $app13, $offset, 2 );
			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.
			// we never use the name so skip it. +1 for length byte
			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			if ( $lenName % 2 == 1 ) {
				$lenName++; // pad to even.
			}
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			$lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16 bit number.
			if ( $lenData['len'] < 0 ) {
				throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
			}

			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block: hash it ourselves (raw 16 byte digest).
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// The hash stored in the file.
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 == 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		} elseif ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		} else { /* $realHash !== $recordedHash */
			return 'iptc-bad-hash';
		}
	}
}