(bug 17602) fix Monobook action tabs not quite touching the page body
[lhc/web/wiklou.git] / includes / media / JpegMetadataExtractor.php
1 <?php
2 /**
3 * Extraction of JPEG image metadata.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Media
22 */
23
24 /**
25 * Class for reading jpegs and extracting metadata.
26 * see also BitmapMetadataHandler.
27 *
28 * Based somewhat on GIFMetadataExtractor.
29 *
30 * @ingroup Media
31 */
class JpegMetadataExtractor {

	/**
	 * Upper bound on the number of segments examined before giving up.
	 *
	 * This is only a sanity check. A jpeg file should never even remotely
	 * have that many segments; an average file has about 10.
	 */
	const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Extract the metadata segments of interest from a jpeg file.
	 * Based on GIFMetadataExtractor.
	 *
	 * getimagesize() can almost do this, but it doesn't support having
	 * multiple APP1 segments, so it can't extract XMP from files that
	 * contain both Exif and XMP data.
	 *
	 * The file handle opened here is always closed again, both on success
	 * and when parsing fails with an exception.
	 *
	 * @param string $filename name of jpeg file
	 * @return Array of interesting segments. Always has the keys 'XMP_ext',
	 *   'COM' and 'PSIR' (each a list of strings, possibly empty). May also
	 *   have 'XMP' (string) and 'byteOrder' (either 'BE' or 'LE').
	 * @throws MWException if given invalid file.
	 */
	public static function segmentSplitter( $filename ) {
		if ( !$filename ) {
			throw new MWException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new MWException( "Could not open file $filename" );
		}

		// Catch-and-rethrow rather than "finally", which is not available
		// in the PHP versions this file otherwise targets.
		try {
			$segments = self::readSegments( $fh, __METHOD__ );
		} catch ( Exception $e ) {
			fclose( $fh );
			throw $e;
		}
		fclose( $fh );

		return $segments;
	}

	/**
	 * Walk the segments of an already opened jpeg file and collect the
	 * ones carrying metadata. Stops at the first SOS or EOI marker.
	 *
	 * @param $fh FileHandle for jpeg file, positioned at the start of the file
	 * @param string $logPrefix prefix used for debug log messages
	 * @throws MWException if the file is not a well formed jpeg
	 * @return Array of interesting segments (see segmentSplitter)
	 */
	private static function readSegments( $fh, $logPrefix ) {
		// XMP is only extracted when it can be parsed later on.
		$showXMP = function_exists( 'xml_parser_create_ns' );

		$segmentCount = 0;

		$segments = array(
			'XMP_ext' => array(),
			'COM' => array(),
			'PSIR' => array(),
		);

		$buffer = fread( $fh, 2 );
		if ( $buffer !== "\xFF\xD8" ) {
			throw new MWException( "Not a jpeg, no SOI" );
		}

		while ( !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
			$segmentCount++;
			if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
				// this is just a sanity check
				throw new MWException( 'Too many jpeg segments. Aborting' );
			}
			if ( $buffer !== "\xFF" ) {
				throw new MWException( "Error reading jpeg file marker. Expected 0xFF but got " . bin2hex( $buffer ) );
			}

			$buffer = fread( $fh, 1 );
			while ( $buffer === "\xFF" && !feof( $fh ) ) {
				// Skip through any 0xFF padding bytes.
				$buffer = fread( $fh, 1 );
			}

			if ( $buffer === "\xFE" ) {
				// COM section -- file comment
				$comment = self::decodeComment( self::jpegExtractMarker( $fh ) );
				if ( $comment !== null ) {
					$segments["COM"][] = $comment;
				} else {
					wfDebug( $logPrefix . " Ignoring JPEG comment as is garbage.\n" );
				}
			} elseif ( $buffer === "\xE1" ) {
				// APP1 section (Exif, XMP, and XMP extended)
				// XMP is only extracted if XMP is enabled.
				$temp = self::jpegExtractMarker( $fh );
				// check what type of app segment this is.
				if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					$segments["XMP"] = substr( $temp, 29 );
				} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
					$segments["XMP_ext"][] = substr( $temp, 35 );
				} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
					// Some images (especially flickr images) seem to have
					// this malformed identifier. Accept it anyway.
					$segments["XMP"] = substr( $temp, 29 );
					wfDebug( $logPrefix . ' Found XMP section with wrong app identifier '
						. "Using anyways.\n" );
				} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
					// Only the byte order is needed from the Exif segment.
					// This is II for little endian, MM for big endian.
					// It is not a unicode BOM.
					$byteOrderMarker = substr( $temp, 6, 2 );
					if ( $byteOrderMarker === 'MM' ) {
						$segments['byteOrder'] = 'BE';
					} elseif ( $byteOrderMarker === 'II' ) {
						$segments['byteOrder'] = 'LE';
					} else {
						wfDebug( $logPrefix . " Invalid byte ordering?!\n" );
					}
				}
			} elseif ( $buffer === "\xED" ) {
				// APP13 - PSIR. IPTC and some photoshop stuff
				$temp = self::jpegExtractMarker( $fh );
				if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
					$segments["PSIR"][] = $temp;
				}
			} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
				// EOI - end of image or SOS - start of scan.
				// Either way we're past any interesting segments.
				return $segments;
			} else {
				// segment we don't care about, so skip
				$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
				// The length field counts its own two bytes, so 2 is the
				// smallest legal value (a segment with no payload).
				if ( $size['int'] < 2 ) {
					throw new MWException( "invalid marker size in jpeg" );
				}
				fseek( $fh, $size['int'] - 2, SEEK_CUR );
			}
		}

		// shouldn't get here.
		throw new MWException( "Reached end of jpeg file unexpectedly" );
	}

	/**
	 * Turn the raw content of a COM segment into a UTF-8 string.
	 *
	 * First see if the comment is valid UTF-8; if not, try to convert it
	 * from windows-1252. If the result still does not verify, the comment
	 * is probably binary junk or some really weird encoding.
	 *
	 * @param string $raw content of the COM segment
	 * @return string|null the comment, or null if it should be ignored
	 */
	private static function decodeComment( $raw ) {
		$com = $oldCom = trim( $raw );
		// quickIsNFCVerify turns $com into valid UTF-8, thus if there is
		// no change it was UTF-8 already, otherwise it is something else.
		UtfNormal::quickIsNFCVerify( $com );
		if ( $com !== $oldCom ) {
			wfSuppressWarnings();
			$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
			wfRestoreWarnings();
			if ( $com === false ) {
				// iconv() gave up entirely; never record false as a comment.
				return null;
			}
		}
		// Try it again; if it is still not a valid string, don't extract.
		UtfNormal::quickIsNFCVerify( $com );

		return $com === $oldCom ? $oldCom : null;
	}

	/**
	 * Helper function for segmentSplitter. Reads the two byte length field
	 * at the current position and then the segment content that follows it.
	 *
	 * @param &$fh FileHandle for jpeg file
	 * @throws MWException if the length field is invalid or the file ends
	 *   before the whole segment could be read
	 * @return string data content of segment (without the length field).
	 */
	private static function jpegExtractMarker( &$fh ) {
		$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
		// The length field counts its own two bytes, so anything below 2
		// cannot be valid.
		if ( $size['int'] < 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		if ( $size['int'] === 2 ) {
			// Empty segment. fread() does not accept a length of 0.
			return '';
		}
		$segment = fread( $fh, $size['int'] - 2 );
		if ( strlen( $segment ) !== $size['int'] - 2 ) {
			throw new MWException( "Segment shorter than expected" );
		}
		return $segment;
	}

	/**
	 * This reads the photoshop image resource.
	 * Currently it only compares the iptc/iim hash
	 * with the stored hash, which is used to determine the precedence
	 * of the iptc data. In future it may extract some other info, like
	 * url of copyright license.
	 *
	 * This should generally be called by BitmapMetadataHandler::doApp13()
	 *
	 * @param string $app13 photoshop psir app13 block from jpg.
	 * @throws MWException (It gets caught next level up though)
	 * @return String one of 'iptc-no-hash' (IPTC block or stored hash
	 *   missing), 'iptc-good-hash' (stored hash matches the IPTC block)
	 *   or 'iptc-bad-hash' (it does not match).
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			throw new MWException( "No App13 segment given" );
		}
		// First compare hash with real thing
		// 0x404 contains IPTC, 0x425 has hash
		// This is used to determine if the iptc is newer than
		// the xmp data, as xmp programs update the hash,
		// where non-xmp programs don't.

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		while ( $offset + 12 <= $appLen ) {
			// It's supposed to be 8BIM, but apparently sometimes isn't,
			// esp. in really old jpg's. Such items are still stepped
			// over, but their content is not looked at.
			$valid = substr( $app13, $offset, 4 ) === '8BIM';
			$offset += 4;

			// id is a 2 byte id number which identifies
			// the piece of info this record contains.
			$id = substr( $app13, $offset, 2 );
			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.

			// we never use the name so skip it. +1 for length byte
			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			if ( $lenName % 2 == 1 ) {
				// pad to even.
				$lenName++;
			}
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			$lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16 bit number.
			if ( $lenData['len'] < 0 ) {
				throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
			}

			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// hash of the IPTC block as recorded in the file
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 == 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		} elseif ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		} else { /* $realHash !== $recordedHash */
			return 'iptc-bad-hash';
		}
	}
}