3 * Class for reading jpegs and extracting metadata.
4 * see also BitmapMetadataHandler.
6 * Based somewhat on GIFMetadataExtractor.
8 class JpegMetadataExtractor
{
10 const MAX_JPEG_SEGMENTS
= 200;
11 // The maximum segment count is only a sanity check.
12 // A jpeg file should never come even remotely close to
13 // that many segments. An average file has about 10.
/**
 * Extract the metadata segments of interest from a jpeg file.
 *
 * Based on GIFMetadataExtractor. getimagesize() can almost do this,
 * but it doesn't support having multiple APP1 segments, so it can't
 * extract XMP from files containing both Exif and XMP data.
 *
 * The file handle is closed on every exit path, including when an
 * exception is thrown while scanning.
 *
 * @param String $filename name of jpeg file
 * @return Array of interesting segments. The keys 'XMP_ext' and 'COM'
 *   are always present (arrays, possibly empty); 'XMP' and 'PSIR' are
 *   only set when a matching segment was found.
 * @throws MWException if given invalid file.
 */
static function segmentSplitter ( $filename ) {
	if ( !$filename ) {
		throw new MWException( "No filename specified for " . __METHOD__ );
	}
	if ( !file_exists( $filename ) || is_dir( $filename ) ) {
		throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
	}

	$fh = fopen( $filename, "rb" );
	if ( !$fh ) {
		throw new MWException( "Could not open file $filename" );
	}

	// No "finally" here: the file uses pre-5.5 syntax elsewhere, so
	// close the handle by hand on both the failure and success paths.
	try {
		$segments = self::scanSegments( $fh );
	} catch ( Exception $e ) {
		fclose( $fh );
		throw $e;
	}
	fclose( $fh );

	return $segments;
}

/**
 * Walk the marker segments of an already opened jpeg stream and collect
 * the COM, XMP, extended XMP and PSIR payloads.
 *
 * @param $fh resource File handle positioned at the very start of the file
 * @return Array same structure as returned by segmentSplitter()
 * @throws MWException if the stream is not a well-formed jpeg
 */
private static function scanSegments( $fh ) {
	// XMP segments are only collected when xml_parser_create_ns() exists.
	$showXMP = function_exists( 'xml_parser_create_ns' );

	$segmentCount = 0;

	$segments = array( 'XMP_ext' => array(), 'COM' => array() );

	$buffer = fread( $fh, 2 );
	if ( $buffer !== "\xFF\xD8" ) {
		throw new MWException( "Not a jpeg, no SOI" );
	}

	while ( !feof( $fh ) ) {
		$buffer = fread( $fh, 1 );
		$segmentCount++;
		if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
			// this is just a sanity check
			throw new MWException( 'Too many jpeg segments. Aborting' );
		}
		if ( $buffer !== "\xFF" ) {
			throw new MWException( "Error reading jpeg file marker" );
		}

		$buffer = fread( $fh, 1 );
		// A marker may be preceded by any number of 0xFF fill bytes;
		// skip them so the byte examined below is the marker code.
		while ( $buffer === "\xFF" && !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
		}

		if ( $buffer === "\xFE" ) {
			// COM section -- file comment
			// First see if valid utf-8,
			// if not try to convert it from windows-1252.
			$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
			UtfNormal::quickIsNFCVerify( $com );
			// The call above turns $com into valid utf-8. Thus if
			// nothing changed it was utf-8, otherwise it is something else.
			if ( $com !== $oldCom ) {
				$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
			}
			// Try it again. If it's still not a valid string, then it is
			// probably binary junk or some really weird encoding, so
			// don't extract. iconv() returns false on failure, which is
			// treated as garbage too.
			if ( is_string( $com ) ) {
				UtfNormal::quickIsNFCVerify( $com );
			}
			if ( is_string( $oldCom ) && $com === $oldCom ) {
				$segments["COM"][] = $oldCom;
			} else {
				wfDebug( __METHOD__ . ' Ignoring JPEG comment as is garbage.' );
			}
		} elseif ( $buffer === "\xE1" && $showXMP ) {
			// APP1 section (Exif, XMP, and XMP extended)
			// only extract if XMP is enabled.
			$temp = self::jpegExtractMarker( $fh );

			// check what type of app segment this is.
			if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
				$segments["XMP"] = substr( $temp, 29 );
			} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" ) {
				$segments["XMP_ext"][] = substr( $temp, 35 );
			} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" ) {
				// Some images (especially flickr images) seem to have this
				// malformed identifier. Use the payload anyway.
				$segments["XMP"] = substr( $temp, 29 );
				wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
					. "Using anyways.\n" );
			}
		} elseif ( $buffer === "\xED" ) {
			// APP13 - PSIR. IPTC and some photoshop stuff
			$temp = self::jpegExtractMarker( $fh );
			if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
				$segments["PSIR"] = $temp;
			}
		} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
			// EOI - end of image or SOS - start of scan.
			// Either way we're past any interesting segments.
			return $segments;
		} else {
			// segment we don't care about, so skip
			$lenBytes = fread( $fh, 2 );
			if ( $lenBytes === false || strlen( $lenBytes ) !== 2 ) {
				// Truncated length field; don't hand it to unpack().
				throw new MWException( "invalid marker size in jpeg" );
			}
			$size = unpack( "nint", $lenBytes );
			// The length counts its own two bytes, so 2 is a legal
			// (empty) segment and only smaller values are invalid.
			if ( $size['int'] < 2 ) {
				throw new MWException( "invalid marker size in jpeg" );
			}
			fseek( $fh, $size['int'] - 2, SEEK_CUR );
		}
	}

	// shouldn't get here.
	throw new MWException( "Reached end of jpeg file unexpectedly" );
}
/**
 * Helper function for segmentSplitter.
 *
 * Reads one marker segment from the current position of the stream:
 * a 2-byte big-endian length (which counts the length field itself)
 * followed by the payload.
 *
 * @param &$fh FileHandle for jpeg file
 * @return String data content of segment (empty string for a segment
 *   that has no payload).
 * @throws MWException if the length field is missing or smaller than 2,
 *   or if the file ends before the whole payload could be read.
 */
private static function jpegExtractMarker( &$fh ) {
	$lenBytes = fread( $fh, 2 );
	if ( $lenBytes === false || strlen( $lenBytes ) !== 2 ) {
		// Truncated length field; unpack() would only emit a warning.
		throw new MWException( "invalid marker size in jpeg" );
	}

	$size = unpack( "nint", $lenBytes );
	$payloadLen = $size['int'] - 2;
	if ( $payloadLen < 0 ) {
		throw new MWException( "invalid marker size in jpeg" );
	}
	if ( $payloadLen === 0 ) {
		// A length of exactly 2 is a valid, empty segment. Return early
		// because fread() must not be called with a length of 0.
		return '';
	}

	$segment = fread( $fh, $payloadLen );
	if ( $segment === false || strlen( $segment ) !== $payloadLen ) {
		throw new MWException( "Segment shorter than expected" );
	}

	return $segment;
}
/**
 * This reads the photoshop image resource.
 * Currently it only compares the iptc/iim hash
 * with the stored hash, which is used to determine the precedence
 * of the iptc data. In future it may extract some other info, like
 * url of copyright license.
 *
 * This should generally be called by BitmapMetadataHandler::doApp13()
 *
 * @param String $app13 photoshop psir app13 block from jpg.
 * @return String if the iptc hash is good or not. One of
 *   'iptc-no-hash', 'iptc-good-hash' or 'iptc-bad-hash'.
 */
public static function doPSIR ( $app13 ) {
	// First compare hash with real thing
	// 0x404 contains IPTC, 0x425 has hash
	// This is used to determine if the iptc is newer than
	// the xmp data, as xmp programs update the hash,
	// where non-xmp programs don't.

	$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
	$appLen = strlen( $app13 );
	$realHash = "";
	$recordedHash = "";

	// the +12 is the length of an empty item.
	while ( $offset + 12 <= $appLen ) {
		// The signature is supposed to be 8BIM, but apparently sometimes
		// isn't. Such a record is stepped over without being interpreted.
		$valid = ( substr( $app13, $offset, 4 ) === '8BIM' );
		$offset += 4;

		// id is a 2 byte id number which identifies
		// the piece of info this record contains.
		$id = substr( $app13, $offset, 2 );
		$offset += 2;

		// some record types can contain a name, which
		// is a pascal string 0-padded to be an even
		// number of bytes. Most times (and any time
		// we care) this is empty, making it two null bytes.
		// we never use the name so skip it. +1 for length byte
		$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
		if ( $lenName % 2 == 1 ) {
			$lenName++; // pad to even.
		}
		$offset += $lenName;

		// now length of data (unsigned long big endian)
		$lenField = substr( $app13, $offset, 4 );
		if ( !is_string( $lenField ) || strlen( $lenField ) !== 4 ) {
			// A long name can push the offset past the end of the block;
			// never hand a short string to unpack().
			wfDebug( __METHOD__ . " PSIR record truncated.\n" );
			return 'iptc-no-hash';
		}
		$lenData = unpack( 'Nlen', $lenField );
		$offset += 4; // 4bytes length field;

		// this should not happen, but check. A negative value can show up
		// where PHP integers are 32 bit; it would move $offset backwards.
		if ( $lenData['len'] < 0 || $lenData['len'] + $offset > $appLen ) {
			wfDebug( __METHOD__ . " PSIR data too long.\n" );
			return 'iptc-no-hash';
		}

		if ( $valid ) {
			if ( $id === "\x04\x04" ) {
				// IPTC block: compute the hash of the actual data.
				$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
			} elseif ( $id === "\x04\x25" ) {
				// Hash recorded by the program that wrote the file.
				$recordedHash = substr( $app13, $offset, $lenData['len'] );
			}
		}

		// if odd, add 1 to length to account for
		// the pad byte.
		if ( $lenData['len'] % 2 == 1 ) {
			$lenData['len']++;
		}
		$offset += $lenData['len'];
	}

	if ( !$realHash || !$recordedHash ) {
		return 'iptc-no-hash';
	} elseif ( $realHash === $recordedHash ) {
		return 'iptc-good-hash';
	} else { /*$realHash !== $recordedHash */
		return 'iptc-bad-hash';
	}
}