includes/media/JpegMetadataExtractor.php

   1 <?php
   2 /**
   3  * Extraction of JPEG image metadata.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Media
  22  */
  23
  24 use Wikimedia\XMPReader\Reader as XMPReader;
  25
  26 /**
  27  * Class for reading jpegs and extracting metadata.
  28  * see also BitmapMetadataHandler.
  29  *
  30  * Based somewhat on GIFMetadataExtractor.
  31  *
  32  * @ingroup Media
  33  */
  34 class JpegMetadataExtractor {
  35         const MAX_JPEG_SEGMENTS = 200;
  36
  37         // the max segment is a sanity check.
  38         // A jpeg file should never even remotely have
  39         // that many segments. Your average file has about 10.
  40
  41         /** Function to extract metadata segments of interest from jpeg files
  42          * based on GIFMetadataExtractor.
  43          *
  44          * we can almost use getimagesize to do this
  45          * but gis doesn't support having multiple app1 segments
  46          * and those can't extract xmp on files containing both exif and xmp data
  47          *
  48          * @param string $filename Name of jpeg file
  49          * @return array Array of interesting segments.
  50          * @throws MWException If given invalid file.
  51          */
  52         static function segmentSplitter( $filename ) {
  53                 $showXMP = XMPReader::isSupported();
  54
  55                 $segmentCount = 0;
  56
  57                 $segments = [
  58                         'XMP_ext' => [],
  59                         'COM' => [],
  60                         'PSIR' => [],
  61                 ];
  62
  63                 if ( !$filename ) {
  64                         throw new MWException( "No filename specified for " . __METHOD__ );
  65                 }
  66                 if ( !file_exists( $filename ) || is_dir( $filename ) ) {
  67                         throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
  68                 }
  69
  70                 $fh = fopen( $filename, "rb" );
  71
  72                 if ( !$fh ) {
  73                         throw new MWException( "Could not open file $filename" );
  74                 }
  75
  76                 $buffer = fread( $fh, 2 );
  77                 if ( $buffer !== "\xFF\xD8" ) {
  78                         throw new MWException( "Not a jpeg, no SOI" );
  79                 }
  80                 while ( !feof( $fh ) ) {
  81                         $buffer = fread( $fh, 1 );
  82                         $segmentCount++;
  83                         if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
  84                                 // this is just a sanity check
  85                                 throw new MWException( 'Too many jpeg segments. Aborting' );
  86                         }
  87                         while ( $buffer !== "\xFF" && !feof( $fh ) ) {
  88                                 // In theory JPEG files are not allowed to contain anything between the sections,
  89                                 // but in practice they sometimes do. It's customary to ignore the garbage data.
  90                                 $buffer = fread( $fh, 1 );
  91                         }
  92
  93                         $buffer = fread( $fh, 1 );
  94                         while ( $buffer === "\xFF" && !feof( $fh ) ) {
  95                                 // Skip through any 0xFF padding bytes.
  96                                 $buffer = fread( $fh, 1 );
  97                         }
  98                         if ( $buffer === "\xFE" ) {
  99                                 // COM section -- file comment
 100                                 // First see if valid utf-8,
 101                                 // if not try to convert it to windows-1252.
 102                                 $com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
 103                                 UtfNormal\Validator::quickIsNFCVerify( $com );
 104                                 // turns $com to valid utf-8.
 105                                 // thus if no change, its utf-8, otherwise its something else.
 106                                 if ( $com !== $oldCom ) {
 107                                         Wikimedia\suppressWarnings();
 108                                         $com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
 109                                         Wikimedia\restoreWarnings();
 110                                 }
 111                                 // Try it again, if its still not a valid string, then probably
 112                                 // binary junk or some really weird encoding, so don't extract.
 113                                 UtfNormal\Validator::quickIsNFCVerify( $com );
 114                                 if ( $com === $oldCom ) {
 115                                         $segments["COM"][] = $oldCom;
 116                                 } else {
 117                                         wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
 118                                 }
 119                         } elseif ( $buffer === "\xE1" ) {
 120                                 // APP1 section (Exif, XMP, and XMP extended)
 121                                 // only extract if XMP is enabled.
 122                                 $temp = self::jpegExtractMarker( $fh );
 123                                 // check what type of app segment this is.
 124                                 if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
 125                                         // use trim to remove trailing \0 chars
 126                                         $segments["XMP"] = trim( substr( $temp, 29 ) );
 127                                 } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
 128                                         // use trim to remove trailing \0 chars
 129                                         $segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
 130                                 } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
 131                                         // Some images (especially flickr images) seem to have this.
 132                                         // I really have no idea what the deal is with them, but
 133                                         // whatever...
 134                                         // use trim to remove trailing \0 chars
 135                                         $segments["XMP"] = trim( substr( $temp, 29 ) );
 136                                         wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
 137                                                 . "Using anyways.\n" );
 138                                 } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
 139                                         // Just need to find out what the byte order is.
 140                                         // because php's exif plugin sucks...
 141                                         // This is a II for little Endian, MM for big. Not a unicode BOM.
 142                                         $byteOrderMarker = substr( $temp, 6, 2 );
 143                                         if ( $byteOrderMarker === 'MM' ) {
 144                                                 $segments['byteOrder'] = 'BE';
 145                                         } elseif ( $byteOrderMarker === 'II' ) {
 146                                                 $segments['byteOrder'] = 'LE';
 147                                         } else {
 148                                                 wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
 149                                         }
 150                                 }
 151                         } elseif ( $buffer === "\xED" ) {
 152                                 // APP13 - PSIR. IPTC and some photoshop stuff
 153                                 $temp = self::jpegExtractMarker( $fh );
 154                                 if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
 155                                         $segments["PSIR"][] = $temp;
 156                                 }
 157                         } elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
 158                                 // EOI - end of image or SOS - start of scan. either way we're past any interesting segments
 159                                 return $segments;
 160                         } else {
 161                                 // segment we don't care about, so skip
 162                                 $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
 163                                 if ( $size['int'] < 2 ) {
 164                                         throw new MWException( "invalid marker size in jpeg" );
 165                                 }
 166                                 // Note it's possible to seek beyond end of file if truncated.
 167                                 // fseek doesn't report a failure in this case.
 168                                 fseek( $fh, $size['int'] - 2, SEEK_CUR );
 169                         }
 170                 }
 171                 // shouldn't get here.
 172                 throw new MWException( "Reached end of jpeg file unexpectedly" );
 173         }
 174
 175         /**
 176          * Helper function for jpegSegmentSplitter
 177          * @param resource &$fh File handle for JPEG file
 178          * @throws MWException
 179          * @return string Data content of segment.
 180          */
 181         private static function jpegExtractMarker( &$fh ) {
 182                 $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
 183                 if ( $size['int'] < 2 ) {
 184                         throw new MWException( "invalid marker size in jpeg" );
 185                 }
 186                 if ( $size['int'] === 2 ) {
 187                         // fread( ..., 0 ) generates a warning
 188                         return '';
 189                 }
 190                 $segment = fread( $fh, $size['int'] - 2 );
 191                 if ( strlen( $segment ) !== $size['int'] - 2 ) {
 192                         throw new MWException( "Segment shorter than expected" );
 193                 }
 194
 195                 return $segment;
 196         }
 197
 198         /**
 199          * This reads the photoshop image resource.
 200          * Currently it only compares the iptc/iim hash
 201          * with the stored hash, which is used to determine the precedence
 202          * of the iptc data. In future it may extract some other info, like
 203          * url of copyright license.
 204          *
 205          * This should generally be called by BitmapMetadataHandler::doApp13()
 206          *
 207          * @param string $app13 Photoshop psir app13 block from jpg.
 208          * @throws MWException (It gets caught next level up though)
 209          * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
 210          *   'iptc-good-hash', 'iptc-bad-hash'.
 211          */
 212         public static function doPSIR( $app13 ) {
 213                 if ( !$app13 ) {
 214                         throw new MWException( "No App13 segment given" );
 215                 }
 216                 // First compare hash with real thing
 217                 // 0x404 contains IPTC, 0x425 has hash
 218                 // This is used to determine if the iptc is newer than
 219                 // the xmp data, as xmp programs update the hash,
 220                 // where non-xmp programs don't.
 221
 222                 $offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
 223                 $appLen = strlen( $app13 );
 224                 $realHash = "";
 225                 $recordedHash = "";
 226
 227                 // the +12 is the length of an empty item.
 228                 while ( $offset + 12 <= $appLen ) {
 229                         $valid = true;
 230                         if ( substr( $app13, $offset, 4 ) !== '8BIM' ) {
 231                                 // its supposed to be 8BIM
 232                                 // but apparently sometimes isn't esp. in
 233                                 // really old jpg's
 234                                 $valid = false;
 235                         }
 236                         $offset += 4;
 237                         $id = substr( $app13, $offset, 2 );
 238                         // id is a 2 byte id number which identifies
 239                         // the piece of info this record contains.
 240
 241                         $offset += 2;
 242
 243                         // some record types can contain a name, which
 244                         // is a pascal string 0-padded to be an even
 245                         // number of bytes. Most times (and any time
 246                         // we care) this is empty, making it two null bytes.
 247
 248                         $lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
 249                         // we never use the name so skip it. +1 for length byte
 250                         if ( $lenName % 2 == 1 ) {
 251                                 $lenName++;
 252                         } // pad to even.
 253                         $offset += $lenName;
 254
 255                         // now length of data (unsigned long big endian)
 256                         $lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
 257                         // PHP can take issue with very large unsigned ints and make them negative.
 258                         // Which should never ever happen, as this has to be inside a segment
 259                         // which is limited to a 16 bit number.
 260                         if ( $lenData['len'] < 0 ) {
 261                                 throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
 262                         }
 263
 264                         $offset += 4; // 4bytes length field;
 265
 266                         // this should not happen, but check.
 267                         if ( $lenData['len'] + $offset > $appLen ) {
 268                                 throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
 269                                         . "; offset=$offset; total length=$appLen)" );
 270                         }
 271
 272                         if ( $valid ) {
 273                                 switch ( $id ) {
 274                                         case "\x04\x04":
 275                                                 // IPTC block
 276                                                 $realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
 277                                                 break;
 278                                         case "\x04\x25":
 279                                                 $recordedHash = substr( $app13, $offset, $lenData['len'] );
 280                                                 break;
 281                                 }
 282                         }
 283
 284                         // if odd, add 1 to length to account for
 285                         // null pad byte.
 286                         if ( $lenData['len'] % 2 == 1 ) {
 287                                 $lenData['len']++;
 288                         }
 289                         $offset += $lenData['len'];
 290                 }
 291
 292                 if ( !$realHash || !$recordedHash ) {
 293                         return 'iptc-no-hash';
 294                 } elseif ( $realHash === $recordedHash ) {
 295                         return 'iptc-good-hash';
 296                 } else { /*$realHash !== $recordedHash */
 297                         return 'iptc-bad-hash';
 298                 }
 299         }
 300 }