store djvu text layer in img_metadata. fetch it in proofreadpage
author ThomasV <thomasv@users.mediawiki.org>
Thu, 4 Jun 2009 09:16:25 +0000 (09:16 +0000)
committer ThomasV <thomasv@users.mediawiki.org>
Thu, 4 Jun 2009 09:16:25 +0000 (09:16 +0000)
includes/DjVuImage.php
includes/media/DjVu.php

index 8e7caf6..fbb2586 100644 (file)
@@ -224,7 +224,7 @@ class DjVuImage {
         * @return string
         */
        function retrieveMetaData() {
-               global $wgDjvuToXML, $wgDjvuDump;
+               global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
                if ( isset( $wgDjvuDump ) ) {
                        # djvudump is faster as of version 3.5
                        # http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
@@ -242,6 +242,22 @@ class DjVuImage {
                } else {
                        $xml = null;
                }
+               # Text layer
+               if ( isset( $wgDjvuTxt ) ) { 
+                       wfProfileIn( 'djvutxt' );
+                       $cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename ) ;
+                       wfDebug( __METHOD__.": $cmd\n" );
+                       $txt = wfShellExec( $cmd, $retval );
+                       wfProfileOut( 'djvutxt' );
+                       if( $retval == 0) {
+                               $txt = htmlspecialchars($txt);
+                               $txt = preg_replace( "/\(page\s\d*\s\d*\s\d*\s\d*\s*\&quot;(.*?)\&quot;\s*\)/s", "<PAGE value=\"$1\" />", $txt  );
+                               $txt = preg_replace( "/\(\)/", "<PAGE value=\"\" />", $txt );
+                               $txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
+                               $xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml );
+                               $xml = $xml . $txt. '</mw-djvu>' ;
+                       }
+               }
                return $xml;
        }
 
index 66e954d..38c16c2 100644 (file)
@@ -135,7 +135,7 @@ class DjVuHandler extends ImageHandler {
        /**
         * Cache a document tree for the DjVu XML metadata
         */
-       function getMetaTree( $image ) {
+       function getMetaTree( $image , $gettext = false ) {
                if ( isset( $image->dejaMetaTree ) ) {
                        return $image->dejaMetaTree;
                }
@@ -149,15 +149,32 @@ class DjVuHandler extends ImageHandler {
 
                wfSuppressWarnings();
                try {
-                       $image->dejaMetaTree = new SimpleXMLElement( $metadata );
-               } catch( Exception $e ) {
-                       wfDebug( "Bogus multipage XML metadata on '$image->name'\n" );
                        // Set to false rather than null to avoid further attempts
                        $image->dejaMetaTree = false;
+                       $image->djvuTextTree = false;
+                       $tree = new SimpleXMLElement( $metadata );
+                       if( $tree->getName() == 'mw-djvu' ) {
+                               foreach($tree->children() as $b){ 
+                                       if( $b->getName() == 'DjVuTxt' ) {
+                                               $image->djvuTextTree = $b;
+                                       }
+                                       else if ( $b->getName() == 'DjVuXML' ) {
+                                               $image->dejaMetaTree = $b;
+                                       }
+                               }
+                       } else {
+                               $image->dejaMetaTree = $tree;
+                       }
+               } catch( Exception $e ) {
+                       wfDebug( "Bogus multipage XML metadata on '$image->name'\n" );
                }
                wfRestoreWarnings();
                wfProfileOut( __METHOD__ );
-               return $image->dejaMetaTree;
+               if( $gettext ) {
+                       return $image->djvuTextTree;
+               } else {
+                       return $image->dejaMetaTree;
+               }
        }
 
        function getImageSize( $image, $path ) {
@@ -211,4 +228,21 @@ class DjVuHandler extends ImageHandler {
                        return false;
                }
        }
+
+       /** Text-layer string of 1-based page $page, read from the tree getMetaTree() returns in text mode; false if unavailable. */
+       function getPageText( $image, $page ){
+               $tree = $this->getMetaTree( $image, true );
+               // Bail out on a missing tree, a page number below 1, or a tree without BODY, rather than dereferencing null.
+               if ( !$tree || $page < 1 || !isset( $tree->BODY ) ) {
+                       return false;
+               }
+
+               $o = $tree->BODY[0]->PAGE[$page - 1];
+               if ( !$o ) {
+                       return false;
+               }
+               // Cast so callers get a plain string instead of a SimpleXMLElement still tied to the tree.
+               return (string)$o['value'];
+       }
+
 }