API:
[lhc/web/wiklou.git] / maintenance / backupPrefetch.inc
index 759220d..413247d 100644 (file)
@@ -1,5 +1,43 @@
 <?php
 
+// PHP 5.1 removed XMLReader's global XMLREADER_* constants and replaced
+// them with XMLReader::* class constants, breaking source compatibility.
+// Define the old global names below when they are missing.
+$xmlReaderConstants = array(
+       "NONE",
+       "ELEMENT",
+       "ATTRIBUTE", 
+       "TEXT",
+       "CDATA",
+       "ENTITY_REF",
+       "ENTITY",
+       "PI",
+       "COMMENT",
+       "DOC",
+       "DOC_TYPE",
+       "DOC_FRAGMENT",
+       "NOTATION",
+       "WHITESPACE",
+       "SIGNIFICANT_WHITESPACE",
+       "END_ELEMENT",
+       "END_ENTITY",
+       "XML_DECLARATION",
+       "LOADDTD",
+       "DEFAULTATTRS",
+       "VALIDATE",
+       "SUBST_ENTITIES" );
+foreach( $xmlReaderConstants as $name ) {
+       $fullName = "XMLREADER_$name";
+       $newName = "XMLReader::$name";
+       if( !defined( $fullName ) ) {
+               if( defined( $newName ) ) {
+                       define( $fullName, constant( $newName ) );
+               } else {
+                       // XMLReader extension is missing or broken; leave the constant undefined.
+               }
+       }
+}
+
 /**
  * Readahead helper for making large MediaWiki data dumps;
  * reads in a previous XML dump to sequentially prefetch text
 class BaseDump {
        var $reader = null;
        var $atEnd = false;
+       var $atPageEnd = false;
        var $lastPage = 0;
        var $lastRev = 0;
-       
+
        function BaseDump( $infile ) {
                $this->reader = new XMLReader();
                $this->reader->open( $infile );
        }
-       
+
        /**
         * Attempts to fetch the text of a particular page revision
         * from the dump stream. May return null if the page is
@@ -36,50 +75,63 @@ class BaseDump {
         * @return string or null
         */
        function prefetch( $page, $rev ) {
+               $page = intval( $page );
+               $rev = intval( $rev );
                while( $this->lastPage < $page && !$this->atEnd ) {
+                       $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
                        $this->nextPage();
                }
                if( $this->lastPage > $page || $this->atEnd ) {
-                       $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev\n" );
+                       $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev  [$this->lastPage, $this->lastRev]" );
                        return null;
                }
-               while( $this->lastRev < $rev && !$this->atEnd ) {
+               while( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
+                       $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
                        $this->nextRev();
                }
-               if( $this->lastRev == $rev ) {
-                       $this->debug( "BaseDump::prefetch hit on $page, $rev\n" );
+               if( $this->lastRev == $rev && !$this->atEnd ) {
+                       $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
                        return $this->nextText();
                } else {
-                       $this->debug( "BaseDump::prefetch already past rev $rev on page $page\n" );
+                       $this->debug( "BaseDump::prefetch already past rev $rev on page $page  [$this->lastPage, $this->lastRev]" );
                        return null;
                }
        }
-       
+
        function debug( $str ) {
-               wfDebug( $str );
+               wfDebug( $str . "\n" );
                //global $dumper;
                //$dumper->progress( $str );
        }
-       
+
        /**
         * @access private
         */
        function nextPage() {
-               $this->skipTo( 'page' );
-               $this->skipTo( 'id' );
-               $this->lastPage = intval( $this->nodeContents() );
-               $this->lastRev = 0;
+               if( $this->skipTo( 'page', 'mediawiki' ) ) {
+                       if( $this->skipTo( 'id' ) ) {
+                               $this->lastPage = intval( $this->nodeContents() );
+                               $this->lastRev = 0;
+                               $this->atPageEnd = false;
+                       }
+               } else {
+                       $this->atEnd = true;
+               }
        }
-       
+
        /**
         * @access private
         */
        function nextRev() {
-               $this->skipTo( 'revision' );
-               $this->skipTo( 'id' );
-               $this->lastRev = intval( $this->nodeContents() );
+               if( $this->skipTo( 'revision' ) ) {
+                       if( $this->skipTo( 'id' ) ) {
+                               $this->lastRev = intval( $this->nodeContents() );
+                       }
+               } else {
+                       $this->atPageEnd = true;
+               }
        }
-       
+
        /**
         * @access private
         */
@@ -87,11 +139,11 @@ class BaseDump {
                $this->skipTo( 'text' );
                return strval( $this->nodeContents() );
        }
-       
+
        /**
         * @access private
         */
-       function skipTo( $name ) {
+       function skipTo( $name, $parent='page' ) {
                if( $this->atEnd ) {
                        return false;
                }
@@ -100,10 +152,15 @@ class BaseDump {
                                $this->reader->name == $name ) {
                                return true;
                        }
+                       if( $this->reader->nodeType == XMLREADER_END_ELEMENT &&
+                               $this->reader->name == $parent ) {
+                               $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
+                               return false;
+                       }
                }
                return $this->close();
        }
-       
+
        /**
         * Shouldn't something like this be built-in to XMLReader?
         * Fetches text contents of the current element, assuming
@@ -113,7 +170,7 @@ class BaseDump {
         */
        function nodeContents() {
                if( $this->atEnd ) {
-                       return false;
+                       return null;
                }
                if( $this->reader->isEmptyElement ) {
                        return "";
@@ -132,14 +189,14 @@ class BaseDump {
                }
                return $this->close();
        }
-       
+
        /**
         * @access private
         */
        function close() {
                $this->reader->close();
                $this->atEnd = true;
-               return false;
+               return null;
        }
 }