Merge "Add i18n for the infamous "Fatal exception of type MWException" errorbox"
[lhc/web/wiklou.git] / includes / Import.php
index daefb88..eb2ca77 100644 (file)
  */
 class WikiImporter {
        private $reader = null;
+       private $foreignNamespaces = null;
        private $mLogItemCallback, $mUploadCallback, $mRevisionCallback, $mPageCallback;
-       private $mSiteInfoCallback, $mTargetNamespace, $mTargetRootPage, $mPageOutCallback;
+       private $mSiteInfoCallback, $mTargetNamespace, $mPageOutCallback;
        private $mNoticeCallback, $mDebug;
        private $mImportUploads, $mImageBasePath;
        private $mNoUpdates = false;
        /** @var Config */
        private $config;
+       /** @var ImportTitleFactory */
+       private $importTitleFactory;
+       /** @var array */
+       private $countableCache = array();
 
        /**
         * Creates an ImportXMLReader drawing from the source provided
-        * @param ImportStreamSource $source
+        * @param ImportSource $source
         * @param Config $config
         */
-       function __construct( ImportStreamSource $source, Config $config = null ) {
+       function __construct( ImportSource $source, Config $config = null ) {
                $this->reader = new XMLReader();
                if ( !$config ) {
                        wfDeprecated( __METHOD__ . ' without a Config instance', '1.25' );
@@ -64,10 +69,13 @@ class WikiImporter {
                }
 
                // Default callbacks
+               $this->setPageCallback( array( $this, 'beforeImportPage' ) );
                $this->setRevisionCallback( array( $this, "importRevision" ) );
                $this->setUploadCallback( array( $this, 'importUpload' ) );
                $this->setLogItemCallback( array( $this, 'importLogItem' ) );
                $this->setPageOutCallback( array( $this, 'finishImportPage' ) );
+
+               $this->importTitleFactory = new NaiveImportTitleFactory();
        }
 
        /**
@@ -199,6 +207,15 @@ class WikiImporter {
                return $previous;
        }
 
+       /**
+        * Sets the factory object to use to convert ForeignTitle objects into local
+        * Title objects
+        * @param ImportTitleFactory $factory
+        */
+       public function setImportTitleFactory( $factory ) {
+               $this->importTitleFactory = $factory;
+       }
+
        /**
         * Set a target namespace to override the defaults
         * @param null|int $namespace
@@ -208,9 +225,16 @@ class WikiImporter {
                if ( is_null( $namespace ) ) {
                        // Don't override namespaces
                        $this->mTargetNamespace = null;
-               } elseif ( $namespace >= 0 ) {
-                       // @todo FIXME: Check for validity
-                       $this->mTargetNamespace = intval( $namespace );
+                       $this->setImportTitleFactory( new NaiveImportTitleFactory() );
+                       return true;
+               } elseif (
+                       $namespace >= 0 &&
+                       MWNamespace::exists( intval( $namespace ) )
+               ) {
+                       $namespace = intval( $namespace );
+                       $this->mTargetNamespace = $namespace;
+                       $this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );
+                       return true;
                } else {
                        return false;
                }
@@ -225,7 +249,7 @@ class WikiImporter {
                $status = Status::newGood();
                if ( is_null( $rootpage ) ) {
                        // No rootpage
-                       $this->mTargetRootPage = null;
+                       $this->setImportTitleFactory( new NaiveImportTitleFactory() );
                } elseif ( $rootpage !== '' ) {
                        $rootpage = rtrim( $rootpage, '/' ); //avoid double slashes
                        $title = Title::newFromText( $rootpage, !is_null( $this->mTargetNamespace )
@@ -244,9 +268,9 @@ class WikiImporter {
                                                : $wgContLang->getNsText( $title->getNamespace() );
                                        $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
                                } else {
-                                       // set namespace to 'all', so the namespace check in processTitle() can passed
+                                       // set namespace to 'all', so the namespace check in processTitle() can pass
                                        $this->setTargetNamespace( null );
-                                       $this->mTargetRootPage = $title->getPrefixedDBkey();
+                                       $this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );
                                }
                        }
                }
@@ -267,6 +291,19 @@ class WikiImporter {
                $this->mImportUploads = $import;
        }
 
+       /**
+        * Default per-page callback. Sets up some things related to site statistics
+        * @param array $titleAndForeignTitle Two-element array, with Title object at
+        * index 0 and ForeignTitle object at index 1
+        * @return bool
+        */
+       public function beforeImportPage( $titleAndForeignTitle ) {
+               $title = $titleAndForeignTitle[0];
+               $page = WikiPage::factory( $title );
+               $this->countableCache['title_' . $title->getPrefixedText()] = $page->isCountable();
+               return true;
+       }
+
        /**
         * Default per-revision callback, performs the import.
         * @param WikiRevision $revision
@@ -320,13 +357,34 @@ class WikiImporter {
        /**
         * Mostly for hook use
         * @param Title $title
-        * @param string $origTitle
+        * @param ForeignTitle $foreignTitle
         * @param int $revCount
         * @param int $sRevCount
         * @param array $pageInfo
         * @return bool
         */
-       public function finishImportPage( $title, $origTitle, $revCount, $sRevCount, $pageInfo ) {
+       public function finishImportPage( $title, $foreignTitle, $revCount,
+                       $sRevCount, $pageInfo ) {
+
+               // Update article count statistics (T42009)
+               // The normal counting logic in WikiPage->doEditUpdates() is designed for
+               // one-revision-at-a-time editing, not bulk imports. In this situation it
+               // suffers from issues of slave lag. We let WikiPage handle the total page
+               // and revision count, and we implement our own custom logic for the
+               // article (content page) count.
+               $page = WikiPage::factory( $title );
+               $page->loadPageData( 'fromdbmaster' );
+               $content = $page->getContent();
+               $editInfo = $page->prepareContentForEdit( $content );
+
+               $countable = $page->isCountable( $editInfo );
+               $oldcountable = $this->countableCache['title_' . $title->getPrefixedText()];
+               if ( isset( $oldcountable ) && $countable != $oldcountable ) {
+                       DeferredUpdates::addUpdate( SiteStatsUpdate::factory( array(
+                               'articles' => ( (int)$countable - (int)$oldcountable )
+                       ) ) );
+               }
+
                $args = func_get_args();
                return Hooks::run( 'AfterImportPage', $args );
        }
@@ -348,6 +406,20 @@ class WikiImporter {
                $this->debug( "-- Text: " . $revision->text );
        }
 
+       /**
+        * Notify the callback function of site info
+        * @param array $siteInfo
+        * @return bool|mixed
+        */
+       private function siteInfoCallback( $siteInfo ) {
+               if ( isset( $this->mSiteInfoCallback ) ) {
+                       return call_user_func_array( $this->mSiteInfoCallback,
+                                       array( $siteInfo, $this ) );
+               } else {
+                       return false;
+               }
+       }
+
        /**
         * Notify the callback function when a new "<page>" is reached.
         * @param Title $title
@@ -361,12 +433,13 @@ class WikiImporter {
        /**
         * Notify the callback function when a "</page>" is closed.
         * @param Title $title
-        * @param Title $origTitle
+        * @param ForeignTitle $foreignTitle
         * @param int $revCount
         * @param int $sucCount Number of revisions for which callback returned true
         * @param array $pageInfo Associative array of page information
         */
-       private function pageOutCallback( $title, $origTitle, $revCount, $sucCount, $pageInfo ) {
+       private function pageOutCallback( $title, $foreignTitle, $revCount,
+                       $sucCount, $pageInfo ) {
                if ( isset( $this->mPageOutCallback ) ) {
                        $args = func_get_args();
                        call_user_func_array( $this->mPageOutCallback, $args );
@@ -460,51 +533,76 @@ class WikiImporter {
 
                $keepReading = $this->reader->read();
                $skip = false;
-               while ( $keepReading ) {
-                       $tag = $this->reader->name;
-                       $type = $this->reader->nodeType;
-
-                       if ( !Hooks::run( 'ImportHandleToplevelXMLTag', array( $this ) ) ) {
-                               // Do nothing
-                       } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
-                               break;
-                       } elseif ( $tag == 'siteinfo' ) {
-                               $this->handleSiteInfo();
-                       } elseif ( $tag == 'page' ) {
-                               $this->handlePage();
-                       } elseif ( $tag == 'logitem' ) {
-                               $this->handleLogItem();
-                       } elseif ( $tag != '#text' ) {
-                               $this->warn( "Unhandled top-level XML tag $tag" );
-
-                               $skip = true;
-                       }
+               $rethrow = null;
+               try {
+                       while ( $keepReading ) {
+                               $tag = $this->reader->name;
+                               $type = $this->reader->nodeType;
+
+                               if ( !Hooks::run( 'ImportHandleToplevelXMLTag', array( $this ) ) ) {
+                                       // Do nothing
+                               } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
+                                       break;
+                               } elseif ( $tag == 'siteinfo' ) {
+                                       $this->handleSiteInfo();
+                               } elseif ( $tag == 'page' ) {
+                                       $this->handlePage();
+                               } elseif ( $tag == 'logitem' ) {
+                                       $this->handleLogItem();
+                               } elseif ( $tag != '#text' ) {
+                                       $this->warn( "Unhandled top-level XML tag $tag" );
+
+                                       $skip = true;
+                               }
 
-                       if ( $skip ) {
-                               $keepReading = $this->reader->next();
-                               $skip = false;
-                               $this->debug( "Skip" );
-                       } else {
-                               $keepReading = $this->reader->read();
+                               if ( $skip ) {
+                                       $keepReading = $this->reader->next();
+                                       $skip = false;
+                                       $this->debug( "Skip" );
+                               } else {
+                                       $keepReading = $this->reader->read();
+                               }
                        }
+               } catch ( Exception $ex ) {
+                       $rethrow = $ex;
                }
 
+               // finally
                libxml_disable_entity_loader( $oldDisable );
+               $this->reader->close();
+
+               if ( $rethrow ) {
+                       throw $rethrow;
+               }
+
                return true;
        }
 
-       /**
-        * @return bool
-        * @throws MWException
-        */
        private function handleSiteInfo() {
-               // Site info is useful, but not actually used for dump imports.
-               // Includes a quick short-circuit to save performance.
-               if ( !$this->mSiteInfoCallback ) {
-                       $this->reader->next();
-                       return true;
+               $this->debug( "Enter site info handler." );
+               $siteInfo = array();
+
+               // Fields that can just be stuffed in the siteInfo object
+               $normalFields = array( 'sitename', 'base', 'generator', 'case' );
+
+               while ( $this->reader->read() ) {
+                       if ( $this->reader->nodeType == XmlReader::END_ELEMENT &&
+                                       $this->reader->name == 'siteinfo' ) {
+                               break;
+                       }
+
+                       $tag = $this->reader->name;
+
+                       if ( $tag == 'namespace' ) {
+                               $this->foreignNamespaces[ $this->nodeAttribute( 'key' ) ] =
+                                       $this->nodeContents();
+                       } elseif ( in_array( $tag, $normalFields ) ) {
+                               $siteInfo[$tag] = $this->nodeContents();
+                       }
                }
-               throw new MWException( "SiteInfo tag is not yet handled, do not set mSiteInfoCallback" );
+
+               $siteInfo['_namespaces'] = $this->foreignNamespaces;
+               $this->siteInfoCallback( $siteInfo );
        }
 
        private function handleLogItem() {
@@ -574,7 +672,7 @@ class WikiImporter {
                $pageInfo = array( 'revisionCount' => 0, 'successfulRevisionCount' => 0 );
 
                // Fields that can just be stuffed in the pageInfo object
-               $normalFields = array( 'title', 'id', 'redirect', 'restrictions' );
+               $normalFields = array( 'title', 'ns', 'id', 'redirect', 'restrictions' );
 
                $skip = false;
                $badTitle = false;
@@ -585,6 +683,8 @@ class WikiImporter {
                                break;
                        }
 
+                       $skip = false;
+
                        $tag = $this->reader->name;
 
                        if ( $badTitle ) {
@@ -605,29 +705,35 @@ class WikiImporter {
                                        $pageInfo[$tag] = $this->nodeAttribute( 'title' );
                                } else {
                                        $pageInfo[$tag] = $this->nodeContents();
-                                       if ( $tag == 'title' ) {
-                                               $title = $this->processTitle( $pageInfo['title'] );
+                               }
+                       } elseif ( $tag == 'revision' || $tag == 'upload' ) {
+                               if ( !isset( $title ) ) {
+                                       $title = $this->processTitle( $pageInfo['title'],
+                                               isset( $pageInfo['ns'] ) ? $pageInfo['ns'] : null );
+
+                                       if ( !$title ) {
+                                               $badTitle = true;
+                                               $skip = true;
+                                       }
 
-                                               if ( !$title ) {
-                                                       $badTitle = true;
-                                                       $skip = true;
-                                               }
+                                       $this->pageCallback( $title );
+                                       list( $pageInfo['_title'], $foreignTitle ) = $title;
+                               }
 
-                                               $this->pageCallback( $title );
-                                               list( $pageInfo['_title'], $origTitle ) = $title;
+                               if ( $title ) {
+                                       if ( $tag == 'revision' ) {
+                                               $this->handleRevision( $pageInfo );
+                                       } else {
+                                               $this->handleUpload( $pageInfo );
                                        }
                                }
-                       } elseif ( $tag == 'revision' ) {
-                               $this->handleRevision( $pageInfo );
-                       } elseif ( $tag == 'upload' ) {
-                               $this->handleUpload( $pageInfo );
                        } elseif ( $tag != '#text' ) {
                                $this->warn( "Unhandled page XML tag $tag" );
                                $skip = true;
                        }
                }
 
-               $this->pageOutCallback( $pageInfo['_title'], $origTitle,
+               $this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
                                        $pageInfo['revisionCount'],
                                        $pageInfo['successfulRevisionCount'],
                                        $pageInfo );
@@ -852,28 +958,27 @@ class WikiImporter {
 
        /**
         * @param string $text
+        * @param string|null $ns
         * @return array|bool
         */
-       private function processTitle( $text ) {
-               $workTitle = $text;
-               $origTitle = Title::newFromText( $workTitle );
-
-               if ( !is_null( $this->mTargetNamespace ) && !is_null( $origTitle ) ) {
-                       # makeTitleSafe, because $origTitle can have a interwiki (different setting of interwiki map)
-                       # and than dbKey can begin with a lowercase char
-                       $title = Title::makeTitleSafe( $this->mTargetNamespace,
-                               $origTitle->getDBkey() );
+       private function processTitle( $text, $ns = null ) {
+               if ( is_null( $this->foreignNamespaces ) ) {
+                       $foreignTitleFactory = new NaiveForeignTitleFactory();
                } else {
-                       if ( !is_null( $this->mTargetRootPage ) ) {
-                               $workTitle = $this->mTargetRootPage . '/' . $workTitle;
-                       }
-                       $title = Title::newFromText( $workTitle );
+                       $foreignTitleFactory = new NamespaceAwareForeignTitleFactory(
+                               $this->foreignNamespaces );
                }
 
+               $foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
+                       intval( $ns ) );
+
+               $title = $this->importTitleFactory->createTitleFromForeignTitle(
+                       $foreignTitle );
+
                $commandLineMode = $this->config->get( 'CommandLineMode' );
                if ( is_null( $title ) ) {
                        # Invalid page title? Ignore the page
-                       $this->notice( 'import-error-invalid', $workTitle );
+                       $this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
                        return false;
                } elseif ( $title->isExternal() ) {
                        $this->notice( 'import-error-interwiki', $title->getPrefixedText() );
@@ -891,7 +996,7 @@ class WikiImporter {
                        return false;
                }
 
-               return array( $title, $origTitle );
+               return array( $title, $foreignTitle );
        }
 }
 
@@ -910,10 +1015,10 @@ class UploadSourceAdapter {
        private $mPosition;
 
        /**
-        * @param ImportStreamSource $source
+        * @param ImportSource $source
         * @return string
         */
-       static function registerSource( ImportStreamSource $source ) {
+       static function registerSource( ImportSource $source ) {
                $id = wfRandomString();
 
                self::$sourceRegistrations[$id] = $source;
@@ -1475,7 +1580,6 @@ class WikiRevision {
                                        $this->title->getPrefixedText() . "]], timestamp " . $this->timestamp . "\n" );
                                return false;
                        }
-                       $oldcountable = $page->isCountable();
                }
 
                # @todo FIXME: Use original rev_id optionally (better for backups)
@@ -1498,10 +1602,11 @@ class WikiRevision {
 
                if ( $changed !== false && !$this->mNoUpdates ) {
                        wfDebug( __METHOD__ . ": running updates\n" );
+                       // countable/oldcountable stuff is handled in WikiImporter::finishImportPage
                        $page->doEditUpdates(
                                $revision,
                                $userObj,
-                               array( 'created' => $created, 'oldcountable' => $oldcountable )
+                               array( 'created' => $created, 'oldcountable' => 'no-change' )
                        );
                }
 
@@ -1613,7 +1718,7 @@ class WikiRevision {
                        wfDebug( __METHOD__ . ": Successful\n" );
                        return true;
                } else {
-                       wfDebug( __METHOD__ . ': failed: ' . $status->getXml() . "\n" );
+                       wfDebug( __METHOD__ . ': failed: ' . $status->getHTML() . "\n" );
                        return false;
                }
        }
@@ -1651,6 +1756,30 @@ class WikiRevision {
 
 }
 
+/**
+ * Source interface for XML import.
+ */
+interface ImportSource {
+
+       /**
+        * Indicates whether the end of the input has been reached.
+        * Will return true after a finite number of calls to readChunk.
+        *
+        * @return bool true if there is no more input, false otherwise.
+        */
+       function atEnd();
+
+       /**
+        * Return a chunk of the input, as a (possibly empty) string.
+        * When the end of input is reached, readChunk() returns false.
+        * If atEnd() returns false, readChunk() will return a string.
+        * If atEnd() returns true, readChunk() will return false.
+        *
+        * @return bool|string
+        */
+       function readChunk();
+}
+
 /**
  * Used for importing XML dumps where the content of the dump is in a string.
  * This class is ineffecient, and should only be used for small dumps.
@@ -1658,7 +1787,7 @@ class WikiRevision {
  *
  * @ingroup SpecialPage
  */
-class ImportStringSource {
+class ImportStringSource implements ImportSource {
        function __construct( $string ) {
                $this->mString = $string;
                $this->mRead = false;
@@ -1687,7 +1816,7 @@ class ImportStringSource {
  * Imports a XML dump from a file (either from file upload, files on disk, or HTTP)
  * @ingroup SpecialPage
  */
-class ImportStreamSource {
+class ImportStreamSource implements ImportSource {
        function __construct( $handle ) {
                $this->mHandle = $handle;
        }