allow xml page content or metadata dumps to target specific namespaces
author Ariel T. Glenn <ariel@wikimedia.org>
Wed, 17 Apr 2019 10:49:22 +0000 (13:49 +0300)
committer Ariel T. Glenn <ariel@wikimedia.org>
Thu, 18 Apr 2019 11:42:50 +0000 (14:42 +0300)
We don't alter the db query for this, but throw away the extraneous
rows before doing any processing on them whatsoever.

Filtering with DumpNamespaceFilter happens too late to avoid the
per-revision processing done in XmlDumpWriter::writeRevision.

Bug: T220940
Change-Id: I9cb30ce612d862d97d96720ac68ff2327409f485

includes/export/WikiExporter.php
maintenance/dumpBackup.php
maintenance/includes/BackupDumper.php

index e02cd83..ca63dfe 100644 (file)
@@ -83,13 +83,18 @@ class WikiExporter {
         *   - limit: maximum number of rows to return
         *   - dir: "asc" or "desc" timestamp order
         * @param int $text One of WikiExporter::TEXT or WikiExporter::STUB
+        * @param null|array $limitNamespaces Array of namespace numbers to limit
+        *   results to, or null (or empty) to apply no namespace limit
         */
-       function __construct( $db, $history = self::CURRENT, $text = self::TEXT ) {
+       function __construct( $db, $history = self::CURRENT, $text = self::TEXT,
+                       $limitNamespaces = null
+       ) {
                $this->db = $db;
                $this->history = $history;
                $this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
                $this->sink = new DumpOutput();
                $this->text = $text;
+               $this->limitNamespaces = $limitNamespaces;
        }
 
        /**
@@ -468,6 +473,11 @@ class WikiExporter {
         */
        protected function outputPageStreamBatch( $results, $lastRow ) {
                foreach ( $results as $row ) {
+                       if ( $this->limitNamespaces &&
+                               !in_array( $row->page_namespace, $this->limitNamespaces ) ) {
+                               $lastRow = $row;
+                               continue;
+                       }
                        if ( $lastRow === null ||
                                $lastRow->page_namespace !== $row->page_namespace ||
                                $lastRow->page_title !== $row->page_title ) {
index b942302..938a6d1 100644 (file)
@@ -65,6 +65,7 @@ TEXT
                $this->addOption( 'stub', 'Don\'t perform old_text lookups; for 2-pass dump' );
                $this->addOption( 'uploads', 'Include upload records without files' );
                $this->addOption( 'include-files', 'Include files within the XML stream' );
+               $this->addOption( 'namespaces', 'Limit to this comma-separated list of namespace numbers' );
 
                if ( $args ) {
                        $this->loadWithArgv( $args );
@@ -131,6 +132,11 @@ TEXT
                $this->dumpUploads = $this->hasOption( 'uploads' );
                $this->dumpUploadFileContents = $this->hasOption( 'include-files' );
                $this->orderRevs = $this->hasOption( 'orderrevs' );
+               if ( $this->hasOption( 'namespaces' ) ) {
+                       $this->limitNamespaces = explode( ',', $this->getOption( 'namespaces' ) );
+               } else {
+                       $this->limitNamespaces = null;
+               }
        }
 }
 
index 0b450a6..0118c94 100644 (file)
@@ -48,6 +48,7 @@ abstract class BackupDumper extends Maintenance {
        public $dumpUploads = false;
        public $dumpUploadFileContents = false;
        public $orderRevs = false;
+       public $limitNamespaces = [];
 
        protected $reportingInterval = 100;
        protected $pageCount = 0;
@@ -264,7 +265,7 @@ abstract class BackupDumper extends Maintenance {
                $this->initProgress( $history );
 
                $db = $this->backupDb();
-               $exporter = new WikiExporter( $db, $history, $text );
+               $exporter = new WikiExporter( $db, $history, $text, $this->limitNamespaces );
                $exporter->setSchemaVersion( $this->schemaVersion );
                $exporter->dumpUploads = $this->dumpUploads;
                $exporter->dumpUploadFileContents = $this->dumpUploadFileContents;