Apply exportTransform in backupTextPass.inc
authordaniel <daniel.kinzler@wikimedia.de>
Thu, 23 Oct 2014 11:49:59 +0000 (13:49 +0200)
committerAddshore <addshorewiki@gmail.com>
Sun, 23 Nov 2014 16:42:13 +0000 (16:42 +0000)
This is intended to help us fix 72348: Wikidata uses exportTransform to
convert old revision content to the new canonical JSON model. This patch
will make this work with the multi-pass dumping process employed to
generate dumps for dumps.wikimedia.org.

Bug: 72361
Change-Id: Ie9046d1968efc40a02a0812a536f5ef7176af7d7

maintenance/backupTextPass.inc
maintenance/fetchText.php
tests/phpunit/maintenance/DumpTestCase.php
tests/phpunit/maintenance/backupTextPassTest.php

index 5f77637..f13cb44 100644 (file)
@@ -354,6 +354,8 @@ class TextPassDumper extends BackupDumper {
                $this->lastName = "";
                $this->thisPage = 0;
                $this->thisRev = 0;
+               $this->thisRevModel = null;
+               $this->thisRevFormat = null;
 
                $parser = xml_parser_create( "UTF-8" );
                xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
@@ -421,8 +423,34 @@ class TextPassDumper extends BackupDumper {
                return true;
        }
 
+       /**
+        * Applies applicable export transformations to $text.
+        *
+        * @param string $text
+        * @param string $model
+        * @param string|null $format
+        *
+        * @return string
+        */
+       private function exportTransform( $text, $model, $format = null ) {
+               try {
+                       $handler = ContentHandler::getForModelID( $model );
+                       $text = $handler->exportTransform( $text, $format );
+               }
+               catch ( MWException $ex ) {
+                       $this->progress(
+                               "Unable to apply export transformation for content model '$model': " .
+                               $ex->getMessage()
+                       );
+               }
+
+               return $text;
+       }
+
        /**
         * Tries to get the revision text for a revision id.
+        * Export transformations are applied if the content model can is given or can be
+        * determined from the database.
         *
         * Upon errors, retries (Up to $this->maxFailures tries each call).
         * If still no good revision get could be found even after this retrying, "" is returned.
@@ -431,11 +459,14 @@ class TextPassDumper extends BackupDumper {
         * is thrown.
         *
         * @param string $id The revision id to get the text for
+        * @param string|bool|null $model The content model used to determine applicable export transformations.
+        *      If $model is null, it will be determined from the database.
+        * @param string|null $format The content format used when applying export transformations.
         *
-        * @return string The revision text for $id, or ""
         * @throws MWException
+        * @return string The revision text for $id, or ""
         */
-       function getText( $id ) {
+       function getText( $id, $model = null, $format = null ) {
                global $wgContentHandlerUseDB;
 
                $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
@@ -453,6 +484,24 @@ class TextPassDumper extends BackupDumper {
                $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
                $consecutiveFailedTextRetrievals = 0;
 
+               if ( $model === null && $wgContentHandlerUseDB ) {
+                       $row = $this->db->selectRow(
+                               'revision',
+                               array( 'rev_content_model', 'rev_content_format' ),
+                               array( 'rev_id' => $this->thisRev ),
+                               __METHOD__
+                       );
+
+                       if ( $row ) {
+                               $model = $row->rev_content_model;
+                               $format = $row->rev_content_format;
+                       }
+               }
+
+               if ( $model === null || $model === '' ) {
+                       $model = false;
+               }
+
                while ( $failures < $this->maxFailures ) {
 
                        // As soon as we found a good text for the $id, we will return immediately.
@@ -469,9 +518,19 @@ class TextPassDumper extends BackupDumper {
                                        $tryIsPrefetch = true;
                                        $text = $this->prefetch->prefetch( intval( $this->thisPage ),
                                                intval( $this->thisRev ) );
+
                                        if ( $text === null ) {
                                                $text = false;
                                        }
+
+                                       if ( is_string( $text ) && $model !== false ) {
+                                               // Apply export transformation to text coming from an old dump.
+                                               // The purpose of this transformation is to convert up from legacy
+                                               // formats, which may still be used in the older dump that is used
+                                               // for pre-fetching. Applying the transformation again should not
+                                               // interfere with content that is already in the correct form.
+                                               $text = $this->exportTransform( $text, $model, $format );
+                                       }
                                }
 
                                if ( $text === false ) {
@@ -483,6 +542,12 @@ class TextPassDumper extends BackupDumper {
                                                $text = $this->getTextDb( $id );
                                        }
 
+                                       if ( $text !== false && $model !== false ) {
+                                               // Apply export transformation to text coming from the database.
+                                               // Prefetched text should already have transformations applied.
+                                               $text = $this->exportTransform( $text, $model, $format );
+                                       }
+
                                        // No more checks for texts from DB for now.
                                        // If we received something that is not false,
                                        // We treat it as good text, regardless of whether it actually is or is not
@@ -504,21 +569,8 @@ class TextPassDumper extends BackupDumper {
                                        throw new MWException( "No database available" );
                                }
 
-                               $revLength = strlen( $text );
-                               if ( $wgContentHandlerUseDB ) {
-                                       $row = $this->db->selectRow(
-                                               'revision',
-                                               array( 'rev_len', 'rev_content_model' ),
-                                               array( 'rev_id' => $revID ),
-                                               __METHOD__
-                                       );
-                                       if ( $row ) {
-                                               // only check the length for the wikitext content handler,
-                                               // it's a wasted (and failed) check otherwise
-                                               if ( $row->rev_content_model == CONTENT_MODEL_WIKITEXT ) {
-                                                       $revLength = $row->rev_len;
-                                               }
-                                       }
+                               if ( $model !== CONTENT_MODEL_WIKITEXT ) {
+                                       $revLength = strlen( $text );
                                } else {
                                        $revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
                                }
@@ -757,7 +809,14 @@ class TextPassDumper extends BackupDumper {
                }
 
                if ( $name == "text" && isset( $attribs['id'] ) ) {
-                       $text = $this->getText( $attribs['id'] );
+                       $id = $attribs['id'];
+                       $model = trim( $this->thisRevModel );
+                       $format = trim( $this->thisRevFormat );
+
+                       $model = $model === '' ? null : $model;
+                       $format = $format === '' ? null : $format;
+
+                       $text = $this->getText( $id, $model, $format );
                        $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
                        if ( strlen( $text ) > 0 ) {
                                $this->characterData( $parser, $text );
@@ -780,6 +839,8 @@ class TextPassDumper extends BackupDumper {
                        $this->egress->writeRevision( null, $this->buffer );
                        $this->buffer = "";
                        $this->thisRev = "";
+                       $this->thisRevModel = null;
+                       $this->thisRevFormat = null;
                } elseif ( $name == 'page' ) {
                        if ( !$this->firstPageWritten ) {
                                $this->firstPageWritten = trim( $this->thisPage );
@@ -834,6 +895,13 @@ class TextPassDumper extends BackupDumper {
                                $this->thisPage .= $data;
                        }
                }
+               elseif ( $this->lastName == "model"  ) {
+                       $this->thisRevModel .= $data;
+               }
+               elseif ( $this->lastName == "format"  ) {
+                       $this->thisRevFormat .= $data;
+               }
+
                // have to skip the newline left over from closepagetag line of
                // end of checkpoint files. nasty hack!!
                if ( $this->checkpointJustWritten ) {
index fc676b8..3e2c6c9 100644 (file)
@@ -32,7 +32,8 @@ require_once __DIR__ . '/Maintenance.php';
 class FetchText extends Maintenance {
        public function __construct() {
                parent::__construct();
-               $this->mDescription = "Fetch the revision text from an old_id";
+               $this->mDescription = "Fetch the raw revision blob from an old_id.";
+               $this->mDescription .= "\nNOTE: Export transformations are NOT applied. This is left to backupTextPass.php";
        }
 
        /**
index 415e11b..9e62751 100644 (file)
@@ -30,13 +30,15 @@ abstract class DumpTestCase extends MediaWikiLangTestCase {
         *
         * @param Page $page Page to add the revision to
         * @param string $text Revisions text
-        * @param string $summary Revisions summare
-        * @return array
+        * @param string $summary Revisions summary
+        * @param string $model The model ID (defaults to wikitext)
+        *
         * @throws MWException
+        * @return array
         */
-       protected function addRevision( Page $page, $text, $summary ) {
+       protected function addRevision( Page $page, $text, $summary, $model = CONTENT_MODEL_WIKITEXT ) {
                $status = $page->doEditContent(
-                       ContentHandler::makeContent( $text, $page->getTitle() ),
+                       ContentHandler::makeContent( $text, $page->getTitle(), $model ),
                        $summary
                );
 
index e620b08..26662d5 100644 (file)
@@ -27,6 +27,10 @@ class TextPassDumperTest extends DumpTestCase {
                $this->tablesUsed[] = 'revision';
                $this->tablesUsed[] = 'text';
 
+               $this->mergeMwGlobalArrayValue( 'wgContentHandlers', array(
+                       "BackupTextPassTestModel" => "BackupTextPassTestModelHandler"
+               ) );
+
                $ns = $this->getDefaultWikitextNS();
 
                try {
@@ -61,7 +65,8 @@ class TextPassDumperTest extends DumpTestCase {
                        $this->pageId3 = $page->getId();
                        $page->doDeleteArticle( "Testing ;)" );
 
-                       // Page from non-default namespace
+                       // Page from non-default namespace and model.
+                       // ExportTransform applies.
 
                        if ( $ns === NS_TALK ) {
                                // @todo work around this.
@@ -73,7 +78,8 @@ class TextPassDumperTest extends DumpTestCase {
                        $page = WikiPage::factory( $title );
                        list( $this->revId4_1, $this->textId4_1 ) = $this->addRevision( $page,
                                "Talk about BackupDumperTestP1 Text1",
-                               "Talk BackupDumperTestP1 Summary1" );
+                               "Talk BackupDumperTestP1 Summary1",
+                               "BackupTextPassTestModel" );
                        $this->pageId4 = $page->getId();
                } catch ( Exception $e ) {
                        // We'd love to pass $e directly. However, ... see
@@ -141,7 +147,10 @@ class TextPassDumperTest extends DumpTestCase {
                $this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" );
                $this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1",
                        $this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
-                       "Talk about BackupDumperTestP1 Text1" );
+                       "TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
+                       false,
+                       "BackupTextPassTestModel",
+                       "text/plain" );
                $this->assertPageEnd();
 
                $this->assertDumpEnd();
@@ -209,7 +218,10 @@ class TextPassDumperTest extends DumpTestCase {
                $this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" );
                $this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1",
                        $this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
-                       "Talk about BackupDumperTestP1 Text1" );
+                       "TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
+                       false,
+                       "BackupTextPassTestModel",
+                       "text/plain" );
                $this->assertPageEnd();
 
                $this->assertDumpEnd();
@@ -362,7 +374,10 @@ class TextPassDumperTest extends DumpTestCase {
                                        $this->assertRevision( $this->revId4_1 + $i * self::$numOfRevs,
                                                "Talk BackupDumperTestP1 Summary1",
                                                $this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
-                                               "Talk about BackupDumperTestP1 Text1" );
+                                               "TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
+                                               false,
+                                               "BackupTextPassTestModel",
+                                               "text/plain" );
                                        $this->assertPageEnd();
 
                                        $lookingForPage = 1;
@@ -566,8 +581,8 @@ class TextPassDumperTest extends DumpTestCase {
         <ip>127.0.0.1</ip>
       </contributor>
       <comment>Talk BackupDumperTestP1 Summary1</comment>
-      <model>wikitext</model>
-      <format>text/x-wiki</format>
+      <model>BackupTextPassTestModel</model>
+      <format>text/plain</format>
       <text id="' . $this->textId4_1 . '" bytes="35" />
       <sha1>nktofwzd0tl192k3zfepmlzxoax1lpe</sha1>
     </revision>
@@ -582,3 +597,15 @@ class TextPassDumperTest extends DumpTestCase {
                return $fname;
        }
 }
+
+class BackupTextPassTestModelHandler extends TextContentHandler {
+
+       public function __construct() {
+               parent::__construct( 'BackupTextPassTestModel' );
+       }
+
+       public function exportTransform( $text, $format = null ) {
+               return strtoupper( $text );
+       }
+
+}
\ No newline at end of file