From e596cac514b61107ae5c97547e5f33680e4561c6 Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 23 Oct 2014 13:49:59 +0200 Subject: [PATCH] Apply exportTransform in backupTextPass.inc This is intended to help us fix 72348: Wikidata uses exportTransform to convert old revision content to the new canonical JSON model. This patch will make this work with the multi-pass dumping process employed to generate dumps for dumps.wikimedia.org. Bug: 72361 Change-Id: Ie9046d1968efc40a02a0812a536f5ef7176af7d7 --- maintenance/backupTextPass.inc | 104 +++++++++++++++--- maintenance/fetchText.php | 3 +- tests/phpunit/maintenance/DumpTestCase.php | 10 +- .../maintenance/backupTextPassTest.php | 41 +++++-- 4 files changed, 128 insertions(+), 30 deletions(-) diff --git a/maintenance/backupTextPass.inc b/maintenance/backupTextPass.inc index 5f7763730e..f13cb449cc 100644 --- a/maintenance/backupTextPass.inc +++ b/maintenance/backupTextPass.inc @@ -354,6 +354,8 @@ class TextPassDumper extends BackupDumper { $this->lastName = ""; $this->thisPage = 0; $this->thisRev = 0; + $this->thisRevModel = null; + $this->thisRevFormat = null; $parser = xml_parser_create( "UTF-8" ); xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); @@ -421,8 +423,34 @@ class TextPassDumper extends BackupDumper { return true; } + /** + * Applies applicable export transformations to $text. + * + * @param string $text + * @param string $model + * @param string|null $format + * + * @return string + */ + private function exportTransform( $text, $model, $format = null ) { + try { + $handler = ContentHandler::getForModelID( $model ); + $text = $handler->exportTransform( $text, $format ); + } + catch ( MWException $ex ) { + $this->progress( + "Unable to apply export transformation for content model '$model': " . + $ex->getMessage() + ); + } + + return $text; + } + /** * Tries to get the revision text for a revision id. + * Export transformations are applied if the content model can is given or can be + * determined from the database. * * Upon errors, retries (Up to $this->maxFailures tries each call). * If still no good revision get could be found even after this retrying, "" is returned. @@ -431,11 +459,14 @@ class TextPassDumper extends BackupDumper { * is thrown. * * @param string $id The revision id to get the text for + * @param string|bool|null $model The content model used to determine applicable export transformations. + * If $model is null, it will be determined from the database. + * @param string|null $format The content format used when applying export transformations. * - * @return string The revision text for $id, or "" * @throws MWException + * @return string The revision text for $id, or "" */ - function getText( $id ) { + function getText( $id, $model = null, $format = null ) { global $wgContentHandlerUseDB; $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch. @@ -453,6 +484,24 @@ class TextPassDumper extends BackupDumper { $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals; $consecutiveFailedTextRetrievals = 0; + if ( $model === null && $wgContentHandlerUseDB ) { + $row = $this->db->selectRow( + 'revision', + array( 'rev_content_model', 'rev_content_format' ), + array( 'rev_id' => $this->thisRev ), + __METHOD__ + ); + + if ( $row ) { + $model = $row->rev_content_model; + $format = $row->rev_content_format; + } + } + + if ( $model === null || $model === '' ) { + $model = false; + } + while ( $failures < $this->maxFailures ) { // As soon as we found a good text for the $id, we will return immediately. @@ -469,9 +518,19 @@ class TextPassDumper extends BackupDumper { $tryIsPrefetch = true; $text = $this->prefetch->prefetch( intval( $this->thisPage ), intval( $this->thisRev ) ); + if ( $text === null ) { $text = false; } + + if ( is_string( $text ) && $model !== false ) { + // Apply export transformation to text coming from an old dump. + // The purpose of this transformation is to convert up from legacy + // formats, which may still be used in the older dump that is used + // for pre-fetching. Applying the transformation again should not + // interfere with content that is already in the correct form. + $text = $this->exportTransform( $text, $model, $format ); + } } if ( $text === false ) { @@ -483,6 +542,12 @@ class TextPassDumper extends BackupDumper { $text = $this->getTextDb( $id ); } + if ( $text !== false && $model !== false ) { + // Apply export transformation to text coming from the database. + // Prefetched text should already have transformations applied. + $text = $this->exportTransform( $text, $model, $format ); + } + // No more checks for texts from DB for now. // If we received something that is not false, // We treat it as good text, regardless of whether it actually is or is not @@ -504,21 +569,8 @@ class TextPassDumper extends BackupDumper { throw new MWException( "No database available" ); } - $revLength = strlen( $text ); - if ( $wgContentHandlerUseDB ) { - $row = $this->db->selectRow( - 'revision', - array( 'rev_len', 'rev_content_model' ), - array( 'rev_id' => $revID ), - __METHOD__ - ); - if ( $row ) { - // only check the length for the wikitext content handler, - // it's a wasted (and failed) check otherwise - if ( $row->rev_content_model == CONTENT_MODEL_WIKITEXT ) { - $revLength = $row->rev_len; - } - } + if ( $model !== CONTENT_MODEL_WIKITEXT ) { + $revLength = strlen( $text ); } else { $revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) ); } @@ -757,7 +809,14 @@ class TextPassDumper extends BackupDumper { } if ( $name == "text" && isset( $attribs['id'] ) ) { - $text = $this->getText( $attribs['id'] ); + $id = $attribs['id']; + $model = trim( $this->thisRevModel ); + $format = trim( $this->thisRevFormat ); + + $model = $model === '' ? null : $model; + $format = $format === '' ? null : $format; + + $text = $this->getText( $id, $model, $format ); $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); if ( strlen( $text ) > 0 ) { $this->characterData( $parser, $text ); @@ -780,6 +839,8 @@ class TextPassDumper extends BackupDumper { $this->egress->writeRevision( null, $this->buffer ); $this->buffer = ""; $this->thisRev = ""; + $this->thisRevModel = null; + $this->thisRevFormat = null; } elseif ( $name == 'page' ) { if ( !$this->firstPageWritten ) { $this->firstPageWritten = trim( $this->thisPage ); @@ -834,6 +895,13 @@ class TextPassDumper extends BackupDumper { $this->thisPage .= $data; } } + elseif ( $this->lastName == "model" ) { + $this->thisRevModel .= $data; + } + elseif ( $this->lastName == "format" ) { + $this->thisRevFormat .= $data; + } + // have to skip the newline left over from closepagetag line of // end of checkpoint files. nasty hack!! if ( $this->checkpointJustWritten ) { diff --git a/maintenance/fetchText.php b/maintenance/fetchText.php index fc676b89d7..3e2c6c9337 100644 --- a/maintenance/fetchText.php +++ b/maintenance/fetchText.php @@ -32,7 +32,8 @@ require_once __DIR__ . '/Maintenance.php'; class FetchText extends Maintenance { public function __construct() { parent::__construct(); - $this->mDescription = "Fetch the revision text from an old_id"; + $this->mDescription = "Fetch the raw revision blob from an old_id."; + $this->mDescription .= "\nNOTE: Export transformations are NOT applied. This is left to backupTextPass.php"; } /** diff --git a/tests/phpunit/maintenance/DumpTestCase.php b/tests/phpunit/maintenance/DumpTestCase.php index 415e11b795..9e62751ec8 100644 --- a/tests/phpunit/maintenance/DumpTestCase.php +++ b/tests/phpunit/maintenance/DumpTestCase.php @@ -30,13 +30,15 @@ abstract class DumpTestCase extends MediaWikiLangTestCase { * * @param Page $page Page to add the revision to * @param string $text Revisions text - * @param string $summary Revisions summare - * @return array + * @param string $summary Revisions summary + * @param string $model The model ID (defaults to wikitext) + * * @throws MWException + * @return array */ - protected function addRevision( Page $page, $text, $summary ) { + protected function addRevision( Page $page, $text, $summary, $model = CONTENT_MODEL_WIKITEXT ) { $status = $page->doEditContent( - ContentHandler::makeContent( $text, $page->getTitle() ), + ContentHandler::makeContent( $text, $page->getTitle(), $model ), $summary ); diff --git a/tests/phpunit/maintenance/backupTextPassTest.php b/tests/phpunit/maintenance/backupTextPassTest.php index e620b080d1..26662d5de6 100644 --- a/tests/phpunit/maintenance/backupTextPassTest.php +++ b/tests/phpunit/maintenance/backupTextPassTest.php @@ -27,6 +27,10 @@ class TextPassDumperTest extends DumpTestCase { $this->tablesUsed[] = 'revision'; $this->tablesUsed[] = 'text'; + $this->mergeMwGlobalArrayValue( 'wgContentHandlers', array( + "BackupTextPassTestModel" => "BackupTextPassTestModelHandler" + ) ); + $ns = $this->getDefaultWikitextNS(); try { @@ -61,7 +65,8 @@ class TextPassDumperTest extends DumpTestCase { $this->pageId3 = $page->getId(); $page->doDeleteArticle( "Testing ;)" ); - // Page from non-default namespace + // Page from non-default namespace and model. + // ExportTransform applies. if ( $ns === NS_TALK ) { // @todo work around this. @@ -73,7 +78,8 @@ class TextPassDumperTest extends DumpTestCase { $page = WikiPage::factory( $title ); list( $this->revId4_1, $this->textId4_1 ) = $this->addRevision( $page, "Talk about BackupDumperTestP1 Text1", - "Talk BackupDumperTestP1 Summary1" ); + "Talk BackupDumperTestP1 Summary1", + "BackupTextPassTestModel" ); $this->pageId4 = $page->getId(); } catch ( Exception $e ) { // We'd love to pass $e directly. However, ... see @@ -141,7 +147,10 @@ class TextPassDumperTest extends DumpTestCase { $this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" ); $this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1", $this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe", - "Talk about BackupDumperTestP1 Text1" ); + "TALK ABOUT BACKUPDUMPERTESTP1 TEXT1", + false, + "BackupTextPassTestModel", + "text/plain" ); $this->assertPageEnd(); $this->assertDumpEnd(); @@ -209,7 +218,10 @@ class TextPassDumperTest extends DumpTestCase { $this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" ); $this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1", $this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe", - "Talk about BackupDumperTestP1 Text1" ); + "TALK ABOUT BACKUPDUMPERTESTP1 TEXT1", + false, + "BackupTextPassTestModel", + "text/plain" ); $this->assertPageEnd(); $this->assertDumpEnd(); @@ -362,7 +374,10 @@ class TextPassDumperTest extends DumpTestCase { $this->assertRevision( $this->revId4_1 + $i * self::$numOfRevs, "Talk BackupDumperTestP1 Summary1", $this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe", - "Talk about BackupDumperTestP1 Text1" ); + "TALK ABOUT BACKUPDUMPERTESTP1 TEXT1", + false, + "BackupTextPassTestModel", + "text/plain" ); $this->assertPageEnd(); $lookingForPage = 1; @@ -566,8 +581,8 @@ class TextPassDumperTest extends DumpTestCase { 127.0.0.1 Talk BackupDumperTestP1 Summary1 - wikitext - text/x-wiki + BackupTextPassTestModel + text/plain nktofwzd0tl192k3zfepmlzxoax1lpe @@ -582,3 +597,15 @@ class TextPassDumperTest extends DumpTestCase { return $fname; } } + +class BackupTextPassTestModelHandler extends TextContentHandler { + + public function __construct() { + parent::__construct( 'BackupTextPassTestModel' ); + } + + public function exportTransform( $text, $format = null ) { + return strtoupper( $text ); + } + +} \ No newline at end of file -- 2.20.1