Make XmlDumpwriter resilient to blob store corruption.
authordaniel <dkinzler@wikimedia.org>
Tue, 23 Jul 2019 11:23:41 +0000 (13:23 +0200)
committerdaniel <dkinzler@wikimedia.org>
Tue, 23 Jul 2019 11:59:57 +0000 (13:59 +0200)
In the WMF databases, we have several revisions for which we cannot
load the content. They typically (but not necessarily) have
content_address = "tt:0" and content_sha1 = "" and rev_sha1 = ""
and content_size = 0 and rev_len = 0.

This patch makes sure we can still generate dumps in the presence of
such revisions.

Bug: T228720
Change-Id: Iaadad44eb5b5fe5a4f2e60da406ffc11f39c735b

includes/export/XmlDumpWriter.php
tests/phpunit/maintenance/backup_PageTest.php

index 8535564..f34b3bd 100644 (file)
@@ -291,6 +291,34 @@ class XmlDumpWriter {
                return MediaWikiServices::getInstance()->getBlobStore();
        }
 
+       /**
+        * Invokes the given method on the given object, catching and logging any storage related
+        * exceptions.
+        *
+        * @param object $obj
+        * @param string $method
+        * @param array $args
+        * @param string $warning The warning to output in case of a storage related exception.
+        *
+        * @return mixed Returns the method's return value,
+        *         or null in case of a storage related exception.
+        * @throws Exception
+        */
+       private function invokeLenient( $obj, $method, $args = [], $warning ) {
+               try {
+                       return call_user_func_array( [ $obj, $method ], $args );
+               } catch ( SuppressedDataException $ex ) {
+                       return null;
+               } catch ( Exception $ex ) {
+                       if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
+                               MWDebug::warning( $warning . ': ' . $ex->getMessage() );
+                               return null;
+                       } else {
+                               throw $ex;
+                       }
+               }
+       }
+
        /**
         * Dumps a "<revision>" section on the output stream, with
         * data filled in from the given database row.
@@ -354,14 +382,28 @@ class XmlDumpWriter {
                if ( $rev->isDeleted( RevisionRecord::DELETED_TEXT ) ) {
                        $out .= "      <sha1/>\n";
                } else {
-                       $out .= "      " . Xml::element( 'sha1', null, strval( $rev->getSha1() ) ) . "\n";
+                       $sha1 = $this->invokeLenient(
+                               $rev,
+                               'getSha1',
+                               [],
+                               'failed to determine sha1 for revision ' . $rev->getId()
+                       );
+                       $out .= "      " . Xml::element( 'sha1', null, strval( $sha1 ) ) . "\n";
                }
 
                // Avoid PHP 7.1 warning from passing $this by reference
                $writer = $this;
                $text = '';
                if ( $contentMode === self::WRITE_CONTENT ) {
-                       $text = $rev->getContent( SlotRecord::MAIN, RevisionRecord::RAW );
+                       /** @var Content $content */
+                       $content = $this->invokeLenient(
+                               $rev,
+                               'getContent',
+                               [ SlotRecord::MAIN, RevisionRecord::RAW ],
+                               'Failed to load main slot content of revision ' . $rev->getId()
+                       );
+
+                       $text = $content ? $content->serialize() : '';
                }
                Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text, $rev ] );
 
@@ -410,37 +452,38 @@ class XmlDumpWriter {
 
                $textAttributes = [
                        'xml:space' => 'preserve',
-                       'bytes' => $slot->getSize(),
+                       'bytes' => $this->invokeLenient(
+                               $slot,
+                               'getSize',
+                               [],
+                               'failed to determine size for slot ' . $slot->getRole() . ' of revision '
+                               . $slot->getRevision()
+                       ) ?: '0'
                ];
 
                if ( $isV11 ) {
-                       $textAttributes['sha1'] = $slot->getSha1();
+                       $textAttributes['sha1'] = $this->invokeLenient(
+                               $slot,
+                               'getSha1',
+                               [],
+                               'failed to determine sha1 for slot ' . $slot->getRole() . ' of revision '
+                               . $slot->getRevision()
+                       ) ?: '';
                }
 
                if ( $contentMode === self::WRITE_CONTENT ) {
-                       try {
-                               // write <text> tag
-                               $out .= $this->writeText( $slot->getContent(), $textAttributes, $indent );
-                       } catch ( SuppressedDataException $ex ) {
-                               // NOTE: this shouldn't happen, since the caller is supposed to have checked
-                               // for suppressed content!
-                               // write <text> placeholder tag
-                               $textAttributes['deleted'] = 'deleted';
+                       $content = $this->invokeLenient(
+                               $slot,
+                               'getContent',
+                               [],
+                               'failed to load content for slot ' . $slot->getRole() . ' of revision '
+                               . $slot->getRevision()
+                       );
+
+                       if ( $content === null ) {
                                $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
-                       }
-                       catch ( Exception $ex ) {
-                               if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
-                                       // there's no provision in the schema for an attribute that will let
-                                       // the user know this element was unavailable due to error; an empty
-                                       // tag is the best we can do
-                                       $out .= $indent . Xml::element( 'text' ) . "\n";
-                                       wfLogWarning(
-                                               'failed to load content slot ' . $slot->getRole() . ' for revision '
-                                               . $slot->getRevision() . "\n"
-                                       );
-                               } else {
-                                       throw $ex;
-                               }
+                       } else {
+                               $out .= $this->writeText( $content, $textAttributes, $indent );
                        }
                } elseif ( $contentMode === self::WRITE_STUB_DELETED ) {
                        // write <text> placeholder tag
@@ -455,14 +498,21 @@ class XmlDumpWriter {
                        // Output the numerical text ID if possible, for backwards compatibility.
                        // Note that this is currently the ONLY reason we have a BlobStore here at all.
                        // When removing this line, check whether the BlobStore has become unused.
-                       $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
+                       try {
+                               // NOTE: this will only work for addresses of the form "tt:12345".
+                               // If we want to support other kinds of addresses in the future,
+                               // we will have to silently ignore failures here.
+                               // For now, this fails for "tt:0", which is present in the WMF production
+                               // database of of Juli 2019, due to data corruption.
+                               $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
+                       } catch ( InvalidArgumentException $ex ) {
+                               MWDebug::warning( 'Bad content address for slot ' . $slot->getRole()
+                                       . ' of revision ' . $slot->getRevision() . ': ' . $ex->getMessage() );
+                               $textId = 0;
+                       }
+
                        if ( $textId ) {
                                $textAttributes['id'] = $textId;
-                       } elseif ( !$isV11 ) {
-                               throw new InvalidArgumentException(
-                                       'Cannot produce stubs for non-text-table content blobs with schema version '
-                                       . $this->schemaVersion
-                               );
                        }
 
                        $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
index 7a78e52..54a362e 100644 (file)
@@ -6,6 +6,7 @@ use DumpBackup;
 use Exception;
 use MediaWiki\MediaWikiServices;
 use MediaWiki\Revision\RevisionRecord;
+use MediaWiki\Revision\SlotRecord;
 use MediaWikiTestCase;
 use MWException;
 use RequestContext;
@@ -28,13 +29,14 @@ class BackupDumperPageTest extends DumpTestCase {
 
        // We'll add several pages, revision and texts. The following variables hold the
        // corresponding ids.
-       private $pageId1, $pageId2, $pageId3, $pageId4;
-       private $pageTitle1, $pageTitle2, $pageTitle3, $pageTitle4;
+       private $pageId1, $pageId2, $pageId3, $pageId4, $pageId5;
+       private $pageTitle1, $pageTitle2, $pageTitle3, $pageTitle4, $pageTitle5;
        private $revId1_1, $textId1_1;
        private $revId2_1, $textId2_1, $revId2_2, $textId2_2;
        private $revId2_3, $textId2_3, $revId2_4, $textId2_4;
        private $revId3_1, $textId3_1, $revId3_2, $textId3_2;
        private $revId4_1, $textId4_1;
+       private $revId5_1, $textId5_1;
        private $namespace, $talk_namespace;
 
        /**
@@ -106,6 +108,15 @@ class BackupDumperPageTest extends DumpTestCase {
                                "Talk about BackupDumperTestP1 Text1",
                                "Talk BackupDumperTestP1 Summary1" );
                        $this->pageId4 = $page->getId();
+
+                       $this->pageTitle5 = Title::newFromText( 'BackupDumperTestP5' );
+                       $page = WikiPage::factory( $this->pageTitle5 );
+                       list( $this->revId5_1, $this->textId5_1 ) = $this->addRevision( $page,
+                               "BackupDumperTestP5 Text1",
+                               "BackupDumperTestP5 Summary1" );
+                       $this->pageId5 = $page->getId();
+
+                       $this->corruptRevisionData( $page->getRevision()->getRevisionRecord() );
                } catch ( Exception $e ) {
                        // We'd love to pass $e directly. However, ... see
                        // documentation of exceptionFromAddDBData in
@@ -114,6 +125,39 @@ class BackupDumperPageTest extends DumpTestCase {
                }
        }
 
+       /**
+        * Corrupt the information about the given revision in the database.
+        *
+        * @param RevisionRecord $revision
+        */
+       private function corruptRevisionData( RevisionRecord $revision ) {
+               global $wgMultiContentRevisionSchemaMigrationStage;
+
+               if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_OLD ) ) {
+                       $this->db->update(
+                               'revision',
+                               [
+                                       'rev_text_id' => 0,
+                                       'rev_sha1' => '',
+                                       'rev_len' => '0',
+                               ],
+                               [ 'rev_id' => $revision->getId() ]
+                       );
+               }
+
+               if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) ) {
+                       $this->db->update(
+                               'content',
+                               [
+                                       'content_address' => 'tt:0',
+                                       'content_sha1' => '',
+                                       'content_size' => '0',
+                               ],
+                               [ 'content_id' => $revision->getSlot( SlotRecord::MAIN )->getContentId() ]
+                       );
+               }
+       }
+
        protected function setUp() {
                parent::setUp();
 
@@ -201,11 +245,14 @@ class BackupDumperPageTest extends DumpTestCase {
                $dumper = $this->newDumpBackup(
                        [ '--full', '--quiet', '--output', 'file:' . $fname, '--schema-version', $schemaVersion ],
                        $this->pageId1,
-                       $this->pageId4 + 1
+                       $this->pageId5 + 1
                );
 
-               // Performing the dump
+               // Performing the dump. Suppress warnings, since we want to test
+               // accessing broken revision data (page 5).
+               $this->setMwGlobals( 'wgDevelopmentWarnings', false );
                $dumper->execute();
+               $this->setMwGlobals( 'wgDevelopmentWarnings', true );
 
                // Checking the dumped data
                $this->assertDumpSchema( $fname, $this->getXmlSchemaPath( $schemaVersion ) );
@@ -295,6 +342,26 @@ class BackupDumperPageTest extends DumpTestCase {
                );
                $asserter->assertPageEnd();
 
+               // Page 5 (broken revision data)
+               $asserter->assertPageStart(
+                       $this->pageId5,
+                       $this->namespace,
+                       $this->pageTitle5->getPrefixedText()
+               );
+               $asserter->assertRevision(
+                       $this->revId5_1,
+                       "BackupDumperTestP5 Summary1",
+                       null,
+                       0,
+                       "",
+                       false,
+                       false,
+                       CONTENT_MODEL_WIKITEXT,
+                       CONTENT_FORMAT_WIKITEXT,
+                       $schemaVersion
+               );
+               $asserter->assertPageEnd();
+
                $asserter->assertDumpEnd();
 
                // FIXME: add multi-slot test case!
@@ -317,11 +384,14 @@ class BackupDumperPageTest extends DumpTestCase {
                                '--schema-version', $schemaVersion,
                        ],
                        $this->pageId1,
-                       $this->pageId4 + 1
+                       $this->pageId5 + 1
                );
 
-               // Performing the dump
+               // Performing the dump. Suppress warnings, since we want to test
+               // accessing broken revision data (page 5).
+               $this->setMwGlobals( 'wgDevelopmentWarnings', false );
                $dumper->execute();
+               $this->setMwGlobals( 'wgDevelopmentWarnings', true );
 
                // Checking the dumped data
                $this->assertDumpSchema( $fname, $this->getXmlSchemaPath( $schemaVersion ) );
@@ -404,6 +474,21 @@ class BackupDumperPageTest extends DumpTestCase {
                );
                $asserter->assertPageEnd();
 
+               // Page 5 (broken revision data)
+               $asserter->assertPageStart(
+                       $this->pageId5,
+                       $this->namespace,
+                       $this->pageTitle5->getPrefixedText()
+               );
+               $asserter->assertRevision(
+                       $this->revId5_1,
+                       "BackupDumperTestP5 Summary1",
+                       null,
+                       0,
+                       ""
+               );
+               $asserter->assertPageEnd();
+
                $asserter->assertDumpEnd();
        }