[MCR] populateContentTables maintenance script
authoraude <aude.wiki@gmail.com>
Fri, 12 Jan 2018 00:56:20 +0000 (19:56 -0500)
committerdaniel <daniel.kinzler@wikimedia.de>
Mon, 25 Jun 2018 13:40:01 +0000 (15:40 +0200)
This introduces a maintenance script for populating the tables
introduced by the MCR storage schema, namely:

  slots
  slot_roles
  content
  content_models

By default, both the revision and archive tables are processed.

This script is part of the MCR schema migration: after instructing
RevisionStore to write both the old and the new schema by setting
$wgMultiContentRevisionSchemaMigrationStage = MIGRATION_WRITE_BOTH,
this script can be used to back-fill the new schema for existing
revisions.

Doing this is a precondition to later setting
$wgMultiContentRevisionSchemaMigrationStage = MIGRATION_NEW to
complete the schema migration.

Bug: T182682
Change-Id: Iecc67c1b8c082be1a1039eeb52e76ad16b965226

autoload.php
includes/DefaultSettings.php
maintenance/populateContentTables.php [new file with mode: 0644]

index 77144df..46a264c 100644 (file)
@@ -1100,6 +1100,7 @@ $wgAutoloadLocalClasses = [
        'PopulateBacklinkNamespace' => __DIR__ . '/maintenance/populateBacklinkNamespace.php',
        'PopulateCategory' => __DIR__ . '/maintenance/populateCategory.php',
        'PopulateContentModel' => __DIR__ . '/maintenance/populateContentModel.php',
+       'PopulateContentTables' => __DIR__ . '/maintenance/populateContentTables.php',
        'PopulateExternallinksIndex60' => __DIR__ . '/maintenance/populateExternallinksIndex60.php',
        'PopulateFilearchiveSha1' => __DIR__ . '/maintenance/populateFilearchiveSha1.php',
        'PopulateImageSha1' => __DIR__ . '/maintenance/populateImageSha1.php',
index 562d887..23a021c 100644 (file)
@@ -8875,6 +8875,17 @@ $wgInterwikiPrefixDisplayTypes = [];
  */
 $wgCommentTableSchemaMigrationStage = MIGRATION_OLD;
 
+/**
+ * RevisionStore table schema migration stage (content, slots, content_models & slot_roles tables)
+ *
+ * @see Task: https://phabricator.wikimedia.org/T174028
+ * @see Commit: https://gerrit.wikimedia.org/r/#/c/378724/
+ *
+ * @since 1.32
+ * @var int One of the MIGRATION_* constants
+ */
+$wgMultiContentRevisionSchemaMigrationStage = MIGRATION_OLD;
+
 /**
  * Actor table schema migration stage.
  * @since 1.31
diff --git a/maintenance/populateContentTables.php b/maintenance/populateContentTables.php
new file mode 100644 (file)
index 0000000..eee534f
--- /dev/null
@@ -0,0 +1,330 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+use MediaWiki\MediaWikiServices;
+use MediaWiki\Storage\NameTableStore;
+use MediaWiki\Storage\SqlBlobStore;
+use Wikimedia\Assert\Assert;
+use Wikimedia\Rdbms\IDatabase;
+use Wikimedia\Rdbms\ResultWrapper;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Populate the content and slot tables.
+ * @since 1.32
+ */
+class PopulateContentTables extends Maintenance {
+
+       /** @var IDatabase Master connection, assigned in initServices() */
+       private $dbw;
+
+       /** @var NameTableStore Store used to acquire content model ids from model names */
+       private $contentModelStore;
+
+       /** @var int Id of the 'main' slot role, acquired in initServices() */
+       private $mainRoleId;
+
+       /**
+        * @var array|null Map "{$modelId}:{$address}" to content_id. Stays null unless
+        *  loadContentMap() is called (i.e. the reuse-content option is set).
+        */
+       private $contentRowMap = null;
+
+       /**
+        * Row counters: $count is the number of slot rows inserted for the table currently
+        * being processed (reset at the start of populateTable()), $totalCount is the number
+        * of slot rows inserted over the whole run.
+        */
+       private $count = 0, $totalCount = 0;
+
+       /**
+        * Registers the script description and its two command line options
+        * (table and reuse-content), and sets the batch size to 500.
+        */
+       public function __construct() {
+               parent::__construct();
+
+               $this->addDescription( 'Populate content and slot tables' );
+
+               $tableHelp = 'revision or archive table, or `all` to populate both';
+               $this->addOption( 'table', $tableHelp, false, true );
+
+               $reuseHelp = 'Reuse content table rows when the address and model are the same. ' .
+                       "This will increase the script's time and memory usage, perhaps significantly.";
+               $this->addOption( 'reuse-content', $reuseHelp, false, false );
+
+               $this->setBatchSize( 500 );
+       }
+
+       /**
+        * Assigns the master database connection, the content model store and the id of
+        * the 'main' slot role to the corresponding properties.
+        */
+       private function initServices() {
+               $this->dbw = $this->getDB( DB_MASTER );
+
+               $services = MediaWikiServices::getInstance();
+               $this->contentModelStore = $services->getContentModelStore();
+               $this->mainRoleId = $services->getSlotRoleStore()->acquireId( 'main' );
+       }
+
+       /**
+        * Script entry point: checks the migration stage, optionally pre-loads the existing
+        * content rows, then populates each requested table and reports the total.
+        *
+        * @return bool|null False when $wgMultiContentRevisionSchemaMigrationStage is below
+        *  MIGRATION_WRITE_BOTH; no value otherwise
+        */
+       public function execute() {
+               global $wgMultiContentRevisionSchemaMigrationStage;
+
+               $startTime = microtime( true );
+
+               // The new tables may only be written once the wiki itself writes both schemas.
+               if ( $wgMultiContentRevisionSchemaMigrationStage < MIGRATION_WRITE_BOTH ) {
+                       $this->writeln( '...cannot update while $wgMultiContentRevisionSchemaMigrationStage'
+                               . ' < MIGRATION_WRITE_BOTH' );
+                       return false;
+               }
+
+               $this->initServices();
+
+               $reuseContent = $this->getOption( 'reuse-content', false );
+               if ( $reuseContent ) {
+                       $this->loadContentMap();
+               }
+
+               $tableNames = $this->getTables();
+               foreach ( $tableNames as $tableName ) {
+                       $this->populateTable( $tableName );
+               }
+
+               $seconds = microtime( true ) - $startTime;
+               $this->writeln( "Done. Processed {$this->totalCount} rows in {$seconds} seconds" );
+       }
+
+       /**
+        * Turns the `table` option into the list of tables to process.
+        *
+        * Calls fatalError() when the option is none of `all`, `revision` or `archive`.
+        *
+        * @return string[] [ 'revision', 'archive' ] for `all` (the default when the option
+        *  is not given), otherwise a one-element list holding the requested table
+        */
+       private function getTables() {
+               $requested = $this->getOption( 'table', 'all' );
+
+               if ( !in_array( $requested, [ 'all', 'revision', 'archive' ] ) ) {
+                       $this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
+               }
+
+               return $requested === 'all' ? [ 'revision', 'archive' ] : [ $requested ];
+       }
+
+       /**
+        * Reads the content table from a replica connection, one batch at a time in
+        * content_id order, and fills $this->contentRowMap with
+        * "{content_model}:{content_address}" => content_id entries.
+        */
+       private function loadContentMap() {
+               $startTime = microtime( true );
+               $this->writeln( "Loading existing content table rows..." );
+
+               $this->contentRowMap = [];
+               $dbr = $this->getDB( DB_REPLICA );
+
+               // Highest content_id seen so far; false until the first batch has been read.
+               $lastId = false;
+               do {
+                       $res = $dbr->select(
+                               'content',
+                               [ 'content_id', 'content_address', 'content_model' ],
+                               $lastId ? "content_id > $lastId" : '',
+                               __METHOD__,
+                               [ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
+                       );
+
+                       $gotRows = $res && $res->numRows();
+                       if ( $gotRows ) {
+                               foreach ( $res as $contentRow ) {
+                                       $lastId = $contentRow->content_id;
+                                       $mapKey = "{$contentRow->content_model}:{$contentRow->content_address}";
+                                       $this->contentRowMap[$mapKey] = $contentRow->content_id;
+                               }
+                       }
+               } while ( $gotRows );
+
+               $seconds = microtime( true ) - $startTime;
+               $this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $seconds seconds" );
+       }
+
+       /**
+        * Back-fills slots and content rows for every row of the given table that has no
+        * matching slots row yet.
+        *
+        * The table's id range (MIN..MAX of rev_id or ar_rev_id) is walked in windows of
+        * getBatchSize() ids. Each window is selected with a LEFT JOIN against slots, keeping
+        * only rows without a slot, and handed to populateContentTablesForRowBatch().
+        * A table without any ids is skipped.
+        *
+        * @param string $table 'revision'; any other value is handled as the archive table
+        */
+       private function populateTable( $table ) {
+               $t0 = microtime( true );
+               $this->count = 0;
+               $this->writeln( "Populating $table..." );
+
+               if ( $table === 'revision' ) {
+                       $idField = 'rev_id';
+                       $tables = [ 'revision', 'slots', 'page' ];
+                       $fields = [
+                               'rev_id',
+                               'len' => 'rev_len',
+                               'sha1' => 'rev_sha1',
+                               'text_id' => 'rev_text_id',
+                               'content_model' => 'rev_content_model',
+                               'namespace' => 'page_namespace',
+                               'title' => 'page_title',
+                       ];
+                       $joins = [
+                               'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
+                               'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
+                       ];
+               } else {
+                       // Archive columns are aliased to the same names the revision query yields,
+                       // so the batch code can treat both tables alike.
+                       $idField = 'ar_rev_id';
+                       $tables = [ 'archive', 'slots' ];
+                       $fields = [
+                               'rev_id' => 'ar_rev_id',
+                               'len' => 'ar_len',
+                               'sha1' => 'ar_sha1',
+                               'text_id' => 'ar_text_id',
+                               'content_model' => 'ar_content_model',
+                               'namespace' => 'ar_namespace',
+                               'title' => 'ar_title',
+                       ];
+                       $joins = [
+                               'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
+                       ];
+               }
+
+               $minmax = $this->dbw->selectRow(
+                       $table,
+                       [ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
+                       '',
+                       __METHOD__
+               );
+               $batchSize = $this->getBatchSize();
+
+               // MIN()/MAX() are NULL when the table holds no ids. In PHP `null <= null` is true, so
+               // without this guard the loop would run once and build a condition with empty bounds
+               // ("$idField >= "), which is not valid SQL.
+               $hasRange = $minmax && $minmax->min !== null && $minmax->max !== null;
+               if ( !$hasRange ) {
+                       $this->writeln( "... $table has no rows, nothing to populate" );
+               }
+
+               for (
+                       $startId = $hasRange ? $minmax->min : 1;
+                       $hasRange && $startId <= $minmax->max;
+                       $startId += $batchSize
+               ) {
+                       $endId = min( $startId + $batchSize - 1, $minmax->max );
+                       $rows = $this->dbw->select(
+                               $tables,
+                               $fields,
+                               [
+                                       "$idField >= $startId",
+                                       "$idField <= $endId",
+                                       // Only rows that have no slots row yet still need populating.
+                                       'slot_revision_id IS NULL',
+                               ],
+                               __METHOD__,
+                               [ 'ORDER BY' => 'rev_id' ],
+                               $joins
+                       );
+                       if ( $rows->numRows() !== 0 ) {
+                               $this->populateContentTablesForRowBatch( $rows, $startId, $table );
+                       }
+
+                       $elapsed = microtime( true ) - $t0;
+                       $this->writeln(
+                               "... $table processed up to revision id $endId of {$minmax->max}"
+                               . " ($this->count rows in $elapsed seconds)"
+                       );
+               }
+
+               $elapsed = microtime( true ) - $t0;
+               $this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
+       }
+
+       /**
+        * Inserts the content and slots rows for one batch of revision or archive rows,
+        * inside a single transaction on $this->dbw.
+        *
+        * If an exception is thrown while processing the batch, the transaction is rolled
+        * back and fatalError() is called. The method does not return a value.
+        *
+        * @param ResultWrapper $rows Rows selected by populateTable(); rev_id, text_id, len and
+        *  sha1 are read here, the remaining fields by getContentModel()
+        * @param int $startId First id of the batch window; only used in the error message
+        * @param string $table Name of the table being processed; only used in the error message
+        */
+       private function populateContentTablesForRowBatch( ResultWrapper $rows, $startId, $table ) {
+               $this->beginTransaction( $this->dbw, __METHOD__ );
+
+               // Without a pre-loaded map, de-duplication is limited to this batch (fresh local map).
+               // With one, work on it by reference so ids found here are kept for later batches.
+               if ( $this->contentRowMap === null ) {
+                       $map = [];
+               } else {
+                       $map = &$this->contentRowMap;
+               }
+               // Maps revision id => content key, so step 3 can look up the content id per revision.
+               $contentKeys = [];
+
+               try {
+                       // Step 1: Figure out content rows needing insertion.
+                       $contentRows = [];
+                       foreach ( $rows as $row ) {
+                               $revisionId = $row->rev_id;
+
+                               Assert::invariant( $revisionId !== null, 'rev_id must not be null' );
+
+                               $modelId = $this->contentModelStore->acquireId( $this->getContentModel( $row ) );
+                               $address = SqlBlobStore::makeAddressFromTextId( $row->text_id );
+
+                               $key = "{$modelId}:{$address}";
+                               $contentKeys[$revisionId] = $key;
+
+                               if ( !isset( $map[$key] ) ) {
+                                       // false is a placeholder: isset() is true for it, so a key that occurs
+                                       // more than once in the batch only produces one content row. The real
+                                       // id replaces the placeholder in step 2.
+                                       $map[$key] = false;
+                                       $contentRows[] = [
+                                               'content_size' => (int)$row->len,
+                                               'content_sha1' => $row->sha1,
+                                               'content_model' => $modelId,
+                                               'content_address' => $address,
+                                       ];
+                               }
+                       }
+
+                       // Step 2: Insert them, then read them back in for use in the next step.
+                       if ( $contentRows ) {
+                               // Remember the highest id before the insert; everything above it is read back.
+                               $id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
+                               $this->dbw->insert( 'content', $contentRows, __METHOD__ );
+                               $res = $this->dbw->select(
+                                       'content',
+                                       [ 'content_id', 'content_model', 'content_address' ],
+                                       'content_id > ' . (int)$id,
+                                       __METHOD__
+                               );
+                               foreach ( $res as $row ) {
+                                       $key = $row->content_model . ':' . $row->content_address;
+                                       $map[$key] = $row->content_id;
+                               }
+                       }
+
+                       // Step 3: Insert the slot rows.
+                       $slotRows = [];
+                       foreach ( $rows as $row ) {
+                               $revisionId = $row->rev_id;
+                               $contentId = $map[$contentKeys[$revisionId]] ?? false;
+                               // Still the placeholder (or missing): the read-back in step 2 did not yield this key.
+                               if ( $contentId === false ) {
+                                       throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
+                               }
+                               $slotRows[] = [
+                                       'slot_revision_id' => $revisionId,
+                                       'slot_role_id' => $this->mainRoleId,
+                                       'slot_content_id' => $contentId,
+                                       // There's no way to really know the previous revision, so assume no inheriting.
+                                       // rev_parent_id can get changed on undeletions, and deletions can screw up
+                                       // rev_timestamp ordering.
+                                       'slot_origin' => $revisionId,
+                               ];
+                       }
+                       $this->dbw->insert( 'slots', $slotRows, __METHOD__ );
+                       $this->count += count( $slotRows );
+                       $this->totalCount += count( $slotRows );
+               } catch ( \Exception $e ) {
+                       $this->rollbackTransaction( $this->dbw, __METHOD__ );
+                       $this->fatalError( "Failed to populate content table $table row batch starting at $startId "
+                               . "due to exception: " . $e->__toString() );
+               }
+
+               $this->commitTransaction( $this->dbw, __METHOD__ );
+       }
+
+       /**
+        * Determines the content model name for a row selected by populateTable().
+        *
+        * @param \stdClass $row Row providing content_model, namespace and title
+        * @return string The row's content_model when it is set, otherwise the default
+        *  model for the title built from the row's namespace and title
+        */
+       private function getContentModel( $row ) {
+               return $row->content_model
+                       ?? ContentHandler::getDefaultModelFor( Title::makeTitle( $row->namespace, $row->title ) );
+       }
+
+       /**
+        * Passes a message to output() with a newline appended.
+        *
+        * @param string $msg Message text, without the trailing newline
+        */
+       private function writeln( $msg ) {
+               $this->output( $msg . "\n" );
+       }
+}
+
+// Name the class defined in this file, then include the file named by RUN_MAINTENANCE_IF_MAIN
+// (presumably the runner that instantiates $maintClass when this script is the entry point).
+$maintClass = 'PopulateContentTables';
+require_once RUN_MAINTENANCE_IF_MAIN;