addDescription( 'Populate content and slot tables' );
		$this->addOption( 'table', 'revision or archive table, or `all` to populate both', false, true );
		$this->addOption( 'reuse-content',
			'Reuse content table rows when the address and model are the same. '
			. 'This will increase the script\'s time and memory usage, perhaps significantly.',
			false, false );
		$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
		$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );
		$this->setBatchSize( 500 );
	}

	/**
	 * Fetch and cache the DB_MASTER connection, the content model store and the
	 * ID of the main slot role for use by the populate methods.
	 */
	private function initServices() {
		$this->dbw = $this->getDB( DB_MASTER );
		$this->contentModelStore = MediaWikiServices::getInstance()->getContentModelStore();
		$this->mainRoleId = MediaWikiServices::getInstance()->getSlotRoleStore()
			->acquireId( SlotRecord::MAIN );
	}

	/**
	 * Entry point: populate every selected table and report the totals.
	 *
	 * @return bool False when $wgMultiContentRevisionSchemaMigrationStage lacks the
	 *  SCHEMA_COMPAT_WRITE_NEW bit (nothing is written in that case), true otherwise.
	 */
	public function execute() {
		global $wgMultiContentRevisionSchemaMigrationStage;

		$t0 = microtime( true );

		if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) === 0 ) {
			// Single-quoted string: "$" needs no escaping, and a backslash would be printed as-is.
			$this->writeln(
				'...cannot update while $wgMultiContentRevisionSchemaMigrationStage '
				. 'does not have the SCHEMA_COMPAT_WRITE_NEW bit set.'
			);
			return false;
		}

		$this->initServices();

		if ( $this->getOption( 'reuse-content', false ) ) {
			$this->loadContentMap();
		}

		foreach ( $this->getTables() as $table ) {
			$this->populateTable( $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" );
		return true;
	}

	/**
	 * Resolve the `table` option (default `all`) into the list of tables to populate.
	 *
	 * @return string[]
	 */
	private function getTables() {
		$table = $this->getOption( 'table', 'all' );
		$validTableOptions = [ 'all', 'revision', 'archive' ];

		// Strict comparison: only the exact option strings are acceptable.
		if ( !in_array( $table, $validTableOptions, true ) ) {
			$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
		}

		if ( $table === 'all' ) {
			$tables = [ 'revision', 'archive' ];
		} else {
			$tables = [ $table ];
		}

		return $tables;
	}

	/**
	 * Read the existing content table in batches and index the rows in
	 * $this->contentRowMap as "<content_model>:<content_address>" => content_id.
	 */
	private function loadContentMap() {
		$t0 = microtime( true );
		$this->writeln( "Loading existing content table rows..." );
		$this->contentRowMap = [];
		$dbr = $this->getDB( DB_REPLICA );
		// Highest content_id seen so far; false until the first batch has been read.
		$from = false;
		while ( true ) {
			$res = $dbr->select(
				'content',
				[ 'content_id', 'content_address', 'content_model' ],
				$from ? "content_id > $from" : '',
				__METHOD__,
				[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
			);
			if ( !$res || !$res->numRows() ) {
				break;
			}
			foreach ( $res as $row ) {
				$from = $row->content_id;
				$this->contentRowMap["{$row->content_model}:{$row->content_address}"] = $row->content_id;
			}
		}
		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
	}

	/**
	 * Walk the given table in ID ranges of one batch size each and create content
	 * and slot rows for every revision that has no slot row yet.
	 *
	 * @param string $table Either 'revision'; any other value is handled as 'archive'.
	 */
	private function populateTable( $table ) {
		$t0 = microtime( true );
		$this->count = 0;
		$this->writeln( "Populating $table..." );

		// Both variants alias their columns to the same names so that later code
		// can treat revision and archive rows alike.
		if ( $table === 'revision' ) {
			$idField = 'rev_id';
			$tables = [ 'revision', 'slots', 'page' ];
			$fields = [
				'rev_id',
				'len' => 'rev_len',
				'sha1' => 'rev_sha1',
				'text_id' => 'rev_text_id',
				'content_model' => 'rev_content_model',
				'namespace' => 'page_namespace',
				'title' => 'page_title',
			];
			$joins = [
				'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
				'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
			];
			$startOption = 'start-revision';
		} else {
			$idField = 'ar_rev_id';
			$tables = [ 'archive', 'slots' ];
			$fields = [
				'rev_id' => 'ar_rev_id',
				'len' => 'ar_len',
				'sha1' => 'ar_sha1',
				'text_id' => 'ar_text_id',
				'content_model' => 'ar_content_model',
				'namespace' => 'ar_namespace',
				'title' => 'ar_title',
			];
			$joins = [
				'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
			];
			$startOption = 'start-archive';
		}

		if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
			$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
			return;
		}

		$minmax = $this->dbw->selectRow(
			$table,
			[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
			'',
			__METHOD__
		);

		// Validate the query result before touching it; only a usable range may
		// have its lower bound overridden by the start option.
		if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
			// No rows?
			$minmax = (object)[ 'min' => 1, 'max' => 0 ];
		} elseif ( $this->hasOption( $startOption ) ) {
			$minmax->min = (int)$this->getOption( $startOption );
		}

		$batchSize = $this->getBatchSize();

		for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
			$endId = min( $startId + $batchSize - 1, $minmax->max );
			// 'slot_revision_id IS NULL' combined with the LEFT JOIN on slots keeps
			// only revisions without a slot row. 'rev_id' in ORDER BY is the
			// selected field name/alias in both variants.
			$rows = $this->dbw->select(
				$tables,
				$fields,
				[
					"$idField >= $startId",
					"$idField <= $endId",
					'slot_revision_id IS NULL',
				],
				__METHOD__,
				[ 'ORDER BY' => 'rev_id' ],
				$joins
			);
			if ( $rows->numRows() !== 0 ) {
				$this->populateContentTablesForRowBatch( $rows, $startId, $table );
			}

			$elapsed = microtime( true ) - $t0;
			$this->writeln(
				"... $table processed up to revision id $endId of {$minmax->max}"
				. " ($this->count rows in $elapsed seconds)"
			);
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
	}

	/**
	 * Insert the content and slot rows for one batch of revision/archive rows
	 * inside a single transaction. On an exception the transaction is rolled
	 * back and fatalError() is called.
	 *
	 * @param ResultWrapper $rows Rows using the aliased field names selected in populateTable()
	 * @param int $startId First ID of the batch; only used in the error message
	 * @param string $table Table name; only used in the error message
	 */
	private function populateContentTablesForRowBatch( ResultWrapper $rows, $startId, $table ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );

		// Without a preloaded content map, use a map local to this batch;
		// otherwise extend the shared map by reference.
		if ( $this->contentRowMap === null ) {
			$map = [];
		} else {
			$map = &$this->contentRowMap;
		}
		$contentKeys = [];

		try {
			// Step 1: Figure out content rows needing insertion.
			$contentRows = [];
			foreach ( $rows as $row ) {
				$revisionId = $row->rev_id;

				Assert::invariant( $revisionId !== null, 'rev_id must not be null' );

				$modelId = $this->contentModelStore->acquireId( $this->getContentModel( $row ) );
				$address = SqlBlobStore::makeAddressFromTextId( $row->text_id );

				$key = "{$modelId}:{$address}";
				$contentKeys[$revisionId] = $key;

				if ( !isset( $map[$key] ) ) {
					// Placeholder (isset() is true for false), so a repeated key in
					// this batch is queued only once; replaced by the real ID in step 2.
					$map[$key] = false;
					$contentRows[] = [
						'content_size' => (int)$row->len,
						'content_sha1' => $row->sha1,
						'content_model' => $modelId,
						'content_address' => $address,
					];
				}
			}

			// Step 2: Insert them, then read them back in for use in the next step.
			if ( $contentRows ) {
				$id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
				$this->dbw->insert( 'content', $contentRows, __METHOD__ );
				$res = $this->dbw->select(
					'content',
					[ 'content_id', 'content_model', 'content_address' ],
					'content_id > ' . (int)$id,
					__METHOD__
				);
				foreach ( $res as $row ) {
					$key = $row->content_model . ':' . $row->content_address;
					$map[$key] = $row->content_id;
				}
			}

			// Step 3: Insert the slot rows.
			$slotRows = [];
			foreach ( $rows as $row ) {
				$revisionId = $row->rev_id;
				$contentId = $map[$contentKeys[$revisionId]] ?? false;
				if ( $contentId === false ) {
					throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
				}
				$slotRows[] = [
					'slot_revision_id' => $revisionId,
					'slot_role_id' => $this->mainRoleId,
					'slot_content_id' => $contentId,
					// There's no way to really know the previous revision, so assume no inheriting.
					// rev_parent_id can get changed on undeletions, and deletions can screw up
					// rev_timestamp ordering.
					'slot_origin' => $revisionId,
				];
			}
			$this->dbw->insert( 'slots', $slotRows, __METHOD__ );
			$this->count += count( $slotRows );
			$this->totalCount += count( $slotRows );
		} catch ( \Exception $e ) {
			$this->rollbackTransaction( $this->dbw, __METHOD__ );
			$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
				. "due to exception: " . $e->__toString() );
		}

		$this->commitTransaction( $this->dbw, __METHOD__ );
	}

	/**
	 * Determine the content model name for a row: the stored content_model when
	 * set, otherwise the default model for the row's title.
	 *
	 * @param \stdClass $row Row providing content_model, namespace and title
	 * @return string
	 */
	private function getContentModel( $row ) {
		if ( isset( $row->content_model ) ) {
			return $row->content_model;
		}

		$title = Title::makeTitle( $row->namespace, $row->title );

		return ContentHandler::getDefaultModelFor( $title );
	}

	/**
	 * Output a message followed by a newline.
	 *
	 * @param string $msg
	 */
	private function writeln( $msg ) {
		$this->output( "$msg\n" );
	}
}

$maintClass = 'PopulateContentTables';
require_once RUN_MAINTENANCE_IF_MAIN;