3325b054566d263143c794cf0a4aa0f2d21997a6
[lhc/web/wiklou.git] / maintenance / populateContentTables.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 * @ingroup Maintenance
20 */
21
22 use MediaWiki\MediaWikiServices;
23 use MediaWiki\Revision\SlotRecord;
24 use MediaWiki\Storage\BlobStore;
25 use MediaWiki\Storage\NameTableStore;
26 use MediaWiki\Storage\SqlBlobStore;
27 use Wikimedia\Assert\Assert;
28 use Wikimedia\Rdbms\IDatabase;
29 use Wikimedia\Rdbms\IResultWrapper;
30
31 require_once __DIR__ . '/Maintenance.php';
32
/**
 * Maintenance script that populates the content and slots tables.
 *
 * For every row of the revision and/or archive table that has no matching
 * slots row yet, a content row is created (or reused when --reuse-content is
 * given) and a slots row for the main role is inserted that points to it.
 *
 * @since 1.32
 */
class PopulateContentTables extends Maintenance {

	/** @var IDatabase Connection used for all writes and for the batch reads */
	private $dbw;

	/** @var NameTableStore Used to turn content model names into numeric ids */
	private $contentModelStore;

	/** @var BlobStore Used to load blobs when len/sha1 have to be recomputed */
	private $blobStore;

	/** @var int Id of the main slot role, acquired in initServices() */
	private $mainRoleId;

	/**
	 * @var array|null Map "{$modelId}:{$address}" to content_id.
	 *  Stays null unless --reuse-content was given; in that case it is preloaded by
	 *  loadContentMap() and shared between all batches.
	 */
	private $contentRowMap = null;

	/** @var int Number of slot rows inserted for the table currently being processed */
	private $count = 0;

	/** @var int Number of slot rows inserted over the whole run */
	private $totalCount = 0;

	/**
	 * Registers the script description, its command line options and the
	 * default batch size (500).
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( 'Populate content and slot tables' );
		$this->addOption( 'table', 'revision or archive table, or `all` to populate both', false,
			true );
		$this->addOption( 'reuse-content',
			'Reuse content table rows when the address and model are the same. '
			. 'This will increase the script\'s time and memory usage, perhaps significantly.',
			false, false );
		$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
		$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );
		$this->setBatchSize( 500 );
	}

	/**
	 * Fetches the database connection and the services the script works with,
	 * and acquires the id of the main slot role.
	 */
	private function initServices() {
		$services = MediaWikiServices::getInstance();

		$this->dbw = $this->getDB( DB_MASTER );
		$this->contentModelStore = $services->getContentModelStore();
		$this->blobStore = $services->getBlobStore();
		$this->mainRoleId = $services->getSlotRoleStore()->acquireId( SlotRecord::MAIN );
	}

	/**
	 * Entry point: populates the requested table(s) and reports the total.
	 *
	 * @return bool False if the configured migration stage does not allow writing
	 *  the new schema, true once all requested tables were processed.
	 */
	public function execute() {
		$multiContentRevisionSchemaMigrationStage =
			$this->getConfig()->get( 'MultiContentRevisionSchemaMigrationStage' );

		$t0 = microtime( true );

		if ( ( $multiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) === 0 ) {
			// Plain single-quoted string: "$" needs no escaping here, and a backslash
			// in front of it would be printed literally.
			$this->writeln(
				'...cannot update while $wgMultiContentRevisionSchemaMigrationStage '
				. 'does not have the SCHEMA_COMPAT_WRITE_NEW bit set.'
			);
			return false;
		}

		// Validate --table first, so that a typo is reported before any service is
		// set up and before the (potentially large) content map is loaded.
		$tables = $this->getTables();

		$this->initServices();

		if ( $this->getOption( 'reuse-content', false ) ) {
			$this->loadContentMap();
		}

		foreach ( $tables as $table ) {
			$this->populateTable( $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" );
		return true;
	}

	/**
	 * Translates the --table option (default `all`) into the list of tables to
	 * process. Any value other than `all`, `revision` or `archive` is reported
	 * through fatalError().
	 *
	 * @return string[]
	 */
	private function getTables() {
		$table = $this->getOption( 'table', 'all' );
		$validTableOptions = [ 'all', 'revision', 'archive' ];

		// Strict comparison: only the exact option strings are acceptable.
		if ( !in_array( $table, $validTableOptions, true ) ) {
			$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
		}

		if ( $table === 'all' ) {
			return [ 'revision', 'archive' ];
		}

		return [ $table ];
	}

	/**
	 * Preloads $this->contentRowMap with every existing row of the content table,
	 * reading from a replica connection in batches ordered by content_id.
	 */
	private function loadContentMap() {
		$t0 = microtime( true );
		$this->writeln( "Loading existing content table rows..." );
		$this->contentRowMap = [];
		$dbr = $this->getDB( DB_REPLICA );
		// Highest content_id seen so far; false until the first batch was read.
		$from = false;
		while ( true ) {
			$res = $dbr->select(
				'content',
				[ 'content_id', 'content_address', 'content_model' ],
				// The id is cast before it is put into the condition, like in
				// populateContentTablesForRowBatch().
				$from === false ? '' : 'content_id > ' . (int)$from,
				__METHOD__,
				[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
			);
			if ( !$res || !$res->numRows() ) {
				break;
			}
			foreach ( $res as $row ) {
				$from = $row->content_id;
				$this->contentRowMap["{$row->content_model}:{$row->content_address}"] = $row->content_id;
			}
		}
		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
	}

	/**
	 * Walks over the id range of the given table in batches and populates the
	 * content and slots tables for all rows that do not have a slots row yet.
	 *
	 * Does nothing when the table no longer has its text id field.
	 *
	 * @param string $table Either 'revision' or 'archive'
	 */
	private function populateTable( $table ) {
		$t0 = microtime( true );
		$this->count = 0;
		$this->writeln( "Populating $table..." );

		// Both tables are read through the same field aliases, so that the batch
		// code does not have to care which one a row came from.
		if ( $table === 'revision' ) {
			$idField = 'rev_id';
			$tables = [ 'revision', 'slots', 'page' ];
			$fields = [
				'rev_id',
				'len' => 'rev_len',
				'sha1' => 'rev_sha1',
				'text_id' => 'rev_text_id',
				'content_model' => 'rev_content_model',
				'namespace' => 'page_namespace',
				'title' => 'page_title',
			];
			$joins = [
				'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
				'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
			];
			$startOption = 'start-revision';
		} else {
			$idField = 'ar_rev_id';
			$tables = [ 'archive', 'slots' ];
			$fields = [
				'rev_id' => 'ar_rev_id',
				'len' => 'ar_len',
				'sha1' => 'ar_sha1',
				'text_id' => 'ar_text_id',
				'content_model' => 'ar_content_model',
				'namespace' => 'ar_namespace',
				'title' => 'ar_title',
			];
			$joins = [
				'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
			];
			$startOption = 'start-archive';
		}

		if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
			$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
			return;
		}

		$minmax = $this->dbw->selectRow(
			$table,
			[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
			'',
			__METHOD__
		);
		// Validate the result before touching it; only a usable range can have its
		// lower bound overridden from the command line.
		if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
			// No rows? Use an empty range, so the loop below is skipped.
			$minmax = (object)[ 'min' => 1, 'max' => 0 ];
		} elseif ( $this->hasOption( $startOption ) ) {
			$minmax->min = (int)$this->getOption( $startOption );
		}

		$batchSize = $this->getBatchSize();

		for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
			$endId = min( $startId + $batchSize - 1, $minmax->max );
			$rows = $this->dbw->select(
				$tables,
				$fields,
				[
					"$idField >= $startId",
					"$idField <= $endId",
					// Only rows the LEFT JOIN found no slots row for.
					'slot_revision_id IS NULL',
				],
				__METHOD__,
				[ 'ORDER BY' => 'rev_id' ],
				$joins
			);
			if ( $rows->numRows() !== 0 ) {
				$this->populateContentTablesForRowBatch( $rows, $startId, $table );
			}

			$elapsed = microtime( true ) - $t0;
			$this->writeln(
				"... $table processed up to revision id $endId of {$minmax->max}"
				. " ($this->count rows in $elapsed seconds)"
			);
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
	}

	/**
	 * Inserts the content and slots rows for one batch of revision/archive rows,
	 * inside a single transaction.
	 *
	 * If an exception is thrown, the transaction is rolled back and the failure is
	 * reported through fatalError().
	 *
	 * @param IResultWrapper $rows Rows selected by populateTable(), using its field aliases
	 * @param int $startId First id of the batch, only used for the error message
	 * @param string $table Table the rows come from, only used for the error message
	 */
	private function populateContentTablesForRowBatch( IResultWrapper $rows, $startId, $table ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );

		// Without --reuse-content the map only lives for this batch; with it, the
		// preloaded map is updated in place through the reference.
		if ( $this->contentRowMap === null ) {
			$map = [];
		} else {
			$map = &$this->contentRowMap;
		}
		// Map of revision id to the "{$modelId}:{$address}" key of its content row.
		$contentKeys = [];

		try {
			// Step 1: Figure out content rows needing insertion.
			$contentRows = [];
			foreach ( $rows as $row ) {
				$revisionId = $row->rev_id;

				Assert::invariant( $revisionId !== null, 'rev_id must not be null' );

				$model = $this->getContentModel( $row );
				$modelId = $this->contentModelStore->acquireId( $model );
				$address = SqlBlobStore::makeAddressFromTextId( $row->text_id );

				$key = "{$modelId}:{$address}";
				$contentKeys[$revisionId] = $key;

				if ( !isset( $map[$key] ) ) {
					$this->fillMissingFields( $row, $model, $address );

					// Placeholder: keeps the same key from being queued twice in this
					// batch. It is replaced by the real content_id in step 2.
					$map[$key] = false;
					$contentRows[] = [
						'content_size' => (int)$row->len,
						'content_sha1' => $row->sha1,
						'content_model' => $modelId,
						'content_address' => $address,
					];
				}
			}

			// Step 2: Insert them, then read them back in for use in the next step.
			if ( $contentRows ) {
				$id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
				$this->dbw->insert( 'content', $contentRows, __METHOD__ );
				$res = $this->dbw->select(
					'content',
					[ 'content_id', 'content_model', 'content_address' ],
					'content_id > ' . (int)$id,
					__METHOD__
				);
				foreach ( $res as $contentRow ) {
					$key = $contentRow->content_model . ':' . $contentRow->content_address;
					$map[$key] = $contentRow->content_id;
				}
			}

			// Step 3: Insert the slot rows.
			$slotRows = [];
			foreach ( $rows as $row ) {
				$revisionId = $row->rev_id;
				$contentId = $map[$contentKeys[$revisionId]] ?? false;
				if ( $contentId === false ) {
					throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
				}
				$slotRows[] = [
					'slot_revision_id' => $revisionId,
					'slot_role_id' => $this->mainRoleId,
					'slot_content_id' => $contentId,
					// There's no way to really know the previous revision, so assume no inheriting.
					// rev_parent_id can get changed on undeletions, and deletions can screw up
					// rev_timestamp ordering.
					'slot_origin' => $revisionId,
				];
			}
			$this->dbw->insert( 'slots', $slotRows, __METHOD__ );
			$this->count += count( $slotRows );
			$this->totalCount += count( $slotRows );
		} catch ( \Exception $e ) {
			$this->rollbackTransaction( $this->dbw, __METHOD__ );
			$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
				. "due to exception: " . $e->__toString() );
		}

		$this->commitTransaction( $this->dbw, __METHOD__ );
	}

	/**
	 * Returns the content model recorded in the row, or the default model for
	 * the row's title if the row does not record one.
	 *
	 * @param \stdClass $row Row using the field aliases of populateTable()
	 * @return string
	 */
	private function getContentModel( $row ) {
		if ( isset( $row->content_model ) ) {
			return $row->content_model;
		}

		$title = Title::makeTitle( $row->namespace, $row->title );

		return ContentHandler::getDefaultModelFor( $title );
	}

	/**
	 * Outputs a message followed by a newline.
	 *
	 * @param string $msg
	 */
	private function writeln( $msg ) {
		$this->output( "$msg\n" );
	}

	/**
	 * Compute any missing fields in $row.
	 * The way the missing values are computed must correspond to the way this is done in SlotRecord.
	 *
	 * The blob is only loaded if the length or the hash actually has to be computed.
	 *
	 * @param object $row to be modified
	 * @param string $model
	 * @param string $address
	 */
	private function fillMissingFields( $row, $model, $address ) {
		if ( !isset( $row->content_model ) ) {
			// just for completeness
			$row->content_model = $model;
		}

		$hasLength = isset( $row->len );
		$hasHash = isset( $row->sha1 ) && $row->sha1 !== '';

		if ( $hasLength && $hasHash ) {
			// No need to load the content right now.
			return;
		}

		$blob = $this->blobStore->getBlob( $address );

		if ( !$hasLength ) {
			// NOTE: The nominal size of the content may not be the length of the raw blob.
			$handler = ContentHandler::getForModelID( $model );
			$content = $handler->unserializeContent( $blob );

			$row->len = $content->getSize();
		}

		if ( !$hasHash ) {
			$row->sha1 = SlotRecord::base36Sha1( $blob );
		}
	}
}
392
// Name of the maintenance class declared in this file.
// NOTE(review): presumably read by the script that RUN_MAINTENANCE_IF_MAIN points to,
// which is expected to run the class when this file is the entry point - confirm
// against Maintenance.php.
$maintClass = PopulateContentTables::class;
require_once RUN_MAINTENANCE_IF_MAIN;