Merge "Improve docs for Title::getInternalURL/getCanonicalURL"
[lhc/web/wiklou.git] / maintenance / populateContentTables.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 * @ingroup Maintenance
20 */
21
22 use MediaWiki\MediaWikiServices;
23 use MediaWiki\Revision\SlotRecord;
24 use MediaWiki\Storage\NameTableStore;
25 use MediaWiki\Storage\SqlBlobStore;
26 use Wikimedia\Assert\Assert;
27 use Wikimedia\Rdbms\IDatabase;
28 use Wikimedia\Rdbms\ResultWrapper;
29
30 require_once __DIR__ . '/Maintenance.php';
31
32 /**
33 * Populate the content and slot tables.
34 * @since 1.32
35 */
class PopulateContentTables extends Maintenance {

	/** @var IDatabase Master connection used for range scans and all writes; set by initServices() */
	private $dbw;

	/** @var NameTableStore Store used to turn content model names into numeric IDs */
	private $contentModelStore;

	/** @var int ID of the main slot role, acquired from the slot role store */
	private $mainRoleId;

	/**
	 * @var array|null Map "{$modelId}:{$address}" to content_id.
	 *  Stays null unless --reuse-content is given (see loadContentMap()).
	 */
	private $contentRowMap = null;

	/** @var int Number of slot rows inserted for the table currently being processed */
	private $count = 0;

	/** @var int Number of slot rows inserted across all processed tables */
	private $totalCount = 0;

	/**
	 * Declare the script's description, command line options and default batch size.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( 'Populate content and slot tables' );
		$this->addOption( 'table', 'revision or archive table, or `all` to populate both', false,
			true );
		$this->addOption( 'reuse-content',
			'Reuse content table rows when the address and model are the same. '
			. 'This will increase the script\'s time and memory usage, perhaps significantly.',
			false, false );
		$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
		$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );
		$this->setBatchSize( 500 );
	}

	/**
	 * Fetch the database connection and the name stores used by the rest of the script.
	 */
	private function initServices() {
		$services = MediaWikiServices::getInstance();

		$this->dbw = $this->getDB( DB_MASTER );
		$this->contentModelStore = $services->getContentModelStore();
		$this->mainRoleId = $services->getSlotRoleStore()->acquireId( SlotRecord::MAIN );
	}

	/**
	 * Script entry point: populate the selected tables and report the totals.
	 *
	 * @return bool False if the schema migration stage does not allow writing the new
	 *  schema, true once all selected tables have been processed.
	 */
	public function execute() {
		global $wgMultiContentRevisionSchemaMigrationStage;

		$t0 = microtime( true );

		if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) === 0 ) {
			// Single-quoted string: "$" needs no escaping here, and a backslash before it
			// would be printed literally.
			$this->writeln(
				'...cannot update while $wgMultiContentRevisionSchemaMigrationStage '
				. 'does not have the SCHEMA_COMPAT_WRITE_NEW bit set.'
			);
			return false;
		}

		$this->initServices();

		if ( $this->getOption( 'reuse-content', false ) ) {
			$this->loadContentMap();
		}

		foreach ( $this->getTables() as $table ) {
			$this->populateTable( $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" );
		return true;
	}

	/**
	 * Resolve the --table option (default `all`) into the list of tables to populate.
	 * Any value other than `all`, `revision` or `archive` is reported via fatalError().
	 *
	 * @return string[]
	 */
	private function getTables() {
		$table = $this->getOption( 'table', 'all' );
		$validTableOptions = [ 'all', 'revision', 'archive' ];

		// Strict comparison, consistent with the === check below.
		if ( !in_array( $table, $validTableOptions, true ) ) {
			$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
		}

		if ( $table === 'all' ) {
			return [ 'revision', 'archive' ];
		}

		return [ $table ];
	}

	/**
	 * Read the whole content table, in batches ordered by content_id, into
	 * $this->contentRowMap so existing content rows can be reused.
	 */
	private function loadContentMap() {
		$t0 = microtime( true );
		$this->writeln( "Loading existing content table rows..." );
		$this->contentRowMap = [];
		$dbr = $this->getDB( DB_REPLICA );
		// Highest content_id seen so far; false until the first batch has been read.
		$from = false;
		while ( true ) {
			$res = $dbr->select(
				'content',
				[ 'content_id', 'content_address', 'content_model' ],
				$from ? "content_id > $from" : '',
				__METHOD__,
				[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
			);
			if ( !$res || !$res->numRows() ) {
				break;
			}
			foreach ( $res as $row ) {
				// Cast, since the value is interpolated into the next batch's condition.
				$from = (int)$row->content_id;
				$this->contentRowMap["{$row->content_model}:{$row->content_address}"] = $row->content_id;
			}
		}
		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
	}

	/**
	 * Walk the ID range of the given table in batches and create content and slot rows
	 * for every row that has no slot row yet.
	 *
	 * @param string $table 'revision'; any other value is handled as the archive table
	 */
	private function populateTable( $table ) {
		$t0 = microtime( true );
		$this->count = 0;
		$this->writeln( "Populating $table..." );

		// Both variants select the same field aliases, so batch processing does not
		// need to know which table a row came from.
		if ( $table === 'revision' ) {
			$idField = 'rev_id';
			$tables = [ 'revision', 'slots', 'page' ];
			$fields = [
				'rev_id',
				'len' => 'rev_len',
				'sha1' => 'rev_sha1',
				'text_id' => 'rev_text_id',
				'content_model' => 'rev_content_model',
				'namespace' => 'page_namespace',
				'title' => 'page_title',
			];
			$joins = [
				'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
				'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
			];
			$startOption = 'start-revision';
		} else {
			$idField = 'ar_rev_id';
			$tables = [ 'archive', 'slots' ];
			$fields = [
				'rev_id' => 'ar_rev_id',
				'len' => 'ar_len',
				'sha1' => 'ar_sha1',
				'text_id' => 'ar_text_id',
				'content_model' => 'ar_content_model',
				'namespace' => 'ar_namespace',
				'title' => 'ar_title',
			];
			$joins = [
				'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
			];
			$startOption = 'start-archive';
		}

		if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
			$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
			return;
		}

		$minmax = $this->dbw->selectRow(
			$table,
			[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
			'',
			__METHOD__
		);
		// Validate the result before touching it: writing ->min first would operate on a
		// non-object if no row came back.
		if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
			// No rows? Use an empty range so the loop below does not run.
			$minmax = (object)[ 'min' => 1, 'max' => 0 ];
		} elseif ( $this->hasOption( $startOption ) ) {
			$minmax->min = (int)$this->getOption( $startOption );
		}

		$batchSize = $this->getBatchSize();

		for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
			$endId = min( $startId + $batchSize - 1, $minmax->max );
			$rows = $this->dbw->select(
				$tables,
				$fields,
				[
					"$idField >= $startId",
					"$idField <= $endId",
					// Only rows without a slot row (LEFT JOIN found no match).
					'slot_revision_id IS NULL',
				],
				__METHOD__,
				// rev_id is the real column for revision and the alias of ar_rev_id for archive.
				[ 'ORDER BY' => 'rev_id' ],
				$joins
			);
			if ( $rows->numRows() !== 0 ) {
				$this->populateContentTablesForRowBatch( $rows, $startId, $table );
			}

			$elapsed = microtime( true ) - $t0;
			$this->writeln(
				"... $table processed up to revision id $endId of {$minmax->max}"
				. " ($this->count rows in $elapsed seconds)"
			);
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
	}

	/**
	 * Insert the content and slot rows for one batch of revision/archive rows inside a
	 * single transaction. On any exception the transaction is rolled back and the
	 * failure is reported via fatalError().
	 *
	 * @param ResultWrapper $rows Rows carrying the aliases rev_id, len, sha1, text_id,
	 *  content_model, namespace and title
	 * @param int $startId First ID of the batch, used only in the error message
	 * @param string $table Table name, used only in the error message
	 * @return void
	 */
	private function populateContentTablesForRowBatch( ResultWrapper $rows, $startId, $table ) {
		$this->beginTransaction( $this->dbw, __METHOD__ );

		if ( $this->contentRowMap === null ) {
			// No reuse across batches: the map only lives for this call.
			$map = [];
		} else {
			// By reference, so content IDs found here remain available to later batches.
			$map = &$this->contentRowMap;
		}
		$contentKeys = [];

		try {
			// Step 1: Figure out content rows needing insertion.
			$contentRows = [];
			foreach ( $rows as $row ) {
				$revisionId = $row->rev_id;

				Assert::invariant( $revisionId !== null, 'rev_id must not be null' );

				$modelId = $this->contentModelStore->acquireId( $this->getContentModel( $row ) );
				$address = SqlBlobStore::makeAddressFromTextId( $row->text_id );

				$key = "{$modelId}:{$address}";
				$contentKeys[$revisionId] = $key;

				if ( !isset( $map[$key] ) ) {
					// Placeholder: marks the key as queued so a repeated key in this batch
					// is not inserted twice; replaced by the real content_id in step 2.
					$map[$key] = false;
					$contentRows[] = [
						'content_size' => (int)$row->len,
						'content_sha1' => $row->sha1,
						'content_model' => $modelId,
						'content_address' => $address,
					];
				}
			}

			// Step 2: Insert them, then read them back in for use in the next step.
			if ( $contentRows ) {
				// Remember the highest ID before inserting so only newer rows are read back.
				$id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
				$this->dbw->insert( 'content', $contentRows, __METHOD__ );
				$res = $this->dbw->select(
					'content',
					[ 'content_id', 'content_model', 'content_address' ],
					'content_id > ' . (int)$id,
					__METHOD__
				);
				foreach ( $res as $row ) {
					$key = $row->content_model . ':' . $row->content_address;
					$map[$key] = $row->content_id;
				}
			}

			// Step 3: Insert the slot rows.
			$slotRows = [];
			foreach ( $rows as $row ) {
				$revisionId = $row->rev_id;
				$contentId = $map[$contentKeys[$revisionId]] ?? false;
				if ( $contentId === false ) {
					throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
				}
				$slotRows[] = [
					'slot_revision_id' => $revisionId,
					'slot_role_id' => $this->mainRoleId,
					'slot_content_id' => $contentId,
					// There's no way to really know the previous revision, so assume no inheriting.
					// rev_parent_id can get changed on undeletions, and deletions can screw up
					// rev_timestamp ordering.
					'slot_origin' => $revisionId,
				];
			}
			$this->dbw->insert( 'slots', $slotRows, __METHOD__ );
			$this->count += count( $slotRows );
			$this->totalCount += count( $slotRows );
		} catch ( \Exception $e ) {
			$this->rollbackTransaction( $this->dbw, __METHOD__ );
			// NOTE(review): fatalError() is expected to terminate the script, so the commit
			// below should not be reached after a rollback - confirm against Maintenance.
			$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
				. "due to exception: " . $e->__toString() );
		}

		$this->commitTransaction( $this->dbw, __METHOD__ );
	}

	/**
	 * Determine the content model name for a row: the stored content_model if set,
	 * otherwise the default model for the title built from the row's namespace and title.
	 *
	 * @param \stdClass $row Row with content_model, namespace and title fields
	 * @return string
	 */
	private function getContentModel( $row ) {
		if ( isset( $row->content_model ) ) {
			return $row->content_model;
		}

		$title = Title::makeTitle( $row->namespace, $row->title );

		return ContentHandler::getDefaultModelFor( $title );
	}

	/**
	 * Output a message followed by a newline.
	 *
	 * @param string $msg
	 */
	private function writeln( $msg ) {
		$this->output( "$msg\n" );
	}
}
349
// Name the class implementing this script, then include the file referenced by
// RUN_MAINTENANCE_IF_MAIN (defined outside this file; presumably the script runner).
$maintClass = PopulateContentTables::class;
require_once RUN_MAINTENANCE_IF_MAIN;