a1e157dc8e95a7011dd0605edc154fa7737b54da
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
1 <?php
2 /**
3 * Adds blobs from a given external storage cluster to the blob_tracking table.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance
22 * @see wfWaitForSlaves()
23 */
24
25 use MediaWiki\MediaWikiServices;
26 use Wikimedia\Rdbms\DBConnectionError;
27
28 require __DIR__ . '/../commandLine.inc';
29
// At least one external storage cluster name must be supplied on the command line.
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every command line argument is treated as a cluster name to track.
$blobTracker = new TrackBlobs( $args );
$blobTracker->run();
echo "All done.\n";
40
/**
 * Populates the blob_tracking table with every text row that points into the
 * given external storage clusters and, when the gmp extension is loaded, fills
 * blob_orphans with blob IDs that exist on a cluster but are not referenced by
 * any tracked text row.
 */
class TrackBlobs {
	/** @var string[] Names of the external storage clusters to track */
	public $clusters;

	/** @var string|null Cached SQL condition produced by getTextClause() */
	public $textClause;

	/** @var bool|null True when gmp is loaded; left unset (null) otherwise */
	public $doBlobOrphans;

	/** @var array Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows fetched per query */
	public $batchSize = 1000;

	/** @var int Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param string[] $clusters Names of the external storage clusters to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One bitfield per cluster; bits are set as blob IDs are tracked.
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run the integrity check, recreate the tracking tables, and then perform
	 * every tracking pass in order. The orphan blob pass only runs when the
	 * gmp extension was detected by the constructor.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) if the text or archive tables contain
	 * rows that this script could damage.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_REPLICA );

		// Scan for HistoryBlobStub objects in the text table (T22757)
		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (T24624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// $flags is not a string when no row matched; only inspect it when a
		// row was actually found.
		if ( is_string( $flags ) && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and source blob_tracking.sql again.
	 *
	 * Each table is checked and dropped on its own, so a partially initialised
	 * state (only one of the two tables present) neither makes a DROP fail nor
	 * leaves a stale table behind.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		// NOTE(review): blob_tracking.sql presumably creates both tables -- confirm.
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the SQL condition matching old_text values that
	 * start with "DB://<cluster>/" for any of the configured clusters.
	 *
	 * @return string SQL condition, individual cluster conditions joined with OR
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_REPLICA );
			$conds = [];
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $conds );
		}

		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form
	 * DB://<cluster>/<id> or DB://<cluster>/<id>/<hex hash>.
	 *
	 * @param string $text Pointer text
	 * @return array|bool False if $text is not a valid pointer, otherwise an
	 *  array with the keys 'cluster' (string), 'id' (int) and 'hash'
	 *  (string, or null when the pointer has no hash component)
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is absent from $m when the pointer has no hash part
			'hash' => isset( $m[3] ) ? $m[3] : null
		];
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', '', __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( [ 'revision', 'text' ],
				[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
				[
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				],
				__METHOD__,
				[
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				]
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below, so
				// the next batch always starts after the last row seen.
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// A batch may consist solely of skipped rows
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );
		$pos = $dbw->getMasterPos();
		// NOTE(review): getMasterPos() may return false when no replication
		// position is available; there is nothing to wait for in that case.
		if ( $pos ) {
			$dbr->masterPosWait( $pos, 100000 );
		}

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', '', __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN combined with "bt_text_id IS NULL" selects text
			// rows that have no blob_tracking row yet.
			$res = $dbr->select( [ 'text', 'blob_tracking' ],
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				],
				[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				// Orphan text has no page or revision, hence the zeroes
				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// A batch may consist solely of skipped rows
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_MASTER );
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = $lbFactory->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				// Inspect the exception message rather than a class-specific
				// property, so this does not depend on DBConnectionError internals.
				if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( $table === null ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for progress output
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', '', __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					[ 'blob_id' ],
					[ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
					__METHOD__,
					[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive ordered by blob_id, so the last one seen is
					// the cursor for the next batch.
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// gmp_scan1() returns the index of the next set bit at or after
				// $id, or -1 when there is none.
				$id = gmp_scan1( $orphans, $id );
				if ( $id < 0 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush once the batch holds exactly batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}