Merge "Add 3D filetype for STL files"
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
1 <?php
2 /**
3 * Adds blobs from a given external storage cluster to the blob_tracking table.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance
22 * @see wfWaitForSlaves()
23 */
24
25 use Wikimedia\Rdbms\DBConnectionError;
26
27 require __DIR__ . '/../commandLine.inc';
28
// At least one external storage cluster name must be given on the command line;
// otherwise print the usage text and stop with a non-zero exit status.
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Run every pass over the requested clusters, then report completion.
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
39
/**
 * Rebuilds the blob_tracking table (and, when the gmp extension is loaded, the
 * blob_orphans table) for a list of external storage clusters.
 *
 * The passes run in this order: an integrity check of the text and archive
 * tables, re-creation of the tracking tables, a scan of revisions, a scan of
 * text rows not referenced by any tracked revision, and finally a scan of the
 * blobs tables on each cluster for rows nothing points to.
 */
class TrackBlobs {
	/** @var string[] Names of the external storage clusters to scan */
	public $clusters;
	/** @var string|null Lazily built SQL condition matching old_text pointers into $clusters */
	public $textClause;
	/** @var bool|null True when the gmp extension is loaded, so orphan blobs can be searched for */
	public $doBlobOrphans;
	/** @var array Per-cluster GMP bitmaps; bit N is set once blob ID N has been tracked */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows fetched per SELECT */
	public $batchSize = 1000;
	/** @var int Number of batches processed between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param string[] $clusters Names of the external storage clusters to scan
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One empty bitmap per cluster; bits are set as blobs are tracked.
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all passes in order. The orphan blob pass is skipped when the gmp
	 * extension is not available.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) if the text or archive tables contain
	 * rows this script is not prepared to handle.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_REPLICA );

		// Scan for HistoryBlobStub objects in the text table (T22757)
		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (T24624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// $flags is false when no row matched; only inspect it when a row was found.
		if ( $flags !== false && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing blob_tracking / blob_orphans tables and re-create them
	 * by sourcing blob_tracking.sql from this directory.
	 *
	 * Each table is checked on its own so that a leftover of only one of the
	 * two tables is still cleaned up, and a missing one is not dropped.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table, __METHOD__ ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition that matches old_text values pointing into any of
	 * the configured clusters. The condition is built once and then reused.
	 *
	 * @return string One LIKE condition per cluster, joined with OR
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_REPLICA );
			$conds = [];
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $conds );
		}

		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form DB://cluster/id or
	 * DB://cluster/id/hash.
	 *
	 * @param string $text Pointer text, as stored in old_text
	 * @return array|bool False if $text is not a well-formed pointer, otherwise
	 *  an array with keys 'cluster' (string), 'id' (int) and 'hash'
	 *  (hexadecimal string, or null when the pointer has no hash part)
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// Group 3 is absent from $m when the pointer has no hash part
			'hash' => isset( $m[3] ) ? $m[3] : null
		];
	}

	/**
	 * Record a blob as tracked in the per-cluster bitmap, so that the orphan
	 * blob pass can later tell it apart from unreferenced blobs.
	 *
	 * @param array $info Pointer information as returned by interpretPointer()
	 */
	private function markTracked( array $info ) {
		if ( $this->doBlobOrphans ) {
			gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
		}
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 * and add one blob_tracking row for each of them.
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( [ 'revision', 'text' ],
				[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
				[
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				],
				__METHOD__,
				[
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				]
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				// Strict comparison: a loosely-equal name would have no bitmap entry
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				$this->markTracked( $info );
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 *
	 * Orphan rows are added to blob_tracking with bt_page and bt_rev_id set to 0.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );
		$pos = $dbw->getMasterPos();
		// NOTE(review): getMasterPos() presumably returns a falsy value when there is
		// no replication position to report; in that case there is nothing to wait for.
		if ( $pos ) {
			$dbr->masterPosWait( $pos, 100000 );
		}

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN with "bt_text_id IS NULL" keeps only text rows that
			// have no blob_tracking row yet.
			$res = $dbr->select( [ 'text', 'blob_tracking' ],
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				],
				[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				// Strict comparison: a loosely-equal name would have no bitmap entry
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				$this->markTracked( $info );
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 *
	 * Found orphans are written to the blob_orphans table. Clusters that cannot
	 * be connected to, or that have no blobs table, are reported and skipped.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				// getMessage() is used rather than a class-specific property, so this
				// check does not depend on the internals of DBConnectionError.
				if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the default table name when the cluster does not set one
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( $table === null ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for progress output
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					[ 'blob_id' ],
					[ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
					__METHOD__,
					[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows are ordered by blob_id, so the last one seen is the new cursor
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Index of the next set bit at or after $id; negative when none is left
				$id = gmp_scan1( $orphans, $id );
				if ( $id < 0 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush once the batch holds exactly batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			// Write whatever is left over from the last, partial batch
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}