phpcs: More require/include is not a function
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
1 <?php
2 /**
3 * Adds blobs from a given external storage cluster to the blob_tracking table.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance
22 * @see wfWaitForSlaves()
23 */
24
25 require __DIR__ . '/../commandLine.inc';
26
27
// At least one cluster name is required; without one, print usage and abort.
if ( !count( $args ) ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every command line argument is treated as the name of a cluster to scan.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
38
/**
 * Rebuilds the blob_tracking table (and, when the gmp extension is available,
 * the blob_orphans table) for a set of external storage clusters.
 *
 * The passes run in this order: integrity check, (re)creation of the tracking
 * tables, a scan of revision/text rows, a scan of text rows that no revision
 * row accounted for, and finally a scan of each cluster's blobs table for rows
 * that neither of the previous passes saw.
 */
class TrackBlobs {
	/** @var array Names of the external storage clusters to scan */
	public $clusters;

	/** @var string|null Cached SQL condition built by getTextClause() */
	public $textClause;

	/** @var bool True when gmp is loaded, so the orphan blob pass can run */
	public $doBlobOrphans = false;

	/** @var array Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
	public $trackedBlobs = array();

	/** @var int Maximum number of rows selected (and inserted) per batch */
	public $batchSize = 1000;

	/** @var int Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param array $clusters Names of the clusters whose blobs should be tracked
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;

		if ( !extension_loaded( 'gmp' ) ) {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
			return;
		}

		// One empty bitfield per cluster; bits are set as blob IDs are tracked.
		$this->doBlobOrphans = true;
		foreach ( $clusters as $cluster ) {
			$this->trackedBlobs[$cluster] = gmp_init( 0 );
		}
	}

	/**
	 * Run every pass in order. The orphan blob pass is skipped without gmp.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) if the text or archive table contains
	 * rows that the messages below say would be damaged by continuing.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)
		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// The returned flags tell the two failure cases apart: an "external"
		// flag means an external storage pointer, anything else that matched
		// the query is a HistoryBlobStub object.
		if ( strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );

		// Test each table on its own so that a half-created state (only one of
		// the two tables present) neither makes DROP TABLE fail nor leaves a
		// stale table behind.
		// NOTE(review): assumes blob_tracking.sql creates both tables -- confirm.
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}

		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition matching old_text values that start with
	 * "DB://<cluster>/" for any of the configured clusters. The result is
	 * built once and cached in $this->textClause.
	 *
	 * @return string
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$alternatives = array();
			foreach ( $this->clusters as $cluster ) {
				$alternatives[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $alternatives );
		}
		return $this->textClause;
	}

	/**
	 * Split a "DB://<cluster>/<id>[/<hash>]" pointer into its parts.
	 *
	 * @param string $text Contents of a text row
	 * @return array|bool Array with the keys 'cluster', 'id' (integer) and
	 *   'hash' (null when the pointer has no hash part), or false if $text
	 *   does not have the expected format
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is absent from $m when the pointer has no hash part
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			// Next batch of revisions, in rev_id order, whose text row is an
			// external pointer into one of the clusters.
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				// Strict comparison: a loosely-equal name (e.g. "10" vs "1e1")
				// would have no entry in $this->trackedBlobs.
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		// NOTE(review): getMasterPos() is presumed to return false when the
		// server is not a replication master; there is nothing to wait for then.
		if ( $pos ) {
			$dbr->masterPosWait( $pos, 100000 );
		}

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN combined with "bt_text_id IS NULL" keeps only text
			// rows that have no blob_tracking row yet.
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				// Strict comparison, see trackRevisions()
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				// Page and revision are recorded as 0 for orphan text
				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				// A cluster that cannot be reached is reported and skipped
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the table name "blobs" when the connection does not
			// carry a 'blobs table' setting
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive in blob_id order, so the last one seen is
					// where the next batch starts
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Index of the next set bit at or after $id, -1 once exhausted
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as a full batch has accumulated
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}