Merge "New getHTML() method for QuickTemplate to get the HTML of a template."
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
1 <?php
2 /**
3 * Adds blobs from a given external storage cluster to the blob_tracking table.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance
22 * @see wfWaitForSlaves()
23 */
24
25 require __DIR__ . '/../commandLine.inc';
26
// Entry point. $args is not defined in this file; presumably it is populated
// by the commandLine.inc bootstrap required above -- confirm there.
// At least one external storage cluster name must be supplied.
if ( !count( $args ) ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Run every tracking pass over the requested clusters, then report completion.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
37
/**
 * Rebuilds the blob_tracking and blob_orphans tables for a set of external
 * storage clusters.
 *
 * The passes run in this order (see run()):
 *  - an integrity check of the text and archive tables,
 *  - re-creation of the tracking tables,
 *  - a scan of revision/text rows that point into the given clusters,
 *  - a scan of text rows that no revision refers to,
 *  - when the gmp extension is loaded, a scan of each cluster's blobs table
 *    for blobs that none of the previous passes recorded.
 */
class TrackBlobs {
	/** Names of the external storage clusters to scan, as given to the constructor */
	public $clusters;

	/** Cached SQL condition matching old_text pointers into $clusters; built by getTextClause() */
	public $textClause;

	/** True when the gmp extension is loaded and orphan blobs can be searched for; unset otherwise */
	public $doBlobOrphans;

	/** Map of cluster name => GMP bitfield in which bit N is set once blob N has been tracked */
	public $trackedBlobs = array();

	/** Maximum number of rows fetched per query and written per insert */
	public $batchSize = 1000;

	/** Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param array $clusters Names of the external storage clusters to process
	 */
	function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One empty bitfield per cluster; bits are set as blobs are tracked
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all passes in order. The orphan blob pass is skipped when the gmp
	 * extension is not available.
	 */
	function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Refuse to continue if the text or archive tables contain rows this
	 * script cannot handle safely. Prints a diagnostic and exits with status 1
	 * on failure.
	 */
	function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// selectField() yields a non-string (false) when no row matched, so only
		// inspect the flags when an offending row was actually returned.
		if ( is_string( $flags ) && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and re-create them from
	 * blob_tracking.sql.
	 *
	 * Each table is checked and dropped on its own, so a state where only one
	 * of the two exists (e.g. after an interrupted run) neither makes the DROP
	 * fail nor leaves a stale table behind.
	 * NOTE(review): assumes blob_tracking.sql creates both blob_tracking and
	 * blob_orphans -- confirm against that file.
	 */
	function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition selecting text rows whose old_text is a DB://
	 * pointer into one of the configured clusters. The condition is built on
	 * first use and cached in $this->textClause.
	 *
	 * @return string One LIKE condition per cluster, joined with OR
	 */
	function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$conds = array();
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $conds );
		}
		return $this->textClause;
	}

	/**
	 * Split an external storage pointer of the form
	 * DB://<cluster>/<id>[/<hex hash>] into its components.
	 *
	 * @param string $text Pointer text
	 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
	 *   'hash' (string, or null when the pointer has no hash part), or false
	 *   if $text is not a well-formed pointer
	 */
	function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is absent from $m when the pointer has no third part
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 * and record each one in blob_tracking.
	 */
	function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so they are not re-read
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 *
	 * Orphan rows are recorded in blob_tracking with bt_page and bt_rev_id set to 0.
	 */
	function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// Text rows with no matching blob_tracking row (bt_text_id IS NULL
			// after the LEFT JOIN) were not picked up by trackRevisions()
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so they are not re-read
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 *
	 * Each orphan found is recorded in blob_orphans. Clusters that cannot be
	 * connected to, or that have no blobs table, are reported and skipped.
	 */
	function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the table name 'blobs' when the connection carries no override
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for progress output
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows are ordered by blob_id, so the last one seen is the next cursor
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Index of the next set bit at or after $id, or -1 when there is none
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch is full, so no insert exceeds batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}