63327d5389a205322ced3733dd1925057bf7810a
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
1 <?php
2
3 require( dirname( __FILE__ ) .'/../commandLine.inc' );
4
5
// $args is not defined in this file; it is expected to come from the
// commandLine.inc include above. At least one cluster name is mandatory.
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n",
		"Adds blobs from a given ES cluster to the blob_tracking table\n",
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// A single tracker instance processes every cluster named on the command line.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
16
/**
 * Rebuilds the blob_tracking and blob_orphans tables for a list of external
 * storage clusters.
 *
 * A run consists of up to three passes (see run()):
 *  1. trackRevisions()  - text rows referenced from the revision table
 *  2. trackOrphanText() - text rows not recorded by pass 1
 *  3. findOrphanBlobs() - blob rows on the clusters that neither of the first
 *                         two passes saw a pointer to (needs the gmp extension)
 */
class TrackBlobs {
	/** Cluster names handed to the constructor */
	public $clusters;

	/** SQL condition built lazily by getTextClause(); null until first use */
	public $textClause;

	/** Set to true by the constructor when the gmp extension is loaded */
	public $doBlobOrphans;

	/** Cluster name => GMP number used as a bitfield, bit N set for blob ID N */
	public $trackedBlobs = array();

	/** Maximum number of rows fetched per SELECT */
	public $batchSize = 1000;

	/** Number of batches between two progress reports */
	public $reportingInterval = 10;

	/**
	 * @param $clusters Array of cluster names to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One empty bitfield per cluster; bits are set while tracking
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Recreate the tracking tables and run every pass in order.
	 * The orphan blob pass is skipped when gmp is not available.
	 */
	public function run() {
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Drop any existing tracking tables and source blob_tracking.sql again.
	 *
	 * Each table is checked on its own, so a previous run that left only one
	 * of the two tables behind does not make the DROP fail here.
	 * NOTE(review): blob_tracking.sql is presumed to create both tables -
	 * confirm against the SQL file.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition matching old_text values that start with
	 * DB://<cluster>/ for any of the configured clusters. The per-cluster
	 * LIKE terms are joined with OR and the result is cached in $textClause.
	 *
	 * @return String SQL fragment
	 */
	public function getTextClause() {
		if ( $this->textClause === null ) {
			$dbr = wfGetDB( DB_SLAVE );
			$likes = array();
			foreach ( $this->clusters as $cluster ) {
				$likes[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $likes );
		}
		return $this->textClause;
	}

	/**
	 * Split a DB://<cluster>/<id>[/<hash>] pointer into its parts.
	 *
	 * @param $text String: pointer text, as stored in old_text
	 * @return Array with keys 'cluster' (string), 'id' (integer) and 'hash'
	 *   (hexadecimal string, or null when the pointer has no third component),
	 *   or false if $text is not a well-formed pointer
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// preg_match() omits trailing groups that did not participate
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			// Page through the matching rows in rev_id order
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor before validating, so that a bad row
				// cannot be fetched again by the next batch
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN combined with "bt_text_id IS NULL" keeps only
			// text rows that have no blob_tracking row yet
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				// Page and revision are recorded as 0 for orphan text
				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the name 'blobs' when the connection has no
			// 'blobs table' LB info
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( $table === null ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive in blob_id order, so the last assignment is
					// the cursor for the next batch
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 means no more set bits
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch holds batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}