Merge "Add note that IP::isInRange() can return unexpected results for invalid args"
[lhc/web/wiklou.git] / maintenance / storage / fixBug20757.php
1 <?php
2 /**
3 * Script to fix bug 20757.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance ExternalStorage
22 */
23
24 require_once __DIR__ . '/../Maintenance.php';
25
/**
 * Maintenance script to fix bug 20757.
 *
 * Scans the text table for rows that still hold a serialized HistoryBlobStub
 * (flagged "object" but not "external"), looks up the stub's secondary text
 * row in blob_tracking, verifies that the referenced external blob really
 * contains the text with the stub's hash, and then rewrites the text row to
 * point directly at that external blob. A matching blob_tracking row is
 * inserted for every fixed text row.
 *
 * @ingroup Maintenance ExternalStorage
 */
class FixBug20757 extends Maintenance {
	/** @var int Maximum number of text rows fetched per batch query */
	public $batchSize = 10000;

	/** @var array Cache of revision maps: page ID => [ rev_text_id => rev_id ] */
	public $mapCache = [];

	/** @var int Total number of entries over all maps held in $mapCache */
	public $mapCacheSize = 0;

	/** @var int Size above which old maps are evicted before a new one is loaded */
	public $maxMapCacheSize = 1000000;

	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Script to fix bug 20757 assuming that blob_tracking is intact' );
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Walk the text table in batches of $batchSize rows (ordered by old_id,
	 * starting after the "start" option) and repair every broken stub found.
	 *
	 * With --dry-run nothing is written; the script only reports what it
	 * would do. A summary of fixed / unrecoverable / good stubs is printed
	 * at the end.
	 */
	public function execute() {
		$dbr = $this->getDB( DB_SLAVE );
		$dbw = $this->getDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		// SQL expression yielding the lower-cased first 22 bytes of old_text,
		// which is exactly the length of the prefix o:15:"historyblobstub".
		if ( $dbr->getType() == 'mysql' ) {
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// Other backends: plain LOWER() without the MySQL-specific CONVERT().
			// Without this branch $lowerLeft would be undefined and the query
			// condition below would be invalid SQL.
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			$res = $dbr->select(
				'text',
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = [];
			$stubs = [];

			foreach ( $res as $row ) {
				// Remember how far we got so the next batch continues after this row
				$startId = $row->old_id;

				// Basic sanity checks.
				// NOTE(review): unserialize() runs on data read from the wiki's own
				// text table; it must never be pointed at untrusted input.
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags: rows without a UTF-8 flag use the legacy encoding
				$flags = explode( ',', $row->old_flags );
				$legacyEncoding = !( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) );

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = [
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				];
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row of this batch was rejected above; move on to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				[
					'bt_text_id' => $secondaryIds,
				],
				__METHOD__
			);
			$trackedBlobs = [];
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						[ 'old_flags', 'old_text' ],
						[ 'old_id' => $secondaryId ],
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$this->beginTransaction( $dbw, __METHOD__ );
					$dbw->update(
						'text',
						// SET
						[
							'old_flags' => $newFlags,
							'old_text' => $url
						],
						// WHERE
						[ 'old_id' => $primaryId ],
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						[
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						],
						__METHOD__
					);
					$this->commitTransaction( $dbw, __METHOD__ );
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Throttle helper: calls wfWaitForSlaves() on every 50th invocation.
	 *
	 * The previous condition, "++$iteration > 50 == 0", parsed as
	 * "( ++$iteration > 50 ) == 0" and was therefore true on every call
	 * (after incrementing the counter twice), which defeated the throttle.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves();
			$iteration = 0;
		}
	}

	/**
	 * Find the revision of a page that uses the given text row.
	 *
	 * @param int $pageId
	 * @param int $textId
	 * @return int|string|null The rev_id as returned by the database, or null
	 *   if no revision of the page references the text row
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );

		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the rev_text_id => rev_id map of a page, loading it from the
	 * revision table on first use and caching it in $mapCache.
	 *
	 * Before a new map is loaded, the oldest cached maps are dropped while
	 * the cache holds more than $maxMapCacheSize entries.
	 *
	 * @param int $pageId
	 * @return array
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size. reset() makes key() reliably point at the oldest
			// entry, and the emptiness check prevents an endless loop on a null key.
			while ( $this->mapCacheSize > $this->maxMapCacheSize && $this->mapCache ) {
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = $this->getDB( DB_SLAVE );
			$map = [];
			$res = $dbr->select( 'revision',
				[ 'rev_id', 'rev_text_id' ],
				[ 'rev_page' => $pageId ],
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}

		return $this->mapCache[$pageId];
	}

	/**
	 * This is based on part of HistoryBlobStub::getText().
	 * Determine if the text can be retrieved from the row in the normal way.
	 *
	 * @param array $stub Stub description; only the 'hash' key is read here
	 * @param stdClass $secondaryRow Text row with old_flags and old_text
	 * @return bool
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;

		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// A usable URL needs a non-empty part after "://"
			$parts = explode( '://', $url, 2 );
			if ( !isset( $parts[1] ) || $parts[1] === '' ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// The external blob could not be fetched, so there is nothing to decode
				return false;
			}
		}

		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$text = gzinflate( $text );
			if ( $text === false ) {
				return false;
			}
		}
		$obj = unserialize( $text );

		if ( !is_object( $obj ) ) {
			if ( !is_string( $obj ) ) {
				// Neither an object nor a doubly-serialized string
				return false;
			}
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
			if ( !is_object( $obj ) ) {
				return false;
			}
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );

		return $text !== false;
	}
}
349
// Register this script's class and hand control over to the maintenance
// bootstrap file named by RUN_MAINTENANCE_IF_MAIN.
$maintClass = 'FixBug20757';
require_once RUN_MAINTENANCE_IF_MAIN;