Merge "Add support for 'hu-formal'"
[lhc/web/wiklou.git] / maintenance / storage / fixT22757.php
1 <?php
2 /**
3 * Script to fix T22757.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance ExternalStorage
22 */
23
24 require_once __DIR__ . '/../Maintenance.php';
25
/**
 * Maintenance script to fix T22757.
 *
 * Walks the text table in old_id order looking for rows that hold a serialized
 * HistoryBlobStub (flagged 'object' but not 'external'). For every stub whose
 * target text row is listed in blob_tracking, the stub row is rewritten to point
 * directly at the tracked external blob and a matching blob_tracking row is
 * inserted. Stubs that cannot be resolved are reported on standard output.
 *
 * @ingroup Maintenance ExternalStorage
 */
class FixT22757 extends Maintenance {
	/** @var int Maximum number of text rows fetched by each batch query */
	public $batchSize = 10000;

	/** @var array[] Cached maps, keyed by page ID, of rev_text_id => rev_id */
	public $mapCache = [];

	/** @var int Total number of rev_text_id entries currently held in $mapCache */
	public $mapCacheSize = 0;

	/** @var int Entry count above which the oldest maps are evicted from $mapCache */
	public $maxMapCacheSize = 1000000;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Script to fix T22757 assuming that blob_tracking is intact' );
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Scan the text table in batches, repair every recoverable stub and print a
	 * summary of fixed, unrecoverable and good stubs.
	 *
	 * With --dry-run nothing is written; the report is still produced.
	 */
	public function execute() {
		$dbr = $this->getDB( DB_REPLICA );
		$dbw = $this->getDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', '', __METHOD__ );

		// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
		$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of rows that look like serialized HistoryBlobStub objects
			$res = $dbr->select(
				'text',
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = [];
			$stubs = [];

			foreach ( $res as $row ) {
				// Advance the cursor even for rows that turn out to be bad, so that
				// the next batch starts after everything seen here
				$startId = $row->old_id;

				// Basic sanity checks
				// NOTE(review): unserialize() instantiates whatever class the stored
				// data names; the input here is the wiki's own text table.
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags: rows without a UTF-8 flag are treated as legacy encoded
				$flags = explode( ',', $row->old_flags );
				$legacyEncoding = !in_array( 'utf-8', $flags, true )
					&& !in_array( 'utf8', $flags, true );

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = [
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				];
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row in this batch was rejected; move on to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				[
					'bt_text_id' => $secondaryIds,
				],
				__METHOD__
			);
			$trackedBlobs = [];
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						[ 'old_flags', 'old_text' ],
						[ 'old_id' => $secondaryId ],
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$this->beginTransaction( $dbw, __METHOD__ );
					$dbw->update(
						'text',
						// SET
						[
							'old_flags' => $newFlags,
							'old_text' => $url
						],
						// WHERE
						[ 'old_id' => $primaryId ],
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						[
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						],
						__METHOD__
					);
					$this->commitTransaction( $dbw, __METHOD__ );
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Look up the revision of a page that uses the given text row.
	 *
	 * @param int $pageId Page to search
	 * @param int $textId old_id of the text row
	 * @return int|string|null The rev_id as returned by the database, or null if
	 *  no revision of the page references that text row
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );

		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the rev_text_id => rev_id map for all revisions of a page.
	 *
	 * Results are kept in $mapCache. Before a new map is loaded, the oldest cached
	 * maps are dropped until the total entry count is no longer above
	 * $maxMapCacheSize.
	 *
	 * @param int $pageId
	 * @return array Map of rev_text_id => rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size
			while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
				// Rewind explicitly so that the first (oldest inserted) entry is the
				// one evicted, instead of relying on wherever the array's internal
				// pointer was left by the previous unset()
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				if ( $key === null ) {
					// Nothing left to evict although the counter is still high (both
					// properties are public and may have been changed from outside);
					// resynchronise rather than loop forever
					$this->mapCacheSize = 0;
					break;
				}
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = $this->getDB( DB_REPLICA );
			$map = [];
			$res = $dbr->select( 'revision',
				[ 'rev_id', 'rev_text_id' ],
				[ 'rev_page' => $pageId ],
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}

		return $this->mapCache[$pageId];
	}

	/**
	 * This is based on part of HistoryBlobStub::getText().
	 * Determine if the text can be retrieved from the row in the normal way.
	 *
	 * Any failure along the way (malformed external URL, external fetch failure,
	 * failed decompression, data that does not unserialize to a usable blob
	 * object) is reported as "not unbroken" by returning false.
	 *
	 * @param array $stub Stub description; only the 'hash' key is read
	 * @param stdClass $secondaryRow Text row with old_flags and old_text
	 * @return bool
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;

		if ( in_array( 'external', $flags, true ) ) {
			$url = $text;
			// A usable URL has a non-empty part after the protocol separator
			$parts = explode( '://', $url, 2 );
			if ( !isset( $parts[1] ) || $parts[1] == "" ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// Nothing could be fetched, so there is nothing to unserialize
				return false;
			}
		}

		if ( !in_array( 'object', $flags, true ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags, true ) ) {
			$text = gzinflate( $text );
			if ( $text === false ) {
				return false;
			}
		}

		$obj = unserialize( $text );

		if ( is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		// Objects that cannot act as a blob container (for example another stub or
		// an incomplete class) cannot yield the text either
		if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );

		return $text !== false;
	}
}
337
// Global holding the name of the maintenance class defined in this file.
$maintClass = FixT22757::class;
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is a constant defined outside this file; it
// presumably names the bootstrap file that runs the class stored in $maintClass when
// this script is the entry point — confirm against Maintenance.php.
require_once RUN_MAINTENANCE_IF_MAIN;