Add unit test for bug 32888
[lhc/web/wiklou.git] / maintenance / dumpTextPass.php
1 <?php
2 /**
3 * Script that postprocesses XML dumps from dumpBackup.php to add page text
4 *
5 * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Maintenance
25 */
26
// Record the working directory the script was started from.
$originalDir = getcwd();

require_once dirname( __FILE__ ) . '/commandLine.inc';
require_once 'backup.inc';
31
32 /**
33 * @ingroup Maintenance
34 */
35 class TextPassDumper extends BackupDumper {
36 var $prefetch = null;
37 var $input = "php://stdin";
38 var $history = WikiExporter::FULL;
39 var $fetchCount = 0;
40 var $prefetchCount = 0;
41 var $prefetchCountLast = 0;
42 var $fetchCountLast = 0;
43
44 var $failures = 0;
45 var $maxFailures = 5;
46 var $failedTextRetrievals = 0;
47 var $maxConsecutiveFailedTextRetrievals = 200;
48 var $failureTimeout = 5; // Seconds to sleep after db failure
49
50 var $php = "php";
51 var $spawn = false;
52 var $spawnProc = false;
53 var $spawnWrite = false;
54 var $spawnRead = false;
55 var $spawnErr = false;
56
57 var $xmlwriterobj = false;
58
59 // when we spend more than maxTimeAllowed seconds on this run, we continue
60 // processing until we write out the next complete page, then save output file(s),
61 // rename it/them and open new one(s)
62 var $maxTimeAllowed = 0; // 0 = no limit
63 var $timeExceeded = false;
64 var $firstPageWritten = false;
65 var $lastPageWritten = false;
66 var $checkpointJustWritten = false;
67 var $checkpointFiles = array();
68
69 /**
70 * @var DatabaseBase
71 */
72 protected $db;
73
/**
 * Initialise progress tracking and start the checkpoint timer.
 *
 * @param $history Integer: history mode (WikiExporter::FULL or
 *   WikiExporter::CURRENT) chosen through the --full / --current options;
 *   forwarded to the parent implementation.
 */
function initProgress( $history ) {
	// Forward the caller's choice; without the argument the parent can only
	// ever fall back to its own default.
	parent::initProgress( $history );
	// Checkpoint timing is measured from the moment progress tracking starts.
	$this->timeOfCheckpoint = $this->startTime;
}
78
/**
 * Run the text pass: read the stub dump from $this->input, fill in the
 * revision text and hand the result to the configured output sink(s).
 *
 * @param $history Not read by this method; $this->history (set through
 *   the --current and --full options) is what reaches initProgress().
 * @param $text Not read by this method.
 * @throws MWException if the input cannot be opened, if
 *   finalOptionCheck() rejects the option combination, or if readDump()
 *   reports an XML error.
 */
function dump( $history, $text = WikiExporter::TEXT ) {
	// This shouldn't happen if on console... ;)
	header( 'Content-type: text/html; charset=UTF-8' );

	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	$this->db = $this->backupDb();

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter();

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		// Fail loudly instead of feeding a non-stream to the XML parser and
		// silently producing an empty dump.
		throw new MWException( "Unable to open input file {$this->input} for reading." );
	}
	$result = $this->readDump( $input );
	fclose( $input );

	if ( WikiError::isError( $result ) ) {
		throw new MWException( $result->getMessage() );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
113
/**
 * Apply one command-line option understood by this dumper. Option names
 * that are not listed below are ignored.
 *
 * @param $opt String: option name without the leading dashes
 * @param $val String: option value; the compression type for the file
 *   options, the number of minutes for maxtime, the filename pattern for
 *   checkpointfile, an optional PHP binary for spawn
 * @param $param String: for the file options, the ';'-separated file list
 */
function processOption( $opt, $val, $param ) {
	global $IP;

	// Resolved for every option, although only 'prefetch' and 'stub' use it.
	$fileUri = $this->processFileOpt( $val, $param );

	if ( $opt == 'prefetch' ) {
		require_once "$IP/maintenance/backupPrefetch.inc";
		$this->prefetch = new BaseDump( $fileUri );
	} elseif ( $opt == 'stub' ) {
		$this->input = $fileUri;
	} elseif ( $opt == 'maxtime' ) {
		// Given in minutes, stored in seconds.
		$this->maxTimeAllowed = 60 * intval( $val );
	} elseif ( $opt == 'checkpointfile' ) {
		$this->checkpointFiles[] = $val;
	} elseif ( $opt == 'current' ) {
		$this->history = WikiExporter::CURRENT;
	} elseif ( $opt == 'full' ) {
		$this->history = WikiExporter::FULL;
	} elseif ( $opt == 'spawn' ) {
		$this->spawn = true;
		if ( $val ) {
			$this->php = $val;
		}
	}
}
146
/**
 * Turn a ';'-separated list of file names into a ';'-separated list of
 * stream URIs for the given compression type.
 *
 * @param $val String: "file", "gzip", "bzip2" or "7zip"; any other value
 *   leaves the names untouched, just like "file"
 * @param $param String: one or more file names separated by ';'
 * @return String
 */
function processFileOpt( $val, $param ) {
	// Stream wrapper prefix per compression type; plain files need none.
	$wrappers = array(
		'gzip' => 'compress.zlib://',
		'bzip2' => 'compress.bzip2://',
		'7zip' => 'mediawiki.compress.7z://',
	);
	$prefix = isset( $wrappers[$val] ) ? $wrappers[$val] : '';

	$converted = array();
	foreach ( explode( ';', $param ) as $name ) {
		$converted[] = $prefix . $name;
	}
	return implode( ';', $converted );
}
171
/**
 * Overridden to include prefetch ratio if enabled.
 *
 * Without a prefetch source the parent report is used as is. Otherwise one
 * progress line is emitted with overall and per-interval page/revision
 * rates, the share of text fetches served from the prefetch source, and an
 * ETA; the "last report" counters are then advanced.
 */
function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();
		return;
	}

	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	// Sample the clock once so both deltas and lastTime agree.
	$nowts = wfTime();
	$deltaAll = $nowts - $this->startTime;
	$deltaPart = $nowts - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;
	// Text fetches made since the previous report.
	$fetchCountPart = $this->fetchCount - $this->fetchCountLast;
	$prefetchCountPart = $this->prefetchCount - $this->prefetchCountLast;

	if ( $deltaAll ) {
		if ( $this->revCount && $this->maxCount ) {
			$portion = $this->revCount / $this->maxCount;
			$eta = $this->startTime + $deltaAll / $portion;
			$etats = wfTimestamp( TS_DB, intval( $eta ) );
		} else {
			// No revisions processed yet, or nothing to scale against:
			// an ETA cannot be computed without dividing by zero.
			$etats = '-';
		}
		if ( $this->fetchCount ) {
			$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
		} else {
			$fetchRate = '-';
		}
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	} else {
		$pageRate = '-';
		$revRate = '-';
		$etats = '-';
		$fetchRate = '-';
	}

	if ( $deltaPart ) {
		if ( $fetchCountPart ) {
			// Ratio for this interval only, matching the "curr" label.
			$fetchRatePart = 100.0 * $prefetchCountPart / $fetchCountPart;
		} else {
			$fetchRatePart = '-';
		}
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	} else {
		$fetchRatePart = '-';
		$pageRatePart = '-';
		$revRatePart = '-';
	}

	$this->progress( sprintf( "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), %d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% prefetched (all|curr), ETA %s [max %d]",
		$now, wfWikiID(), $this->ID, $this->pageCount, $pageRate, $pageRatePart, $this->revCount, $revRate, $revRatePart, $fetchRate, $fetchRatePart, $etats, $this->maxCount ) );

	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	$this->prefetchCountLast = $this->prefetchCount;
	$this->fetchCountLast = $this->fetchCount;
}
228
/**
 * Flag that the time allowed for the current output file is used up, so
 * that a checkpoint is written once the current page has been closed.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
232
/**
 * Tell whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded report time.
 *
 * @return bool Always false when no time limit is set (maxTimeAllowed is 0)
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}
	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;
	return $sinceCheckpoint > $this->maxTimeAllowed;
}
239
/**
 * Validate the checkpoint related options once all of them are known.
 *
 * Requires $this->egress to be set, because the number of output files is
 * compared against the number of checkpoint file patterns.
 *
 * @throws MWException if only one of checkpointfile/maxtime was given, if
 *   a pattern does not contain exactly two '%s', or if the number of
 *   patterns differs from the number of output files
 */
function finalOptionCheck() {
	$haveCheckpointFiles = (bool)$this->checkpointFiles;
	$haveMaxTime = (bool)$this->maxTimeAllowed;

	// Either both options are present or neither is.
	if ( $haveCheckpointFiles != $haveMaxTime ) {
		throw new MWException("Options checkpointfile and maxtime must be specified together.\n");
	}

	foreach ( $this->checkpointFiles as $checkpointFile ) {
		$count = substr_count( $checkpointFile, "%s" );
		if ( $count != 2 ) {
			throw new MWException("Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $count instead, file is $checkpointFile.\n");
		}
	}

	if ( !$haveCheckpointFiles ) {
		return;
	}

	$outputFiles = (array)$this->egress->getFilenames();
	if ( count( $outputFiles ) != count( $this->checkpointFiles ) ) {
		throw new MWException("One checkpointfile must be specified for each output option, if maxtime is used.\n");
	}
}
259
/**
 * Stream the stub dump through an XML parser whose handlers
 * (startElement, endElement, characterData) rebuild the document with the
 * revision text filled in.
 *
 * When a time limit is in force, whatever was written since the last
 * checkpoint is closed and renamed after the checkpoint file pattern once
 * the input is exhausted.
 *
 * @param $input resource: open stream to read the stub dump from
 * @return bool|WikiXmlError true on success, an error object when the XML
 *   cannot be parsed
 */
function readDump( $input ) {
	$this->buffer = "";
	$this->openElement = false;
	$this->atStart = true;
	$this->state = "";
	$this->lastName = "";
	$this->thisPage = 0;
	$this->thisRev = 0;

	$parser = xml_parser_create( "UTF-8" );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );

	// Objects are handles, so the callbacks need no reference to $this.
	xml_set_element_handler( $parser, array( $this, 'startElement' ), array( $this, 'endElement' ) );
	xml_set_character_data_handler( $parser, array( $this, 'characterData' ) );

	$offset = 0; // for context extraction on error reporting
	$bufferSize = 512 * 1024;
	do {
		if ( $this->checkIfTimeExceeded() ) {
			$this->setTimeExceeded();
		}
		$chunk = fread( $input, $bufferSize );
		if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
			wfDebug( __METHOD__ . " encountered XML parsing error\n" );
			return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
		}
		$offset += strlen( $chunk );
	} while ( $chunk !== false && !feof( $input ) );

	if ( $this->maxTimeAllowed ) {
		$filenameList = (array)$this->egress->getFilenames();
		// we wrote some stuff after last checkpoint that needs renamed
		if ( file_exists( $filenameList[0] ) ) {
			$newFilenames = array();
			# we might have just written the header and footer and had no
			# pages or revisions written... perhaps they were all deleted
			# there's no pageID 0 so we use that. the caller is responsible
			# for deciding what to do with a file containing only the
			# siteinfo information and the mw tags.
			if ( !$this->firstPageWritten ) {
				$firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
			} else {
				$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			}
			$fileCount = count( $filenameList );
			for ( $i = 0; $i < $fileCount; $i++ ) {
				// Checkpoint files go into the directory of the matching output file.
				$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$fileinfo = pathinfo( $filenameList[$i] );
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
			}
			$this->egress->closeAndRename( $newFilenames );
		}
	}
	xml_parser_free( $parser );

	return true;
}
318
/**
 * Get the text for the revision currently being processed, preferring the
 * prefetch source over the database when one is configured.
 *
 * A prefetched text is only accepted when its byte length equals the
 * rev_len recorded for the revision; anything else is reloaded through
 * doGetText().
 *
 * @param $id Text id taken from the stub's <text id="..."> attribute
 * @return string
 */
function getText( $id ) {
	$this->fetchCount++;
	if ( isset( $this->prefetch ) ) {
		$text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
		// NOTE(review): null presumably means the prefetch source has no
		// entry for this revision - confirm against BaseDump::prefetch().
		if ( $text !== null ) {
			$dbr = wfGetDB( DB_SLAVE );
			$revID = intval( $this->thisRev );
			$revLength = $dbr->selectField( 'revision', 'rev_len',
				array( 'rev_id' => $revID ), __METHOD__ );
			// if length of rev text in file doesn't match length in db, we reload
			// this avoids carrying forward broken data from previous xml dumps.
			// A failed lookup (false) must not pass as "length 0" for an
			// empty prefetched text.
			if ( $revLength !== false && strlen( $text ) == $revLength ) {
				$this->prefetchCount++;
				return $text;
			}
		}
	}
	return $this->doGetText( $id );
}
337
/**
 * Load a text record, retrying on failure.
 *
 * Each failed attempt is reported and followed by a pause of
 * failureTimeout seconds. Once more than maxFailures attempts have failed
 * the record is given up on and an empty string is returned instead.
 *
 * @param $id Text id
 * @return string The text, or "" when it could not be retrieved
 * @throws MWException when more than maxConsecutiveFailedTextRetrievals
 *   records in a row had to be given up on
 */
private function doGetText( $id ) {
	$id = intval( $id );
	$this->failures = 0;

	while ( true ) {
		if ( $this->spawn ) {
			if ( $this->failures ) {
				// we don't know why it failed, could be the child process
				// borked, could be db entry busted, could be db server out to lunch,
				// so cover all bases
				$this->closeSpawn();
				$this->openSpawn();
			}
			$text = $this->getTextSpawned( $id );
		} else {
			$text = $this->getTextDbSafe( $id );
		}

		if ( $text !== false ) {
			$this->failedTextRetrievals = 0;
			return $text;
		}

		$this->failures++;
		if ( $this->failures > $this->maxFailures ) {
			$this->progress( "Failed to retrieve revision text for text id ".
				"$id after $this->maxFailures tries, giving up" );
			// were there so many bad retrievals in a row we want to bail?
			// at some point we have to declare the dump irretrievably broken
			$this->failedTextRetrievals++;
			if ( $this->failedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
				// Built only when actually thrown, not once per fetched text.
				throw new MWException( "Graceful storage failure" );
			}
			// would be nice to return something better to the caller someday,
			// log what we know about the failure and about the revision
			return "";
		}

		$this->progress( "Error $this->failures " .
			"of allowed $this->maxFailures retrieving revision text for text id $id! " .
			"Pausing $this->failureTimeout seconds before retry..." );
		sleep( $this->failureTimeout );
	}
}
383
/**
 * Fetch a text record from the database, turning a query error into a
 * plain failure value.
 *
 * This makes a single attempt; retrying is up to the caller.
 *
 * @param $id Text id
 * @return bool|string The text, or false on failure
 */
private function getTextDbSafe( $id ) {
	try {
		return $this->getTextDb( $id );
	} catch ( DBQueryError $ex ) {
		return false;
	}
}
401
/**
 * Read one row of the text table and return its content with carriage
 * returns removed and the content language's normalization applied.
 *
 * May throw a database error if, say, the server dies during query.
 *
 * @param $id Text id (old_id)
 * @return bool|string false when no text could be extracted from the row
 */
private function getTextDb( $id ) {
	global $wgContLang;

	$textRow = $this->db->selectRow(
		'text',
		array( 'old_text', 'old_flags' ),
		array( 'old_id' => $id ),
		__METHOD__
	);

	$raw = Revision::getRevisionText( $textRow );
	if ( $raw === false ) {
		return false;
	}

	return $wgContLang->normalize( str_replace( "\r", "", $raw ) );
}
421
/**
 * Fetch a text record through the helper subprocess, starting the
 * subprocess first if it is not running yet. PHP warnings are suppressed
 * for the duration of the call.
 *
 * @param $id Text id
 * @return bool|string The text, or false on failure (including a
 *   subprocess that could not be started)
 */
private function getTextSpawned( $id ) {
	wfSuppressWarnings();
	if ( !$this->spawnProc ) {
		// First time?
		$this->openSpawn();
	}
	if ( $this->spawnProc ) {
		$text = $this->getTextSpawnedOnce( $id );
	} else {
		// Spawning failed, so there are no pipes to talk to.
		$text = false;
	}
	wfRestoreWarnings();
	return $text;
}
432
/**
 * Start the fetchText.php helper subprocess and keep the pipes to its
 * stdin and stdout. Its stderr is appended to /dev/null.
 *
 * fetchText.php is run through multiversion/MWScript.php when that file
 * exists next to the installation directory, directly otherwise.
 *
 * @return bool false when the subprocess could not be started
 */
function openSpawn() {
	global $IP;

	if ( file_exists( "$IP/../multiversion/MWScript.php" ) ) {
		$args = array(
			$this->php,
			"$IP/../multiversion/MWScript.php",
			"fetchText.php",
			'--wiki', wfWikiID(),
		);
	} else {
		$args = array(
			$this->php,
			"$IP/maintenance/fetchText.php",
			'--wiki', wfWikiID(),
		);
	}
	$cmd = implode( " ", array_map( 'wfEscapeShellArg', $args ) );

	$descriptors = array(
		0 => array( "pipe", "r" ),
		1 => array( "pipe", "w" ),
		2 => array( "file", "/dev/null", "a" ),
	);
	$pipes = array();

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $descriptors, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );
		return false;
	}

	$this->spawnWrite = $pipes[0]; // -> stdin
	$this->spawnRead = $pipes[1]; // <- stdout

	return true;
}
473
/**
 * Shut down the helper subprocess: close whichever pipes are open, then
 * release the process handle. All handles are reset to false, so calling
 * this when nothing is running is harmless. PHP warnings are suppressed
 * for the duration of the call.
 */
private function closeSpawn() {
	wfSuppressWarnings();
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	$this->spawnRead = false;
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	$this->spawnWrite = false;
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}
	$this->spawnErr = false;
	if ( $this->spawnProc ) {
		// The handle comes from proc_open(), so it has to be released with
		// proc_close(); pclose() only accepts popen() streams.
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	wfRestoreWarnings();
}
490
/**
 * Make one request to the helper subprocess.
 *
 * The text id is written on a line of its own. The reply is read as a line
 * holding the id being answered, a line holding the byte length, and then
 * that many bytes of text.
 *
 * @param $id Text id
 * @return bool|string The normalized text, or false on any write/read
 *   failure, id mismatch, negative length or short read
 */
private function getTextSpawnedOnce( $id ) {
	global $wgContLang;

	$ok = fwrite( $this->spawnWrite, "$id\n" );
	// $this->progress( ">> $id" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	// $this->progress( ">> [flush]" );
	if ( !$ok ) {
		return false;
	}

	// check that the text id they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newId = fgets( $this->spawnRead );
	if ( $newId === false ) {
		return false;
	}
	if ( $id != intval( $newId ) ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	// $this->progress( "<< " . trim( $len ) );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		if ( $buffer === false ) {
			break;
		}
		if ( $buffer === '' && feof( $this->spawnRead ) ) {
			// The pipe was closed before all bytes arrived; fread() keeps
			// returning "" from here on, so stop instead of spinning.
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = $wgContLang->normalize( $stripped );
	return $normalized;
}
540
/**
 * XML parser callback for an opening tag.
 *
 * The tag itself is only remembered in $this->openElement and emitted
 * later by clearOpenElement(). A <text> element carrying an id attribute
 * gets its attributes replaced by xml:space="preserve" and its content
 * loaded through getText().
 *
 * @param $parser resource: the XML parser
 * @param $name String: element name
 * @param $attribs Array: element attributes
 */
function startElement( $parser, $name, $attribs ) {
	$this->checkpointJustWritten = false;

	// Emit a start tag still pending from the previous element.
	$this->clearOpenElement( null );
	$this->lastName = $name;

	switch ( $name ) {
		case 'revision':
			$this->state = $name;
			// Everything buffered so far makes up the page header.
			$this->egress->writeOpenPage( null, $this->buffer );
			$this->buffer = "";
			break;
		case 'page':
			$this->state = $name;
			if ( $this->atStart ) {
				// First page: what was buffered up to here opens the stream.
				$this->egress->writeOpenStream( $this->buffer );
				$this->buffer = "";
				$this->atStart = false;
			}
			break;
	}

	if ( $name != "text" || !isset( $attribs['id'] ) ) {
		$this->openElement = array( $name, $attribs );
		return;
	}

	$revisionText = $this->getText( $attribs['id'] );
	$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
	if ( strlen( $revisionText ) > 0 ) {
		$this->characterData( $parser, $revisionText );
	}
}
570
/**
 * XML parser callback for a closing tag.
 *
 * Closing a revision, page or the mediawiki root flushes the buffer to the
 * output. Closing a page while the time limit has been exceeded also ends
 * the current output file(s), renames them after the checkpoint pattern
 * and starts new ones.
 *
 * @param $parser resource: the XML parser
 * @param $name String: element name
 */
function endElement( $parser, $name ) {
	$this->checkpointJustWritten = false;

	if ( $this->openElement ) {
		// Start tag still pending: the element had no content.
		$this->clearOpenElement( "" );
	} else {
		$this->buffer .= "</$name>";
	}

	switch ( $name ) {
		case 'revision':
			$this->egress->writeRevision( null, $this->buffer );
			$this->buffer = "";
			$this->thisRev = "";
			break;

		case 'page':
			$pageId = trim( $this->thisPage );
			if ( !$this->firstPageWritten ) {
				$this->firstPageWritten = $pageId;
			}
			$this->lastPageWritten = $pageId;

			if ( !$this->timeExceeded ) {
				$this->egress->writeClosePage( $this->buffer );
				$this->buffer = "";
				$this->thisPage = "";
				break;
			}

			$this->egress->writeClosePage( $this->buffer );
			// nasty hack, we can't just write the chardata after the
			// page tag, it will include leading blanks from the next line
			$this->egress->sink->write("\n");

			$this->buffer = $this->xmlwriterobj->closeStream();
			$this->egress->writeCloseStream( $this->buffer );

			$this->buffer = "";
			$this->thisPage = "";

			// this could be more than one file if we had more than one output arg
			$outputFiles = (array)$this->egress->getFilenames();
			$renamed = array();
			$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
			$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			$fileCount = count( $outputFiles );
			for ( $i = 0; $i < $fileCount; $i++ ) {
				$checkpointName = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$pathParts = pathinfo( $outputFiles[$i] );
				$renamed[] = $pathParts['dirname'] . '/' . $checkpointName;
			}
			$this->egress->closeRenameAndReopen( $renamed );

			// The new file starts with a fresh stream header; reset the
			// checkpoint bookkeeping.
			$this->buffer = $this->xmlwriterobj->openStream();
			$this->timeExceeded = false;
			$this->timeOfCheckpoint = $this->lastTime;
			$this->firstPageWritten = false;
			$this->checkpointJustWritten = true;
			break;

		case 'mediawiki':
			$this->egress->writeCloseStream( $this->buffer );
			$this->buffer = "";
			break;
	}
}
629
/**
 * XML parser callback for character data; also called by startElement()
 * to inject the loaded revision text.
 *
 * Data inside an <id> element is collected as the current page or revision
 * id, depending on which of the two was opened last. All data is appended
 * to the buffer with HTML special characters escaped.
 *
 * @param $parser resource: the XML parser
 * @param $data String: character data
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );

	if ( $this->lastName == "id" ) {
		switch ( $this->state ) {
			case "revision":
				$this->thisRev .= $data;
				break;
			case "page":
				$this->thisPage .= $data;
				break;
		}
	}

	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		if ( $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}

	$this->buffer .= htmlspecialchars( $data );
}
649
/**
 * Append the pending start tag, if there is one, to the buffer.
 *
 * @param $style Passed on as the third argument of Xml::element(); callers
 *   in this class use null for a start tag whose content follows and ""
 *   for an element that turned out to have no content
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}
	list( $tagName, $tagAttribs ) = $this->openElement;
	$this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
	$this->openElement = false;
}
656 }
657
658
// Entry point: run the text pass, or print the usage text for --help.
$dumper = new TextPassDumper( $argv );

if ( !isset( $options['help'] ) ) {
	$dumper->dump( true );
} else {
	$dumper->progress( <<<ENDS
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.

Usage: php dumpTextPass.php [<options>]
Options:
  --stub=<type>:<file> To load a compressed stub dump instead of stdin
  --prefetch=<type>:<file> Use a prior dump file as a text source, to save
              pressure on the database.
              (Requires the XMLReader extension)
  --maxtime=<minutes> Write out checkpoint file after this many minutes (writing
              out complete page, closing xml file properly, and opening new one
              with header).  This option requires the checkpointfile option.
  --checkpointfile=<filenamepattern> Use this string for checkpoint filenames,
              substituting the first pageid written for the first %s and the
              last pageid written for the second %s. The pattern must contain
              exactly two %s. This option requires the maxtime option.
  --quiet     Don't dump status reports to stderr.
  --report=n  Report position and speed after every n pages processed.
              (Default: 100)
  --server=h  Force reading from MySQL server h
  --current   Base ETA on number of pages in database instead of all revisions
  --full      Base ETA on all revisions in database (default)
  --spawn[=<php>] Spawn a subprocess for loading text records, optionally
              running it with the given PHP binary instead of "php"
  --help      Display this help message
ENDS
	);
}
693
694