add mwscript handling for call of fetchText.php maintenance script
[lhc/web/wiklou.git] / maintenance / dumpTextPass.php
1 <?php
2 /**
3 * Script that postprocesses XML dumps from dumpBackup.php to add page text
4 *
5 * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Maintenance
25 */
26
27 $originalDir = getcwd();
28
29 require_once( dirname( __FILE__ ) . '/commandLine.inc' );
30 require_once( 'backup.inc' );
31
32 /**
33 * @ingroup Maintenance
34 */
35 class TextPassDumper extends BackupDumper {
// Prior dump used as a text source (a BaseDump, set by the 'prefetch' option); null when unused
public $prefetch = null;
// URI the stub XML is read from; replaced by the 'stub' option
public $input = "php://stdin";
// WikiExporter::FULL or WikiExporter::CURRENT, selected by the 'full' / 'current' options
public $history = WikiExporter::FULL;
// Number of getText() calls so far
public $fetchCount = 0;
// Number of getText() calls answered from the prefetch source
public $prefetchCount = 0;
// Values of the two counters above as of the previous progress report
public $prefetchCountLast = 0;
public $fetchCountLast = 0;

// Failed attempts for the text id currently being fetched
public $failures = 0;
// Attempts allowed per text id before giving up on it
public $maxFailures = 5;
// Text ids given up on in a row; reset on the next successful fetch
public $failedTextRetrievals = 0;
// Once this many text ids in a row have been given up on, the dump is aborted
public $maxConsecutiveFailedTextRetrievals = 200;
public $failureTimeout = 5; // Seconds to sleep after db failure

// Interpreter used to launch the text-fetching subprocess
public $php = "php";
// Whether text is fetched through a subprocess ('spawn' option)
public $spawn = false;
// Subprocess handle and its pipes; false while no subprocess is open
public $spawnProc = false;
public $spawnWrite = false;
public $spawnRead = false;
public $spawnErr = false;

// XmlDumpWriter instance, used only to obtain stream open/close markup
public $xmlwriterobj = false;

// when we spend more than maxTimeAllowed seconds on this run, we continue
// processing until we write out the next complete page, then save output file(s),
// rename it/them and open new one(s)
public $maxTimeAllowed = 0; // 0 = no limit
// Set once the time limit has been passed; cleared after the checkpoint is written
public $timeExceeded = false;
// First and last page ids written since the last checkpoint (false = none yet)
public $firstPageWritten = false;
public $lastPageWritten = false;
// True right after a checkpoint, so the next character data can drop its leading newline
public $checkpointJustWritten = false;
// Checkpoint filename patterns, one per 'checkpointfile' option
public $checkpointFiles = array();
68
/**
 * Initialise progress tracking and start the checkpoint timer.
 *
 * @param $history Integer: WikiExporter::FULL or WikiExporter::CURRENT, as
 *   selected by the 'full' / 'current' options
 */
function initProgress( $history ) {
	// The argument used to be dropped here, so the caller's history mode
	// never reached the parent. NOTE(review): the parent implementation is
	// not visible from this file; if it takes no parameter, PHP simply
	// ignores the extra argument.
	parent::initProgress( $history );
	$this->timeOfCheckpoint = $this->startTime;
}
73
/**
 * Run the text pass: read the stub XML from $this->input, fill in the
 * revision text and send the result through the output sink.
 *
 * @param $history Unused here; the mode stored in $this->history is used instead
 * @param $text Unused here
 * @throws MWException if the input cannot be opened, the options are
 *   inconsistent, or the stub XML fails to parse
 */
function dump( $history, $text = WikiExporter::TEXT ) {
	// This shouldn't happen if on console... ;)
	header( 'Content-type: text/html; charset=UTF-8' );

	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	$this->db = $this->backupDb();

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter();

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		// Without this check a bad --stub path would be read as an empty
		// stream and the failure would go unreported.
		throw new MWException( "Unable to open input stream {$this->input}" );
	}
	$result = $this->readDump( $input );
	// The handle is not needed once the whole stub has been consumed
	fclose( $input );

	if ( WikiError::isError( $result ) ) {
		throw new MWException( $result->getMessage() );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
108
/**
 * Apply one command line option to this dumper.
 *
 * @param $opt String: option name
 * @param $val String: option value; for file options the compression type
 * @param $param String: for file options, the ';'-separated list of files
 */
function processOption( $opt, $val, $param ) {
	global $IP;
	$fileUri = $this->processFileOpt( $val, $param );

	if ( $opt == 'prefetch' ) {
		require_once "$IP/maintenance/backupPrefetch.inc";
		$this->prefetch = new BaseDump( $fileUri );
	} elseif ( $opt == 'stub' ) {
		$this->input = $fileUri;
	} elseif ( $opt == 'maxtime' ) {
		// given in minutes, stored in seconds
		$this->maxTimeAllowed = intval( $val ) * 60;
	} elseif ( $opt == 'checkpointfile' ) {
		$this->checkpointFiles[] = $val;
	} elseif ( $opt == 'current' ) {
		$this->history = WikiExporter::CURRENT;
	} elseif ( $opt == 'full' ) {
		$this->history = WikiExporter::FULL;
	} elseif ( $opt == 'spawn' ) {
		$this->spawn = true;
		if ( $val ) {
			// a value, when given, names the interpreter to spawn
			$this->php = $val;
		}
	}
}
141
/**
 * Turn a ';'-separated list of file names into a ';'-separated list of
 * stream URIs for the given compression type.
 *
 * @param $val String: "file", "gzip", "bzip2" or "7zip"; anything else
 *   leaves the names untouched
 * @param $param String: ';'-separated file names
 * @return String: ';'-separated URIs
 */
function processFileOpt( $val, $param ) {
	// The wrapper depends only on the type, so pick it once up front
	switch ( $val ) {
		case "file":
			$scheme = "";
			break;
		case "gzip":
			$scheme = "compress.zlib://";
			break;
		case "bzip2":
			$scheme = "compress.bzip2://";
			break;
		case "7zip":
			$scheme = "mediawiki.compress.7z://";
			break;
		default:
			$scheme = "";
	}

	$converted = array();
	foreach ( explode( ';', $param ) as $name ) {
		$converted[] = $scheme . $name;
	}
	return implode( ';', $converted );
}
166
/**
 * Overridden to include prefetch ratio if enabled.
 *
 * Without a prefetch source the parent report is used unchanged. Otherwise,
 * when reporting is on, one progress line is emitted and the "last report"
 * counters are updated.
 */
function showReport() {
	if ( !$this->prefetch ) {
		return parent::showReport();
	}

	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	// Sample the clock once so both deltas describe the same instant
	$nowts = wfTime();
	$deltaAll = $nowts - $this->startTime;
	$deltaPart = $nowts - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;

	// '-' marks a figure that cannot be computed yet
	$pageRate = '-';
	$revRate = '-';
	$etats = '-';
	$fetchRate = '-';
	if ( $deltaAll ) {
		// Fraction of the expected work done so far. It is zero before the
		// first revision or when maxCount is zero; no ETA can be derived
		// then, and dividing by it would be a division by zero.
		$portion = $this->maxCount ? $this->revCount / $this->maxCount : 0;
		if ( $portion ) {
			$eta = $this->startTime + $deltaAll / $portion;
			$etats = wfTimestamp( TS_DB, intval( $eta ) );
		}
		if ( $this->fetchCount ) {
			$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
		}
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	}

	$fetchRatePart = '-';
	$pageRatePart = '-';
	$revRatePart = '-';
	if ( $deltaPart ) {
		if ( $this->fetchCountLast ) {
			$fetchRatePart = 100.0 * $this->prefetchCountLast / $this->fetchCountLast;
		}
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	}

	$this->progress( sprintf( "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), %d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% prefetched (all|curr), ETA %s [max %d]",
		$now, wfWikiID(), $this->ID, $this->pageCount, $pageRate, $pageRatePart, $this->revCount, $revRate, $revRatePart, $fetchRate, $fetchRatePart, $etats, $this->maxCount ) );

	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	$this->prefetchCountLast = $this->prefetchCount;
	$this->fetchCountLast = $this->fetchCount;
}
224
/**
 * Flag that the time limit has passed, so that a checkpoint is written
 * once the current page has been closed.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
228
/**
 * Tell whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last progress report.
 *
 * @return Boolean: always false when no time limit is set
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}
	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;
	return $sinceCheckpoint > $this->maxTimeAllowed;
}
235
/**
 * Validate the checkpoint related options once the output sink exists.
 *
 * @throws MWException if only one of maxtime / checkpointfile was given,
 *   if a pattern does not contain exactly two '%s', or if the number of
 *   patterns differs from the number of output files
 */
function finalOptionCheck() {
	$hasPatterns = (bool)$this->checkpointFiles;
	$hasTimeLimit = (bool)$this->maxTimeAllowed;

	// each of the two options is useless without the other
	if ( $hasPatterns != $hasTimeLimit ) {
		throw new MWException("Options checkpointfile and maxtime must be specified together.\n");
	}

	foreach ( $this->checkpointFiles as $pattern ) {
		$placeholders = substr_count( $pattern, "%s" );
		if ( $placeholders != 2 ) {
			throw new MWException("Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $placeholders instead, file is $pattern.\n");
		}
	}

	if ( $hasPatterns ) {
		$outputFiles = (array)$this->egress->getFilenames();
		if ( count( $outputFiles ) != count( $this->checkpointFiles ) ) {
			throw new MWException("One checkpointfile must be specified for each output option, if maxtime is used.\n");
		}
	}
}
255
/**
 * Parse the stub XML from an open stream; the element and character data
 * handlers of this class produce the output as parsing proceeds. When a
 * time limit is in force, whatever was written since the last checkpoint
 * is renamed to its checkpoint filename at the end.
 *
 * @param $input resource: stream to read the stub dump from
 * @return true on success, WikiXmlError on a parse failure
 */
function readDump( $input ) {
	// start from a clean parsing state
	$this->buffer = "";
	$this->openElement = false;
	$this->atStart = true;
	$this->state = "";
	$this->lastName = "";
	$this->thisPage = 0;
	$this->thisRev = 0;

	$parser = xml_parser_create( "UTF-8" );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
	xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) );
	xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );

	$offset = 0; // for context extraction on error reporting
	$bufferSize = 512 * 1024;
	while ( true ) {
		if ( $this->checkIfTimeExceeded() ) {
			$this->setTimeExceeded();
		}
		$chunk = fread( $input, $bufferSize );
		if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
			wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
			return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
		}
		$offset += strlen( $chunk );
		if ( $chunk === false || feof( $input ) ) {
			break;
		}
	}

	if ( $this->maxTimeAllowed ) {
		$filenameList = (array)$this->egress->getFilenames();
		// we wrote some stuff after last checkpoint that needs renamed
		if ( file_exists( $filenameList[0] ) ) {
			# we might have just written the header and footer and had no
			# pages or revisions written... perhaps they were all deleted
			# there's no pageID 0 so we use that. the caller is responsible
			# for deciding what to do with a file containing only the
			# siteinfo information and the mw tags.
			$wroteAPage = (bool)$this->firstPageWritten;
			$firstPageID = str_pad( $wroteAPage ? $this->firstPageWritten : 0, 9, "0", STR_PAD_LEFT );
			$lastPageID = str_pad( $wroteAPage ? $this->lastPageWritten : 0, 9, "0", STR_PAD_LEFT );

			$newFilenames = array();
			$fileCount = count( $filenameList );
			for ( $i = 0; $i < $fileCount; $i++ ) {
				$fileinfo = pathinfo( $filenameList[$i] );
				$checkpointName = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointName;
			}
			$this->egress->closeAndRename( $newFilenames );
		}
	}
	xml_parser_free( $parser );

	return true;
}
314
/**
 * Get the text for a text id, preferring the prefetch source when it holds
 * an entry for the current page and revision whose length matches the
 * rev_len recorded in the database.
 *
 * @param $id text id to load if prefetch cannot supply the text
 * @return String
 */
function getText( $id ) {
	$this->fetchCount++;

	if ( !isset( $this->prefetch ) ) {
		return $this->doGetText( $id );
	}

	$prefetched = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
	if ( $prefetched === null ) {
		// nothing usable in the prefetch dump for this revision
		return $this->doGetText( $id );
	}

	$dbr = wfGetDB( DB_SLAVE );
	$revID = intval( $this->thisRev );
	$expectedLength = $dbr->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
	// if length of rev text in file doesn't match length in db, we reload
	// this avoids carrying forward broken data from previous xml dumps
	if ( strlen( $prefetched ) != $expectedLength ) {
		return $this->doGetText( $id );
	}

	$this->prefetchCount++;
	return $prefetched;
}
333
/**
 * Load a text record, retrying up to maxFailures times with a pause in
 * between. A record that still cannot be loaded yields an empty string,
 * unless too many records in a row have been given up on.
 *
 * @param $id text id
 * @return String: the text, or "" after giving up on this id
 * @throws MWException once more than maxConsecutiveFailedTextRetrievals
 *   ids in a row could not be loaded
 */
private function doGetText( $id ) {
	$id = intval( $id );
	$this->failures = 0;
	$ex = new MWException( "Graceful storage failure" );

	for ( ;; ) {
		if ( !$this->spawn ) {
			$text = $this->getTextDbSafe( $id );
		} else {
			if ( $this->failures ) {
				// we don't know why it failed, could be the child process
				// borked, could be db entry busted, could be db server out to lunch,
				// so cover all bases
				$this->closeSpawn();
				$this->openSpawn();
			}
			$text = $this->getTextSpawned( $id );
		}

		if ( $text !== false ) {
			$this->failedTextRetrievals= 0;
			return $text;
		}

		$this->failures++;
		if ( $this->failures <= $this->maxFailures ) {
			$this->progress( "Error {$this->failures} of allowed {$this->maxFailures} retrieving revision text for text id $id! Pausing {$this->failureTimeout} seconds before retry..." );
			sleep( $this->failureTimeout );
			continue;
		}

		$this->progress( "Failed to retrieve revision text for text id $id after {$this->maxFailures} tries, giving up" );
		// were there so many bad retrievals in a row we want to bail?
		// at some point we have to declare the dump irretrievably broken
		$this->failedTextRetrievals++;
		if ( $this->failedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
			throw $ex;
		}
		// would be nice to return something better to the caller someday,
		// log what we know about the failure and about the revision
		return "";
	}
}
381
/**
 * Fetch a text revision from the database, turning a query error into a
 * plain failure result. A single attempt is made here; retrying is left
 * to the caller.
 *
 * @param $id text id
 * @return String or false on failure
 */
private function getTextDbSafe( $id ) {
	try {
		return $this->getTextDb( $id );
	} catch ( DBQueryError $ex ) {
		return false;
	}
}
397
/**
 * Load one row of the text table and return its text with carriage
 * returns removed and content language normalization applied.
 * May throw a database error if, say, the server dies during query.
 *
 * @param $id text id (old_id)
 * @return String or false if the text could not be extracted from the row
 */
private function getTextDb( $id ) {
	global $wgContLang;

	$row = $this->db->selectRow(
		'text',
		array( 'old_text', 'old_flags' ),
		array( 'old_id' => $id ),
		__METHOD__
	);
	$text = Revision::getRevisionText( $row );
	if ( $text === false ) {
		return false;
	}
	return $wgContLang->normalize( str_replace( "\r", "", $text ) );
}
415
/**
 * Fetch a text record through the subprocess, starting it first if it is
 * not running yet. Warnings are suppressed for the duration of the call.
 *
 * @param $id text id
 * @return String or false on failure, including failure to start the
 *   subprocess
 */
private function getTextSpawned( $id ) {
	wfSuppressWarnings();
	if ( !$this->spawnProc && !$this->openSpawn() ) {
		// No subprocess means no pipes; report the failure instead of
		// attempting to write to an invalid handle.
		wfRestoreWarnings();
		return false;
	}
	$text = $this->getTextSpawnedOnce( $id );
	wfRestoreWarnings();
	return $text;
}
426
/**
 * Start the fetchText.php subprocess and keep its stdin/stdout pipes.
 * The script is launched through multiversion/MWScript.php when that file
 * exists next to the installation, and directly otherwise.
 *
 * @return Boolean: false if the subprocess could not be started
 */
function openSpawn() {
	global $IP;

	$mwScript = "$IP/../multiversion/MWScript.php";
	if ( file_exists( $mwScript ) ) {
		$args = array( $this->php, $mwScript, "fetchText.php" );
	} else {
		$args = array( $this->php, "$IP/maintenance/fetchText.php" );
	}
	$args[] = '--wiki';
	$args[] = wfWikiID();
	$cmd = implode( " ", array_map( 'wfEscapeShellArg', $args ) );

	// child reads ids on stdin, answers on stdout, stderr is discarded
	$spec = array(
		0 => array( "pipe", "r" ),
		1 => array( "pipe", "w" ),
		2 => array( "file", "/dev/null", "a" ) );
	$pipes = array();

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $spec, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );
		return false;
	}

	$this->spawnWrite = $pipes[0]; // -> stdin
	$this->spawnRead = $pipes[1]; // <- stdout

	return true;
}
467
/**
 * Close the pipes to the subprocess and then the subprocess itself,
 * resetting all handles to false. Warnings are suppressed meanwhile.
 */
private function closeSpawn() {
	wfSuppressWarnings();
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	$this->spawnRead = false;
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	$this->spawnWrite = false;
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}
	$this->spawnErr = false;
	if ( $this->spawnProc ) {
		// The handle comes from proc_open(), so it must be released with
		// proc_close(); pclose() only accepts handles from popen().
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	wfRestoreWarnings();
}
484
/**
 * Ask the subprocess for one text record and read its reply.
 *
 * The id is written on a line of its own. The reply is read as a line
 * holding the id, a line holding the byte count, and then that many bytes
 * of text.
 *
 * @param $id text id
 * @return String: text with carriage returns removed and content language
 *   normalization applied, or false on any failure
 */
private function getTextSpawnedOnce( $id ) {
	global $wgContLang;

	$ok = fwrite( $this->spawnWrite, "$id\n" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	if ( !$ok ) {
		return false;
	}

	// check that the text id they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newId = fgets( $this->spawnRead );
	if ( $newId === false ) {
		return false;
	}
	if ( $id != intval( $newId ) ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		if ( $buffer === false ) {
			break;
		}
		if ( $buffer === '' && feof( $this->spawnRead ) ) {
			// The subprocess closed its end early. fread() keeps returning
			// an empty string at end of stream, so without this check the
			// loop would never finish.
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = $wgContLang->normalize( $stripped );
	return $normalized;
}
534
/**
 * XML parser callback for an opening tag. Flushes buffered markup to the
 * sink at page and revision boundaries, and replaces a <text id="..."/>
 * stub with the actual revision text.
 *
 * @param $parser resource: the XML parser
 * @param $name String: element name
 * @param $attribs Array: element attributes
 */
function startElement( $parser, $name, $attribs ) {
	$this->checkpointJustWritten = false;

	$this->clearOpenElement( null );
	$this->lastName = $name;

	switch ( $name ) {
		case 'revision':
			$this->state = $name;
			$this->egress->writeOpenPage( null, $this->buffer );
			$this->buffer = "";
			break;
		case 'page':
			$this->state = $name;
			if ( $this->atStart ) {
				// everything buffered before the first page is the stream header
				$this->egress->writeOpenStream( $this->buffer );
				$this->buffer = "";
				$this->atStart = false;
			}
			break;
	}

	$isTextStub = ( $name == "text" && isset( $attribs['id'] ) );
	if ( !$isTextStub ) {
		$this->openElement = array( $name, $attribs );
		return;
	}

	$text = $this->getText( $attribs['id'] );
	// the stub's attributes are dropped in favour of xml:space
	$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
	if ( strlen( $text ) > 0 ) {
		$this->characterData( $parser, $text );
	}
}
564
/**
 * XML parser callback for a closing tag. Writes finished revisions and
 * pages to the sink; when the time limit has been passed, the end of a
 * page also closes the current output, renames it to its checkpoint
 * filename and starts a new one.
 *
 * @param $parser resource: the XML parser
 * @param $name String: element name
 */
function endElement( $parser, $name ) {
	$this->checkpointJustWritten = false;

	if ( $this->openElement ) {
		$this->clearOpenElement( "" );
	} else {
		$this->buffer .= "</$name>";
	}

	switch ( $name ) {
		case 'revision':
			$this->egress->writeRevision( null, $this->buffer );
			$this->buffer = "";
			$this->thisRev = "";
			break;

		case 'page':
			if ( !$this->firstPageWritten ) {
				$this->firstPageWritten = trim( $this->thisPage );
			}
			$this->lastPageWritten = trim( $this->thisPage );

			$this->egress->writeClosePage( $this->buffer );
			if ( !$this->timeExceeded ) {
				$this->buffer = "";
				$this->thisPage = "";
				break;
			}

			// nasty hack, we can't just write the chardata after the
			// page tag, it will include leading blanks from the next line
			$this->egress->sink->write("\n");

			$this->buffer = $this->xmlwriterobj->closeStream();
			$this->egress->writeCloseStream( $this->buffer );

			$this->buffer = "";
			$this->thisPage = "";

			// this could be more than one file if we had more than one output arg
			$filenameList = (array)$this->egress->getFilenames();
			$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
			$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			$newFilenames = array();
			$fileCount = count( $filenameList );
			for ( $i = 0; $i < $fileCount; $i++ ) {
				$checkpointName = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$fileinfo = pathinfo( $filenameList[$i] );
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointName;
			}
			$this->egress->closeRenameAndReopen( $newFilenames );

			// the new file starts with a fresh stream header
			$this->buffer = $this->xmlwriterobj->openStream();
			$this->timeExceeded = false;
			$this->timeOfCheckpoint = $this->lastTime;
			$this->firstPageWritten = false;
			$this->checkpointJustWritten = true;
			break;

		case 'mediawiki':
			$this->egress->writeCloseStream( $this->buffer );
			$this->buffer = "";
			break;
	}
}
623
/**
 * XML parser callback for character data. Collects the text of <id>
 * elements as the current page or revision id and appends the escaped
 * data to the output buffer.
 *
 * @param $parser resource: the XML parser
 * @param $data String: character data
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );

	if ( $this->lastName == "id" ) {
		switch ( $this->state ) {
			case "revision":
				$this->thisRev .= $data;
				break;
			case "page":
				$this->thisPage .= $data;
				break;
		}
	}

	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		$this->checkpointJustWritten = false;
		if ( $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
	}

	$this->buffer .= htmlspecialchars( $data );
}
643
/**
 * Append the pending opening tag, if there is one, to the output buffer.
 *
 * @param $style passed to Xml::element() as its contents argument: null by
 *   the start-tag and character data handlers, "" by the end-tag handler
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}
	list( $tag, $attribs ) = $this->openElement;
	$this->buffer .= Xml::element( $tag, $attribs, $style );
	$this->openElement = false;
}
650 }
651
652
// Entry point: run the text pass, or print usage when --help is given.
// The help text now matches the option handling in TextPassDumper:
// checkpoint patterns must contain exactly two %s, and --full and the
// optional --spawn value are listed.
$dumper = new TextPassDumper( $argv );

if ( !isset( $options['help'] ) ) {
	$dumper->dump( true );
} else {
	$dumper->progress( <<<ENDS
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.

Usage: php dumpTextPass.php [<options>]
Options:
  --stub=<type>:<file> To load a compressed stub dump instead of stdin
  --prefetch=<type>:<file> Use a prior dump file as a text source, to save
		      pressure on the database.
		      (Requires the XMLReader extension)
  --maxtime=<minutes> Write out checkpoint file after this many minutes (writing
		      out complete page, closing xml file properly, and opening new one
		      with header).  This option requires the checkpointfile option.
  --checkpointfile=<filenamepattern> Use this string for checkpoint filenames,
		      substituting first pageid written for the first %s and the
		      last pageid written for the second %s (both are required).
		      This option requires the maxtime option.
  --quiet     Don't dump status reports to stderr.
  --report=n  Report position and speed after every n pages processed.
		      (Default: 100)
  --server=h  Force reading from MySQL server h
  --current   Base ETA on number of pages in database instead of all revisions
  --full      Base ETA on all revisions in database (default)
  --spawn[=<php>] Spawn a subprocess for loading text records, optionally
		      using the given php binary
  --help      Display this help message
ENDS
);
}
687
688