Merge "Remove unused $wgDebugDBTransactions"
[lhc/web/wiklou.git] / maintenance / dumpTextPass.php
1 <?php
2 /**
3 * Script that postprocesses XML dumps from dumpBackup.php to add page text
4 *
5 * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Maintenance
25 */
26
27 $originalDir = getcwd();
28
29 require_once( dirname( __FILE__ ) . '/commandLine.inc' );
30 require_once( 'backup.inc' );
31
/**
 * Dumper that reads a stub XML dump and fills in the text of each revision.
 *
 * @ingroup Maintenance
 */
class TextPassDumper extends BackupDumper {
	// Prefetch source created by the 'prefetch' option; null when not in use.
	public $prefetch = null;
	// URI of the stub dump to read; replaced by the 'stub' option.
	public $input = "php://stdin";
	// WikiExporter::FULL or WikiExporter::CURRENT, selected by the 'full'/'current' options.
	public $history = WikiExporter::FULL;
	// Number of getText() calls so far.
	public $fetchCount = 0;
	// Number of texts served from the prefetch source that passed the length check.
	public $prefetchCount = 0;
	// Values of the two counters above as of the last progress report.
	public $prefetchCountLast = 0;
	public $fetchCountLast = 0;

	// Attempts made per getText() call before giving up on that text.
	public $maxFailures = 5;
	// Number of getText() calls in a row that may fail before the run is aborted.
	public $maxConsecutiveFailedTextRetrievals = 200;
	public $failureTimeout = 5; // Seconds to sleep after db failure

	// PHP binary used to start the text-fetching subprocess.
	public $php = "php";
	// Whether texts are fetched through a subprocess (the 'spawn' option).
	public $spawn = false;
	// Process handle and pipes of the subprocess; false while none is open.
	public $spawnProc = false;
	public $spawnWrite = false;
	public $spawnRead = false;
	public $spawnErr = false;

	// XmlDumpWriter used to produce stream headers/footers at checkpoints.
	public $xmlwriterobj = false;

	// when we spend more than maxTimeAllowed seconds on this run, we continue
	// processing until we write out the next complete page, then save output file(s),
	// rename it/them and open new one(s)
	public $maxTimeAllowed = 0; // 0 = no limit
	// Set once the time limit has been passed; cleared after the checkpoint is written.
	public $timeExceeded = false;
	// Page ids of the first and last page written since the last checkpoint.
	public $firstPageWritten = false;
	public $lastPageWritten = false;
	// True right after a checkpoint, until the next parser event is handled.
	public $checkpointJustWritten = false;
	// Filename patterns collected from the 'checkpointfile' option.
	public $checkpointFiles = array();

	/**
	 * @var DatabaseBase
	 */
	protected $db;
71
72
/**
 * Discards the database connection $this->db and tries to obtain a new one.
 *
 * The intent is to end up on a /different/ connection where that is
 * possible, i.e. to switch to another failover on every call.
 *
 * $this->lb is reset as well; all connections it holds are closed first.
 * No retrying happens here: failures are reported to the caller as
 * MWException so that retry logic lives in one place only.
 *
 * @throws MWException
 */
function rotateDb() {
	// Tear down the connections held by the previous load balancer.
	if ( isset( $this->lb ) ) {
		$this->lb->closeAll();
		unset( $this->lb );
	}

	// closeAll() should have closed our connection too; refuse to go on otherwise.
	$stillOpen = isset( $this->db ) && $this->db->isOpen();
	if ( $stillOpen ) {
		throw new MWException( 'DB is set and has not been closed by the Load Balancer' );
	}

	unset( $this->db );

	// 1. A fresh load balancer.
	try {
		$this->lb = wfGetLBFactory()->newMainLB();
	} catch ( Exception $lbError ) {
		$reason = $lbError->getMessage();
		throw new MWException( __METHOD__ . " rotating DB failed to obtain new load balancer ($reason)" );
	}

	// 2. A connection obtained through that load balancer.
	try {
		$this->db = $this->lb->getConnection( DB_SLAVE, 'backup' );
	} catch ( Exception $dbError ) {
		$reason = $dbError->getMessage();
		throw new MWException( __METHOD__ . " rotating DB failed to obtain new database ($reason)" );
	}
}
116
117
/**
 * Lets the parent class set up progress tracking, then starts the
 * checkpoint clock at the start time of the run.
 *
 * @param $history mixed Not read by this override; the parent is called without arguments.
 */
function initProgress( $history ) {
	parent::initProgress();

	// The first checkpoint interval is measured from the start of the run.
	$this->timeOfCheckpoint = $this->startTime;
}
122
/**
 * Runs the text pass: reads the stub dump from $this->input, fills in the
 * revision texts and writes the result through $this->egress.
 *
 * @param $history mixed Not read here; $this->history is used instead.
 * @param $text mixed Not read here.
 * @throws MWException If the input cannot be opened, the option check
 *   fails, or readDump() reports an XML error.
 */
function dump( $history, $text = WikiExporter::TEXT ) {
	// This shouldn't happen if on console... ;)
	header( 'Content-type: text/html; charset=UTF-8' );

	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	// Try to get an initial database connection so that the very first
	// getText() call does not start with a failure. It is not serious if
	// this fails: getText() retries and can start without a connection.
	try {
		$this->rotateDb();
	} catch ( Exception $e ) {
		// Not counted as a failure; just let eventual watchdogs know.
		$this->progress( "Getting initial DB connection failed (" .
			$e->getMessage() . ")" );
	}

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter();

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		// Without this check an unreadable stub would produce an empty dump
		// that looks like a success.
		throw new MWException( "Unable to open input " . $this->input . " for reading" );
	}

	// Close the input on both the normal and the exceptional path.
	try {
		$result = $this->readDump( $input );
	} catch ( Exception $e ) {
		fclose( $input );
		throw $e;
	}
	fclose( $input );

	if ( WikiError::isError( $result ) ) {
		throw new MWException( $result->getMessage() );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
169
/**
 * Handles one command line option of the form --opt[=val[:param]].
 *
 * Only 'prefetch' and 'stub' describe files, so only those run their
 * arguments through processFileOpt(); the other options may arrive with
 * no $param at all.
 *
 * @param $opt string Option name without the leading dashes
 * @param $val string|null Option value (for file options: the compression type)
 * @param $param string|null Second part of the value (for file options: ';'-separated files)
 */
function processOption( $opt, $val, $param ) {
	global $IP;

	switch ( $opt ) {
	case 'prefetch':
		require_once "$IP/maintenance/backupPrefetch.inc";
		$this->prefetch = new BaseDump( $this->processFileOpt( $val, $param ) );
		break;
	case 'stub':
		$this->input = $this->processFileOpt( $val, $param );
		break;
	case 'maxtime':
		// Given in minutes, stored in seconds.
		$this->maxTimeAllowed = intval( $val ) * 60;
		break;
	case 'checkpointfile':
		$this->checkpointFiles[] = $val;
		break;
	case 'current':
		$this->history = WikiExporter::CURRENT;
		break;
	case 'full':
		$this->history = WikiExporter::FULL;
		break;
	case 'spawn':
		$this->spawn = true;
		// An optional value overrides the PHP binary used for the subprocess.
		if ( $val ) {
			$this->php = $val;
		}
		break;
	}
}
202
/**
 * Turns a ';'-separated list of files into a ';'-separated list of stream
 * URIs for the given compression type.
 *
 * @param $val string Compression type: "file", "gzip", "bzip2" or "7zip";
 *   any other value is treated like "file"
 * @param $param string File names separated by ';'
 * @return string The same files, each prefixed with the matching stream wrapper
 */
function processFileOpt( $val, $param ) {
	// The type is the same for every file, so pick the wrapper prefix once.
	switch ( $val ) {
	case "file":
		$scheme = "";
		break;
	case "gzip":
		$scheme = "compress.zlib://";
		break;
	case "bzip2":
		$scheme = "compress.bzip2://";
		break;
	case "7zip":
		$scheme = "mediawiki.compress.7z://";
		break;
	default:
		$scheme = "";
	}

	$wrapped = array();
	foreach ( explode( ';', $param ) as $uri ) {
		$wrapped[] = $scheme . $uri;
	}

	return implode( ';', $wrapped );
}
227
/**
 * Overridden to include prefetch ratio if enabled.
 *
 * Without a prefetch source the parent's report is used unchanged.
 * Values that cannot be computed yet (no time elapsed, nothing fetched,
 * no revisions counted) are passed to the format string as '-'.
 */
function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();
		return;
	}

	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	// Sample the clock once so that all deltas refer to the same instant.
	$nowts = wfTime();
	$deltaAll = $nowts - $this->startTime;
	$deltaPart = $nowts - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;
	// Fetches and prefetch hits since the previous report.
	$fetchCountPart = $this->fetchCount - $this->fetchCountLast;
	$prefetchCountPart = $this->prefetchCount - $this->prefetchCountLast;

	$pageRate = '-';
	$revRate = '-';
	$etats = '-';
	$fetchRate = '-';
	if ( $deltaAll ) {
		// An ETA needs a non-zero share of the work done; otherwise the
		// extrapolation below would divide by zero.
		if ( $this->revCount && $this->maxCount ) {
			$portion = $this->revCount / $this->maxCount;
			$eta = $this->startTime + $deltaAll / $portion;
			$etats = wfTimestamp( TS_DB, intval( $eta ) );
		}
		if ( $this->fetchCount ) {
			$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
		}
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	}

	$fetchRatePart = '-';
	$pageRatePart = '-';
	$revRatePart = '-';
	if ( $deltaPart ) {
		// "curr" figures cover only the interval since the last report.
		if ( $fetchCountPart ) {
			$fetchRatePart = 100.0 * $prefetchCountPart / $fetchCountPart;
		}
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	}

	$this->progress( sprintf( "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), %d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% prefetched (all|curr), ETA %s [max %d]",
		$now, wfWikiID(), $this->ID, $this->pageCount, $pageRate, $pageRatePart, $this->revCount, $revRate, $revRatePart, $fetchRate, $fetchRatePart, $etats, $this->maxCount ) );

	// Remember the current totals as the baseline for the next report.
	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	$this->prefetchCountLast = $this->prefetchCount;
	$this->fetchCountLast = $this->fetchCount;
}
284
/**
 * Flags that the time limit has been passed, so that endElement() writes
 * a checkpoint once the current page is complete.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
288
/**
 * Tells whether more than $this->maxTimeAllowed seconds lie between the
 * last checkpoint and the time of the last progress report.
 *
 * @return bool Always false when no time limit is set (maxTimeAllowed is 0)
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}

	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;
	return $sinceCheckpoint > $this->maxTimeAllowed;
}
295
/**
 * Validates the combination of the checkpointfile and maxtime options.
 * Needs $this->egress to be set, as it asks it for the output filenames.
 *
 * @throws MWException If only one of the two options is given, a pattern
 *   does not contain exactly two '%s', or the number of patterns differs
 *   from the number of output files.
 */
function finalOptionCheck() {
	$haveCheckpointFiles = (bool)$this->checkpointFiles;
	$haveMaxTime = (bool)$this->maxTimeAllowed;

	// Either both options are given or neither.
	if ( $haveCheckpointFiles !== $haveMaxTime ) {
		throw new MWException( "Options checkpointfile and maxtime must be specified together.\n" );
	}

	foreach ( $this->checkpointFiles as $pattern ) {
		$count = substr_count( $pattern, "%s" );
		if ( $count != 2 ) {
			throw new MWException( "Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $count instead, file is $pattern.\n" );
		}
	}

	if ( !$haveCheckpointFiles ) {
		return;
	}

	// Each output file needs its own checkpoint pattern.
	$filenameList = (array)$this->egress->getFilenames();
	if ( count( $filenameList ) != count( $this->checkpointFiles ) ) {
		throw new MWException( "One checkpointfile must be specified for each output option, if maxtime is used.\n" );
	}
}
315
/**
 * Feeds the stub dump through an XML parser whose handlers (startElement,
 * endElement, characterData) produce the output.
 *
 * When a time limit is set, output written since the last checkpoint is
 * closed and renamed to its checkpoint name at the end.
 *
 * @param $input resource Open stream to read the stub dump from
 * @return bool|WikiXmlError true on success, an error object on parse failure
 */
function readDump( $input ) {
	// Reset the per-run parser state.
	$this->buffer = "";
	$this->openElement = false;
	$this->atStart = true;
	$this->state = "";
	$this->lastName = "";
	$this->thisPage = 0;
	$this->thisRev = 0;

	$parser = xml_parser_create( "UTF-8" );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );

	$onStart = array( &$this, 'startElement' );
	$onEnd = array( &$this, 'endElement' );
	$onData = array( &$this, 'characterData' );
	xml_set_element_handler( $parser, $onStart, $onEnd );
	xml_set_character_data_handler( $parser, $onData );

	$offset = 0; // for context extraction on error reporting
	$bufferSize = 512 * 1024;
	do {
		// Only flag the timeout here; the checkpoint itself is written
		// by endElement() when the current page is complete.
		if ( $this->checkIfTimeExceeded() ) {
			$this->setTimeExceeded();
		}
		$chunk = fread( $input, $bufferSize );
		$parsedOk = xml_parse( $parser, $chunk, feof( $input ) );
		if ( !$parsedOk ) {
			wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
			return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
		}
		$offset += strlen( $chunk );
	} while ( $chunk !== false && !feof( $input ) );

	if ( $this->maxTimeAllowed ) {
		$filenameList = (array)$this->egress->getFilenames();
		// we wrote some stuff after last checkpoint that needs renamed
		if ( file_exists( $filenameList[0] ) ) {
			// We might have written only the header and footer, with no
			// pages or revisions at all (perhaps they were all deleted).
			// There is no page id 0, so that is used for both ids; the
			// caller decides what to do with a file that holds only the
			// siteinfo information and the mw tags.
			$pagesWritten = (bool)$this->firstPageWritten;
			$firstRaw = $pagesWritten ? $this->firstPageWritten : 0;
			$lastRaw = $pagesWritten ? $this->lastPageWritten : 0;
			$firstPageID = str_pad( $firstRaw, 9, "0", STR_PAD_LEFT );
			$lastPageID = str_pad( $lastRaw, 9, "0", STR_PAD_LEFT );

			$newFilenames = array();
			$fileTotal = count( $filenameList );
			for ( $i = 0; $i < $fileTotal; $i++ ) {
				$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$fileinfo = pathinfo( $filenameList[$i] );
				// Checkpoint files stay in the directory of the output file.
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
			}
			$this->egress->closeAndRename( $newFilenames );
		}
	}
	xml_parser_free( $parser );

	return true;
}
374
/**
 * Tries to get the revision text for a revision id.
 *
 * Upon errors, retries (up to $this->maxFailures tries each call).
 * If no good text could be found even after retrying, "" is returned.
 * If no good text could be returned for
 * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText,
 * MWException is thrown.
 *
 * A candidate text counts as good when its byte length equals the rev_len
 * stored for the current revision ($this->thisRev).
 *
 * @param $id string The revision id to get the text for
 *
 * @return string The revision text for $id, or ""
 * @throws MWException
 */
function getText( $id ) {
	$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
	$tryIsPrefetch = false; // Whether the current candidate came from the prefetch source.
	$text = false; // The candidate for a good text. false if no proper value.
	$failures = 0; // The number of times this invocation of getText already failed.

	static $consecutiveFailedTextRetrievals = 0; // The number of times getText failed without
	                                             // yielding a good text in between.

	$this->fetchCount++;

	// To be able to simply return on success without any bookkeeping, we
	// assume that this fetch works (possibly after some retries). We keep
	// the old value nevertheless, so that it can be restored if problems
	// occur (see after the while loop).
	$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
	$consecutiveFailedTextRetrievals = 0;

	while ( $failures < $this->maxFailures ) {

		// As soon as we have found a good text for $id, we return
		// immediately. Hence, if we make it past the try/catch block, we
		// know that we did not find a good text.

		try {
			// Step 1: Get some text (or reuse the one from the previous
			// iteration if only the plausibility check could not be run)

			// Trying prefetch, if it has not been tried before
			if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
				$prefetchNotTried = false;
				$tryIsPrefetch = true;
				$text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
				if ( $text === null ) {
					$text = false;
				}
			}

			if ( $text === false ) {
				// Fall back to asking the database
				$tryIsPrefetch = false;
				if ( $this->spawn ) {
					$text = $this->getTextSpawned( $id );
				} else {
					$text = $this->getTextDb( $id );
				}
			}

			if ( $text === false ) {
				throw new MWException( "Generic error while obtaining text for id " . $id );
			}

			// We received a candidate for the text of $id via some method

			// Step 2: Check for plausibility and return the text if it is
			// plausible
			$revID = intval( $this->thisRev );
			if ( ! isset( $this->db ) ) {
				throw new MWException( "No database available" );
			}
			// Pass the caller name like getTextDb() does, so that the
			// query is attributed to this method.
			$revLength = $this->db->selectField( 'revision', 'rev_len',
				array( 'rev_id' => $revID ), __METHOD__ );
			if ( strlen( $text ) == $revLength ) {
				if ( $tryIsPrefetch ) {
					$this->prefetchCount++;
				}
				return $text;
			}

			// Drop the candidate so that the next iteration fetches anew.
			$text = false;
			throw new MWException( "Received text is unplausible for id " . $id );

		} catch ( Exception $e ) {
			$msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
			if ( $failures + 1 < $this->maxFailures ) {
				$msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
			}
			$this->progress( $msg );
		}

		// Something went wrong; we did not get a text that was plausible :(
		$failures++;

		// After backing off for some time, we try to reboot the whole
		// process as much as possible, so that failures are not carried
		// over from one part to the others.
		sleep( $this->failureTimeout );
		try {
			$this->rotateDb();
			if ( $this->spawn ) {
				$this->closeSpawn();
				$this->openSpawn();
			}
		} catch ( Exception $e ) {
			$this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
				" Trying to continue anyways" );
		}
	}

	// Retrieving a good text for $id failed (at least) maxFailures times.
	// We abort for this $id.

	// Restore the count of consecutive failures, and abort the whole run
	// if the dump is too broken.
	$consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
	if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
		throw new MWException( "Graceful storage failure" );
	}

	return "";
}
498
499
/**
 * Loads a text record directly from the database through $this->db,
 * strips carriage returns and normalizes it with the content language.
 *
 * May throw a database error if, say, the server dies during query.
 *
 * @param $id string Id of the row in the text table
 * @return bool|string The normalized text, or false if no text could be extracted
 * @throws MWException If no database connection is set
 */
private function getTextDb( $id ) {
	global $wgContLang;
	if ( ! isset( $this->db ) ) {
		// Separator added: the message used to run straight on from the method name.
		throw new MWException( __METHOD__ . ": No database available" );
	}
	$row = $this->db->selectRow( 'text',
		array( 'old_text', 'old_flags' ),
		array( 'old_id' => $id ),
		__METHOD__ );
	$text = Revision::getRevisionText( $row );
	if ( $text === false ) {
		return false;
	}
	$stripped = str_replace( "\r", "", $text );
	$normalized = $wgContLang->normalize( $stripped );
	return $normalized;
}
523
/**
 * Fetches a text through the subprocess, starting it first if necessary.
 * PHP warnings are suppressed for the duration of the call.
 *
 * @param $id string Id of the text to fetch
 * @return bool|string The text, or false if the subprocess could not be
 *   started or the exchange with it failed
 */
private function getTextSpawned( $id ) {
	wfSuppressWarnings();
	$text = false;
	// Only talk to the subprocess if there is one: either it is already
	// running or it could be started just now.
	if ( $this->spawnProc || $this->openSpawn() ) {
		$text = $this->getTextSpawnedOnce( $id );
	}
	wfRestoreWarnings();
	return $text;
}
534
/**
 * Starts the fetchText.php subprocess and stores its process handle and
 * its stdin/stdout pipes. The subprocess' stderr goes to /dev/null.
 *
 * If multiversion/MWScript.php exists next to $IP, fetchText.php is run
 * through it; otherwise maintenance/fetchText.php is run directly.
 *
 * @return bool true if the subprocess was started, false otherwise
 */
function openSpawn() {
	global $IP;

	$multiversionScript = "$IP/../multiversion/MWScript.php";
	if ( file_exists( $multiversionScript ) ) {
		$args = array(
			$this->php,
			$multiversionScript,
			"fetchText.php",
			'--wiki', wfWikiID() );
	} else {
		$args = array(
			$this->php,
			"$IP/maintenance/fetchText.php",
			'--wiki', wfWikiID() );
	}
	$cmd = implode( " ", array_map( 'wfEscapeShellArg', $args ) );

	$spec = array(
		0 => array( "pipe", "r" ),
		1 => array( "pipe", "w" ),
		2 => array( "file", "/dev/null", "a" ) );
	$pipes = array();

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $spec, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );
		return false;
	}

	$this->spawnWrite = $pipes[0]; // -> stdin of the subprocess
	$this->spawnRead = $pipes[1]; // <- stdout of the subprocess

	return true;
}
575
/**
 * Closes the pipes to the subprocess and the subprocess itself, and
 * resets the corresponding properties to false. PHP warnings are
 * suppressed while doing so.
 */
private function closeSpawn() {
	wfSuppressWarnings();
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	$this->spawnRead = false;
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	$this->spawnWrite = false;
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}
	$this->spawnErr = false;
	if ( $this->spawnProc ) {
		// The handle comes from proc_open(), so it has to be closed with
		// proc_close(); pclose() only accepts handles from popen().
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	wfRestoreWarnings();
}
592
/**
 * Performs one request/response exchange with the subprocess.
 *
 * The id is written as a line to the subprocess. The reply is read as a
 * line with the id, a line with the byte length, and then that many bytes
 * of text. The text is stripped of carriage returns and normalized with
 * the content language.
 *
 * @param $id string Id of the text to fetch
 * @return bool|string The normalized text, or false on any I/O failure,
 *   id mismatch, negative length or short read
 */
private function getTextSpawnedOnce( $id ) {
	global $wgContLang;

	$ok = fwrite( $this->spawnWrite, "$id\n" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	if ( !$ok ) {
		return false;
	}

	// check that the text id they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newId = fgets( $this->spawnRead );
	if ( $newId === false ) {
		return false;
	}
	if ( $id != intval( $newId ) ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		// Stop on a read error, and also when the pipe has reached EOF:
		// fread() then returns "" rather than false, which would otherwise
		// keep this loop spinning forever if the subprocess died.
		if ( $buffer === false || ( $buffer === '' && feof( $this->spawnRead ) ) ) {
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = $wgContLang->normalize( $stripped );
	return $normalized;
}
642
/**
 * XML parser callback for an opening tag.
 *
 * Flushes what has been buffered when a revision or the first page
 * starts, and replaces a <text id="..."> stub by a text element that
 * carries the fetched revision text.
 *
 * @param $parser resource The XML parser
 * @param $name string Element name
 * @param $attribs array Attributes of the element
 */
function startElement( $parser, $name, $attribs ) {
	$this->checkpointJustWritten = false;

	$this->clearOpenElement( null );
	$this->lastName = $name;

	switch ( $name ) {
	case 'revision':
		$this->state = $name;
		// Everything buffered so far belongs to the page header.
		$this->egress->writeOpenPage( null, $this->buffer );
		$this->buffer = "";
		break;
	case 'page':
		$this->state = $name;
		if ( $this->atStart ) {
			// Everything buffered before the first page is the stream header.
			$this->egress->writeOpenStream( $this->buffer );
			$this->buffer = "";
			$this->atStart = false;
		}
		break;
	}

	// Anything but a text stub is passed through with its attributes.
	if ( $name != "text" || !isset( $attribs['id'] ) ) {
		$this->openElement = array( $name, $attribs );
		return;
	}

	$text = $this->getText( $attribs['id'] );
	$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
	if ( strlen( $text ) > 0 ) {
		$this->characterData( $parser, $text );
	}
}
672
/**
 * XML parser callback for a closing tag.
 *
 * Writes out completed revisions and pages. When the time limit has been
 * exceeded, the end of a page is also where the checkpoint happens: the
 * stream is closed, the output files are renamed to their checkpoint
 * names and reopened, and a new stream header is buffered.
 *
 * @param $parser resource The XML parser
 * @param $name string Element name
 */
function endElement( $parser, $name ) {
	$this->checkpointJustWritten = false;

	if ( $this->openElement ) {
		$this->clearOpenElement( "" );
	} else {
		$this->buffer .= "</$name>";
	}

	switch ( $name ) {
	case 'revision':
		$this->egress->writeRevision( null, $this->buffer );
		$this->buffer = "";
		$this->thisRev = "";
		break;

	case 'page':
		$pageId = trim( $this->thisPage );
		if ( ! $this->firstPageWritten ) {
			$this->firstPageWritten = $pageId;
		}
		$this->lastPageWritten = $pageId;

		$this->egress->writeClosePage( $this->buffer );

		if ( !$this->timeExceeded ) {
			$this->buffer = "";
			$this->thisPage = "";
			break;
		}

		// nasty hack, we can't just write the chardata after the
		// page tag, it will include leading blanks from the next line
		$this->egress->sink->write( "\n" );

		$this->buffer = $this->xmlwriterobj->closeStream();
		$this->egress->writeCloseStream( $this->buffer );

		$this->buffer = "";
		$this->thisPage = "";

		// this could be more than one file if we had more than one output arg
		$filenameList = (array)$this->egress->getFilenames();
		$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
		$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
		$newFilenames = array();
		$fileTotal = count( $filenameList );
		for ( $i = 0; $i < $fileTotal; $i++ ) {
			$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
			$fileinfo = pathinfo( $filenameList[$i] );
			// Checkpoint files stay in the directory of the output file.
			$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
		}
		$this->egress->closeRenameAndReopen( $newFilenames );

		// Start the next file with a fresh header and a fresh time budget.
		$this->buffer = $this->xmlwriterobj->openStream();
		$this->timeExceeded = false;
		$this->timeOfCheckpoint = $this->lastTime;
		$this->firstPageWritten = false;
		$this->checkpointJustWritten = true;
		break;

	case 'mediawiki':
		$this->egress->writeCloseStream( $this->buffer );
		$this->buffer = "";
		break;
	}
}
731
/**
 * XML parser callback for character data; also called by startElement()
 * to insert fetched revision text.
 *
 * Collects the page and revision ids from <id> elements and appends the
 * escaped data to the output buffer.
 *
 * @param $parser resource The XML parser
 * @param $data string The character data
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );
	if ( $this->lastName == "id" ) {
		if ( $this->state == "revision" ) {
			$this->thisRev .= $data;
		} elseif ( $this->state == "page" ) {
			$this->thisPage .= $data;
		}
	}
	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		// Guard against an empty string before looking at its first byte.
		if ( $data !== '' && $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}
	$this->buffer .= htmlspecialchars( $data );
}
751
/**
 * Appends the pending opening tag, if there is one, to the buffer and
 * forgets it.
 *
 * @param $style mixed Passed to Xml::element() as its third argument;
 *   callers in this class pass null or ""
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}

	list( $tagName, $tagAttribs ) = $this->openElement;
	$this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
	$this->openElement = false;
}
758 }
759
760
// Entry point: run the text pass, or print the usage text for --help.
$dumper = new TextPassDumper( $argv );

if ( !isset( $options['help'] ) ) {
	$dumper->dump( true );
} else {
	// The checkpointfile description states that both %s are required,
	// which is what TextPassDumper::finalOptionCheck() enforces.
	$dumper->progress( <<<ENDS
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.

Usage: php dumpTextPass.php [<options>]
Options:
  --stub=<type>:<file>      To load a compressed stub dump instead of stdin
  --prefetch=<type>:<file>  Use a prior dump file as a text source, to save
                            pressure on the database.
                            (Requires the XMLReader extension)
  --maxtime=<minutes>       Write out checkpoint file after this many minutes (writing
                            out complete page, closing xml file properly, and opening new one
                            with header). This option requires the checkpointfile option.
  --checkpointfile=<filenamepattern> Use this string for checkpoint filenames.
                            The pattern must contain exactly two %s: the first is replaced
                            by the first pageid written, the second by the last pageid
                            written. This option requires the maxtime option.
  --quiet                   Don't dump status reports to stderr.
  --report=n                Report position and speed after every n pages processed.
                            (Default: 100)
  --server=h                Force reading from MySQL server h
  --current                 Base ETA on number of pages in database instead of all revisions
  --full                    Base ETA on number of revisions in database (default)
  --spawn[=<php>]           Spawn a subprocess for loading text records, optionally
                            using the given PHP binary
  --help                    Display this help message
ENDS
	);
}
795
796