Follow-up to r114256: Removing final assert
[lhc/web/wiklou.git] / maintenance / dumpTextPass.php
1 <?php
2 /**
3 * Script that postprocesses XML dumps from dumpBackup.php to add page text
4 *
5 * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Maintenance
25 */
26
27 $originalDir = getcwd();
28
29 require_once( dirname( __FILE__ ) . '/commandLine.inc' );
30 require_once( 'backup.inc' );
31
32 /**
33 * @ingroup Maintenance
34 */
35 class TextPassDumper extends BackupDumper {
36 var $prefetch = null;
37 var $input = "php://stdin";
38 var $history = WikiExporter::FULL;
39 var $fetchCount = 0;
40 var $prefetchCount = 0;
41 var $prefetchCountLast = 0;
42 var $fetchCountLast = 0;
43
44 var $maxFailures = 5;
45 var $maxConsecutiveFailedTextRetrievals = 200;
46 var $failureTimeout = 5; // Seconds to sleep after db failure
47
48 var $php = "php";
49 var $spawn = false;
50 var $spawnProc = false;
51 var $spawnWrite = false;
52 var $spawnRead = false;
53 var $spawnErr = false;
54
55 var $xmlwriterobj = false;
56
57 // when we spend more than maxTimeAllowed seconds on this run, we continue
58 // processing until we write out the next complete page, then save output file(s),
59 // rename it/them and open new one(s)
60 var $maxTimeAllowed = 0; // 0 = no limit
61 var $timeExceeded = false;
62 var $firstPageWritten = false;
63 var $lastPageWritten = false;
64 var $checkpointJustWritten = false;
65 var $checkpointFiles = array();
66
67 /**
68 * @var DatabaseBase
69 */
70 protected $db;
71
72
/**
 * Discard the current database connection ($this->db) and obtain a new one.
 *
 * Any existing load balancer in $this->lb is asked to close all of its
 * connections and is then dropped; a new main load balancer is created and
 * a DB_SLAVE connection from the 'backup' group is requested from it.
 * Where several servers are available this is meant to end up on a
 * different one on each call.
 *
 * No retrying happens here; failures are reported to the caller.
 *
 * @throws MWException if the old connection is still open after the load
 *   balancer was closed, or if the new load balancer or connection cannot
 *   be obtained
 */
function rotateDb() {
	// Tear down the previous load balancer together with its connections.
	if ( isset( $this->lb ) ) {
		$this->lb->closeAll();
		unset( $this->lb );
	}

	// The closeAll() above is expected to have closed our connection too.
	$stillOpen = isset( $this->db ) && $this->db->isOpen();
	if ( $stillOpen ) {
		throw new MWException( 'DB is set and has not been closed by the Load Balancer' );
	}

	unset( $this->db );

	// Build the replacement. Deliberately no retry at this level: the calling
	// code owns the retry policy, so that retries do not stack up across layers.

	// Step 1: a fresh load balancer.
	try {
		$this->lb = wfGetLBFactory()->newMainLB();
	} catch ( Exception $lbError ) {
		$reason = $lbError->getMessage();
		throw new MWException( __METHOD__ . " rotating DB failed to obtain new load balancer ($reason)" );
	}

	// Step 2: a connection handed out by that load balancer.
	try {
		$this->db = $this->lb->getConnection( DB_SLAVE, 'backup' );
	} catch ( Exception $dbError ) {
		$reason = $dbError->getMessage();
		throw new MWException( __METHOD__ . " rotating DB failed to obtain new database ($reason)" );
	}
}
118
119
/**
 * Initialise progress tracking and start the checkpoint timer.
 *
 * @param $history The history mode (WikiExporter::FULL or
 *   WikiExporter::CURRENT) the progress estimate should be based on
 */
function initProgress( $history ) {
	// Forward the mode instead of dropping it: the caller passes
	// $this->history, which --current changes, and the help text promises
	// that --current changes what the ETA is based on.
	// NOTE(review): the parent implementation is not visible from this
	// file; confirm it accepts the history mode as its first argument.
	parent::initProgress( $history );
	$this->timeOfCheckpoint = $this->startTime;
}
124
/**
 * Run the text pass: read the stub dump from $this->input, fill in the
 * revision texts and write the result through the configured sink.
 *
 * Both parameters are unused by this implementation; the history mode is
 * taken from $this->history.
 *
 * @param $history Unused here
 * @param $text Unused here
 * @throws MWException if the options are inconsistent, the input cannot be
 *   opened, or the stub dump cannot be parsed
 */
function dump( $history, $text = WikiExporter::TEXT ) {
	// This shouldn't happen if on console... ;)
	header( 'Content-type: text/html; charset=UTF-8' );

	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	// Try to have a database connection ready before the first getText()
	// call. Failing here is not serious: getText() retries on its own and
	// can start without a working connection.
	try {
		$this->rotateDb();
	} catch ( Exception $e ) {
		// Not counted as a failure; only reported so watchdogs can see it.
		$this->progress( "Getting initial DB connection failed (" .
			$e->getMessage() . ")" );
	}

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// Has to run after egress is set, hence not in the constructor.
	$this->finalOptionCheck();

	// Only needed to know how to close/open a stream at checkpoints.
	$this->xmlwriterobj = new XmlDumpWriter();

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		// Without this check an unreadable stub would be passed on as
		// boolean false to the reading functions.
		throw new MWException( "Unable to open input stream " . $this->input );
	}

	$result = $this->readDump( $input );
	// The input has been consumed (or parsing has failed); release the handle.
	fclose( $input );

	if ( WikiError::isError( $result ) ) {
		throw new MWException( $result->getMessage() );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
171
/**
 * Apply a single command line option to this dumper.
 *
 * Options not listed here are ignored by this method.
 *
 * @param $opt string Option name
 * @param $val string Option value; for file options presumably the part
 *   before the colon (the type) -- see the usage text
 * @param $param string Remainder of the option; for file options
 *   presumably the file list
 */
function processOption( $opt, $val, $param ) {
	global $IP;

	// Computed for every option, although only 'prefetch' and 'stub' use it.
	$fileUri = $this->processFileOpt( $val, $param );

	if ( $opt == 'prefetch' ) {
		require_once "$IP/maintenance/backupPrefetch.inc";
		$this->prefetch = new BaseDump( $fileUri );
	} elseif ( $opt == 'stub' ) {
		$this->input = $fileUri;
	} elseif ( $opt == 'maxtime' ) {
		// Given in minutes, stored in seconds.
		$this->maxTimeAllowed = intval( $val ) * 60;
	} elseif ( $opt == 'checkpointfile' ) {
		$this->checkpointFiles[] = $val;
	} elseif ( $opt == 'current' ) {
		$this->history = WikiExporter::CURRENT;
	} elseif ( $opt == 'full' ) {
		$this->history = WikiExporter::FULL;
	} elseif ( $opt == 'spawn' ) {
		$this->spawn = true;
		// An optional value names the PHP binary used for the subprocess.
		if ( $val ) {
			$this->php = $val;
		}
	}
}
204
/**
 * Turn a file type plus a ';'-separated file list into a ';'-separated
 * list of stream URIs.
 *
 * "gzip", "bzip2" and "7zip" get the matching stream wrapper prefix;
 * "file" and any other type leave the names unchanged.
 *
 * @param $val string File type
 * @param $param string One or more file names separated by ';'
 * @return string The resulting URIs joined by ';'
 */
function processFileOpt( $val, $param ) {
	// Stream wrapper prefix per compression type.
	$wrapperPrefixes = array(
		"gzip" => "compress.zlib://",
		"bzip2" => "compress.bzip2://",
		"7zip" => "mediawiki.compress.7z://",
	);

	$prefix = "";
	if ( is_string( $val ) && isset( $wrapperPrefixes[$val] ) ) {
		$prefix = $wrapperPrefixes[$val];
	}

	$uris = array();
	foreach ( explode( ';', $param ) as $name ) {
		$uris[] = $prefix . $name;
	}

	return implode( ';', $uris );
}
229
/**
 * Overridden to include prefetch ratio if enabled.
 *
 * Without a prefetch source the parent report is used unchanged. With one,
 * a progress line is emitted that additionally shows which percentage of
 * the fetched texts came from the prefetch source, both over the whole run
 * ("all") and since the previous report ("curr").
 */
function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();
		return;
	}

	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	$nowts = wfTime();
	$deltaAll = wfTime() - $this->startTime;
	$deltaPart = wfTime() - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;

	// The *Last counters hold the cumulative totals as of the previous
	// report, so the figures for the current interval are the differences.
	$fetchCountPart = $this->fetchCount - $this->fetchCountLast;
	$prefetchCountPart = $this->prefetchCount - $this->prefetchCountLast;

	if ( $deltaAll ) {
		// No ETA can be extrapolated before the first revision has been
		// counted or when the maximum is unknown; avoid dividing by zero.
		if ( $this->revCount && $this->maxCount ) {
			$portion = $this->revCount / $this->maxCount;
			$eta = $this->startTime + $deltaAll / $portion;
			$etats = wfTimestamp( TS_DB, intval( $eta ) );
		} else {
			$etats = '-';
		}
		if ( $this->fetchCount ) {
			$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
		} else {
			$fetchRate = '-';
		}
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	} else {
		$pageRate = '-';
		$revRate = '-';
		$etats = '-';
		$fetchRate = '-';
	}

	if ( $deltaPart ) {
		if ( $fetchCountPart ) {
			$fetchRatePart = 100.0 * $prefetchCountPart / $fetchCountPart;
		} else {
			$fetchRatePart = '-';
		}
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	} else {
		$fetchRatePart = '-';
		$pageRatePart = '-';
		$revRatePart = '-';
	}

	$this->progress( sprintf( "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), %d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% prefetched (all|curr), ETA %s [max %d]",
		$now, wfWikiID(), $this->ID, $this->pageCount, $pageRate, $pageRatePart, $this->revCount, $revRate, $revRatePart, $fetchRate, $fetchRatePart, $etats, $this->maxCount ) );

	// Remember the totals as the baseline of the next interval.
	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	$this->prefetchCountLast = $this->prefetchCount;
	$this->fetchCountLast = $this->fetchCount;
}
286
/**
 * Flag that the time budget for the current output file is used up.
 * endElement() acts on the flag when the current page is closed.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
290
/**
 * Tell whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded progress time.
 *
 * @return bool Always false when no time limit is set (maxTimeAllowed is 0)
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}
	$elapsed = $this->lastTime - $this->timeOfCheckpoint;
	return $elapsed > $this->maxTimeAllowed;
}
297
/**
 * Validate the combination of the maxtime and checkpointfile options.
 *
 * Needs $this->egress to be set, since the number of output files is
 * compared against the number of checkpoint file patterns.
 *
 * @throws MWException if only one of the two options is given, if a
 *   pattern does not contain exactly two '%s', or if the number of
 *   patterns differs from the number of output files
 */
function finalOptionCheck() {
	$hasCheckpointFiles = (bool)$this->checkpointFiles;
	$hasMaxTime = (bool)$this->maxTimeAllowed;

	// Either both options are present or neither is.
	if ( $hasCheckpointFiles !== $hasMaxTime ) {
		throw new MWException("Options checkpointfile and maxtime must be specified together.\n");
	}

	foreach ( $this->checkpointFiles as $pattern ) {
		$count = substr_count( $pattern, "%s" );
		if ( $count != 2 ) {
			throw new MWException("Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $count instead, file is $pattern.\n");
		}
	}

	if ( !$hasCheckpointFiles ) {
		return;
	}

	$outputFiles = (array)$this->egress->getFilenames();
	if ( count( $outputFiles ) != count( $this->checkpointFiles ) ) {
		throw new MWException("One checkpointfile must be specified for each output option, if maxtime is used.\n");
	}
}
317
/**
 * Parse the stub dump from $input in chunks, driving startElement(),
 * endElement() and characterData(), which produce the output.
 *
 * When a time limit is set, output written since the last checkpoint is
 * closed and renamed to its checkpoint file name at the end.
 *
 * @param $input resource Readable stream containing the stub XML
 * @return bool|WikiXmlError true on success, a WikiXmlError object if the
 *   XML could not be parsed
 */
function readDump( $input ) {
	$this->buffer = "";
	$this->openElement = false;
	$this->atStart = true;
	$this->state = "";
	$this->lastName = "";
	$this->thisPage = 0;
	$this->thisRev = 0;

	$parser = xml_parser_create( "UTF-8" );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );

	// Pass $this by value: objects are handles, so no reference is needed,
	// and taking a reference to $this is rejected by PHP >= 7.1.
	xml_set_element_handler( $parser, array( $this, 'startElement' ), array( $this, 'endElement' ) );
	xml_set_character_data_handler( $parser, array( $this, 'characterData' ) );

	$offset = 0; // for context extraction on error reporting
	$bufferSize = 512 * 1024;
	do {
		// Only raise the flag here; the checkpoint itself is written by
		// endElement() once the current page is complete.
		if ( $this->checkIfTimeExceeded() ) {
			$this->setTimeExceeded();
		}
		$chunk = fread( $input, $bufferSize );
		if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
			wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
			return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
		}
		$offset += strlen( $chunk );
	} while ( $chunk !== false && !feof( $input ) );

	if ( $this->maxTimeAllowed ) {
		$filenameList = (array)$this->egress->getFilenames();
		// we wrote some stuff after last checkpoint that needs renamed
		if ( file_exists( $filenameList[0] ) ) {
			$newFilenames = array();
			# we might have just written the header and footer and had no
			# pages or revisions written... perhaps they were all deleted
			# there's no pageID 0 so we use that. the caller is responsible
			# for deciding what to do with a file containing only the
			# siteinfo information and the mw tags.
			if ( !$this->firstPageWritten ) {
				$firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
			} else {
				$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			}
			// Checkpoint files go into the directory of the matching output file.
			for ( $i = 0; $i < count( $filenameList ); $i++ ) {
				$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$fileinfo = pathinfo( $filenameList[$i] );
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
			}
			$this->egress->closeAndRename( $newFilenames );
		}
	}
	xml_parser_free( $parser );

	return true;
}
376
/**
 * Fetch the text belonging to a text id, with retries.
 *
 * The prefetch source (if configured) is asked once; otherwise, or if that
 * yields nothing, the text is loaded via the spawned subprocess or directly
 * from the database. A candidate is only accepted if its length equals the
 * rev_len stored for the current revision.
 *
 * Up to $this->maxFailures attempts are made per call. If none succeeds,
 * "" is returned. If more than $this->maxConsecutiveFailedTextRetrievals
 * calls in a row end without a good text, MWException is thrown.
 *
 * @param $id string The revision id to get the text for
 *
 * @return string The revision text for $id, or ""
 * @throws MWException
 */
function getText( $id ) {
	// Number of getText() calls in a row that ended without a good text.
	static $consecutiveFailedTextRetrievals = 0;

	$prefetchPending = true; // Prefetch has not been asked yet.
	$fromPrefetch = false;   // Whether the current candidate came from prefetch.
	$text = false;           // Current candidate; false while there is none.

	$this->fetchCount++;

	// Optimistically assume success, so that the success path can return
	// right away without any book keeping. The previous value is kept and
	// restored (plus one) after the loop if all attempts fail.
	$failedCallsBefore = $consecutiveFailedTextRetrievals;
	$consecutiveFailedTextRetrievals = 0;

	for ( $failures = 0; $failures < $this->maxFailures; $failures++ ) {

		// A good text leads to an immediate return from inside the try
		// block, so everything after the block is failure handling.
		try {
			// Step 1: Obtain a candidate, unless one is left over from an
			// iteration that failed before the length could be verified.
			if ( $text === false && isset( $this->prefetch ) && $prefetchPending ) {
				$prefetchPending = false;
				$fromPrefetch = true;
				$text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
				if ( $text === null ) {
					$text = false;
				}
			}

			if ( $text === false ) {
				// Fall back to the database, directly or via the subprocess.
				$fromPrefetch = false;
				$text = $this->spawn
					? $this->getTextSpawned( $id )
					: $this->getTextDb( $id );
			}

			if ( $text === false ) {
				throw new MWException( "Generic error while obtaining text for id " . $id );
			}

			// Step 2: Plausibility check against the stored revision length.
			$revID = intval( $this->thisRev );
			if ( !isset( $this->db ) ) {
				throw new MWException( "No database available" );
			}
			$revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
			if ( strlen( $text ) == $revLength ) {
				if ( $fromPrefetch ) {
					$this->prefetchCount++;
				}
				return $text;
			}

			// Wrong length: drop the candidate so the next attempt refetches.
			$text = false;
			throw new MWException( "Received text is unplausible for id " . $id );

		} catch ( Exception $e ) {
			$msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
			$retriesLeft = $this->maxFailures - $failures - 1;
			if ( $retriesLeft > 0 ) {
				$msg .= " (Will retry " . $retriesLeft . " more times)";
			}
			$this->progress( $msg );
		}

		// This attempt did not produce a plausible text. Back off, then
		// restart as much of the infrastructure as possible so that a
		// failure in one part is not carried over to the next attempt.
		sleep( $this->failureTimeout );
		try {
			$this->rotateDb();
			if ( $this->spawn ) {
				$this->closeSpawn();
				$this->openSpawn();
			}
		} catch ( Exception $e ) {
			$this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
				" Trying to continue anyways" );
		}
	}

	// Retrieving a good text for $id failed maxFailures times; give up on
	// this id. Restore the failure streak and abort the whole run if the
	// streak has become too long.
	$consecutiveFailedTextRetrievals = $failedCallsBefore + 1;
	if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
		throw new MWException( "Graceful storage failure" );
	}

	return "";
}
500
501
/**
 * Load a text directly from the 'text' table via $this->db, strip carriage
 * returns and normalize it with the content language.
 *
 * May throw a database error if, say, the server dies during query.
 *
 * @param $id string The text id (old_id) to load
 * @return bool|string The normalized text, or false if no text could be
 *   obtained from the row
 * @throws MWException if no database connection is set
 */
private function getTextDb( $id ) {
	global $wgContLang;
	if ( !isset( $this->db ) ) {
		// Separator added: the method name used to run straight into the text.
		throw new MWException( __METHOD__ . ": No database available" );
	}
	$row = $this->db->selectRow( 'text',
		array( 'old_text', 'old_flags' ),
		array( 'old_id' => $id ),
		__METHOD__ );
	$text = Revision::getRevisionText( $row );
	if ( $text === false ) {
		return false;
	}
	$stripped = str_replace( "\r", "", $text );
	$normalized = $wgContLang->normalize( $stripped );
	return $normalized;
}
525
/**
 * Load a text through the fetchText subprocess, starting the subprocess
 * first if it is not running. Warnings are suppressed for the duration.
 *
 * @param $id string The text id to load
 * @return bool|string The normalized text, or false on failure (including
 *   failure to start the subprocess)
 */
private function getTextSpawned( $id ) {
	wfSuppressWarnings();
	$text = false;
	// Only talk to the subprocess if it is running or could be started;
	// otherwise the pipe handles are false and must not be used.
	if ( $this->spawnProc || $this->openSpawn() ) {
		$text = $this->getTextSpawnedOnce( $id );
	}
	wfRestoreWarnings();
	return $text;
}
536
/**
 * Start the fetchText.php subprocess and remember its process handle and
 * its stdin/stdout pipes. The subprocess' stderr is sent to /dev/null.
 *
 * If $IP/../multiversion/MWScript.php exists, fetchText.php is run through
 * it; otherwise $IP/maintenance/fetchText.php is run directly.
 *
 * @return bool true if the subprocess was started, false otherwise
 */
function openSpawn() {
	global $IP;

	$wrapperScript = "$IP/../multiversion/MWScript.php";
	if ( file_exists( $wrapperScript ) ) {
		$args = array(
			$this->php,
			$wrapperScript,
			"fetchText.php",
			'--wiki', wfWikiID() );
	} else {
		$args = array(
			$this->php,
			"$IP/maintenance/fetchText.php",
			'--wiki', wfWikiID() );
	}
	$cmd = implode( " ", array_map( 'wfEscapeShellArg', $args ) );

	$descriptors = array(
		0 => array( "pipe", "r" ),
		1 => array( "pipe", "w" ),
		2 => array( "file", "/dev/null", "a" ) );
	$pipes = array();

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $descriptors, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );
		return false;
	}

	$this->spawnWrite = $pipes[0]; // -> stdin of the subprocess
	$this->spawnRead = $pipes[1];  // <- stdout of the subprocess

	return true;
}
577
/**
 * Close the pipes to the fetchText subprocess and the subprocess itself,
 * and reset the corresponding members to false. Warnings are suppressed
 * for the duration; handles that are not set are skipped.
 */
private function closeSpawn() {
	wfSuppressWarnings();
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	$this->spawnRead = false;
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	$this->spawnWrite = false;
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}
	$this->spawnErr = false;
	if ( $this->spawnProc ) {
		// The handle comes from proc_open(), so it has to be released with
		// proc_close(); pclose() is only valid for popen() handles.
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	wfRestoreWarnings();
}
594
/**
 * Perform one request/response exchange with the fetchText subprocess.
 *
 * The id is written to the subprocess, followed by a newline. The reply is
 * read as: one line echoing the id, one line with the byte count, then
 * that many bytes of text.
 *
 * @param $id string The text id to request
 * @return bool|string The text with carriage returns stripped and
 *   normalized by the content language, or false on any failure (write
 *   error, id mismatch, negative length, short read)
 */
private function getTextSpawnedOnce( $id ) {
	global $wgContLang;

	$ok = fwrite( $this->spawnWrite, "$id\n" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	if ( !$ok ) {
		return false;
	}

	// check that the text id they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newId = fgets( $this->spawnRead );
	if ( $newId === false ) {
		return false;
	}
	if ( $id != intval( $newId ) ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		// At end of stream fread() returns an empty string rather than
		// false. Stop then as well, otherwise a subprocess that died in
		// the middle of a reply would keep this loop spinning forever.
		if ( $buffer === false || ( $buffer === '' && feof( $this->spawnRead ) ) ) {
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = $wgContLang->normalize( $stripped );
	return $normalized;
}
644
/**
 * XML parser callback for an opening tag of the stub dump.
 *
 * Flushes a pending open element into the buffer, hands the buffered
 * output to the egress at <revision> and at the first <page>, and, for a
 * <text> element carrying an id attribute, fetches the text for that id
 * and emits it as the element's content.
 *
 * @param $parser resource The XML parser
 * @param $name string Element name
 * @param $attribs array Element attributes
 */
function startElement( $parser, $name, $attribs ) {
	$this->checkpointJustWritten = false;

	$this->clearOpenElement( null );
	$this->lastName = $name;

	switch ( $name ) {
		case 'revision':
			$this->state = $name;
			$this->egress->writeOpenPage( null, $this->buffer );
			$this->buffer = "";
			break;
		case 'page':
			$this->state = $name;
			// Everything buffered before the first page is the stream header.
			if ( $this->atStart ) {
				$this->egress->writeOpenStream( $this->buffer );
				$this->buffer = "";
				$this->atStart = false;
			}
			break;
	}

	$isTextStub = ( $name == "text" && isset( $attribs['id'] ) );
	if ( !$isTextStub ) {
		$this->openElement = array( $name, $attribs );
		return;
	}

	// Replace the stub's attributes and fill in the actual text.
	$text = $this->getText( $attribs['id'] );
	$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
	if ( strlen( $text ) > 0 ) {
		$this->characterData( $parser, $text );
	}
}
674
/**
 * XML parser callback for a closing tag of the stub dump.
 *
 * Completes the element in the buffer and hands the buffer to the egress
 * at the end of a revision, a page or the whole document. At the end of a
 * page, if the time limit has been exceeded, the current output is closed,
 * renamed to its checkpoint name and a new output is started.
 *
 * @param $parser resource The XML parser
 * @param $name string Element name
 */
function endElement( $parser, $name ) {
	$this->checkpointJustWritten = false;

	if ( $this->openElement ) {
		// Nothing was written since the opening tag: emit an empty element.
		$this->clearOpenElement( "" );
	} else {
		$this->buffer .= "</$name>";
	}

	if ( $name == 'revision' ) {
		$this->egress->writeRevision( null, $this->buffer );
		$this->buffer = "";
		$this->thisRev = "";
		return;
	}

	if ( $name == 'mediawiki' ) {
		$this->egress->writeCloseStream( $this->buffer );
		$this->buffer = "";
		return;
	}

	if ( $name != 'page' ) {
		return;
	}

	// Track the range of page ids written to the current output file.
	$pageId = trim( $this->thisPage );
	if ( !$this->firstPageWritten ) {
		$this->firstPageWritten = $pageId;
	}
	$this->lastPageWritten = $pageId;

	if ( !$this->timeExceeded ) {
		$this->egress->writeClosePage( $this->buffer );
		$this->buffer = "";
		$this->thisPage = "";
		return;
	}

	// Time is up: finish this page, close the stream and write a checkpoint.
	$this->egress->writeClosePage( $this->buffer );
	// nasty hack, we can't just write the chardata after the
	// page tag, it will include leading blanks from the next line
	$this->egress->sink->write("\n");

	$this->buffer = $this->xmlwriterobj->closeStream();
	$this->egress->writeCloseStream( $this->buffer );

	$this->buffer = "";
	$this->thisPage = "";

	// this could be more than one file if we had more than one output arg
	$filenameList = (array)$this->egress->getFilenames();
	$newFilenames = array();
	$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
	$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
	$fileCount = count( $filenameList );
	for ( $i = 0; $i < $fileCount; $i++ ) {
		$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
		$fileinfo = pathinfo( $filenameList[$i] );
		$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
	}
	$this->egress->closeRenameAndReopen( $newFilenames );

	// Start the next file with a fresh header and a fresh time budget.
	$this->buffer = $this->xmlwriterobj->openStream();
	$this->timeExceeded = false;
	$this->timeOfCheckpoint = $this->lastTime;
	$this->firstPageWritten = false;
	$this->checkpointJustWritten = true;
}
733
/**
 * XML parser callback for character data; also used by startElement() to
 * inject fetched revision text.
 *
 * Data inside an <id> element is accumulated as the current revision or
 * page id, depending on the parser state. All data is appended to the
 * output buffer, escaped with htmlspecialchars().
 *
 * @param $parser resource The XML parser
 * @param $data string The character data
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );

	if ( $this->lastName == "id" ) {
		switch ( $this->state ) {
			case "revision":
				$this->thisRev .= $data;
				break;
			case "page":
				$this->thisPage .= $data;
				break;
		}
	}

	// Right after a checkpoint, drop the newline that follows the closing
	// page tag in the input; endElement() already wrote one to the old
	// file. Nasty hack!
	if ( $this->checkpointJustWritten ) {
		if ( $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}

	$this->buffer .= htmlspecialchars( $data );
}
753
/**
 * Write the pending opening tag (if any) to the buffer and forget it.
 *
 * @param $style mixed Passed through to Xml::element() as the element
 *   content: callers use null when content will follow and "" for an
 *   element that stays empty
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}
	list( $tagName, $tagAttribs ) = $this->openElement;
	$this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
	$this->openElement = false;
}
760 }
761
762
// Entry point: either print the usage text or run the text pass.
$dumper = new TextPassDumper( $argv );

if ( isset( $options['help'] ) ) {
	$usage = <<<ENDS
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.

Usage: php dumpTextPass.php [<options>]
Options:
--stub=<type>:<file> To load a compressed stub dump instead of stdin
--prefetch=<type>:<file> Use a prior dump file as a text source, to save
pressure on the database.
(Requires the XMLReader extension)
--maxtime=<minutes> Write out checkpoint file after this many minutes (writing
out complete page, closing xml file properly, and opening new one
with header). This option requires the checkpointfile option.
--checkpointfile=<filenamepattern> Use this string for checkpoint filenames,
substituting first pageid written for the first %s (required) and the
last pageid written for the second %s if it exists.
--quiet Don't dump status reports to stderr.
--report=n Report position and speed after every n pages processed.
(Default: 100)
--server=h Force reading from MySQL server h
--current Base ETA on number of pages in database instead of all revisions
--spawn Spawn a subprocess for loading text records
--help Display this help message
ENDS;
	$dumper->progress( $usage );
} else {
	$dumper->dump( true );
}
797
798