+ return $text;
+ }
+
+ /**
+ * Tries to get the text for a text id.
+ * Export transformations are applied if the content model is given or can be
+ * determined from the database.
+ *
+ * Upon errors, retries (up to $this->maxFailures tries each call).
+ * If no good text could be found even after retrying, "" is returned.
+ * If no good text could be returned for more than
+ * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText, MWException
+ * is thrown.
+ *
+ * The revision the text belongs to is not passed in; it is taken from
+ * $this->thisRev (and $this->thisPage for prefetching).
+ *
+ * @param string $id The id to get the text for. It is handed on to getTextDb() or
+ * getTextSpawned(); getTextDb() matches it against text.old_id.
+ * @param string|bool|null $model The content model used to determine
+ * applicable export transformations.
+ * If $model is null, it will be determined from the database
+ * (only when $wgContentHandlerUseDB is set).
+ * @param string|null $format The content format used when applying export transformations.
+ *
+ * @throws MWException
+ * @return string The text for $id, or ""
+ */
+ function getText( $id, $model = null, $format = null ) {
+ global $wgContentHandlerUseDB;
+
+ $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
+ $text = false; // The candidate for a good text. false if no proper value.
+ $failures = 0; // The number of times this invocation of getText already failed.
+
+ // Whether the current candidate came from prefetch. Initialised here so that the
+ // check after the try/catch block never reads an undefined variable.
+ $tryIsPrefetch = false;
+
+ // The number of times getText failed without yielding a good text in between.
+ static $consecutiveFailedTextRetrievals = 0;
+
+ $this->fetchCount++;
+
+ // To be able to simply return on success without worrying about bookkeeping,
+ // we assume this fetch works (possibly after some retries). Nevertheless, we keep
+ // the old value, so we can restore it if problems occur (see after the while loop).
+ $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
+ $consecutiveFailedTextRetrievals = 0;
+
+ if ( $model === null && $wgContentHandlerUseDB ) {
+ // $this->thisRev is accumulated from raw character data, so it is normalised
+ // with intval() here exactly like everywhere else in this method.
+ $row = $this->db->selectRow(
+ 'revision',
+ array( 'rev_content_model', 'rev_content_format' ),
+ array( 'rev_id' => intval( $this->thisRev ) ),
+ __METHOD__
+ );
+
+ if ( $row ) {
+ $model = $row->rev_content_model;
+ $format = $row->rev_content_format;
+ }
+ }
+
+ // From here on, false means "model unknown, skip export transformations".
+ if ( $model === null || $model === '' ) {
+ $model = false;
+ }
+
+ while ( $failures < $this->maxFailures ) {
+
+ // As soon as we found a good text for the $id, we will return immediately.
+ // Hence, if we make it past the try catch block, we know that we did not
+ // find a good text.
+
+ try {
+ // Step 1: Get some text (or reuse from previous iteration if checking
+ // for plausibility failed)
+
+ // Trying to get prefetch, if it has not been tried before
+ if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
+ $prefetchNotTried = false;
+ $tryIsPrefetch = true;
+ $text = $this->prefetch->prefetch( intval( $this->thisPage ),
+ intval( $this->thisRev ) );
+
+ if ( $text === null ) {
+ $text = false;
+ }
+
+ if ( is_string( $text ) && $model !== false ) {
+ // Apply export transformation to text coming from an old dump.
+ // The purpose of this transformation is to convert up from legacy
+ // formats, which may still be used in the older dump that is used
+ // for pre-fetching. Applying the transformation again should not
+ // interfere with content that is already in the correct form.
+ $text = $this->exportTransform( $text, $model, $format );
+ }
+ }
+
+ if ( $text === false ) {
+ // Fallback to asking the database
+ $tryIsPrefetch = false;
+ if ( $this->spawn ) {
+ $text = $this->getTextSpawned( $id );
+ } else {
+ $text = $this->getTextDb( $id );
+ }
+
+ if ( $text !== false && $model !== false ) {
+ // Apply export transformation to text coming from the database.
+ // Prefetched text should already have transformations applied.
+ $text = $this->exportTransform( $text, $model, $format );
+ }
+
+ // No more checks for texts from DB for now.
+ // If we received something that is not false,
+ // we treat it as good text, regardless of whether it actually is or is not
+ if ( $text !== false ) {
+ return $text;
+ }
+ }
+
+ if ( $text === false ) {
+ throw new MWException( "Generic error while obtaining text for id " . $id );
+ }
+
+ // We received a good candidate for the text of $id via some method
+
+ // Step 2: Checking for plausibility and return the text if it is
+ // plausible
+ $revID = intval( $this->thisRev );
+ if ( !isset( $this->db ) ) {
+ throw new MWException( "No database available" );
+ }
+
+ if ( $model !== CONTENT_MODEL_WIKITEXT ) {
+ // Only wikitext is compared against the stored length; for every other
+ // model the check below is trivially satisfied.
+ $revLength = strlen( $text );
+ } else {
+ $revLength = $this->db->selectField(
+ 'revision',
+ 'rev_len',
+ array( 'rev_id' => $revID ),
+ __METHOD__
+ );
+ }
+
+ if ( strlen( $text ) == $revLength ) {
+ if ( $tryIsPrefetch ) {
+ $this->prefetchCount++;
+ }
+
+ return $text;
+ }
+
+ // Drop the implausible candidate so that the next iteration fetches afresh.
+ $text = false;
+ throw new MWException( "Received text is unplausible for id " . $id );
+ } catch ( Exception $e ) {
+ $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
+ if ( $failures + 1 < $this->maxFailures ) {
+ $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
+ }
+ $this->progress( $msg );
+ }
+
+ // Something went wrong; we did not get a text that was plausible :(
+ $failures++;
+
+ // A failure in a prefetch hit does not warrant resetting db connection etc.
+ if ( !$tryIsPrefetch ) {
+ // After backing off for some time, we try to reboot the whole process as
+ // much as possible to not carry over failures from one part to the other
+ // parts
+ sleep( $this->failureTimeout );
+ try {
+ $this->rotateDb();
+ if ( $this->spawn ) {
+ $this->closeSpawn();
+ $this->openSpawn();
+ }
+ } catch ( Exception $e ) {
+ $this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
+ " Trying to continue anyways" );
+ }
+ }
+ }
+
+ // Retrieving a good text for $id failed (at least) maxFailures times.
+ // We abort for this $id.
+
+ // Restoring the consecutive failures, and maybe aborting, if the dump
+ // is too broken.
+ $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
+ if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
+ throw new MWException( "Graceful storage failure" );
+ }
+
+ return "";
+ }
+
+ /**
+ * Fetches the text stored under a text id directly from the database.
+ * May throw a database error if, say, the server dies during query.
+ *
+ * @param int $id Value matched against text.old_id
+ * @return bool|string The text with "\r" removed and passed through
+ * $wgContLang->normalize(), or false if no text could be obtained
+ * @throws MWException If $this->db is not set
+ */
+ private function getTextDb( $id ) {
+ global $wgContLang;
+ if ( !isset( $this->db ) ) {
+ // Separator added so the method name and the message no longer run together.
+ throw new MWException( __METHOD__ . ": No database available" );
+ }
+ $row = $this->db->selectRow( 'text',
+ array( 'old_text', 'old_flags' ),
+ array( 'old_id' => $id ),
+ __METHOD__ );
+ if ( !$row ) {
+ // No row for this id: report failure explicitly instead of handing a
+ // non-row value to Revision::getRevisionText().
+ return false;
+ }
+ $text = Revision::getRevisionText( $row );
+ if ( $text === false ) {
+ return false;
+ }
+ $stripped = str_replace( "\r", "", $text );
+ $normalized = $wgContLang->normalize( $stripped );
+
+ return $normalized;
+ }
+
+ /**
+ * Fetches the text for an id through the fetchText subprocess, starting the
+ * subprocess first if none is running. PHP warnings are suppressed for the
+ * duration of the call.
+ *
+ * @param string|int $id Id that is written to the subprocess
+ * @return bool|string The normalized text, or false on failure (including the
+ * case that the subprocess could not be started)
+ */
+ private function getTextSpawned( $id ) {
+ MediaWiki\suppressWarnings();
+ if ( !$this->spawnProc ) {
+ // First time?
+ $this->openSpawn();
+ }
+ if ( $this->spawnProc ) {
+ $text = $this->getTextSpawnedOnce( $id );
+ } else {
+ // openSpawn() failed and left no process handle; there are no pipes to
+ // talk to, so report the failure without touching them.
+ $text = false;
+ }
+ MediaWiki\restoreWarnings();
+
+ return $text;
+ }
+
+ /**
+ * Starts the fetchText subprocess and remembers its stdin/stdout pipes in
+ * $this->spawnWrite / $this->spawnRead. MWScript.php from a sibling
+ * "multiversion" directory is used as launcher when that file exists.
+ *
+ * @return bool true if the subprocess was started, false otherwise
+ */
+ function openSpawn() {
+ global $IP;
+
+ // Pick the argument list first, escape and join it in one place afterwards.
+ if ( file_exists( "$IP/../multiversion/MWScript.php" ) ) {
+ $argv = array(
+ $this->php,
+ "$IP/../multiversion/MWScript.php",
+ "fetchText.php",
+ '--wiki', wfWikiID() );
+ } else {
+ $argv = array(
+ $this->php,
+ "$IP/maintenance/fetchText.php",
+ '--wiki', wfWikiID() );
+ }
+ $cmd = implode( " ", array_map( 'wfEscapeShellArg', $argv ) );
+
+ // stdin and stdout are pipes to us, stderr is discarded.
+ $descriptors = array(
+ 0 => array( "pipe", "r" ),
+ 1 => array( "pipe", "w" ),
+ 2 => array( "file", "/dev/null", "a" ) );
+ $pipes = array();
+
+ $this->progress( "Spawning database subprocess: $cmd" );
+ $this->spawnProc = proc_open( $cmd, $descriptors, $pipes );
+ if ( !$this->spawnProc ) {
+ $this->progress( "Subprocess spawn failed." );
+
+ return false;
+ }
+
+ $this->spawnWrite = $pipes[0]; // -> stdin
+ $this->spawnRead = $pipes[1]; // <- stdout
+
+ return true;
+ }
+
+ /**
+ * Shuts down the fetchText subprocess: closes whichever of the read, write and
+ * error pipes are open, closes the process handle and resets all four
+ * properties to false. PHP warnings are suppressed while doing so.
+ */
+ private function closeSpawn() {
+ MediaWiki\suppressWarnings();
+ if ( $this->spawnRead ) {
+ fclose( $this->spawnRead );
+ }
+ $this->spawnRead = false;
+ if ( $this->spawnWrite ) {
+ fclose( $this->spawnWrite );
+ }
+ $this->spawnWrite = false;
+ if ( $this->spawnErr ) {
+ fclose( $this->spawnErr );
+ }
+ $this->spawnErr = false;
+ if ( $this->spawnProc ) {
+ // The handle comes from proc_open() (see openSpawn()), so it has to be
+ // released with proc_close(); pclose() is only valid for popen() handles.
+ // NOTE(review): proc_close() waits for the child to exit. Its stdin has
+ // been closed above, which presumably makes fetchText.php terminate - confirm.
+ proc_close( $this->spawnProc );
+ }
+ $this->spawnProc = false;
+ MediaWiki\restoreWarnings();
+ }
+
+ /**
+ * Performs one request/response exchange with the fetchText subprocess.
+ *
+ * Writes "$id\n" to the subprocess and then reads, in this order: a line with
+ * the id the answer is for, a line with the byte length of the text, and
+ * finally that many bytes of text.
+ *
+ * @param string|int $id Id to request
+ * @return bool|string The text with "\r" removed and passed through
+ * $wgContLang->normalize(), or false if writing, reading or any of the
+ * consistency checks failed
+ */
+ private function getTextSpawnedOnce( $id ) {
+ global $wgContLang;
+
+ $ok = fwrite( $this->spawnWrite, "$id\n" );
+ // $this->progress( ">> $id" );
+ if ( !$ok ) {
+ return false;
+ }
+
+ $ok = fflush( $this->spawnWrite );
+ // $this->progress( ">> [flush]" );
+ if ( !$ok ) {
+ return false;
+ }
+
+ // check that the text id they are sending is the one we asked for
+ // this avoids out of sync revision text errors we have encountered in the past
+ $newId = fgets( $this->spawnRead );
+ if ( $newId === false ) {
+ return false;
+ }
+ if ( $id != intval( $newId ) ) {
+ return false;
+ }
+
+ $len = fgets( $this->spawnRead );
+ // $this->progress( "<< " . trim( $len ) );
+ if ( $len === false ) {
+ return false;
+ }
+
+ $nbytes = intval( $len );
+ // actual error, not zero-length text
+ if ( $nbytes < 0 ) {
+ return false;
+ }
+
+ $text = "";
+
+ // Subprocess may not send everything at once, we have to loop.
+ while ( $nbytes > strlen( $text ) ) {
+ $buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
+ if ( $buffer === false ) {
+ break;
+ }
+ if ( $buffer === '' && feof( $this->spawnRead ) ) {
+ // The subprocess closed its end before sending the announced number of
+ // bytes. fread() keeps returning "" in that state, so without this
+ // check the loop would never terminate. The length check below then
+ // reports the short read.
+ break;
+ }
+ $text .= $buffer;
+ }
+
+ $gotbytes = strlen( $text );
+ if ( $gotbytes != $nbytes ) {
+ $this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
+
+ return false;
+ }
+
+ // Do normalization in the dump thread...
+ $stripped = str_replace( "\r", "", $text );
+ $normalized = $wgContLang->normalize( $stripped );
+
+ return $normalized;
+ }
+
+ /**
+ * Handler for an opening XML element.
+ *
+ * Flushes a still pending open element into the buffer, tracks the
+ * page/revision state and hands the buffer to the egress at the start of a
+ * revision (and, once, at the start of the first page). A "text" element that
+ * carries an "id" attribute is replaced by a text element with
+ * xml:space="preserve" whose content is fetched through getText().
+ *
+ * @param resource $parser Passed on to characterData()
+ * @param string $name Element name
+ * @param array $attribs Element attributes
+ */
+ function startElement( $parser, $name, $attribs ) {
+ $this->checkpointJustWritten = false;
+
+ $this->clearOpenElement( null );
+ $this->lastName = $name;
+
+ switch ( $name ) {
+ case 'revision':
+ $this->state = $name;
+ $this->egress->writeOpenPage( null, $this->buffer );
+ $this->buffer = "";
+ break;
+ case 'page':
+ $this->state = $name;
+ if ( $this->atStart ) {
+ $this->egress->writeOpenStream( $this->buffer );
+ $this->buffer = "";
+ $this->atStart = false;
+ }
+ break;
+ }
+
+ // Everything except a text stub with an id is passed through unchanged.
+ if ( $name != "text" || !isset( $attribs['id'] ) ) {
+ $this->openElement = array( $name, $attribs );
+
+ return;
+ }
+
+ // An empty model/format is handed to getText() as null.
+ $revModel = trim( $this->thisRevModel );
+ if ( $revModel === '' ) {
+ $revModel = null;
+ }
+ $revFormat = trim( $this->thisRevFormat );
+ if ( $revFormat === '' ) {
+ $revFormat = null;
+ }
+
+ $content = $this->getText( $attribs['id'], $revModel, $revFormat );
+ $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
+ if ( strlen( $content ) > 0 ) {
+ $this->characterData( $parser, $content );
+ }
+ }
+
+ /**
+ * Handler for a closing XML element.
+ *
+ * Emits the closing markup into the buffer and, depending on the element,
+ * hands the buffer to the egress: a finished revision is written and the
+ * per-revision state is reset, a finished page is closed, and the end of the
+ * "mediawiki" element closes the stream. When $this->timeExceeded is set at
+ * the end of a page, the current output is additionally finished, renamed
+ * according to $this->checkpointFiles and a new stream is started.
+ *
+ * @param resource $parser Unused here
+ * @param string $name Element name
+ */
+ function endElement( $parser, $name ) {
+ $this->checkpointJustWritten = false;
+
+ // An element that is still pending has had no content; emit it using the ""
+ // style. Otherwise write a regular closing tag.
+ if ( $this->openElement ) {
+ $this->clearOpenElement( "" );
+ } else {
+ $this->buffer .= "</$name>";
+ }
+
+ if ( $name == 'revision' ) {
+ $this->egress->writeRevision( null, $this->buffer );
+ $this->buffer = "";
+ // Reset everything that was collected for this revision.
+ $this->thisRev = "";
+ $this->thisRevModel = null;
+ $this->thisRevFormat = null;
+ } elseif ( $name == 'page' ) {
+ // Remember the first and the most recent page id written since the last
+ // checkpoint; both end up in the checkpoint file name below.
+ if ( !$this->firstPageWritten ) {
+ $this->firstPageWritten = trim( $this->thisPage );
+ }
+ $this->lastPageWritten = trim( $this->thisPage );
+ if ( $this->timeExceeded ) {
+ $this->egress->writeClosePage( $this->buffer );
+ // nasty hack, we can't just write the chardata after the
+ // page tag, it will include leading blanks from the next line
+ $this->egress->sink->write( "\n" );
+
+ $this->buffer = $this->xmlwriterobj->closeStream();
+ $this->egress->writeCloseStream( $this->buffer );
+
+ $this->buffer = "";
+ $this->thisPage = "";
+ // this could be more than one file if we had more than one output arg
+
+ $filenameList = (array)$this->egress->getFilenames();
+ $newFilenames = array();
+ // Page ids are zero-padded to 9 characters for the file names.
+ $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
+ $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
+ $filenamesCount = count( $filenameList );
+ for ( $i = 0; $i < $filenamesCount; $i++ ) {
+ // Each checkpoint pattern is a sprintf() format taking the first and the
+ // last page id; the resulting name is placed in the directory of the
+ // corresponding current output file.
+ $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
+ $fileinfo = pathinfo( $filenameList[$i] );
+ $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
+ }
+ $this->egress->closeRenameAndReopen( $newFilenames );
+ // Start the next output with a fresh stream header and reset the
+ // checkpoint bookkeeping.
+ $this->buffer = $this->xmlwriterobj->openStream();
+ $this->timeExceeded = false;
+ $this->timeOfCheckpoint = $this->lastTime;
+ $this->firstPageWritten = false;
+ // Tells characterData() to drop the leading newline of the next chunk.
+ $this->checkpointJustWritten = true;
+ } else {
+ $this->egress->writeClosePage( $this->buffer );
+ $this->buffer = "";
+ $this->thisPage = "";
+ }
+ } elseif ( $name == 'mediawiki' ) {
+ $this->egress->writeCloseStream( $this->buffer );
+ $this->buffer = "";
+ }
+ }
+
+ /**
+ * Handler for character data.
+ *
+ * Flushes a pending open element, collects the data into the page id, revision
+ * id, model or format accumulator when the most recently opened element was
+ * "id", "model" or "format", and appends the HTML-escaped data to the buffer.
+ * Directly after a checkpoint a single leading newline is dropped first.
+ *
+ * @param resource $parser Unused here
+ * @param string $data Character data
+ */
+ function characterData( $parser, $data ) {
+ $this->clearOpenElement( null );
+
+ switch ( $this->lastName ) {
+ case "id":
+ // Which id this is depends on whether we are inside a revision or a page.
+ if ( $this->state == "revision" ) {
+ $this->thisRev .= $data;
+ } elseif ( $this->state == "page" ) {
+ $this->thisPage .= $data;
+ }
+ break;
+ case "model":
+ $this->thisRevModel .= $data;
+ break;
+ case "format":
+ $this->thisRevFormat .= $data;
+ break;
+ }
+
+ // have to skip the newline left over from closepagetag line of
+ // end of checkpoint files. nasty hack!!
+ $escapeInput = $data;
+ if ( $this->checkpointJustWritten ) {
+ if ( $escapeInput[0] == "\n" ) {
+ $escapeInput = substr( $escapeInput, 1 );
+ }
+ $this->checkpointJustWritten = false;
+ }
+
+ $this->buffer .= htmlspecialchars( $escapeInput );
+ }
+
+ /**
+ * Writes the pending open element, if there is one, into the buffer and
+ * clears it.
+ *
+ * @param string|null $style Passed to Xml::element() as its third argument
+ */
+ function clearOpenElement( $style ) {
+ if ( !$this->openElement ) {
+ // Nothing pending.
+ return;
+ }
+
+ list( $tagName, $tagAttribs ) = $this->openElement;
+ $this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
+ $this->openElement = false;
+ }