X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=maintenance%2FdumpTextPass.php;h=c55208fa705b5f89fbb7e674f415e5e191e6b72d;hb=a4ff5fcee153b83977b2d4a17bd0ecba703a4def;hp=ae55ec8f8832da427c6a465b2a5ac5227616b448;hpb=b393b81d97f44d51997891fa17aadca2dc0cfcf0;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php index ae55ec8f88..c55208fa70 100644 --- a/maintenance/dumpTextPass.php +++ b/maintenance/dumpTextPass.php @@ -2,83 +2,152 @@ /** * Copyright (C) 2005 Brion Vibber * http://www.mediawiki.org/ - * + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * - * @package MediaWiki - * @subpackage SpecialPage + * @file + * @ingroup Maintenance */ $originalDir = getcwd(); -$optionsWithArgs = array( 'server', 'pagelist', 'start', 'end' ); - -require_once( 'commandLine.inc' ); -require_once( 'SpecialExport.php' ); -require_once( 'maintenance/backup.inc' ); +require_once( dirname(__FILE__) . '/commandLine.inc' ); +require_once( 'backup.inc' ); +/** + * @ingroup Maintenance + */ class TextPassDumper extends BackupDumper { var $prefetch = null; + var $input = "php://stdin"; + var $history = WikiExporter::FULL; + var $fetchCount = 0; + var $prefetchCount = 0; + var $failures = 0; + var $maxFailures = 200; + var $failureTimeout = 5; // Seconds to sleep after db failure + + var $php = "php"; + var $spawn = false; + var $spawnProc = false; + var $spawnWrite = false; + var $spawnRead = false; + var $spawnErr = false; + function dump() { # This shouldn't happen if on console... ;) header( 'Content-type: text/html; charset=UTF-8' ); - + # Notice messages will foul up your XML output even if they're # relatively harmless. -// ini_set( 'display_errors', false ); - - $this->startTime = wfTime(); - - $this->db =& wfGetDB( DB_SLAVE ); - $this->maxCount = $this->db->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' ); - $this->startTime = wfTime(); - + if( ini_get( 'display_errors' ) ) + ini_set( 'display_errors', 'stderr' ); + + $this->initProgress( $this->history ); + + $this->db = $this->backupDb(); + $this->egress = new ExportProgressFilter( $this->sink, $this ); - $input = fopen( "php://stdin", "rt" ); + $input = fopen( $this->input, "rt" ); $result = $this->readDump( $input ); - + if( WikiError::isError( $result ) ) { - $this->progress( $result->getMessage() ); + wfDie( $result->getMessage() ); } + if( $this->spawnProc ) { + $this->closeSpawn(); + } + $this->report( true ); } - + function processOption( $opt, $val, $param ) { - if( $opt == 'prefetch' ) { - require_once 'maintenance/backupPrefetch.inc'; - switch( $val ) { - case "file": - $filename = $param; - break; - case "gzip": - $filename = "compress.gzip://$param"; - break; - case "bzip2": - $filename = "compress.bzip2://$param"; - break; - default: - $filename = $val; + $url = $this->processFileOpt( $val, $param ); + + switch( $opt ) { + case 'prefetch': + global $IP; + require_once "$IP/maintenance/backupPrefetch.inc"; + $this->prefetch = new BaseDump( $url ); + break; + case 'stub': + $this->input = $url; + break; + case 'current': + $this->history = WikiExporter::CURRENT; + break; + case 'full': + $this->history = WikiExporter::FULL; + break; + case 'spawn': + $this->spawn = true; + if( $val ) { + $this->php = $val; } - $this->prefetch = new BaseDump( $filename ); + break; } } + function processFileOpt( $val, $param ) { + switch( $val ) { + case "file": + return $param; + case "gzip": + return "compress.zlib://$param"; + case "bzip2": + return "compress.bzip2://$param"; + case "7zip": + return "mediawiki.compress.7z://$param"; + default: + return $val; + } + } + + /** + * Overridden to include prefetch ratio if enabled. + */ + function showReport() { + if( !$this->prefetch ) { + return parent::showReport(); + } + + if( $this->reporting ) { + $delta = wfTime() - $this->startTime; + $now = wfTimestamp( TS_DB ); + if( $delta ) { + $rate = $this->pageCount / $delta; + $revrate = $this->revCount / $delta; + $portion = $this->revCount / $this->maxCount; + $eta = $this->startTime + $delta / $portion; + $etats = wfTimestamp( TS_DB, intval( $eta ) ); + $fetchrate = 100.0 * $this->prefetchCount / $this->fetchCount; + } else { + $rate = '-'; + $revrate = '-'; + $etats = '-'; + $fetchrate = '-'; + } + $this->progress( sprintf( "%s: %s %d pages (%0.3f/sec), %d revs (%0.3f/sec), %0.1f%% prefetched, ETA %s [max %d]", + $now, wfWikiID(), $this->pageCount, $rate, $this->revCount, $revrate, $fetchrate, $etats, $this->maxCount ) ); + } + } + function readDump( $input ) { $this->buffer = ""; $this->openElement = false; @@ -87,13 +156,13 @@ class TextPassDumper extends BackupDumper { $this->lastName = ""; $this->thisPage = 0; $this->thisRev = 0; - + $parser = xml_parser_create( "UTF-8" ); xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); - + xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); - + $offset = 0; // for context extraction on error reporting $bufferSize = 512 * 1024; do { @@ -105,26 +174,191 @@ class TextPassDumper extends BackupDumper { $offset += strlen( $chunk ); } while( $chunk !== false && !feof( $input ) ); xml_parser_free( $parser ); + + return true; } - + function getText( $id ) { + $this->fetchCount++; if( isset( $this->prefetch ) ) { $text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev ); - if( !is_null( $text ) ) + if( $text === null ) { + // Entry missing from prefetch dump + } elseif( $text === "" ) { + // Blank entries may indicate that the prior dump was broken. + // To be safe, reload it. + } else { + $this->prefetchCount++; return $text; + } + } + return $this->doGetText( $id ); + } + + private function doGetText( $id ) { + if( $this->spawn ) { + return $this->getTextSpawned( $id ); + } else { + return $this->getTextDbSafe( $id ); } + } + + /** + * Fetch a text revision from the database, retrying in case of failure. + * This may survive some transitory errors by reconnecting, but + * may not survive a long-term server outage. + */ + private function getTextDbSafe( $id ) { + while( true ) { + try { + $text = $this->getTextDb( $id ); + $ex = new MWException("Graceful storage failure"); + } catch (DBQueryError $ex) { + $text = false; + } + if( $text === false ) { + $this->failures++; + if( $this->failures > $this->maxFailures ) { + throw $ex; + } else { + $this->progress( "Database failure $this->failures " . + "of allowed $this->maxFailures for revision $id! " . + "Pausing $this->failureTimeout seconds..." ); + sleep( $this->failureTimeout ); + } + } else { + return $text; + } + } + } + + /** + * May throw a database error if, say, the server dies during query. + */ + private function getTextDb( $id ) { $id = intval( $id ); $row = $this->db->selectRow( 'text', array( 'old_text', 'old_flags' ), array( 'old_id' => $id ), 'TextPassDumper::getText' ); - return UtfNormal::cleanUp( strval( Revision::getRevisionText( $row ) ) ); + $text = Revision::getRevisionText( $row ); + if( $text === false ) { + return false; + } + $stripped = str_replace( "\r", "", $text ); + $normalized = UtfNormal::cleanUp( $stripped ); + return $normalized; } + private function getTextSpawned( $id ) { + wfSuppressWarnings(); + if( !$this->spawnProc ) { + // First time? + $this->openSpawn(); + } + while( true ) { + + $text = $this->getTextSpawnedOnce( $id ); + if( !is_string( $text ) ) { + $this->progress("Database subprocess failed. Respawning..."); + + $this->closeSpawn(); + sleep( $this->failureTimeout ); + $this->openSpawn(); + + continue; + } + wfRestoreWarnings(); + return $text; + } + } + + function openSpawn() { + global $IP, $wgDBname; + + $cmd = implode( " ", + array_map( 'wfEscapeShellArg', + array( + $this->php, + "$IP/maintenance/fetchText.php", + $wgDBname ) ) ); + $spec = array( + 0 => array( "pipe", "r" ), + 1 => array( "pipe", "w" ), + 2 => array( "file", "/dev/null", "a" ) ); + $pipes = array(); + + $this->progress( "Spawning database subprocess: $cmd" ); + $this->spawnProc = proc_open( $cmd, $spec, $pipes ); + if( !$this->spawnProc ) { + // shit + $this->progress( "Subprocess spawn failed." ); + return false; + } + list( + $this->spawnWrite, // -> stdin + $this->spawnRead, // <- stdout + ) = $pipes; + + return true; + } + + private function closeSpawn() { + wfSuppressWarnings(); + if( $this->spawnRead ) + fclose( $this->spawnRead ); + $this->spawnRead = false; + if( $this->spawnWrite ) + fclose( $this->spawnWrite ); + $this->spawnWrite = false; + if( $this->spawnErr ) + fclose( $this->spawnErr ); + $this->spawnErr = false; + if( $this->spawnProc ) + pclose( $this->spawnProc ); + $this->spawnProc = false; + wfRestoreWarnings(); + } + + private function getTextSpawnedOnce( $id ) { + $ok = fwrite( $this->spawnWrite, "$id\n" ); + //$this->progress( ">> $id" ); + if( !$ok ) return false; + + $ok = fflush( $this->spawnWrite ); + //$this->progress( ">> [flush]" ); + if( !$ok ) return false; + + $len = fgets( $this->spawnRead ); + //$this->progress( "<< " . trim( $len ) ); + if( $len === false ) return false; + + $nbytes = intval( $len ); + $text = ""; + + // Subprocess may not send everything at once, we have to loop. + while( $nbytes > strlen( $text ) ) { + $buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) ); + if( $buffer === false ) break; + $text .= $buffer; + } + + $gotbytes = strlen( $text ); + if( $gotbytes != $nbytes ) { + $this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes "); + return false; + } + + // Do normalization in the dump thread... + $stripped = str_replace( "\r", "", $text ); + $normalized = UtfNormal::cleanUp( $stripped ); + return $normalized; + } + function startElement( $parser, $name, $attribs ) { $this->clearOpenElement( null ); $this->lastName = $name; - + if( $name == 'revision' ) { $this->state = $name; $this->egress->writeOpenPage( null, $this->buffer ); @@ -137,7 +371,7 @@ class TextPassDumper extends BackupDumper { $this->atStart = false; } } - + if( $name == "text" && isset( $attribs['id'] ) ) { $text = $this->getText( $attribs['id'] ); $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); @@ -148,41 +382,43 @@ class TextPassDumper extends BackupDumper { $this->openElement = array( $name, $attribs ); } } - + function endElement( $parser, $name ) { if( $this->openElement ) { $this->clearOpenElement( "" ); } else { $this->buffer .= ""; } - + if( $name == 'revision' ) { $this->egress->writeRevision( null, $this->buffer ); $this->buffer = ""; + $this->thisRev = ""; } elseif( $name == 'page' ) { $this->egress->writeClosePage( $this->buffer ); $this->buffer = ""; + $this->thisPage = ""; } elseif( $name == 'mediawiki' ) { $this->egress->writeCloseStream( $this->buffer ); $this->buffer = ""; } } - + function characterData( $parser, $data ) { $this->clearOpenElement( null ); if( $this->lastName == "id" ) { if( $this->state == "revision" ) { - $this->thisRev = intval( $data ); + $this->thisRev .= $data; } elseif( $this->state == "page" ) { - $this->thisPage = intval( $data ); + $this->thisPage .= $data; } } $this->buffer .= htmlspecialchars( $data ); } - + function clearOpenElement( $style ) { if( $this->openElement ) { - $this->buffer .= wfElement( $this->openElement[0], $this->openElement[1], $style ); + $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); $this->openElement = false; } } @@ -194,7 +430,7 @@ $dumper = new TextPassDumper( $argv ); if( true ) { $dumper->dump(); } else { - $dumper->progress( <<progress( <<] Options: - --prefetch Use a prior dump file as a text source where possible. + --stub=: To load a compressed stub dump instead of stdin + --prefetch=: Use a prior dump file as a text source, to save + pressure on the database. (Requires PHP 5.0+ and the XMLReader PECL extension) --quiet Don't dump status reports to stderr. --report=n Report position and speed after every n pages processed. (Default: 100) -END + --server=h Force reading from MySQL server h + --current Base ETA on number of pages in database instead of all revisions + --spawn Spawn a subprocess for loading text records +ENDS ); } -?> +