X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=maintenance%2FrefreshLinks.php;h=5d311add6080e0157a7ad1e01246930338b2e759;hb=5161541d87d663ea86b66258a568349a6cc8fde9;hp=98ea930108cb6be27a9c14503b4fffc7e867315e;hpb=d4eefca4dd645bbef2b435ef1228a141f6a48e67;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/refreshLinks.php b/maintenance/refreshLinks.php index 98ea930108..5d311add60 100644 --- a/maintenance/refreshLinks.php +++ b/maintenance/refreshLinks.php @@ -36,44 +36,54 @@ class RefreshLinks extends Maintenance { $this->addOption( 'new-only', 'Only affect articles with just a single edit' ); $this->addOption( 'redirects-only', 'Only fix redirects, not all links' ); $this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' ); - $this->addOption( 'm', 'Maximum replication lag', false, true ); $this->addOption( 'e', 'Last page id to refresh', false, true ); + $this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' . + 'query, default 100000', false, true ); $this->addArg( 'start', 'Page_id to start from, default 1', false ); $this->setBatchSize( 100 ); } public function execute() { - $max = $this->getOption( 'm', 0 ); + // Note that there is a difference between not specifying the start + // and end IDs and using the minimum and maximum values from the page + // table. In the latter case, deleteLinksFromNonexistent() will not + // delete entries for nonexistent IDs that fall outside the range. + $start = (int)$this->getArg( 0 ) ?: null; + $end = (int)$this->getOption( 'e' ) ?: null; + $dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 ); if ( !$this->hasOption( 'dfn-only' ) ) { - $start = $this->getArg( 0, 1 ); $new = $this->getOption( 'new-only', false ); - $end = $this->getOption( 'e', 0 ); $redir = $this->getOption( 'redirects-only', false ); $oldRedir = $this->getOption( 'old-redirects-only', false ); - $this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir ); + $this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir ); + $this->deleteLinksFromNonexistent( null, null, $this->mBatchSize, $dfnChunkSize ); + } else { + $this->deleteLinksFromNonexistent( $start, $end, $this->mBatchSize, $dfnChunkSize ); } - $this->deleteLinksFromNonexistent( $max, $this->mBatchSize ); } /** * Do the actual link refreshing. - * @param $start int Page_id to start from - * @param $newOnly bool Only do pages with 1 edit - * @param $maxLag int Max DB replication lag - * @param $end int Page_id to stop at - * @param $redirectsOnly bool Only fix redirects - * @param $oldRedirectsOnly bool Only fix redirects without redirect entries + * @param int|null $start Page_id to start from + * @param bool $newOnly Only do pages with 1 edit + * @param int|null $end Page_id to stop at + * @param bool $redirectsOnly Only fix redirects + * @param bool $oldRedirectsOnly Only fix redirects without redirect entries */ - private function doRefreshLinks( $start, $newOnly = false, $maxLag = false, - $end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) { + private function doRefreshLinks( $start, $newOnly = false, + $end = null, $redirectsOnly = false, $oldRedirectsOnly = false + ) { global $wgParser, $wgUseTidy; $reportingInterval = 100; $dbr = wfGetDB( DB_SLAVE ); - $start = intval( $start ); + + if ( $start === null ) { + $start = 1; + } // Give extensions a chance to optimize settings - wfRunHooks( 'MaintenanceRefreshLinksInit', array( $this ) ); + Hooks::run( 'MaintenanceRefreshLinksInit', array( $this ) ); # Don't generate extension images (e.g. Timeline) $wgParser->clearTagHooks(); @@ -88,15 +98,10 @@ class RefreshLinks extends Maintenance { $conds = array( "page_is_redirect=1", - "rd_from IS NULL" + "rd_from IS NULL", + self::intervalCond( $dbr, 'page_id', $start, $end ), ); - if ( $end == 0 ) { - $conds[] = "page_id >= $start"; - } else { - $conds[] = "page_id BETWEEN $start AND $end"; - } - $res = $dbr->select( array( 'page', 'redirect' ), 'page_id', @@ -123,7 +128,8 @@ class RefreshLinks extends Maintenance { array( 'page_id' ), array( 'page_is_new' => 1, - "page_id >= $start" ), + self::intervalCond( $dbr, 'page_id', $start, $end ), + ), __METHOD__ ); $num = $res->numRows(); @@ -185,7 +191,7 @@ class RefreshLinks extends Maintenance { * entry in the "redirect" table points to the correct page and not to an * invalid one. * - * @param $id int The page ID to check + * @param int $id The page ID to check */ private function fixRedirect( $id ) { $page = WikiPage::newFromID( $id ); @@ -196,6 +202,7 @@ class RefreshLinks extends Maintenance { // Delete any redirect table entry for it $dbw->delete( 'redirect', array( 'rd_from' => $id ), __METHOD__ ); + return; } @@ -222,7 +229,7 @@ class RefreshLinks extends Maintenance { /** * Run LinksUpdate for all links on a given page_id - * @param $id int The page_id + * @param int $id The page_id */ public static function fixLinksFromArticle( $id ) { $page = WikiPage::newFromID( $id ); @@ -251,19 +258,60 @@ class RefreshLinks extends Maintenance { * Removes non-existing links from pages from pagelinks, imagelinks, * categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables. * - * @param $maxLag int - * @param $batchSize int The size of deletion batches + * @param int|null $start Page_id to start from + * @param int|null $end Page_id to stop at + * @param int $batchSize The size of deletion batches + * @param int $chunkSize Maximum number of existent IDs to check per query * * @author Merlijn van Deen */ - private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) { + private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100, + $chunkSize = 100000 + ) { wfWaitForSlaves(); + $this->output( "Deleting illegal entries from the links tables...\n" ); + $dbr = wfGetDB( DB_SLAVE ); + do { + // Find the start of the next chunk. This is based only + // on existent page_ids. + $nextStart = $dbr->selectField( + 'page', + 'page_id', + self::intervalCond( $dbr, 'page_id', $start, $end ), + __METHOD__, + array( 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ) + ); - $dbw = wfGetDB( DB_MASTER ); + if ( $nextStart !== false ) { + // To find the end of the current chunk, subtract one. + // This will serve to limit the number of rows scanned in + // dfnCheckInterval(), per query, to at most the sum of + // the chunk size and deletion batch size. + $chunkEnd = $nextStart - 1; + } else { + // This is the last chunk. Check all page_ids up to $end. + $chunkEnd = $end; + } + + $fmtStart = $start !== null ? "[$start" : '(-INF'; + $fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)'; + $this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" ); + $this->dfnCheckInterval( $start, $chunkEnd, $batchSize ); - $lb = wfGetLBFactory()->newMainLB(); - $dbr = $lb->getConnection( DB_SLAVE ); - $dbr->bufferResults( false ); + $start = $nextStart; + + } while ( $nextStart !== false ); + } + + /** + * @see RefreshLinks::deleteLinksFromNonexistent() + * @param int|null $start Page_id to start from + * @param int|null $end Page_id to stop at + * @param int $batchSize The size of deletion batches + */ + private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) { + $dbw = wfGetDB( DB_MASTER ); + $dbr = wfGetDB( DB_SLAVE ); $linksTables = array( // table name => page_id field 'pagelinks' => 'pl_from', @@ -278,40 +326,58 @@ class RefreshLinks extends Maintenance { ); foreach ( $linksTables as $table => $field ) { - $this->output( "Retrieving illegal entries from $table... " ); - - // SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL; - $results = $dbr->select( - array( $table, 'page' ), - $field, - array( 'page_id' => null ), - __METHOD__, - 'DISTINCT', - array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) ) - ); - + $this->output( " $table: 0" ); $counter = 0; - $list = array(); - $this->output( "0.." ); - foreach ( $results as $row ) { - $counter++; - $list[] = $row->$field; - if ( ( $counter % $batchSize ) == 0 ) { + do { + $ids = $dbr->selectFieldValues( + $table, + $field, + array( + self::intervalCond( $dbr, $field, $start, $end ), + "$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id' )})", + ), + __METHOD__, + array( 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ) + ); + + $numIds = count( $ids ); + if ( $numIds ) { + $counter += $numIds; wfWaitForSlaves(); - $dbw->delete( $table, array( $field => $list ), __METHOD__ ); - - $this->output( $counter . ".." ); - $list = array(); + $dbw->delete( $table, array( $field => $ids ), __METHOD__ ); + $this->output( ", $counter" ); + $start = $ids[$numIds - 1] + 1; } - } - $this->output( $counter ); - if ( count( $list ) > 0 ) { - $dbw->delete( $table, array( $field => $list ), __METHOD__ ); - } - $this->output( "\n" ); + + } while ( $numIds >= $batchSize && ( $end === null || $start <= $end ) ); + + $this->output( " deleted.\n" ); + wfWaitForSlaves(); } - $lb->closeAll(); + } + + /** + * Build a SQL expression for a closed interval (i.e. BETWEEN). + * + * By specifying a null $start or $end, it is also possible to create + * half-bounded or unbounded intervals using this function. + * + * @param IDatabase $db Database connection + * @param string $var Field name + * @param mixed $start First value to include or null + * @param mixed $end Last value to include or null + */ + private static function intervalCond( IDatabase $db, $var, $start, $end ) { + if ( $start === null && $end === null ) { + return "$var IS NOT NULL"; + } elseif ( $end === null ) { + return "$var >= {$db->addQuotes( $start )}"; + } elseif ( $start === null ) { + return "$var <= {$db->addQuotes( $end )}"; + } else { + return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}"; + } } }