X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=maintenance%2FrefreshLinks.inc;h=b7d531c7f4a39f4a4d023e69a59c5caaacce0fdf;hb=951c0c1e6e2cb95eb1ac5cb50031303c754ee457;hp=7506f8f479d64690c30f3b31f7da1c3b60197669;hpb=7bbe971aec2d548de981a12ed08a7b56a536dcdb;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/refreshLinks.inc b/maintenance/refreshLinks.inc index 7506f8f479..b7d531c7f4 100644 --- a/maintenance/refreshLinks.inc +++ b/maintenance/refreshLinks.inc @@ -1,75 +1,124 @@ setOption('math', MW_MATH_SOURCE); # Don't generate extension images (e.g. Timeline) - $wgParser->mTagHooks = array(); - - # Don't generate thumbnail images - $wgUseImageResize = false; + if( method_exists( $wgParser, "clearTagHooks" ) ) { + $wgParser->clearTagHooks(); + } + + # Don't use HTML tidy + $wgUseTidy = false; + + $what = $redirectsOnly ? "redirects" : "links"; - if ( $newOnly ) { - print "Refreshing links from "; + if( $oldRedirectsOnly ) { + # This entire code path is cut-and-pasted from below. Hurrah. + $res = $dbr->query( + "SELECT page_id ". + "FROM page ". + "LEFT JOIN redirect ON page_id=rd_from ". + "WHERE page_is_redirect=1 AND rd_from IS NULL AND ". + ($end == 0 ? "page_id >= $start" + : "page_id BETWEEN $start AND $end"), + $fname + ); + $num = $dbr->numRows( $res ); + print "Refreshing $num old redirects from $start...\n"; + + while( $row = $dbr->fetchObject( $res ) ) { + if ( !( ++$i % $reportingInterval ) ) { + print "$i\n"; + wfWaitForSlaves( $maxLag ); + } + fixRedirect( $row->page_id ); + } + } elseif( $newOnly ) { + print "Refreshing $what from "; $res = $dbr->select( 'page', array( 'page_id' ), array( 'page_is_new' => 1, - "page_id > $start" ), + "page_id >= $start" ), $fname ); $num = $dbr->numRows( $res ); print "$num new articles...\n"; - + $i = 0; while ( $row = $dbr->fetchObject( $res ) ) { - if ( !( ++$i % REPORTING_INTERVAL ) ) { + if ( !( ++$i % $reportingInterval ) ) { print "$i\n"; wfWaitForSlaves( $maxLag ); } - - fixLinksFromArticle( $row->page_id ); + if($redirectsOnly) + fixRedirect( $row->page_id ); + else + fixLinksFromArticle( $row->page_id ); } } else { - print "Refreshing link table.\n"; - $end = $dbr->selectField( 'page', 'max(page_id)', false ); + print "Refreshing $what table.\n"; + if ( !$end ) { + $end = $dbr->selectField( 'page', 'max(page_id)', false ); + } print("Starting from page_id $start of $end.\n"); for ($id = $start; $id <= $end; $id++) { - - if ( !($id % REPORTING_INTERVAL) ) { + + if ( !($id % $reportingInterval) ) { print "$id\n"; wfWaitForSlaves( $maxLag ); } - fixLinksFromArticle( $id ); + if($redirectsOnly) + fixRedirect( $id ); + else + fixLinksFromArticle( $id ); } - + } +} + +function fixRedirect( $id ){ + global $wgTitle, $wgArticle; + $wgTitle = Title::newFromID( $id ); + $dbw = wfGetDB( DB_MASTER ); + + if ( is_null( $wgTitle ) ) { + return; } + $wgArticle = new Article($wgTitle); + + $rt = $wgArticle->followRedirect(); + + if($rt == false || !is_object($rt)) + return; + + $wgArticle->updateRedirectOn($dbw,$rt); } function fixLinksFromArticle( $id ) { - global $wgTitle, $wgArticle, $wgOut, $wgParser; - + global $wgTitle, $wgParser; + $wgTitle = Title::newFromID( $id ); - $dbw =& wfGetDB( DB_MASTER ); - + $dbw = wfGetDB( DB_MASTER ); + + $linkCache =& LinkCache::singleton(); + $linkCache->clear(); + if ( is_null( $wgTitle ) ) { return; } @@ -82,44 +131,72 @@ function fixLinksFromArticle( $id ) { $options = new ParserOptions; $parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() ); - $update = new LinksUpdate( $wgTitle, $parserOutput ); - $update->doDumbUpdate(); + $update = new LinksUpdate( $wgTitle, $parserOutput, false ); + $update->doUpdate(); $dbw->immediateCommit(); } -function deleteLinksFromNonexistent( $maxLag = 0 ) { - $fname = 'deleteLinksFromNonexistent'; - +/* + * Removes non-existing links from pages from pagelinks, imagelinks, + * categorylinks, templatelinks and externallinks tables. + * + * @param $maxLag + * @param $batchSize The size of deletion batches + * + * @author Merlijn van Deen + */ +function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) { wfWaitForSlaves( $maxLag ); + + $dbw = wfGetDB( DB_MASTER ); - $dbw =& wfGetDB( DB_WRITE ); + $lb = wfGetLBFactory()->newMainLB(); + $dbr = $lb->getConnection( DB_SLAVE ); + $dbr->bufferResults( false ); - $linksTables = array( + $linksTables = array( // table name => page_id field 'pagelinks' => 'pl_from', 'imagelinks' => 'il_from', 'categorylinks' => 'cl_from', + 'templatelinks' => 'tl_from', + 'externallinks' => 'el_from', ); - - $page = $dbw->tableName( 'page' ); - - + foreach ( $linksTables as $table => $field ) { - if ( !$dbw->ping() ) { - print "DB disconnected, reconnecting..."; - while ( !$dbw->ping() ) { - print "."; - sleep(10); + print "Retrieving illegal entries from $table... "; + + // SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL; + $results = $dbr->select( array( $table, 'page' ), + $field, + array('page_id' => null ), + __METHOD__, + 'DISTINCT', + array( 'page' => array( 'LEFT JOIN', "$field=page_id")) + ); + + $counter = 0; + $list = array(); + print "0.."; + + foreach( $results as $row ) { + $counter++; + $list[] = $row->$field; + if ( ( $counter % $batchSize ) == 0 ) { + wfWaitForSlaves(5); + $dbw->delete( $table, array( $field => $list ), __METHOD__ ); + + print $counter . ".."; + $list = array(); } - print "\n"; } - - $pTable = $dbw->tableName( $table ); - $sql = "DELETE $pTable FROM $pTable LEFT JOIN $page ON page_id=$field WHERE page_id IS NULL"; - print "Deleting $table from non-existent articles..."; - $dbw->query( $sql, $fname ); - print " fixed " .$dbw->affectedRows() . " row(s)\n"; + print $counter; + if (count($list) > 0) { + $dbw->delete( $table, array( $field => $list ), __METHOD__ ); + } + + print "\n"; } + + $lb->closeAll(); } - -?>