In response to a report from Domas that we are seeing HTMLCacheUpdate::invalidate...
author: Tim Starling <tstarling@users.mediawiki.org>
Wed, 12 Aug 2009 05:00:30 +0000 (05:00 +0000)
committer: Tim Starling <tstarling@users.mediawiki.org>
Wed, 12 Aug 2009 05:00:30 +0000 (05:00 +0000)
* Check the number of rows to be updated before actually doing the query, and if it is too large, repartition the job. Due to caching and job queue lag, it is possible that the original partitioning could be pathologically inaccurate.
* Respect $wgUpdateRowsPerQuery (regression due to r47317) but increase the default from 10 to 100. It was originally set to a low value because I imagined that it would help reduce slave lag, but this is not generally the case since the queries may be in the same transaction.
* Fix lack of initialisation of $jobs in insertJobs() (sloppy but not a bug)
* To avoid queueing up jobs unnecessarily and to reduce the chance of jobs being repartitioned a large number of times as links are incrementally added, make the size threshold for queueing double the job size instead of equal to the job size
* Add a check of title array size to the immediate case, to avoid updating hundreds of thousands of rows when an incorrect size is stored in memcached.

includes/DefaultSettings.php
includes/HTMLCacheUpdate.php

index 7da728a..46704e9 100644 (file)
@@ -3635,7 +3635,7 @@ $wgUpdateRowsPerJob = 500;
 /**
  * Number of rows to update per query
  */
-$wgUpdateRowsPerQuery = 10;
+$wgUpdateRowsPerQuery = 100;
 
 /**
  * Enable AJAX framework
index bd63c07..7c4731b 100644 (file)
  */
 class HTMLCacheUpdate
 {
-       public $mTitle, $mTable, $mPrefix;
+       public $mTitle, $mTable, $mPrefix, $mStart, $mEnd;
        public $mRowsPerJob, $mRowsPerQuery;
 
-       function __construct( $titleTo, $table ) {
+       function __construct( $titleTo, $table, $start = false, $end = false ) {
                global $wgUpdateRowsPerJob, $wgUpdateRowsPerQuery;
 
                $this->mTitle = $titleTo;
                $this->mTable = $table;
+               $this->mStart = $start;
+               $this->mEnd = $end;
                $this->mRowsPerJob = $wgUpdateRowsPerJob;
                $this->mRowsPerQuery = $wgUpdateRowsPerQuery;
                $this->mCache = $this->mTitle->getBacklinkCache();
        }
 
        public function doUpdate() {
-               # Fetch the IDs
-               $numRows = $this->mCache->getNumLinks( $this->mTable );
+               if ( $this->mStart || $this->mEnd ) {
+                       $this->doPartialUpdate();
+                       return;
+               }
 
-               if ( $numRows != 0 ) {
-                       if ( $numRows > $this->mRowsPerJob ) {
-                               $this->insertJobs();
+               # Get an estimate of the number of rows from the BacklinkCache
+               $numRows = $this->mCache->getNumLinks( $this->mTable );
+               if ( $numRows > $this->mRowsPerJob * 2 ) {
+                       # Do fast cached partition
+                       $this->insertJobs();
+               } else {
+                       # Get the links from the DB
+                       $titleArray = $this->mCache->getLinks( $this->mTable );
+                       # Check if the row count estimate was correct
+                       if ( $titleArray->count() > $this->mRowsPerJob * 2 ) {
+                               # Not correct, do accurate partition
+                               wfDebug( __METHOD__.": row count estimate was incorrect, repartitioning\n" );
+                               $this->insertJobsFromTitles( $titleArray );
                        } else {
-                               $this->invalidate();
+                               $this->invalidateTitles( $titleArray );
                        }
                }
                wfRunHooks( 'HTMLCacheUpdate::doUpdate', array($this->mTitle) );
        }
 
+       /**
+        * Update some of the backlinks, defined by a page ID range
+        */
+       protected function doPartialUpdate() {
+               $titleArray = $this->mCache->getLinks( $this->mTable, $this->mStart, $this->mEnd );
+               if ( $titleArray->count() <= $this->mRowsPerJob * 2 ) {
+                       # This partition is small enough, do the update
+                       $this->invalidateTitles( $titleArray );
+               } else {
+                       # Partitioning was excessively inaccurate. Divide the job further.
+                       # This can occur when a large number of links are added in a short 
+                       # period of time, say by updating a heavily-used template.
+                       $this->insertJobsFromTitles( $titleArray );
+               }
+       }
+
+       /**
+        * Partition the current range given by $this->mStart and $this->mEnd,
+        * using a pre-calculated title array which gives the links in that range.
+        * Queue the resulting jobs.
+        */
+       protected function insertJobsFromTitles( $titleArray ) {
+               # We make subpartitions in the sense that the start of the first job
+               # will be the start of the parent partition, and the end of the last
+               # job will be the end of the parent partition.
+               $jobs = array();
+               $start = $this->mStart; # start of the current job
+               $numTitles = 0;
+               foreach ( $titleArray as $title ) {
+                       $id = $title->getArticleID();
+                       # $numTitles is now the number of titles in the current job not 
+                       # including the current ID
+                       if ( $numTitles >= $this->mRowsPerJob ) {
+                               # Add a job up to but not including the current ID
+                               $params = array(
+                                       'table' => $this->mTable,
+                                       'start' => $start,
+                                       'end' => $id - 1
+                               );
+                               $jobs[] = new HTMLCacheUpdateJob( $this->mTitle, $params );
+                               $start = $id;
+                               $numTitles = 0;
+                       }
+                       $numTitles++;
+               }
+               # Last job
+               $params = array(
+                       'table' => $this->mTable,
+                       'start' => $start,
+                       'end' => $this->mEnd
+               );
+               $jobs[] = new HTMLCacheUpdateJob( $this->mTitle, $params );
+               wfDebug( __METHOD__.": repartitioning into " . count( $jobs ) . " jobs\n" );
+
+               if ( count( $jobs ) < 2 ) {
+                       # I don't think this is possible at present, but handling this case
+                       # makes the code a bit more robust against future code updates and 
+                       # avoids a potential infinite loop of repartitioning
+                       wfDebug( __METHOD__.": repartitioning failed!\n" );
+                       $this->invalidateTitles( $titleArray );
+                       return;
+               }
+
+               Job::batchInsert( $jobs );
+       }
+
        protected function insertJobs() {
                $batches = $this->mCache->partition( $this->mTable, $this->mRowsPerJob );
                if ( !$batches ) {
                        return;
                }
+               $jobs = array();
                foreach ( $batches as $batch ) {
                        $params = array(
                                'table' => $this->mTable,
@@ -68,17 +149,20 @@ class HTMLCacheUpdate
                Job::batchInsert( $jobs );
        }
 
-
        /**
-        * Invalidate a set of pages, right now
+        * Invalidate a range of pages, right now
+        * @deprecated
         */
        public function invalidate( $startId = false, $endId = false ) {
-               global $wgUseFileCache, $wgUseSquid;
-
                $titleArray = $this->mCache->getLinks( $this->mTable, $startId, $endId );
-               if ( $titleArray->count() == 0 ) {
-                       return;
-               }
+               $this->invalidateTitles( $titleArray );
+       }
+
+       /**
+        * Invalidate an array (or iterator) of Title objects, right now
+        */
+       protected function invalidateTitles( $titleArray ) {
+               global $wgUseFileCache, $wgUseSquid;
 
                $dbw = wfGetDB( DB_MASTER );
                $timestamp = $dbw->timestamp();
@@ -88,12 +172,20 @@ class HTMLCacheUpdate
                foreach ( $titleArray as $title ) {
                        $ids[] = $title->getArticleID();
                }
+
+               if ( !$ids ) {
+                       return;
+               }
+
                # Update page_touched
-               $dbw->update( 'page',
-                       array( 'page_touched' => $timestamp ),
-                       array( 'page_id IN (' . $dbw->makeList( $ids ) . ')' ),
-                       __METHOD__
-               );
+               $batches = array_chunk( $ids, $this->mRowsPerQuery );
+               foreach ( $batches as $batch ) {
+                       $dbw->update( 'page',
+                               array( 'page_touched' => $timestamp ),
+                               array( 'page_id IN (' . $dbw->makeList( $batch ) . ')' ),
+                               __METHOD__
+                       );
+               }
 
                # Update squid
                if ( $wgUseSquid ) {
@@ -108,6 +200,7 @@ class HTMLCacheUpdate
                        }
                }
        }
+
 }
 
 /**
@@ -133,8 +226,8 @@ class HTMLCacheUpdateJob extends Job {
        }
 
        public function run() {
-               $update = new HTMLCacheUpdate( $this->title, $this->table );
-               $update->invalidate( $this->start, $this->end );
+               $update = new HTMLCacheUpdate( $this->title, $this->table, $this->start, $this->end );
+               $update->doUpdate();
                return true;
        }
 }