Merge "Make setUp and tearDown protected in tests"
[lhc/web/wiklou.git] / includes / cache / BacklinkCache.php
index a59cc9a..8eed1a5 100644 (file)
@@ -20,6 +20,7 @@
  *
  * @file
  * @author Tim Starling
+ * @author Aaron Schulz
  * @copyright © 2009, Tim Starling, Domas Mituzas
  * @copyright © 2010, Max Sem
  * @copyright © 2011, Antoine Musso
@@ -47,6 +48,7 @@ class BacklinkCache {
        /**
         * Multi dimensions array representing batches. Keys are:
         *  > (string) links table name
+        *   > (int) batch size
         *    > 'numRows' : Number of rows for this link table
         *    > 'batches' : array( $start, $end )
         *
@@ -103,9 +105,10 @@ class BacklinkCache {
                        self::$cache = new ProcessCacheLRU( 1 );
                }
                $dbKey = $title->getPrefixedDBkey();
-               if ( !self::$cache->has( $dbKey, 'obj' ) ) {
+               if ( !self::$cache->has( $dbKey, 'obj', 3600 ) ) {
                        self::$cache->set( $dbKey, 'obj', new self( $title ) );
                }
+
                return self::$cache->get( $dbKey, 'obj' );
        }
 
@@ -154,64 +157,83 @@ class BacklinkCache {
        /**
         * Get the backlinks for a given table. Cached in process memory only.
         * @param $table String
-        * @param $startId Integer or false
-        * @param $endId Integer or false
+        * @param $startId Integer|false
+        * @param $endId Integer|false
+        * @param $max Integer|INF
         * @return TitleArrayFromResult
         */
-       public function getLinks( $table, $startId = false, $endId = false ) {
+       public function getLinks( $table, $startId = false, $endId = false, $max = INF ) {
+               return TitleArray::newFromResult( $this->queryLinks( $table, $startId, $endId, $max ) );
+       }
+
+       /**
+        * Get the backlinks for a given table. Cached in process memory only.
+        * @param $table String
+        * @param $startId Integer|false
+        * @param $endId Integer|false
+        * @param $max Integer|INF
+        * @param $select string 'all' or 'ids'
+        * @return ResultWrapper
+        */
+       protected function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
                wfProfileIn( __METHOD__ );
 
                $fromField = $this->getPrefix( $table ) . '_from';
 
-               if ( $startId || $endId ) {
-                       // Partial range, not cached
-                       wfDebug( __METHOD__ . ": from DB (uncacheable range)\n" );
+               if ( !$startId && !$endId && is_infinite( $max )
+                       && isset( $this->fullResultCache[$table] )
+               ) {
+                       wfDebug( __METHOD__ . ": got results from cache\n" );
+                       $res = $this->fullResultCache[$table];
+               } else {
+                       wfDebug( __METHOD__ . ": got results from DB\n" );
                        $conds = $this->getConditions( $table );
-
                        // Use the from field in the condition rather than the joined page_id,
                        // because databases are stupid and don't necessarily propagate indexes.
                        if ( $startId ) {
                                $conds[] = "$fromField >= " . intval( $startId );
                        }
-
                        if ( $endId ) {
                                $conds[] = "$fromField <= " . intval( $endId );
                        }
+                       $options = array( 'ORDER BY' => $fromField );
+                       if ( is_finite( $max ) && $max > 0 ) {
+                               $options['LIMIT'] = $max;
+                       }
 
-                       $res = $this->getDB()->select(
-                               array( $table, 'page' ),
-                               array( 'page_namespace', 'page_title', 'page_id' ),
-                               $conds,
-                               __METHOD__,
-                               array(
-                                       'STRAIGHT_JOIN',
-                                       'ORDER BY' => $fromField
-                               ) );
-                       $ta = TitleArray::newFromResult( $res );
-
-                       wfProfileOut( __METHOD__ );
-                       return $ta;
-               }
+                       if ( $select === 'ids' ) {
+                               // Just select from the backlink table and ignore the page JOIN
+                               $res = $this->getDB()->select(
+                                       $table,
+                                       array( $this->getPrefix( $table ) . '_from AS page_id' ),
+                                       array_filter( $conds, function ( $clause ) { // kind of janky
+                                               return !preg_match( '/(\b|=)page_id(\b|=)/', $clause );
+                                       } ),
+                                       __METHOD__,
+                                       $options
+                               );
+                       } else {
+                               // Select from the backlink table and JOIN with page title information
+                               $res = $this->getDB()->select(
+                                       array( $table, 'page' ),
+                                       array( 'page_namespace', 'page_title', 'page_id' ),
+                                       $conds,
+                                       __METHOD__,
+                                       array_merge( array( 'STRAIGHT_JOIN' ), $options )
+                               );
+                       }
 
-               // @todo FIXME: Make this a function?
-               if ( !isset( $this->fullResultCache[$table] ) ) {
-                       wfDebug( __METHOD__ . ": from DB\n" );
-                       $res = $this->getDB()->select(
-                               array( $table, 'page' ),
-                               array( 'page_namespace', 'page_title', 'page_id' ),
-                               $this->getConditions( $table ),
-                               __METHOD__,
-                               array(
-                                       'STRAIGHT_JOIN',
-                                       'ORDER BY' => $fromField,
-                               ) );
-                       $this->fullResultCache[$table] = $res;
+                       if ( $select === 'all' && !$startId && !$endId && $res->numRows() < $max ) {
+                               // The full results fit within the limit, so cache them
+                               $this->fullResultCache[$table] = $res;
+                       } else {
+                               wfDebug( __METHOD__ . ": results from DB were uncacheable\n" );
+                       }
                }
 
-               $ta = TitleArray::newFromResult( $this->fullResultCache[$table] );
-
                wfProfileOut( __METHOD__ );
-               return $ta;
+
+               return $res;
        }
 
        /**
@@ -234,7 +256,7 @@ class BacklinkCache {
                } else {
                        $prefix = null;
                        wfRunHooks( 'BacklinkCacheGetPrefix', array( $table, &$prefix ) );
-                       if( $prefix ) {
+                       if ( $prefix ) {
                                return $prefix;
                        } else {
                                throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
@@ -277,19 +299,19 @@ class BacklinkCache {
                        case 'imagelinks':
                                $conds = array(
                                        'il_to' => $this->title->getDBkey(),
-                                       'page_id=il_from'
+                                       "page_id={$prefix}_from"
                                );
                                break;
                        case 'categorylinks':
                                $conds = array(
                                        'cl_to' => $this->title->getDBkey(),
-                                       'page_id=cl_from',
+                                       "page_id={$prefix}_from"
                                );
                                break;
                        default:
                                $conds = null;
                                wfRunHooks( 'BacklinkCacheGetConditions', array( $table, $this->title, &$conds ) );
-                               if( !$conds ) {
+                               if ( !$conds ) {
                                        throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
                                }
                }
@@ -309,15 +331,16 @@ class BacklinkCache {
        /**
         * Get the approximate number of backlinks
         * @param $table String
-        * @param $max integer Only count up to this many backlinks
+        * @param $max integer|INF Only count up to this many backlinks
         * @return integer
         */
        public function getNumLinks( $table, $max = INF ) {
-               global $wgMemc;
+               global $wgMemc, $wgUpdateRowsPerJob;
 
                // 1) try partition cache ...
                if ( isset( $this->partitionCache[$table] ) ) {
                        $entry = reset( $this->partitionCache[$table] );
+
                        return min( $max, $entry['numRows'] );
                }
 
@@ -335,20 +358,20 @@ class BacklinkCache {
                }
 
                // 4) fetch from the database ...
-               if ( is_infinite( $max ) ) { // full count
-                       $count = $this->getLinks( $table )->count();
-                       $wgMemc->set( $memcKey, $count, self::CACHE_EXPIRY );
-               } else { // with limit
-                       $count = $this->getDB()->select(
-                               array( $table, 'page' ),
-                               '1',
-                               $this->getConditions( $table ),
-                               __METHOD__,
-                               array( 'LIMIT' => $max )
-                       )->numRows();
+               if ( is_infinite( $max ) ) { // no limit at all
+                       // Use partition() since it will batch the query and skip the JOIN.
+                       // Use $wgUpdateRowsPerJob just to encourage cache reuse for jobs.
+                       $this->partition( $table, $wgUpdateRowsPerJob ); // updates $this->partitionCache
+                       return $this->partitionCache[$table][$wgUpdateRowsPerJob]['numRows'];
+               } else { // probably some sane limit
+                       // Fetch the full title info, since the caller will likely need it next
+                       $count = $this->getLinks( $table, false, false, $max )->count();
+                       if ( $count < $max ) { // full count
+                               $wgMemc->set( $memcKey, $count, self::CACHE_EXPIRY );
+                       }
                }
 
-               return $count;
+               return min( $max, $count );
        }
 
        /**
@@ -366,6 +389,7 @@ class BacklinkCache {
                // 1) try partition cache ...
                if ( isset( $this->partitionCache[$table][$batchSize] ) ) {
                        wfDebug( __METHOD__ . ": got from partition cache\n" );
+
                        return $this->partitionCache[$table][$batchSize]['batches'];
                }
 
@@ -376,6 +400,7 @@ class BacklinkCache {
                if ( isset( $this->fullResultCache[$table] ) ) {
                        $cacheEntry = $this->partitionResult( $this->fullResultCache[$table], $batchSize );
                        wfDebug( __METHOD__ . ": got from full result cache\n" );
+
                        return $cacheEntry['batches'];
                }
 
@@ -391,12 +416,33 @@ class BacklinkCache {
                if ( is_array( $memcValue ) ) {
                        $cacheEntry = $memcValue;
                        wfDebug( __METHOD__ . ": got from memcached $memcKey\n" );
+
                        return $cacheEntry['batches'];
                }
 
                // 4) ... finally fetch from the slow database :(
-               $this->getLinks( $table );
-               $cacheEntry = $this->partitionResult( $this->fullResultCache[$table], $batchSize );
+               $cacheEntry = array( 'numRows' => 0, 'batches' => array() ); // final result
+               // Do the selects in batches to avoid client-side OOMs (bug 43452).
+               // Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
+               $selectSize = max( $batchSize, 200000 - ( 200000 % $batchSize ) );
+               $start = false;
+               do {
+                       $res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
+                       $partitions = $this->partitionResult( $res, $batchSize, false );
+                       // Merge the link count and range partitions for this chunk
+                       $cacheEntry['numRows'] += $partitions['numRows'];
+                       $cacheEntry['batches'] = array_merge( $cacheEntry['batches'], $partitions['batches'] );
+                       if ( count( $partitions['batches'] ) ) {
+                               list( $lStart, $lEnd ) = end( $partitions['batches'] );
+                               $start = $lEnd + 1; // pick up after this inclusive range
+                       }
+               } while ( $partitions['numRows'] >= $selectSize );
+               // Make sure the first range has start=false and the last one has end=false
+               if ( count( $cacheEntry['batches'] ) ) {
+                       $cacheEntry['batches'][0][0] = false;
+                       $cacheEntry['batches'][count( $cacheEntry['batches'] ) - 1][1] = false;
+               }
+
                // Save partitions to memcached
                $wgMemc->set( $memcKey, $cacheEntry, self::CACHE_EXPIRY );
 
@@ -405,6 +451,7 @@ class BacklinkCache {
                $wgMemc->set( $memcKey, $cacheEntry['numRows'], self::CACHE_EXPIRY );
 
                wfDebug( __METHOD__ . ": got from database\n" );
+
                return $cacheEntry['batches'];
        }
 
@@ -412,31 +459,32 @@ class BacklinkCache {
         * Partition a DB result with backlinks in it into batches
         * @param $res ResultWrapper database result
         * @param $batchSize integer
+        * @param $isComplete bool Whether $res includes all the backlinks
         * @throws MWException
-        * @return array @see
+        * @return array
         */
-       protected function partitionResult( $res, $batchSize ) {
+       protected function partitionResult( $res, $batchSize, $isComplete = true ) {
                $batches = array();
                $numRows = $res->numRows();
                $numBatches = ceil( $numRows / $batchSize );
 
                for ( $i = 0; $i < $numBatches; $i++ ) {
-                       if ( $i == 0  ) {
+                       if ( $i == 0 && $isComplete ) {
                                $start = false;
                        } else {
-                               $rowNum = intval( $numRows * $i / $numBatches );
+                               $rowNum = $i * $batchSize;
                                $res->seek( $rowNum );
                                $row = $res->fetchObject();
-                               $start = $row->page_id;
+                               $start = (int)$row->page_id;
                        }
 
-                       if ( $i == $numBatches - 1 ) {
+                       if ( $i == ( $numBatches - 1 ) && $isComplete ) {
                                $end = false;
                        } else {
-                               $rowNum = intval( $numRows * ( $i + 1 ) / $numBatches );
+                               $rowNum = min( $numRows - 1, ( $i + 1 ) * $batchSize - 1 );
                                $res->seek( $rowNum );
                                $row = $res->fetchObject();
-                               $end = $row->page_id - 1;
+                               $end = (int)$row->page_id;
                        }
 
                        # Sanity check order