Reduce disruption during updateCollation.php
author Tim Starling <tstarling@wikimedia.org>
Tue, 12 Mar 2013 00:26:12 +0000 (11:26 +1100)
committer Gerrit Code Review <gerrit@wikimedia.org>
Tue, 12 Mar 2013 23:08:29 +0000 (23:08 +0000)
Have updateCollation.php order by cl_to, so that each category is
updated all at once. This minimises the time during which a category
will appear to be incorrectly sorted, while the maintenance script is in
progress.

Mark the cl_collation index as needing deletion; it was always pretty
pointless. You can't do much better than a full table scan when you're
changing the collation value on a wiki.

Increase the batch size, since the lack of a cl_to,cl_from index means
that it will have to filesort each category. A larger batch size means
fewer sorts. As noted by Liangent on bug 45970, you can't order by
cl_sortkey since that will change during execution.

Also fix an inappropriate use of $wgMiserMode and remove a no-op from
the SET clause of the UPDATE.

Very lightly tested.

Change-Id: I19bc8d6701f5f78040aa9c521427ac98ef488d89

maintenance/tables.sql
maintenance/updateCollation.php

index a917783..4307c0c 100644 (file)
@@ -562,10 +562,10 @@ CREATE UNIQUE INDEX /*i*/cl_from ON /*_*/categorylinks (cl_from,cl_to);
 -- callers won't be using an index: fix this?
 CREATE INDEX /*i*/cl_sortkey ON /*_*/categorylinks (cl_to,cl_type,cl_sortkey,cl_from);
 
--- Not really used?
+-- Used by the API (and some extensions)
 CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp);
 
--- For finding rows with outdated collation
+-- FIXME: Not used, delete this
 CREATE INDEX /*i*/cl_collation ON /*_*/categorylinks (cl_collation);
 
 --
index 04a2d47..2132938 100644 (file)
@@ -35,7 +35,7 @@ require_once( __DIR__ . '/Maintenance.php' );
  * @ingroup Maintenance
  */
 class UpdateCollation extends Maintenance {
-       const BATCH_SIZE = 50; // Number of rows to process in one batch
+       const BATCH_SIZE = 10000; // Number of rows to process in one batch
        const SYNC_INTERVAL = 20; // Wait for slaves after this many batches
 
        public $sizeHistogram = array();
@@ -82,10 +82,13 @@ TEXT;
                        $collation = Collation::singleton();
                }
 
-               $options = array( 'LIMIT' => self::BATCH_SIZE, 'STRAIGHT_JOIN' );
+               $options = array(
+                       'LIMIT' => self::BATCH_SIZE,
+                       'ORDER BY' => 'cl_to, cl_type, cl_from',
+                       'STRAIGHT_JOIN',
+               );
 
                if ( $force || $dryRun ) {
-                       $options['ORDER BY'] = 'cl_from, cl_to';
                        $collationConds = array();
                } else {
                        if ( $this->hasOption( 'previous-collation' ) ) {
@@ -96,20 +99,20 @@ TEXT;
                                );
                        }
 
-                       if ( !$wgMiserMode ) {
+                       $count = $dbw->estimateRowCount(
+                               'categorylinks',
+                               '*',
+                               $collationConds,
+                               __METHOD__
+                       );
+                       // Improve estimate if feasible
+                       if ( $count < 1000000 ) {
                                $count = $dbw->selectField(
                                        'categorylinks',
                                        'COUNT(*)',
                                        $collationConds,
                                        __METHOD__
                                );
-                       } else {
-                               $count = $dbw->estimateRowCount(
-                                       'categorylinks',
-                                       '*',
-                                       $collationConds,
-                                       __METHOD__
-                               );
                        }
                        if ( $count == 0 ) {
                                $this->output( "Collations up-to-date.\n" );
@@ -126,7 +129,7 @@ TEXT;
                        $res = $dbw->select(
                                array( 'categorylinks', 'page' ),
                                array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
-                                       'cl_sortkey', 'page_namespace', 'page_title'
+                                       'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
                                ),
                                array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
                                __METHOD__,
@@ -175,7 +178,6 @@ TEXT;
                                                        'cl_sortkey_prefix' => $prefix,
                                                        'cl_collation' => $collationName,
                                                        'cl_type' => $type,
-                                                       'cl_timestamp = cl_timestamp',
                                                ),
                                                array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
                                                __METHOD__
@@ -186,12 +188,8 @@ TEXT;
                                $dbw->commit( __METHOD__ );
                        }
 
-                       if ( ( $force || $dryRun ) && $row ) {
-                               $encFrom = $dbw->addQuotes( $row->cl_from );
-                               $encTo = $dbw->addQuotes( $row->cl_to );
-                               $batchConds = array(
-                                       "(cl_from = $encFrom AND cl_to > $encTo) " .
-                                       " OR cl_from > $encFrom" );
+                       if ( $row ) {
+                               $batchConds = array( $this->getBatchCondition( $row ) );
                        }
 
                        $count += $res->numRows();
@@ -212,6 +210,32 @@ TEXT;
                }
        }
 
+       /**
+        * Return an SQL condition for rows sorting after $row by cl_to, cl_type, cl_from, i.e.
+        * "a > x OR (a = x AND b > y) OR (a = x AND b = y AND c > z)", values via addQuotes().
+        */
+       function getBatchCondition( $row ) {
+               $dbw = $this->getDB( DB_MASTER );
+               $fields = array( 'cl_to', 'cl_type', 'cl_from' );
+               $first = true;
+               $cond = false;
+               $prefix = false;
+               foreach ( $fields as $field ) {
+                       $encValue = $dbw->addQuotes( $row->$field );
+                       $inequality = "$field > $encValue";
+                       $equality = "$field = $encValue";
+                       if ( $first ) {
+                               $cond = $inequality;
+                               $prefix = $equality;
+                               $first = false;
+                       } else {
+                               $cond .= " OR ($prefix AND $inequality)";
+                               $prefix .= " AND $equality";
+                       }
+               }
+               return $cond;
+       }
+
        function updateSortKeySizeHistogram( $key ) {
                $length = strlen( $key );
                if ( !isset( $this->sizeHistogram[$length] ) ) {