Reconcept cl_raw_sortkey as cl_sortkey_prefix
authorAryeh Gregor <simetrical@users.mediawiki.org>
Mon, 26 Jul 2010 19:27:13 +0000 (19:27 +0000)
committerAryeh Gregor <simetrical@users.mediawiki.org>
Mon, 26 Jul 2010 19:27:13 +0000 (19:27 +0000)
In response to feedback by Phillipe Verdy on bug 164.  Now if a bunch of
pages have [[Category:Foo| ]], they'll sort amongst themselves according
to page name, instead of in basically random order as it is currently.
This also makes storage more elegant and intuitive: instead of giving
NULL a magic meaning when there's no custom sortkey specified, we just
store an empty string, since there's no prefix.

This means {{defaultsort:}} really now means {{defaultsortprefix:}},
which is slightly confusing, and a lot of code is now slightly
misleading or poorly named.  But it should all work fine.

Also, while I was at it, I made updateCollation.php work as a transition
script, so you can apply the SQL patch and then run updateCollation.php
and things will work.  However, with the new schema it's not trivial to
reverse this -- you'd have to recover the raw sort keys with some PHP.
Conversion goes at about a thousand rows a second for me, and seems to
be CPU-bound.  Could probably be optimized.

I also adjusted the transition script so it will fix rows with collation
versions *greater* than the current one, as well as less.  Thus if some
site wants to use their own collation, they can call it 137 or
something, and if they later want to switch back to MediaWiki stock
collation 7, it will work.

Also fixed a silly bug in updateCollation.php where it would say "1000
done" if it did nothing, and changed $res->numRows() >= self::BATCH_SIZE
to == so people don't wonder how it could be bigger (since it can't, I
hope).

includes/CategoryPage.php
includes/DefaultSettings.php
includes/LinksUpdate.php
includes/Title.php
includes/parser/Parser.php
languages/Language.php
maintenance/archives/patch-categorylinks-better-collation.sql
maintenance/updateCollation.php

index c614a27..97a6098 100644 (file)
@@ -270,7 +270,7 @@ class CategoryViewer {
                        foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
                                $res = $dbr->select(
                                        $tables,
-                                       array_merge( $fields, array( 'cl_raw_sortkey' ) ),
+                                       array_merge( $fields, array( 'cl_sortkey_prefix' ) ),
                                        $conds + array( 'cl_type' => $type ) + ( $type == 'page' ? array( $pageCondition ) : array() ),
                                        __METHOD__,
                                        $opts + ( $type == 'page' ? array( 'LIMIT' => $this->limit + 1 ) : array() ),
@@ -286,14 +286,15 @@ class CategoryViewer {
                                        }
 
                                        $title = Title::newFromRow( $row );
+                                       $rawSortkey = $row->cl_sortkey_prefix . $title->getCategorySortkey();
 
                                        if ( $title->getNamespace() == NS_CATEGORY ) {
                                                $cat = Category::newFromRow( $row, $title );
-                                               $this->addSubcategoryObject( $cat, $row->cl_raw_sortkey, $row->page_len );
+                                               $this->addSubcategoryObject( $cat, $rawSortkey, $row->page_len );
                                        } elseif ( $this->showGallery && $title->getNamespace() == NS_FILE ) {
-                                               $this->addImage( $title, $row->cl_raw_sortkey, $row->page_len, $row->page_is_redirect );
+                                               $this->addImage( $title, $rawSortkey, $row->page_len, $row->page_is_redirect );
                                        } else {
-                                               $this->addPage( $title, $row->cl_raw_sortkey, $row->page_len, $row->page_is_redirect );
+                                               $this->addPage( $title, $rawSortkey, $row->page_len, $row->page_is_redirect );
                                        }
                                }
                        }
index 1e9b6cd..ba1c7c5 100644 (file)
@@ -4476,8 +4476,8 @@ $wgExperimentalCategorySort = false;
 /**
  * A version indicator for collations that will be stored in cl_collation for
  * all new rows.  Used when the collation algorithm changes: a script checks
- * for all rows where cl_collation < $wgCollationVersion and regenerates
- * cl_sortkey based on cl_raw_sortkey.
+ * for all rows where cl_collation != $wgCollationVersion and regenerates
+ * cl_sortkey based on the page name and cl_sortkey_prefix.
  */
 $wgCollationVersion = 1;
 
index 9cb11b9..fd62cff 100644 (file)
@@ -441,14 +441,31 @@ class LinksUpdate {
                                } else {
                                        $type = 'page';
                                }
-                               $convertedSortkey = $wgContLang->convertToSortkey( $sortkey );
-                               # TODO: Set $sortkey to null if it's redundant
+
+                               # TODO: This is kind of wrong, because someone might set a sort
+                               # key prefix that's the same as the default sortkey for the
+                               # title.  This should be fixed by refactoring code to replace
+                               # $sortkey in this array by a prefix, but it's basically harmless
+                               # (Title::moveTo() has had the same issue for a long time).
+                               if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
+                                       $prefix = '';
+                                       $sortkey = $wgContLang->convertToSortkey( $sortkey );
+                               } else {
+                                       # Treat custom sortkeys as a prefix, so that if multiple
+                                       # things are forced to sort as '*' or something, they'll
+                                       # sort properly in the category rather than in page_id
+                                       # order or such.
+                                       $prefix = $sortkey;
+                                       $sortkey = $wgContLang->convertToSortkey(
+                                               $prefix . $this->mTitle->getCategorySortkey() );
+                               }
+
                                $arr[] = array(
                                        'cl_from'    => $this->mId,
                                        'cl_to'      => $name,
-                                       'cl_sortkey' => $convertedSortkey,
+                                       'cl_sortkey' => $sortkey,
                                        'cl_timestamp' => $this->mDb->timestamp(),
-                                       'cl_raw_sortkey' => $sortkey,
+                                       'cl_sortkey_prefix' => $prefix,
                                        'cl_collation' => $wgCollationVersion,
                                        'cl_type' => $type,
                                );
index cb2c554..aa0ec5f 100644 (file)
@@ -4137,4 +4137,22 @@ class Title {
 
                return $types;
        }
+
+       /**
+        * Returns what the default sort key for categories would be, if
+        * {{defaultsort:}} isn't used.  This is the same as getText() for
+        * categories, and for everything if $wgCategoryPrefixedDefaultSortkey is
+        * false; otherwise it's the same as getPrefixedText().
+        *
+        * @return string
+        */
+       public function getCategorySortkey() {
+               global $wgCategoryPrefixedDefaultSortkey;
+               if ( $this->getNamespace() == NS_CATEGORY
+               || !$wgCategoryPrefixedDefaultSortkey ) {
+                       return $this->getText();
+               } else {
+                       return $this->getPrefixedText();
+               }
+       }
 }
index 1fb6d98..4d16393 100644 (file)
@@ -5056,12 +5056,8 @@ class Parser {
                global $wgCategoryPrefixedDefaultSortkey;
                if ( $this->mDefaultSort !== false ) {
                        return $this->mDefaultSort;
-               } elseif ( $this->mTitle->getNamespace() == NS_CATEGORY ||
-                       !$wgCategoryPrefixedDefaultSortkey )
-               {
-                       return $this->mTitle->getText();
                } else {
-                       return $this->mTitle->getPrefixedText();
+                       return $this->mTitle->getCategorySortkey();
                }
        }
 
index 211cf05..4b6a72f 100644 (file)
@@ -2939,7 +2939,9 @@ class Language {
         * Given a string, convert it to a (hopefully short) key that can be used
         * for efficient sorting.  A binary sort according to the sortkeys
         * corresponds to a logical sort of the corresponding strings.  Applying
-        * this to cl_raw_sortkey produces cl_sortkey.
+        * this to cl_sortkey_prefix concatenated with the page title (possibly
+        * with namespace prefix, depending on $wgCategoryPrefixedDefaultSortkey)
+        * gives you cl_sortkey.
         *
         * @param string $string UTF-8 string
         * @return string Binary sortkey
index f73dd52..5722a33 100644 (file)
@@ -8,9 +8,9 @@
 -- to work for MySQL for now, without table prefixes, possibly other random
 -- limitations.
 ALTER TABLE categorylinks
-       ADD COLUMN cl_raw_sortkey varchar(255) binary NULL default NULL,
+       ADD COLUMN cl_sortkey_prefix varchar(255) binary NOT NULL default '',
        ADD COLUMN cl_collation tinyint NOT NULL default 0,
-       ADD COLUMN cl_type ENUM('page', 'subcat', 'file') NOT NULL,
+       ADD COLUMN cl_type ENUM('page', 'subcat', 'file') NOT NULL default 'page',
        ADD INDEX (cl_collation),
        DROP INDEX cl_sortkey,
        ADD INDEX cl_sortkey (cl_to, cl_type, cl_sortkey, cl_from);
index 93c845d..f842537 100644 (file)
@@ -1,6 +1,6 @@
 <?php
 /**
- * @file 
+ * @file
  * @ingroup Maintenance
  * @author Aryeh Gregor (Simetrical)
  */
@@ -17,10 +17,10 @@ class UpdateCollation extends Maintenance {
 
                global $wgCollationVersion;
                $this->mDescription = <<<TEXT
-This script will find all rows in the categorylinks table whose collation is 
-out-of-date (cl_collation < $wgCollationVersion) and repopulate cl_sortkey 
-using cl_raw_sortkey.  If everything's collation is up-to-date, it will do 
-nothing.
+This script will find all rows in the categorylinks table whose collation is
+out-of-date (cl_collation != $wgCollationVersion) and repopulate cl_sortkey
+using the page title and cl_sortkey_prefix.  If everything's collation is
+up-to-date, it will do nothing.
 TEXT;
 
                #$this->addOption( 'force', 'Run on all rows, even if the collation is supposed to be up-to-date.' );
@@ -32,8 +32,8 @@ TEXT;
                $dbw = wfGetDB( DB_MASTER );
                $count = $dbw->estimateRowCount(
                        'categorylinks',
-                       array( 'cl_from', 'cl_to', 'cl_raw_sortkey' ),
-                       'cl_collation < ' . $dbw->addQuotes( $wgCollationVersion ),
+                       array( 'cl_from', 'cl_to', 'cl_sortkey_prefix' ),
+                       'cl_collation != ' . $dbw->addQuotes( $wgCollationVersion ),
                        __METHOD__
                );
 
@@ -42,21 +42,50 @@ TEXT;
                $count = 0;
                do {
                        $res = $dbw->select(
-                               'categorylinks',
-                               array( 'cl_from', 'cl_to', 'cl_raw_sortkey' ),
-                               'cl_collation < ' . $dbw->addQuotes( $wgCollationVersion ),
+                               array( 'categorylinks', 'page' ),
+                               array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
+                                       'cl_sortkey', 'page_namespace', 'page_title'
+                               ),
+                               array(
+                                       'cl_collation != ' . $dbw->addQuotes( $wgCollationVersion ),
+                                       'cl_from = page_id'
+                               ),
                                __METHOD__,
                                array( 'LIMIT' => self::BATCH_SIZE )
                        );
 
                        $dbw->begin();
                        foreach ( $res as $row ) {
-                               # TODO: Handle the case where cl_raw_sortkey is null.
+                               $title = Title::newFromRow( $row );
+                               $rawSortkey = $title->getCategorySortkey();
+                               if ( $row->cl_collation == 0 ) {
+                                       # This is an old-style row, so the sortkey needs to be
+                                       # converted.
+                                       if ( $row->cl_sortkey == $rawSortkey ) {
+                                               $prefix = '';
+                                       } else {
+                                               # Custom sortkey, use it as a prefix
+                                               $prefix = $row->cl_sortkey;
+                                       }
+                               } else {
+                                       $prefix = $row->cl_sortkey_prefix;
+                               }
+                               # cl_type will be wrong for lots of pages if cl_collation is 0,
+                               # so let's update it while we're here.
+                               if ( $title->getNamespace() == NS_CATEGORY ) {
+                                       $type = 'subcat';
+                               } elseif ( $title->getNamespace() == NS_FILE ) {
+                                       $type = 'file';
+                               } else {
+                                       $type = 'page';
+                               }
                                $dbw->update(
                                        'categorylinks',
                                        array(
-                                               'cl_sortkey' => $wgContLang->convertToSortkey( $row->cl_raw_sortkey ),
-                                               'cl_collation' => $wgCollationVersion
+                                               'cl_sortkey' => $wgContLang->convertToSortkey( $prefix . $rawSortkey ),
+                                               'cl_sortkey_prefix' => $prefix,
+                                               'cl_collation' => $wgCollationVersion,
+                                               'cl_type' => $type,
                                        ),
                                        array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
                                        __METHOD__
@@ -64,9 +93,9 @@ TEXT;
                        }
                        $dbw->commit();
 
-                       $count += self::BATCH_SIZE;
+                       $count += $res->numRows();
                        $this->output( "$count done.\n" );
-               } while ( $res->numRows() >= self::BATCH_SIZE );
+               } while ( $res->numRows() == self::BATCH_SIZE );
        }
 }