Fix bug in prefixing scheme
author Aryeh Gregor <simetrical@users.mediawiki.org>
Mon, 26 Jul 2010 22:04:19 +0000 (22:04 +0000)
committer Aryeh Gregor <simetrical@users.mediawiki.org>
Mon, 26 Jul 2010 22:04:19 +0000 (22:04 +0000)
As Bawolff pointed out at [[mw:User talk:Simetrical/Collation]], the
prefixing scheme I was using meant that the page "Z" with sort key of
"F" would sort after a page named "A" with a sort key of "FF", since the
first one's raw sort key would compute to "FZ", and the second's would
compute to "FFA".  I've fixed this by separating the prefix from the
unprefixed part by a null byte (cl_sortkey is eventually going to be
totally binary anyway, may as well start now).

includes/CategoryPage.php
includes/LinksUpdate.php
includes/Title.php
languages/Language.php
maintenance/updateCollation.php

diff --git a/includes/CategoryPage.php b/includes/CategoryPage.php
index 0897fbf..41f4969 100644
@@ -312,7 +312,7 @@ class CategoryViewer {
                                $count = 0;
                                foreach ( $res as $row ) {
                                        $title = Title::newFromRow( $row );
-                                       $rawSortkey = $row->cl_sortkey_prefix . $title->getCategorySortkey();
+                                       $rawSortkey = $title->getCategorySortkey( $row->cl_sortkey_prefix );
 
                                        if ( ++$count > $this->limit ) {
                                                # We've reached the one extra which shows that there
diff --git a/includes/LinksUpdate.php b/includes/LinksUpdate.php
index fd62cff..b7ad29c 100644
@@ -457,7 +457,7 @@ class LinksUpdate {
                                        # order or such.
                                        $prefix = $sortkey;
                                        $sortkey = $wgContLang->convertToSortkey(
-                                               $prefix . $this->mTitle->getCategorySortkey() );
+                                               $this->mTitle->getCategorySortkey( $prefix ) );
                                }
 
                                $arr[] = array(
diff --git a/includes/Title.php b/includes/Title.php
index aa0ec5f..b3b7bce 100644
@@ -4139,20 +4139,29 @@ class Title {
        }
 
        /**
-        * Returns what the default sort key for categories would be, if
-        * {{defaultsort:}} isn't used.  This is the same as getText() for
-        * categories, and for everything if $wgCategoryPrefixedDefaultSortkey is
-        * false; otherwise it's the same as getPrefixedText().
+        * Returns the raw sort key to be used for categories, with the specified
+        * prefix.  This will be fed to Language::convertToSortkey() to get a
+        * binary sortkey that can be used for actual sorting.
         *
+        * @param $prefix string The prefix to be used, specified using
+        *   {{defaultsort:}} or like [[Category:Foo|prefix]].  Empty for no
+        *   prefix.
         * @return string
         */
-       public function getCategorySortkey() {
+       public function getCategorySortkey( $prefix = '' ) {
                global $wgCategoryPrefixedDefaultSortkey;
                if ( $this->getNamespace() == NS_CATEGORY
                || !$wgCategoryPrefixedDefaultSortkey ) {
-                       return $this->getText();
+                       $unprefixed = $this->getText();
                } else {
-                       return $this->getPrefixedText();
+                       $unprefixed = $this->getPrefixedText();
+               }
+               if ( $prefix !== '' ) {
+                       # Separate with a null byte, so the unprefixed part is only used as
+                       # a tiebreaker when two pages have the exact same prefix -- null
+                       # sorts before everything else (hopefully).
+                       return "$prefix\0$unprefixed";
                }
+               return $unprefixed;
        }
 }
diff --git a/languages/Language.php b/languages/Language.php
index 4b6a72f..89e5230 100644
@@ -2938,10 +2938,10 @@ class Language {
        /**
         * Given a string, convert it to a (hopefully short) key that can be used
         * for efficient sorting.  A binary sort according to the sortkeys
-        * corresponds to a logical sort of the corresponding strings.  Applying
-        * this to cl_sortkey_prefix concatenated with the page title (possibly
-        * with namespace prefix, depending on $wgCategoryPrefixedDefaultSortkey)
-        * gives you cl_sortkey.
+        * corresponds to a logical sort of the corresponding strings.  Current
+        * code expects that a null character should sort before all others, but
+        * has no other particular expectations (and that one can be changed if
+        * necessary).
         *
         * @param string $string UTF-8 string
         * @return string Binary sortkey
@@ -2988,6 +2988,9 @@ class Language {
         * @return string UTF-8 string corresponding to the first letter of input
         */
        public function firstLetterForLists( $string ) {
+               if ( $string[0] == "\0" ) {
+                       $string = substr( $string, 1 );
+               }
                return strtoupper( mb_substr( $string, 0, 1 ) );
        }
 }
diff --git a/maintenance/updateCollation.php b/maintenance/updateCollation.php
index f842537..60578ce 100644
@@ -57,11 +57,10 @@ TEXT;
                        $dbw->begin();
                        foreach ( $res as $row ) {
                                $title = Title::newFromRow( $row );
-                               $rawSortkey = $title->getCategorySortkey();
                                if ( $row->cl_collation == 0 ) {
                                        # This is an old-style row, so the sortkey needs to be
                                        # converted.
-                                       if ( $row->cl_sortkey == $rawSortkey ) {
+                                       if ( $row->cl_sortkey == $title->getCategorySortkey() ) {
                                                $prefix = '';
                                        } else {
                                                # Custom sortkey, use it as a prefix
@@ -82,7 +81,8 @@ TEXT;
                                $dbw->update(
                                        'categorylinks',
                                        array(
-                                               'cl_sortkey' => $wgContLang->convertToSortkey( $prefix . $rawSortkey ),
+                                               'cl_sortkey' => $wgContLang->convertToSortkey(
+                                                       $title->getCategorySortkey( $prefix ) ),
                                                'cl_sortkey_prefix' => $prefix,
                                                'cl_collation' => $wgCollationVersion,
                                                'cl_type' => $type,