Initial commit for category collation framework
author: Aryeh Gregor <simetrical@users.mediawiki.org>
Fri, 23 Jul 2010 19:52:02 +0000 (19:52 +0000)
committer: Aryeh Gregor <simetrical@users.mediawiki.org>
Fri, 23 Jul 2010 19:52:02 +0000 (19:52 +0000)
Hidden behind $wgExperimentalCategorySort until it's reasonably
complete.  If that's false, no behavior should change (but I didn't test
carefully, so poke me if there's a bug).  See DefaultSettings.php for
documentation on setting it to true.  Currently you should not do this
except if you're working on the feature, since functionality is not
close to reasonable yet and will change rapidly.

Bug 1211 is already fixed with this commit for me.  However, many other
things still need to be done, so this is all very much a
proof-of-concept.

includes/CategoryPage.php
includes/DefaultSettings.php
includes/LinksUpdate.php
languages/Language.php

index 56f85fa..e07ae77 100644 (file)
@@ -226,6 +226,8 @@ class CategoryViewer {
        }
 
        function doCategoryQuery() {
+               global $wgExperimentalCategorySort;
+
                $dbr = wfGetDB( DB_SLAVE, 'category' );
                if ( $this->from != '' ) {
                        $pageCondition = 'cl_sortkey >= ' . $dbr->addQuotes( $this->from );
@@ -238,17 +240,23 @@ class CategoryViewer {
                        $this->flip = false;
                }
 
+               $tables = array( 'page', 'categorylinks', 'category' );
+               $fields = array( 'page_title', 'page_namespace', 'page_len',
+                       'page_is_redirect', 'cl_sortkey', 'cat_id', 'cat_title',
+                       'cat_subcats', 'cat_pages', 'cat_files' );
+               $conds = array( $pageCondition, 'cl_to' => $this->title->getDBkey() );
+               $opts = array( 'ORDER BY' => $this->flip ? 'cl_sortkey DESC' :
+                       'cl_sortkey', 'USE INDEX' => array( 'categorylinks' => 'cl_sortkey' ) );
+               $joins = array( 'categorylinks'  => array( 'INNER JOIN', 'cl_from = page_id' ),
+                       'category' => array( 'LEFT JOIN', 'cat_title = page_title AND page_namespace = ' . NS_CATEGORY ) );
+
                $res = $dbr->select(
-                       array( 'page', 'categorylinks', 'category' ),
-                       array( 'page_title', 'page_namespace', 'page_len', 'page_is_redirect', 'cl_sortkey',
-                               'cat_id', 'cat_title', 'cat_subcats', 'cat_pages', 'cat_files' ),
-                       array( $pageCondition, 'cl_to' => $this->title->getDBkey() ),
+                       $tables,
+                       $fields,
+                       $conds + ( $wgExperimentalCategorySort ? array( 'cl_type' => 'page' ) : array() ),
                        __METHOD__,
-                       array( 'ORDER BY' => $this->flip ? 'cl_sortkey DESC' : 'cl_sortkey',
-                               'USE INDEX' => array( 'categorylinks' => 'cl_sortkey' ),
-                               'LIMIT'    => $this->limit + 1 ),
-                       array( 'categorylinks'  => array( 'INNER JOIN', 'cl_from = page_id' ),
-                               'category' => array( 'LEFT JOIN', 'cat_title = page_title AND page_namespace = ' . NS_CATEGORY ) )
+                       $opts + array( 'LIMIT' => $this->limit + 1 ),
+                       $joins
                );
 
                $count = 0;
@@ -273,6 +281,45 @@ class CategoryViewer {
                                $this->addPage( $title, $x->cl_sortkey, $x->page_len, $x->page_is_redirect );
                        }
                }
+
+               if ( $wgExperimentalCategorySort ) {
+                       # The main query above was restricted to cl_type = 'page', so
+                       # subcategories and files are fetched separately here.
+                       # TODO: rewrite to be sane (this is basically a
+                       # proof-of-concept, e.g., no pagination here: $opts carries no
+                       # LIMIT, so both queries below return every matching row).
+                       $subcatsRes = $dbr->select(
+                               $tables, $fields,
+                               $conds + array( 'cl_type' => 'subcat' ),
+                               __METHOD__, $opts, $joins
+                       );
+
+                       foreach ( $subcatsRes as $row ) {
+                               $title = Title::newFromRow( $row );
+
+                               if ( $title->getNamespace() == NS_CATEGORY ) {
+                                       $cat = Category::newFromRow( $row, $title );
+                                       $this->addSubcategoryObject( $cat, $row->cl_sortkey, $row->page_len );
+                               } else {
+                                       # Will handle this sanely in final code
+                                       throw new MWException( 'Debug: cl_type = subcat but not category' );
+                               }
+                       }
+
+                       $filesRes = $dbr->select(
+                               $tables, $fields,
+                               $conds + array( 'cl_type' => 'file' ),
+                               __METHOD__, $opts, $joins
+                       );
+
+                       foreach ( $filesRes as $row ) {
+                               $title = Title::newFromRow( $row );
+
+                               # Sanity check on the stored cl_type only; whether the gallery
+                               # is shown is a separate, legitimate display choice.
+                               if ( $title->getNamespace() != NS_FILE ) {
+                                       # More temporary debugging
+                                       throw new MWException( 'Debug: cl_type = file but not file' );
+                               }
+
+                               if ( $this->showGallery ) {
+                                       $this->addImage( $title, $row->cl_sortkey, $row->page_len, $row->page_is_redirect );
+                               } else {
+                                       # Gallery display is off: list the file as an ordinary
+                                       # page link rather than raising the debug exception.
+                                       $this->addPage( $title, $row->cl_sortkey, $row->page_len, $row->page_is_redirect );
+                               }
+                       }
+               }
        }
 
        function getCategoryTop() {
index 893251d..42dc3df 100644 (file)
@@ -4458,6 +4458,24 @@ $wgCategoryPagingLimit = 200;
  */
 $wgCategoryPrefixedDefaultSortkey = true;
 
+/**
+ * Enable experimental support for improved collation on category pages.
+ * For this to work, you need to alter your categorylinks table by applying
+ * maintenance/archives/patch-categorylinks-better-collation.sql, then keep
+ * up-to-date with changes that are made to that file (they won't be
+ * automatically applied).  You should also set $wgUseDumbLinkUpdate = true and
+ * run maintenance/refreshLinks.php.  Disabled (false) by default.
+ */
+$wgExperimentalCategorySort = false;
+
+/**
+ * A version indicator for collations, stored in cl_collation for all new
+ * rows when $wgExperimentalCategorySort is on.  When the collation algorithm
+ * changes, a script is meant to find all rows where cl_collation <
+ * $wgCollationVersion and regenerate cl_sortkey from cl_raw_sortkey.
+ */
+$wgCollationVersion = 0;
+
 /** @} */ # End categories }
 
 /*************************************************************************//**
index aebf249..9cb11b9 100644 (file)
@@ -426,18 +426,40 @@ class LinksUpdate {
         * @private
         */
        function getCategoryInsertions( $existing = array() ) {
-               global $wgContLang;
+               global $wgContLang, $wgExperimentalCategorySort, $wgCollationVersion;
                $diffs = array_diff_assoc( $this->mCategories, $existing );
                $arr = array();
                foreach ( $diffs as $name => $sortkey ) {
                        $nt = Title::makeTitleSafe( NS_CATEGORY, $name );
                        $wgContLang->findVariantLink( $name, $nt, true );
-                       $arr[] = array(
-                               'cl_from'    => $this->mId,
-                               'cl_to'      => $name,
-                               'cl_sortkey' => $sortkey,
-                               'cl_timestamp' => $this->mDb->timestamp()
-                       );
+
+                       if ( $wgExperimentalCategorySort ) {
+                               if ( $this->mTitle->getNamespace() == NS_CATEGORY ) {
+                                       $type = 'subcat';
+                               } elseif ( $this->mTitle->getNamespace() == NS_FILE ) {
+                                       $type = 'file';
+                               } else {
+                                       $type = 'page';
+                               }
+                               $convertedSortkey = $wgContLang->convertToSortkey( $sortkey );
+                               # TODO: Set $sortkey to null if it's redundant
+                               $arr[] = array(
+                                       'cl_from'    => $this->mId,
+                                       'cl_to'      => $name,
+                                       'cl_sortkey' => $convertedSortkey,
+                                       'cl_timestamp' => $this->mDb->timestamp(),
+                                       'cl_raw_sortkey' => $sortkey,
+                                       'cl_collation' => $wgCollationVersion,
+                                       'cl_type' => $type,
+                               );
+                       } else {
+                               $arr[] = array(
+                                       'cl_from'    => $this->mId,
+                                       'cl_to'      => $name,
+                                       'cl_sortkey' => $sortkey,
+                                       'cl_timestamp' => $this->mDb->timestamp()
+                               );
+                       }
                }
                return $arr;
        }
index 41619f7..95d1426 100644 (file)
@@ -2934,4 +2934,58 @@ class Language {
        function getConvRuleTitle() {
                return $this->mConverter->getConvRuleTitle();
        }
+
+       /**
+        * Convert a string to a (hopefully short) binary key for efficient
+        * sorting: a bytewise sort of the keys should match the logical sort
+        * order of the original strings.  Applying this to cl_raw_sortkey
+        * produces cl_sortkey.  For now this is a stub returning its input.
+        *
+        * @param string $string UTF-8 string
+        * @return string Binary sortkey
+        */
+       public function convertToSortkey( $string ) {
+               # Stub function for now: identity mapping, key equals the raw string
+               return $string;
+       }
+
+       /**
+        * Does it make sense for lists to be split up into sections based on their
+        * first letter?  Logogram-based scripts probably want to return false.
+        *
+        * TODO: Use this in CategoryPage.php.
+        *
+        * @return boolean Always true in this implementation
+        */
+       public function usesFirstLettersInLists() {
+               return true;
+       }
+
+       /**
+        * Given a string, return the logical "first letter" to be used for
+        * grouping on category pages and so on.  This has to be coordinated
+        * carefully with convertToSortkey(), or else the sorted list might jump
+        * back and forth between the same "initial letters" or other pathological
+        * behavior.  For instance, if you just return the first character, but "a"
+        * sorts the same as "A" based on convertToSortkey(), then you might get a
+        * list like
+        *
+        * == A ==
+        * * [[Aardvark]]
+        *
+        * == a ==
+        * * [[antelope]]
+        *
+        * == A ==
+        * * [[Ape]]
+        *
+        * etc., assuming for the sake of argument that $wgCapitalLinks is false.
+        * Obviously, this is ignored if usesFirstLettersInLists() is false.
+        *
+        * @param string $string UTF-8 string
+        * @return string UTF-8 string corresponding to the first letter of input
+        */
+       public function firstLetterForLists( $string ) {
+               # Name the encoding explicitly: the input is documented as UTF-8, so
+               # the result must not depend on the ambient mb_internal_encoding().
+               return mb_substr( $string, 0, 1, 'UTF-8' );
+       }
 }