Merge "Add collation for Abkhaz (ab)"
author jenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Tue, 23 Jan 2018 18:42:29 +0000 (18:42 +0000)
committer Gerrit Code Review <gerrit@wikimedia.org>
Tue, 23 Jan 2018 18:42:29 +0000 (18:42 +0000)
autoload.php
includes/collation/AbkhazUppercaseCollation.php [new file with mode: 0644]
includes/collation/Collation.php
includes/collation/CustomUppercaseCollation.php
tests/phpunit/includes/collation/CustomUppercaseCollationTest.php

diff --git a/autoload.php b/autoload.php
index 9e557e1..629f96f 100644 (file)
@@ -6,6 +6,7 @@ global $wgAutoloadLocalClasses;
 $wgAutoloadLocalClasses = [
        'APCBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCBagOStuff.php',
        'APCUBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCUBagOStuff.php',
+       'AbkhazUppercaseCollation' => __DIR__ . '/includes/collation/AbkhazUppercaseCollation.php',
        'AbstractContent' => __DIR__ . '/includes/content/AbstractContent.php',
        'Action' => __DIR__ . '/includes/actions/Action.php',
        'ActiveUsersPager' => __DIR__ . '/includes/specials/pagers/ActiveUsersPager.php',
diff --git a/includes/collation/AbkhazUppercaseCollation.php b/includes/collation/AbkhazUppercaseCollation.php
new file mode 100644 (file)
index 0000000..e0ea237
--- /dev/null
@@ -0,0 +1,93 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.31
+ *
+ * @file
+ */
+
+class AbkhazUppercaseCollation extends CustomUppercaseCollation {
+
+       public function __construct() {
+               parent::__construct( [
+                       'А',
+                       'Б',
+                       'В',
+                       'Г',
+                       'Гь',
+                       'Гә',
+                       'Ҕ',
+                       'Ҕь',
+                       'Ҕә',
+                       'Д',
+                       'Дә',
+                       'Е',
+                       'Ж',
+                       'Жь',
+                       'Жә',
+                       'З',
+                       'Ӡ',
+                       'Ӡә',
+                       'И',
+                       'К',
+                       'Кь',
+                       'Кә',
+                       'Қ',
+                       'Қь',
+                       'Қә',
+                       'Ҟ',
+                       'Ҟь',
+                       'Ҟә',
+                       'Л',
+                       'М',
+                       'Н',
+                       'О',
+                       'П',
+                       'Ҧ',
+                       'Р',
+                       'С',
+                       'Т',
+                       'Тә',
+                       'Ҭ',
+                       'Ҭә',
+                       'У',
+                       'Ф',
+                       'Х',
+                       'Хь',
+                       'Хә',
+                       'Ҳ',
+                       'Ҳә',
+                       'Ц',
+                       'Цә',
+                       'Ҵ',
+                       'Ҵә',
+                       'Ч',
+                       'Ҷ',
+                       'Ҽ',
+                       'Ҿ',
+                       'Ш',
+                       'Шь',
+                       'Шә',
+                       'Ы',
+                       'Ҩ',
+                       'Џ',
+                       'Џь',
+                       'ь',
+                       'ә',
+               ], Language::factory( 'ab' ) );
+       }
+}
diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php
index 7171a21..30cae5a 100644 (file)
@@ -65,6 +65,8 @@ abstract class Collation {
                                return new CollationEt;
                        case 'xx-uca-fa':
                                return new CollationFa;
+                       case 'uppercase-ab':
+                               return new AbkhazUppercaseCollation;
                        case 'uppercase-ba':
                                return new BashkirUppercaseCollation;
                        case 'uppercase-se':
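diff --git a/includes/collation/CustomUppercaseCollation.php b/includes/collation/CustomUppercaseCollation.php
index 301972d..170d5c2 100644 (file)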
@@ -32,6 +32,7 @@
  * conflicts with other people using private use area)
  *
  * This does not support fancy things like secondary differences, etc.
+ * (It supports digraphs, trigraphs etc. though.)
  *
  * It is expected most people will subclass this and just override the
  * constructor to hard-code an alphabet.
@@ -45,25 +46,30 @@ class CustomUppercaseCollation extends NumericUppercaseCollation {
        private $puaSubset;
 
        /**
-        * @note This assumes $alphabet does not contain U+F3000-U+F303F
+        * @note This assumes $alphabet does not contain U+F3000-U+F3FFF
         *
         * @param array $alphabet Sorted array of uppercase characters.
         * @param Language $lang What language for number sorting.
         */
        public function __construct( array $alphabet, Language $lang ) {
-               // It'd be trivial to extend this past 64, you'd just
-               // need a bit of bit-fiddling. Doesn't seem necessary right
-               // now.
-               if ( count( $alphabet ) < 1 || count( $alphabet ) >= 64 ) {
-                       throw new UnexpectedValueException( "Alphabet must be < 64 items" );
+               if ( count( $alphabet ) < 1 || count( $alphabet ) >= 4096 ) {
+                       throw new UnexpectedValueException( "Alphabet must be < 4096 items" );
                }
-               $this->alphabet = $alphabet;
+               $this->firstLetters = $alphabet;
+               // For digraphs, only the first letter is capitalized in input
+               $this->alphabet = array_map( [ $lang, 'uc' ], $alphabet );
 
                $this->puaSubset = [];
                $len = count( $alphabet );
                for ( $i = 0; $i < $len; $i++ ) {
-                       $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 );
+                       $this->puaSubset[] = "\xF3\xB3" . chr( floor( $i / 64 ) + 128 ) . chr( ( $i % 64 ) + 128 );
                }
+
+               // Sort these arrays so that any trigraphs, digraphs etc. are first
+               // (and they get replaced first in convertToPua()).
+               $lengths = array_map( 'mb_strlen', $this->alphabet );
+               array_multisort( $lengths, SORT_DESC, $this->firstLetters, $this->alphabet, $this->puaSubset );
+
                parent::__construct( $lang );
        }
 
@@ -76,12 +82,17 @@ class CustomUppercaseCollation extends NumericUppercaseCollation {
        }
 
        public function getFirstLetter( $string ) {
-               // In case a title has a PUA code in it, make it sort
-               // under the header for the character it would replace
-               // to avoid inconsistent behaviour. This class mostly
-               // assumes that people will not use PUA codes.
-               return parent::getFirstLetter(
-                       str_replace( $this->puaSubset, $this->alphabet, $string )
-               );
+               $sortkey = $this->getSortKey( $string );
+
+               // In case a title begins with a character from our alphabet, return the corresponding
+               // first-letter. (This also happens if the title has a corresponding PUA code in it, to avoid
+               // inconsistent behaviour. This class mostly assumes that people will not use PUA codes.)
+               $index = array_search( substr( $sortkey, 0, 4 ), $this->puaSubset );
+               if ( $index !== false ) {
+                       return $this->firstLetters[ $index ];
+               }
+
+               // String begins with a character outside of our alphabet, fall back
+               return parent::getFirstLetter( $string );
        }
 }
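diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
index d688928..f9e0bc9 100644 (file)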
@@ -9,6 +9,7 @@ class CustomUppercaseCollationTest extends MediaWikiTestCase {
                $this->collation = new CustomUppercaseCollation( [
                        'D',
                        'C',
+                       'Cs',
                        'B'
                ], Language::factory( 'en' ) );
 
@@ -34,6 +35,7 @@ class CustomUppercaseCollationTest extends MediaWikiTestCase {
                        [ '💩 ', 'C', 'Test relocated to end' ],
                        [ 'c', 'b', 'lowercase' ],
                        [ 'x', 'z', 'lowercase original' ],
+                       [ 'Cz', 'Cs', 'digraphs' ],
                        [ 'C50D', 'C100', 'Numbers' ]
                ];
        }
@@ -53,8 +55,14 @@ class CustomUppercaseCollationTest extends MediaWikiTestCase {
                        [ 'afdsa', 'A' ],
                        [ "\xF3\xB3\x80\x80Foo", 'D' ],
                        [ "\xF3\xB3\x80\x81Foo", 'C' ],
-                       [ "\xF3\xB3\x80\x82Foo", 'B' ],
-                       [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ],
+                       [ "\xF3\xB3\x80\x82Foo", 'Cs' ],
+                       [ "\xF3\xB3\x80\x83Foo", 'B' ],
+                       [ "\xF3\xB3\x80\x84Foo", "\xF3\xB3\x80\x84" ],
+                       [ 'C', 'C' ],
+                       [ 'Cz', 'C' ],
+                       [ 'Cs', 'Cs' ],
+                       [ 'CS', 'Cs' ],
+                       [ 'cs', 'Cs' ],
                ];
        }
 }