Add collation for Bashkir (ba)
author Brian Wolff <bawolff+wn@gmail.com>
Fri, 28 Apr 2017 04:52:49 +0000 (04:52 +0000)
committer Amire80 <amir.aharoni@mail.huji.ac.il>
Wed, 10 May 2017 04:17:46 +0000 (04:17 +0000)
This is based on a numeric uppercase collation. Bashkir characters
will be remapped to the private use area for the purpose of sorting.

Bug: T162823
Change-Id: I65f1af0b57ff6ded7d464e39efd401f178a3519e

autoload.php
includes/collation/BashkirUppercaseCollation.php [new file with mode: 0644]
includes/collation/Collation.php
includes/collation/CustomUppercaseCollation.php [new file with mode: 0644]
tests/phpunit/includes/CollationTest.php [deleted file]
tests/phpunit/includes/collation/CollationTest.php [new file with mode: 0644]
tests/phpunit/includes/collation/CustomUppercaseCollationTest.php [new file with mode: 0644]

index e5161f1..3677374 100644 (file)
@@ -177,6 +177,7 @@ $wgAutoloadLocalClasses = [
        'BagOStuff' => __DIR__ . '/includes/libs/objectcache/BagOStuff.php',
        'BaseDump' => __DIR__ . '/maintenance/backupPrefetch.inc',
        'BaseTemplate' => __DIR__ . '/includes/skins/BaseTemplate.php',
+       'BashkirUppercaseCollation' => __DIR__ . '/includes/collation/BashkirUppercaseCollation.php',
        'BatchRowIterator' => __DIR__ . '/includes/utils/BatchRowIterator.php',
        'BatchRowUpdate' => __DIR__ . '/includes/utils/BatchRowUpdate.php',
        'BatchRowWriter' => __DIR__ . '/includes/utils/BatchRowWriter.php',
@@ -315,6 +316,7 @@ $wgAutoloadLocalClasses = [
        'CssContentHandler' => __DIR__ . '/includes/content/CssContentHandler.php',
        'CsvStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php',
        'CurlHttpRequest' => __DIR__ . '/includes/http/CurlHttpRequest.php',
+       'CustomUppercaseCollation' => __DIR__ . '/includes/collation/CustomUppercaseCollation.php',
        'DBAccessBase' => __DIR__ . '/includes/dao/DBAccessBase.php',
        'DBAccessError' => __DIR__ . '/includes/libs/rdbms/exception/DBAccessError.php',
        'DBAccessObjectUtils' => __DIR__ . '/includes/dao/DBAccessObjectUtils.php',
diff --git a/includes/collation/BashkirUppercaseCollation.php b/includes/collation/BashkirUppercaseCollation.php
new file mode 100644 (file)
index 0000000..33ed9bc
--- /dev/null
@@ -0,0 +1,47 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.30
+ *
+ * @file
+ */
+
+/**
+ * Uppercase collation for Bashkir (ba).
+ *
+ * Supplies the letters of the Bashkir alphabet, in the order in which they
+ * should sort, to CustomUppercaseCollation together with the Bashkir
+ * language object.
+ */
+class BashkirUppercaseCollation extends CustomUppercaseCollation {
+
+       /**
+        * Register the alphabet, listed from the first-sorting to the last-sorting letter.
+        */
+       public function __construct() {
+               $sortedLetters = [
+                       'А', 'Б', 'В', 'Г', 'Ғ', 'Д', 'Ҙ',
+                       'Е', 'Ё', 'Ж', 'З', 'И', 'Й', 'К',
+                       'Ҡ', 'Л', 'М', 'Н', 'Ң', 'О', 'Ө',
+                       'П', 'Р', 'С', 'Ҫ', 'Т', 'У', 'Ү',
+                       'Ф', 'Х', 'Һ', 'Ц', 'Ч', 'Ш', 'Щ',
+                       'Ъ', 'Ы', 'Ь', 'Э', 'Ә', 'Ю', 'Я',
+               ];
+
+               parent::__construct( $sortedLetters, Language::factory( 'ba' ) );
+       }
+}
index d67bc7e..d009168 100644 (file)
@@ -65,6 +65,8 @@ abstract class Collation {
                                return new CollationEt;
                        case 'xx-uca-fa':
                                return new CollationFa;
+                       case 'uppercase-ba':
+                               return new BashkirUppercaseCollation;
                        default:
                                $match = [];
                                if ( preg_match( '/^uca-([A-Za-z@=-]+)$/', $collationName, $match ) ) {
diff --git a/includes/collation/CustomUppercaseCollation.php b/includes/collation/CustomUppercaseCollation.php
new file mode 100644 (file)
index 0000000..1b96bff
--- /dev/null
@@ -0,0 +1,111 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.30
+ *
+ * @file
+ */
+
+/**
+ * Resort normal UTF-8 order by putting a bunch of stuff in PUA
+ *
+ * This takes a bunch of characters (the alphabet) that should
+ * be together, and converts them all to private-use-area characters
+ * so that they are all sorted in the right order relative to each
+ * other.
+ *
+ * This renumbers characters starting at U+F3000 (Chosen to avoid
+ * conflicts with other people using private use area)
+ *
+ * This does not support fancy things like secondary differences, etc.
+ *
+ * It is expected most people will subclass this and just override the
+ * constructor to hard-code an alphabet.
+ */
+class CustomUppercaseCollation extends NumericUppercaseCollation {
+
+       /** @var array $alphabet Sorted array of letters */
+       private $alphabet;
+
+       /** @var array $puaSubset List of private use area codes */
+       private $puaSubset;
+
+       /**
+        * @note This assumes $alphabet does not contain U+F3000-U+F303F
+        *
+        * @param array $alphabet Sorted array of uppercase characters.
+        * @param Language $lang What language for number sorting.
+        * @throws UnexpectedValueException If $alphabet is empty or has 64 or more items.
+        */
+       public function __construct( array $alphabet, Language $lang ) {
+               // It'd be trivial to extend this past 64, you'd just
+               // need a bit of bit-fiddling. Doesn't seem necessary right
+               // now.
+               $len = count( $alphabet );
+               if ( $len < 1 || $len >= 64 ) {
+                       throw new UnexpectedValueException( "Alphabet must contain between 1 and 63 items" );
+               }
+               $this->alphabet = $alphabet;
+
+               // The i-th letter is paired with the 4-byte UTF-8 sequence
+               // F3 B3 80 (80 + i), i.e. code point U+F3000 + i.
+               $this->puaSubset = [];
+               for ( $i = 0; $i < $len; $i++ ) {
+                       $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 );
+               }
+               parent::__construct( $lang );
+       }
+
+       /**
+        * Replace every occurrence of an alphabet letter by its private use
+        * area stand-in.
+        *
+        * @param string $string
+        * @return string
+        */
+       private function convertToPua( $string ) {
+               return str_replace( $this->alphabet, $this->puaSubset, $string );
+       }
+
+       /**
+        * Take the parent's sort key and remap the alphabet letters in it to
+        * their private use area stand-ins.
+        *
+        * @param string $string
+        * @return string
+        */
+       public function getSortKey( $string ) {
+               return $this->convertToPua( parent::getSortKey( $string ) );
+       }
+
+       /**
+        * Return the parent's first letter for $string, after mapping any
+        * private use area stand-ins back to the letters they replace.
+        *
+        * @param string $string
+        * @return string
+        */
+       public function getFirstLetter( $string ) {
+               // In case a title has a PUA code in it, make it sort
+               // under the header for the character it would replace
+               // to avoid inconsistent behaviour. This class mostly
+               // assumes that people will not use PUA codes.
+               return parent::getFirstLetter(
+                       str_replace( $this->puaSubset, $this->alphabet, $string )
+               );
+       }
+}
diff --git a/tests/phpunit/includes/CollationTest.php b/tests/phpunit/includes/CollationTest.php
deleted file mode 100644 (file)
index bf283aa..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-<?php
-
-/**
- * Class CollationTest
- * @covers Collation
- * @covers IcuCollation
- * @covers IdentityCollation
- * @covers UppercaseCollation
- */
-class CollationTest extends MediaWikiLangTestCase {
-       protected function setUp() {
-               parent::setUp();
-               $this->checkPHPExtension( 'intl' );
-       }
-
-       /**
-        * Test to make sure, that if you
-        * have "X" and "XY", the binary
-        * sortkey also has "X" being a
-        * prefix of "XY". Our collation
-        * code makes this assumption.
-        *
-        * @param string $lang Language code for collator
-        * @param string $base Base string
-        * @param string $extended String containing base as a prefix.
-        *
-        * @dataProvider prefixDataProvider
-        */
-       public function testIsPrefix( $lang, $base, $extended ) {
-               $cp = Collator::create( $lang );
-               $cp->setStrength( Collator::PRIMARY );
-               $baseBin = $cp->getSortKey( $base );
-               // Remove sortkey terminator
-               $baseBin = rtrim( $baseBin, "\0" );
-               $extendedBin = $cp->getSortKey( $extended );
-               $this->assertStringStartsWith( $baseBin, $extendedBin, "$base is not a prefix of $extended" );
-       }
-
-       public static function prefixDataProvider() {
-               return [
-                       [ 'en', 'A', 'AA' ],
-                       [ 'en', 'A', 'AAA' ],
-                       [ 'en', 'Д', 'ДЂ' ],
-                       [ 'en', 'Д', 'ДA' ],
-                       // 'Ʒ' should expand to 'Z ' (note space).
-                       [ 'fi', 'Z', 'Ʒ' ],
-                       // 'Þ' should expand to 'th'
-                       [ 'sv', 't', 'Þ' ],
-                       // Javanese is a limited use alphabet, so should have 3 bytes
-                       // per character, so do some tests with it.
-                       [ 'en', 'ꦲ', 'ꦲꦤ' ],
-                       [ 'en', 'ꦲ', 'ꦲД' ],
-                       [ 'en', 'A', 'Aꦲ' ],
-               ];
-       }
-
-       /**
-        * Opposite of testIsPrefix
-        *
-        * @dataProvider notPrefixDataProvider
-        */
-       public function testNotIsPrefix( $lang, $base, $extended ) {
-               $cp = Collator::create( $lang );
-               $cp->setStrength( Collator::PRIMARY );
-               $baseBin = $cp->getSortKey( $base );
-               // Remove sortkey terminator
-               $baseBin = rtrim( $baseBin, "\0" );
-               $extendedBin = $cp->getSortKey( $extended );
-               $this->assertStringStartsNotWith( $baseBin, $extendedBin, "$base is a prefix of $extended" );
-       }
-
-       public static function notPrefixDataProvider() {
-               return [
-                       [ 'en', 'A', 'B' ],
-                       [ 'en', 'AC', 'ABC' ],
-                       [ 'en', 'Z', 'Ʒ' ],
-                       [ 'en', 'A', 'ꦲ' ],
-               ];
-       }
-
-       /**
-        * Test correct first letter is fetched.
-        *
-        * @param string $collation Collation name (aka uca-en)
-        * @param string $string String to get first letter of
-        * @param string $firstLetter Expected first letter.
-        *
-        * @dataProvider firstLetterProvider
-        */
-       public function testGetFirstLetter( $collation, $string, $firstLetter ) {
-               $col = Collation::factory( $collation );
-               $this->assertEquals( $firstLetter, $col->getFirstLetter( $string ) );
-       }
-
-       function firstLetterProvider() {
-               return [
-                       [ 'uppercase', 'Abc', 'A' ],
-                       [ 'uppercase', 'abc', 'A' ],
-                       [ 'identity', 'abc', 'a' ],
-                       [ 'uca-en', 'abc', 'A' ],
-                       [ 'uca-en', ' ', ' ' ],
-                       [ 'uca-en', 'Êveryone', 'E' ],
-                       [ 'uca-vi', 'Êveryone', 'Ê' ],
-                       // Make sure thorn is not a first letter.
-                       [ 'uca-sv', 'The', 'T' ],
-                       [ 'uca-sv', 'Å', 'Å' ],
-                       [ 'uca-hu', 'dzsdo', 'Dzs' ],
-                       [ 'uca-hu', 'dzdso', 'Dz' ],
-                       [ 'uca-hu', 'CSD', 'Cs' ],
-                       [ 'uca-root', 'CSD', 'C' ],
-                       [ 'uca-fi', 'Ǥ', 'G' ],
-                       [ 'uca-fi', 'Ŧ', 'T' ],
-                       [ 'uca-fi', 'Ʒ', 'Z' ],
-                       [ 'uca-fi', 'Ŋ', 'N' ],
-               ];
-       }
-}
diff --git a/tests/phpunit/includes/collation/CollationTest.php b/tests/phpunit/includes/collation/CollationTest.php
new file mode 100644 (file)
index 0000000..25911a7
--- /dev/null
@@ -0,0 +1,131 @@
+<?php
+
+/**
+ * Class CollationTest
+ * @covers Collation
+ * @covers IcuCollation
+ * @covers IdentityCollation
+ * @covers UppercaseCollation
+ * @covers CustomUppercaseCollation
+ * @covers BashkirUppercaseCollation
+ */
+class CollationTest extends MediaWikiLangTestCase {
+       protected function setUp() {
+               parent::setUp();
+               $this->checkPHPExtension( 'intl' );
+       }
+
+       /**
+        * Compute the primary-strength ICU sort keys of two strings.
+        *
+        * @param string $lang Language code for collator
+        * @param string $base Base string
+        * @param string $extended String to compare against the base
+        * @return string[] Sort key of $base with trailing NUL bytes removed,
+        *   followed by the sort key of $extended
+        */
+       private function getPrimarySortKeys( $lang, $base, $extended ) {
+               $cp = Collator::create( $lang );
+               $cp->setStrength( Collator::PRIMARY );
+               // Remove sortkey terminator
+               $baseBin = rtrim( $cp->getSortKey( $base ), "\0" );
+               $extendedBin = $cp->getSortKey( $extended );
+
+               return [ $baseBin, $extendedBin ];
+       }
+
+       /**
+        * Test to make sure, that if you have "X" and "XY", the binary
+        * sortkey also has "X" being a prefix of "XY". Our collation
+        * code makes this assumption.
+        *
+        * @param string $lang Language code for collator
+        * @param string $base Base string
+        * @param string $extended String containing base as a prefix.
+        *
+        * @dataProvider prefixDataProvider
+        */
+       public function testIsPrefix( $lang, $base, $extended ) {
+               list( $baseBin, $extendedBin ) = $this->getPrimarySortKeys( $lang, $base, $extended );
+               $this->assertStringStartsWith( $baseBin, $extendedBin, "$base is not a prefix of $extended" );
+       }
+
+       public static function prefixDataProvider() {
+               return [
+                       [ 'en', 'A', 'AA' ],
+                       [ 'en', 'A', 'AAA' ],
+                       [ 'en', 'Д', 'ДЂ' ],
+                       [ 'en', 'Д', 'ДA' ],
+                       // 'Ʒ' should expand to 'Z ' (note space).
+                       [ 'fi', 'Z', 'Ʒ' ],
+                       // 'Þ' should expand to 'th'
+                       [ 'sv', 't', 'Þ' ],
+                       // Javanese is a limited use alphabet, so should have 3 bytes
+                       // per character, so do some tests with it.
+                       [ 'en', 'ꦲ', 'ꦲꦤ' ],
+                       [ 'en', 'ꦲ', 'ꦲД' ],
+                       [ 'en', 'A', 'Aꦲ' ],
+               ];
+       }
+
+       /**
+        * Opposite of testIsPrefix
+        *
+        * @param string $lang Language code for collator
+        * @param string $base Base string
+        * @param string $extended String whose sort key must not start with that of $base
+        *
+        * @dataProvider notPrefixDataProvider
+        */
+       public function testNotIsPrefix( $lang, $base, $extended ) {
+               list( $baseBin, $extendedBin ) = $this->getPrimarySortKeys( $lang, $base, $extended );
+               $this->assertStringStartsNotWith( $baseBin, $extendedBin, "$base is a prefix of $extended" );
+       }
+
+       public static function notPrefixDataProvider() {
+               return [
+                       [ 'en', 'A', 'B' ],
+                       [ 'en', 'AC', 'ABC' ],
+                       [ 'en', 'Z', 'Ʒ' ],
+                       [ 'en', 'A', 'ꦲ' ],
+               ];
+       }
+
+       /**
+        * Test correct first letter is fetched.
+        *
+        * @param string $collation Collation name (aka uca-en)
+        * @param string $string String to get first letter of
+        * @param string $firstLetter Expected first letter.
+        *
+        * @dataProvider firstLetterProvider
+        */
+       public function testGetFirstLetter( $collation, $string, $firstLetter ) {
+               $col = Collation::factory( $collation );
+               $this->assertEquals( $firstLetter, $col->getFirstLetter( $string ) );
+       }
+
+       public static function firstLetterProvider() {
+               return [
+                       [ 'uppercase', 'Abc', 'A' ],
+                       [ 'uppercase', 'abc', 'A' ],
+                       [ 'identity', 'abc', 'a' ],
+                       [ 'uca-en', 'abc', 'A' ],
+                       [ 'uca-en', ' ', ' ' ],
+                       [ 'uca-en', 'Êveryone', 'E' ],
+                       [ 'uca-vi', 'Êveryone', 'Ê' ],
+                       // Make sure thorn is not a first letter.
+                       [ 'uca-sv', 'The', 'T' ],
+                       [ 'uca-sv', 'Å', 'Å' ],
+                       [ 'uca-hu', 'dzsdo', 'Dzs' ],
+                       [ 'uca-hu', 'dzdso', 'Dz' ],
+                       [ 'uca-hu', 'CSD', 'Cs' ],
+                       [ 'uca-root', 'CSD', 'C' ],
+                       [ 'uca-fi', 'Ǥ', 'G' ],
+                       [ 'uca-fi', 'Ŧ', 'T' ],
+                       [ 'uca-fi', 'Ʒ', 'Z' ],
+                       [ 'uca-fi', 'Ŋ', 'N' ],
+                       [ 'uppercase-ba', 'в', 'В' ],
+               ];
+       }
+}
diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
new file mode 100644 (file)
index 0000000..5d5317b
--- /dev/null
@@ -0,0 +1,76 @@
+<?php
+
+/**
+ * @covers CustomUppercaseCollation
+ */
+class CustomUppercaseCollationTest extends MediaWikiTestCase {
+
+       /** @var CustomUppercaseCollation Collation whose custom alphabet is D, C, B */
+       private $collation;
+
+       public function setUp() {
+               // Let the base class prepare itself before the fixture is built.
+               parent::setUp();
+
+               $this->collation = new CustomUppercaseCollation( [
+                       'D',
+                       'C',
+                       'B'
+               ], Language::factory( 'en' ) );
+       }
+
+       /**
+        * The sort key of $first must compare strictly lower than that of $second.
+        *
+        * @param string $first String expected to sort first
+        * @param string $second String expected to sort after $first
+        * @param string $msg Assertion message
+        *
+        * @dataProvider providerOrder
+        */
+       public function testOrder( $first, $second, $msg ) {
+               $sortkey1 = $this->collation->getSortKey( $first );
+               $sortkey2 = $this->collation->getSortKey( $second );
+
+               $this->assertTrue( strcmp( $sortkey1, $sortkey2 ) < 0, $msg );
+       }
+
+       public static function providerOrder() {
+               return [
+                       [ 'X', 'Z', 'Maintain order of unrearranged' ],
+                       [ 'D', 'C', 'Actually resorts' ],
+                       [ 'D', 'B', 'resort test 2' ],
+                       [ 'Adobe', 'Abode', 'not first letter' ],
+                       [ '💩 ', 'C', 'Test relocated to end' ],
+                       [ 'c', 'b', 'lowercase' ],
+                       [ 'x', 'z', 'lowercase original' ],
+                       [ 'C50D', 'C100', 'Numbers' ]
+               ];
+       }
+
+       /**
+        * The first letter reported for $string must be exactly $first.
+        *
+        * @param string $string Input string
+        * @param string $first Expected first letter
+        *
+        * @dataProvider provideGetFirstLetter
+        */
+       public function testGetFirstLetter( $string, $first ) {
+               // assertSame() takes the expected value first, then the actual one.
+               $this->assertSame( $first, $this->collation->getFirstLetter( $string ) );
+       }
+
+       public static function provideGetFirstLetter() {
+               return [
+                       [ 'Do', 'D' ],
+                       [ 'do', 'D' ],
+                       [ 'Ao', 'A' ],
+                       [ 'afdsa', 'A' ],
+                       [ "\xF3\xB3\x80\x80Foo", 'D' ],
+                       [ "\xF3\xB3\x80\x81Foo", 'C' ],
+                       [ "\xF3\xB3\x80\x82Foo", 'B' ],
+                       [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ],
+               ];
+       }
+}