Merge "Update weblinks in comments from HTTP to HTTPS"

[lhc/web/wiklou.git] / includes / collation / IcuCollation.php
diff --git a/includes/collation/IcuCollation.php b/includes/collation/IcuCollation.php

index b956d4b..e6b773c 100644 (file)
--- a/includes/collation/IcuCollation.php
+++ b/includes/collation/IcuCollation.php
@@ -36,6 +36,9 @@ class IcuCollation extends Collation {
         /** @var Language */
         protected $digitTransformLanguage;
  
+       /** @var boolean */
+       private $useNumericCollation = false;
+
         /** @var array */
         private $firstLetterData;
  
@@ -88,90 +91,145 @@ class IcuCollation extends Collation {
          * available and that there are, in fact, no additional letters to consider.
          */
         private static $tailoringFirstLetters = [
-               // Verified by native speakers
-               'be' => [ "Ё" ],
-               'be-tarask' => [ "Ё" ],
-               'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
-               'en' => [],
-               // RTL, let's put each letter on a new line
-               'fa' => [
-                       "آ",
-                       "ء",
-                       "ه",
-                       "ا",
-                       "و"
-               ],
-               'fi' => [ "Å", "Ä", "Ö" ],
-               'fr' => [],
-               'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
-               'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
-               'it' => [],
-               'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
-               'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
-               'pt' => [],
-               'ru' => [],
-               'sv' => [ "Å", "Ä", "Ö" ],
-               'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
-               'uk' => [ "Ґ", "Ь" ],
-               'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
-               // Not verified, but likely correct
                 'af' => [],
-               'ast' => [ "Ch", "Ll", "Ñ" ],
+               'am' => [],
+               'ar' => [],
+               'as' => [ "\xe0\xa6\x82", "\xe0\xa6\x81", "\xe0\xa6\x83", "\xe0\xa7\x8e", "ক্ষ " ],
+               'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu
                 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
+               'be' => [ "Ё" ],
+               'be-tarask' => [ "Ё" ],
                 'bg' => [],
+               'bo' => [],
                 'br' => [ "Ch", "C'h" ],
                 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
+               'bs-Cyrl' => [],
                 'ca' => [],
-               'co' => [],
+               'chr' => [],
+               'co' => [], // not in libicu
                 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
+               'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
                 'da' => [ "Æ", "Ø", "Å" ],
                 'de' => [],
+               'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ],
                 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
+               'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ],
                 'el' => [],
+               'en' => [],
                 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
                 'es' => [ "Ñ" ],
                 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et)
-               'eu' => [ "Ñ" ],
+               'eu' => [ "Ñ" ], // not in libicu
+               'fa' => [
+                       // RTL, let's put each letter on a new line
+                       "آ",
+                       "ء",
+                       "ه",
+                       "ا",
+                       "و"
+               ],
+               'fi' => [ "Å", "Ä", "Ö" ],
+               'fil' => [ "Ñ", "Ng" ],
                 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
-               'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ],
-               'fy' => [],
+               'fr' => [],
+               'fr-CA' => [], // fr-CA sorts accents slightly different from fr.
+               'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu
+               'fy' => [], // not in libicu
                 'ga' => [],
-               'gd' => [],
+               'gd' => [], // not in libicu
                 'gl' => [ "Ch", "Ll", "Ñ" ],
+               'gu' => [ "\xe0\xaa\x82", "\xe0\xaa\x83", "\xe0\xaa\x81", "\xe0\xaa\xb3" ],
+               'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ],
+               'haw' => [ 'ʻ' ],
+               'he' => [],
+               'hi' => [ "\xe0\xa4\x82", "\xe0\xa4\x83" ],
                 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
                 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
+               'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
+               'hy' => [ "և" ],
+               'id' => [],
+               'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ],
+               'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
+               'it' => [],
+               'ka' => [],
                 'kk' => [ "Ү", "І" ],
                 'kl' => [ "Æ", "Ø", "Å" ],
-               'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ],
+               'km' => [
+                       "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\xe1\x9e\xbb\xe1\x9f\x86",
+                       "\xe1\x9f\x86", "\xe1\x9e\xb6\xe1\x9f\x86", "\xe1\x9f\x87",
+                       "\xe1\x9e\xb7\xe1\x9f\x87", "\xe1\x9e\xbb\xe1\x9f\x87",
+                       "\xe1\x9f\x81\xe1\x9f\x87", "\xe1\x9f\x84\xe1\x9f\x87",
+               ],
+               'kn' => [ "\xe0\xb2\x81", "\xe0\xb2\x83", "\xe0\xb3\xb1", "\xe0\xb3\xb2" ],
+               'kok' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष" ],
+               'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu
                 'ky' => [ "Ё" ],
-               'la' => [],
+               'la' => [], // not in libicu
                 'lb' => [],
+               'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ],
+               'ln' => [ 'Ɛ' ],
+               'lo' => [],
                 'lt' => [ "Č", "Š", "Ž" ],
-               'mk' => [],
-               'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
+               'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
+               'mk' => [ "Ѓ", "Ќ" ],
+               'ml' => [],
+               'mn' => [],
+               'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // not in libicu
+               'mr' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष", "ज्ञ" ],
+               'ms' => [],
                 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
+               'nb' => [ "Æ", "Ø", "Å" ],
+               'ne' => [],
                 'nl' => [],
-               'no' => [ "Æ", "Ø", "Å" ],
-               'oc' => [],
-               'rm' => [],
+               'nn' => [ "Æ", "Ø", "Å" ],
+               'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead.
+               'oc' => [], // not in libicu
+               'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ],
+               'or' => [ "\xe0\xac\x81", "\xe0\xac\x82", "\xe0\xac\x83", "କ୍ଷ" ],
+               'pa' => [ "\xe0\xa9\x8d" ],
+               'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
+               'pt' => [],
+               'rm' => [], // not in libicu
                 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
-               'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ],
+               'ru' => [],
+               'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ], // not in libicu
                 'sco' => [],
+               'se' => [
+                       'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ',
+                       'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö'
+               ],
+               'si' => [ "\xe0\xb6\x82", "\xe0\xb6\x83", "\xe0\xb6\xa4" ],
                 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
                 'sl' => [ "Č", "Š", "Ž" ],
                 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
                 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
                 'sr' => [],
+               'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
+               'sv' => [ "Å", "Ä", "Ö" ],
+               'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
+               'sw' => [],
                 'ta' => [
                         "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
                         "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
                         "ஸ்", "ஹ்", "க்ஷ்"
                 ],
+               'te' => [ "\xe0\xb0\x81", "\xe0\xb0\x82", "\xe0\xb0\x83" ],
+               'th' => [ "ฯ", "\xe0\xb9\x86", "\xe0\xb9\x8d", "\xe0\xb8\xba" ],
                 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
-               'tl' => [ "Ñ", "Ng" ],
+               'tl' => [ "Ñ", "Ng" ], // not in libicu
+               'to' => [ "Ng", "ʻ" ],
                 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
-               'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ],
-               'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ],
+               'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu
+               'uk' => [ "Ґ", "Ь" ],
+               'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu
+               'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
+               'vo' => [ "Ä", "Ö", "Ü" ],
+               'yi' => [
+                       "\xd7\x91\xd6\xbf", "\xd7\x9b\xd6\xbc", "\xd7\xa4\xd6\xbc",
+                       "\xd7\xa9\xd7\x82", "\xd7\xaa\xd6\xbc"
+               ],
+               'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ],
+               'zu' => [],
         ];
  
         /**
@@ -197,6 +255,15 @@ class IcuCollation extends Collation {
  
                 $this->primaryCollator = Collator::create( $locale );
                 $this->primaryCollator->setStrength( Collator::PRIMARY );
+
+               // If the special suffix for numeric collation is present, turn on numeric collation.
+               if ( substr( $locale, -5, 5 ) === '-u-kn' ) {
+                       $this->useNumericCollation = true;
+                       // Strip off the special suffix so it doesn't trip up fetchFirstLetterData().
+                       $this->locale = substr( $this->locale, 0, -5 );
+                       $this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
+                       $this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
+               }
         }
  
         public function getSortKey( $string ) {
@@ -213,8 +280,9 @@ class IcuCollation extends Collation {
                         return '';
                 }
  
-               // Check for CJK
                 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
+
+               // If the first character is a CJK character, just return that character.
                 if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
                         return $firstChar;
                 }
@@ -232,7 +300,19 @@ class IcuCollation extends Collation {
                         // Before the first letter
                         return '';
                 }
-               return $this->getLetterByIndex( $min );
+
+               $sortLetter = $this->getLetterByIndex( $min );
+
+               if ( $this->useNumericCollation ) {
+                       // If the sort letter is a number, return '0–9' (or localized equivalent).
+                       // ASCII value of 0 is 48. ASCII value of 9 is 57.
+                       // Note that this also applies to non-Arabic numerals since they are
+                       // mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
+                       if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
+                               $sortLetter = wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text();
+                       }
+               }
+               return $sortLetter;
         }
  
         /**
@@ -408,6 +488,7 @@ class IcuCollation extends Collation {
         }
  
         /**
+        * Test if a code point is a CJK (Chinese, Japanese, Korean) character
          * @since 1.16.3
          */
         public static function isCjk( $codepoint ) {
@@ -451,6 +532,13 @@ class IcuCollation extends Collation {
                 $versionPrefix = substr( $icuVersion, 0, 3 );
                 // Source: http://site.icu-project.org/download
                 $map = [
+                       '57.' => '8.0',
+                       '56.' => '8.0',
+                       '55.' => '7.0',
+                       '54.' => '7.0',
+                       '53.' => '6.3',
+                       '52.' => '6.3',
+                       '51.' => '6.2',
                         '50.' => '6.2',
                         '49.' => '6.1',
                         '4.8' => '6.0',