From 37b1fc9456ca1f23b0323f3f5b957fb538525a70 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Mon, 31 Oct 2016 16:47:05 +0100 Subject: [PATCH] IcuCollation: Do not split $tailoringFirstLetters into verified/not verified At this point I think it's safe to assume that these mostly work well, and the split makes maintenance of the alphabetical list more difficult (some entries were already in wrong order). We've been enabling these collations for more and more Wikimedia wikis and not hearing about any problems. Mistakes, if any are present, should be treated like any other bug. Also made some comments consistent. Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0 --- includes/collation/IcuCollation.php | 99 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 51 deletions(-) diff --git a/includes/collation/IcuCollation.php b/includes/collation/IcuCollation.php index 4110472d8e..e6b773c5e4 100644 --- a/includes/collation/IcuCollation.php +++ b/includes/collation/IcuCollation.php @@ -91,72 +91,47 @@ class IcuCollation extends Collation { * available and that there are, in fact, no additional letters to consider. */ private static $tailoringFirstLetters = [ - // Verified by native speakers - 'be' => [ "Ё" ], - 'be-tarask' => [ "Ё" ], - 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], - 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ], - 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ], - 'en' => [], - 'fa' => [ - // RTL, let's put each letter on a new line - "آ", - "ء", - "ه", - "ا", - "و" - ], - 'fi' => [ "Å", "Ä", "Ö" ], - 'fr' => [], - 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], - 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ], - 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ], - 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ], - 'it' => [], - 'lt' => [ "Č", "Š", "Ž" ], - 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ], - 'mk' => [ "Ѓ", "Ќ" ], - 'nl' => [], - 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ], - 'pt' => [], - 'ru' => [], - 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ], - 'sr' => [], - 'sv' => [ "Å", "Ä", "Ö" ], - 'sv@collation=standard' => [ "Å", "Ä", "Ö" ], - 'ta' => [ - "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்", - "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்", - "ஸ்", "ஹ்", "க்ஷ்" - ], - 'uk' => [ "Ґ", "Ь" ], - 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ], - // Not verified, but likely correct 'af' => [], 'am' => [], 'ar' => [], 'as' => [ "\xe0\xa6\x82", "\xe0\xa6\x81", "\xe0\xa6\x83", "\xe0\xa7\x8e", "ক্ষ " ], - 'ast' => [ "Ch", "Ll", "Ñ" ], // Not in libicu? + 'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ], + 'be' => [ "Ё" ], + 'be-tarask' => [ "Ё" ], 'bg' => [], 'bo' => [], 'br' => [ "Ch", "C'h" ], + 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], 'bs-Cyrl' => [], 'ca' => [], 'chr' => [], - 'co' => [], // Not in libicu? + 'co' => [], // not in libicu + 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ], + 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ], 'da' => [ "Æ", "Ø", "Å" ], 'de' => [], 'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ], 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ], 'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ], 'el' => [], + 'en' => [], 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ], 'es' => [ "Ñ" ], 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et) - 'eu' => [ "Ñ" ], // Not in libicu? + 'eu' => [ "Ñ" ], // not in libicu + 'fa' => [ + // RTL, let's put each letter on a new line + "آ", + "ء", + "ه", + "ا", + "و" + ], + 'fi' => [ "Å", "Ä", "Ö" ], 'fil' => [ "Ñ", "Ng" ], 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ], + 'fr' => [], 'fr-CA' => [], // fr-CA sorts accents slightly different from fr. 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu 'fy' => [], // not in libicu @@ -168,10 +143,17 @@ class IcuCollation extends Collation { 'haw' => [ 'ʻ' ], 'he' => [], 'hi' => [ "\xe0\xa4\x82", "\xe0\xa4\x83" ], + 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], + 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ], + 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ], 'hy' => [ "և" ], 'id' => [], 'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ], + 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ], + 'it' => [], 'ka' => [], + 'kk' => [ "Ү", "І" ], + 'kl' => [ "Æ", "Ø", "Å" ], 'km' => [ "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\xe1\x9e\xbb\xe1\x9f\x86", "\xe1\x9f\x86", "\xe1\x9e\xb6\xe1\x9f\x86", "\xe1\x9f\x87", @@ -180,32 +162,36 @@ class IcuCollation extends Collation { ], 'kn' => [ "\xe0\xb2\x81", "\xe0\xb2\x83", "\xe0\xb3\xb1", "\xe0\xb3\xb2" ], 'kok' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष" ], - 'kk' => [ "Ү", "І" ], - 'kl' => [ "Æ", "Ø", "Å" ], - 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // ku is not in libicu + 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu 'ky' => [ "Ё" ], - 'la' => [], // la is not in libicu + 'la' => [], // not in libicu 'lb' => [], 'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ], 'ln' => [ 'Ɛ' ], 'lo' => [], + 'lt' => [ "Č", "Š", "Ž" ], + 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ], + 'mk' => [ "Ѓ", "Ќ" ], 'ml' => [], 'mn' => [], + 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // not in libicu 'mr' => [ "\xe0\xa4\x82", "\xe0\xa4\x83", "ळ", "क्ष", "ज्ञ" ], - 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], // no mo in libicu 'ms' => [], 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ], 'nb' => [ "Æ", "Ø", "Å" ], 'ne' => [], + 'nl' => [], 'nn' => [ "Æ", "Ø", "Å" ], - // no is not in the libicu list. You should probably use nb or nn instead. - 'no' => [ "Æ", "Ø", "Å" ], + 'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead. 'oc' => [], // not in libicu 'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ], 'or' => [ "\xe0\xac\x81", "\xe0\xac\x82", "\xe0\xac\x83", "କ୍ଷ" ], 'pa' => [ "\xe0\xa9\x8d" ], + 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ], + 'pt' => [], 'rm' => [], // not in libicu 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ], + 'ru' => [], 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ], // not in libicu 'sco' => [], 'se' => [ @@ -213,11 +199,20 @@ class IcuCollation extends Collation { 'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö' ], 'si' => [ "\xe0\xb6\x82", "\xe0\xb6\x83", "\xe0\xb6\xa4" ], + 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ], 'sl' => [ "Č", "Š", "Ž" ], 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ], 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ], + 'sr' => [], 'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ], + 'sv' => [ "Å", "Ä", "Ö" ], + 'sv@collation=standard' => [ "Å", "Ä", "Ö" ], 'sw' => [], + 'ta' => [ + "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்", + "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்", + "ஸ்", "ஹ்", "க்ஷ்" + ], 'te' => [ "\xe0\xb0\x81", "\xe0\xb0\x82", "\xe0\xb0\x83" ], 'th' => [ "ฯ", "\xe0\xb9\x86", "\xe0\xb9\x8d", "\xe0\xb8\xba" ], 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ], @@ -225,7 +220,9 @@ class IcuCollation extends Collation { 'to' => [ "Ng", "ʻ" ], 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ], 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu + 'uk' => [ "Ґ", "Ь" ], 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu + 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ], 'vo' => [ "Ä", "Ö", "Ü" ], 'yi' => [ "\xd7\x91\xd6\xbf", "\xd7\x9b\xd6\xbc", "\xd7\xa4\xd6\xbc", -- 2.20.1