Add support for icu-ta collation
[lhc/web/wiklou.git] / includes / collation / IcuCollation.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 */
20
21 /**
22 * @since 1.16.3
23 */
24 class IcuCollation extends Collation {
25 const FIRST_LETTER_VERSION = 2;
26
27 /** @var Collator */
28 private $primaryCollator;
29
30 /** @var Collator */
31 private $mainCollator;
32
33 /** @var string */
34 private $locale;
35
36 /** @var Language */
37 protected $digitTransformLanguage;
38
39 /** @var array */
40 private $firstLetterData;
41
42 /**
43 * Unified CJK blocks.
44 *
45 * The same definition of a CJK block must be used for both Collation and
46 * generateCollationData.php. These blocks are omitted from the first
47 * letter data, as an optimisation measure and because the default UCA table
48 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
49 * blocks are not included here, because they are smaller and more useful.
50 */
51 private static $cjkBlocks = [
52 [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement
53 [ 0x2F00, 0x2FDF ], // Kangxi Radicals
54 [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters
55 [ 0x3000, 0x303F ], // CJK Symbols and Punctuation
56 [ 0x31C0, 0x31EF ], // CJK Strokes
57 [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months
58 [ 0x3300, 0x33FF ], // CJK Compatibility
59 [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A
60 [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs
61 [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs
62 [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms
63 [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B
64 [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C
65 [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D
66 [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement
67 ];
68
69 /**
70 * Additional characters (or character groups) to be considered separate
71 * letters for given languages, or to be removed from the list of such
72 * letters (denoted by keys starting with '-').
73 *
74 * These are additions to (or subtractions from) the data stored in the
75 * first-letters-root.ser file (which among others includes full basic latin,
76 * cyrillic and greek alphabets).
77 *
78 * "Separate letter" is a letter that would have a separate heading/section
79 * for it in a dictionary or a phone book in this language. This data isn't
80 * used for sorting (the ICU library handles that), only for deciding which
81 * characters (or character groups) to use as headings.
82 *
83 * Initially generated based on the primary level of Unicode collation
84 * tailorings available at http://developer.mimer.com/charts/tailorings.htm ,
85 * later modified.
86 *
87 * Empty arrays are intended; this signifies that the data for the language is
88 * available and that there are, in fact, no additional letters to consider.
89 */
90 private static $tailoringFirstLetters = [
91 // Verified by native speakers
92 'be' => [ "Ё" ],
93 'be-tarask' => [ "Ё" ],
94 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
95 'en' => [],
96 'fa' => [ "آ", "ء", "ه" ],
97 'fi' => [ "Å", "Ä", "Ö" ],
98 'fr' => [],
99 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
100 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
101 'it' => [],
102 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
103 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
104 'pt' => [],
105 'ru' => [],
106 'sv' => [ "Å", "Ä", "Ö" ],
107 'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
108 'uk' => [ "Ґ", "Ь" ],
109 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
110 // Not verified, but likely correct
111 'af' => [],
112 'ast' => [ "Ch", "Ll", "Ñ" ],
113 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
114 'bg' => [],
115 'br' => [ "Ch", "C'h" ],
116 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
117 'ca' => [],
118 'co' => [],
119 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
120 'da' => [ "Æ", "Ø", "Å" ],
121 'de' => [],
122 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
123 'el' => [],
124 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
125 'es' => [ "Ñ" ],
126 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et)
127 'eu' => [ "Ñ" ],
128 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
129 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ],
130 'fy' => [],
131 'ga' => [],
132 'gd' => [],
133 'gl' => [ "Ch", "Ll", "Ñ" ],
134 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
135 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
136 'kk' => [ "Ү", "І" ],
137 'kl' => [ "Æ", "Ø", "Å" ],
138 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ],
139 'ky' => [ "Ё" ],
140 'la' => [],
141 'lb' => [],
142 'lt' => [ "Č", "Š", "Ž" ],
143 'mk' => [],
144 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
145 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
146 'nl' => [],
147 'no' => [ "Æ", "Ø", "Å" ],
148 'oc' => [],
149 'rm' => [],
150 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
151 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ],
152 'sco' => [],
153 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
154 'sl' => [ "Č", "Š", "Ž" ],
155 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
156 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
157 'sr' => [],
158 'ta' => [
159 "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
160 "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
161 "ஸ்", "ஹ்", "க்ஷ்"
162 ],
163 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
164 'tl' => [ "Ñ", "Ng" ],
165 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
166 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ],
167 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ],
168 ];
169
170 /**
171 * @since 1.16.3
172 */
173 const RECORD_LENGTH = 14;
174
175 public function __construct( $locale ) {
176 if ( !extension_loaded( 'intl' ) ) {
177 throw new MWException( 'An ICU collation was requested, ' .
178 'but the intl extension is not available.' );
179 }
180
181 $this->locale = $locale;
182 // Drop everything after the '@' in locale's name
183 $localeParts = explode( '@', $locale );
184 $this->digitTransformLanguage = Language::factory( $locale === 'root' ? 'en' : $localeParts[0] );
185
186 $this->mainCollator = Collator::create( $locale );
187 if ( !$this->mainCollator ) {
188 throw new MWException( "Invalid ICU locale specified for collation: $locale" );
189 }
190
191 $this->primaryCollator = Collator::create( $locale );
192 $this->primaryCollator->setStrength( Collator::PRIMARY );
193 }
194
195 public function getSortKey( $string ) {
196 // intl extension produces non null-terminated
197 // strings. Appending '' fixes it so that it doesn't generate
198 // a warning on each access in debug php.
199 MediaWiki\suppressWarnings();
200 $key = $this->mainCollator->getSortKey( $string ) . '';
201 MediaWiki\restoreWarnings();
202 return $key;
203 }
204
205 public function getPrimarySortKey( $string ) {
206 MediaWiki\suppressWarnings();
207 $key = $this->primaryCollator->getSortKey( $string ) . '';
208 MediaWiki\restoreWarnings();
209 return $key;
210 }
211
212 public function getFirstLetter( $string ) {
213 $string = strval( $string );
214 if ( $string === '' ) {
215 return '';
216 }
217
218 // Check for CJK
219 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
220 if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
221 return $firstChar;
222 }
223
224 $sortKey = $this->getPrimarySortKey( $string );
225
226 // Do a binary search to find the correct letter to sort under
227 $min = ArrayUtils::findLowerBound(
228 [ $this, 'getSortKeyByLetterIndex' ],
229 $this->getFirstLetterCount(),
230 'strcmp',
231 $sortKey );
232
233 if ( $min === false ) {
234 // Before the first letter
235 return '';
236 }
237 return $this->getLetterByIndex( $min );
238 }
239
240 /**
241 * @since 1.16.3
242 * @return array
243 */
244 public function getFirstLetterData() {
245 if ( $this->firstLetterData === null ) {
246 $cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
247 $cacheKey = $cache->makeKey(
248 'first-letters',
249 $this->locale,
250 $this->digitTransformLanguage->getCode(),
251 self::getICUVersion(),
252 self::FIRST_LETTER_VERSION
253 );
254 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK, function () {
255 return $this->fetchFirstLetterData();
256 } );
257 }
258 return $this->firstLetterData;
259 }
260
261 /**
262 * @return array
263 * @throws MWException
264 */
265 private function fetchFirstLetterData() {
266 // Generate data from serialized data file
267 if ( isset( self::$tailoringFirstLetters[$this->locale] ) ) {
268 $letters = wfGetPrecompiledData( 'first-letters-root.ser' );
269 // Append additional characters
270 $letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] );
271 // Remove unnecessary ones, if any
272 if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
273 $letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
274 }
275 // Apply digit transforms
276 $digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
277 $letters = array_diff( $letters, $digits );
278 foreach ( $digits as $digit ) {
279 $letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
280 }
281 } else {
282 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
283 if ( $letters === false ) {
284 throw new MWException( "MediaWiki does not support ICU locale " .
285 "\"{$this->locale}\"" );
286 }
287 }
288
289 /* Sort the letters.
290 *
291 * It's impossible to have the precompiled data file properly sorted,
292 * because the sort order changes depending on ICU version. If the
293 * array is not properly sorted, the binary search will return random
294 * results.
295 *
296 * We also take this opportunity to remove primary collisions.
297 */
298 $letterMap = [];
299 foreach ( $letters as $letter ) {
300 $key = $this->getPrimarySortKey( $letter );
301 if ( isset( $letterMap[$key] ) ) {
302 // Primary collision
303 // Keep whichever one sorts first in the main collator
304 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
305 $letterMap[$key] = $letter;
306 }
307 } else {
308 $letterMap[$key] = $letter;
309 }
310 }
311 ksort( $letterMap, SORT_STRING );
312
313 /* Remove duplicate prefixes. Basically if something has a sortkey
314 * which is a prefix of some other sortkey, then it is an
315 * expansion and probably should not be considered a section
316 * header.
317 *
318 * For example 'þ' is sometimes sorted as if it is the letters
319 * 'th'. Other times it is its own primary element. Another
320 * example is '₨'. Sometimes its a currency symbol. Sometimes it
321 * is an 'R' followed by an 's'.
322 *
323 * Additionally an expanded element should always sort directly
324 * after its first element due to they way sortkeys work.
325 *
326 * UCA sortkey elements are of variable length but no collation
327 * element should be a prefix of some other element, so I think
328 * this is safe. See:
329 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
330 * - http://site.icu-project.org/design/collation/uca-weight-allocation
331 *
332 * Additionally, there is something called primary compression to
333 * worry about. Basically, if you have two primary elements that
334 * are more than one byte and both start with the same byte then
335 * the first byte is dropped on the second primary. Additionally
336 * either \x03 or \xFF may be added to mean that the next primary
337 * does not start with the first byte of the first primary.
338 *
339 * This shouldn't matter much, as the first primary is not
340 * changed, and that is what we are comparing against.
341 *
342 * tl;dr: This makes some assumptions about how icu implements
343 * collations. It seems incredibly unlikely these assumptions
344 * will change, but nonetheless they are assumptions.
345 */
346
347 $prev = false;
348 $duplicatePrefixes = [];
349 foreach ( $letterMap as $key => $value ) {
350 // Remove terminator byte. Otherwise the prefix
351 // comparison will get hung up on that.
352 $trimmedKey = rtrim( $key, "\0" );
353 if ( $prev === false || $prev === '' ) {
354 $prev = $trimmedKey;
355 // We don't yet have a collation element
356 // to compare against, so continue.
357 continue;
358 }
359
360 // Due to the fact the array is sorted, we only have
361 // to compare with the element directly previous
362 // to the current element (skipping expansions).
363 // An element "X" will always sort directly
364 // before "XZ" (Unless we have "XY", but we
365 // do not update $prev in that case).
366 if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
367 $duplicatePrefixes[] = $key;
368 // If this is an expansion, we don't want to
369 // compare the next element to this element,
370 // but to what is currently $prev
371 continue;
372 }
373 $prev = $trimmedKey;
374 }
375 foreach ( $duplicatePrefixes as $badKey ) {
376 wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
377 unset( $letterMap[$badKey] );
378 // This code assumes that unsetting does not change sort order.
379 }
380 $data = [
381 'chars' => array_values( $letterMap ),
382 'keys' => array_keys( $letterMap ),
383 ];
384
385 // Reduce memory usage before caching
386 unset( $letterMap );
387
388 return $data;
389 }
390
391 /**
392 * @since 1.16.3
393 */
394 public function getLetterByIndex( $index ) {
395 return $this->getFirstLetterData()['chars'][$index];
396 }
397
398 /**
399 * @since 1.16.3
400 */
401 public function getSortKeyByLetterIndex( $index ) {
402 return $this->getFirstLetterData()['keys'][$index];
403 }
404
405 /**
406 * @since 1.16.3
407 */
408 public function getFirstLetterCount() {
409 return count( $this->getFirstLetterData()['chars'] );
410 }
411
412 /**
413 * @since 1.16.3
414 */
415 public static function isCjk( $codepoint ) {
416 foreach ( self::$cjkBlocks as $block ) {
417 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
418 return true;
419 }
420 }
421 return false;
422 }
423
424 /**
425 * Return the version of ICU library used by PHP's intl extension,
426 * or false when the extension is not installed of the version
427 * can't be determined.
428 *
429 * The constant INTL_ICU_VERSION this function refers to isn't really
430 * documented. It is available since PHP 5.3.7 (see PHP bug 54561).
431 * This function will return false on older PHPs.
432 *
433 * @since 1.21
434 * @return string|bool
435 */
436 static function getICUVersion() {
437 return defined( 'INTL_ICU_VERSION' ) ? INTL_ICU_VERSION : false;
438 }
439
440 /**
441 * Return the version of Unicode appropriate for the version of ICU library
442 * currently in use, or false when it can't be determined.
443 *
444 * @since 1.21
445 * @return string|bool
446 */
447 static function getUnicodeVersionForICU() {
448 $icuVersion = IcuCollation::getICUVersion();
449 if ( !$icuVersion ) {
450 return false;
451 }
452
453 $versionPrefix = substr( $icuVersion, 0, 3 );
454 // Source: http://site.icu-project.org/download
455 $map = [
456 '50.' => '6.2',
457 '49.' => '6.1',
458 '4.8' => '6.0',
459 '4.6' => '6.0',
460 '4.4' => '5.2',
461 '4.2' => '5.1',
462 '4.0' => '5.1',
463 '3.8' => '5.0',
464 '3.6' => '5.0',
465 '3.4' => '4.1',
466 ];
467
468 if ( isset( $map[$versionPrefix] ) ) {
469 return $map[$versionPrefix];
470 } else {
471 return false;
472 }
473 }
474 }