Merge "Add support for icu-ta collation"
[lhc/web/wiklou.git] / includes / collation / IcuCollation.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 */
20
21 /**
22 * @since 1.16.3
23 */
24 class IcuCollation extends Collation {
25 const FIRST_LETTER_VERSION = 2;
26
27 /** @var Collator */
28 private $primaryCollator;
29
30 /** @var Collator */
31 private $mainCollator;
32
33 /** @var string */
34 private $locale;
35
36 /** @var Language */
37 protected $digitTransformLanguage;
38
39 /** @var array */
40 private $firstLetterData;
41
42 /**
43 * Unified CJK blocks.
44 *
45 * The same definition of a CJK block must be used for both Collation and
46 * generateCollationData.php. These blocks are omitted from the first
47 * letter data, as an optimisation measure and because the default UCA table
48 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
49 * blocks are not included here, because they are smaller and more useful.
50 */
51 private static $cjkBlocks = [
52 [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement
53 [ 0x2F00, 0x2FDF ], // Kangxi Radicals
54 [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters
55 [ 0x3000, 0x303F ], // CJK Symbols and Punctuation
56 [ 0x31C0, 0x31EF ], // CJK Strokes
57 [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months
58 [ 0x3300, 0x33FF ], // CJK Compatibility
59 [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A
60 [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs
61 [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs
62 [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms
63 [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B
64 [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C
65 [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D
66 [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement
67 ];
68
69 /**
70 * Additional characters (or character groups) to be considered separate
71 * letters for given languages, or to be removed from the list of such
72 * letters (denoted by keys starting with '-').
73 *
74 * These are additions to (or subtractions from) the data stored in the
75 * first-letters-root.ser file (which among others includes full basic latin,
76 * cyrillic and greek alphabets).
77 *
78 * "Separate letter" is a letter that would have a separate heading/section
79 * for it in a dictionary or a phone book in this language. This data isn't
80 * used for sorting (the ICU library handles that), only for deciding which
81 * characters (or character groups) to use as headings.
82 *
83 * Initially generated based on the primary level of Unicode collation
84 * tailorings available at http://developer.mimer.com/charts/tailorings.htm ,
85 * later modified.
86 *
87 * Empty arrays are intended; this signifies that the data for the language is
88 * available and that there are, in fact, no additional letters to consider.
89 */
90 private static $tailoringFirstLetters = [
91 // Verified by native speakers
92 'be' => [ "Ё" ],
93 'be-tarask' => [ "Ё" ],
94 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
95 'en' => [],
96 'fa' => [ "آ", "ء", "ه" ],
97 'fi' => [ "Å", "Ä", "Ö" ],
98 'fr' => [],
99 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
100 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
101 'it' => [],
102 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
103 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
104 'pt' => [],
105 'ru' => [],
106 'sv' => [ "Å", "Ä", "Ö" ],
107 'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
108 'uk' => [ "Ґ", "Ь" ],
109 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
110 // Not verified, but likely correct
111 'af' => [],
112 'ast' => [ "Ch", "Ll", "Ñ" ],
113 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
114 'bg' => [],
115 'br' => [ "Ch", "C'h" ],
116 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
117 'ca' => [],
118 'co' => [],
119 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
120 'da' => [ "Æ", "Ø", "Å" ],
121 'de' => [],
122 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
123 'el' => [],
124 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
125 'es' => [ "Ñ" ],
126 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et)
127 'eu' => [ "Ñ" ],
128 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
129 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ],
130 'fy' => [],
131 'ga' => [],
132 'gd' => [],
133 'gl' => [ "Ch", "Ll", "Ñ" ],
134 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
135 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
136 'kk' => [ "Ү", "І" ],
137 'kl' => [ "Æ", "Ø", "Å" ],
138 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ],
139 'ky' => [ "Ё" ],
140 'la' => [],
141 'lb' => [],
142 'lt' => [ "Č", "Š", "Ž" ],
143 'mk' => [],
144 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
145 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
146 'nl' => [],
147 'no' => [ "Æ", "Ø", "Å" ],
148 'oc' => [],
149 'rm' => [],
150 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
151 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ],
152 'sco' => [],
153 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
154 'sl' => [ "Č", "Š", "Ž" ],
155 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
156 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
157 'sr' => [],
158 'ta' => [
159 "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
160 "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
161 "ஸ்", "ஹ்", "க்ஷ்"
162 ],
163 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
164 'tl' => [ "Ñ", "Ng" ],
165 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
166 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ],
167 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ],
168 ];
169
170 /**
171 * @since 1.16.3
172 */
173 const RECORD_LENGTH = 14;
174
175 public function __construct( $locale ) {
176 if ( !extension_loaded( 'intl' ) ) {
177 throw new MWException( 'An ICU collation was requested, ' .
178 'but the intl extension is not available.' );
179 }
180
181 $this->locale = $locale;
182 // Drop everything after the '@' in locale's name
183 $localeParts = explode( '@', $locale );
184 $this->digitTransformLanguage = Language::factory( $locale === 'root' ? 'en' : $localeParts[0] );
185
186 $this->mainCollator = Collator::create( $locale );
187 if ( !$this->mainCollator ) {
188 throw new MWException( "Invalid ICU locale specified for collation: $locale" );
189 }
190
191 $this->primaryCollator = Collator::create( $locale );
192 $this->primaryCollator->setStrength( Collator::PRIMARY );
193 }
194
195 public function getSortKey( $string ) {
196 return $this->mainCollator->getSortKey( $string );
197 }
198
199 public function getPrimarySortKey( $string ) {
200 return $this->primaryCollator->getSortKey( $string );
201 }
202
203 public function getFirstLetter( $string ) {
204 $string = strval( $string );
205 if ( $string === '' ) {
206 return '';
207 }
208
209 // Check for CJK
210 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
211 if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
212 return $firstChar;
213 }
214
215 $sortKey = $this->getPrimarySortKey( $string );
216
217 // Do a binary search to find the correct letter to sort under
218 $min = ArrayUtils::findLowerBound(
219 [ $this, 'getSortKeyByLetterIndex' ],
220 $this->getFirstLetterCount(),
221 'strcmp',
222 $sortKey );
223
224 if ( $min === false ) {
225 // Before the first letter
226 return '';
227 }
228 return $this->getLetterByIndex( $min );
229 }
230
231 /**
232 * @since 1.16.3
233 * @return array
234 */
235 public function getFirstLetterData() {
236 if ( $this->firstLetterData === null ) {
237 $cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
238 $cacheKey = $cache->makeKey(
239 'first-letters',
240 $this->locale,
241 $this->digitTransformLanguage->getCode(),
242 self::getICUVersion(),
243 self::FIRST_LETTER_VERSION
244 );
245 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK, function () {
246 return $this->fetchFirstLetterData();
247 } );
248 }
249 return $this->firstLetterData;
250 }
251
252 /**
253 * @return array
254 * @throws MWException
255 */
256 private function fetchFirstLetterData() {
257 // Generate data from serialized data file
258 if ( isset( self::$tailoringFirstLetters[$this->locale] ) ) {
259 $letters = wfGetPrecompiledData( 'first-letters-root.ser' );
260 // Append additional characters
261 $letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] );
262 // Remove unnecessary ones, if any
263 if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
264 $letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
265 }
266 // Apply digit transforms
267 $digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
268 $letters = array_diff( $letters, $digits );
269 foreach ( $digits as $digit ) {
270 $letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
271 }
272 } else {
273 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
274 if ( $letters === false ) {
275 throw new MWException( "MediaWiki does not support ICU locale " .
276 "\"{$this->locale}\"" );
277 }
278 }
279
280 /* Sort the letters.
281 *
282 * It's impossible to have the precompiled data file properly sorted,
283 * because the sort order changes depending on ICU version. If the
284 * array is not properly sorted, the binary search will return random
285 * results.
286 *
287 * We also take this opportunity to remove primary collisions.
288 */
289 $letterMap = [];
290 foreach ( $letters as $letter ) {
291 $key = $this->getPrimarySortKey( $letter );
292 if ( isset( $letterMap[$key] ) ) {
293 // Primary collision
294 // Keep whichever one sorts first in the main collator
295 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
296 $letterMap[$key] = $letter;
297 }
298 } else {
299 $letterMap[$key] = $letter;
300 }
301 }
302 ksort( $letterMap, SORT_STRING );
303
304 /* Remove duplicate prefixes. Basically if something has a sortkey
305 * which is a prefix of some other sortkey, then it is an
306 * expansion and probably should not be considered a section
307 * header.
308 *
309 * For example 'þ' is sometimes sorted as if it is the letters
310 * 'th'. Other times it is its own primary element. Another
311 * example is '₨'. Sometimes its a currency symbol. Sometimes it
312 * is an 'R' followed by an 's'.
313 *
314 * Additionally an expanded element should always sort directly
315 * after its first element due to they way sortkeys work.
316 *
317 * UCA sortkey elements are of variable length but no collation
318 * element should be a prefix of some other element, so I think
319 * this is safe. See:
320 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
321 * - http://site.icu-project.org/design/collation/uca-weight-allocation
322 *
323 * Additionally, there is something called primary compression to
324 * worry about. Basically, if you have two primary elements that
325 * are more than one byte and both start with the same byte then
326 * the first byte is dropped on the second primary. Additionally
327 * either \x03 or \xFF may be added to mean that the next primary
328 * does not start with the first byte of the first primary.
329 *
330 * This shouldn't matter much, as the first primary is not
331 * changed, and that is what we are comparing against.
332 *
333 * tl;dr: This makes some assumptions about how icu implements
334 * collations. It seems incredibly unlikely these assumptions
335 * will change, but nonetheless they are assumptions.
336 */
337
338 $prev = false;
339 $duplicatePrefixes = [];
340 foreach ( $letterMap as $key => $value ) {
341 // Remove terminator byte. Otherwise the prefix
342 // comparison will get hung up on that.
343 $trimmedKey = rtrim( $key, "\0" );
344 if ( $prev === false || $prev === '' ) {
345 $prev = $trimmedKey;
346 // We don't yet have a collation element
347 // to compare against, so continue.
348 continue;
349 }
350
351 // Due to the fact the array is sorted, we only have
352 // to compare with the element directly previous
353 // to the current element (skipping expansions).
354 // An element "X" will always sort directly
355 // before "XZ" (Unless we have "XY", but we
356 // do not update $prev in that case).
357 if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
358 $duplicatePrefixes[] = $key;
359 // If this is an expansion, we don't want to
360 // compare the next element to this element,
361 // but to what is currently $prev
362 continue;
363 }
364 $prev = $trimmedKey;
365 }
366 foreach ( $duplicatePrefixes as $badKey ) {
367 wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
368 unset( $letterMap[$badKey] );
369 // This code assumes that unsetting does not change sort order.
370 }
371 $data = [
372 'chars' => array_values( $letterMap ),
373 'keys' => array_keys( $letterMap ),
374 ];
375
376 // Reduce memory usage before caching
377 unset( $letterMap );
378
379 return $data;
380 }
381
382 /**
383 * @since 1.16.3
384 */
385 public function getLetterByIndex( $index ) {
386 return $this->getFirstLetterData()['chars'][$index];
387 }
388
389 /**
390 * @since 1.16.3
391 */
392 public function getSortKeyByLetterIndex( $index ) {
393 return $this->getFirstLetterData()['keys'][$index];
394 }
395
396 /**
397 * @since 1.16.3
398 */
399 public function getFirstLetterCount() {
400 return count( $this->getFirstLetterData()['chars'] );
401 }
402
403 /**
404 * @since 1.16.3
405 */
406 public static function isCjk( $codepoint ) {
407 foreach ( self::$cjkBlocks as $block ) {
408 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
409 return true;
410 }
411 }
412 return false;
413 }
414
415 /**
416 * Return the version of ICU library used by PHP's intl extension,
417 * or false when the extension is not installed of the version
418 * can't be determined.
419 *
420 * The constant INTL_ICU_VERSION this function refers to isn't really
421 * documented. It is available since PHP 5.3.7 (see PHP bug 54561).
422 * This function will return false on older PHPs.
423 *
424 * @since 1.21
425 * @return string|bool
426 */
427 static function getICUVersion() {
428 return defined( 'INTL_ICU_VERSION' ) ? INTL_ICU_VERSION : false;
429 }
430
431 /**
432 * Return the version of Unicode appropriate for the version of ICU library
433 * currently in use, or false when it can't be determined.
434 *
435 * @since 1.21
436 * @return string|bool
437 */
438 static function getUnicodeVersionForICU() {
439 $icuVersion = IcuCollation::getICUVersion();
440 if ( !$icuVersion ) {
441 return false;
442 }
443
444 $versionPrefix = substr( $icuVersion, 0, 3 );
445 // Source: http://site.icu-project.org/download
446 $map = [
447 '50.' => '6.2',
448 '49.' => '6.1',
449 '4.8' => '6.0',
450 '4.6' => '6.0',
451 '4.4' => '5.2',
452 '4.2' => '5.1',
453 '4.0' => '5.1',
454 '3.8' => '5.0',
455 '3.6' => '5.0',
456 '3.4' => '4.1',
457 ];
458
459 if ( isset( $map[$versionPrefix] ) ) {
460 return $map[$versionPrefix];
461 } else {
462 return false;
463 }
464 }
465 }