Merge "Do not override content format in EditPage when loading rev."
[lhc/web/wiklou.git] / includes / collation / IcuCollation.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 */
20
21 /**
22 * @since 1.16.3
23 */
24 class IcuCollation extends Collation {
25 const FIRST_LETTER_VERSION = 2;
26
27 /** @var Collator */
28 private $primaryCollator;
29
30 /** @var Collator */
31 private $mainCollator;
32
33 /** @var string */
34 private $locale;
35
36 /** @var Language */
37 protected $digitTransformLanguage;
38
39 /** @var array */
40 private $firstLetterData;
41
42 /**
43 * Unified CJK blocks.
44 *
45 * The same definition of a CJK block must be used for both Collation and
46 * generateCollationData.php. These blocks are omitted from the first
47 * letter data, as an optimisation measure and because the default UCA table
48 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
49 * blocks are not included here, because they are smaller and more useful.
50 */
51 private static $cjkBlocks = [
52 [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement
53 [ 0x2F00, 0x2FDF ], // Kangxi Radicals
54 [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters
55 [ 0x3000, 0x303F ], // CJK Symbols and Punctuation
56 [ 0x31C0, 0x31EF ], // CJK Strokes
57 [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months
58 [ 0x3300, 0x33FF ], // CJK Compatibility
59 [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A
60 [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs
61 [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs
62 [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms
63 [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B
64 [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C
65 [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D
66 [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement
67 ];
68
69 /**
70 * Additional characters (or character groups) to be considered separate
71 * letters for given languages, or to be removed from the list of such
72 * letters (denoted by keys starting with '-').
73 *
74 * These are additions to (or subtractions from) the data stored in the
75 * first-letters-root.ser file (which among others includes full basic latin,
76 * cyrillic and greek alphabets).
77 *
78 * "Separate letter" is a letter that would have a separate heading/section
79 * for it in a dictionary or a phone book in this language. This data isn't
80 * used for sorting (the ICU library handles that), only for deciding which
81 * characters (or character groups) to use as headings.
82 *
83 * Initially generated based on the primary level of Unicode collation
84 * tailorings available at http://developer.mimer.com/charts/tailorings.htm ,
85 * later modified.
86 *
87 * Empty arrays are intended; this signifies that the data for the language is
88 * available and that there are, in fact, no additional letters to consider.
89 */
90 private static $tailoringFirstLetters = [
91 // Verified by native speakers
92 'be' => [ "Ё" ],
93 'be-tarask' => [ "Ё" ],
94 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
95 'en' => [],
96 // RTL, let's put each letter on a new line
97 'fa' => [
98 "آ",
99 "ء",
100 "ه",
101 "ا",
102 "و"
103 ],
104 'fi' => [ "Å", "Ä", "Ö" ],
105 'fr' => [],
106 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
107 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
108 'it' => [],
109 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
110 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
111 'pt' => [],
112 'ru' => [],
113 'sv' => [ "Å", "Ä", "Ö" ],
114 'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
115 'uk' => [ "Ґ", "Ь" ],
116 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
117 // Not verified, but likely correct
118 'af' => [],
119 'ast' => [ "Ch", "Ll", "Ñ" ],
120 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
121 'bg' => [],
122 'br' => [ "Ch", "C'h" ],
123 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
124 'ca' => [],
125 'co' => [],
126 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
127 'da' => [ "Æ", "Ø", "Å" ],
128 'de' => [],
129 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
130 'el' => [],
131 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
132 'es' => [ "Ñ" ],
133 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ], // added W for CollationEt (xx-uca-et)
134 'eu' => [ "Ñ" ],
135 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
136 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ],
137 'fy' => [],
138 'ga' => [],
139 'gd' => [],
140 'gl' => [ "Ch", "Ll", "Ñ" ],
141 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
142 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
143 'kk' => [ "Ү", "І" ],
144 'kl' => [ "Æ", "Ø", "Å" ],
145 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ],
146 'ky' => [ "Ё" ],
147 'la' => [],
148 'lb' => [],
149 'lt' => [ "Č", "Š", "Ž" ],
150 'mk' => [],
151 'mo' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
152 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
153 'nl' => [],
154 'no' => [ "Æ", "Ø", "Å" ],
155 'oc' => [],
156 'rm' => [],
157 'ro' => [ "Ă", "Â", "Î", "Ş", "Ţ" ],
158 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ],
159 'sco' => [],
160 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
161 'sl' => [ "Č", "Š", "Ž" ],
162 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
163 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
164 'sr' => [],
165 'ta' => [
166 "\xE0\xAE\x82", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
167 "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
168 "ஸ்", "ஹ்", "க்ஷ்"
169 ],
170 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
171 'tl' => [ "Ñ", "Ng" ],
172 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
173 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ],
174 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ],
175 ];
176
177 /**
178 * @since 1.16.3
179 */
180 const RECORD_LENGTH = 14;
181
182 public function __construct( $locale ) {
183 if ( !extension_loaded( 'intl' ) ) {
184 throw new MWException( 'An ICU collation was requested, ' .
185 'but the intl extension is not available.' );
186 }
187
188 $this->locale = $locale;
189 // Drop everything after the '@' in locale's name
190 $localeParts = explode( '@', $locale );
191 $this->digitTransformLanguage = Language::factory( $locale === 'root' ? 'en' : $localeParts[0] );
192
193 $this->mainCollator = Collator::create( $locale );
194 if ( !$this->mainCollator ) {
195 throw new MWException( "Invalid ICU locale specified for collation: $locale" );
196 }
197
198 $this->primaryCollator = Collator::create( $locale );
199 $this->primaryCollator->setStrength( Collator::PRIMARY );
200 }
201
202 public function getSortKey( $string ) {
203 return $this->mainCollator->getSortKey( $string );
204 }
205
206 public function getPrimarySortKey( $string ) {
207 return $this->primaryCollator->getSortKey( $string );
208 }
209
210 public function getFirstLetter( $string ) {
211 $string = strval( $string );
212 if ( $string === '' ) {
213 return '';
214 }
215
216 // Check for CJK
217 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
218 if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
219 return $firstChar;
220 }
221
222 $sortKey = $this->getPrimarySortKey( $string );
223
224 // Do a binary search to find the correct letter to sort under
225 $min = ArrayUtils::findLowerBound(
226 [ $this, 'getSortKeyByLetterIndex' ],
227 $this->getFirstLetterCount(),
228 'strcmp',
229 $sortKey );
230
231 if ( $min === false ) {
232 // Before the first letter
233 return '';
234 }
235 return $this->getLetterByIndex( $min );
236 }
237
238 /**
239 * @since 1.16.3
240 * @return array
241 */
242 public function getFirstLetterData() {
243 if ( $this->firstLetterData === null ) {
244 $cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
245 $cacheKey = $cache->makeKey(
246 'first-letters',
247 $this->locale,
248 $this->digitTransformLanguage->getCode(),
249 self::getICUVersion(),
250 self::FIRST_LETTER_VERSION
251 );
252 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK, function () {
253 return $this->fetchFirstLetterData();
254 } );
255 }
256 return $this->firstLetterData;
257 }
258
259 /**
260 * @return array
261 * @throws MWException
262 */
263 private function fetchFirstLetterData() {
264 // Generate data from serialized data file
265 if ( isset( self::$tailoringFirstLetters[$this->locale] ) ) {
266 $letters = wfGetPrecompiledData( 'first-letters-root.ser' );
267 // Append additional characters
268 $letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] );
269 // Remove unnecessary ones, if any
270 if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
271 $letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
272 }
273 // Apply digit transforms
274 $digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
275 $letters = array_diff( $letters, $digits );
276 foreach ( $digits as $digit ) {
277 $letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
278 }
279 } else {
280 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
281 if ( $letters === false ) {
282 throw new MWException( "MediaWiki does not support ICU locale " .
283 "\"{$this->locale}\"" );
284 }
285 }
286
287 /* Sort the letters.
288 *
289 * It's impossible to have the precompiled data file properly sorted,
290 * because the sort order changes depending on ICU version. If the
291 * array is not properly sorted, the binary search will return random
292 * results.
293 *
294 * We also take this opportunity to remove primary collisions.
295 */
296 $letterMap = [];
297 foreach ( $letters as $letter ) {
298 $key = $this->getPrimarySortKey( $letter );
299 if ( isset( $letterMap[$key] ) ) {
300 // Primary collision
301 // Keep whichever one sorts first in the main collator
302 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
303 $letterMap[$key] = $letter;
304 }
305 } else {
306 $letterMap[$key] = $letter;
307 }
308 }
309 ksort( $letterMap, SORT_STRING );
310
311 /* Remove duplicate prefixes. Basically if something has a sortkey
312 * which is a prefix of some other sortkey, then it is an
313 * expansion and probably should not be considered a section
314 * header.
315 *
316 * For example 'þ' is sometimes sorted as if it is the letters
317 * 'th'. Other times it is its own primary element. Another
318 * example is '₨'. Sometimes its a currency symbol. Sometimes it
319 * is an 'R' followed by an 's'.
320 *
321 * Additionally an expanded element should always sort directly
322 * after its first element due to they way sortkeys work.
323 *
324 * UCA sortkey elements are of variable length but no collation
325 * element should be a prefix of some other element, so I think
326 * this is safe. See:
327 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
328 * - http://site.icu-project.org/design/collation/uca-weight-allocation
329 *
330 * Additionally, there is something called primary compression to
331 * worry about. Basically, if you have two primary elements that
332 * are more than one byte and both start with the same byte then
333 * the first byte is dropped on the second primary. Additionally
334 * either \x03 or \xFF may be added to mean that the next primary
335 * does not start with the first byte of the first primary.
336 *
337 * This shouldn't matter much, as the first primary is not
338 * changed, and that is what we are comparing against.
339 *
340 * tl;dr: This makes some assumptions about how icu implements
341 * collations. It seems incredibly unlikely these assumptions
342 * will change, but nonetheless they are assumptions.
343 */
344
345 $prev = false;
346 $duplicatePrefixes = [];
347 foreach ( $letterMap as $key => $value ) {
348 // Remove terminator byte. Otherwise the prefix
349 // comparison will get hung up on that.
350 $trimmedKey = rtrim( $key, "\0" );
351 if ( $prev === false || $prev === '' ) {
352 $prev = $trimmedKey;
353 // We don't yet have a collation element
354 // to compare against, so continue.
355 continue;
356 }
357
358 // Due to the fact the array is sorted, we only have
359 // to compare with the element directly previous
360 // to the current element (skipping expansions).
361 // An element "X" will always sort directly
362 // before "XZ" (Unless we have "XY", but we
363 // do not update $prev in that case).
364 if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
365 $duplicatePrefixes[] = $key;
366 // If this is an expansion, we don't want to
367 // compare the next element to this element,
368 // but to what is currently $prev
369 continue;
370 }
371 $prev = $trimmedKey;
372 }
373 foreach ( $duplicatePrefixes as $badKey ) {
374 wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
375 unset( $letterMap[$badKey] );
376 // This code assumes that unsetting does not change sort order.
377 }
378 $data = [
379 'chars' => array_values( $letterMap ),
380 'keys' => array_keys( $letterMap ),
381 ];
382
383 // Reduce memory usage before caching
384 unset( $letterMap );
385
386 return $data;
387 }
388
389 /**
390 * @since 1.16.3
391 */
392 public function getLetterByIndex( $index ) {
393 return $this->getFirstLetterData()['chars'][$index];
394 }
395
396 /**
397 * @since 1.16.3
398 */
399 public function getSortKeyByLetterIndex( $index ) {
400 return $this->getFirstLetterData()['keys'][$index];
401 }
402
403 /**
404 * @since 1.16.3
405 */
406 public function getFirstLetterCount() {
407 return count( $this->getFirstLetterData()['chars'] );
408 }
409
410 /**
411 * @since 1.16.3
412 */
413 public static function isCjk( $codepoint ) {
414 foreach ( self::$cjkBlocks as $block ) {
415 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
416 return true;
417 }
418 }
419 return false;
420 }
421
422 /**
423 * Return the version of ICU library used by PHP's intl extension,
424 * or false when the extension is not installed of the version
425 * can't be determined.
426 *
427 * The constant INTL_ICU_VERSION this function refers to isn't really
428 * documented. It is available since PHP 5.3.7 (see PHP bug 54561).
429 * This function will return false on older PHPs.
430 *
431 * @since 1.21
432 * @return string|bool
433 */
434 static function getICUVersion() {
435 return defined( 'INTL_ICU_VERSION' ) ? INTL_ICU_VERSION : false;
436 }
437
438 /**
439 * Return the version of Unicode appropriate for the version of ICU library
440 * currently in use, or false when it can't be determined.
441 *
442 * @since 1.21
443 * @return string|bool
444 */
445 static function getUnicodeVersionForICU() {
446 $icuVersion = IcuCollation::getICUVersion();
447 if ( !$icuVersion ) {
448 return false;
449 }
450
451 $versionPrefix = substr( $icuVersion, 0, 3 );
452 // Source: http://site.icu-project.org/download
453 $map = [
454 '57.' => '8.0',
455 '56.' => '8.0',
456 '55.' => '7.0',
457 '54.' => '7.0',
458 '53.' => '6.3',
459 '52.' => '6.3',
460 '51.' => '6.2',
461 '50.' => '6.2',
462 '49.' => '6.1',
463 '4.8' => '6.0',
464 '4.6' => '6.0',
465 '4.4' => '5.2',
466 '4.2' => '5.1',
467 '4.0' => '5.1',
468 '3.8' => '5.0',
469 '3.6' => '5.0',
470 '3.4' => '4.1',
471 ];
472
473 if ( isset( $map[$versionPrefix] ) ) {
474 return $map[$versionPrefix];
475 } else {
476 return false;
477 }
478 }
479 }