Add support for using ICU to perform normalization, which is much much faster than...
[lhc/web/wiklou.git] / includes / normal / UtfNormal.php
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19
20 /**
21 * Unicode normalization routines for working with UTF-8 strings.
22 * Currently assumes that input strings are valid UTF-8!
23 *
24 * Not as fast as I'd like, but should be usable for most purposes.
25 * UtfNormal::toNFC() will bail early if given ASCII text or text
26 * it can quickly determine is already normalized.
27 *
28 * All functions can be called static.
29 *
30 * See description of forms at http://www.unicode.org/reports/tr15/
31 *
32 * @package MediaWiki
33 */
34
35 /** */
36 require_once 'UtfNormalUtil.php';
37 require_once 'UtfNormalData.inc';
38
# The compatibility decomposition table is large, so it starts out unset
# and is only filled in when UtfNormal::NFKD() requires UtfNormalDataK.inc.
global $utfCompatibilityDecomp;
$utfCompatibilityDecomp = null;

# ----------------------------------------------------------------------
# Hangul arithmetic parameters (counts of leading, vowel and trailing
# jamo, and the number of syllables sharing one leading jamo).
# ----------------------------------------------------------------------
define( 'UNICODE_HANGUL_LCOUNT', 19 );
define( 'UNICODE_HANGUL_VCOUNT', 21 );
define( 'UNICODE_HANGUL_TCOUNT', 28 );
define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );

# ----------------------------------------------------------------------
# Each boundary below is defined twice: once as an integer code point
# (UNICODE_*) and once as its UTF-8 encoding (UTF8_*), so that encoded
# characters can be range-checked with plain string comparison.
# ----------------------------------------------------------------------

# Precomposed Hangul syllable block.
define( 'UNICODE_HANGUL_FIRST', 0xAC00 );
define( 'UTF8_HANGUL_FIRST',    codepointToUtf8( UNICODE_HANGUL_FIRST ) );
define( 'UNICODE_HANGUL_LAST',  0xD7A3 );
define( 'UTF8_HANGUL_LAST',     codepointToUtf8( UNICODE_HANGUL_LAST ) );

# Base code points of the leading, vowel and trailing jamo ranges.
define( 'UNICODE_HANGUL_LBASE', 0x1100 );
define( 'UTF8_HANGUL_LBASE',    codepointToUtf8( UNICODE_HANGUL_LBASE ) );
define( 'UNICODE_HANGUL_VBASE', 0x1161 );
define( 'UTF8_HANGUL_VBASE',    codepointToUtf8( UNICODE_HANGUL_VBASE ) );
define( 'UNICODE_HANGUL_TBASE', 0x11A7 );
define( 'UTF8_HANGUL_TBASE',    codepointToUtf8( UNICODE_HANGUL_TBASE ) );

# Last code point of each jamo range (base + count - 1).
define( 'UNICODE_HANGUL_LEND',  UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
define( 'UTF8_HANGUL_LEND',     codepointToUtf8( UNICODE_HANGUL_LEND ) );
define( 'UNICODE_HANGUL_VEND',  UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
define( 'UTF8_HANGUL_VEND',     codepointToUtf8( UNICODE_HANGUL_VEND ) );
define( 'UNICODE_HANGUL_TEND',  UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
define( 'UTF8_HANGUL_TEND',     codepointToUtf8( UNICODE_HANGUL_TEND ) );

# Surrogate range, highest code point, and the replacement character
# that UtfNormal::quickIsNFCVerify() writes in place of invalid input.
define( 'UNICODE_SURROGATE_FIRST', 0xD800 );
define( 'UTF8_SURROGATE_FIRST',    codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
define( 'UNICODE_SURROGATE_LAST',  0xDFFF );
define( 'UTF8_SURROGATE_LAST',     codepointToUtf8( UNICODE_SURROGATE_LAST ) );
define( 'UNICODE_MAX',             0x10FFFF );
define( 'UTF8_MAX',                codepointToUtf8( UNICODE_MAX ) );
define( 'UNICODE_REPLACEMENT',     0xFFFD );
define( 'UTF8_REPLACEMENT',        codepointToUtf8( UNICODE_REPLACEMENT ) );

# Upper bounds of the overlong 2-, 3- and 4-byte encodings; a sequence
# with the matching head byte that sorts at or below these is rejected.
define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );

# Two illegal ranges: U+FDD0..U+FDEF and U+FFFE..U+FFFF.
define( 'UTF8_FDD0', codepointToUtf8( 0xFDD0 ) );
define( 'UTF8_FDEF', codepointToUtf8( 0xFDEF ) );
define( 'UTF8_FFFE', codepointToUtf8( 0xFFFE ) );
define( 'UTF8_FFFF', codepointToUtf8( 0xFFFF ) );

# Scanner states used by UtfNormal::quickIsNFCVerify(): expecting a
# head byte, or expecting a continuation (tail) byte.
define( 'UTF8_HEAD', false );
define( 'UTF8_TAIL', true );

/**
 * Normalization mode numbers passed as the second argument of
 * utf8_normalize() when the ICU wrapper is in use.
 */
define( 'UNORM_NONE', 1 );
define( 'UNORM_NFD',  2 );
define( 'UNORM_NFKD', 3 );
define( 'UNORM_NFC',  4 );
define( 'UNORM_NFKC', 5 );
define( 'UNORM_FCD',  6 );
define( 'UNORM_DEFAULT', UNORM_NFC );

# True when the utf8_normalize() function is available; the toNF*()
# methods then delegate to it instead of the pure-PHP implementation.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
108
/**
 * Unicode normalization of UTF-8 strings into forms C, D, KC and KD.
 *
 * No method uses instance state; they are written to be invoked as
 * UtfNormal::method(). The lookup tables ($utfCheckNFC,
 * $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp,
 * $utfCompatibilityDecomp) are read from the global scope.
 *
 * @package MediaWiki
 */
class UtfNormal {
    /**
     * The ultimate convenience function! Clean up invalid UTF-8 sequences,
     * and convert to normal form C, canonical composition.
     *
     * Fast return for pure ASCII strings; some lesser optimizations for
     * strings containing only known-good characters. Not as fast as toNFC().
     *
     * @param string $string a UTF-8 string
     * @return string a clean, shiny, normalized UTF-8 string
     */
    function cleanUp( $string ) {
        # quickIsNFCVerify() rewrites $string in place with the validated
        # text, so both branches below work on the cleaned-up copy.
        if( UtfNormal::quickIsNFCVerify( $string ) )
            return $string;
        else
            return UtfNormal::NFC( $string );
    }

    /**
     * Convert a UTF-8 string to normal form C, canonical composition.
     * Delegates to utf8_normalize() when NORMALIZE_ICU is true; otherwise
     * returns early for strings quickIsNFC() accepts.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form C
     */
    function toNFC( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFC );
        elseif( UtfNormal::quickIsNFC( $string ) )
            return $string;
        else
            return UtfNormal::NFC( $string );
    }

    /**
     * Convert a UTF-8 string to normal form D, canonical decomposition.
     * Delegates to utf8_normalize() when NORMALIZE_ICU is true; otherwise
     * pure ASCII strings are returned unchanged.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form D
     */
    function toNFD( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFD );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return UtfNormal::NFD( $string );
        else
            return $string;
    }

    /**
     * Convert a UTF-8 string to normal form KC, compatibility composition.
     * This may cause irreversible information loss, use judiciously.
     * Delegates to utf8_normalize() when NORMALIZE_ICU is true; otherwise
     * pure ASCII strings are returned unchanged.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form KC
     */
    function toNFKC( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFKC );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return UtfNormal::NFKC( $string );
        else
            return $string;
    }

    /**
     * Convert a UTF-8 string to normal form KD, compatibility decomposition.
     * This may cause irreversible information loss, use judiciously.
     * Delegates to utf8_normalize() when NORMALIZE_ICU is true; otherwise
     * pure ASCII strings are returned unchanged.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form KD
     */
    function toNFKD( $string ) {
        if( NORMALIZE_ICU )
            return utf8_normalize( $string, UNORM_NFKD );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return UtfNormal::NFKD( $string );
        else
            return $string;
    }

    /**
     * Returns true if the string is _definitely_ in NFC.
     * Returns false if not or uncertain.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return bool
     */
    function quickIsNFC( $string ) {
        # ASCII is always valid NFC!
        # If it's pure ASCII, let it through.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        global $utfCheckNFC, $utfCombiningClass;
        $len = strlen( $string );
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                continue;
            } elseif( $n >= 0xf0 ) {
                # Head byte announces the sequence length; grab the whole
                # character and skip over its tail bytes.
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( $utfCheckNFC[$c] ) ) {
                # If it's NO or MAYBE, bail and do the slow check.
                return false;
            }
            if( isset( $utfCombiningClass[$c] ) ) {
                # Combining character? We might have to do sorting, at least.
                return false;
            }
        }
        return true;
    }

    /**
     * Validates a string as UTF-8 while checking whether it is already
     * in NFC. Returns true if the string is _definitely_ in NFC, false
     * if not or uncertain.
     *
     * Pure ASCII input is returned untouched. Otherwise the string is
     * rebuilt: invalid sequences, surrogates, overlong forms, the illegal
     * ranges U+FDD0..U+FDEF and U+FFFE..U+FFFF, anything above UTF8_MAX
     * and control characters other than tab, line feed and carriage
     * return are replaced by UTF8_REPLACEMENT; carriage returns are
     * dropped.
     *
     * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
     * @return bool
     */
    function quickIsNFCVerify( &$string ) {
        # ASCII is always valid NFC!
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        global $utfCheckNFC, $utfCombiningClass;
        $len = strlen( $string );
        $out = '';
        $state = UTF8_HEAD;
        $looksNormal = true;

        $sequence = '';  # bytes of the multi-byte character being collected
        $remaining = 0;  # tail bytes still expected for $sequence
        $head = 0;       # head byte value of the sequence in progress
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $state == UTF8_TAIL ) {
                if( $n >= 0x80 && $n < 0xc0 ) {
                    $sequence .= $c;
                    if( --$remaining == 0 ) {
                        # Sequence complete. UTF-8 byte order matches code
                        # point order, so ranges are checked as strings.
                        if( ($sequence >= UTF8_SURROGATE_FIRST
                                && $sequence <= UTF8_SURROGATE_LAST)
                            || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
                            || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
                            || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
                            || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
                            || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
                            || ($sequence == UTF8_FFFE)
                            || ($sequence == UTF8_FFFF)
                            || ($sequence > UTF8_MAX) ) {
                            $out .= UTF8_REPLACEMENT;
                            $state = UTF8_HEAD;
                            continue;
                        }
                        if( isset( $utfCheckNFC[$sequence] ) ||
                            isset( $utfCombiningClass[$sequence] ) ) {
                            # If it's NO or MAYBE, we'll have to do the slow check.
                            $looksNormal = false;
                        }
                        $out .= $sequence;
                        $state = UTF8_HEAD;
                        $head = 0;
                    }
                    continue;
                }
                # Not a valid tail byte! Discard the char we've been building
                # and fall through to treat the current byte as a fresh one.
                $state = UTF8_HEAD;
                $out .= UTF8_REPLACEMENT;
            }
            if( $n < 0x09 ) {
                $out .= UTF8_REPLACEMENT;
            } elseif( $n == 0x09 || $n == 0x0a ) {
                # Tab and line feed are legal XML characters; keep them.
                $out .= $c;
            } elseif( $n < 0x0d ) {
                $out .= UTF8_REPLACEMENT;
            } elseif( $n == 0x0d ) {
                # Strip \r silently
            } elseif( $n < 0x20 ) {
                $out .= UTF8_REPLACEMENT;
            } elseif( $n < 0x80 ) {
                $out .= $c;
            } elseif( $n < 0xc0 ) {
                # illegal tail bytes or head byte of overlong sequence
                if( $head == 0 ) $out .= UTF8_REPLACEMENT;
            } elseif( $n < 0xe0 ) {
                $state = UTF8_TAIL;
                $remaining = 1;
                $sequence = $c;
                $head = $n;
            } elseif( $n < 0xf0 ) {
                $state = UTF8_TAIL;
                $remaining = 2;
                $sequence = $c;
                $head = $n;
            } elseif( $n < 0xf8 ) {
                $state = UTF8_TAIL;
                $remaining = 3;
                $sequence = $c;
                $head = $n;
            } elseif( $n < 0xfc ) {
                # 5- and 6-byte forms are collected in full so the whole
                # sequence is rejected at once by the UTF8_MAX check above.
                $state = UTF8_TAIL;
                $remaining = 4;
                $sequence = $c;
                $head = $n;
            } elseif( $n < 0xfe ) {
                $state = UTF8_TAIL;
                $remaining = 5;
                $sequence = $c;
                $head = $n;
            } else {
                $out .= UTF8_REPLACEMENT;
            }
        }
        if( $state == UTF8_TAIL ) {
            # Input ended in the middle of a multi-byte sequence.
            $out .= UTF8_REPLACEMENT;
        }
        $string = $out;
        return $looksNormal;
    }

    # These take a string and run the normalization on them, without
    # checking for validity or any optimization etc. Input must be
    # VALID UTF-8!
    /**
     * Decompose canonically, then recompose.
     *
     * @param string $string
     * @return string
     * @access private
     */
    function NFC( $string ) {
        return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
    }

    /**
     * Decompose with the canonical map, then sort combining characters.
     *
     * @param string $string
     * @return string
     * @access private
     */
    function NFD( $string ) {
        global $utfCanonicalDecomp;
        return UtfNormal::fastCombiningSort(
            UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
    }

    /**
     * Decompose with the compatibility map, then recompose.
     *
     * @param string $string
     * @return string
     * @access private
     */
    function NFKC( $string ) {
        return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
    }

    /**
     * Decompose with the compatibility map, then sort combining
     * characters. The map is loaded from UtfNormalDataK.inc the first
     * time it is needed.
     *
     * @param string $string
     * @return string
     * @access private
     */
    function NFKD( $string ) {
        global $utfCompatibilityDecomp;
        if( !isset( $utfCompatibilityDecomp ) ) {
            require_once( 'UtfNormalDataK.inc' );
        }
        return UtfNormal::fastCombiningSort(
            UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
    }


    /**
     * Perform decomposition of a UTF-8 string into either D or KD form
     * (depending on which decomposition map is passed to us).
     * Input is assumed to be *valid* UTF-8. Invalid code will break.
     *
     * @access private
     * @param string &$string Valid UTF-8 string
     * @param array &$map hash of expanded decomposition map
     * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
     */
    function fastDecompose( &$string, &$map ) {
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # ASCII chars never decompose
                # THEY ARE IMMORTAL
                $out .= $c;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( $map[$c] ) ) {
                $out .= $map[$c];
            } else {
                # Hangul syllables are decomposed arithmetically rather
                # than through the map.
                if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
                    $out .= UtfNormal::decomposeHangul( $c );
                } else {
                    $out .= $c;
                }
            }
        }
        return $out;
    }

    /**
     * Decompose a Hangul syllable character into its constituent jamo.
     *
     * @access private
     * @param string $c UTF-8 encoding of a single precomposed Hangul syllable
     * @return string a UTF-8 string containing a sequence of jamo
     */
    function decomposeHangul( $c ) {
        $codepoint = utf8ToCodepoint( $c );
        $index = $codepoint - UNICODE_HANGUL_FIRST;
        $l = intval( $index / UNICODE_HANGUL_NCOUNT );
        $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT );
        $t = $index % UNICODE_HANGUL_TCOUNT;
        $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE );
        $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE );
        # Trailing index 0 means the syllable has no trailing consonant.
        if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE );
        return $out;
    }

    /**
     * Sorts combining characters into canonical order. This is the
     * final step in creating decomposed normal forms D and KD.
     *
     * @access private
     * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
     * @return string a UTF-8 string with combining characters sorted in canonical order
     */
    function fastCombiningSort( $string ) {
        global $utfCombiningClass;
        # Repeat full passes over the string until one completes
        # without swapping anything.
        $replacedCount = 1;
        while( $replacedCount > 0 ) {
            $replacedCount = 0;
            $len = strlen( $string );
            $out = '';
            $lastClass = -1;
            $lastChar = '';
            for( $i = 0; $i < $len; $i++ ) {
                $c = $string[$i];
                $n = ord( $c );
                if( $n >= 0xf0 ) {
                    $c = substr( $string, $i, 4 );
                    $i += 3;
                } elseif( $n >= 0xe0 ) {
                    $c = substr( $string, $i, 3 );
                    $i += 2;
                } elseif( $n >= 0xc0 ) {
                    $c = substr( $string, $i, 2 );
                    $i++;
                }
                $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
                if( $lastClass == -1 ) {
                    # First one
                    $lastChar = $c;
                    $lastClass = $class;
                } elseif( $lastClass > $class && $class > 0 ) {
                    # Swap -- emit this one ahead of the held character,
                    # which stays pending for the next comparison.
                    $out .= $c;
                    $replacedCount++;
                } else {
                    $out .= $lastChar;
                    $lastChar = $c;
                    $lastClass = $class;
                }
            }
            $out .= $lastChar;
            $string = $out;
        }
        return $string;
    }

    /**
     * Produces canonically composed sequences, i.e. normal form C or KC.
     *
     * @access private
     * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
     * @return string a UTF-8 string with canonical precomposed characters used where possible
     */
    function fastCompose( $string ) {
        global $utfCanonicalComp, $utfCombiningClass;
        $len = strlen( $string );
        $out = '';
        $lastClass = -1;
        $startChar = '';  # current starter, held back until it is final
        $combining = '';  # combining marks that did not compose with it
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
            $pair = $startChar . $c;
            if( empty( $utfCombiningClass[$c] ) ) {
                # New start char
                if( $lastClass == 0 && isset( $utfCanonicalComp[$pair] ) ) {
                    $startChar = $utfCanonicalComp[$pair];
                } elseif( $lastClass == 0 &&
                          $c >= UTF8_HANGUL_VBASE &&
                          $c <= UTF8_HANGUL_VEND &&
                          $startChar >= UTF8_HANGUL_LBASE &&
                          $startChar <= UTF8_HANGUL_LEND ) {
                    # Leading + vowel jamo -> LV syllable
                    $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                    $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                    $hangulPoint = UNICODE_HANGUL_FIRST +
                        UNICODE_HANGUL_TCOUNT *
                        (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
                    $startChar = codepointToUtf8( $hangulPoint );
                } elseif( $lastClass == 0 &&
                          $c > UTF8_HANGUL_TBASE &&
                          $c <= UTF8_HANGUL_TEND &&
                          $startChar >= UTF8_HANGUL_FIRST &&
                          $startChar <= UTF8_HANGUL_LAST &&
                          ( utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_FIRST )
                              % UNICODE_HANGUL_TCOUNT == 0 ) {
                    # LV syllable + trailing jamo -> LVT syllable.
                    # TBASE itself is excluded (its trailing index is 0, so
                    # "composing" it would just delete the character), and
                    # the start must be an LV syllable: one that already
                    # carries a trailing consonant cannot take another.
                    $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                    $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex;
                    $startChar = codepointToUtf8( $hangulPoint );
                } else {
                    # Nothing to merge: flush what we have been holding
                    # and start over with this character.
                    $out .= $startChar;
                    $out .= $combining;
                    $startChar = $c;
                    $combining = '';
                }
            } else {
                # A combining char; see what we can do with it.
                # Compared against '' explicitly because empty() would
                # also treat the starter "0" as missing.
                if( $startChar !== '' &&
                    $lastClass < $class &&
                    $class > 0 &&
                    isset( $utfCanonicalComp[$pair] ) ) {
                    $startChar = $utfCanonicalComp[$pair];
                    $class = 0;
                } else {
                    $combining .= $c;
                }
            }
            $lastClass = $class;
        }
        $out .= $startChar . $combining;
        return $out;
    }

    /**
     * This is just used for the benchmark, comparing how long it takes to
     * iterate through a string without really doing anything of substance.
     *
     * @param string $string
     * @return string
     */
    function placebo( $string ) {
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $out .= $string[$i];
        }
        return $out;
    }
}
593
594 ?>