includes/normal/UtfNormal.php

   1 <?php
   2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
   3 # http://www.mediawiki.org/
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18 # http://www.gnu.org/copyleft/gpl.html
  19
  20 /**
  21  * Unicode normalization routines for working with UTF-8 strings.
  22  * Currently assumes that input strings are valid UTF-8!
  23  *
  24  * Not as fast as I'd like, but should be usable for most purposes.
  25  * UtfNormal::toNFC() will bail early if given ASCII text or text
   26  * it can quickly determine is already normalized.
  27  *
  28  * All functions can be called static.
  29  *
  30  * See description of forms at http://www.unicode.org/reports/tr15/
  31  *
  32  * @package MediaWiki
  33  */
  34
  35 /** */
  36 require_once 'UtfNormalUtil.php';
  37 require_once 'UtfNormalData.inc';
  38
  39 # Load compatibility decompositions on demand if they are needed.
  40 global $utfCompatibilityDecomp;
  41 $utfCompatibilityDecomp = NULL;
  42
  43 define( 'UNICODE_HANGUL_FIRST', 0xac00 );
  44 define( 'UNICODE_HANGUL_LAST',  0xd7a3 );
  45
  46 define( 'UNICODE_HANGUL_LBASE', 0x1100 );
  47 define( 'UNICODE_HANGUL_VBASE', 0x1161 );
  48 define( 'UNICODE_HANGUL_TBASE', 0x11a7 );
  49
  50 define( 'UNICODE_HANGUL_LCOUNT', 19 );
  51 define( 'UNICODE_HANGUL_VCOUNT', 21 );
  52 define( 'UNICODE_HANGUL_TCOUNT', 28 );
  53 define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );
  54
  55 define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
  56 define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
  57 define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
  58
  59 define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
  60 define( 'UNICODE_SURROGATE_LAST', 0xdfff );
  61 define( 'UNICODE_MAX', 0x10ffff );
  62 define( 'UNICODE_REPLACEMENT', 0xfffd );
  63
  64
  65 define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
  66 define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
  67
  68 define( 'UTF8_HANGUL_LBASE', codepointToUtf8( UNICODE_HANGUL_LBASE ) );
  69 define( 'UTF8_HANGUL_VBASE', codepointToUtf8( UNICODE_HANGUL_VBASE ) );
  70 define( 'UTF8_HANGUL_TBASE', codepointToUtf8( UNICODE_HANGUL_TBASE ) );
  71
  72 define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
  73 define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
  74 define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
  75
  76 define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
  77 define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
  78 define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
  79 define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
  80 #define( 'UTF8_REPLACEMENT', '!' );
  81
  82 define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
  83 define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
  84 define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
  85
  86 # These two ranges are illegal
  87 define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
  88 define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
  89 define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
  90 define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
  91
  92 define( 'UTF8_HEAD', false );
  93 define( 'UTF8_TAIL', true );
  94
  95
  96 /**
  97  * For using the ICU wrapper
  98  */
  99 define( 'UNORM_NONE', 1 );
 100 define( 'UNORM_NFD',  2 );
 101 define( 'UNORM_NFKD', 3 );
 102 define( 'UNORM_NFC',  4 );
 103 define( 'UNORM_DEFAULT', UNORM_NFC );
 104 define( 'UNORM_NFKC', 5 );
 105 define( 'UNORM_FCD',  6 );
 106
 107 define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
 108
 109 /**
 110  *
 111  * @package MediaWiki
 112  */
 113 class UtfNormal {
 114         /**
 115          * The ultimate convenience function! Clean up invalid UTF-8 sequences,
 116          * and convert to normal form C, canonical composition.
 117          *
 118          * Fast return for pure ASCII strings; some lesser optimizations for
 119          * strings containing only known-good characters. Not as fast as toNFC().
 120          *
 121          * @param string $string a UTF-8 string
 122          * @return string a clean, shiny, normalized UTF-8 string
 123          */
 124         function cleanUp( $string ) {
 125                 if( UtfNormal::quickIsNFCVerify( $string ) )
 126                         return $string;
 127                 else
 128                         return UtfNormal::NFC( $string );
 129         }
 130
 131         /**
 132          * Convert a UTF-8 string to normal form C, canonical composition.
 133          * Fast return for pure ASCII strings; some lesser optimizations for
 134          * strings containing only known-good characters.
 135          *
 136          * @param string $string a valid UTF-8 string. Input is not validated.
 137          * @return string a UTF-8 string in normal form C
 138          */
 139         function toNFC( $string ) {
 140                 if( NORMALIZE_ICU )
 141                         return utf8_normalize( $string, UNORM_NFC );
 142                 elseif( UtfNormal::quickIsNFC( $string ) )
 143                         return $string;
 144                 else
 145                         return UtfNormal::NFC( $string );
 146         }
 147
 148         /**
 149          * Convert a UTF-8 string to normal form D, canonical decomposition.
 150          * Fast return for pure ASCII strings.
 151          *
 152          * @param string $string a valid UTF-8 string. Input is not validated.
 153          * @return string a UTF-8 string in normal form D
 154          */
 155         function toNFD( $string ) {
 156                 if( NORMALIZE_ICU )
 157                         return utf8_normalize( $string, UNORM_NFD );
 158                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
 159                         return UtfNormal::NFD( $string );
 160                 else
 161                         return $string;
 162         }
 163
 164         /**
 165          * Convert a UTF-8 string to normal form KC, compatibility composition.
 166          * This may cause irreversible information loss, use judiciously.
 167          * Fast return for pure ASCII strings.
 168          *
 169          * @param string $string a valid UTF-8 string. Input is not validated.
 170          * @return string a UTF-8 string in normal form KC
 171          */
 172         function toNFKC( $string ) {
 173                 if( NORMALIZE_ICU )
 174                         return utf8_normalize( $string, UNORM_NFKC );
 175                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
 176                         return UtfNormal::NFKC( $string );
 177                 else
 178                         return $string;
 179         }
 180
 181         /**
 182          * Convert a UTF-8 string to normal form KD, compatibility decomposition.
 183          * This may cause irreversible information loss, use judiciously.
 184          * Fast return for pure ASCII strings.
 185          *
 186          * @param string $string a valid UTF-8 string. Input is not validated.
 187          * @return string a UTF-8 string in normal form KD
 188          */
 189         function toNFKD( $string ) {
 190                 if( NORMALIZE_ICU )
 191                         return utf8_normalize( $string, UNORM_NFKD );
 192                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
 193                         return UtfNormal::NFKD( $string );
 194                 else
 195                         return $string;
 196         }
 197
 198         /**
 199          * Returns true if the string is _definitely_ in NFC.
 200          * Returns false if not or uncertain.
 201          * @param string $string a valid UTF-8 string. Input is not validated.
 202          * @return bool
 203          */
 204         function quickIsNFC( $string ) {
 205                 # ASCII is always valid NFC!
 206                 # If it's pure ASCII, let it through.
 207                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 208
 209                 global $utfCheckNFC, $utfCombiningClass;
 210                 $len = strlen( $string );
 211                 for( $i = 0; $i < $len; $i++ ) {
 212                         $c = $string{$i};
 213                         $n = ord( $c );
 214                         if( $n < 0x80 ) {
 215                                 continue;
 216                         } elseif( $n >= 0xf0 ) {
 217                                 $c = substr( $string, $i, 4 );
 218                                 $i += 3;
 219                         } elseif( $n >= 0xe0 ) {
 220                                 $c = substr( $string, $i, 3 );
 221                                 $i += 2;
 222                         } elseif( $n >= 0xc0 ) {
 223                                 $c = substr( $string, $i, 2 );
 224                                 $i++;
 225                         }
 226                         if( isset( $utfCheckNFC[$c] ) ) {
 227                                 # If it's NO or MAYBE, bail and do the slow check.
 228                                 return false;
 229                         }
 230                         if( isset( $utfCombiningClass[$c] ) ) {
 231                                 # Combining character? We might have to do sorting, at least.
 232                                 return false;
 233                         }
 234                 }
 235                 return true;
 236         }
 237
 238         /**
 239          * Returns true if the string is _definitely_ in NFC.
 240          * Returns false if not or uncertain.
 241          * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 242          * @return bool
 243          */
 244         function quickIsNFCVerify( &$string ) {
 245                 # ASCII is always valid NFC!
 246                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 247
 248                 global $utfCheckNFC, $utfCombiningClass;
 249                 $len = strlen( $string );
 250                 $out = '';
 251                 $state = UTF8_HEAD;
 252                 $looksNormal = true;
 253
 254                 $rep = false;
 255                 $head = 0;
 256                 for( $i = 0; $i < $len; $i++ ) {
 257                         $c = $string{$i};
 258                         $n = ord( $c );
 259                         if( $state == UTF8_TAIL ) {
 260                                 if( $n >= 0x80 && $n < 0xc0 ) {
 261                                         $sequence .= $c;
 262                                         if( --$remaining == 0 ) {
 263                                                 if( ($sequence >= UTF8_SURROGATE_FIRST
 264                                                                 && $sequence <= UTF8_SURROGATE_LAST)
 265                                                         || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
 266                                                         || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
 267                                                         || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
 268                                                         || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
 269                                                         || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
 270                                                         || ($sequence == UTF8_FFFE)
 271                                                         || ($sequence == UTF8_FFFF)
 272                                                         || ($sequence > UTF8_MAX) ) {
 273                                                         $out .= UTF8_REPLACEMENT;
 274                                                         $state = UTF8_HEAD;
 275                                                         continue;
 276                                                 }
 277                                                 if( isset( $utfCheckNFC[$sequence] ) ||
 278                                                         isset( $utfCombiningClass[$sequence] ) ) {
 279                                                         # If it's NO or MAYBE, we'll have to do the slow check.
 280                                                         $looksNormal = false;
 281                                                 }
 282                                                 $out .= $sequence;
 283                                                 $state = UTF8_HEAD;
 284                                                 $head = 0;
 285                                         }
 286                                         continue;
 287                                 }
 288                                 # Not a valid tail byte! DIscard the char we've been building.
 289                                 #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 290                                 $state = UTF8_HEAD;
 291                                 $out .= UTF8_REPLACEMENT;
 292                         }
 293                         if( $n < 0x09 ) {
 294                                 $out .= UTF8_REPLACEMENT;
 295                         } elseif( $n == 0x0a ) {
 296                                 $out .= $c;
 297                         } elseif( $n < 0x0d ) {
 298                                 $out .= UTF8_REPLACEMENT;
 299                         } elseif( $n == 0x0d ) {
 300                                 # Strip \r silently
 301                         } elseif( $n < 0x20 ) {
 302                                 $out .= UTF8_REPLACEMENT;
 303                         } elseif( $n < 0x80 ) {
 304                                 $out .= $c;
 305                         } elseif( $n < 0xc0 ) {
 306                                 # illegal tail bytes or head byte of overlong sequence
 307                                 if( $head == 0 ) $out .= UTF8_REPLACEMENT;
 308                         } elseif( $n < 0xe0 ) {
 309                                 $state = UTF8_TAIL;
 310                                 $remaining = 1;
 311                                 $sequence = $c;
 312                                 $head = $n;
 313                         } elseif( $n < 0xf0 ) {
 314                                 $state = UTF8_TAIL;
 315                                 $remaining = 2;
 316                                 $sequence = $c;
 317                                 $head = $n;
 318                         } elseif( $n < 0xf8 ) {
 319                                 $state = UTF8_TAIL;
 320                                 $remaining = 3;
 321                                 $sequence = $c;
 322                                 $head = $n;
 323                         } elseif( $n < 0xfc ) {
 324                                 $state = UTF8_TAIL;
 325                                 $remaining = 4;
 326                                 $sequence = $c;
 327                                 $head = $n;
 328                         } elseif( $n < 0xfe ) {
 329                                 $state = UTF8_TAIL;
 330                                 $remaining = 5;
 331                                 $sequence = $c;
 332                                 $head = $n;
 333                         } else {
 334                                 $out .= UTF8_REPLACEMENT;
 335                         }
 336                 }
 337                 if( $state == UTF8_TAIL ) {
 338                         $out .= UTF8_REPLACEMENT;
 339                 }
 340                 $string = $out;
 341                 return $looksNormal;
 342         }
 343
 344         # These take a string and run the normalization on them, without
 345         # checking for validity or any optimization etc. Input must be
 346         # VALID UTF-8!
 347         /**
 348          * @param string $string
 349          * @return string
 350          * @access private
 351          */
 352         function NFC( $string ) {
 353                 return $out = UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
 354         }
 355
 356         /**
 357          * @param string $string
 358          * @return string
 359          * @access private
 360          */
 361         function NFD( $string ) {
 362                 global $utfCanonicalDecomp;
 363                 return UtfNormal::fastCombiningSort(
 364                         UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
 365         }
 366
 367         /**
 368          * @param string $string
 369          * @return string
 370          * @access private
 371          */
 372         function NFKC( $string ) {
 373                 return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
 374         }
 375
 376         /**
 377          * @param string $string
 378          * @return string
 379          * @access private
 380          */
 381         function NFKD( $string ) {
 382                 global $utfCompatibilityDecomp;
 383                 if( !isset( $utfCompatibilityDecomp ) ) {
 384                         require_once( 'UtfNormalDataK.inc' );
 385                 }
 386                 return UtfNormal::fastCombiningSort(
 387                         UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
 388         }
 389
 390
 391         /**
 392          * Perform decomposition of a UTF-8 string into either D or KD form
 393          * (depending on which decomposition map is passed to us).
 394          * Input is assumed to be *valid* UTF-8. Invalid code will break.
 395          * @access private
 396          * @param string &$string Valid UTF-8 string
 397          * @param array &$map hash of expanded decomposition map
 398          * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 399          */
 400         function fastDecompose( &$string, &$map ) {
 401                 $len = strlen( $string );
 402                 $out = '';
 403                 for( $i = 0; $i < $len; $i++ ) {
 404                         $c = $string{$i};
 405                         $n = ord( $c );
 406                         if( $n < 0x80 ) {
 407                                 # ASCII chars never decompose
 408                                 # THEY ARE IMMORTAL
 409                                 $out .= $c;
 410                                 continue;
 411                         } elseif( $n >= 0xf0 ) {
 412                                 $c = substr( $string, $i, 4 );
 413                                 $i += 3;
 414                         } elseif( $n >= 0xe0 ) {
 415                                 $c = substr( $string, $i, 3 );
 416                                 $i += 2;
 417                         } elseif( $n >= 0xc0 ) {
 418                                 $c = substr( $string, $i, 2 );
 419                                 $i++;
 420                         }
 421                         if( isset( $map[$c] ) ) {
 422                                 $out .= $map[$c];
 423                         } else {
 424                                 if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
 425                                         $out .= UtfNormal::decomposeHangul( $c );
 426                                 } else {
 427                                         $out .= $c;
 428                                 }
 429                         }
 430                 }
 431                 return $out;
 432         }
 433
 434         /**
 435          * Decompose a Hangul syllable character into its constituent jamo.
 436          * @access private
 437          * @param int $c Unicode code point of the character
 438          * @return string a UTF-8 string containing a sequence of jamo
 439          */
 440         function decomposeHangul( $c ) {
 441                 $codepoint = utf8ToCodepoint( $c );
 442                 $index = $codepoint - UNICODE_HANGUL_FIRST;
 443                 $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
 444                 $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
 445                 $t = $index % UNICODE_HANGUL_TCOUNT;
 446                 $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE );
 447                 $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE );
 448                 if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE );
 449                 return $out;
 450         }
 451
 452         /**
 453          * Sorts combining characters into canonical order. This is the
 454          * final step in creating decomposed normal forms D and KD.
 455          * @access private
 456          * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 457          * @return string a UTF-8 string with combining characters sorted in canonical order
 458          */
 459         function fastCombiningSort( $string ) {
 460                 global $utfCombiningClass;
 461                 $replacedCount = 1;
 462                 while( $replacedCount > 0 ) {
 463                         $replacedCount = 0;
 464                         $len = strlen( $string );
 465                         $out = '';
 466                         $lastClass = -1;
 467                         $lastChar = '';
 468                         for( $i = 0; $i < $len; $i++ ) {
 469                                 $c = $string{$i};
 470                                 $n = ord( $c );
 471                                 if( $n >= 0xf0 ) {
 472                                         $c = substr( $string, $i, 4 );
 473                                         $i += 3;
 474                                 } elseif( $n >= 0xe0 ) {
 475                                         $c = substr( $string, $i, 3 );
 476                                         $i += 2;
 477                                 } elseif( $n >= 0xc0 ) {
 478                                         $c = substr( $string, $i, 2 );
 479                                         $i++;
 480                                 }
 481                                 $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 482                                 if( $lastClass == -1 ) {
 483                                         # First one
 484                                         $lastChar = $c;
 485                                         $lastClass = $class;
 486                                 } elseif( $lastClass > $class && $class > 0 ) {
 487                                         # Swap -- put this one on the stack
 488                                         $out .= $c;
 489                                         $replacedCount++;
 490                                 } else {
 491                                         $out .= $lastChar;
 492                                         $lastChar = $c;
 493                                         $lastClass = $class;
 494                                 }
 495                         }
 496                         $out .= $lastChar;
 497                         $string = $out;
 498                 }
 499                 return $string;
 500         }
 501
 502         /**
 503          * Produces canonically composed sequences, i.e. normal form C or KC.
 504          *
 505          * @access private
 506          * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 507          * @return string a UTF-8 string with canonical precomposed characters used where possible
 508          */
 509         function fastCompose( $string ) {
 510                 global $utfCanonicalComp, $utfCombiningClass;
 511                 $len = strlen( $string );
 512                 $out = '';
 513                 $lastClass = -1;
 514                 $startChar = '';
 515                 $combining = '';
 516                 for( $i = 0; $i < $len; $i++ ) {
 517                         $c = $string{$i};
 518                         $n = ord( $c );
 519                         if( $n >= 0xf0 ) {
 520                                 $c = substr( $string, $i, 4 );
 521                                 $i += 3;
 522                         } elseif( $n >= 0xe0 ) {
 523                                 $c = substr( $string, $i, 3 );
 524                                 $i += 2;
 525                         } elseif( $n >= 0xc0 ) {
 526                                 $c = substr( $string, $i, 2 );
 527                                 $i++;
 528                         }
 529                         $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 530                         $pair = $startChar . $c;
 531                         if( empty( $utfCombiningClass[$c] ) ) {
 532                                 # New start char
 533                                 if( $lastClass == 0 && isset( $utfCanonicalComp[$pair] ) ) {
 534                                         $startChar = $utfCanonicalComp[$pair];
 535                                 } elseif( $lastClass == 0 &&
 536                                           $c >= UTF8_HANGUL_VBASE &&
 537                                           $c <= UTF8_HANGUL_VEND &&
 538                                           $startChar >= UTF8_HANGUL_LBASE &&
 539                                           $startChar <= UTF8_HANGUL_LEND ) {
 540                                         $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
 541                                         $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
 542                                         $hangulPoint = UNICODE_HANGUL_FIRST +
 543                                                 UNICODE_HANGUL_TCOUNT *
 544                                                 (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
 545                                         $startChar = codepointToUtf8( $hangulPoint );
 546                                 } elseif( $lastClass == 0 &&
 547                                           $c >= UTF8_HANGUL_TBASE &&
 548                                           $c <= UTF8_HANGUL_TEND &&
 549                                           $startChar >= UTF8_HANGUL_FIRST &&
 550                                           $startChar <= UTF8_HANGUL_LAST ) {
 551                                         $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
 552                                         $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex;
 553                                         $startChar = codepointToUtf8( $hangulPoint );
 554                                 } else {
 555                                         $out .= $startChar;
 556                                         $out .= $combining;
 557                                         $startChar = $c;
 558                                         $combining = '';
 559                                 }
 560                         } else {
 561                                 # A combining char; see what we can do with it
 562                                 if( !empty( $startChar ) &&
 563                                         $lastClass < $class &&
 564                                         $class > 0 &&
 565                                         isset( $utfCanonicalComp[$pair] ) ) {
 566                                         $startChar = $utfCanonicalComp[$pair];
 567                                         $class = 0;
 568                                 } else {
 569                                         $combining .= $c;
 570                                 }
 571                         }
 572                         $lastClass = $class;
 573                 }
 574                 $out .= $startChar . $combining;
 575                 return $out;
 576         }
 577
 578         /**
 579          * This is just used for the benchmark, comparing how long it takes to
 580          * interate through a string without really doing anything of substance.
 581          * @param string $string
 582          * @return string
 583          */
 584         function placebo( $string ) {
 585                 $len = strlen( $string );
 586                 $out = '';
 587                 for( $i = 0; $i < $len; $i++ ) {
 588                         $out .= $string{$i};
 589                 }
 590                 return $out;
 591         }
 592 }
 593
 594 ?>