Follow-up I5b02aa914916f64492c85ce6dcc3272b6406551a (#4335), also put a link in the...
[lhc/web/wiklou.git] / includes / normal / UtfNormal.php
index b590169..2b400e5 100644 (file)
 <?php
 /**
-* Unicode normalization routines
-*
-* Copyright (C) 2006 Ludovic ARNAUD <ludovic.arnaud@gmail.com>
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License along
-* with this program; if not, write to the Free Software Foundation, Inc.,
-* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-* http://www.gnu.org/copyleft/gpl.html
-*
-* @author      Ludovic ARNAUD <ludovic.arnaud@gmail.com>
-* @license     http://www.gnu.org/licenses/gpl.txt
-* @package     UtfNormal
-*/
+ * Unicode normalization routines
+ *
+ * Copyright © 2004 Brion Vibber <brion@pobox.com>
+ * http://www.mediawiki.org/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup UtfNormal
+ */
 
-require_once ('UtfNormalDefines.php');
-
-if( function_exists( 'utf8_normalize' ) ) {
+/**
+ * @defgroup UtfNormal UtfNormal
+ */
 
-////////////////////////////////////////////////////////////////////////////////
-//              Wrapper for the utfnormal extension, ICU wrapper              //
-////////////////////////////////////////////////////////////////////////////////
+define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
 
 /**
-* UtfNormal class for the utfnormal extension
-*
-* @ignore
-*/
+ * Unicode normalization routines for working with UTF-8 strings.
+ * Currently assumes that input strings are valid UTF-8!
+ *
+ * Not as fast as I'd like, but should be usable for most purposes.
+ * UtfNormal::toNFC() will bail early if given ASCII text or text
+ * it can quickly deterimine is already normalized.
+ *
+ * All functions can be called static.
+ *
+ * See description of forms at http://www.unicode.org/reports/tr15/
+ *
+ * @ingroup UtfNormal
+ */
 class UtfNormal {
-       function cleanUp( $str ) {
-               /**
-               * The string below is the list of all autorized characters, sorted by
-               * frequency in latin text
-               */
-               $pos = strspn(
-                       $str,
-                       "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D"
-               );
+       /**
+        * For using the ICU wrapper
+        */
+       const UNORM_NONE = 1;
+       const UNORM_NFD  = 2;
+       const UNORM_NFKD = 3;
+       const UNORM_NFC  = 4;
+       const UNORM_NFKC = 5;
+       const UNORM_FCD  = 6;
+       const UNORM_DEFAULT = self::UNORM_NFC;
 
-               if( !isset( $str[$pos] ) ) {
-                       /**
-                       * ASCII strings with no special chars return immediately
-                       */
-                       return $str;
-               }
+       static $utfCombiningClass = null;
+       static $utfCanonicalComp = null;
+       static $utfCanonicalDecomp = null;
 
-               /**
-               * Check if there is potentially a 0xFFFE or 0xFFFF char (UTF sequence
-               * 0xEFBFBE or 0xEFBFBF) and replace them
-               *
-               * Note: we start searching at position $pos
-               */
-               if( is_int( strpos( $str, "\xEF\xBF", $pos ) ) ) {
-                       $str = str_replace(
-                               array( "\xEF\xBF\xBE", "\xEF\xBF\xBF" ),
-                               array( UTF8_REPLACEMENT, UTF8_REPLACEMENT ),
-                               $str
-                       );
-               }
+       # Load compatibility decompositions on demand if they are needed.
+       static $utfCompatibilityDecomp = null;
 
-               /**
-               * Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
-               *
-               * We replace those characters with a 0xFF byte, which is illegal in
-               * UTF-8 and will in turn be replaced with a Unicode replacement char
-               */
-               $str = strtr(
-                       $str,
-                       "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
-                       "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
-               );
+       static $utfCheckNFC;
 
-               /**
-               * As per the original implementation, "the UnicodeString constructor fails
-               * if the string ends with a head byte". Therefore, if the string ends with
-               * a leading byte we replace it with 0xFF, which is illegal too and will be
-               * replaced with a Unicode replacement character
-               */
-               if( substr( $str, -1 ) >= "\xC0" ) {
-                       $str[strlen($str) - 1] = "\xFF";
+       /**
+        * The ultimate convenience function! Clean up invalid UTF-8 sequences,
+        * and convert to normal form C, canonical composition.
+        *
+        * Fast return for pure ASCII strings; some lesser optimizations for
+        * strings containing only known-good characters. Not as fast as toNFC().
+        *
+        * @param $string String: a UTF-8 string
+        * @return string a clean, shiny, normalized UTF-8 string
+        */
+       static function cleanUp( $string ) {
+               if( NORMALIZE_ICU ) {
+                       $string = self::replaceForNativeNormalize( $string );
+
+                       # UnicodeString constructor fails if the string ends with a
+                       # head byte. Add a junk char at the end, we'll strip it off.
+                       return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
+               } elseif( NORMALIZE_INTL ) {
+                       $string = self::replaceForNativeNormalize( $string );
+                       $norm = normalizer_normalize( $string, Normalizer::FORM_C );
+                       if( $norm === null || $norm === false ) {
+                               # normalizer_normalize will either return false or null
+                               # (depending on which doc you read) if invalid utf8 string.
+                               # quickIsNFCVerify cleans up invalid sequences.
+
+                               if( UtfNormal::quickIsNFCVerify( $string ) ) {
+                                       # if that's true, the string is actually already normal.
+                                       return $string;
+                               } else {
+                                       # Now we are valid but non-normal
+                                       return normalizer_normalize( $string, Normalizer::FORM_C );
+                               }
+                       } else {
+                               return $norm;
+                       }
+               } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+                       # Side effect -- $string has had UTF-8 errors cleaned up.
+                       return $string;
+               } else {
+                       return UtfNormal::NFC( $string );
                }
-
-               return utf8_normalize( $str, UNORM_NFC );
-       }
-
-       function toNFC( $str ) {
-               return utf8_normalize( $str, UNORM_NFC );
-       }
-
-       function toNFKC( $str ) {
-               return utf8_normalize( $str, UNORM_NFKC );
        }
 
-       function toNFD( $str ) {
-               return utf8_normalize( $str, UNORM_NFD );
-       }
-
-       function toNFKD( $str ) {
-               return utf8_normalize( $str, UNORM_NFKD );
+       /**
+        * Convert a UTF-8 string to normal form C, canonical composition.
+        * Fast return for pure ASCII strings; some lesser optimizations for
+        * strings containing only known-good characters.
+        *
+        * @param $string String: a valid UTF-8 string. Input is not validated.
+        * @return string a UTF-8 string in normal form C
+        */
+       static function toNFC( $string ) {
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_C );
+               elseif( NORMALIZE_ICU )
+                       return utf8_normalize( $string, self::UNORM_NFC );
+               elseif( UtfNormal::quickIsNFC( $string ) )
+                       return $string;
+               else
+                       return UtfNormal::NFC( $string );
        }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//                           End of the ICU wrapper                           //
-////////////////////////////////////////////////////////////////////////////////
 
-
-} else {
-
-
-////////////////////////////////////////////////////////////////////////////////
-//        This block will NOT be loaded if the utfnormal extension is         //
-////////////////////////////////////////////////////////////////////////////////
-
-/**
-* Unset global variables
-*/
-unset( $GLOBALS['utfJamoIndex'], $GLOBALS['utfJamoType'], $GLOBALS['utfCheckNFC'], $GLOBALS['utfCombiningClass'], $GLOBALS['utfCanonicalComp'], $GLOBALS['utfCanonicalDecomp'], $GLOBALS['utfCheckNFKC'], $GLOBALS['utfCompatibilityDecomp'] );
-
-/**
-* NFC_QC and NFKC_QC values
-*/
-define( 'UNICODE_QC_MAYBE', 0 );
-define( 'UNICODE_QC_NO', 1 );
-
-/**
-* Contains all the ASCII characters appearing in UTF-8, sorted by frequency
-*/
-define( 'UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F" );
-
-/**
-* Contains all the tail bytes that can appear in the composition of a UTF-8 char
-*/
-define( 'UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A" );
-
-/**
-* Unicode normalization routines
-*
-* A copy of reports of bugs related to this class can be sent to the author directly
-*
-* @package UtfNormal
-*/
-class UtfNormal {
        /**
-       * Validate, cleanup and normalize a string
-       *
-       * The ultimate convenience function! Clean up invalid UTF-8 sequences,
-       * and convert to Normal Form C, canonical composition.
-       *
-       * @param        string  $str    The dirty string
-       * @return       string                  The same string, all shiny and cleaned-up
-       */
-       function cleanup( $str ) {
-               /**
-               * The string below is the list of all autorized characters, sorted by
-               * frequency in latin text
-               */
-               $pos = strspn( $str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D" );
-               $len = strlen( $str );
-
-               if( $pos == $len ) {
-                       /**
-                       * ASCII strings with no special chars return immediately
-                       */
-                       return $str;
-               }
-
-               /**
-               * Note: we do not check for $GLOBALS['utfCanonicalDecomp']. It is assumed
-               * they are always loaded together
-               */
-               if( !isset( $GLOBALS['utfCheckNFC'] ) ) {
-                       include( 'UtfNormalData.inc' );
-               }
-
-               /**
-               * Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
-               *
-               * We replace those characters with a 0xFF byte, which is illegal in
-               * UTF-8 and will in turn be replaced with a UTF replacement char
-               */
-               return UtfNormal::recompose(
-                       strtr(
-                               $str,
-                               "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
-                               "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
-                       ),
-                       $pos, $len, $GLOBALS['utfCheckNFC'], $GLOBALS['utfCanonicalDecomp']
-               );
+        * Convert a UTF-8 string to normal form D, canonical decomposition.
+        * Fast return for pure ASCII strings.
+        *
+        * @param $string String: a valid UTF-8 string. Input is not validated.
+        * @return string a UTF-8 string in normal form D
+        */
+       static function toNFD( $string ) {
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_D );
+               elseif( NORMALIZE_ICU )
+                       return utf8_normalize( $string, self::UNORM_NFD );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
+                       return UtfNormal::NFD( $string );
+               else
+                       return $string;
        }
 
        /**
-       * Validate and normalize a UTF string to NFC
-       *
-       * @param        string  $str    Unchecked UTF string
-       * @return       string                  The string, validated and in normal form
-       */
-       function toNFC( $str ) {
-               $pos = strspn( $str, UTF8_ASCII_RANGE );
-               $len = strlen( $str );
-
-               if( $pos == $len ) {
-                       /**
-                       * ASCII strings return immediately
-                       */
-                       return $str;
-               }
-
-               if( !isset( $GLOBALS['utfCheckNFC'] ) ) {
-                       include( 'UtfNormalData.inc' );
-               }
-
-               return UtfNormal::recompose( $str, $pos, $len, $GLOBALS['utfCheckNFC'], $GLOBALS['utfCanonicalDecomp'] );
+        * Convert a UTF-8 string to normal form KC, compatibility composition.
+        * This may cause irreversible information loss, use judiciously.
+        * Fast return for pure ASCII strings.
+        *
+        * @param $string String: a valid UTF-8 string. Input is not validated.
+        * @return string a UTF-8 string in normal form KC
+        */
+       static function toNFKC( $string ) {
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_KC );
+               elseif( NORMALIZE_ICU )
+                       return utf8_normalize( $string, self::UNORM_NFKC );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
+                       return UtfNormal::NFKC( $string );
+               else
+                       return $string;
        }
 
        /**
-       * Validate and normalize a UTF string to NFKC
-       *
-       * @param        string  $str    Unchecked UTF string
-       * @return       string                  The string, validated and in normal form
-       */
-       function toNFKC( $str ) {
-               $pos = strspn( $str, UTF8_ASCII_RANGE );
-               $len = strlen( $str );
-
-               if( $pos == $len ) {
-                       /**
-                       * ASCII strings return immediately
-                       */
-                       return $str;
-               }
-
-               if( !isset( $GLOBALS['utfCheckNFKC'] ) ) {
-                       include( 'UtfNormalDataK.inc' );
-               }
-               if( !isset( $GLOBALS['utfCanonicalComp'] ) ) {
-                       include( 'UtfNormalData.inc' );
-               }
-
-               return UtfNormal::recompose( $str, $pos, $len, $GLOBALS['utfCheckNFKC'], $GLOBALS['utfCompatibilityDecomp'] );
+        * Convert a UTF-8 string to normal form KD, compatibility decomposition.
+        * This may cause irreversible information loss, use judiciously.
+        * Fast return for pure ASCII strings.
+        *
+        * @param $string String: a valid UTF-8 string. Input is not validated.
+        * @return string a UTF-8 string in normal form KD
+        */
+       static function toNFKD( $string ) {
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_KD );
+               elseif( NORMALIZE_ICU )
+                       return utf8_normalize( $string, self::UNORM_NFKD );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
+                       return UtfNormal::NFKD( $string );
+               else
+                       return $string;
        }
 
        /**
-       * Validate and normalize a UTF string to NFD
-       *
-       * @param        string  $str    Unchecked UTF string
-       * @return       string                  The string, validated and in normal form
-       */
-       function toNFD( $str ) {
-               $pos = strspn( $str, UTF8_ASCII_RANGE );
-               $len = strlen( $str );
-
-               if( $pos == $len ) {
-                       /**
-                       * ASCII strings return immediately
-                       */
-                       return $str;
+        * Load the basic composition data if necessary
+        * @private
+        */
+       static function loadData() {
+               if( !isset( self::$utfCombiningClass ) ) {
+                       require_once( dirname(__FILE__) . '/UtfNormalData.inc' );
                }
-
-               if( !isset( $GLOBALS['utfCanonicalDecomp'] ) ) {
-                       include( 'UtfNormalData.inc' );
-               }
-
-               return UtfNormal::decompose( $str, $pos, $len, $GLOBALS['utfCanonicalDecomp'] );
        }
 
        /**
-       * Validate and normalize a UTF string to NFKD
-       *
-       * @param        string  $str    Unchecked UTF string
-       * @return       string                  The string, validated and in normal form
-       */
-       function toNFKD( $str ) {
-               $pos = strspn( $str, UTF8_ASCII_RANGE );
-               $len = strlen( $str );
-
-               if( $pos == $len ) {
-                       /**
-                       * ASCII strings return immediately
-                       */
-                       return $str;
-               }
-
-               if( !isset( $GLOBALS['utfCompatibilityDecomp'] ) ) {
-                       include( 'UtfNormalDataK.inc' );
+        * Returns true if the string is _definitely_ in NFC.
+        * Returns false if not or uncertain.
+        * @param $string String: a valid UTF-8 string. Input is not validated.
+        * @return bool
+        */
+       static function quickIsNFC( $string ) {
+               # ASCII is always valid NFC!
+               # If it's pure ASCII, let it through.
+               if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+
+               UtfNormal::loadData();
+               $len = strlen( $string );
+               for( $i = 0; $i < $len; $i++ ) {
+                       $c = $string[$i];
+                       $n = ord( $c );
+                       if( $n < 0x80 ) {
+                               continue;
+                       } elseif( $n >= 0xf0 ) {
+                               $c = substr( $string, $i, 4 );
+                               $i += 3;
+                       } elseif( $n >= 0xe0 ) {
+                               $c = substr( $string, $i, 3 );
+                               $i += 2;
+                       } elseif( $n >= 0xc0 ) {
+                               $c = substr( $string, $i, 2 );
+                               $i++;
+                       }
+                       if( isset( self::$utfCheckNFC[$c] ) ) {
+                               # If it's NO or MAYBE, bail and do the slow check.
+                               return false;
+                       }
+                       if( isset( self::$utfCombiningClass[$c] ) ) {
+                               # Combining character? We might have to do sorting, at least.
+                               return false;
+                       }
                }
-
-               return UtfNormal::decompose( $str, $pos, $len, $GLOBALS['utfCompatibilityDecomp'] );
+               return true;
        }
 
-
-       ////////////////////////////////////////////////////////////////////////////
-       //                           Internal functions                           //
-       ////////////////////////////////////////////////////////////////////////////
-
        /**
-       * Recompose a UTF string
-       *
-       * @param        string  $str            Unchecked UTF string
-       * @param        integer $pos            Position of the first UTF char (in bytes)
-       * @param        integer $len            Length of the string (in bytes)
-       * @param        array   $qc                     Quick-check array, passed by reference but never modified
-       * @param        array   $decomp_map     Decomposition mapping, passed by reference but never modified
-       * @return       string                          The string, validated and recomposed
-       *
-       * @access       private
-       */
-       function recompose( $str, $pos, $len, &$qc, &$decomp_map ) {
-               global $utfCombiningClass, $utfCanonicalComp, $utfJamoType, $utfJamoIndex;
-
-               /**
-               * Buffer the last ASCII char before the UTF-8 stuff if applicable
-               */
-               $tmp = '';
-               $i = $tmp_pos = $last_cc = 0;
-
-               if( $pos ) {
-                       $buffer = array(++$i => $str[$pos - 1] );
-               } else {
-                       $buffer = array();
-               }
-
-               /**
-               * UTF char length array
-               *
-               * This array is used to determine the length of a UTF character. Be $c the
-               * result of ($str[$pos] & "\xF0") --where $str is the string we're operating
-               * on and $pos the position of the cursor--, if $utf_len_mask[$c] does not
-               * exist, the byte is an ASCII char. Otherwise, if $utf_len_mask[$c] is greater
-               * than 0, we have a the leading byte of a multibyte character whose length is
-               * $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
-               */
-               $utf_len_mask = array(
-                       /**
-                       * Leading bytes masks
-                       */
-                       "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
-
-                       /**
-                       * Trailing bytes masks
-                       */
-                       "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
-               );
-
-               $extra_check = array(
-                       "\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
-                       "\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
-                       "\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
-               );
-
-               $utf_validation_mask = array(
-                       2       =>      "\xE0\xC0",
-                       3       =>      "\xF0\xC0\xC0",
-                       4       =>      "\xF8\xC0\xC0\xC0"
-               );
-
-               $utf_validation_check = array(
-                       2       =>      "\xC0\x80",
-                       3       =>      "\xE0\x80\x80",
-                       4       =>      "\xF0\x80\x80\x80"
-               );
-
-               ////////////////////////////////////////////////////////////////////////
-               //                             Main loop                              //
-               ////////////////////////////////////////////////////////////////////////
-
-               do {
-                       ////////////////////////////////////////////////////////////////////
-                       //         STEP 0: Capture the current char and buffer it         //
-                       ////////////////////////////////////////////////////////////////////
-
-                       $c = $str[$pos];
-                       $c_mask = $c & "\xF0";
-
-                       if( isset( $utf_len_mask[$c_mask] ) ) {
-                               /**
-                               * Byte at $pos is either a leading byte or a missplaced trailing byte
-                               */
-                               if( $utf_len = $utf_len_mask[$c_mask] ) {
-                                       /**
-                                       * Capture the char
-                                       */
-                                       $buffer[++$i & 7] = $utf_char = substr( $str, $pos, $utf_len );
-
-                                       /**
-                                       * Let's find out if a thorough check is needed
-                                       */
-                                       if( isset( $qc[$utf_char] ) ) {
-                                               /**
-                                               * If the UTF char is in the qc array then it may not be in normal
-                                               * form. We do nothing here, the actual processing is below this
-                                               * "if" block
-                                               */
-                                       } elseif( isset( $utfCombiningClass[$utf_char] ) ) {
-                                               if( $utfCombiningClass[$utf_char] < $last_cc ) {
-                                                       /**
-                                                       * A combining character that is NOT canonically ordered
-                                                       */
-                                               } else {
-                                                       /**
-                                                       * A combining character that IS canonically ordered, skip
-                                                       * to the next char
-                                                       */
-                                                       $last_cc = $utfCombiningClass[$utf_char];
-
-                                                       $pos += $utf_len;
-                                                       continue;
-                                               }
-                                       } else {
-                                               /**
-                                               * At this point, $utf_char holds a UTF char that we know
-                                               * is not a NF[K]C_QC and is not a combining character. It can
-                                               * be a singleton, a canonical composite, a replacement char or
-                                               * an even an ill-formed bunch of bytes. Let's find out
-                                               */
-                                               $last_cc = 0;
-
-                                               /**
-                                               * Check that we have the correct number of trailing bytes
-                                               */
-                                               if( ( $utf_char & $utf_validation_mask[$utf_len] ) != $utf_validation_check[$utf_len] ) {
-                                                       /**
-                                                       * Current char isn't well-formed or legal: either one or
-                                                       * several trailing bytes are missing, or the Unicode char
-                                                       * has been encoded in a five- or six- byte sequence
-                                                       */
-                                                       if( $utf_char[0] >= "\xF8" ) {
-                                                               if( $utf_char[0] < "\xF8" ) {
-                                                                       $trailing_bytes = 3;
-                                                               } elseif( $utf_char[0] < "\xFC" ) {
-                                                                       $trailing_bytes = 4;
-                                                               }
-                                                               if( $utf_char[0] > "\xFD" ) {
-                                                                       $trailing_bytes = 0;
-                                                               } else {
-                                                                       $trailing_bytes = 5;
-                                                               }
-                                                       } else {
-                                                               $trailing_bytes = $utf_len - 1;
-                                                       }
-
-                                                       $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                       $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
-                                                       $tmp_pos = $pos;
-
-                                                       continue;
-                                               }
-
-                                               if( isset( $extra_check[$c] ) ) {
-                                                       switch( $c ) {
-                                                               /**
-                                                               * Note: 0xED is quite common in Korean
-                                                               */
-                                                               case "\xED":
-                                                                       if( $utf_char >= "\xED\xA0\x80" ) {
-                                                                               /**
-                                                                               * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
-                                                                               * (UTF sequence 0xEDA080..0xEDBFBF)
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                                               $pos += $utf_len;
-                                                                               $tmp_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               /**
-                                                               * Note: 0xEF is quite common in Japanese
-                                                               */
-                                                               case "\xEF":
-                                                                       if( $utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF" ) {
-                                                                               /**
-                                                                               * 0xFFFE and 0xFFFF are explicitly disallowed
-                                                                               * (UTF sequence 0xEFBFBE..0xEFBFBF)
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                                               $pos += $utf_len;
-                                                                               $tmp_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               case "\xC0":
-                                                               case "\xC1":
-                                                                       if( $utf_char <= "\xC1\xBF" ) {
-                                                                               /**
-                                                                               * Overlong sequence: Unicode char 0x00..0x7F encoded as a
-                                                                               * double-byte UTF char
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                                               $pos += $utf_len;
-                                                                               $tmp_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               case "\xE0":
-                                                                       if( $utf_char <= "\xE0\x9F\xBF" ) {
-                                                                               /**
-                                                                               * Unicode char 0x0000..0x07FF encoded in 3 bytes
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                                               $pos += $utf_len;
-                                                                               $tmp_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               case "\xF0":
-                                                                       if( $utf_char <= "\xF0\x8F\xBF\xBF" ) {
-                                                                               /**
-                                                                               * Unicode char 0x0000..0xFFFF encoded in 4 bytes
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                                               $pos += $utf_len;
-                                                                               $tmp_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               default:
-                                                                       /**
-                                                                       * Five- and six- byte sequences do not need being checked for here anymore
-                                                                       */
-                                                                       if( $utf_char > UTF8_MAX ) {
-                                                                               /**
-                                                                               * Out of the Unicode range
-                                                                               */
-                                                                               if( $utf_char[0] < "\xF8" ) {
-                                                                                       $trailing_bytes = 3;
-                                                                               } elseif( $utf_char[0] < "\xFC" ) {
-                                                                                       $trailing_bytes = 4;
-                                                                               } elseif( $utf_char[0] > "\xFD" ) {
-                                                                                       $trailing_bytes = 0;
-                                                                               } else {
-                                                                                       $trailing_bytes = 5;
-                                                                               }
-
-                                                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
-                                                                               $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
-                                                                               $tmp_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                       }
-                                               }
-
-                                               /**
-                                               * The char is a valid starter, move the cursor and go on
-                                               */
-                                               $pos += $utf_len;
-                                               continue;
-                                       }
+        * Returns true if the string is _definitely_ in NFC.
+        * Returns false if not or uncertain.
+        * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
+        * @return bool
+        */
+       static function quickIsNFCVerify( &$string ) {
+               # Screen out some characters that eg won't be allowed in XML
+               $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
+
+               # ASCII is always valid NFC!
+               # If we're only ever given plain ASCII, we can avoid the overhead
+               # of initializing the decomposition tables by skipping out early.
+               if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+
+               static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
+               if( !isset( $checkit ) ) {
+                       # Load/build some scary lookup tables...
+                       UtfNormal::loadData();
+
+                       $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
+
+                       # Head bytes for sequences which we should do further validity checks
+                       $checkit = array_flip( array_map( 'chr',
+                                       array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
+                                                  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+                                                  0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
+
+                       # Each UTF-8 head byte is followed by a certain
+                       # number of tail bytes.
+                       $tailBytes = array();
+                       for( $n = 0; $n < 256; $n++ ) {
+                               if( $n < 0xc0 ) {
+                                       $remaining = 0;
+                               } elseif( $n < 0xe0 ) {
+                                       $remaining = 1;
+                               } elseif( $n < 0xf0 ) {
+                                       $remaining = 2;
+                               } elseif( $n < 0xf8 ) {
+                                       $remaining = 3;
+                               } elseif( $n < 0xfc ) {
+                                       $remaining = 4;
+                               } elseif( $n < 0xfe ) {
+                                       $remaining = 5;
                                } else {
-                                       /**
-                                       * A trailing byte came out of nowhere, we will advance the cursor
-                                       * and treat the this byte and all following trailing bytes as if
-                                       * each of them was a Unicode replacement char
-                                       */
-                                       $spn = strspn( $str, UTF8_TRAILING_BYTES, $pos );
-                                       $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . str_repeat( UTF8_REPLACEMENT, $spn );
-
-                                       $pos += $spn;
-                                       $tmp_pos = $pos;
-                                       continue;
+                                       $remaining = 0;
                                }
+                               $tailBytes[chr($n)] = $remaining;
+                       }
+               }
 
+               # Chop the text into pure-ASCII and non-ASCII areas;
+               # large ASCII parts can be handled much more quickly.
+               # Don't chop up Unicode areas for punctuation, though,
+               # that wastes energy.
+               $matches = array();
+               preg_match_all(
+                       '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
+                       $string, $matches );
+
+               $looksNormal = true;
+               $base = 0;
+               $replace = array();
+               foreach( $matches[1] as $str ) {
+                       $chunk = strlen( $str );
+
+                       if( $str[0] < "\x80" ) {
+                               # ASCII chunk: guaranteed to be valid UTF-8
+                               # and in normal form C, so skip over it.
+                               $base += $chunk;
+                               continue;
+                       }
 
-                               ////////////////////////////////////////////////////////////////////
-                               //                 STEP 1: Decompose current char                 //
-                               ////////////////////////////////////////////////////////////////////
-
-                               /**
-                               * We have found a character that is either:
-                               *  - in the NFC_QC/NFKC_QC list
-                               *  - a non-starter char that is not canonically ordered
-                               *
-                               * We are going to capture the shortest UTF sequence that satisfies
-                               * these two conditions:
-                               *
-                               *  1 - If the sequence does not start at the begginning of the string,
-                               *      it must begin with a starter, and that starter must not have the
-                               *      NF[K]C_QC property equal to "MAYBE"
-                               *
-                               *  2 - If the sequence does not end at the end of the string, it must end
-                               *      with a non-starter and be immediately followed by a starter that
-                               *      is not on the QC list
-                               */
-                               $utf_seq = array();
-                               $last_cc = 0;
-                               $lpos = $pos;
-                               $pos += $utf_len;
-
-                               if( isset( $decomp_map[$utf_char] ) ) {
-                                       $_pos = 0;
-                                       $_len = strlen( $decomp_map[$utf_char] );
+                       # We'll have to examine the chunk byte by byte to ensure
+                       # that it consists of valid UTF-8 sequences, and to see
+                       # if any of them might not be normalized.
+                       #
+                       # Since PHP is not the fastest language on earth, some of
+                       # this code is a little ugly with inner loop optimizations.
+
+                       $head = '';
+                       $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
+
+                       for( $i = -1; --$len; ) {
+                               $remaining = $tailBytes[$c = $str[++$i]];
+                               if( $remaining ) {
+                                       # UTF-8 head byte!
+                                       $sequence = $head = $c;
                                        do {
-                                               $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
-
-                                               if( isset( $_utf_len ) ) {
-                                                       $utf_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
-                                                       $_pos += $_utf_len;
+                                               # Look for the defined number of tail bytes...
+                                               if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
+                                                       # Legal tail bytes are nice.
+                                                       $sequence .= $c;
                                                } else {
-                                                       $utf_seq[] = $decomp_map[$utf_char][$_pos];
-                                                       ++$_pos;
-                                               }
-                                       }
-                                       while( $_pos < $_len );
-                               } else {
-                                       /**
-                                       * The char is not decomposable
-                                       */
-                                       $utf_seq = array( $utf_char );
-                               }
-
-
-                               ////////////////////////////////////////////////////////////////
-                               //                STEP 2: Capture the starter                 //
-                               ////////////////////////////////////////////////////////////////
-
-                               /**
-                               * Check out the combining class of the first character of the UTF sequence
-                               */
-                               $k = 0;
-                               if( isset( $utfCombiningClass[$utf_seq[0]] ) || $qc[$utf_char] == UNICODE_QC_MAYBE ) {
-                                       /**
-                                       * Not a starter, inspect previous characters
-                                       *
-                                       * The last 8 characters are kept in a buffer so that we don't have
-                                       * to capture them everytime. This is enough for all real-life strings
-                                       * but even if it wasn't, we can capture characters in backward mode,
-                                       * although it is slower than this method.
-                                       *
-                                       * In the following loop, $j starts at the previous buffered character
-                                       * ($i - 1, because current character is at offset $i) and process them
-                                       * in backward mode until we find a starter.
-                                       *
-                                       * $k is the index on each UTF character inside of our UTF sequence.
-                                       * At this time, $utf_seq contains one or more characters numbered 0 to
-                                       * n. $k starts at 0 and for each char we prepend we pre-decrement it
-                                       * and for numbering
-                                       */
-                                       $starter_found = 0;
-                                       $j_min = max(1, $i - 7 );
-                                       for( $j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j ) {
-                                               $utf_char = $buffer[$j & 7];
-                                               $lpos -= strlen( $utf_char );
-
-                                               if( isset( $decomp_map[$utf_char] ) ) {
-                                                       /**
-                                                       * The char is a composite, decompose for storage
-                                                       */
-                                                       $decomp_seq = array();
-                                                       $_pos = 0;
-                                                       $_len = strlen( $decomp_map[$utf_char] );
-                                                       do {
-                                                               $c = $decomp_map[$utf_char][$_pos];
-                                                               $_utf_len =& $utf_len_mask[$c & "\xF0"];
-
-                                                               if( isset( $_utf_len ) ) {
-                                                                       $decomp_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
-                                                                       $_pos += $_utf_len;
-                                                               } else {
-                                                                       $decomp_seq[] = $c;
-                                                                       ++$_pos;
-                                                               }
-                                                       }
-                                                       while( $_pos < $_len );
-
-                                                       /**
-                                                       * Prepend the UTF sequence with our decomposed sequence
-                                                       */
-                                                       if( isset( $decomp_seq[1] ) ) {
-                                                               /**
-                                                               * The char expanded into several chars
-                                                               */
-                                                               $decomp_cnt = count( $decomp_seq );
-                                                               foreach( $decomp_seq as $decomp_i => $decomp_char ) {
-                                                                       $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
-                                                               }
-                                                               $k -= $decomp_cnt;
+                                                       if( 0 == $len ) {
+                                                               # Premature end of string!
+                                                               # Drop a replacement character into output to
+                                                               # represent the invalid UTF-8 sequence.
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                                                       $base + $i + 1 - strlen( $sequence ),
+                                                                                                       strlen( $sequence ) );
+                                                               break 2;
                                                        } else {
-                                                               /**
-                                                               * Decomposed to a single char, easier to prepend
-                                                               */
-                                                               $utf_seq[--$k] = $decomp_seq[0];
+                                                               # Illegal tail byte; abandon the sequence.
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                                                       $base + $i - strlen( $sequence ),
+                                                                                                       strlen( $sequence ) );
+                                                               # Back up and reprocess this byte; it may itself
+                                                               # be a legal ASCII or UTF-8 sequence head.
+                                                               --$i;
+                                                               ++$len;
+                                                               continue 2;
                                                        }
-                                               } else {
-                                                       $utf_seq[--$k] = $utf_char;
                                                }
-
-                                               if( !isset( $utfCombiningClass[$utf_seq[$k]] ) ) {
-                                                       /**
-                                                       * We have found our starter
-                                                       */
-                                                       $starter_found = 1;
-                                                       break;
-                                               }
-                                       }
-
-                                       if( !$starter_found && $lpos > $tmp_pos ) {
-                                               /**
-                                               * The starter was not found in the buffer, let's rewind some more
-                                               */
-                                               do {
-                                                       /**
-                                                       * $utf_len_mask contains the masks of both leading bytes and
-                                                       * trailing bytes. If $utf_en > 0 then it's a leading byte,
-                                                       * otherwise it's a trailing byte.
-                                                       */
-                                                       $c = $str[--$lpos];
-                                                       $c_mask = $c & "\xF0";
-
-                                                       if( isset( $utf_len_mask[$c_mask] ) ) {
-                                                               /**
-                                                               * UTF byte
-                                                               */
-                                                               if( $utf_len = $utf_len_mask[$c_mask] ) {
-                                                                       /**
-                                                                       * UTF *leading* byte
-                                                                       */
-                                                                       $utf_char = substr( $str, $lpos, $utf_len );
-
-                                                                       if( isset( $decomp_map[$utf_char] ) ) {
-                                                                               /**
-                                                                               * Decompose the character
-                                                                               */
-                                                                               $decomp_seq = array();
-                                                                               $_pos = 0;
-                                                                               $_len = strlen( $decomp_map[$utf_char] );
-                                                                               do {
-                                                                                       $c = $decomp_map[$utf_char][$_pos];
-                                                                                       $_utf_len =& $utf_len_mask[$c & "\xF0"];
-
-                                                                                       if( isset( $_utf_len ) ) {
-                                                                                               $decomp_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
-                                                                                               $_pos += $_utf_len;
-                                                                                       } else {
-                                                                                               $decomp_seq[] = $c;
-                                                                                               ++$_pos;
-                                                                                       }
-                                                                               }
-                                                                               while( $_pos < $_len );
-
-                                                                               /**
-                                                                               * Prepend the UTF sequence with our decomposed sequence
-                                                                               */
-                                                                               if( isset( $decomp_seq[1] ) ) {
-                                                                                       /**
-                                                                                       * The char expanded into several chars
-                                                                                       */
-                                                                                       $decomp_cnt = count( $decomp_seq );
-                                                                                       foreach( $decomp_seq as $decomp_i => $utf_char ) {
-                                                                                               $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
-                                                                                       }
-                                                                                       $k -= $decomp_cnt;
-                                                                               } else {
-                                                                                       /**
-                                                                                       * Decomposed to a single char, easier to prepend
-                                                                                       */
-                                                                                       $utf_seq[--$k] = $decomp_seq[0];
-                                                                               }
-                                                                       } else {
-                                                                               $utf_seq[--$k] = $utf_char;
-                                                                       }
-                                                               }
-                                                       } else {
-                                                               /**
-                                                               * ASCII char
-                                                               */
-                                                               $utf_seq[--$k] = $c;
+                                       } while( --$remaining );
+
+                                       if( isset( $checkit[$head] ) ) {
+                                               # Do some more detailed validity checks, for
+                                               # invalid characters and illegal sequences.
+                                               if( $head == "\xed" ) {
+                                                       # 0xed is relatively frequent in Korean, which
+                                                       # abuts the surrogate area, so we're doing
+                                                       # this check separately to speed things up.
+
+                                                       if( $sequence >= UTF8_SURROGATE_FIRST ) {
+                                                               # Surrogates are legal only in UTF-16 code.
+                                                               # They are totally forbidden here in UTF-8
+                                                               # utopia.
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                            $base + $i + 1 - strlen( $sequence ),
+                                                                            strlen( $sequence ) );
+                                                               $head = '';
+                                                               continue;
                                                        }
-                                               }
-                                               while( $lpos > $tmp_pos );
-                                       }
-                               }
-
-
-                               ////////////////////////////////////////////////////////////////
-                               //       STEP 3: Capture following combining modifiers        //
-                               ////////////////////////////////////////////////////////////////
-
-                               while( $pos < $len ) {
-                                       $c_mask = $str[$pos] & "\xF0";
-
-                                       if( isset( $utf_len_mask[$c_mask] ) ) {
-                                               if( $utf_len = $utf_len_mask[$c_mask] ) {
-                                                       $utf_char = substr( $str, $pos, $utf_len );
                                                } else {
-                                                       /**
-                                                       * A trailing byte came out of nowhere
-                                                       *
-                                                       * Trailing bytes are replaced with Unicode replacement chars,
-                                                       * we will just ignore it for now, break out of the loop
-                                                       * as if it was a starter (replacement chars ARE starters)
-                                                       * and let the next loop replace it
-                                                       */
-                                                       break;
-                                               }
-
-                                               if( isset( $utfCombiningClass[$utf_char] ) || isset( $qc[$utf_char] ) ) {
-                                                       /**
-                                                       * Combining character, add it to the sequence and move the cursor
-                                                       */
-                                                       if( isset( $decomp_map[$utf_char] ) ) {
-                                                               /**
-                                                               * Decompose the character
-                                                               */
-                                                               $_pos = 0;
-                                                               $_len = strlen( $decomp_map[$utf_char] );
-                                                               do {
-                                                                       $c = $decomp_map[$utf_char][$_pos];
-                                                                       $_utf_len =& $utf_len_mask[$c & "\xF0"];
-
-                                                                       if( isset( $_utf_len ) ) {
-                                                                               $utf_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
-                                                                               $_pos += $_utf_len;
-                                                                       } else {
-                                                                               $utf_seq[] = $c;
-                                                                               ++$_pos;
-                                                                       }
-                                                               }
-                                                               while( $_pos < $_len );
-                                                       } else {
-                                                               $utf_seq[] = $utf_char;
+                                                       # Slower, but rarer checks...
+                                                       $n = ord( $head );
+                                                       if(
+                                                               # "Overlong sequences" are those that are syntactically
+                                                               # correct but use more UTF-8 bytes than are necessary to
+                                                               # encode a character. Naïve string comparisons can be
+                                                               # tricked into failing to see a match for an ASCII
+                                                               # character, for instance, which can be a security hole
+                                                               # if blacklist checks are being used.
+                                                              ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
+                                                               || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+                                                               || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+
+                                                               # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
+                                                               || ($n == 0xef &&
+                                                                          ($sequence == UTF8_FFFE)
+                                                                       || ($sequence == UTF8_FFFF) )
+
+                                                               # Unicode has been limited to 21 bits; longer
+                                                               # sequences are not allowed.
+                                                               || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
+
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                                   $base + $i + 1 - strlen( $sequence ),
+                                                                                   strlen( $sequence ) );
+                                                               $head = '';
+                                                               continue;
                                                        }
-
-                                                       $pos += $utf_len;
-                                               } else {
-                                                       /**
-                                                       * Combining class 0 and no QC, break out of the loop
-                                                       *
-                                                       * Note: we do not know if that character is valid. If
-                                                       * it's not, the next iteration will replace it
-                                                       */
-                                                       break;
                                                }
-                                       } else {
-                                               /**
-                                               * ASCII chars are starters
-                                               */
-                                               break;
                                        }
-                               }
-
-
-                               ////////////////////////////////////////////////////////////////
-                               //                  STEP 4: Sort and combine                  //
-                               ////////////////////////////////////////////////////////////////
-
-                               /**
-                               * Here we sort...
-                               */
-                               $k_max = $k + count( $utf_seq );
-                               if( !$k && $k_max == 1 ) {
-                                       /**
-                                       * There is only one char in the UTF sequence, add it then
-                                       * jump to the next iteration of main loop
-                                       *
-                                       * Note: the two commented lines below can be enabled under PHP5
-                                       * for a very small performance gain in most cases
-                                       */
-//                                     if( substr_compare( $str, $utf_seq[0], $lpos, $pos - $lpos ) ) {
-                                               $tmp .= substr( $str, $tmp_pos, $lpos - $tmp_pos ) . $utf_seq[0];
-                                               $tmp_pos = $pos;
-//                                     }
-
-                                       continue;
-                               }
-
-                               /**
-                               * ...there we combine
-                               */
-                               if( isset( $utfCombiningClass[$utf_seq[$k]] ) ) {
-                                       $starter = $nf_seq = '';
-                               } else {
-                                       $starter = $utf_seq[$k++];
-                                       $nf_seq = '';
-                               }
-                               $utf_sort = array();
-
-                               /**
-                               * We add an empty char at the end of the UTF char sequence.
-                               * It will act as a starter and trigger the sort/combine routine
-                               * at the end of the string without altering it
-                               */
-                               $utf_seq[] = '';
-
-                               do {
-                                       $utf_char = $utf_seq[$k++];
-
-                                       if( isset( $utfCombiningClass[$utf_char] ) ) {
-                                               $utf_sort[$utfCombiningClass[$utf_char]][] = $utf_char;
-                                       } else {
-                                               if( empty( $utf_sort ) ) {
-                                                       /**
-                                                       * No combining characters... check for a composite
-                                                       * of the two starters
-                                                       */
-                                                       if( isset( $utfCanonicalComp[$starter . $utf_char] ) ) {
-                                                               /**
-                                                               * Good ol' composite character
-                                                               */
-                                                               $starter = $utfCanonicalComp[$starter . $utf_char];
-                                                       } elseif( isset( $utfJamoType[$utf_char] ) ) {
-                                                               /**
-                                                               * Current char is a composable jamo
-                                                               */
-                                                               if( isset( $utfJamoType[$starter] )
-                                                                && $utfJamoType[$starter] == UNICODE_JAMO_L
-                                                                && $utfJamoType[$utf_char] == UNICODE_JAMO_V ) {
-                                                                       /**
-                                                                       * We have a L jamo followed by a V jamo, we are going
-                                                                       * to prefetch the next char to see if it's a T jamo
-                                                                       */
-                                                                       if( isset( $utfJamoType[$utf_seq[$k]] ) && $utfJamoType[$utf_seq[$k]] == UNICODE_JAMO_T ) {
-                                                                               /**
-                                                                               * L+V+T jamos, combine to a LVT Hangul syllable
-                                                                               * ($k is incremented)
-                                                                               */
-                                                                               $cp = $utfJamoIndex[$starter] + $utfJamoIndex[$utf_char] + $utfJamoIndex[$utf_seq[$k]];
-
-                                                                               ++$k;
-                                                                       } else {
-                                                                               /**
-                                                                               * L+V jamos, combine to a LV Hangul syllable
-                                                                               */
-                                                                               $cp = $utfJamoIndex[$starter] + $utfJamoIndex[$utf_char];
-                                                                       }
-
-                                                                       $starter = chr( 0xE0 | ( $cp >> 12 ) ) . chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $cp & 0x3F ) );
-                                                               } else {
-                                                                       /**
-                                                                       * Non-composable jamo, just add it to the sequence
-                                                                       */
-                                                                       $nf_seq .= $starter;
-                                                                       $starter = $utf_char;
-                                                               }
-                                                       } else {
-                                                               /**
-                                                               * No composite, just add the first starter to the sequence
-                                                               * then continue with the other one
-                                                               */
-                                                               $nf_seq .= $starter;
-                                                               $starter = $utf_char;
-                                                       }
-                                               } else {
-                                                       ksort( $utf_sort );
-
-                                                       /**
-                                                       * For each class of combining characters
-                                                       */
-                                                       foreach( $utf_sort as $cc => $utf_chars ) {
-                                                               $j = 0;
-
-                                                               do {
-                                                                       /**
-                                                                       * Look for a composite
-                                                                       */
-                                                                       if( isset( $utfCanonicalComp[$starter . $utf_chars[$j]] ) ) {
-                                                                               /**
-                                                                               * Found a composite, replace the starter
-                                                                               */
-                                                                               $starter = $utfCanonicalComp[$starter . $utf_chars[$j]];
-                                                                               unset( $utf_sort[$cc][$j] );
-                                                                       } else {
-                                                                               /**
-                                                                               * No composite, all following characters in that
-                                                                               * class are blocked
-                                                                               */
-                                                                               break;
-                                                                       }
-                                                               }
-                                                               while( isset( $utf_sort[$cc][++$j] ) );
-                                                       }
-
-                                                       /**
-                                                       * Add the starter to the normalized sequence, followed by
-                                                       * non-starters in canonical order
-                                                       */
-                                                       $nf_seq .= $starter;
-                                                       foreach( $utf_sort as $utf_chars ) {
-                                                               if( !empty( $utf_chars ) ) {
-                                                                       $nf_seq .= implode( '', $utf_chars );
-                                                               }
-                                                       }
 
-                                                       /**
-                                                       * Reset the array and go on
-                                                       */
-                                                       $utf_sort = array();
-                                                       $starter = $utf_char;
-                                               }
+                                       if( isset( $utfCheckOrCombining[$sequence] ) ) {
+                                               # If it's NO or MAYBE, we'll have to rip
+                                               # the string apart and put it back together.
+                                               # That's going to be mighty slow.
+                                               $looksNormal = false;
                                        }
-                               }
-                               while( $k <= $k_max );
 
-                               $tmp .= substr( $str, $tmp_pos, $lpos - $tmp_pos ) . $nf_seq;
-                               $tmp_pos = $pos;
-                       } else {
-                               /**
-                               * Only a ASCII char can make the program get here
-                               *
-                               * First we skip the current byte with ++$pos, then we quickly
-                               * skip following ASCII chars with strspn().
-                               *
-                               * The first two "if"'s here can be removed, with the consequences
-                               * of being faster on latin text (lots of ASCII) and slower on
-                               * multi-byte text (where the only ASCII chars are spaces and punctuation)
-                               */
-                               if( ++$pos != $len ) {
-                                       if( $str[$pos] < "\x80" ) {
-                                               $pos += strspn( $str, UTF8_ASCII_RANGE, ++$pos );
-                                               $buffer[++$i & 7] = $str[$pos - 1];
+                                       # The sequence is legal!
+                                       $head = '';
+                               } elseif( $c < "\x80" ) {
+                                       # ASCII byte.
+                                       $head = '';
+                               } elseif( $c < "\xc0" ) {
+                                       # Illegal tail bytes
+                                       if( $head == '' ) {
+                                               # Out of the blue!
+                                               $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                                        } else {
-                                               $buffer[++$i & 7] = $c;
+                                               # Don't add if we're continuing a broken sequence;
+                                               # we already put a replacement character when we looked
+                                               # at the broken sequence.
+                                               $replace[] = array( '', $base + $i, 1 );
                                        }
+                               } else {
+                                       # Miscellaneous freaks.
+                                       $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+                                       $head = '';
                                }
                        }
+                       $base += $chunk;
                }
-               while( $pos < $len );
-
-               /**
-               * Now is time to return the string
-               */
-               if( $tmp_pos ) {
-                       /**
-                       * If the $tmp_pos cursor is not at the beggining of the string then at least
-                       * one character was not in normal form. Replace $str with the fixed version
-                       */
-                       if( $tmp_pos == $len ) {
-                               /**
-                               * The $tmp_pos cursor is at the end of $str, therefore $tmp holds the
-                               * whole $str
-                               */
-                               return $tmp;
-                       } else {
-                               /**
-                               * The rightmost chunk of $str has not been appended to $tmp yet
-                               */
-                               return $tmp . substr( $str, $tmp_pos );
+               if( count( $replace ) ) {
+                       # There were illegal UTF-8 sequences we need to fix up.
+                       $out = '';
+                       $last = 0;
+                       foreach( $replace as $rep ) {
+                               list( $replacement, $start, $length ) = $rep;
+                               if( $last < $start ) {
+                                       $out .= substr( $string, $last, $start - $last );
+                               }
+                               $out .= $replacement;
+                               $last = $start + $length;
+                       }
+                       if( $last < strlen( $string ) ) {
+                               $out .= substr( $string, $last );
                        }
+                       $string = $out;
                }
-
-               /**
-               * The string was already in normal form
-               */
-               return $str;
+               return $looksNormal;
        }
 
+       # These take a string and run the normalization on them, without
+       # checking for validity or any optimization etc. Input must be
+       # VALID UTF-8!
        /**
-       * Decompose a UTF string
-       *
-       * @param        string  $str            UTF string
-       * @param        integer $pos            Position of the first UTF char (in bytes)
-       * @param        integer $len            Length of the string (in bytes)
-       * @param        array   $decomp_map     Decomposition mapping, passed by reference but never modified
-       * @return       string                          The string, decomposed and sorted canonically
-       *
-       * @access       private
-       */
-       function decompose( $str, $pos, $len, &$decomp_map ) {
-               global $utfCombiningClass, $utfCanonicalDecomp;
-
-               /**
-               * UTF char length array
-               */
-               $utf_len_mask = array(
-                       /**
-                       * Leading bytes masks
-                       */
-                       "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
-
-                       /**
-                       * Trailing bytes masks
-                       */
-                       "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
-               );
-
-               /**
-               * Some extra checks are triggered on the first byte of a UTF sequence
-               */
-               $extra_check = array(
-                       "\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
-                       "\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
-                       "\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
-               );
-
-               /**
-               * These masks are used to check if a UTF sequence is well formed.
-               * Here are the only 3 lengths we acknowledge:
-               *   - 2-byte: 110? ???? 10?? ????
-               *   - 3-byte: 1110 ???? 10?? ???? 10?? ????
-               *   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
-               *
-               * Note that 5- and 6- byte sequences are automatically discarded
-               */
-               $utf_validation_mask = array(
-                       2       =>      "\xE0\xC0",
-                       3       =>      "\xF0\xC0\xC0",
-                       4       =>      "\xF8\xC0\xC0\xC0"
-               );
-               $utf_validation_check = array(
-                       2       =>      "\xC0\x80",
-                       3       =>      "\xE0\x80\x80",
-                       4       =>      "\xF0\x80\x80\x80"
-               );
-
-               $tmp = '';
-               $starter_pos = $pos;
-               $tmp_pos = $last_cc = $sort = $dump = 0;
-               $utf_sort = array();
-
-
-               ////////////////////////////////////////////////////////////////////////
-               //                             Main loop                              //
-               ////////////////////////////////////////////////////////////////////////
-
-               do {
-                       ////////////////////////////////////////////////////////////////////
-                       //                STEP 0: Capture the current char                //
-                       ////////////////////////////////////////////////////////////////////
+        * @param $string string
+        * @return string
+        * @private
+        */
+       static function NFC( $string ) {
+               return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
+       }
 
-                       $cur_mask = $str[$pos] & "\xF0";
-                       if( isset( $utf_len_mask[$cur_mask] ) ) {
-                               if( $utf_len = $utf_len_mask[$cur_mask] ) {
-                                       /**
-                                       * Multibyte char
-                                       */
-                                       $utf_char = substr( $str, $pos, $utf_len );
-                                       $pos += $utf_len;
-                               } else {
-                                       /**
-                                       * A trailing byte came out of nowhere, we will treat it and all
-                                       * following trailing bytes as if each of them was a Unicode
-                                       * replacement char and we will advance the cursor
-                                       */
-                                       $spn = strspn( $str, UTF8_TRAILING_BYTES, $pos );
+       /**
+        * @param $string string
+        * @return string
+        * @private
+        */
+       static function NFD( $string ) {
+               UtfNormal::loadData();
+
+               return UtfNormal::fastCombiningSort(
+                       UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp ) );
+       }
 
-                                       if( $dump ) {
-                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
+       /**
+        * @param $string string
+        * @return string
+        * @private
+        */
+       static function NFKC( $string ) {
+               return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
+       }
 
-                                               /**
-                                               * Dump combiners
-                                               */
-                                               if( !empty( $utf_sort ) ) {
-                                                       if( $sort ) {
-                                                               ksort( $utf_sort );
-                                                       }
+       /**
+        * @param $string string
+        * @return string
+        * @private
+        */
+       static function NFKD( $string ) {
+               if( !isset( self::$utfCompatibilityDecomp ) ) {
+                       require_once( 'UtfNormalDataK.inc' );
+               }
+               return self::fastCombiningSort(
+                       self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
+       }
 
-                                                       foreach( $utf_sort as $utf_chars ) {
-                                                               $tmp .= implode( '', $utf_chars );
-                                                       }
-                                               }
 
-                                               $tmp .= str_repeat( UTF8_REPLACEMENT, $spn );
-                                               $dump = $sort = 0;
-                                       } else {
-                                               $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . str_repeat( UTF8_REPLACEMENT, $spn );
+       /**
+        * Perform decomposition of a UTF-8 string into either D or KD form
+        * (depending on which decomposition map is passed to us).
+        * Input is assumed to be *valid* UTF-8. Invalid code will break.
+        * @private
+        * @param $string String: valid UTF-8 string
+        * @param $map Array: hash of expanded decomposition map
+        * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
+        */
+       static function fastDecompose( $string, $map ) {
+               UtfNormal::loadData();
+               $len = strlen( $string );
+               $out = '';
+               for( $i = 0; $i < $len; $i++ ) {
+                       $c = $string[$i];
+                       $n = ord( $c );
+                       if( $n < 0x80 ) {
+                               # ASCII chars never decompose
+                               # THEY ARE IMMORTAL
+                               $out .= $c;
+                               continue;
+                       } elseif( $n >= 0xf0 ) {
+                               $c = substr( $string, $i, 4 );
+                               $i += 3;
+                       } elseif( $n >= 0xe0 ) {
+                               $c = substr( $string, $i, 3 );
+                               $i += 2;
+                       } elseif( $n >= 0xc0 ) {
+                               $c = substr( $string, $i, 2 );
+                               $i++;
+                       }
+                       if( isset( $map[$c] ) ) {
+                               $out .= $map[$c];
+                               continue;
+                       } else {
+                               if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
+                                       # Decompose a hangul syllable into jamo;
+                                       # hardcoded for three-byte UTF-8 sequence.
+                                       # A lookup table would be slightly faster,
+                                       # but adds a lot of memory & disk needs.
+                                       #
+                                       $index = ( (ord( $c[0] ) & 0x0f) << 12
+                                                | (ord( $c[1] ) & 0x3f) <<  6
+                                                | (ord( $c[2] ) & 0x3f) )
+                                              - UNICODE_HANGUL_FIRST;
+                                       $l = intval( $index / UNICODE_HANGUL_NCOUNT );
+                                       $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
+                                       $t = $index % UNICODE_HANGUL_TCOUNT;
+                                       $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
+                                       if( $t >= 25 ) {
+                                               $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
+                                       } elseif( $t ) {
+                                               $out .= "\xe1\x86" . chr( 0xa7 + $t );
                                        }
-
-                                       $pos += $spn;
-                                       $tmp_pos = $starter_pos = $pos;
-
-                                       $utf_sort = array();
-                                       $last_cc = 0;
-
                                        continue;
                                }
+                       }
+                       $out .= $c;
+               }
+               return $out;
+       }
 
-
-                               ////////////////////////////////////////////////////////////////////
-                               //          STEP 1: Decide what to do with current char           //
-                               ////////////////////////////////////////////////////////////////////
-
-                               /**
-                               * Now, in that order:
-                               *  - check if that character is decomposable
-                               *  - check if that character is a non-starter
-                               *  - check if that character requires extra checks to be performed
-                               */
-                               if( isset( $decomp_map[$utf_char] ) ) {
-                                       /**
-                                       * Decompose the char
-                                       */
-                                       $_pos = 0;
-                                       $_len = strlen( $decomp_map[$utf_char] );
-
-                                       do {
-                                               $c = $decomp_map[$utf_char][$_pos];
-                                               $_utf_len =& $utf_len_mask[$c & "\xF0"];
-
-                                               if( isset( $_utf_len ) ) {
-                                                       $_utf_char = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
-                                                       $_pos += $_utf_len;
-
-                                                       if( isset( $utfCombiningClass[$_utf_char] ) ) {
-                                                               /**
-                                                               * The character decomposed to a non-starter, buffer it for sorting
-                                                               */
-                                                               $utf_sort[$utfCombiningClass[$_utf_char]][] = $_utf_char;
-
-                                                               if( $utfCombiningClass[$_utf_char] < $last_cc ) {
-                                                                       /**
-                                                                       * Not canonically ordered, will require sorting
-                                                                       */
-                                                                       $sort = $dump = 1;
-                                                               } else {
-                                                                       $dump = 1;
-                                                                       $last_cc = $utfCombiningClass[$_utf_char];
-                                                               }
-                                                       } else {
-                                                               /**
-                                                               * This character decomposition contains a starter,
-                                                               * dump the buffer and continue
-                                                               */
-                                                               if( $dump ) {
-                                                                       $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                       /**
-                                                                       * Dump combiners
-                                                                       */
-                                                                       if( !empty( $utf_sort ) ) {
-                                                                               if( $sort ) {
-                                                                                       ksort( $utf_sort );
-                                                                               }
-
-                                                                               foreach( $utf_sort as $utf_chars ) {
-                                                                                       $tmp .= implode( '', $utf_chars );
-                                                                               }
-                                                                       }
-
-                                                                       $tmp .= $_utf_char;
-                                                                       $dump = $sort = 0;
-                                                               } else {
-                                                                       $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos ) . $_utf_char;
-                                                               }
-
-                                                               $tmp_pos = $starter_pos = $pos;
-                                                               $utf_sort = array();
-                                                               $last_cc = 0;
-                                                       }
-                                               } else {
-                                                       /**
-                                                       * This character decomposition contains an ASCII char,
-                                                       * which is a starter. Dump the buffer and continue
-                                                       */
-                                                       ++$_pos;
-                                                       if( $dump ) {
-                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                               /**
-                                                               * Dump combiners
-                                                               */
-                                                               if( !empty( $utf_sort ) ) {
-                                                                       if( $sort ) {
-                                                                               ksort( $utf_sort );
-                                                                       }
-
-                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                               $tmp .= implode( '', $utf_chars );
-                                                                       }
-                                                               }
-
-                                                               $tmp .= $c;
-                                                               $dump = $sort = 0;
-                                                       } else {
-                                                               $tmp .= substr( $str, $tmp_pos, $pos - $utf_len - $tmp_pos ) . $c;
-                                                       }
-
-                                                       $tmp_pos = $starter_pos = $pos;
-                                                       $utf_sort = array();
-                                                       $last_cc = 0;
-                                               }
-                                       }
-                                       while( $_pos < $_len );
-                               } elseif( isset( $utfCombiningClass[$utf_char] ) ) {
-                                       /**
-                                       * Combining character
-                                       */
-                                       if( $utfCombiningClass[$utf_char] < $last_cc ) {
-                                               /**
-                                               * Not in canonical order
-                                               */
-                                               $sort = $dump = 1;
+       /**
+        * Sorts combining characters into canonical order. This is the
+        * final step in creating decomposed normal forms D and KD.
+        * @private
+        * @param $string String: a valid, decomposed UTF-8 string. Input is not validated.
+        * @return string a UTF-8 string with combining characters sorted in canonical order
+        */
+       static function fastCombiningSort( $string ) {
+               UtfNormal::loadData();
+               $len = strlen( $string );
+               $out = '';
+               $combiners = array();
+               $lastClass = -1;
+               for( $i = 0; $i < $len; $i++ ) {
+                       $c = $string[$i];
+                       $n = ord( $c );
+                       if( $n >= 0x80 ) {
+                               if( $n >= 0xf0 ) {
+                                       $c = substr( $string, $i, 4 );
+                                       $i += 3;
+                               } elseif( $n >= 0xe0 ) {
+                                       $c = substr( $string, $i, 3 );
+                                       $i += 2;
+                               } elseif( $n >= 0xc0 ) {
+                                       $c = substr( $string, $i, 2 );
+                                       $i++;
+                               }
+                               if( isset( self::$utfCombiningClass[$c] ) ) {
+                                       $lastClass = self::$utfCombiningClass[$c];
+                                       if( isset( $combiners[$lastClass] ) ) {
+                                               $combiners[$lastClass] .= $c;
                                        } else {
-                                               $last_cc = $utfCombiningClass[$utf_char];
+                                               $combiners[$lastClass] = $c;
                                        }
+                                       continue;
+                               }
+                       }
+                       if( $lastClass ) {
+                               ksort( $combiners );
+                               $out .= implode( '', $combiners );
+                               $combiners = array();
+                       }
+                       $out .= $c;
+                       $lastClass = 0;
+               }
+               if( $lastClass ) {
+                       ksort( $combiners );
+                       $out .= implode( '', $combiners );
+               }
+               return $out;
+       }
 
-                                       $utf_sort[$utfCombiningClass[$utf_char]][] = $utf_char;
-                               } else {
-                                       /**
-                                       * Non-decomposable starter, check out if it's a Hangul syllable
-                                       */
-                                       if( $utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST ) {
-                                               /**
-                                               * Nope, regular UTF char, check that we have the correct number of trailing bytes
-                                               */
-                                               if( ( $utf_char & $utf_validation_mask[$utf_len] ) != $utf_validation_check[$utf_len] ) {
-                                                       /**
-                                                       * Current char isn't well-formed or legal: either one or
-                                                       * several trailing bytes are missing, or the Unicode char
-                                                       * has been encoded in a five- or six- byte sequence
-                                                       */
-                                                       if( $utf_char[0] >= "\xF8" ) {
-                                                               if( $utf_char[0] < "\xF8" ) {
-                                                                       $trailing_bytes = 3;
-                                                               } elseif( $utf_char[0] < "\xFC" ) {
-                                                                       $trailing_bytes = 4;
-                                                               }
-                                                               if( $utf_char[0] > "\xFD" ) {
-                                                                       $trailing_bytes = 0;
-                                                               } else {
-                                                                       $trailing_bytes = 5;
-                                                               }
-                                                       } else {
-                                                               $trailing_bytes = $utf_len - 1;
-                                                       }
-
-                                                       /**
-                                                       * Move the cursor back to its original position then advance
-                                                       * it to the position it should be at
-                                                       */
-                                                       $pos -= $utf_len;
-                                                       $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                       if( !empty( $utf_sort ) ) {
-                                                               ksort( $utf_sort );
-
-                                                               foreach( $utf_sort as $utf_chars ) {
-                                                                       $tmp .= implode( '', $utf_chars );
-                                                               }
-                                                               $utf_sort = array();
-                                                       }
-
-                                                       $tmp .= UTF8_REPLACEMENT;
-                                                       $dump = $sort = 0;
-
-                                                       $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
-                                                       $tmp_pos = $pos;
-                                                       continue;
-                                               }
-
-                                               if( isset( $extra_check[$utf_char[0]] ) ) {
-                                                       switch( $utf_char[0] ) {
-                                                               /**
-                                                               * Note: 0xED is quite common in Korean
-                                                               */
-                                                               case "\xED":
-                                                                       if( $utf_char >= "\xED\xA0\x80" ) {
-                                                                               /**
-                                                                               * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
-                                                                               * (UTF sequence 0xEDA080..0xEDBFBF)
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                               if( !empty( $utf_sort ) ) {
-                                                                                       ksort( $utf_sort );
-
-                                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                                               $tmp .= implode( '', $utf_chars );
-                                                                                       }
-                                                                                       $utf_sort = array();
-                                                                               }
-
-                                                                               $tmp .= UTF8_REPLACEMENT;
-                                                                               $dump = $sort = 0;
-
-                                                                               $tmp_pos = $starter_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               /**
-                                                               * Note: 0xEF is quite common in Japanese
-                                                               */
-                                                               case "\xEF":
-                                                                       if( $utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF" ) {
-                                                                               /**
-                                                                               * 0xFFFE and 0xFFFF are explicitly disallowed
-                                                                               * (UTF sequence 0xEFBFBE..0xEFBFBF)
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                               if( !empty( $utf_sort ) ) {
-                                                                                       ksort( $utf_sort );
-
-                                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                                               $tmp .= implode( '', $utf_chars );
-                                                                                       }
-                                                                                       $utf_sort = array();
-                                                                               }
-
-                                                                               $tmp .= UTF8_REPLACEMENT;
-                                                                               $dump = $sort = 0;
-
-                                                                               $tmp_pos = $starter_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               case "\xC0":
-                                                               case "\xC1":
-                                                                       if( $utf_char <= "\xC1\xBF" ) {
-                                                                               /**
-                                                                               * Overlong sequence: Unicode char 0x00..0x7F encoded as a
-                                                                               * double-byte UTF char
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                               if( !empty( $utf_sort ) ) {
-                                                                                       ksort( $utf_sort );
-
-                                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                                               $tmp .= implode( '', $utf_chars );
-                                                                                       }
-                                                                                       $utf_sort = array();
-                                                                               }
-
-                                                                               $tmp .= UTF8_REPLACEMENT;
-                                                                               $dump = $sort = 0;
-
-                                                                               $tmp_pos = $starter_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               case "\xE0":
-                                                                       if( $utf_char <= "\xE0\x9F\xBF" ) {
-                                                                               /**
-                                                                               * Unicode char 0x0000..0x07FF encoded in 3 bytes
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                               if( !empty( $utf_sort ) ) {
-                                                                                       ksort( $utf_sort );
-
-                                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                                               $tmp .= implode( '', $utf_chars );
-                                                                                       }
-                                                                                       $utf_sort = array();
-                                                                               }
-
-                                                                               $tmp .= UTF8_REPLACEMENT;
-                                                                               $dump = $sort = 0;
-
-                                                                               $tmp_pos = $starter_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               case "\xF0":
-                                                                       if( $utf_char <= "\xF0\x8F\xBF\xBF" ) {
-                                                                               /**
-                                                                               * Unicode char 0x0000..0xFFFF encoded in 4 bytes
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                               if( !empty( $utf_sort ) ) {
-                                                                                       ksort( $utf_sort );
-
-                                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                                               $tmp .= implode( '', $utf_chars );
-                                                                                       }
-                                                                                       $utf_sort = array();
-                                                                               }
-
-                                                                               $tmp .= UTF8_REPLACEMENT;
-                                                                               $dump = $sort = 0;
-
-                                                                               $tmp_pos = $starter_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                                       break;
-
-                                                               default:
-                                                                       if( $utf_char > UTF8_MAX ) {
-                                                                               /**
-                                                                               * Out of the Unicode range
-                                                                               */
-                                                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                                                               if( !empty( $utf_sort ) ) {
-                                                                                       ksort( $utf_sort );
-
-                                                                                       foreach( $utf_sort as $utf_chars ) {
-                                                                                               $tmp .= implode( '', $utf_chars );
-                                                                                       }
-                                                                                       $utf_sort = array();
-                                                                               }
-
-                                                                               $tmp .= UTF8_REPLACEMENT;
-                                                                               $dump = $sort = 0;
-
-                                                                               $tmp_pos = $starter_pos = $pos;
-                                                                               continue 2;
-                                                                       }
-                                                       }
-                                               }
+       /**
+        * Produces canonically composed sequences, i.e. normal form C or KC.
+        *
+        * @private
+        * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
+        * @return string a UTF-8 string with canonical precomposed characters used where possible
+        */
+       static function fastCompose( $string ) {
+               UtfNormal::loadData();
+               $len = strlen( $string );
+               $out = '';
+               $lastClass = -1;
+               $lastHangul = 0;
+               $startChar = '';
+               $combining = '';
+               $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
+               $x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
+               for( $i = 0; $i < $len; $i++ ) {
+                       $c = $string[$i];
+                       $n = ord( $c );
+                       if( $n < 0x80 ) {
+                               # No combining characters here...
+                               $out .= $startChar;
+                               $out .= $combining;
+                               $startChar = $c;
+                               $combining = '';
+                               $lastClass = 0;
+                               continue;
+                       } elseif( $n >= 0xf0 ) {
+                               $c = substr( $string, $i, 4 );
+                               $i += 3;
+                       } elseif( $n >= 0xe0 ) {
+                               $c = substr( $string, $i, 3 );
+                               $i += 2;
+                       } elseif( $n >= 0xc0 ) {
+                               $c = substr( $string, $i, 2 );
+                               $i++;
+                       }
+                       $pair = $startChar . $c;
+                       if( $n > 0x80 ) {
+                               if( isset( self::$utfCombiningClass[$c] ) ) {
+                                       # A combining char; see what we can do with it
+                                       $class = self::$utfCombiningClass[$c];
+                                       if( !empty( $startChar ) &&
+                                               $lastClass < $class &&
+                                               $class > 0 &&
+                                               isset( self::$utfCanonicalComp[$pair] ) ) {
+                                               $startChar = self::$utfCanonicalComp[$pair];
+                                               $class = 0;
                                        } else {
-                                               /**
-                                               * Hangul syllable
-                                               */
-                                               $idx = ( ( ( ord( $utf_char[0] ) & 0x0F ) << 12 ) | ( ( ord( $utf_char[1] ) & 0x3F ) << 6 ) | ( ord( $utf_char[2] ) & 0x3F ) ) - UNICODE_HANGUL_SBASE;
-
-                                               /**
-                                               * LIndex can only range from 0 to 18, therefore it cannot influence
-                                               * the first two bytes of the L Jamo, which allows us to hardcode
-                                               * them (based on LBase).
-                                               *
-                                               * The same goes for VIndex, but for TIndex there's a catch: the value
-                                               * of the third byte could exceed 0xBF and we would have to increment
-                                               * the second byte
-                                               */
-                                               if( $tIndex = $idx % UNICODE_HANGUL_TCOUNT ) {
-                                                       if( $tIndex < 25 ) {
-                                                               $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
-                                                               $utf_char[8] = chr( 0xA7 + $tIndex );
-                                                       } else {
-                                                               $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
-                                                               $utf_char[8] = chr( 0x67 + $tIndex );
-                                                       }
-                                               } else {
-                                                       $utf_char = "\xE1\x84\x00\xE1\x85\x00";
-                                               }
-
-                                               $utf_char[2] = chr( 0x80 + ( int ) ( $idx / UNICODE_HANGUL_NCOUNT ) );
-                                               $utf_char[5] = chr( 0xA1 + ( int ) ( ( $idx % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT ) );
-
-
-                                               /**
-                                               * Just like other decompositions, the resulting Jamos must
-                                               * be dumped to the tmp string
-                                               */
-                                               $dump = 1;
+                                               $combining .= $c;
                                        }
-
-                                       /**
-                                       * Do we need to dump stuff to the tmp string?
-                                       */
-                                       if( $dump ) {
-                                               $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                               /**
-                                               * Dump combiners
-                                               */
-                                               if( !empty( $utf_sort ) ) {
-                                                       if( $sort ) {
-                                                               ksort( $utf_sort );
-                                                       }
-
-                                                       foreach( $utf_sort as $utf_chars ) {
-                                                               $tmp .= implode( '', $utf_chars );
+                                       $lastClass = $class;
+                                       $lastHangul = 0;
+                                       continue;
+                               }
+                       }
+                       # New start char
+                       if( $lastClass == 0 ) {
+                               if( isset( self::$utfCanonicalComp[$pair] ) ) {
+                                       $startChar = self::$utfCanonicalComp[$pair];
+                                       $lastHangul = 0;
+                                       continue;
+                               }
+                               if( $n >= $x1 && $n <= $x2 ) {
+                                       # WARNING: Hangul code is painfully slow.
+                                       # I apologize for this ugly, ugly code; however
+                                       # performance is even more teh suck if we call
+                                       # out to nice clean functions. Lookup tables are
+                                       # marginally faster, but require a lot of space.
+                                       #
+                                       if( $c >= UTF8_HANGUL_VBASE &&
+                                               $c <= UTF8_HANGUL_VEND &&
+                                               $startChar >= UTF8_HANGUL_LBASE &&
+                                               $startChar <= UTF8_HANGUL_LEND ) {
+                                               #
+                                               #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
+                                               #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
+                                               $lIndex = ord( $startChar[2] ) - 0x80;
+                                               $vIndex = ord( $c[2]         ) - 0xa1;
+
+                                               $hangulPoint = UNICODE_HANGUL_FIRST +
+                                                       UNICODE_HANGUL_TCOUNT *
+                                                       (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
+
+                                               # Hardcode the limited-range UTF-8 conversion:
+                                               $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
+                                                                        chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
+                                                                        chr( $hangulPoint       & 0x3f | 0x80 );
+                                               $lastHangul = 0;
+                                               continue;
+                                       } elseif( $c >= UTF8_HANGUL_TBASE &&
+                                                         $c <= UTF8_HANGUL_TEND &&
+                                                         $startChar >= UTF8_HANGUL_FIRST &&
+                                                         $startChar <= UTF8_HANGUL_LAST &&
+                                                         !$lastHangul ) {
+                                               # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
+                                               $tIndex = ord( $c[2] ) - 0xa7;
+                                               if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);
+
+                                               # Increment the code point by $tIndex, without
+                                               # the function overhead of decoding and recoding UTF-8
+                                               #
+                                               $tail = ord( $startChar[2] ) + $tIndex;
+                                               if( $tail > 0xbf ) {
+                                                       $tail -= 0x40;
+                                                       $mid = ord( $startChar[1] ) + 1;
+                                                       if( $mid > 0xbf ) {
+                                                               $startChar[0] = chr( ord( $startChar[0] ) + 1 );
+                                                               $mid -= 0x40;
                                                        }
+                                                       $startChar[1] = chr( $mid );
                                                }
+                                               $startChar[2] = chr( $tail );
 
-                                               $tmp .= $utf_char;
-                                               $dump = $sort = 0;
-                                               $tmp_pos = $pos;
+                                               # If there's another jamo char after this, *don't* try to merge it.
+                                               $lastHangul = 1;
+                                               continue;
                                        }
-
-                                       $last_cc = 0;
-                                       $utf_sort = array();
-                                       $starter_pos = $pos;
                                }
-                       } else {
-                               /**
-                               * ASCII char, which happens to be a starter (as any other ASCII char)
-                               */
-                               if( $dump ) {
-                                       $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                                       /**
-                                       * Dump combiners
-                                       */
-                                       if( !empty( $utf_sort ) ) {
-                                               if( $sort ) {
-                                                       ksort( $utf_sort );
-                                               }
-
-                                               foreach( $utf_sort as $utf_chars ) {
-                                                       $tmp .= implode( '', $utf_chars );
-                                               }
-                                       }
-
-                                       $tmp .= $str[$pos];
-                                       $dump = $sort = 0;
-                                       $tmp_pos = ++$pos;
-
-                                       $pos += strspn( $str, UTF8_ASCII_RANGE, $pos );
-                               } else {
-                                       $pos += strspn( $str, UTF8_ASCII_RANGE, ++$pos );
-                               }
-
-                               $last_cc = 0;
-                               $utf_sort = array();
-                               $starter_pos = $pos;
                        }
+                       $out .= $startChar;
+                       $out .= $combining;
+                       $startChar = $c;
+                       $combining = '';
+                       $lastClass = 0;
+                       $lastHangul = 0;
                }
-               while( $pos < $len );
-
-               /**
-               * Now is time to return the string
-               */
-               if( $dump ) {
-                       $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
-
-                       /**
-                       * Dump combiners
-                       */
-                       if( !empty( $utf_sort ) ) {
-                               if( $sort ) {
-                                       ksort( $utf_sort );
-                               }
-
-                               foreach( $utf_sort as $utf_chars ) {
-                                       $tmp .= implode( '', $utf_chars );
-                               }
-                       }
-
-                       return $tmp;
+               $out .= $startChar . $combining;
+               return $out;
+       }
 
-               } elseif( $tmp_pos ) {
-                       /**
-                       * If the $tmp_pos cursor was moved then at least one character was not in
-                       * normal form. Replace $str with the fixed version
-                       */
-                       if( $tmp_pos == $len ) {
-                               /**
-                               * The $tmp_pos cursor is at the end of $str, therefore $tmp holds
-                               * the whole $str
-                               */
-                               return $tmp;
-                       } else {
-                               /**
-                               * The rightmost chunk of $str has not been appended to $tmp yet
-                               */
-                               return $tmp . substr( $str, $tmp_pos );
-                       }
+       /**
+        * This is just used for the benchmark, comparing how long it takes to
+        * interate through a string without really doing anything of substance.
+        * @param $string string
+        * @return string
+        */
+       static function placebo( $string ) {
+               $len = strlen( $string );
+               $out = '';
+               for( $i = 0; $i < $len; $i++ ) {
+                       $out .= $string[$i];
                }
-
-               /**
-               * The string was already in normal form
-               */
-               return $str;
+               return $out;
+       }
+       /**
+        * Function to replace some characters that we don't want
+        * but most of the native normalize functions keep.
+        *
+        * @param $string String The string
+        * @return String String with the character codes replaced.
+        */
+       private static function replaceForNativeNormalize( $string ) { 
+               $string = preg_replace(
+                       '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+                       UTF8_REPLACEMENT,
+                       $string );
+               $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+               $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
+               return $string;
        }
-}
-
 }