- * Recompose a UTF string
- *
- * @param string $str Unchecked UTF string
- * @param integer $pos Position of the first UTF char (in bytes)
- * @param integer $len Length of the string (in bytes)
- * @param array $qc Quick-check array, passed by reference but never modified
- * @param array $decomp_map Decomposition mapping, passed by reference but never modified
- * @return string The string, validated and recomposed
- *
- * @access private
- */
- function recompose( $str, $pos, $len, &$qc, &$decomp_map ) {
- global $utfCombiningClass, $utfCanonicalComp, $utfJamoType, $utfJamoIndex;
-
- /**
- * Buffer the last ASCII char before the UTF-8 stuff if applicable
- */
- $tmp = '';
- $i = $tmp_pos = $last_cc = 0;
-
- if( $pos ) {
- $buffer = array(++$i => $str[$pos - 1] );
- } else {
- $buffer = array();
- }
-
- /**
- * UTF char length array
- *
- * This array is used to determine the length of a UTF character. Let $c be the
- * result of ($str[$pos] & "\xF0"), where $str is the string we're operating
- * on and $pos is the position of the cursor. If $utf_len_mask[$c] does not
- * exist, the byte is an ASCII char. Otherwise, if $utf_len_mask[$c] is greater
- * than 0, we have the leading byte of a multibyte character whose length is
- * $utf_len_mask[$c], and if it is equal to 0, the byte is a trailing byte.
- */
- $utf_len_mask = array(
- /**
- * Leading bytes masks
- */
- "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
-
- /**
- * Trailing bytes masks
- */
- "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
- );
-
- $extra_check = array(
- "\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
- "\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
- "\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
- );
-
- $utf_validation_mask = array(
- 2 => "\xE0\xC0",
- 3 => "\xF0\xC0\xC0",
- 4 => "\xF8\xC0\xC0\xC0"
- );
-
- $utf_validation_check = array(
- 2 => "\xC0\x80",
- 3 => "\xE0\x80\x80",
- 4 => "\xF0\x80\x80\x80"
- );
-
- ////////////////////////////////////////////////////////////////////////
- // Main loop //
- ////////////////////////////////////////////////////////////////////////
-
- do {
- ////////////////////////////////////////////////////////////////////
- // STEP 0: Capture the current char and buffer it //
- ////////////////////////////////////////////////////////////////////
-
- $c = $str[$pos];
- $c_mask = $c & "\xF0";
-
- if( isset( $utf_len_mask[$c_mask] ) ) {
- /**
- * Byte at $pos is either a leading byte or a misplaced trailing byte
- */
- if( $utf_len = $utf_len_mask[$c_mask] ) {
- /**
- * Capture the char
- */
- $buffer[++$i & 7] = $utf_char = substr( $str, $pos, $utf_len );
-
- /**
- * Let's find out if a thorough check is needed
- */
- if( isset( $qc[$utf_char] ) ) {
- /**
- * If the UTF char is in the qc array then it may not be in normal
- * form. We do nothing here, the actual processing is below this
- * "if" block
- */
- } elseif( isset( $utfCombiningClass[$utf_char] ) ) {
- if( $utfCombiningClass[$utf_char] < $last_cc ) {
- /**
- * A combining character that is NOT canonically ordered
- */
- } else {
- /**
- * A combining character that IS canonically ordered, skip
- * to the next char
- */
- $last_cc = $utfCombiningClass[$utf_char];
-
- $pos += $utf_len;
- continue;
- }
- } else {
- /**
- * At this point, $utf_char holds a UTF char that we know
- * is not a NF[K]C_QC and is not a combining character. It can
- * be a singleton, a canonical composite, a replacement char or
- * even an ill-formed bunch of bytes. Let's find out
- */
- $last_cc = 0;
-
- /**
- * Check that we have the correct number of trailing bytes
- */
- if( ( $utf_char & $utf_validation_mask[$utf_len] ) != $utf_validation_check[$utf_len] ) {
- /**
- * Current char isn't well-formed or legal: either one or
- * several trailing bytes are missing, or the Unicode char
- * has been encoded in a five- or six- byte sequence
- */
- if( $utf_char[0] >= "\xF8" ) {
- if( $utf_char[0] < "\xF8" ) {
- $trailing_bytes = 3;
- } elseif( $utf_char[0] < "\xFC" ) {
- $trailing_bytes = 4;
- }
- if( $utf_char[0] > "\xFD" ) {
- $trailing_bytes = 0;
- } else {
- $trailing_bytes = 5;
- }
- } else {
- $trailing_bytes = $utf_len - 1;
- }
-
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
- $tmp_pos = $pos;
-
- continue;
- }
-
- if( isset( $extra_check[$c] ) ) {
- switch( $c ) {
- /**
- * Note: 0xED is quite common in Korean
- */
- case "\xED":
- if( $utf_char >= "\xED\xA0\x80" ) {
- /**
- * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
- * (UTF sequence 0xEDA080..0xEDBFBF)
- */
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += $utf_len;
- $tmp_pos = $pos;
- continue 2;
- }
- break;
-
- /**
- * Note: 0xEF is quite common in Japanese
- */
- case "\xEF":
- if( $utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF" ) {
- /**
- * 0xFFFE and 0xFFFF are explicitly disallowed
- * (UTF sequence 0xEFBFBE..0xEFBFBF)
- */
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += $utf_len;
- $tmp_pos = $pos;
- continue 2;
- }
- break;
-
- case "\xC0":
- case "\xC1":
- if( $utf_char <= "\xC1\xBF" ) {
- /**
- * Overlong sequence: Unicode char 0x00..0x7F encoded as a
- * double-byte UTF char
- */
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += $utf_len;
- $tmp_pos = $pos;
- continue 2;
- }
- break;
-
- case "\xE0":
- if( $utf_char <= "\xE0\x9F\xBF" ) {
- /**
- * Unicode char 0x0000..0x07FF encoded in 3 bytes
- */
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += $utf_len;
- $tmp_pos = $pos;
- continue 2;
- }
- break;
-
- case "\xF0":
- if( $utf_char <= "\xF0\x8F\xBF\xBF" ) {
- /**
- * Unicode char 0x0000..0xFFFF encoded in 4 bytes
- */
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += $utf_len;
- $tmp_pos = $pos;
- continue 2;
- }
- break;
-
- default:
- /**
- * Five- and six-byte sequences do not need to be checked for here anymore
- */
- if( $utf_char > UTF8_MAX ) {
- /**
- * Out of the Unicode range
- */
- if( $utf_char[0] < "\xF8" ) {
- $trailing_bytes = 3;
- } elseif( $utf_char[0] < "\xFC" ) {
- $trailing_bytes = 4;
- } elseif( $utf_char[0] > "\xFD" ) {
- $trailing_bytes = 0;
- } else {
- $trailing_bytes = 5;
- }
-
- $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
- $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
- $tmp_pos = $pos;
- continue 2;
- }
- }
- }
-
- /**
- * The char is a valid starter, move the cursor and go on
- */
- $pos += $utf_len;
- continue;
- }
+ * Returns true if the string is _definitely_ in NFC.
+ * Returns false if not or uncertain.
+ * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
+ * @return bool
+ */
+ static function quickIsNFCVerify( &$string ) {
+ # Screen out some characters that, e.g., won't be allowed in XML
+ $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
+
+ # ASCII is always valid NFC!
+ # If we're only ever given plain ASCII, we can avoid the overhead
+ # of initializing the decomposition tables by skipping out early.
+ if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+
+ static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
+ if( !isset( $checkit ) ) {
+ # Load/build some scary lookup tables...
+ UtfNormal::loadData();
+
+ $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
+
+ # Head bytes for sequences which we should do further validity checks
+ $checkit = array_flip( array_map( 'chr',
+ array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
+
+ # Each UTF-8 head byte is followed by a certain
+ # number of tail bytes.
+ $tailBytes = array();
+ for( $n = 0; $n < 256; $n++ ) {
+ if( $n < 0xc0 ) {
+ $remaining = 0;
+ } elseif( $n < 0xe0 ) {
+ $remaining = 1;
+ } elseif( $n < 0xf0 ) {
+ $remaining = 2;
+ } elseif( $n < 0xf8 ) {
+ $remaining = 3;
+ } elseif( $n < 0xfc ) {
+ $remaining = 4;
+ } elseif( $n < 0xfe ) {
+ $remaining = 5;