* @return string a clean, shiny, normalized UTF-8 string
*/
function cleanUp( $string ) {
- if( UtfNormal::quickIsNFCVerify( $string ) )
+ if( NORMALIZE_ICU ) {
+ # ICU path. Pre-filter what ICU would let through but we reject: control bytes \x00-\x1f other than \x09 (tab), \x0a (LF) and \x0d (CR), plus the UTF8_FFFE and UTF8_FFFF sequences; each match becomes UTF8_REPLACEMENT.
+ $string = preg_replace(
+ '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+ UTF8_REPLACEMENT,
+ $string );
+ $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+ $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
+
+ # The UnicodeString constructor fails if the string ends with a head (lead) byte, so append a "\x01" sentinel and strip it off again after normalizing.
+ # rtrim() removes every trailing "\x01", but the preg_replace above already replaced any "\x01" present in the input, so only the sentinel should be left (assumes utf8_normalize() emits none itself -- TODO confirm).
+ return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+ } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+ # Side effect of the check above: $string has had its UTF-8 errors cleaned up (per the original note; presumably it is passed by reference -- verify), so it can be returned as-is.
return $string;
- else
+ } else {
return UtfNormal::NFC( $string );
+ }
}
/**
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
+ $head = '';
continue;
}
} else {
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
+ $head = '';
continue;
}
}
$head = '';
} elseif( $c < "\x80" ) {
# ASCII byte.
+ $head = '';
} elseif( $c < "\xc0" ) {
# Illegal tail bytes
if( $head == '' ) {
} else {
# Miscellaneous freaks.
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+ $head = '';
}
}
$base += $chunk;
| (ord( $c{1} ) & 0x3f) << 6
| (ord( $c{2} ) & 0x3f) )
- UNICODE_HANGUL_FIRST;
- $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
- $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
+ $l = intval( $index / UNICODE_HANGUL_NCOUNT );
+ $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
$t = $index % UNICODE_HANGUL_TCOUNT;
$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
if( $t >= 25 ) {
$len = strlen( $string );
$out = '';
$lastClass = -1;
+ $lastHangul = 0;
$startChar = '';
$combining = '';
$x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
$combining .= $c;
}
$lastClass = $class;
+ $lastHangul = 0;
continue;
}
}
if( $lastClass == 0 ) {
if( isset( $utfCanonicalComp[$pair] ) ) {
$startChar = $utfCanonicalComp[$pair];
+ $lastHangul = 0;
continue;
}
if( $n >= $x1 && $n <= $x2 ) {
$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
chr( $hangulPoint & 0x3f | 0x80 );
+ $lastHangul = 0;
continue;
} elseif( $c >= UTF8_HANGUL_TBASE &&
$c <= UTF8_HANGUL_TEND &&
$startChar >= UTF8_HANGUL_FIRST &&
- $startChar <= UTF8_HANGUL_LAST ) {
+ $startChar <= UTF8_HANGUL_LAST &&
+ !$lastHangul ) {
# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
$tIndex = ord( $c{2} ) - 0xa7;
if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7);
$startChar{1} = chr( $mid );
}
$startChar{2} = chr( $tail );
+
+ # If there's another jamo char after this, *don't* try to merge it.
+ $lastHangul = 1;
continue;
}
}
$startChar = $c;
$combining = '';
$lastClass = 0;
+ $lastHangul = 0;
}
$out .= $startChar . $combining;
return $out;