useless call
[lhc/web/wiklou.git] / includes / normal / UtfNormal.php
index 7dd9072..c6183b5 100644 (file)
@@ -126,10 +126,24 @@ class UtfNormal {
         * @return string a clean, shiny, normalized UTF-8 string
         */
        function cleanUp( $string ) {
-               if( UtfNormal::quickIsNFCVerify( $string ) )
+               if( NORMALIZE_ICU ) {
+                       # Replace chars we reject but ICU would accept: C0 controls except tab/LF/CR, plus UTF8_FFFE and UTF8_FFFF.
+                       $string = preg_replace(
+                               '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+                               UTF8_REPLACEMENT,
+                               $string );
+                       $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+                       $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
+                       
+                       # The UnicodeString constructor fails if the string ends with a
+                       # head byte, so append a junk char here and strip it off afterwards.
+                       return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+               } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+                       # Side effect of quickIsNFCVerify() -- $string has had UTF-8 errors cleaned up.
                        return $string;
-               else
+               } else {
                        return UtfNormal::NFC( $string );
+               }
        }
 
        /**
@@ -380,6 +394,7 @@ class UtfNormal {
                                                                $replace[] = array( UTF8_REPLACEMENT,
                                                                             $base + $i + 1 - strlen( $sequence ),
                                                                             strlen( $sequence ) );
+                                                               $head = '';
                                                                continue;
                                                        }
                                                } else {
@@ -408,6 +423,7 @@ class UtfNormal {
                                                                $replace[] = array( UTF8_REPLACEMENT,
                                                                                    $base + $i + 1 - strlen( $sequence ), 
                                                                                    strlen( $sequence ) );
+                                                               $head = '';
                                                                continue;
                                                        }
                                                }
@@ -424,6 +440,7 @@ class UtfNormal {
                                        $head = '';
                                } elseif( $c < "\x80" ) {
                                        # ASCII byte.
+                                       $head = '';
                                } elseif( $c < "\xc0" ) {
                                        # Illegal tail bytes
                                        if( $head == '' ) {
@@ -438,6 +455,7 @@ class UtfNormal {
                                } else {
                                        # Miscellaneous freaks.
                                        $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+                                       $head = '';
                                }
                        }
                        $base += $chunk;
@@ -555,8 +573,8 @@ class UtfNormal {
                                                 | (ord( $c{1} ) & 0x3f) <<  6
                                                 | (ord( $c{2} ) & 0x3f) )
                                               - UNICODE_HANGUL_FIRST;
-                                       $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
-                                       $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
+                                       $l = intval( $index / UNICODE_HANGUL_NCOUNT );
+                                       $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
                                        $t = $index % UNICODE_HANGUL_TCOUNT;
                                        $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
                                        if( $t >= 25 ) {
@@ -634,6 +652,7 @@ class UtfNormal {
                $len = strlen( $string );
                $out = '';
                $lastClass = -1;
+               $lastHangul = 0;
                $startChar = '';
                $combining = '';
                $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
@@ -674,6 +693,7 @@ class UtfNormal {
                                                $combining .= $c;
                                        }
                                        $lastClass = $class;
+                                       $lastHangul = 0;
                                        continue;
                                }
                        }
@@ -681,6 +701,7 @@ class UtfNormal {
                        if( $lastClass == 0 ) {
                                if( isset( $utfCanonicalComp[$pair] ) ) {
                                        $startChar = $utfCanonicalComp[$pair];
+                                       $lastHangul = 0;
                                        continue;
                                }
                                if( $n >= $x1 && $n <= $x2 ) {
@@ -708,11 +729,13 @@ class UtfNormal {
                                                $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
                                                                         chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
                                                                         chr( $hangulPoint       & 0x3f | 0x80 );
+                                               $lastHangul = 0;
                                                continue;
                                        } elseif( $c >= UTF8_HANGUL_TBASE &&
                                                          $c <= UTF8_HANGUL_TEND &&
                                                          $startChar >= UTF8_HANGUL_FIRST &&
-                                                         $startChar <= UTF8_HANGUL_LAST ) {
+                                                         $startChar <= UTF8_HANGUL_LAST &&
+                                                         !$lastHangul ) {
                                                # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                                                $tIndex = ord( $c{2} ) - 0xa7;
                                                if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7);
@@ -731,6 +754,9 @@ class UtfNormal {
                                                        $startChar{1} = chr( $mid );
                                                }
                                                $startChar{2} = chr( $tail );
+                                               
+                                               # A trailing jamo was just merged into this syllable; flag it so a following jamo is *not* merged too.
+                                               $lastHangul = 1;
                                                continue;
                                        }
                                }
@@ -740,6 +766,7 @@ class UtfNormal {
                        $startChar = $c;
                        $combining = '';
                        $lastClass = 0;
+                       $lastHangul = 0;
                }
                $out .= $startChar . $combining;
                return $out;