Fix composition bug: completed hangul syllable should not be merged with another...
author Brion Vibber <brion@users.mediawiki.org>
Mon, 15 Nov 2004 00:59:40 +0000 (00:59 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Mon, 15 Nov 2004 00:59:40 +0000 (00:59 +0000)
includes/normal/CleanUpTest.php
includes/normal/UtfNormal.php

index e9156ab..219cc57 100644 (file)
@@ -1,4 +1,36 @@
 <?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or 
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * Additional tests for UtfNormal::cleanUp() function, inclusion
+ * regression checks for known problems.
+ *
+ * Requires PHPUnit.
+ * 
+ * @package UtfNormal
+ * @access private
+ */
+
+if( php_sapi_name() != 'cli' ) { # Guard: abort unless PHP reports the 'cli' SAPI.
+       die( "Run me from the command line please.\n" );
+}
+
 /** */
 if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
        dl( 'php_utfnormal.so' );
@@ -338,6 +370,15 @@ class CleanUpTest extends PHPUnit_TestCase {
                        bin2hex( $expect ),
                        bin2hex( UtfNormal::cleanUp( $text ) ) );
        }
+       
+       function testHangulRegression() { # Regression check: cleanUp() must not fold an extra final jamo into a completed syllable.
+               $text = "\xed\x9c\xaf" . # U+D72F: precomposed Hangul syllable that already has a final consonant
+                               "\xe1\x87\x81";  # U+11C1: a further final (trailing) jamo
+               $expect = $text;         # Should *not* change: the output must equal the input byte-for-byte.
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 }
 
 
@@ -345,4 +386,8 @@ $suite =& new PHPUnit_TestSuite( 'CleanUpTest' );
 $result = PHPUnit::run( $suite );
 echo $result->toString();
 
+if( !$result->wasSuccessful() ) {
+       exit( -1 ); # Report failure to the invoking process with a non-zero exit status.
+}
+exit( 0 ); # Reached only when the suite result reports success.
 ?>
\ No newline at end of file
index 62461d6..55f420e 100644 (file)
@@ -652,6 +652,7 @@ class UtfNormal {
                $len = strlen( $string );
                $out = '';
                $lastClass = -1;
+               $lastHangul = 0;
                $startChar = '';
                $combining = '';
                $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
@@ -692,6 +693,7 @@ class UtfNormal {
                                                $combining .= $c;
                                        }
                                        $lastClass = $class;
+                                       $lastHangul = 0;
                                        continue;
                                }
                        }
@@ -699,6 +701,7 @@ class UtfNormal {
                        if( $lastClass == 0 ) {
                                if( isset( $utfCanonicalComp[$pair] ) ) {
                                        $startChar = $utfCanonicalComp[$pair];
+                                       $lastHangul = 0;
                                        continue;
                                }
                                if( $n >= $x1 && $n <= $x2 ) {
@@ -726,11 +729,13 @@ class UtfNormal {
                                                $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
                                                                         chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
                                                                         chr( $hangulPoint       & 0x3f | 0x80 );
+                                               $lastHangul = 0;
                                                continue;
                                        } elseif( $c >= UTF8_HANGUL_TBASE &&
                                                          $c <= UTF8_HANGUL_TEND &&
                                                          $startChar >= UTF8_HANGUL_FIRST &&
-                                                         $startChar <= UTF8_HANGUL_LAST ) {
+                                                         $startChar <= UTF8_HANGUL_LAST &&
+                                                         !$lastHangul ) {
                                                # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                                                $tIndex = ord( $c{2} ) - 0xa7;
                                                if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7);
@@ -749,6 +754,9 @@ class UtfNormal {
                                                        $startChar{1} = chr( $mid );
                                                }
                                                $startChar{2} = chr( $tail );
+                                               
+                                               # If there's another jamo char after this, *don't* try to merge it.
+                                               $lastHangul = 1;
                                                continue;
                                        }
                                }
@@ -758,6 +766,7 @@ class UtfNormal {
                        $startChar = $c;
                        $combining = '';
                        $lastClass = 0;
+                       $lastHangul = 0;
                }
                $out .= $startChar . $combining;
                return $out;