Support using ICU to do most of the heavy lifting in cleanUp() if the extension is...
author Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
Modestly faster for roman text (1-2x), 16-20x faster than the PHP looping for already normalized Russian, Japanese, and Korean text.

includes/normal/CleanUpTest.php
includes/normal/Makefile
includes/normal/UtfNormal.php

index 0eb4058..badade6 100644 (file)
@@ -316,6 +316,20 @@ class CleanUpTest extends PHPUnit_TestCase {
                        bin2hex( $expect ),
                        bin2hex( UtfNormal::cleanUp( $text ) ) );
        }
+       
+       function testBomRegression() {
+               $text   = "\xef\xbf\xbe" . # U+FFFE, an illegal noncharacter (byte-swapped BOM)
+                         "\xb2" . # bad tail: continuation byte with no head byte before it
+                         "\xef" . # bad head: opens a 3-byte sequence but ASCII follows
+                         "\x59"; # ASCII "Y", expected to come through unchanged
+               $expect = "\xef\xbf\xbd" . # U+FFFD in place of U+FFFE
+                         "\xef\xbf\xbd" . # U+FFFD in place of the stray tail byte
+                         "\xef\xbf\xbd" . # U+FFFD in place of the dangling head byte
+                         "\x59";
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 }
 
 
index 1042e12..fcdf238 100644 (file)
@@ -1,3 +1,5 @@
+.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
+
 FETCH=wget
 #FETCH=fetch
 BASE=http://www.unicode.org/Public/UNIDATA
@@ -9,15 +11,26 @@ all : UtfNormalData.inc
 UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
        $(PHP) UtfNormalGenerate.php
 
-test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 testclean UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
        $(PHP) UtfNormalTest.php
 
 testutf8 : Utf8Test.php UTF-8-test.txt
        $(PHP) Utf8Test.php
 
+testclean : CleanUpTest.php
+       $(PHP) CleanUpTest.php
+
 bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
        $(PHP) UtfNormalBench.php
 
+icutest : UtfNormalData.inc NormalizationTest.txt
+       $(PHP) Utf8Test.php --icu
+       $(PHP) CleanUpTest.php --icu
+       $(PHP) UtfNormalTest.php --icu
+
+icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
+       $(PHP) UtfNormalBench.php --icu
+
 clean :
        rm -f UtfNormalData.inc
 
index 254e9c0..a4c095c 100644 (file)
@@ -126,10 +126,23 @@ class UtfNormal {
         * @return string a clean, shiny, normalized UTF-8 string
         */
        function cleanUp( $string ) {
-               if( UtfNormal::quickIsNFCVerify( $string ) )
+               if( NORMALIZE_ICU ) {
+                       # Replace C0 controls other than tab, LF and CR; ICU would keep them.
+                       $string = preg_replace(
+                               '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+                               UTF8_REPLACEMENT,
+                               $string );
+                       $str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+                       
+                       # UnicodeString constructor fails if the string ends with a head byte,
+                       # so append a junk \x01 and rtrim it; any input \x01 was replaced above.
+                       return rtrim( utf8_normalize( $str . "\x01", UNORM_NFC ), "\x01" );
+               } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+                       # Side effect -- $string has had UTF-8 errors cleaned up.
                        return $string;
-               else
+               } else {
                        return UtfNormal::NFC( $string );
+               }
        }
 
        /**