Support using ICU to do most of the heavy lifting in cleanUp() if the extension is...

author Brion Vibber <brion@users.mediawiki.org>

Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)

committer Brion Vibber <brion@users.mediawiki.org>

Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
author Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php

index 0eb4058..badade6 100644 (file)
--- a/includes/normal/CleanUpTest.php
+++ b/includes/normal/CleanUpTest.php
@@ -316,6 +316,20 @@ class CleanUpTest extends PHPUnit_TestCase {
                         bin2hex( $expect ),
                         bin2hex( UtfNormal::cleanUp( $text ) ) );
         }
+       
+       function testBomRegression() {
+               $text   = "\xef\xbf\xbe" . # U+FFFE, illegal char
+                         "\xb2" . # bad tail
+                         "\xef" . # bad head
+                         "\x59";
+               $expect = "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\x59";
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
  }
  
  
diff --git a/includes/normal/Makefile b/includes/normal/Makefile

index 1042e12..fcdf238 100644 (file)
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -1,3 +1,5 @@
+.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
+
  FETCH=wget
  #FETCH=fetch
  BASE=http://www.unicode.org/Public/UNIDATA
@@ -9,15 +11,26 @@ all : UtfNormalData.inc
  UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
         $(PHP) UtfNormalGenerate.php
  
-test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 testclean UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
         $(PHP) UtfNormalTest.php
  
  testutf8 : Utf8Test.php UTF-8-test.txt
         $(PHP) Utf8Test.php
  
+testclean : CleanUpTest.php
+       $(PHP) CleanUpTest.php
+
  bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
         $(PHP) UtfNormalBench.php
  
+icutest : UtfNormalData.inc NormalizationTest.txt
+       $(PHP) Utf8Test.php --icu
+       $(PHP) CleanUpTest.php --icu
+       $(PHP) UtfNormalTest.php --icu
+
+icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
+       $(PHP) UtfNormalBench.php --icu
+
  clean :
         rm -f UtfNormalData.inc
  
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php

index 254e9c0..a4c095c 100644 (file)
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -126,10 +126,23 @@ class UtfNormal {
          * @return string a clean, shiny, normalized UTF-8 string
          */
         function cleanUp( $string ) {
-               if( UtfNormal::quickIsNFCVerify( $string ) )
+               if( NORMALIZE_ICU ) {
+                       # We exclude a few chars that ICU would not.
+                       $string = preg_replace(
+                               '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+                               UTF8_REPLACEMENT,
+                               $string );
+                       $str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+                       
+                       # UnicodeString constructor fails if the string ends with a
+                       # head byte. Add a junk char at the end, we'll strip it off.
+                       return rtrim( utf8_normalize( $str . "\x01", UNORM_NFC ), "\x01" );
+               } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+                       # Side effect -- $string has had UTF-8 errors cleaned up.
                         return $string;
-               else
+               } else {
                         return UtfNormal::NFC( $string );
+               }
         }
  
         /**
author	Brion Vibber <brion@users.mediawiki.org>
	Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
committer	Brion Vibber <brion@users.mediawiki.org>
	Sun, 14 Nov 2004 05:17:29 +0000 (05:17 +0000)
includes/normal/CleanUpTest.php		patch | blob | history
includes/normal/Makefile		patch | blob | history
includes/normal/UtfNormal.php		patch | blob | history