Fix regression in ICU-mode UTF-8 verification: U+FFFF is forbidden
author Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 21:36:43 +0000 (21:36 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 21:36:43 +0000 (21:36 +0000)
includes/normal/CleanUpTest.php
includes/normal/UtfNormal.php

index badade6..e9156ab 100644 (file)
@@ -330,6 +330,14 @@ class CleanUpTest extends PHPUnit_TestCase {
                        bin2hex( $expect ),
                        bin2hex( UtfNormal::cleanUp( $text ) ) );
        }
+
+       function testForbiddenRegression() {
+               $text   = "\xef\xbf\xbf"; # U+FFFF, illegal char
+               $expect = "\xef\xbf\xbd";
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 }
 
 
index a4c095c..62461d6 100644 (file)
@@ -132,11 +132,12 @@ class UtfNormal {
                                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
                                UTF8_REPLACEMENT,
                                $string );
-                       $str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+                       $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+                       $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
                        
                        # UnicodeString constructor fails if the string ends with a
                        # head byte. Add a junk char at the end, we'll strip it off.
-                       return rtrim( utf8_normalize( $str . "\x01", UNORM_NFC ), "\x01" );
+                       return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
                } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
                        # Side effect -- $string has had UTF-8 errors cleaned up.
                        return $string;