Fix UTF-8 validation bug where some cases didn't get replacement chars inserted correctly
author: Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 02:24:44 +0000 (02:24 +0000)
committer: Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 02:24:44 +0000 (02:24 +0000)
includes/normal/CleanUpTest.php
includes/normal/UtfNormal.php

index 64f7b63..2dd8d56 100644 (file)
@@ -1,4 +1,8 @@
 <?php
+/** */
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+       dl( 'php_utfnormal.so' );
+}
 
 #ini_set( 'memory_limit', '40M' );
 
@@ -214,6 +218,28 @@ class CleanUpTest extends PHPUnit_TestCase {
                        }
                }
        }
+       
+       function testChunkRegression() {
+               # Regression check for a chunking bug: every ill-formed byte must come back as U+FFFD (EF BF BD)
+               $text   = "\x46\x55\xb8" .  # "FU" followed by a stray continuation byte
+                         "\xdc\x96" . # well-formed two-byte sequence
+                         "\xee" .  # lead byte not followed by a continuation byte
+                         "\xe7" .  # lead byte followed by ASCII
+                         "\x44" .  # "D"
+                         "\xaa" .  # stray continuation byte
+                         "\x2f\x25";  # "/%"
+               $expect = "\x46\x55\xef\xbf\xbd" .  # "FU" + U+FFFD
+                         "\xdc\x96" . # well-formed sequence passes through unchanged
+                         "\xef\xbf\xbd" .  # U+FFFD for the lone 0xEE
+                         "\xef\xbf\xbd" .  # U+FFFD for the lone 0xE7
+                         "\x44" .  # "D" unchanged
+                         "\xef\xbf\xbd" .  # U+FFFD for the stray 0xAA
+                         "\x2f\x25";  # "/%" unchanged
+
+               $this->assertEquals(  # both sides are hex-encoded before comparison
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 
 }
 
index 2883342..7dd9072 100644 (file)
@@ -350,7 +350,6 @@ class UtfNormal {
                                                                $replace[] = array( UTF8_REPLACEMENT,
                                                                                                        $base + $i + 1 - strlen( $sequence ),
                                                                                                        strlen( $sequence ) );
-                                                               $base += $chunk;
                                                                break 2;
                                                        } else {
                                                                # Illegal tail byte; abandon the sequence.