# Test bootstrap. When the script is started with --icu among its
# command-line arguments, the php_utfnormal.so extension is loaded via dl()
# before the libraries below are included.
# NOTE(review): the numbering embedded in this listing skips from 4 to 7, so
# the line that closes this if block is not visible here.
3 if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
4 dl( 'php_utfnormal.so' );
# Memory-limit override, currently disabled.
7 #ini_set( 'memory_limit', '40M' );
# Pull in the PHPUnit framework and the UtfNormal code that the tests call.
9 require_once( 'PHPUnit.php' );
10 require_once( 'UtfNormal.php' );
# Test case that feeds valid and malformed byte sequences to
# UtfNormal::cleanUp() and compares the result with the expected output.
12 class CleanUpTest
extends PHPUnit_TestCase
{
# Constructor: passes the test name on to the PHPUnit_TestCase() method of the
# base class (presumably its constructor -- confirm against PHPUnit.php).
# NOTE(review): this is a PHP 4-style constructor (a method named after the
# class); confirm that the targeted PHP version still treats it as one.
13 function CleanUpTest( $name ) {
14 $this->PHPUnit_TestCase( $name );
# Plain ASCII text must come back from UtfNormal::cleanUp() unchanged.
23 function testAscii() {
24 $text = 'This is plain ASCII text.';
25 $this->assertEquals( $text, UtfNormal
::cleanUp( $text ) );
# NOTE(review): the embedded numbering jumps from 25 to 29, so the end of
# testAscii and the header of the following test are not visible here.
# The visible remainder builds an input containing a NUL byte and an expected
# string in which that byte is replaced by EF BF BD (the UTF-8 encoding of
# U+FFFD), then hex-encodes the cleanUp() result, presumably as an argument
# of an assertEquals() call whose opening line is missing.
29 $text = "a \x00 null";
30 $expect = "a \xef\xbf\xbd null";
33 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# A string holding a precomposed e-acute (bytes C3 A9, i.e. U+00E9) must come
# back from UtfNormal::cleanUp() unchanged.
36 function testLatin() {
37 $text = "L'\xc3\xa9cole";
38 $this->assertEquals( $text, UtfNormal
::cleanUp( $text ) );
# A decomposed sequence (the letter e followed by bytes CC 81, the combining
# acute accent U+0301) must be turned by UtfNormal::cleanUp() into the
# precomposed form (bytes C3 A9).
41 function testLatinNormal() {
42 $text = "L'e\xcc\x81cole";
43 $expect = "L'\xc3\xa9cole";
44 $this->assertEquals( $expect, UtfNormal
::cleanUp( $text ) );
# Sweep over every code point from 0 up to, but not including, UNICODE_MAX:
# each one is encoded with codepointToUtf8() and passed through
# UtfNormal::cleanUp().
# NOTE(review): the X prefix on the method name presumably keeps the test
# runner from picking it up automatically -- confirm against PHPUnit.
# NOTE(review): several lines of this method (the start of the range
# condition and the assertion calls for the two messages below) are not
# visible in this listing.
47 # This test is *very* expensive!
48 function XtestAllChars() {
49 $rep = UTF8_REPLACEMENT
;
# Composition and decomposition tables, read as globals.
50 global $utfCanonicalComp, $utfCanonicalDecomp;
51 for( $i = 0x0; $i < UNICODE_MAX
; $i++
) {
52 $char = codepointToUtf8( $i );
53 $clean = UtfNormal
::cleanUp( $char );
54 $x = sprintf( "%04X", $i );
# Progress marker, printed once every 0x1000 code points.
55 if( $i %
0x1000 == 0 ) echo "U+$x\n";
# Visible part of the range condition: code points above 0x1f and below the
# surrogate range, between the surrogate range and 0xfffe, or above 0xffff.
59 ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST
) ||
60 ($i > UNICODE_SURROGATE_LAST
&& $i < 0xfffe ) ||
61 ($i > 0xffff && $i <= UNICODE_MAX
) ) {
# Characters listed in either table are additionally run through NFC().
62 if( isset( $utfCanonicalComp[$char] ) ||
isset( $utfCanonicalDecomp[$char] ) ) {
63 $comp = UtfNormal
::NFC( $char );
67 "U+$x should be decomposed" );
72 "U+$x should be intact" );
# Final visible assertion: the cleaned output must equal the replacement
# character (presumably the branch for code points outside the ranges above).
75 $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
# Runs doTestBytes() four times, covering every combination of an empty or
# single-letter (x) prefix and suffix around the byte under test.
80 function testAllBytes() {
81 $this->doTestBytes( '', '' );
82 $this->doTestBytes( 'x', '' );
83 $this->doTestBytes( '', 'x' );
84 $this->doTestBytes( 'x', 'x' );
# Checks how UtfNormal::cleanUp() treats every single byte value 0x00-0xff
# when it is wrapped between $head and $tail.
#   $head  string placed before the byte under test
#   $tail  string placed after the byte under test
# The method returns as soon as one byte gives an unexpected result.
# NOTE(review): the start of the condition and the assertion calls are not
# visible in this listing (the embedded numbering has gaps).
87 function doTestBytes( $head, $tail ) {
88 for( $i = 0x0; $i < 256; $i++
) {
89 $char = $head . chr( $i ) . $tail;
90 $clean = UtfNormal
::cleanUp( $char );
91 $x = sprintf( "%02X", $i );
# Visible tail of the condition: bytes above 0x1f and below 0x80 are expected
# to pass through unchanged.
95 ($i > 0x001f && $i < 0x80) ) {
99 "ASCII byte $x should be intact" );
100 if( $char != $clean ) return;
# Any other byte is expected to be replaced by UTF8_REPLACEMENT.
102 $norm = $head . UTF8_REPLACEMENT
. $tail;
106 "Forbidden byte $x should be rejected" );
107 if( $norm != $clean ) return;
# Runs doTestDoubleBytes() four times, covering every combination of an empty
# or single-letter (x) prefix and suffix around the byte pair under test.
112 function testDoubleBytes() {
113 $this->doTestDoubleBytes( '', '' );
114 $this->doTestDoubleBytes( 'x', '' );
115 $this->doTestDoubleBytes( '', 'x' );
116 $this->doTestDoubleBytes( 'x', 'x' );
# Checks UtfNormal::cleanUp() on every two-byte combination with a first byte
# in 0xc0-0xff and a second byte in 0x80-0xff, wrapped between $head and $tail.
#   $head  string placed before the byte pair
#   $tail  string placed after the byte pair
# The method returns as soon as one pair gives an unexpected result.
# NOTE(review): the condition of the first branch and the assertion calls are
# not visible in this listing (the embedded numbering has gaps).
119 function doTestDoubleBytes( $head, $tail ) {
120 for( $first = 0xc0; $first < 0x100; $first++
) {
121 for( $second = 0x80; $second < 0x100; $second++
) {
122 $char = $head . chr( $first ) . chr( $second ) . $tail;
123 $clean = UtfNormal
::cleanUp( $char );
124 $x = sprintf( "%02X,%02X", $first, $second );
# First branch: the expected value is the NFC form of the whole input.
128 $norm = UtfNormal
::NFC( $char );
132 "Pair $x should be intact" );
133 if( $norm != $clean ) return;
134 } elseif( $first > 0xfd ||
$second > 0xbf ) {
135 # fe and ff are not legal head bytes -- expect two replacement chars
136 $norm = $head . UTF8_REPLACEMENT
. UTF8_REPLACEMENT
. $tail;
140 "Forbidden pair $x should be rejected" );
141 if( $norm != $clean ) return;
# Remaining pairs are expected to collapse into one replacement character.
143 $norm = $head . UTF8_REPLACEMENT
. $tail;
147 "Forbidden pair $x should be rejected" );
148 if( $norm != $clean ) return;
# Runs doTestTripleBytes() four times, covering every combination of an empty
# or single-letter (x) prefix and suffix around the byte triplet under test.
154 function testTripleBytes() {
155 $this->doTestTripleBytes( '', '' );
156 $this->doTestTripleBytes( 'x', '' );
157 $this->doTestTripleBytes( '', 'x' );
158 $this->doTestTripleBytes( 'x', 'x' );
# Checks UtfNormal::cleanUp() on three-byte combinations wrapped between
# $head and $tail: first byte 0xc0-0xff, second byte 0x80-0xff, third byte
# fixed at 0x80.
#   $head  string placed before the byte triplet
#   $tail  string placed after the byte triplet
# Each branch below supplies the hex-encoded expected output for one class of
# triplet together with the failure message.
# NOTE(review): the assertion openers, the rest of the first condition and
# some else lines are not visible in this listing (the embedded numbering has
# gaps), so the exact nesting of the branches cannot be confirmed from here.
161 function doTestTripleBytes( $head, $tail ) {
162 for( $first = 0xc0; $first < 0x100; $first++
) {
163 for( $second = 0x80; $second < 0x100; $second++
) {
# The full sweep of the third byte is commented out; only 0x80 is exercised
# (presumably to keep the run time down).
164 #for( $third = 0x80; $third < 0x100; $third++ ) {
165 for( $third = 0x80; $third < 0x81; $third++
) {
166 $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
167 $clean = UtfNormal
::cleanUp( $char );
168 $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
169 if( $first >= 0xe0 &&
# First byte e0 with a second byte below a0: expected to yield a single
# replacement character.
173 if( $first == 0xe0 && $second < 0xa0 ) {
175 bin2hex( $head . UTF8_REPLACEMENT
. $tail ),
177 "Overlong triplet $x should be rejected" );
# First byte ed and a triplet that compares at or above UTF8_SURROGATE_FIRST:
# expected to yield a single replacement character.
178 } elseif( $first == 0xed &&
179 ( chr( $first ) . chr( $second ) . chr( $third )) >= UTF8_SURROGATE_FIRST
) {
181 bin2hex( $head . UTF8_REPLACEMENT
. $tail ),
183 "Surrogate triplet $x should be rejected" );
# Otherwise the expected value is the NFC form of the whole input.
186 bin2hex( UtfNormal
::NFC( $char ) ),
188 "Triplet $x should be intact" );
# Two-byte lead (c2-df) followed by a byte below c0, then a stray third byte.
190 } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
192 bin2hex( UtfNormal
::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT
. $tail ),
194 "Valid 2-byte $x + broken tail" );
# Stray first byte, then a two-byte lead (c2-df) followed by a byte below c0.
195 } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
197 bin2hex( $head . UTF8_REPLACEMENT
. UtfNormal
::NFC( chr( $second ) . chr( $third ) . $tail ) ),
199 "Broken head + valid 2-byte $x" );
200 } elseif( ( $first > 0xfd ||
$second > 0xfd ) &&
201 ( ( $second > 0xbf && $third > 0xbf ) ||
202 ( $second < 0xc0 && $third < 0xc0 ) ||
203 ( $second > 0xfd ) ||
204 ( $third > 0xfd ) ) ) {
205 # fe and ff are not legal head bytes -- expect three replacement chars
207 bin2hex( $head . UTF8_REPLACEMENT
. UTF8_REPLACEMENT
. UTF8_REPLACEMENT
. $tail ),
209 "Forbidden triplet $x should be rejected" );
# First byte above c2 with second and third bytes both below c0: expected to
# collapse into a single replacement character.
210 } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
212 bin2hex( $head . UTF8_REPLACEMENT
. $tail ),
214 "Forbidden triplet $x should be rejected" );
# Everything else: two replacement characters are expected.
217 bin2hex( $head . UTF8_REPLACEMENT
. UTF8_REPLACEMENT
. $tail ),
219 "Forbidden triplet $x should be rejected" );
# Regression test: compares the hex-encoded output of UtfNormal::cleanUp()
# for a fixed byte string against a fixed expected string.
# NOTE(review): only the first segment of the input and of the expected value
# is visible in this listing; there the stray byte b8 corresponds to EF BF BD
# (the UTF-8 encoding of U+FFFD). The remaining segments and the assertion
# opener are missing.
226 function testChunkRegression() {
227 # Check for regression against a chunking bug
228 $text = "\x46\x55\xb8" .
235 $expect = "\x46\x55\xef\xbf\xbd" .
245 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# Regression test: hex-encodes the output of UtfNormal::cleanUp() for a fixed
# input, presumably for comparison with $expect.
# NOTE(review): the definition of $text, most of $expect and the assertion
# opener are not visible in this listing, so the scenario under test cannot be
# described from here.
248 function testInterposeRegression() {
263 $expect = "\x4e\x30" .
279 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# Regression test: passes a byte string that mixes forbidden control bytes
# with an overlong sequence through UtfNormal::cleanUp() and hex-encodes the
# result, presumably for comparison with an expected value.
# NOTE(review): the opening of the input string, the expected value and the
# assertion opener are not visible in this listing.
282 function testOverlongRegression() {
284 "\x1a" . # forbidden ascii
286 "\xc1\xa6" . # overlong sequence
288 "\x1c" . # forbidden ascii
303 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# Regression test: input starting with an encoded surrogate (ED B4 96) is
# passed through UtfNormal::cleanUp(); the visible start of the expected value
# is EF BF BD, the UTF-8 encoding of U+FFFD.
# NOTE(review): the remaining segments of both strings and the assertion
# opener are not visible in this listing.
306 function testSurrogateRegression() {
307 $text = "\xed\xb4\x96" . # surrogate 0xDD16
311 $expect = "\xef\xbf\xbd" .
317 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# Regression test: input starting with U+FFFE (EF BF BE) is passed through
# UtfNormal::cleanUp(); the visible start of the expected value is EF BF BD,
# the UTF-8 encoding of U+FFFD.
# NOTE(review): the remaining segments of both strings and the assertion
# opener are not visible in this listing.
320 function testBomRegression() {
321 $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
325 $expect = "\xef\xbf\xbd" .
331 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# Regression test: the lone character U+FFFF (EF BF BF) is passed through
# UtfNormal::cleanUp(); the expected value is EF BF BD, the UTF-8 encoding of
# U+FFFD.
# NOTE(review): the assertion opener is not visible in this listing.
334 function testForbiddenRegression() {
335 $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
336 $expect = "\xef\xbf\xbd";
339 bin2hex( UtfNormal
::cleanUp( $text ) ) );
# Build a PHPUnit suite for the CleanUpTest class, run it, and print the
# textual report.
# The new object is assigned with a plain "=": assigning the result of "new"
# by reference is deprecated since PHP 5.3 and is a parse error from PHP 7.0.
$suite = new PHPUnit_TestSuite( 'CleanUpTest' );
$result = PHPUnit::run( $suite );
echo $result->toString();