Fix UTF-8 validation bug where some cases didn't get replacement chars inserted correctly
[lhc/web/wiklou.git] / includes / normal / CleanUpTest.php
1 <?php
2 /** */
# When the script is started with '--icu' among its arguments, load the
# php_utfnormal.so extension before the tests run.
if( isset( $_SERVER['argv'] ) ) {
	if( in_array( '--icu', $_SERVER['argv'] ) ) {
		dl( 'php_utfnormal.so' );
	}
}
6
7 #ini_set( 'memory_limit', '40M' );
8
9 require_once( 'PHPUnit.php' );
10 require_once( 'UtfNormal.php' );
11
/**
 * Test case for UtfNormal::cleanUp().
 *
 * Feeds well-formed and malformed byte sequences through cleanUp() and checks
 * that valid text comes back in normalized form while each invalid sequence is
 * swapped for UTF8_REPLACEMENT. Most comparisons are done on bin2hex() output
 * so that failure messages show the raw bytes readably.
 */
class CleanUpTest extends PHPUnit_TestCase {
	/**
	 * PHP 4 style constructor.
	 *
	 * @param string $name passed straight through to PHPUnit_TestCase
	 */
	function CleanUpTest( $name ) {
		$this->PHPUnit_TestCase( $name );
	}

	/** No fixture is needed. */
	function setUp() {
	}

	/** Nothing to release. */
	function tearDown() {
	}

	/** Plain ASCII text must pass through unchanged. */
	function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/** A NUL byte must be replaced by the bytes EF BF BD. */
	function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertEquals(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/** An already precomposed Latin letter (C3 A9) must be left alone. */
	function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/** 'e' followed by the combining sequence CC 81 must come back as C3 A9. */
	function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Run every code point from 0 up to (but not including) UNICODE_MAX
	 * through cleanUp().
	 *
	 * This test is *very* expensive! The leading "X" in the method name is
	 * presumably what keeps the test runner from picking it up; rename it to
	 * testAllChars to run it.
	 *
	 * NOTE(review): the loop stops one short of UNICODE_MAX although the range
	 * check below accepts $i == UNICODE_MAX. Left as-is because the behaviour
	 * of codepointToUtf8() at that value is not visible from this file.
	 */
	function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		global $utfCanonicalComp, $utfCanonicalDecomp;
		for( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );
			# Progress marker every 0x1000 code points
			if( $i % 0x1000 == 0 ) echo "U+$x\n";
			if( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
				($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				($i > 0xffff && $i <= UNICODE_MAX ) ) {
				# Accepted code point: tab, LF, CR, or anything above the
				# other C0 controls that is neither a surrogate nor
				# U+FFFE/U+FFFF.
				if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
					# Listed in one of the normalization tables: the
					# expected output is whatever NFC() produces.
					$comp = UtfNormal::NFC( $char );
					$this->assertEquals(
						bin2hex( $comp ),
						bin2hex( $clean ),
						"U+$x should be decomposed" );
				} else {
					$this->assertEquals(
						bin2hex( $char ),
						bin2hex( $clean ),
						"U+$x should be intact" );
				}
			} else {
				$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
			}
		}
	}

	/** Check every single byte value, bare and with an 'x' before and/or after. */
	function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Run each of the 256 byte values, wrapped in $head and $tail, through
	 * cleanUp(). Stops at the first mismatch so one bug does not produce a
	 * flood of failures.
	 *
	 * @param string $head text placed before the byte under test
	 * @param string $tail text placed after the byte under test
	 */
	function doTestBytes( $head, $tail ) {
		for( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );
			if( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				($i > 0x001f && $i < 0x80) ) {
				# Tab, LF, CR and 0x20..0x7f are expected to survive
				$this->assertEquals(
					bin2hex( $char ),
					bin2hex( $clean ),
					"ASCII byte $x should be intact" );
				# Strict comparison: loose != may treat numeric-looking
				# strings as equal even when their bytes differ.
				if( $char !== $clean ) return;
			} else {
				# Other control bytes and every lone byte >= 0x80
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertEquals(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden byte $x should be rejected" );
				if( $norm !== $clean ) return;
			}
		}
	}

	/** Check two-byte combinations, bare and with an 'x' before and/or after. */
	function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Run every pair (first byte 0xc0..0xff, second byte 0x80..0xff), wrapped
	 * in $head and $tail, through cleanUp(). Stops at the first mismatch.
	 *
	 * @param string $head text placed before the pair under test
	 * @param string $tail text placed after the pair under test
	 */
	function doTestDoubleBytes( $head, $tail ) {
		for( $first = 0xc0; $first < 0x100; $first++ ) {
			for( $second = 0x80; $second < 0x100; $second++ ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );
				if( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0 ) {
					# Head byte c2..df followed by a byte in 80..bf:
					# expected output is the NFC form of the input.
					$norm = UtfNormal::NFC( $char );
					$this->assertEquals(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Pair $x should be intact" );
					if( $norm !== $clean ) return;
				} elseif( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars.
					# The same expectation applies when the second byte is c0 or above.
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$this->assertEquals(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
					if( $norm !== $clean ) return;
				} else {
					# Remaining pairs are expected to collapse into a
					# single replacement char.
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$this->assertEquals(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
					if( $norm !== $clean ) return;
				}
			}
		}
	}

	/** Check three-byte combinations, bare and with an 'x' before and/or after. */
	function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Run triplets (first byte 0xc0..0xff, second byte 0x80..0xff, third byte
	 * fixed at 0x80), wrapped in $head and $tail, through cleanUp().
	 *
	 * Unlike the one- and two-byte helpers, this one keeps going after a
	 * failed assertion.
	 *
	 * @param string $head text placed before the triplet under test
	 * @param string $tail text placed after the triplet under test
	 */
	function doTestTripleBytes( $head, $tail ) {
		for( $first = 0xc0; $first < 0x100; $first++ ) {
			for( $second = 0x80; $second < 0x100; $second++ ) {
				# Only 0x80 is tried as the third byte; raise the upper
				# bound to 0x100 for an exhaustive run.
				for( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
					if( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0 ) {
						# Head byte e0..ef followed by two bytes in 80..bf
						if( $first == 0xe0 && $second < 0xa0 ) {
							$this->assertEquals(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Overlong triplet $x should be rejected" );
						} elseif( $first == 0xed &&
							( chr( $first ) . chr( $second ) . chr( $third )) >= UTF8_SURROGATE_FIRST ) {
							# String comparison against the constant
							$this->assertEquals(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Surrogate triplet $x should be rejected" );
						} else {
							$this->assertEquals(
								bin2hex( UtfNormal::NFC( $char ) ),
								bin2hex( $clean ),
								"Triplet $x should be intact" );
						}
					} elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						# The first two bytes form a valid pair; the third is stray
						$this->assertEquals(
							bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Valid 2-byte $x + broken tail" );
					} elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						# The first byte is stray; the last two form a valid pair
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
							bin2hex( $clean ),
							"Broken head + valid 2-byte $x" );
					} elseif( $first > 0xfd && ( ( $second > 0xbf && $third > 0xbf ) || ($second < 0xc0 && $third < 0xc0 ) || ($second > 0xfd ) || ($third > 0xfd) ) ) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} elseif( $second < 0xc0 && $third < 0xc0 ) {
						# Both trailing bytes are in 80..bf: the whole triplet
						# is expected to become one replacement char. (This
						# condition used to test $second twice.)
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} else {
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					}
				}
			}
		}
	}

	/**
	 * Check for regression against a chunking bug: a fixed mix of ASCII, one
	 * valid two-byte sequence (DC 96) and several stray high bytes, each of
	 * which must become its own EF BF BD.
	 */
	function testChunkRegression() {
		$text   = "\x46\x55\xb8" .
		          "\xdc\x96" .
		          "\xee" .
		          "\xe7" .
		          "\x44" .
		          "\xaa" .
		          "\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
		          "\xdc\x96" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\x44" .
		          "\xef\xbf\xbd" .
		          "\x2f\x25";

		$this->assertEquals(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

}
245
246
# Build a suite from the CleanUpTest class, run it, and print the textual
# report. Plain assignment is used instead of "=& new": assigning the result
# of new by reference is deprecated and is a parse error from PHP 7.0 on.
$suite = new PHPUnit_TestSuite( 'CleanUpTest' );
$result = PHPUnit::run( $suite );
echo $result->toString();
250
251 ?>