Fix UTF-8 validation regression: well-formed but forbidden UTF-8 sequence followed...
[lhc/web/wiklou.git] / includes / normal / CleanUpTest.php
1 <?php
2 /** */
# When the script is started with --icu on its command line, load the
# php_utfnormal.so extension before the normalization code is pulled in.
if( isset( $_SERVER['argv'] ) ) {
	if( in_array( '--icu', $_SERVER['argv'] ) ) {
		dl( 'php_utfnormal.so' );
	}
}
6
7 #ini_set( 'memory_limit', '40M' );
8
9 require_once( 'PHPUnit.php' );
10 require_once( 'UtfNormal.php' );
11
/**
 * Unit tests for UtfNormal::cleanUp().
 *
 * Each test hands a byte string to cleanUp() and checks the result: valid
 * input must come back in normalized form, while ill-formed or forbidden
 * sequences must be replaced. The literal expectations below use
 * "\xef\xbf\xbd", the UTF-8 encoding of U+FFFD; the generated expectations
 * use the UTF8_REPLACEMENT constant, which is defined outside this file.
 *
 * Most comparisons go through bin2hex() so that a failure report shows
 * readable hex instead of raw (possibly invalid) bytes.
 */
class CleanUpTest extends PHPUnit_TestCase {
	/**
	 * PHP4-style constructor; passes the test name through to the
	 * PHPUnit_TestCase base class.
	 *
	 * @param string $name test name handed to PHPUnit_TestCase
	 */
	function CleanUpTest( $name ) {
		$this->PHPUnit_TestCase( $name );
	}

	/** No fixture is needed for these tests. */
	function setUp() {
	}

	/** Nothing to release after a test. */
	function tearDown() {
	}

	/** Plain printable ASCII must pass through unchanged. */
	function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/** A NUL byte must be replaced by U+FFFD. */
	function testNull() {
		$text   = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertEquals(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/** A precomposed e-acute (C3 A9) must pass through unchanged. */
	function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/** "e" + combining acute (CC 81) must be composed into C3 A9. */
	function testLatinNormal() {
		$text   = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Run cleanUp() over every single code point from 0 to UNICODE_MAX.
	 *
	 * This test is *very* expensive! The method name carries an "X" prefix,
	 * presumably so that the test runner does not pick it up by default.
	 *
	 * Allowed code points must come back either untouched or, when they
	 * appear in the canonical composition/decomposition tables, equal to
	 * their NFC form. Everything else must become the replacement character.
	 */
	function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		global $utfCanonicalComp, $utfCanonicalDecomp;
		# Inclusive upper bound: the validity check below explicitly accepts
		# UNICODE_MAX itself, so the loop has to reach it as well.
		for( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );
			# Progress marker every 0x1000 code points
			if( $i % 0x1000 == 0 ) echo "U+$x\n";
			if( $i == 0x0009 ||
			    $i == 0x000a ||
			    $i == 0x000d ||
			    ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
			    ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
			    ($i > 0xffff && $i <= UNICODE_MAX ) ) {
				# Tab, LF, CR and everything from U+0020 up, minus the
				# surrogate range and U+FFFE/U+FFFF, is allowed.
				if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
					# The character takes part in canonical (de)composition,
					# so the cleaned result must match its NFC form.
					$comp = UtfNormal::NFC( $char );
					$this->assertEquals(
						bin2hex( $comp ),
						bin2hex( $clean ),
						"U+$x should be normalized" );
				} else {
					$this->assertEquals(
						bin2hex( $char ),
						bin2hex( $clean ),
						"U+$x should be intact" );
				}
			} else {
				$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
			}
		}
	}

	/**
	 * Check every single byte value, alone and with an ASCII 'x' placed
	 * before and/or after it.
	 */
	function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Feed each byte 0x00-0xFF, wrapped in $head and $tail, to cleanUp().
	 *
	 * Tab, LF, CR and 0x20-0x7F must survive; every other byte must be
	 * replaced. The loop stops after the first mismatch so that one
	 * regression does not produce hundreds of failure messages.
	 *
	 * @param string $head text placed before the byte under test
	 * @param string $tail text placed after the byte under test
	 */
	function doTestBytes( $head, $tail ) {
		for( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );
			if( $i == 0x0009 ||
			    $i == 0x000a ||
			    $i == 0x000d ||
			    ($i > 0x001f && $i < 0x80) ) {
				$this->assertEquals(
					bin2hex( $char ),
					bin2hex( $clean ),
					"ASCII byte $x should be intact" );
				# Strict comparison: loose != would compare numeric-looking
				# strings such as "1" by value instead of byte for byte.
				if( $char !== $clean ) return;
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertEquals(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden byte $x should be rejected" );
				if( $norm !== $clean ) return;
			}
		}
	}

	/**
	 * Check every pair of (byte 0xC0-0xFF, byte 0x80-0xFF), alone and with
	 * an ASCII 'x' placed before and/or after it.
	 */
	function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Feed each two-byte combination, wrapped in $head and $tail, to
	 * cleanUp() and compare against the expected outcome.
	 *
	 * The loop stops after the first mismatch.
	 *
	 * @param string $head text placed before the pair under test
	 * @param string $tail text placed after the pair under test
	 */
	function doTestDoubleBytes( $head, $tail ) {
		for( $first = 0xc0; $first < 0x100; $first++ ) {
			for( $second = 0x80; $second < 0x100; $second++ ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );
				if( $first > 0xc1 &&
				    $first < 0xe0 &&
				    $second < 0xc0 ) {
					# Lead byte C2-DF followed by a continuation byte:
					# a well-formed two-byte sequence.
					$norm = UtfNormal::NFC( $char );
					$this->assertEquals(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Pair $x should be intact" );
					if( $norm !== $clean ) return;
				} elseif( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes, and a second byte
					# above bf is not a continuation byte either -- in both
					# cases each byte is expected to be replaced on its own,
					# giving two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$this->assertEquals(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
					if( $norm !== $clean ) return;
				} else {
					# Remaining cases (c0/c1 lead, or a lead byte e0-fd with
					# only one continuation byte): the pair is expected to
					# collapse into a single replacement char
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$this->assertEquals(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
					if( $norm !== $clean ) return;
				}
			}
		}
	}

	/**
	 * Check three-byte combinations, alone and with an ASCII 'x' placed
	 * before and/or after them.
	 */
	function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Feed three-byte combinations, wrapped in $head and $tail, to cleanUp()
	 * and compare against the expected outcome.
	 *
	 * The first byte runs over 0xC0-0xFF and the second over 0x80-0xFF. The
	 * third byte is currently pinned to 0x80; the full 0x80-0xFF loop is
	 * left commented out. Unlike the one- and two-byte helpers, this one
	 * does not stop at the first mismatch.
	 *
	 * @param string $head text placed before the triplet under test
	 * @param string $tail text placed after the triplet under test
	 */
	function doTestTripleBytes( $head, $tail ) {
		for( $first = 0xc0; $first < 0x100; $first++ ) {
			for( $second = 0x80; $second < 0x100; $second++ ) {
				#for( $third = 0x80; $third < 0x100; $third++ ) {
				for( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
					if( $first >= 0xe0 &&
					    $first < 0xf0 &&
					    $second < 0xc0 &&
					    $third < 0xc0 ) {
						# Lead byte E0-EF with two continuation bytes: has
						# the shape of a three-byte sequence
						if( $first == 0xe0 && $second < 0xa0 ) {
							# E0 80..9F encodes a value that fits in fewer bytes
							$this->assertEquals(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Overlong triplet $x should be rejected" );
						} elseif( $first == 0xed &&
							( chr( $first ) . chr( $second ) . chr( $third ))  >= UTF8_SURROGATE_FIRST ) {
							# Byte-wise string comparison against the first
							# encoded surrogate; with an ED lead byte, anything
							# at or above it is treated as a surrogate
							$this->assertEquals(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Surrogate triplet $x should be rejected" );
						} else {
							$this->assertEquals(
								bin2hex( UtfNormal::NFC( $char ) ),
								bin2hex( $clean ),
								"Triplet $x should be intact" );
						}
					} elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						# Bytes one and two form a valid two-byte sequence;
						# the third byte is a stray continuation byte
						$this->assertEquals(
							bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Valid 2-byte $x + broken tail" );
					} elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						# The first byte is a lead byte with nothing usable
						# after it; bytes two and three form a valid pair
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
							bin2hex( $clean ),
							"Broken head + valid 2-byte $x" );
					} elseif( ( $first > 0xfd || $second > 0xfd ) &&
					            ( ( $second > 0xbf && $third > 0xbf ) ||
					              ( $second < 0xc0 && $third < 0xc0 ) ||
					              ( $second > 0xfd ) ||
					              ( $third > 0xfd ) ) ) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						# Only lead bytes f0-fd reach this branch (lower lead
						# bytes and fe/ff were handled above): a longer
						# sequence cut short after two continuation bytes
						# collapses into one replacement char
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} else {
						# Everything else is expected to yield two
						# replacement chars
						$this->assertEquals(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					}
				}
			}
		}
	}

	/**
	 * Fixed input mixing stray continuation bytes, a valid two-byte
	 * sequence and two incomplete lead bytes.
	 */
	function testChunkRegression() {
		# Check for regression against a chunking bug
		$text   = "\x46\x55\xb8" .
		          "\xdc\x96" .
		          "\xee" .
		          "\xe7" .
		          "\x44" .
		          "\xaa" .
		          "\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
		          "\xdc\x96" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\x44" .
		          "\xef\xbf\xbd" .
		          "\x2f\x25";

		$this->assertEquals(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Fixed input with bad bytes interposed between ASCII characters; each
	 * bad byte must be replaced individually and the trailing valid
	 * two-byte sequence must survive.
	 */
	function testInterposeRegression() {
		$text   = "\x4e\x30" .
		          "\xb1" .		# bad tail
		          "\x3a" .
		          "\x92" .		# bad tail
		          "\x62\x3a" .
		          "\x84" .		# bad tail
		          "\x43" .
		          "\xc6" .		# bad head
		          "\x3f" .
		          "\x92" .		# bad tail
		          "\xad" .		# bad tail
		          "\x7d" .
		          "\xd9\x95";

		$expect = "\x4e\x30" .
		          "\xef\xbf\xbd" .
		          "\x3a" .
		          "\xef\xbf\xbd" .
		          "\x62\x3a" .
		          "\xef\xbf\xbd" .
		          "\x43" .
		          "\xef\xbf\xbd" .
		          "\x3f" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\x7d" .
		          "\xd9\x95";

		$this->assertEquals(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Fixed input combining forbidden ASCII control bytes, a bad lead byte,
	 * an overlong two-byte sequence and stray continuation bytes. The
	 * overlong pair is expected to yield a single replacement character.
	 */
	function testOverlongRegression() {
		$text   = "\x67" .
		          "\x1a" .		# forbidden ascii
		          "\xea" .		# bad head
		          "\xc1\xa6" .	# overlong sequence
		          "\xad" .		# bad tail
		          "\x1c" .		# forbidden ascii
		          "\xb0" .		# bad tail
		          "\x3c" .
		          "\x9e";		# bad tail
		$expect = "\x67" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\xef\xbf\xbd" .
		          "\x3c" .
		          "\xef\xbf\xbd";
		$this->assertEquals(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

}
307
308
# Build a suite for CleanUpTest, run it and print the textual report.
# Plain assignment is used on purpose: "=& new" is deprecated since PHP 5.3
# and a parse error from PHP 7 on, and nothing here needs a reference.
$suite = new PHPUnit_TestSuite( 'CleanUpTest' );
$result = PHPUnit::run( $suite );
echo $result->toString();
312
313 ?>