Moved globals $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp, $utfCheckNF...
[lhc/web/wiklou.git] / includes / normal / CleanUpTest.php
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19
20
# This script drives the test suite itself, so refuse to run under a web SAPI.
if( PHP_SAPI !== 'cli' ) {
	exit( "Run me from the command line please.\n" );
}

# Passing --icu on the command line loads the php_utfnormal extension
# before UtfNormal.php is included.
if( isset( $_SERVER['argv'] ) ) {
	if( in_array( '--icu', $_SERVER['argv'] ) ) {
		dl( 'php_utfnormal.so' );
	}
}
29
30 #ini_set( 'memory_limit', '40M' );
31
32 require_once 'PHPUnit/Framework.php';
33 require_once 'PHPUnit/TextUI/TestRunner.php';
34
35 require_once 'UtfNormal.php';
36
/**
 * Additional tests for the UtfNormal::cleanUp() function, including
 * regression checks for known problems.
 * Requires PHPUnit.
 *
 * Every expectation is compared as a hex dump (see assertBytesEqual()) so
 * that a failure report shows the exact bytes instead of raw, possibly
 * invalid, UTF-8.
 *
 * @ingroup UtfNormal
 * @private
 */
class CleanUpTest extends PHPUnit_Framework_TestCase {
	/** No fixture is required; kept as an explicit no-op. */
	public function setUp() {
	}

	/** No fixture is required; kept as an explicit no-op. */
	public function tearDown() {
	}

	/**
	 * Assert that two byte strings are identical, comparing their hex dumps.
	 *
	 * @param string $expected Expected bytes
	 * @param string $actual Bytes actually produced
	 * @param string $message Optional failure message
	 */
	private function assertBytesEqual( $expected, $actual, $message = '' ) {
		$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ), $message );
	}

	/** Plain ASCII text must pass through cleanUp() unchanged. */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/** A NUL byte must be replaced by U+FFFD (EF BF BD). */
	public function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Already-composed Latin text (U+00E9) must pass through unchanged. */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/** "e" followed by U+0301 (combining acute) must be composed to U+00E9. */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Run cleanUp() on every code point from U+0000 to UNICODE_MAX inclusive.
	 *
	 * This test is *very* expensive! The "X" prefix keeps PHPUnit from
	 * picking it up automatically; rename to testAllChars to run it.
	 *
	 * Tab, LF, CR and every code point above U+001F that is neither a
	 * surrogate nor U+FFFE/U+FFFF is expected to survive (in NFC form when
	 * it has an entry in the canonical composition/decomposition tables);
	 * everything else must become the replacement character.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		# The upper bound is inclusive so that UNICODE_MAX itself is checked.
		for( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );
			if( $i % 0x1000 === 0 ) {
				# Progress indicator
				echo "U+$x\n";
			}

			$allowed = $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX );

			if( !$allowed ) {
				$this->assertBytesEqual( $rep, $clean, $x );
				continue;
			}

			if( isset( UtfNormal::$utfCanonicalComp[$char] ) || isset( UtfNormal::$utfCanonicalDecomp[$char] ) ) {
				$comp = UtfNormal::NFC( $char );
				$this->assertBytesEqual( $comp, $clean, "U+$x should be decomposed" );
			} else {
				$this->assertBytesEqual( $char, $clean, "U+$x should be intact" );
			}
		}
	}

	/** Check every single byte value, alone and with ASCII neighbours. */
	public function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Feed each byte 0x00-0xFF through cleanUp(), wrapped in $head/$tail.
	 *
	 * Tab, LF, CR and 0x20-0x7F must be preserved; any other lone byte must
	 * be replaced by UTF8_REPLACEMENT. The first failing assertion aborts
	 * the test, since PHPUnit assertions throw on failure.
	 *
	 * @param string $head Bytes placed before the byte under test
	 * @param string $tail Bytes placed after the byte under test
	 */
	public function doTestBytes( $head, $tail ) {
		for( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );
			if( $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < 0x80 ) ) {
				$this->assertBytesEqual( $char, $clean, "ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertBytesEqual( $norm, $clean, "Forbidden byte $x should be rejected" );
			}
		}
	}

	/** Check every lead-byte/second-byte pair, alone and with ASCII neighbours. */
	public function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Feed every pair (first 0xC0-0xFF, second 0x80-0xFF) through cleanUp().
	 *
	 * - A C2-DF lead followed by an 80-BF tail is a valid two-byte sequence
	 *   and must come out as its NFC form.
	 * - If the first byte is FE/FF or the second byte is not a tail byte
	 *   (C0 or above), two replacement characters are expected.
	 * - Otherwise a single replacement character is expected.
	 *
	 * @param string $head Bytes placed before the pair under test
	 * @param string $tail Bytes placed after the pair under test
	 */
	public function doTestDoubleBytes( $head, $tail ) {
		for( $first = 0xc0; $first < 0x100; $first++ ) {
			for( $second = 0x80; $second < 0x100; $second++ ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );
				if( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0 ) {
					$norm = UtfNormal::NFC( $char );
					$message = "Pair $x should be intact";
				} elseif( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden pair $x should be rejected";
				} else {
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden pair $x should be rejected";
				}
				$this->assertBytesEqual( $norm, $clean, $message );
			}
		}
	}

	/** Check three-byte combinations, alone and with ASCII neighbours. */
	public function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Feed three-byte combinations through cleanUp(), wrapped in $head/$tail.
	 *
	 * The first byte ranges over 0xC0-0xFF and the second over 0x80-0xFF.
	 * The third byte is currently restricted to 0x80 only; the exhaustive
	 * loop is left commented out (presumably because of its run time).
	 *
	 * @param string $head Bytes placed before the triplet under test
	 * @param string $tail Bytes placed after the triplet under test
	 */
	public function doTestTripleBytes( $head, $tail ) {
		$rep = UTF8_REPLACEMENT;
		for( $first = 0xc0; $first < 0x100; $first++ ) {
			for( $second = 0x80; $second < 0x100; $second++ ) {
				#for( $third = 0x80; $third < 0x100; $third++ ) {
				for( $third = 0x80; $third < 0x81; $third++ ) {
					$triplet = chr( $first ) . chr( $second ) . chr( $third );
					$char = $head . $triplet . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

					if( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0 ) {
						# Structurally a three-byte sequence: E0-EF lead plus two tails
						if( $first === 0xe0 && $second < 0xa0 ) {
							$expect = $head . $rep . $tail;
							$message = "Overlong triplet $x should be rejected";
						} elseif( $first === 0xed &&
							strcmp( $triplet, UTF8_SURROGATE_FIRST ) >= 0 ) {
							# Byte-wise comparison: ED-led sequences at or above the first surrogate
							$expect = $head . $rep . $tail;
							$message = "Surrogate triplet $x should be rejected";
						} else {
							$expect = UtfNormal::NFC( $char );
							$message = "Triplet $x should be intact";
						}
					} elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						$expect = UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . $rep . $tail;
						$message = "Valid 2-byte $x + broken tail";
					} elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						$expect = $head . $rep . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
						$message = "Broken head + valid 2-byte $x";
					} elseif( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
						( $second < 0xc0 && $third < 0xc0 ) ||
						( $second > 0xfd ) ||
						( $third > 0xfd ) ) ) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$expect = $head . $rep . $rep . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					} elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$expect = $head . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					} else {
						$expect = $head . $rep . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					}

					$this->assertBytesEqual( $expect, $clean, $message );
				}
			}
		}
	}

	/** Regression check: a valid sequence surrounded by stray tail/head bytes. */
	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";

		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Regression check: bad bytes interleaved with ASCII; each one is replaced individually. */
	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";

		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";

		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Regression check: an overlong two-byte sequence mixed with other invalid bytes. */
	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";
		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Regression check: an encoded surrogate followed by stray tail bytes. */
	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad tail (0xAC is a continuation byte, not a head)
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";
		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Regression check: U+FFFE followed by a stray tail and a truncated head. */
	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";
		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Regression check: U+FFFF must be replaced by U+FFFD. */
	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";
		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Regression check: a Hangul character followed by a final jamo must not be recomposed. */
	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.
		$this->assertBytesEqual( $expect, UtfNormal::cleanUp( $text ) );
	}
}
403
404
# Run every test in CleanUpTest with the text UI runner and report the
# outcome through the process exit status: 0 on success, -1 otherwise.
$suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
$result = PHPUnit_TextUI_TestRunner::run( $suite );

exit( $result->wasSuccessful() ? 0 : -1 );