Remove dismiss from deletion log. This simply doesn't make sense to me, nor does...
[lhc/web/wiklou.git] / includes / normal / CleanUpTest.php
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19
20
# This is a stand-alone script: refuse to run under anything but the CLI SAPI.
if( PHP_SAPI !== 'cli' ) {
	exit( "Run me from the command line please.\n" );
}

# Passing --icu on the command line loads the php_utfnormal extension
# before any test runs.
if( isset( $_SERVER['argv'] ) ) {
	if( in_array( '--icu', $_SERVER['argv'] ) ) {
		dl( 'php_utfnormal.so' );
	}
}
29
30 #ini_set( 'memory_limit', '40M' );
31
32 require_once 'PHPUnit/Framework.php';
33 require_once 'PHPUnit/TextUI/TestRunner.php';
34
35 require_once 'UtfNormal.php';
36
37 /**
38 * Additional tests for the UtfNormal::cleanUp() function, including
39 * regression checks for known problems.
40 * Requires PHPUnit.
41 *
42 * @addtogroup UtfNormal
43 * @private
44 */
45 class CleanUpTest extends PHPUnit_Framework_TestCase {
/**
 * Fixture set-up hook. The tests in this class only use local variables,
 * so there is nothing to prepare.
 */
function setUp()
{
	# Intentionally empty.
}
49
/**
 * Fixture tear-down hook. The tests in this class allocate nothing that
 * outlives a single test method, so there is nothing to release.
 */
function tearDown()
{
	# Intentionally empty.
}
53
/**
 * Plain ASCII text is expected to come back from UtfNormal::cleanUp()
 * unchanged.
 */
function testAscii() {
	$ascii = 'This is plain ASCII text.';
	$cleaned = UtfNormal::cleanUp( $ascii );
	$this->assertEquals( $ascii, $cleaned );
}
59
/**
 * An embedded NUL byte is expected to be replaced by the byte sequence
 * EF BF BD while the surrounding text stays as it was.
 */
function testNull() {
	$cleaned = UtfNormal::cleanUp( "a \x00 null" );
	# Compare hex dumps so that a failure report stays readable.
	$this->assertEquals(
		bin2hex( "a \xef\xbf\xbd null" ),
		bin2hex( $cleaned ) );
}
68
/**
 * Text containing the two-byte sequence C3 A9 (a precomposed e-acute) is
 * expected to pass through UtfNormal::cleanUp() unchanged.
 */
function testLatin() {
	$precomposed = "L'\xc3\xa9cole";
	$cleaned = UtfNormal::cleanUp( $precomposed );
	$this->assertEquals( $precomposed, $cleaned );
}
74
/**
 * A plain "e" followed by the bytes CC 81 (combining acute accent) is
 * expected to be turned into the precomposed sequence C3 A9.
 */
function testLatinNormal() {
	$cleaned = UtfNormal::cleanUp( "L'e\xcc\x81cole" );
	$this->assertEquals( "L'\xc3\xa9cole", $cleaned );
}
81
/**
 * Run every code point from U+0000 through UNICODE_MAX (inclusive) through
 * UtfNormal::cleanUp() and check the result.
 *
 * Expectations encoded below:
 *  - tab, LF, CR, and every code point above U+001F that lies outside the
 *    UNICODE_SURROGATE_FIRST..UNICODE_SURROGATE_LAST range and is neither
 *    U+FFFE nor U+FFFF, must equal UtfNormal::NFC() of the character when
 *    it appears in one of the canonical composition/decomposition tables,
 *    and must otherwise come back untouched;
 *  - every other code point must come back as UTF8_REPLACEMENT.
 *
 * This test is *very* expensive! Its name does not start with "test",
 * presumably so that the suite does not pick it up by default.
 */
function XtestAllChars() {
	global $utfCanonicalComp, $utfCanonicalDecomp;
	$rep = UTF8_REPLACEMENT;
	# The upper bound is inclusive so that UNICODE_MAX itself is exercised,
	# in line with the "<= UNICODE_MAX" range check further down.
	for( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
		$char = codepointToUtf8( $i );
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%04X", $i );
		# Progress marker every 0x1000 code points.
		if( $i % 0x1000 == 0 ) echo "U+$x\n";
		if( $i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
			($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
			($i > 0xffff && $i <= UNICODE_MAX ) ) {
			if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
				# The character takes part in canonical (de)composition, so
				# the cleaned form has to match its NFC form.
				$comp = UtfNormal::NFC( $char );
				$this->assertEquals(
					bin2hex( $comp ),
					bin2hex( $clean ),
					"U+$x should be normalized to NFC" );
			} else {
				$this->assertEquals(
					bin2hex( $char ),
					bin2hex( $clean ),
					"U+$x should be intact" );
			}
		} else {
			$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
		}
	}
}
117
/**
 * Run the single-byte sweep with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
function testAllBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);
	foreach( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestBytes( $head, $tail );
	}
}
125
/**
 * Feed every single byte value (0x00-0xFF), wrapped in $head and $tail,
 * through UtfNormal::cleanUp().
 *
 * Tab, LF, CR and the bytes 0x20-0x7F are expected to come back unchanged;
 * every other byte is expected to be replaced by UTF8_REPLACEMENT.
 *
 * @param string $head Text placed in front of the byte under test
 * @param string $tail Text placed after the byte under test
 */
function doTestBytes( $head, $tail ) {
	for( $i = 0x0; $i < 256; $i++ ) {
		$char = $head . chr( $i ) . $tail;
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%02X", $i );
		if( $i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			($i > 0x001f && $i < 0x80) ) {
			$norm = $char;
			$message = "ASCII byte $x should be intact";
		} else {
			$norm = $head . UTF8_REPLACEMENT . $tail;
			$message = "Forbidden byte $x should be rejected";
		}
		# assertSame compares the hex dumps as exact strings; a loose
		# comparison could treat two different all-digit dumps as equal
		# numbers. A failed assertion throws, so no explicit bail-out
		# from the loop is needed after it.
		$this->assertSame( bin2hex( $norm ), bin2hex( $clean ), $message );
	}
}
151
/**
 * Run the two-byte sweep with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
function testDoubleBytes() {
	# The head varies fastest: ('',''), ('x',''), ('','x'), ('x','x').
	foreach( array( '', 'x' ) as $tail ) {
		foreach( array( '', 'x' ) as $head ) {
			$this->doTestDoubleBytes( $head, $tail );
		}
	}
}
159
/**
 * Feed every pair made of a first byte 0xC0-0xFF and a second byte
 * 0x80-0xFF, wrapped in $head and $tail, through UtfNormal::cleanUp().
 *
 * Expectations encoded below:
 *  - first byte 0xC2-0xDF with second byte 0x80-0xBF: the result must
 *    equal UtfNormal::NFC() of the whole input;
 *  - otherwise, first byte 0xFE/0xFF or second byte 0xC0 and above: both
 *    bytes are replaced, giving two UTF8_REPLACEMENT sequences;
 *  - anything else: a single UTF8_REPLACEMENT.
 *
 * @param string $head Text placed in front of the pair under test
 * @param string $tail Text placed after the pair under test
 */
function doTestDoubleBytes( $head, $tail ) {
	for( $first = 0xc0; $first < 0x100; $first++ ) {
		for( $second = 0x80; $second < 0x100; $second++ ) {
			$char = $head . chr( $first ) . chr( $second ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X,%02X", $first, $second );
			if( $first > 0xc1 &&
				$first < 0xe0 &&
				$second < 0xc0 ) {
				$norm = UtfNormal::NFC( $char );
				$message = "Pair $x should be intact";
			} elseif( $first > 0xfd || $second > 0xbf ) {
				# fe and ff are not legal head bytes -- expect two replacement chars
				$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $x should be rejected";
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $x should be rejected";
			}
			# assertSame compares the hex dumps as exact strings. A failed
			# assertion throws, so no explicit bail-out from the loops is
			# needed after it.
			$this->assertSame( bin2hex( $norm ), bin2hex( $clean ), $message );
		}
	}
}
197
/**
 * Run the three-byte sweep with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
function testTripleBytes() {
	$wrappers = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);
	foreach( $wrappers as $wrapper ) {
		$this->doTestTripleBytes( $wrapper[0], $wrapper[1] );
	}
}
205
/**
 * Feed three-byte combinations, wrapped in $head and $tail, through
 * UtfNormal::cleanUp() and compare the result with the expected mix of
 * preserved sequences and UTF8_REPLACEMENT markers.
 *
 * The first byte covers 0xC0-0xFF and the second 0x80-0xFF. The third byte
 * is pinned to 0x80; the sweep over the full 0x80-0xFF range is left
 * disabled (presumably for run time -- TODO confirm).
 *
 * @param string $head Text placed in front of the triplet under test
 * @param string $tail Text placed after the triplet under test
 */
function doTestTripleBytes( $head, $tail ) {
	$bad = UTF8_REPLACEMENT;
	for( $first = 0xc0; $first < 0x100; $first++ ) {
		for( $second = 0x80; $second < 0x100; $second++ ) {
			#for( $third = 0x80; $third < 0x100; $third++ ) {
			for( $third = 0x80; $third < 0x81; $third++ ) {
				$triplet = chr( $first ) . chr( $second ) . chr( $third );
				$char = $head . $triplet . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

				# Work out the expected output and failure message first,
				# then assert once at the bottom.
				if( $first >= 0xe0 && $first < 0xf0 && $second < 0xc0 && $third < 0xc0 ) {
					# Structurally a three-byte sequence.
					if( $first == 0xe0 && $second < 0xa0 ) {
						$expected = $head . $bad . $tail;
						$message = "Overlong triplet $x should be rejected";
					} elseif( $first == 0xed && $triplet >= UTF8_SURROGATE_FIRST ) {
						$expected = $head . $bad . $tail;
						$message = "Surrogate triplet $x should be rejected";
					} else {
						$expected = UtfNormal::NFC( $char );
						$message = "Triplet $x should be intact";
					}
				} elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
					$expected = UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . $bad . $tail;
					$message = "Valid 2-byte $x + broken tail";
				} elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
					$expected = $head . $bad . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
					$message = "Broken head + valid 2-byte $x";
				} elseif( ( $first > 0xfd || $second > 0xfd ) &&
					( ( $second > 0xbf && $third > 0xbf ) ||
					( $second < 0xc0 && $third < 0xc0 ) ||
					( $second > 0xfd ) ||
					( $third > 0xfd ) ) ) {
					# fe and ff are not legal head bytes -- expect three replacement chars
					$expected = $head . $bad . $bad . $bad . $tail;
					$message = "Forbidden triplet $x should be rejected";
				} elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
					$expected = $head . $bad . $tail;
					$message = "Forbidden triplet $x should be rejected";
				} else {
					$expected = $head . $bad . $bad . $tail;
					$message = "Forbidden triplet $x should be rejected";
				}

				$this->assertEquals(
					bin2hex( $expected ),
					bin2hex( $clean ),
					$message );
			}
		}
	}
}
271
/**
 * Check for regression against a chunking bug: a mix of ASCII, one
 * two-byte sequence and several stray bytes, where every stray byte is
 * expected to be replaced by EF BF BD.
 */
function testChunkRegression() {
	$fffd = "\xef\xbf\xbd";
	# Each entry pairs an input fragment with the output expected for it.
	$fragments = array(
		array( "\x46\x55", "\x46\x55" ),
		array( "\xb8",     $fffd ),
		array( "\xdc\x96", "\xdc\x96" ),
		array( "\xee",     $fffd ),
		array( "\xe7",     $fffd ),
		array( "\x44",     "\x44" ),
		array( "\xaa",     $fffd ),
		array( "\x2f\x25", "\x2f\x25" ),
	);

	$text = '';
	$expect = '';
	foreach( $fragments as $fragment ) {
		list( $raw, $cleaned ) = $fragment;
		$text .= $raw;
		$expect .= $cleaned;
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
294
/**
 * Regression check: stray tail bytes and one bad head byte interposed
 * between ASCII characters must each be replaced by EF BF BD, while the
 * ASCII text and the final two-byte sequence are kept.
 */
function testInterposeRegression() {
	$fffd = "\xef\xbf\xbd";
	# Each entry pairs an input fragment with the output expected for it.
	$fragments = array(
		array( "\x4e\x30", "\x4e\x30" ),
		array( "\xb1",     $fffd ), # bad tail
		array( "\x3a",     "\x3a" ),
		array( "\x92",     $fffd ), # bad tail
		array( "\x62\x3a", "\x62\x3a" ),
		array( "\x84",     $fffd ), # bad tail
		array( "\x43",     "\x43" ),
		array( "\xc6",     $fffd ), # bad head
		array( "\x3f",     "\x3f" ),
		array( "\x92",     $fffd ), # bad tail
		array( "\xad",     $fffd ), # bad tail
		array( "\x7d",     "\x7d" ),
		array( "\xd9\x95", "\xd9\x95" ),
	);

	$text = '';
	$expect = '';
	foreach( $fragments as $fragment ) {
		list( $raw, $cleaned ) = $fragment;
		$text .= $raw;
		$expect .= $cleaned;
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
329
/**
 * Regression check around an overlong two-byte sequence (C1 A6) surrounded
 * by forbidden control characters and stray bytes. Each offending fragment,
 * the overlong pair included, is expected to yield exactly one EF BF BD.
 */
function testOverlongRegression() {
	$fffd = "\xef\xbf\xbd";
	# Each entry pairs an input fragment with the output expected for it.
	$fragments = array(
		array( "\x67",     "\x67" ),
		array( "\x1a",     $fffd ), # forbidden ascii
		array( "\xea",     $fffd ), # bad head
		array( "\xc1\xa6", $fffd ), # overlong sequence
		array( "\xad",     $fffd ), # bad tail
		array( "\x1c",     $fffd ), # forbidden ascii
		array( "\xb0",     $fffd ), # bad tail
		array( "\x3c",     "\x3c" ),
		array( "\x9e",     $fffd ), # bad tail
	);

	$text = '';
	$expect = '';
	foreach( $fragments as $fragment ) {
		list( $raw, $cleaned ) = $fragment;
		$text .= $raw;
		$expect .= $cleaned;
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
354
/**
 * Regression check: an encoded surrogate (ED B4 96, i.e. 0xDD16) followed
 * by three stray bytes (83, B4, AC) is expected to come back as four
 * EF BF BD sequences -- one for the surrogate, one per stray byte.
 */
function testSurrogateRegression() {
	$text = "\xed\xb4\x96\x83\xb4\xac";
	$expect = str_repeat( "\xef\xbf\xbd", 4 );
	$cleaned = UtfNormal::cleanUp( $text );
	$this->assertEquals( bin2hex( $expect ), bin2hex( $cleaned ) );
}
369
/**
 * Regression check: U+FFFE (EF BF BE), a stray tail byte and a head byte
 * with no continuation must each be replaced by EF BF BD; the trailing
 * ASCII "Y" (0x59) is kept.
 */
function testBomRegression() {
	$fffd = "\xef\xbf\xbd";
	# U+FFFE (illegal char), bad tail B2, bad head EF, then plain 0x59.
	$text = "\xef\xbf\xbe\xb2\xef\x59";
	$expect = $fffd . $fffd . $fffd . "\x59";
	$cleaned = UtfNormal::cleanUp( $text );
	$this->assertEquals( bin2hex( $expect ), bin2hex( $cleaned ) );
}
384
/**
 * Regression check: U+FFFF (EF BF BF), an illegal character, is expected
 * to be replaced by EF BF BD.
 */
function testForbiddenRegression() {
	$cleaned = UtfNormal::cleanUp( "\xef\xbf\xbf" );
	$this->assertEquals(
		bin2hex( "\xef\xbf\xbd" ),
		bin2hex( $cleaned ) );
}
393
/**
 * Regression check: a Hangul character followed by another final jamo
 * must *not* be changed by UtfNormal::cleanUp().
 */
function testHangulRegression() {
	$hangulChar = "\xed\x9c\xaf";
	$finalJamo = "\xe1\x87\x81";
	$text = $hangulChar . $finalJamo;
	$cleaned = UtfNormal::cleanUp( $text );
	# The input doubles as the expected output.
	$this->assertEquals( bin2hex( $text ), bin2hex( $cleaned ) );
}
403 }
404
405
# Run every test in CleanUpTest through the text UI runner and report the
# outcome through the process exit status: 0 on success, -1 otherwise.
$suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
$result = PHPUnit_TextUI_TestRunner::run( $suite );

exit( $result->wasSuccessful() ? 0 : -1 );
413 ?>