Merge "Allow API results to wrap long lines"
[lhc/web/wiklou.git] / tests / phpunit / includes / normal / CleanUpTest.php
1 <?php
2 /**
3 * Tests for UtfNormal::cleanUp() function.
4 *
5 * Copyright © 2004 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 */
25
26 /**
27 * Additional tests for UtfNormal::cleanUp() function, including
28 * regression checks for known problems.
29 * Requires PHPUnit.
30 *
31 * @ingroup UtfNormal
32 */
33 class CleanUpTest extends MediaWikiTestCase {
/**
 * Plain ASCII text must come back from UtfNormal::cleanUp() unchanged.
 */
function testAscii() {
	$ascii = 'This is plain ASCII text.';
	$cleaned = UtfNormal::cleanUp( $ascii );

	$this->assertEquals( $ascii, $cleaned );
}
39
/**
 * A NUL byte embedded in ASCII text must be replaced by the bytes
 * EF BF BD (the UTF-8 encoding of U+FFFD).
 *
 * Both sides are compared as hex dumps.
 */
function testNull() {
	$input = "a \x00 null";
	$expected = "a \xef\xbf\xbd null";

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
48
/**
 * Text containing a precomposed e-acute (bytes C3 A9) must come back
 * from UtfNormal::cleanUp() unchanged.
 */
function testLatin() {
	$precomposed = "L'\xc3\xa9cole";
	$cleaned = UtfNormal::cleanUp( $precomposed );

	$this->assertEquals( $precomposed, $cleaned );
}
54
/**
 * An "e" followed by a combining acute accent (bytes CC 81) must be
 * composed into the single precomposed e-acute (bytes C3 A9).
 */
function testLatinNormal() {
	$decomposed = "L'e\xcc\x81cole";
	$composed = "L'\xc3\xa9cole";

	$cleaned = UtfNormal::cleanUp( $decomposed );

	$this->assertEquals( $composed, $cleaned );
}
61
/**
 * Run every code point from U+0000 through UNICODE_MAX (inclusive)
 * through UtfNormal::cleanUp().
 *
 * This test is *very* expensive. The "X" prefix means the method name
 * does not start with "test", so PHPUnit does not run it.
 *
 * Expectations:
 * - Tab, LF, CR and every code point above U+001F that is neither a
 *   surrogate nor U+FFFE/U+FFFF is allowed. If it has an entry in the
 *   canonical composition or decomposition table, the result must equal
 *   UtfNormal::NFC() of the character; otherwise it must be unchanged.
 * - Every other code point must be replaced by UTF8_REPLACEMENT.
 */
function XtestAllChars() {
	$rep = UTF8_REPLACEMENT;
	# Inclusive bound: UNICODE_MAX itself is accepted by the range check
	# below, so it has to be visited as well.
	for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
		$char = codepointToUtf8( $i );
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%04X", $i );

		# Progress marker every 0x1000 code points
		if ( $i % 0x1000 === 0 ) {
			echo "U+$x\n";
		}

		$isAllowed = $i === 0x0009 ||
			$i === 0x000a ||
			$i === 0x000d ||
			( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
			( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
			( $i > 0xffff && $i <= UNICODE_MAX );

		if ( !$isAllowed ) {
			$this->assertSame( bin2hex( $rep ), bin2hex( $clean ), $x );
			continue;
		}

		$hasMapping = isset( UtfNormal::$utfCanonicalComp[$char] )
			|| isset( UtfNormal::$utfCanonicalDecomp[$char] );

		if ( $hasMapping ) {
			$comp = UtfNormal::NFC( $char );
			$this->assertSame(
				bin2hex( $comp ),
				bin2hex( $clean ),
				"U+$x should be normalized to NFC" );
		} else {
			$this->assertSame(
				bin2hex( $char ),
				bin2hex( $clean ),
				"U+$x should be intact" );
		}
	}
}
101
/**
 * Run the single-byte checks with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
function testAllBytes() {
	$affixes = array( '', 'x' );

	foreach ( $affixes as $tail ) {
		foreach ( $affixes as $head ) {
			$this->doTestBytes( $head, $tail );
		}
	}
}
109
/**
 * Wrap each of the 256 possible byte values in $head and $tail and check
 * what UtfNormal::cleanUp() does with it.
 *
 * Tab, LF, CR and the bytes 0x20-0x7F must survive untouched; any other
 * byte must be replaced by UTF8_REPLACEMENT.
 *
 * @param string $head Text placed before the byte under test
 * @param string $tail Text placed after the byte under test
 */
function doTestBytes( $head, $tail ) {
	for ( $i = 0x0; $i < 256; $i++ ) {
		$char = $head . chr( $i ) . $tail;
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%02X", $i );

		$isAllowedAscii = $i === 0x0009 ||
			$i === 0x000a ||
			$i === 0x000d ||
			( $i > 0x001f && $i < 0x80 );

		if ( $isAllowedAscii ) {
			$expected = $char;
			$message = "ASCII byte $x should be intact";
		} else {
			$expected = $head . UTF8_REPLACEMENT . $tail;
			$message = "Forbidden byte $x should be rejected";
		}

		# A failed assertion throws, so no manual bail-out is needed here.
		$this->assertSame( bin2hex( $expected ), bin2hex( $clean ), $message );
	}
}
141
/**
 * Run the two-byte sequence checks with every combination of an empty or
 * one-character ("x") prefix and suffix.
 */
function testDoubleBytes() {
	$affixes = array( '', 'x' );

	foreach ( $affixes as $tail ) {
		foreach ( $affixes as $head ) {
			$this->doTestDoubleBytes( $head, $tail );
		}
	}
}
149
/**
 * Check UtfNormal::cleanUp() against two-byte sequences wrapped in $head
 * and $tail.
 *
 * First bytes start at 0xC0 and second bytes at 0x80; both advance in
 * steps of two, so only even byte values are exercised.
 *
 * @param string $head Text placed before the byte pair under test
 * @param string $tail Text placed after the byte pair under test
 */
function doTestDoubleBytes( $head, $tail ) {
	for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
		for ( $second = 0x80; $second < 0x100; $second += 2 ) {
			$char = $head . chr( $first ) . chr( $second ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X,%02X", $first, $second );

			if ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
				# Accepted pair: the result must match plain NFC normalization
				$norm = UtfNormal::NFC( $char );
				$message = "Pair $x should be intact";
			} elseif ( $first > 0xfd || $second > 0xbf ) {
				# fe and ff are not legal head bytes, and a second byte above
				# bf is not a tail byte -- expect two replacement chars
				$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $x should be rejected";
			} else {
				# Remaining pairs are expected to collapse into one replacement char
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $x should be rejected";
			}

			# A failed assertion throws, so no manual bail-out is needed here.
			$this->assertSame( bin2hex( $norm ), bin2hex( $clean ), $message );
		}
	}
}
194
/**
 * Run the three-byte sequence checks with every combination of an empty
 * or one-character ("x") prefix and suffix.
 */
function testTripleBytes() {
	$affixes = array( '', 'x' );

	foreach ( $affixes as $tail ) {
		foreach ( $affixes as $head ) {
			$this->doTestTripleBytes( $head, $tail );
		}
	}
}
202
/**
 * Check UtfNormal::cleanUp() against three-byte sequences wrapped in
 * $head and $tail.
 *
 * First bytes start at 0xC0 and second bytes at 0x80, both in steps of
 * two; the third byte is always 0x80. Because only even first bytes are
 * visited, the surrogate branch (first byte 0xED) is never reached with
 * the current bounds.
 *
 * @param string $head Text placed before the byte triplet under test
 * @param string $tail Text placed after the byte triplet under test
 */
function doTestTripleBytes( $head, $tail ) {
	$rep = UTF8_REPLACEMENT;

	for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
		for ( $second = 0x80; $second < 0x100; $second += 2 ) {
			# Raise the bound to 0x100 to try every third byte instead of just 0x80
			for ( $third = 0x80; $third < 0x81; $third++ ) {
				$triplet = chr( $first ) . chr( $second ) . chr( $third );
				$input = $head . $triplet . $tail;
				$clean = UtfNormal::cleanUp( $input );
				$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

				# Bytes below 0xc0 are in tail-byte range (the loops start at 0x80)
				$secondIsTail = $second < 0xc0;
				$thirdIsTail = $third < 0xc0;

				# Message shared by the three generic rejection branches
				$message = "Forbidden triplet $x should be rejected";

				if ( $first >= 0xe0 && $first < 0xf0 && $secondIsTail && $thirdIsTail ) {
					# Structurally a three-byte sequence
					if ( $first == 0xe0 && $second < 0xa0 ) {
						$expected = $head . $rep . $tail;
						$message = "Overlong triplet $x should be rejected";
					} elseif ( $first == 0xed && $triplet >= UTF8_SURROGATE_FIRST ) {
						$expected = $head . $rep . $tail;
						$message = "Surrogate triplet $x should be rejected";
					} else {
						$expected = UtfNormal::NFC( $input );
						$message = "Triplet $x should be intact";
					}
				} elseif ( $first > 0xc1 && $first < 0xe0 && $secondIsTail ) {
					$expected = UtfNormal::NFC( $head . chr( $first ) . chr( $second ) )
						. $rep . $tail;
					$message = "Valid 2-byte $x + broken tail";
				} elseif ( $second > 0xc1 && $second < 0xe0 && $thirdIsTail ) {
					$expected = $head . $rep
						. UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
					$message = "Broken head + valid 2-byte $x";
				} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
					( ( !$secondIsTail && !$thirdIsTail ) ||
						( $secondIsTail && $thirdIsTail ) ||
						( $second > 0xfd ) ||
						( $third > 0xfd ) )
				) {
					# fe and ff are not legal head bytes -- expect three replacement chars
					$expected = $head . $rep . $rep . $rep . $tail;
				} elseif ( $first > 0xc2 && $secondIsTail && $thirdIsTail ) {
					$expected = $head . $rep . $tail;
				} else {
					$expected = $head . $rep . $rep . $tail;
				}

				$this->assertEquals( bin2hex( $expected ), bin2hex( $clean ), $message );
			}
		}
	}
}
272
/**
 * Check for regression against a chunking bug.
 *
 * The input mixes ASCII, one two-byte sequence that must be kept, and
 * four bytes that must each become a replacement character.
 */
function testChunkRegression() {
	$fffd = "\xef\xbf\xbd";

	$input = "\x46\x55\xb8" . # b8 is replaced
		"\xdc\x96" . # kept as is
		"\xee" . # replaced
		"\xe7" . # replaced
		"\x44" .
		"\xaa" . # replaced
		"\x2f\x25";

	$expected = "\x46\x55" . $fffd .
		"\xdc\x96" .
		$fffd .
		$fffd .
		"\x44" .
		$fffd .
		"\x2f\x25";

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
295
/**
 * Regression check: bad head and tail bytes interleaved with ASCII must
 * each become one replacement character, while the ASCII bytes and the
 * final two-byte sequence are kept.
 */
function testInterposeRegression() {
	$fffd = "\xef\xbf\xbd";

	$input = "\x4e\x30" .
		"\xb1" . # bad tail
		"\x3a" .
		"\x92" . # bad tail
		"\x62\x3a" .
		"\x84" . # bad tail
		"\x43" .
		"\xc6" . # bad head
		"\x3f" .
		"\x92" . # bad tail
		"\xad" . # bad tail
		"\x7d" .
		"\xd9\x95";

	$expected = "\x4e\x30" .
		$fffd .
		"\x3a" .
		$fffd .
		"\x62\x3a" .
		$fffd .
		"\x43" .
		$fffd .
		"\x3f" .
		$fffd .
		$fffd .
		"\x7d" .
		"\xd9\x95";

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
330
/**
 * Regression check: forbidden ASCII control bytes, stray head and tail
 * bytes and an overlong two-byte sequence are replaced; the overlong
 * pair yields a single replacement character.
 */
function testOverlongRegression() {
	$fffd = "\xef\xbf\xbd";

	$input = "\x67" .
		"\x1a" . # forbidden ascii
		"\xea" . # bad head
		"\xc1\xa6" . # overlong sequence
		"\xad" . # bad tail
		"\x1c" . # forbidden ascii
		"\xb0" . # bad tail
		"\x3c" .
		"\x9e"; # bad tail

	$expected = "\x67" .
		$fffd . # for 1a
		$fffd . # for ea
		$fffd . # for c1 a6
		$fffd . # for ad
		$fffd . # for 1c
		$fffd . # for b0
		"\x3c" .
		$fffd; # for 9e

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
355
/**
 * Regression check: an encoded surrogate followed by three stray tail
 * bytes must turn into exactly four replacement characters.
 */
function testSurrogateRegression() {
	$fffd = "\xef\xbf\xbd";

	$input = "\xed\xb4\x96" . # surrogate 0xDD16
		"\x83" . # bad tail
		"\xb4" . # bad tail
		"\xac"; # bad tail (ac is a tail byte, not a head byte)

	$expected = $fffd . $fffd . $fffd . $fffd;

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
370
/**
 * Regression check: U+FFFE followed by a stray tail byte and a stray
 * head byte gives three replacement characters, and the trailing ASCII
 * byte is kept.
 */
function testBomRegression() {
	$fffd = "\xef\xbf\xbd";

	$input = "\xef\xbf\xbe" . # U+FFFE, illegal char
		"\xb2" . # bad tail
		"\xef" . # bad head
		"\x59";

	$expected = $fffd . $fffd . $fffd . "\x59";

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
385
/**
 * Regression check: U+FFFF on its own must be replaced by a single
 * replacement character.
 */
function testForbiddenRegression() {
	$input = "\xef\xbf\xbf"; # U+FFFF, illegal char
	$expected = "\xef\xbf\xbd";

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $expected ), bin2hex( $actual ) );
}
394
/**
 * Regression check: a Hangul char followed by another final jamo must
 * not be changed by UtfNormal::cleanUp().
 */
function testHangulRegression() {
	$input = "\xed\x9c\xaf" . # Hangul char
		"\xe1\x87\x81"; # followed by another final jamo

	$actual = UtfNormal::cleanUp( $input );

	# Should *not* change, so the input doubles as the expected value.
	$this->assertEquals( bin2hex( $input ), bin2hex( $actual ) );
}
404 }