includes/normal/CleanUpTest.php

   1 <?php
   2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
   3 # http://www.mediawiki.org/
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18 # http://www.gnu.org/copyleft/gpl.html
  19
  20 /**
  21  * Additional tests for the UtfNormal::cleanUp() function, including
  22  * regression checks for known problems.
  23  *
  24  * Requires PHPUnit.
  25  *
  26  * @package UtfNormal
  27  * @private
  28  */
  29
  30 if( php_sapi_name() != 'cli' ) {
  31         die( "Run me from the command line please.\n" );
  32 }
  33
  34 /** */
  35 if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
  36         dl( 'php_utfnormal.so' );
  37 }
  38
  39 #ini_set( 'memory_limit', '40M' );
  40
  41 require_once 'PHPUnit/Framework.php';
  42 require_once 'PHPUnit/TextUI/TestRunner.php';
  43
  44 require_once 'UtfNormal.php';
  45
  46 /**
  47  * @package UtfNormal
  48  */
  49 class CleanUpTest extends PHPUnit_Framework_TestCase {
  50         /** @todo document */
  51         function setUp() {
  52         }
  53
  54         /** @todo document */
  55         function tearDown() {
  56         }
  57
  58         /** @todo document */
  59         function testAscii() {
  60                 $text = 'This is plain ASCII text.';
  61                 $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
  62         }
  63
  64         /** @todo document */
  65         function testNull() {
  66                 $text = "a \x00 null";
  67                 $expect = "a \xef\xbf\xbd null";
  68                 $this->assertEquals(
  69                         bin2hex( $expect ),
  70                         bin2hex( UtfNormal::cleanUp( $text ) ) );
  71         }
  72
  73         /** @todo document */
  74         function testLatin() {
  75                 $text = "L'\xc3\xa9cole";
  76                 $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
  77         }
  78
  79         /** @todo document */
  80         function testLatinNormal() {
  81                 $text = "L'e\xcc\x81cole";
  82                 $expect = "L'\xc3\xa9cole";
  83                 $this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
  84         }
  85
  86         /**
  87          * This test is *very* expensive!
  88          * @todo document
  89          */
  90         function XtestAllChars() {
  91                 $rep = UTF8_REPLACEMENT;
  92                 global $utfCanonicalComp, $utfCanonicalDecomp;
  93                 for( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
  94                         $char = codepointToUtf8( $i );
  95                         $clean = UtfNormal::cleanUp( $char );
  96                         $x = sprintf( "%04X", $i );
  97                         if( $i % 0x1000 == 0 ) echo "U+$x\n";
  98                         if( $i == 0x0009 ||
  99                             $i == 0x000a ||
 100                             $i == 0x000d ||
 101                             ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
 102                             ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
 103                             ($i > 0xffff && $i <= UNICODE_MAX ) ) {
 104                                 if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
 105                                     $comp = UtfNormal::NFC( $char );
 106                                         $this->assertEquals(
 107                                                 bin2hex( $comp ),
 108                                                 bin2hex( $clean ),
 109                                                 "U+$x should be decomposed" );
 110                                 } else {
 111                                         $this->assertEquals(
 112                                                 bin2hex( $char ),
 113                                                 bin2hex( $clean ),
 114                                                 "U+$x should be intact" );
 115                                 }
 116                         } else {
 117                                 $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
 118                         }
 119                 }
 120         }
 121
 122         /** @todo document */
 123         function testAllBytes() {
 124                 $this->doTestBytes( '', '' );
 125                 $this->doTestBytes( 'x', '' );
 126                 $this->doTestBytes( '', 'x' );
 127                 $this->doTestBytes( 'x', 'x' );
 128         }
 129
 130         /** @todo document */
 131         function doTestBytes( $head, $tail ) {
 132                 for( $i = 0x0; $i < 256; $i++ ) {
 133                         $char = $head . chr( $i ) . $tail;
 134                         $clean = UtfNormal::cleanUp( $char );
 135                         $x = sprintf( "%02X", $i );
 136                         if( $i == 0x0009 ||
 137                             $i == 0x000a ||
 138                             $i == 0x000d ||
 139                             ($i > 0x001f && $i < 0x80) ) {
 140                                 $this->assertEquals(
 141                                         bin2hex( $char ),
 142                                         bin2hex( $clean ),
 143                                         "ASCII byte $x should be intact" );
 144                                 if( $char != $clean ) return;
 145                         } else {
 146                                 $norm = $head . UTF8_REPLACEMENT . $tail;
 147                                 $this->assertEquals(
 148                                         bin2hex( $norm ),
 149                                         bin2hex( $clean ),
 150                                         "Forbidden byte $x should be rejected" );
 151                                 if( $norm != $clean ) return;
 152                         }
 153                 }
 154         }
 155
 156         /** @todo document */
 157         function testDoubleBytes() {
 158                 $this->doTestDoubleBytes( '', '' );
 159                 $this->doTestDoubleBytes( 'x', '' );
 160                 $this->doTestDoubleBytes( '', 'x' );
 161                 $this->doTestDoubleBytes( 'x', 'x' );
 162         }
 163
 164         /**
 165          * @todo document
 166          */
 167         function doTestDoubleBytes( $head, $tail ) {
 168                 for( $first = 0xc0; $first < 0x100; $first++ ) {
 169                         for( $second = 0x80; $second < 0x100; $second++ ) {
 170                                 $char = $head . chr( $first ) . chr( $second ) . $tail;
 171                                 $clean = UtfNormal::cleanUp( $char );
 172                                 $x = sprintf( "%02X,%02X", $first, $second );
 173                                 if( $first > 0xc1 &&
 174                                     $first < 0xe0 &&
 175                                     $second < 0xc0 ) {
 176                                     $norm = UtfNormal::NFC( $char );
 177                                         $this->assertEquals(
 178                                                 bin2hex( $norm ),
 179                                                 bin2hex( $clean ),
 180                                                 "Pair $x should be intact" );
 181                                     if( $norm != $clean ) return;
 182                                 } elseif( $first > 0xfd || $second > 0xbf ) {
 183                                         # fe and ff are not legal head bytes -- expect two replacement chars
 184                                         $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
 185                                         $this->assertEquals(
 186                                                 bin2hex( $norm ),
 187                                                 bin2hex( $clean ),
 188                                                 "Forbidden pair $x should be rejected" );
 189                                         if( $norm != $clean ) return;
 190                                 } else {
 191                                         $norm = $head . UTF8_REPLACEMENT . $tail;
 192                                         $this->assertEquals(
 193                                                 bin2hex( $norm ),
 194                                                 bin2hex( $clean ),
 195                                                 "Forbidden pair $x should be rejected" );
 196                                         if( $norm != $clean ) return;
 197                                 }
 198                         }
 199                 }
 200         }
 201
 202         /** @todo document */
 203         function testTripleBytes() {
 204                 $this->doTestTripleBytes( '', '' );
 205                 $this->doTestTripleBytes( 'x', '' );
 206                 $this->doTestTripleBytes( '', 'x' );
 207                 $this->doTestTripleBytes( 'x', 'x' );
 208         }
 209
 210         /** @todo document */
 211         function doTestTripleBytes( $head, $tail ) {
 212                 for( $first = 0xc0; $first < 0x100; $first++ ) {
 213                         for( $second = 0x80; $second < 0x100; $second++ ) {
 214                                 #for( $third = 0x80; $third < 0x100; $third++ ) {
 215                                 for( $third = 0x80; $third < 0x81; $third++ ) {
 216                                         $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
 217                                         $clean = UtfNormal::cleanUp( $char );
 218                                         $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
 219                                         if( $first >= 0xe0 &&
 220                                                 $first < 0xf0 &&
 221                                                 $second < 0xc0 &&
 222                                                 $third < 0xc0 ) {
 223                                                 if( $first == 0xe0 && $second < 0xa0 ) {
 224                                                         $this->assertEquals(
 225                                                                 bin2hex( $head . UTF8_REPLACEMENT . $tail ),
 226                                                                 bin2hex( $clean ),
 227                                                                 "Overlong triplet $x should be rejected" );
 228                                                 } elseif( $first == 0xed &&
 229                                                         ( chr( $first ) . chr( $second ) . chr( $third ))  >= UTF8_SURROGATE_FIRST ) {
 230                                                         $this->assertEquals(
 231                                                                 bin2hex( $head . UTF8_REPLACEMENT . $tail ),
 232                                                                 bin2hex( $clean ),
 233                                                                 "Surrogate triplet $x should be rejected" );
 234                                                 } else {
 235                                                         $this->assertEquals(
 236                                                                 bin2hex( UtfNormal::NFC( $char ) ),
 237                                                                 bin2hex( $clean ),
 238                                                                 "Triplet $x should be intact" );
 239                                                 }
 240                                         } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
 241                                                 $this->assertEquals(
 242                                                         bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
 243                                                         bin2hex( $clean ),
 244                                                         "Valid 2-byte $x + broken tail" );
 245                                         } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
 246                                                 $this->assertEquals(
 247                                                         bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
 248                                                         bin2hex( $clean ),
 249                                                         "Broken head + valid 2-byte $x" );
 250                                         } elseif( ( $first > 0xfd || $second > 0xfd ) &&
 251                                                     ( ( $second > 0xbf && $third > 0xbf ) ||
 252                                                       ( $second < 0xc0 && $third < 0xc0 ) ||
 253                                                       ( $second > 0xfd ) ||
 254                                                       ( $third > 0xfd ) ) ) {
 255                                                 # fe and ff are not legal head bytes -- expect three replacement chars
 256                                                 $this->assertEquals(
 257                                                         bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
 258                                                         bin2hex( $clean ),
 259                                                         "Forbidden triplet $x should be rejected" );
 260                                         } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
 261                                                 $this->assertEquals(
 262                                                         bin2hex( $head . UTF8_REPLACEMENT . $tail ),
 263                                                         bin2hex( $clean ),
 264                                                         "Forbidden triplet $x should be rejected" );
 265                                         } else {
 266                                                 $this->assertEquals(
 267                                                         bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
 268                                                         bin2hex( $clean ),
 269                                                         "Forbidden triplet $x should be rejected" );
 270                                         }
 271                                 }
 272                         }
 273                 }
 274         }
 275
 276         /** @todo document */
 277         function testChunkRegression() {
 278                 # Check for regression against a chunking bug
 279                 $text   = "\x46\x55\xb8" .
 280                           "\xdc\x96" .
 281                           "\xee" .
 282                           "\xe7" .
 283                           "\x44" .
 284                           "\xaa" .
 285                           "\x2f\x25";
 286                 $expect = "\x46\x55\xef\xbf\xbd" .
 287                           "\xdc\x96" .
 288                           "\xef\xbf\xbd" .
 289                           "\xef\xbf\xbd" .
 290                           "\x44" .
 291                           "\xef\xbf\xbd" .
 292                           "\x2f\x25";
 293
 294                 $this->assertEquals(
 295                         bin2hex( $expect ),
 296                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 297         }
 298
 299         /** @todo document */
 300         function testInterposeRegression() {
 301                 $text   = "\x4e\x30" .
 302                           "\xb1" .              # bad tail
 303                           "\x3a" .
 304                           "\x92" .              # bad tail
 305                           "\x62\x3a" .
 306                           "\x84" .              # bad tail
 307                           "\x43" .
 308                           "\xc6" .              # bad head
 309                           "\x3f" .
 310                           "\x92" .              # bad tail
 311                           "\xad" .              # bad tail
 312                           "\x7d" .
 313                           "\xd9\x95";
 314
 315                 $expect = "\x4e\x30" .
 316                           "\xef\xbf\xbd" .
 317                           "\x3a" .
 318                           "\xef\xbf\xbd" .
 319                           "\x62\x3a" .
 320                           "\xef\xbf\xbd" .
 321                           "\x43" .
 322                           "\xef\xbf\xbd" .
 323                           "\x3f" .
 324                           "\xef\xbf\xbd" .
 325                           "\xef\xbf\xbd" .
 326                           "\x7d" .
 327                           "\xd9\x95";
 328
 329                 $this->assertEquals(
 330                         bin2hex( $expect ),
 331                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 332         }
 333
 334         /** @todo document */
 335         function testOverlongRegression() {
 336                 $text   = "\x67" .
 337                           "\x1a" . # forbidden ascii
 338                           "\xea" . # bad head
 339                           "\xc1\xa6" . # overlong sequence
 340                           "\xad" . # bad tail
 341                           "\x1c" . # forbidden ascii
 342                           "\xb0" . # bad tail
 343                           "\x3c" .
 344                           "\x9e";  # bad tail
 345                 $expect = "\x67" .
 346                           "\xef\xbf\xbd" .
 347                           "\xef\xbf\xbd" .
 348                           "\xef\xbf\xbd" .
 349                           "\xef\xbf\xbd" .
 350                           "\xef\xbf\xbd" .
 351                           "\xef\xbf\xbd" .
 352                           "\x3c" .
 353                           "\xef\xbf\xbd";
 354                 $this->assertEquals(
 355                         bin2hex( $expect ),
 356                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 357         }
 358
 359         /** @todo document */
 360         function testSurrogateRegression() {
 361                 $text   = "\xed\xb4\x96" . # surrogate 0xDD16
 362                           "\x83" . # bad tail
 363                           "\xb4" . # bad tail
 364                           "\xac";  # bad head
 365                 $expect = "\xef\xbf\xbd" .
 366                           "\xef\xbf\xbd" .
 367                           "\xef\xbf\xbd" .
 368                           "\xef\xbf\xbd";
 369                 $this->assertEquals(
 370                         bin2hex( $expect ),
 371                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 372         }
 373
 374         /** @todo document */
 375         function testBomRegression() {
 376                 $text   = "\xef\xbf\xbe" . # U+FFFE, illegal char
 377                           "\xb2" . # bad tail
 378                           "\xef" . # bad head
 379                           "\x59";
 380                 $expect = "\xef\xbf\xbd" .
 381                           "\xef\xbf\xbd" .
 382                           "\xef\xbf\xbd" .
 383                           "\x59";
 384                 $this->assertEquals(
 385                         bin2hex( $expect ),
 386                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 387         }
 388
 389         /** @todo document */
 390         function testForbiddenRegression() {
 391                 $text   = "\xef\xbf\xbf"; # U+FFFF, illegal char
 392                 $expect = "\xef\xbf\xbd";
 393                 $this->assertEquals(
 394                         bin2hex( $expect ),
 395                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 396         }
 397
 398         /** @todo document */
 399         function testHangulRegression() {
 400                 $text = "\xed\x9c\xaf" . # Hangul char
 401                                 "\xe1\x87\x81";  # followed by another final jamo
 402                 $expect = $text;         # Should *not* change.
 403                 $this->assertEquals(
 404                         bin2hex( $expect ),
 405                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 406         }
 407 }
 408
 409
 410 $suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
 411 $result = PHPUnit_TextUI_TestRunner::run( $suite );
 412
 413 if( !$result->wasSuccessful() ) {
 414         exit( -1 );
 415 }
 416 exit( 0 );
 417 ?>