includes/normal/CleanUpTest.php

   1 <?php
   2 /**
   3  * Tests for UtfNormal::cleanUp() function.
   4  *
   5  * Copyright © 2004 Brion Vibber <brion@pobox.com>
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup UtfNormal
  25  */
  26
  27
  28 if( php_sapi_name() != 'cli' ) {
  29         die( "Run me from the command line please.\n" );
  30 }
  31
  32 /** */
  33 if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
  34         dl( 'php_utfnormal.so' );
  35 }
  36
  37 #ini_set( 'memory_limit', '40M' );
  38
  39 require_once( 'PHPUnit/Runner/Version.php' );
  40 if( version_compare( PHPUnit_Runner_Version::id(), '3.5.0', '>=' ) ) {
  41     # PHPUnit 3.5.0 introduced a nice autoloader based on class name
  42     require_once( 'PHPUnit/Autoload.php' );
  43 } else {
  44         # Keep the old pre PHPUnit 3.5.0 behaviour for compatibility
  45         require_once 'PHPUnit/Framework.php';
  46 }
  47 require_once 'PHPUnit/TextUI/TestRunner.php';
  48
  49 require_once 'UtfNormal.php';
  50
  51 /**
   52  * Additional tests for UtfNormal::cleanUp() function, including
   53  * regression checks for known problems.
  54  * Requires PHPUnit.
  55  *
  56  * @ingroup UtfNormal
  57  * @private
  58  */
  59 class CleanUpTest extends PHPUnit_Framework_TestCase {
  60         /** @todo document */
  61         function setUp() {
  62         }
  63
  64         /** @todo document */
  65         function tearDown() {
  66         }
  67
  68         /** @todo document */
  69         function testAscii() {
  70                 $text = 'This is plain ASCII text.';
  71                 $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
  72         }
  73
  74         /** @todo document */
  75         function testNull() {
  76                 $text = "a \x00 null";
  77                 $expect = "a \xef\xbf\xbd null";
  78                 $this->assertEquals(
  79                         bin2hex( $expect ),
  80                         bin2hex( UtfNormal::cleanUp( $text ) ) );
  81         }
  82
  83         /** @todo document */
  84         function testLatin() {
  85                 $text = "L'\xc3\xa9cole";
  86                 $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
  87         }
  88
  89         /** @todo document */
  90         function testLatinNormal() {
  91                 $text = "L'e\xcc\x81cole";
  92                 $expect = "L'\xc3\xa9cole";
  93                 $this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
  94         }
  95
  96         /**
  97          * This test is *very* expensive!
  98          * @todo document
  99          */
 100         function XtestAllChars() {
 101                 $rep = UTF8_REPLACEMENT;
 102                 for( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
 103                         $char = codepointToUtf8( $i );
 104                         $clean = UtfNormal::cleanUp( $char );
 105                         $x = sprintf( "%04X", $i );
 106                         if( $i % 0x1000 == 0 ) echo "U+$x\n";
 107                         if( $i == 0x0009 ||
 108                             $i == 0x000a ||
 109                             $i == 0x000d ||
 110                             ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
 111                             ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
 112                             ($i > 0xffff && $i <= UNICODE_MAX ) ) {
 113                                 if( isset( UtfNormal::$utfCanonicalComp[$char] ) || isset( UtfNormal::$utfCanonicalDecomp[$char] ) ) {
 114                                     $comp = UtfNormal::NFC( $char );
 115                                         $this->assertEquals(
 116                                                 bin2hex( $comp ),
 117                                                 bin2hex( $clean ),
 118                                                 "U+$x should be decomposed" );
 119                                 } else {
 120                                         $this->assertEquals(
 121                                                 bin2hex( $char ),
 122                                                 bin2hex( $clean ),
 123                                                 "U+$x should be intact" );
 124                                 }
 125                         } else {
 126                                 $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
 127                         }
 128                 }
 129         }
 130
 131         /** @todo document */
 132         function testAllBytes() {
 133                 $this->doTestBytes( '', '' );
 134                 $this->doTestBytes( 'x', '' );
 135                 $this->doTestBytes( '', 'x' );
 136                 $this->doTestBytes( 'x', 'x' );
 137         }
 138
 139         /** @todo document */
 140         function doTestBytes( $head, $tail ) {
 141                 for( $i = 0x0; $i < 256; $i++ ) {
 142                         $char = $head . chr( $i ) . $tail;
 143                         $clean = UtfNormal::cleanUp( $char );
 144                         $x = sprintf( "%02X", $i );
 145                         if( $i == 0x0009 ||
 146                             $i == 0x000a ||
 147                             $i == 0x000d ||
 148                             ($i > 0x001f && $i < 0x80) ) {
 149                                 $this->assertEquals(
 150                                         bin2hex( $char ),
 151                                         bin2hex( $clean ),
 152                                         "ASCII byte $x should be intact" );
 153                                 if( $char != $clean ) return;
 154                         } else {
 155                                 $norm = $head . UTF8_REPLACEMENT . $tail;
 156                                 $this->assertEquals(
 157                                         bin2hex( $norm ),
 158                                         bin2hex( $clean ),
 159                                         "Forbidden byte $x should be rejected" );
 160                                 if( $norm != $clean ) return;
 161                         }
 162                 }
 163         }
 164
 165         /** @todo document */
 166         function testDoubleBytes() {
 167                 $this->doTestDoubleBytes( '', '' );
 168                 $this->doTestDoubleBytes( 'x', '' );
 169                 $this->doTestDoubleBytes( '', 'x' );
 170                 $this->doTestDoubleBytes( 'x', 'x' );
 171         }
 172
 173         /**
 174          * @todo document
 175          */
 176         function doTestDoubleBytes( $head, $tail ) {
 177                 for( $first = 0xc0; $first < 0x100; $first++ ) {
 178                         for( $second = 0x80; $second < 0x100; $second++ ) {
 179                                 $char = $head . chr( $first ) . chr( $second ) . $tail;
 180                                 $clean = UtfNormal::cleanUp( $char );
 181                                 $x = sprintf( "%02X,%02X", $first, $second );
 182                                 if( $first > 0xc1 &&
 183                                     $first < 0xe0 &&
 184                                     $second < 0xc0 ) {
 185                                     $norm = UtfNormal::NFC( $char );
 186                                         $this->assertEquals(
 187                                                 bin2hex( $norm ),
 188                                                 bin2hex( $clean ),
 189                                                 "Pair $x should be intact" );
 190                                     if( $norm != $clean ) return;
 191                                 } elseif( $first > 0xfd || $second > 0xbf ) {
 192                                         # fe and ff are not legal head bytes -- expect two replacement chars
 193                                         $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
 194                                         $this->assertEquals(
 195                                                 bin2hex( $norm ),
 196                                                 bin2hex( $clean ),
 197                                                 "Forbidden pair $x should be rejected" );
 198                                         if( $norm != $clean ) return;
 199                                 } else {
 200                                         $norm = $head . UTF8_REPLACEMENT . $tail;
 201                                         $this->assertEquals(
 202                                                 bin2hex( $norm ),
 203                                                 bin2hex( $clean ),
 204                                                 "Forbidden pair $x should be rejected" );
 205                                         if( $norm != $clean ) return;
 206                                 }
 207                         }
 208                 }
 209         }
 210
 211         /** @todo document */
 212         function testTripleBytes() {
 213                 $this->doTestTripleBytes( '', '' );
 214                 $this->doTestTripleBytes( 'x', '' );
 215                 $this->doTestTripleBytes( '', 'x' );
 216                 $this->doTestTripleBytes( 'x', 'x' );
 217         }
 218
 219         /** @todo document */
 220         function doTestTripleBytes( $head, $tail ) {
 221                 for( $first = 0xc0; $first < 0x100; $first++ ) {
 222                         for( $second = 0x80; $second < 0x100; $second++ ) {
 223                                 #for( $third = 0x80; $third < 0x100; $third++ ) {
 224                                 for( $third = 0x80; $third < 0x81; $third++ ) {
 225                                         $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
 226                                         $clean = UtfNormal::cleanUp( $char );
 227                                         $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
 228                                         if( $first >= 0xe0 &&
 229                                                 $first < 0xf0 &&
 230                                                 $second < 0xc0 &&
 231                                                 $third < 0xc0 ) {
 232                                                 if( $first == 0xe0 && $second < 0xa0 ) {
 233                                                         $this->assertEquals(
 234                                                                 bin2hex( $head . UTF8_REPLACEMENT . $tail ),
 235                                                                 bin2hex( $clean ),
 236                                                                 "Overlong triplet $x should be rejected" );
 237                                                 } elseif( $first == 0xed &&
 238                                                         ( chr( $first ) . chr( $second ) . chr( $third ))  >= UTF8_SURROGATE_FIRST ) {
 239                                                         $this->assertEquals(
 240                                                                 bin2hex( $head . UTF8_REPLACEMENT . $tail ),
 241                                                                 bin2hex( $clean ),
 242                                                                 "Surrogate triplet $x should be rejected" );
 243                                                 } else {
 244                                                         $this->assertEquals(
 245                                                                 bin2hex( UtfNormal::NFC( $char ) ),
 246                                                                 bin2hex( $clean ),
 247                                                                 "Triplet $x should be intact" );
 248                                                 }
 249                                         } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
 250                                                 $this->assertEquals(
 251                                                         bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
 252                                                         bin2hex( $clean ),
 253                                                         "Valid 2-byte $x + broken tail" );
 254                                         } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
 255                                                 $this->assertEquals(
 256                                                         bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
 257                                                         bin2hex( $clean ),
 258                                                         "Broken head + valid 2-byte $x" );
 259                                         } elseif( ( $first > 0xfd || $second > 0xfd ) &&
 260                                                     ( ( $second > 0xbf && $third > 0xbf ) ||
 261                                                       ( $second < 0xc0 && $third < 0xc0 ) ||
 262                                                       ( $second > 0xfd ) ||
 263                                                       ( $third > 0xfd ) ) ) {
 264                                                 # fe and ff are not legal head bytes -- expect three replacement chars
 265                                                 $this->assertEquals(
 266                                                         bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
 267                                                         bin2hex( $clean ),
 268                                                         "Forbidden triplet $x should be rejected" );
 269                                         } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
 270                                                 $this->assertEquals(
 271                                                         bin2hex( $head . UTF8_REPLACEMENT . $tail ),
 272                                                         bin2hex( $clean ),
 273                                                         "Forbidden triplet $x should be rejected" );
 274                                         } else {
 275                                                 $this->assertEquals(
 276                                                         bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
 277                                                         bin2hex( $clean ),
 278                                                         "Forbidden triplet $x should be rejected" );
 279                                         }
 280                                 }
 281                         }
 282                 }
 283         }
 284
 285         /** @todo document */
 286         function testChunkRegression() {
 287                 # Check for regression against a chunking bug
 288                 $text   = "\x46\x55\xb8" .
 289                           "\xdc\x96" .
 290                           "\xee" .
 291                           "\xe7" .
 292                           "\x44" .
 293                           "\xaa" .
 294                           "\x2f\x25";
 295                 $expect = "\x46\x55\xef\xbf\xbd" .
 296                           "\xdc\x96" .
 297                           "\xef\xbf\xbd" .
 298                           "\xef\xbf\xbd" .
 299                           "\x44" .
 300                           "\xef\xbf\xbd" .
 301                           "\x2f\x25";
 302
 303                 $this->assertEquals(
 304                         bin2hex( $expect ),
 305                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 306         }
 307
 308         /** @todo document */
 309         function testInterposeRegression() {
 310                 $text   = "\x4e\x30" .
 311                           "\xb1" .              # bad tail
 312                           "\x3a" .
 313                           "\x92" .              # bad tail
 314                           "\x62\x3a" .
 315                           "\x84" .              # bad tail
 316                           "\x43" .
 317                           "\xc6" .              # bad head
 318                           "\x3f" .
 319                           "\x92" .              # bad tail
 320                           "\xad" .              # bad tail
 321                           "\x7d" .
 322                           "\xd9\x95";
 323
 324                 $expect = "\x4e\x30" .
 325                           "\xef\xbf\xbd" .
 326                           "\x3a" .
 327                           "\xef\xbf\xbd" .
 328                           "\x62\x3a" .
 329                           "\xef\xbf\xbd" .
 330                           "\x43" .
 331                           "\xef\xbf\xbd" .
 332                           "\x3f" .
 333                           "\xef\xbf\xbd" .
 334                           "\xef\xbf\xbd" .
 335                           "\x7d" .
 336                           "\xd9\x95";
 337
 338                 $this->assertEquals(
 339                         bin2hex( $expect ),
 340                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 341         }
 342
 343         /** @todo document */
 344         function testOverlongRegression() {
 345                 $text   = "\x67" .
 346                           "\x1a" . # forbidden ascii
 347                           "\xea" . # bad head
 348                           "\xc1\xa6" . # overlong sequence
 349                           "\xad" . # bad tail
 350                           "\x1c" . # forbidden ascii
 351                           "\xb0" . # bad tail
 352                           "\x3c" .
 353                           "\x9e";  # bad tail
 354                 $expect = "\x67" .
 355                           "\xef\xbf\xbd" .
 356                           "\xef\xbf\xbd" .
 357                           "\xef\xbf\xbd" .
 358                           "\xef\xbf\xbd" .
 359                           "\xef\xbf\xbd" .
 360                           "\xef\xbf\xbd" .
 361                           "\x3c" .
 362                           "\xef\xbf\xbd";
 363                 $this->assertEquals(
 364                         bin2hex( $expect ),
 365                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 366         }
 367
 368         /** @todo document */
 369         function testSurrogateRegression() {
 370                 $text   = "\xed\xb4\x96" . # surrogate 0xDD16
 371                           "\x83" . # bad tail
 372                           "\xb4" . # bad tail
 373                           "\xac";  # bad head
 374                 $expect = "\xef\xbf\xbd" .
 375                           "\xef\xbf\xbd" .
 376                           "\xef\xbf\xbd" .
 377                           "\xef\xbf\xbd";
 378                 $this->assertEquals(
 379                         bin2hex( $expect ),
 380                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 381         }
 382
 383         /** @todo document */
 384         function testBomRegression() {
 385                 $text   = "\xef\xbf\xbe" . # U+FFFE, illegal char
 386                           "\xb2" . # bad tail
 387                           "\xef" . # bad head
 388                           "\x59";
 389                 $expect = "\xef\xbf\xbd" .
 390                           "\xef\xbf\xbd" .
 391                           "\xef\xbf\xbd" .
 392                           "\x59";
 393                 $this->assertEquals(
 394                         bin2hex( $expect ),
 395                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 396         }
 397
 398         /** @todo document */
 399         function testForbiddenRegression() {
 400                 $text   = "\xef\xbf\xbf"; # U+FFFF, illegal char
 401                 $expect = "\xef\xbf\xbd";
 402                 $this->assertEquals(
 403                         bin2hex( $expect ),
 404                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 405         }
 406
 407         /** @todo document */
 408         function testHangulRegression() {
 409                 $text = "\xed\x9c\xaf" . # Hangul char
 410                                 "\xe1\x87\x81";  # followed by another final jamo
 411                 $expect = $text;         # Should *not* change.
 412                 $this->assertEquals(
 413                         bin2hex( $expect ),
 414                         bin2hex( UtfNormal::cleanUp( $text ) ) );
 415         }
 416 }
 417
 418
 419 $suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
 420 $result = PHPUnit_TextUI_TestRunner::run( $suite );
 421
 422 if( !$result->wasSuccessful() ) {
 423         exit( -1 );
 424 }
 425 exit( 0 );