Follow up r112423, added release note.
[lhc/web/wiklou.git] / includes / normal / UtfNormalTest2.php
1 #!/usr/bin/php
2 <?php
3 /**
4 * Other tests for the unicode normalization module
5 *
6 * @file
7 * @ingroup UtfNormal
8 */
9
// This is a command-line test script; refuse to run under any other SAPI.
if ( PHP_SAPI !== 'cli' ) {
    die( "Run me from the command line please.\n" );
}

// Conformance data from http://unicode.org/Public/UNIDATA/NormalizationTest.txt,
// looked up relative to the current working directory.
$file = "NormalizationTest.txt";

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

$f = fopen( $file, "r" );
if ( $f === false ) {
    // A missing or unreadable data file must not look like a successful run:
    // with no rows to read, no check would ever execute.
    fwrite( STDERR, "Unable to open $file; download it into the current directory first.\n" );
    exit( 1 );
}
24
25 /**
26 * The following section will be used for testing different normalization methods.
27 * - Pure PHP
28 ~ no assertion errors
29 ~ 6.25 minutes
30
31 * - php_utfnormal.so or intl extension: both are wrappers around
32 libicu so we list the version of libicu when making the
33 comparison
34
35 * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
36 ~ 2200 assertion errors
37 ~ 5 seconds
38 ~ output: http://paste2.org/p/921566
39
40 * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
41 ~ 1384 assertion errors
42 ~ 15 seconds
43 ~ output: http://paste2.org/p/921435
44
45 * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
46 ~ no assertion errors
47 ~ 13 seconds
48
49 * - Tests comparing pure PHP output with libicu output were added
50 later and slow down the runtime.
51 */
52
53 require_once("./UtfNormal.php");
/**
 * Normalize a string to form C via UtfNormal::toNFC().
 *
 * @param string $c
 * @return string
 */
function normalize_form_c( $c ) {
    return UtfNormal::toNFC( $c );
}

/**
 * Normalize a string to form D via UtfNormal::toNFD().
 *
 * @param string $c
 * @return string
 */
function normalize_form_d( $c ) {
    return UtfNormal::toNFD( $c );
}

/**
 * Normalize a string to form KC via UtfNormal::toNFKC().
 *
 * @param string $c
 * @return string
 */
function normalize_form_kc( $c ) {
    return UtfNormal::toNFKC( $c );
}

/**
 * Normalize a string to form KD via UtfNormal::toNFKD().
 *
 * @param string $c
 * @return string
 */
function normalize_form_kd( $c ) {
    return UtfNormal::toNFKD( $c );
}
58
/**
 * This set of functions is only useful if you've added a param to the
 * UtfNormal::toNF*() functions to force pure PHP usage. I decided not to
 * commit that code since it might produce a slowdown in the UTF
 * normalization code just for the sake of these tests. -- hexmode
 *
 * Each wrapper passes the string "php" as an extra second argument to the
 * corresponding UtfNormal method.
 *
 * @param string $c
 * @return string
 */
function normalize_form_c_php( $c ) {
    return UtfNormal::toNFC( $c, "php" );
}

/**
 * @param string $c
 * @return string
 */
function normalize_form_d_php( $c ) {
    return UtfNormal::toNFD( $c, "php" );
}

/**
 * @param string $c
 * @return string
 */
function normalize_form_kc_php( $c ) {
    return UtfNormal::toNFKC( $c, "php" );
}

/**
 * @param string $c
 * @return string
 */
function normalize_form_kd_php( $c ) {
    return UtfNormal::toNFKD( $c, "php" );
}
70
// Configure assert(): keep assertions active, do not raise a PHP warning for
// each failure, silence error reporting while an assertion expression is
// evaluated, and hand every failure to my_assert() for reporting.
$assertSettings = array(
    ASSERT_ACTIVE     => 1,
    ASSERT_WARNING    => 0,
    ASSERT_QUIET_EVAL => 1,
    ASSERT_CALLBACK   => 'my_assert',
);
foreach ( $assertSettings as $assertOption => $assertValue ) {
    assert_options( $assertOption, $assertValue );
}
75
/**
 * Failure callback: prints which check failed, the number of the data line
 * being processed and the description stored in the row's sixth column.
 *
 * Reads the globals $col (current row) and $lineNo (current line number).
 *
 * @param string $file Script in which the failure occurred (unused)
 * @param int $line Script line of the failure (unused)
 * @param string $code Text of the failed check
 */
function my_assert( $file, $line, $code ) {
    global $col, $lineNo;

    printf(
        "Assertion that '%s' failed on line %s (%s)\n",
        $code,
        $lineNo,
        $col[5]
    );
}
80
/*
 * Main test run: read the data file row by row and, for every test row
 * (exactly six entries: five columns plus a description), check that
 *  - the pure PHP wrappers agree with the default wrappers, and
 *  - the conformance invariants between the five columns hold.
 *
 * Comparisons are made explicitly and reported through my_assert() instead of
 * being passed to assert() as strings. String assertions are deprecated since
 * PHP 7.2, are no longer evaluated as code in PHP 8, and are compiled out when
 * zend.assertions is -1; in each of those cases every check would pass
 * silently. The reported text of each check is the same as before.
 */
$count = 0;
$lineNo = 0;

// Normalization form => suffix of the normalize_form_*() wrapper pair used
// for it. The order determines the order in which mismatches are reported.
$forms = array(
    'NFC'  => 'c',
    'NFD'  => 'd',
    'NFKD' => 'kd',
    'NFKC' => 'kc',
);

// Conformance invariants as array( index into $col of the expected value,
// form, 1-based source columns whose normalization must equal it ).
$invariants = array(
    // c2 == NFC(c1) == NFC(c2) == NFC(c3)
    array( 1, 'NFC', array( 1, 2, 3 ) ),
    // c4 == NFC(c4) == NFC(c5)
    array( 3, 'NFC', array( 4, 5 ) ),
    // c3 == NFD(c1) == NFD(c2) == NFD(c3)
    array( 2, 'NFD', array( 1, 2, 3 ) ),
    // c5 == NFD(c4) == NFD(c5)
    array( 4, 'NFD', array( 4, 5 ) ),
    // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
    array( 3, 'NFKC', array( 1, 2, 3, 4, 5 ) ),
    // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
    array( 4, 'NFKD', array( 1, 2, 3, 4, 5 ) ),
);

if ( $f !== false ) {
    while ( ( $col = getRow( $f ) ) !== false ) {
        $lineNo++;

        // Comment lines, blank lines and part headers do not have six entries.
        if ( count( $col ) !== 6 ) {
            continue;
        }

        $count++;
        if ( $count % 100 === 0 ) {
            echo "Count: $count\n";
        }

        // $results[form][n] holds the default wrapper's output for column n.
        $results = array();

        # verify that the pure PHP version is correct
        foreach ( $forms as $form => $suffix ) {
            $native = 'normalize_form_' . $suffix;
            $purePhp = $native . '_php';

            for ( $n = 1; $n <= 5; $n++ ) {
                $results[$form][$n] = $native( $col[$n - 1] );
                $phpResult = $purePhp( $col[$n - 1] );

                if ( $results[$form][$n] !== $phpResult ) {
                    $name = '$' . $form . 'c' . $n;
                    my_assert( __FILE__, __LINE__, $name . ' === ' . $name . 'p' );
                }
            }
        }

        # verify the conformance invariants between the columns
        foreach ( $invariants as $invariant ) {
            list( $expected, $form, $sources ) = $invariant;

            foreach ( $sources as $n ) {
                if ( $col[$expected] !== $results[$form][$n] ) {
                    my_assert(
                        __FILE__,
                        __LINE__,
                        '$col[' . $expected . '] === $' . $form . 'c' . $n
                    );
                }
            }
        }
    }

    fclose( $f );
}
echo "done.\n";
193
/**
 * Encode a single code point as a UTF-8 byte sequence.
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param int $c Code point
 * @return string|bool One to four bytes, or false if $c is above 0x10FFFF
 */
function unichr( $c ) {
    if ( $c > 0x10FFFF ) {
        return false;
    }

    // One byte: 0xxxxxxx
    if ( $c <= 0x7F ) {
        return chr( $c );
    }

    // Every continuation byte is 10xxxxxx and carries six payload bits.
    $last = chr( 0x80 | ( $c & 0x3F ) );

    // Two bytes: 110xxxxx 10xxxxxx
    if ( $c <= 0x7FF ) {
        return chr( 0xC0 | ( $c >> 6 ) ) . $last;
    }

    $secondToLast = chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) );

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ( $c <= 0xFFFF ) {
        return chr( 0xE0 | ( $c >> 12 ) ) . $secondToLast . $last;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr( 0xF0 | ( $c >> 18 ) )
        . chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
        . $secondToLast
        . $last;
}
211
/**
 * Convert a space-separated list of hexadecimal code points
 * (e.g. "0044 0307") into the corresponding UTF-8 string.
 *
 * @param string $c
 * @return string
 */
function unistr( $c ) {
    $utf8 = "";
    foreach ( explode( " ", $c ) as $hex ) {
        $utf8 .= unichr( hexdec( $hex ) );
    }
    return $utf8;
}
215
/**
 * Read and parse the next line of the test data file.
 *
 * @param resource $f Open handle on the data file
 * @return array|bool false at end of file. For a line that starts with the
 *   comment character, a one-element array holding the whole line. Otherwise
 *   the non-blank SEPARATOR-delimited entries converted with unistr(),
 *   followed by one final element describing the row: the text after the
 *   closing ")" of the trailing comment, the whole comment if it contains no
 *   ")", or "" if the line has no comment.
 */
function getRow( $f ) {
    $row = fgets( $f );
    if ( $row === false ) {
        return false;
    }
    $row = rtrim( $row );

    $pos = strpos( $row, COMMENT );
    if ( $pos === 0 ) {
        // The whole line is a comment.
        return array( $row );
    }

    $c = "";
    if ( $pos !== false ) {
        $comment = substr( $row, $pos );

        // The comment has the form "# (samples; ) DESCRIPTION". The samples
        // may themselves be ")" characters, so the description starts after
        // the LAST ")" of the comment rather than the first one in the line.
        $close = strrpos( $comment, ")" );
        if ( $close === false ) {
            $c = $comment;
        } else {
            // Skip the ")" and the space that follows it. The cast covers
            // PHP versions where substr() returns false past the end.
            $c = (string)substr( $comment, $close + 2 );
        }

        $row = substr( $row, 0, $pos );
    }

    $ret = array();
    foreach ( explode( SEPARATOR, $row ) as $ent ) {
        if ( trim( $ent ) !== "" ) {
            $ret[] = unistr( $ent );
        }
    }
    $ret[] = $c;

    return $ret;
}