bfad70955384b437abffbc9b2c0e2c3c0cca4071
[lhc/web/wiklou.git] / includes / normal / UtfNormalUtil.php
1 <?php
2 /**
3 * Some of these functions are adapted from places in MediaWiki.
4 * Should probably merge them for consistency.
5 *
6 * Copyright © 2004 Brion Vibber <brion@pobox.com>
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @file
25 * @ingroup UtfNormal
26 */
27
/**
 * Return the UTF-8 byte sequence for a given Unicode code point.
 *
 * Values from 0 up to 0x10FFFF are encoded as one to four bytes. No check
 * is made for the surrogate range (U+D800..U+DFFF); such values are encoded
 * like any other value below 0x10000.
 *
 * Anything outside 0..0x10FFFF (including negative numbers) prints a
 * message and terminates the script.
 *
 * @param $codepoint Integer: code point to encode.
 * @return String: the encoded bytes.
 * @public
 */
function codepointToUtf8( $codepoint ) {
    // The lower bound matters: without it a negative value would take the
    // one-byte branch, where chr() wraps it modulo 256 into a stray byte.
    if ( $codepoint >= 0 ) {
        if ( $codepoint < 0x80 ) {
            // 0xxxxxxx
            return chr( $codepoint );
        }
        if ( $codepoint < 0x800 ) {
            // 110xxxxx 10xxxxxx
            return chr( ( $codepoint >> 6 ) & 0x1f | 0xc0 ) .
                chr( $codepoint & 0x3f | 0x80 );
        }
        if ( $codepoint < 0x10000 ) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            return chr( ( $codepoint >> 12 ) & 0x0f | 0xe0 ) .
                chr( ( $codepoint >> 6 ) & 0x3f | 0x80 ) .
                chr( $codepoint & 0x3f | 0x80 );
        }
        if ( $codepoint < 0x110000 ) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            return chr( ( $codepoint >> 18 ) & 0x07 | 0xf0 ) .
                chr( ( $codepoint >> 12 ) & 0x3f | 0x80 ) .
                chr( ( $codepoint >> 6 ) & 0x3f | 0x80 ) .
                chr( $codepoint & 0x3f | 0x80 );
        }
    }

    echo "Asked for code outside of range ($codepoint)\n";
    die( -1 );
}
51
/**
 * Take a series of whitespace-separated hexadecimal numbers representing
 * Unicode code points and return a UTF-8 string composed of those
 * characters. Used by UTF-8 data generation and testing routines.
 *
 * An empty string, or one containing only whitespace, yields an empty
 * string. Leading, trailing and repeated separators are ignored.
 *
 * @param $sequence String: e.g. "0041 00e9 20ac".
 * @return String: the concatenated UTF-8 encoding of every listed code point.
 * @private
 */
function hexSequenceToUtf8( $sequence ) {
    $utf = '';
    // PREG_SPLIT_NO_EMPTY drops empty tokens. A plain explode( ' ', ... )
    // keeps them, and since hexdec( '' ) is 0 each one would be emitted as
    // a NUL character (so '' would come back as "\0").
    $tokens = preg_split( '/\s+/', $sequence, -1, PREG_SPLIT_NO_EMPTY );
    foreach ( $tokens as $hex ) {
        $utf .= codepointToUtf8( hexdec( $hex ) );
    }
    return $utf;
}
69
/**
 * Take a UTF-8 string and return a space-separated series of hex
 * numbers representing Unicode code points. For debugging.
 *
 * Every character, including newlines, is converted; each code point is
 * printed as lowercase hex, zero-padded to at least four digits. An empty
 * string, or a string that is not valid UTF-8, yields an empty string.
 *
 * @param $str String: UTF-8 string.
 * @return String: e.g. "0041 00e9 20ac".
 * @private
 */
function utf8ToHexSequence( $str ) {
    // Split into individual characters with a 'u'-mode empty pattern rather
    // than using preg_replace() with the /e (eval) modifier: /e was removed
    // in PHP 7.0, and the eval'd "$1" mangled quote and backslash characters.
    $chars = preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );
    if ( $chars === false ) {
        // preg_split() failed, which is what happens for malformed UTF-8.
        return '';
    }

    $hex = array();
    foreach ( $chars as $char ) {
        $hex[] = sprintf( '%04x', utf8ToCodepoint( $char ) );
    }
    return implode( ' ', $hex );
}
83
/**
 * Determine the Unicode code point of a single-character UTF-8 sequence.
 *
 * The only validation performed is that the byte length announced by the
 * first byte matches the actual length of the string; continuation bytes
 * are not checked.
 *
 * @param $char String: the bytes of exactly one UTF-8 character.
 * @return Integer|false: the code point, or false if the string is empty
 *   or its length does not match the length implied by its first byte.
 * @public
 */
function utf8ToCodepoint( $char ) {
    $byteCount = strlen( $char );
    if ( $byteCount === 0 ) {
        // Bail out before touching $char[0], which would raise an
        // "Uninitialized string offset" notice/warning on an empty string.
        return false;
    }

    # Find the length: the number of leading 1 bits in the first byte
    $z = ord( $char[0] );
    $length = 1;
    if ( $z & 0x80 ) {
        $length = 0;
        while ( $z & 0x80 ) {
            $length++;
            $z <<= 1;
        }
    }

    if ( $length !== $byteCount ) {
        return false;
    }
    if ( $length === 1 ) {
        return ord( $char );
    }

    # Mask off the length-determining bits (now shifted above bit 7) and
    # shift the remaining payload bits back to their original location
    $z &= 0xff;
    $z >>= $length;

    # Add in the six free bits from each subsequent byte
    for ( $i = 1; $i < $length; $i++ ) {
        $z <<= 6;
        $z |= ord( $char[$i] ) & 0x3f;
    }

    return $z;
}
124
/**
 * Make a string safe to embed between single quotes in PHP source code
 * by backslash-escaping every backslash and every single quote.
 *
 * @param $string String: raw text.
 * @return String: the text with backslashes and single quotes escaped.
 * @public
 */
function escapeSingleString( $string ) {
    // Backslashes are handled first so that the backslashes introduced
    // when escaping quotes are not themselves doubled.
    $search = array( '\\', '\'' );
    $replacement = array( '\\\\', '\\\'' );

    return str_replace( $search, $replacement, $string );
}