Revert bogus change for bug 4530; doesn't do as requested
[lhc/web/wiklou.git] / languages / LanguageUtf8.php
1 <?php
2 /**
3 * @package MediaWiki
4 * @subpackage Language
5 */
6
7 if( defined( "MEDIAWIKI" ) ) {
8
# LanguageLatin1.php and this file can be pulled in from inside a function
# body, so every variable that must end up global is declared as such here.

global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
global $wgDBname, $wgMemc;

$wgInputEncoding = "UTF-8";
$wgOutputEncoding = "UTF-8";

if( !function_exists( 'mb_strtoupper' ) ) {
	# No mbstring: fall back to our own case conversion tables.

	# Try the cached copies first; unserializing arrays beats parsing code.
	$key1 = "$wgDBname:utf8:upper";
	$key2 = "$wgDBname:utf8:lower";
	$wikiUpperChars = $wgMemc->get( $key1 );
	$wikiLowerChars = $wgMemc->get( $key2 );

	if( !$wikiUpperChars || !$wikiLowerChars ) {
		# Cache miss on at least one table: load both from source and
		# store them for the next request.
		require_once( "includes/Utf8Case.php" );
		$wgMemc->set( $key1, $wikiUpperChars );
		$wgMemc->set( $key2, $wikiLowerChars );
	}
} else {
	# mbstring is available; make it treat strings as UTF-8 by default.
	mb_internal_encoding('UTF-8');
}
33
/**
 * Base stuff useful to all UTF-8 based language files
 * @package MediaWiki
 */
class LanguageUtf8 extends Language {

	# These functions use mbstring library, if it is loaded
	# or compiled and character mapping arrays otherwise.
	# In case of language-specific character mismatch
	# it should be dealt with in Language classes.

	/**
	 * Uppercase the first character of a UTF-8 string.
	 *
	 * @param string $str
	 * @return string
	 */
	function ucfirst( $str ) {
		return LanguageUtf8::uc( $str, true );
	}

	/**
	 * Uppercase a UTF-8 string, or only its first character.
	 *
	 * @param string $str
	 * @param bool $first If true, only the first character is converted
	 * @return string
	 */
	function uc( $str, $first = false ) {
		global $wikiUpperChars;

		if ( !LanguageUtf8::isMultibyte( $str ) ) {
			# No high bytes at all: the byte-wise built-ins are sufficient.
			return $first ? ucfirst( $str ) : strtoupper( $str );
		}
		if ( function_exists( 'mb_strtoupper' ) ) {
			return $first
				? mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 )
				: mb_strtoupper( $str );
		}
		# No mbstring: use the $wikiUpperChars mapping table.
		return LanguageUtf8::mapCase( $str, $first, 'a-z', $wikiUpperChars );
	}

	/**
	 * Lowercase the first character of a UTF-8 string.
	 *
	 * @param string $str
	 * @return string
	 */
	function lcfirst( $str ) {
		return LanguageUtf8::lc( $str, true );
	}

	/**
	 * Lowercase a UTF-8 string, or only its first character.
	 *
	 * @param string $str
	 * @param bool $first If true, only the first character is converted
	 * @return string
	 */
	function lc( $str, $first = false ) {
		global $wikiLowerChars;

		if ( !LanguageUtf8::isMultibyte( $str ) ) {
			# No high bytes at all: the byte-wise built-ins are sufficient.
			return $first
				? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 )
				: strtolower( $str );
		}
		if ( function_exists( 'mb_strtolower' ) ) {
			return $first
				? mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 )
				: mb_strtolower( $str );
		}
		# No mbstring: use the $wikiLowerChars mapping table.
		return LanguageUtf8::mapCase( $str, $first, 'A-Z', $wikiLowerChars );
	}

	/**
	 * Table-driven case conversion used when mbstring is unavailable.
	 *
	 * Each ASCII letter in $asciiRange and each multibyte sequence (a lead
	 * byte 0xc0-0xff followed by continuation bytes 0x80-0xbf) is passed
	 * through strtr() with $map; all other bytes are copied unchanged.
	 *
	 * This deliberately avoids the PCRE /e (eval) modifier, which no longer
	 * exists in PHP 7+, and needs neither closures nor static callbacks.
	 *
	 * @param string $str
	 * @param bool $first If true, only a character at the very start of
	 *                    $str is mapped
	 * @param string $asciiRange Character-class body for the ASCII letters
	 *                           to map, 'a-z' or 'A-Z'
	 * @param array $map Replacement pairs for strtr()
	 * @return string
	 */
	function mapCase( $str, $first, $asciiRange, $map ) {
		$charPattern = '([' . $asciiRange . ']|[\xc0-\xff][\x80-\xbf]*)';

		if ( $first ) {
			$m = array();
			if ( !preg_match( '/^' . $charPattern . '/', $str, $m ) ) {
				# String does not start with a mappable character.
				return $str;
			}
			return strtr( $m[1], $map ) . substr( $str, strlen( $m[1] ) );
		}

		# With PREG_SPLIT_DELIM_CAPTURE the result alternates between
		# untouched text (even indexes) and matched characters (odd indexes).
		$pieces = preg_split( '/' . $charPattern . '/', $str, -1, PREG_SPLIT_DELIM_CAPTURE );
		$total = count( $pieces );
		for ( $i = 1; $i < $total; $i += 2 ) {
			$pieces[$i] = strtr( $pieces[$i], $map );
		}
		return implode( '', $pieces );
	}

	/**
	 * Tell whether a string contains any byte outside the ASCII range.
	 *
	 * The whole string is inspected, not just its first byte, so that
	 * text such as "aé" is routed to the multibyte-aware conversions.
	 *
	 * @param string $str
	 * @return bool
	 */
	function isMultibyte( $str ) {
		return (bool)preg_match( '/[\x80-\xff]/', $str );
	}

	/**
	 * Prepare text for the search index.
	 *
	 * @param string $string
	 * @return string Text with every multibyte character replaced by
	 *                'U8' followed by the hex dump of its bytes
	 */
	function stripForSearch( $string ) {
		# MySQL fulltext index doesn't grok utf-8, so we
		# need to fold cases and convert to hex

		# In Language:: it just returns lowercase, maybe
		# all strtolower on stripped output or argument
		# should be removed and all stripForSearch
		# methods adjusted to that.

		global $wikiLowerChars;

		wfProfileIn( "LanguageUtf8::stripForSearch" );
		$useMb = function_exists( 'mb_strtolower' );
		if( $useMb ) {
			$string = mb_strtolower( $string );
		}

		# Odd indexes hold the multibyte characters, even indexes the
		# text in between (see PREG_SPLIT_DELIM_CAPTURE).
		$pieces = preg_split( '/([\xc0-\xff][\x80-\xbf]*)/', $string, -1, PREG_SPLIT_DELIM_CAPTURE );
		$total = count( $pieces );
		for ( $i = 1; $i < $total; $i += 2 ) {
			$char = $pieces[$i];
			if( !$useMb ) {
				# Without mbstring only the multibyte characters are case
				# folded, via the mapping table; ASCII is left as given.
				$char = strtr( $char, $wikiLowerChars );
			}
			$pieces[$i] = 'U8' . bin2hex( $char );
		}
		$out = implode( '', $pieces );

		wfProfileOut( "LanguageUtf8::stripForSearch" );
		return $out;
	}

	/**
	 * Name of the legacy encoding assumed for incoming non-UTF-8 URLs.
	 *
	 * @return string
	 */
	function fallback8bitEncoding() {
		# Windows codepage 1252 is a superset of iso 8859-1
		# override this to use a different source encoding to
		# translate incoming 8-bit URLs.
		return "windows-1252";
	}

	/**
	 * Make sure a title taken from a URL is UTF-8.
	 *
	 * Pure ASCII input and input that already looks like UTF-8 is returned
	 * unchanged; anything else is converted from fallback8bitEncoding().
	 *
	 * @param string $s
	 * @return string
	 */
	function checkTitleEncoding( $s ) {
		if( is_array( $s ) ) {
			wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
		}
		# Check for non-UTF-8 URLs
		$ishigh = preg_match( '/[\x80-\xff]/', $s);
		if(!$ishigh) return $s;

		$isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
		if( $isutf8 ) return $s;

		return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
	}

	/**
	 * Get the first character of a UTF-8 string.
	 *
	 * @param string $s
	 * @return string The leading character, or "" when $s is empty or does
	 *                not begin with a complete 1-4 byte sequence
	 */
	function firstChar( $s ) {
		$matches = array();
		preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);

		return isset( $matches[1] ) ? $matches[1] : "";
	}

	/**
	 * Crop a string from the beginning or end to a certain number of bytes.
	 * (Bytes are used because our storage has limited byte lengths for some
	 * columns in the database.) Only whole characters are kept, so the
	 * result may be a few bytes shorter than requested.
	 *
	 * @param string $string
	 * @param int $length Maximum size in bytes, not counting the ellipsis.
	 *                    If negative, snip from the beginning instead.
	 * @param string $ellipsis Appended (or prepended, for negative $length)
	 *                         when the string was actually shortened
	 * @return string
	 */
	function truncate( $string, $length, $ellipsis = "" ) {
		if( $length == 0 ) {
			return $ellipsis;
		}
		if ( strlen( $string ) <= abs( $length ) ) {
			return $string;
		}
		if( $length > 0 ) {
			$string = substr( $string, 0, $length );
			$char = ord( $string[strlen( $string ) - 1] );
			if ( $char >= 0xc0 ) {
				# We got the first byte only of a multibyte char; remove it.
				$string = substr( $string, 0, -1 );
			} elseif( $char >= 0x80 ) {
				# We may have chopped in the middle of a 3- or 4-byte
				# character; if so remove the partial sequence. Matching
				# only the tail keeps this working when the text contains
				# newlines, which "." would refuse to cross.
				$string = preg_replace(
					'/(?:[\xe0-\xef][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf]{1,2})\z/',
					'',
					$string
				);
			}
			return $string . $ellipsis;
		} else {
			$string = substr( $string, $length );
			$char = ord( $string[0] );
			if( $char >= 0x80 && $char < 0xc0 ) {
				# We chopped in the middle of a character; remove the whole thing
				$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
			}
			return $ellipsis . $string;
		}
	}
}
196
197 } # ifdef MEDIAWIKI
198
199 ?>