Somebody forgot to update the interwiki table definition...
[lhc/web/wiklou.git] / languages / LanguageUtf8.php
1 <?php
2 /**
3 * @package MediaWiki
4 * @subpackage Language
5 */
6
7 if( defined( "MEDIAWIKI" ) ) {
8
# LanguageLatin1.php and this file can both be pulled in from inside a
# function body, so everything touched below must be declared global.

global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
global $wgDBname, $wgMemc;

$wgOutputEncoding = "UTF-8";
$wgInputEncoding = "UTF-8";

if( !function_exists( 'mb_strtoupper' ) ) {
    # No mbstring available: hack our own case conversion routines.

    # Fetching the serialized tables from the cache is faster than
    # parsing the PHP source that defines them :P
    $key1 = "$wgDBname:utf8:upper";
    $key2 = "$wgDBname:utf8:lower";
    $wikiUpperChars = $wgMemc->get( $key1 );
    $wikiLowerChars = $wgMemc->get( $key2 );

    if( empty( $wikiUpperChars ) || empty( $wikiLowerChars ) ) {
        # At least one table is missing from the cache. Utf8Case.php is
        # presumably what fills both arrays (verify against that file);
        # whatever they hold afterwards is written back to the cache.
        require_once( "includes/Utf8Case.php" );
        $wgMemc->set( $key1, $wikiUpperChars );
        $wgMemc->set( $key2, $wikiLowerChars );
    }
} else {
    # mbstring does the case conversion; make it treat strings as UTF-8.
    mb_internal_encoding('UTF-8');
}
33
34 # Base stuff useful to all UTF-8 based language files
class LanguageUtf8 extends Language {

    # ucfirst() and lcfirst() use the mbstring library if it is loaded
    # or compiled in, and the character mapping arrays otherwise.
    # Language-specific character mismatches should be dealt with
    # in the individual Language classes.
    #
    # The regex fallbacks go through preg_replace_callback() instead of the
    # /e (eval) modifier: /e was deprecated in PHP 5.5 and removed in PHP 7.
    # The callbacks are plain methods so the class stays PHP 4 compatible.

    /**
     * Upper-case the first character of a string.
     *
     * @param string $string
     * @return string
     */
    function ucfirst( $string ) {
        /**
         * On pages with many links we can get called a lot.
         * The multibyte uppercase functions are relatively
         * slow, so check first if we can use a faster ASCII
         * version instead; it saves a few milliseconds.
         */
        if( preg_match( '/^[\x80-\xff]/', $string ) ) {
            if (function_exists('mb_strtoupper')) {
                return mb_strtoupper(mb_substr($string,0,1)).mb_substr($string,1);
            } else {
                return preg_replace_callback(
                    "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
                    array( $this, 'utf8UpperCallback' ),
                    $string );
            }
        }
        # First byte is ASCII (or the string is empty): PHP's native
        # ucfirst() is enough.
        return ucfirst( $string );
    }

    /**
     * Lower-case the first character of a string.
     *
     * @param string $string
     * @return string
     */
    function lcfirst( $string ) {
        if (function_exists('mb_strtolower')) {
            return mb_strtolower(mb_substr($string,0,1)).mb_substr($string,1);
        } else {
            return preg_replace_callback(
                "/^([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
                array( $this, 'utf8LowerCallback' ),
                $string );
        }
    }

    /**
     * preg_replace_callback() helper: run the first capture group
     * through the global $wikiUpperChars table with strtr().
     *
     * @param array $matches
     * @return string
     */
    function utf8UpperCallback( $matches ) {
        global $wikiUpperChars;
        return strtr( $matches[1], $wikiUpperChars );
    }

    /**
     * preg_replace_callback() helper: run the first capture group
     * through the global $wikiLowerChars table with strtr().
     *
     * @param array $matches
     * @return string
     */
    function utf8LowerCallback( $matches ) {
        global $wikiLowerChars;
        return strtr( $matches[1], $wikiLowerChars );
    }

    /**
     * preg_replace_callback() helper: replace the captured byte
     * sequence with 'U8' followed by its hex dump.
     *
     * @param array $matches
     * @return string
     */
    function searchHexCallback( $matches ) {
        return 'U8' . bin2hex( $matches[1] );
    }

    /**
     * preg_replace_callback() helper: like searchHexCallback(), but the
     * captured sequence is first mapped through $wikiLowerChars.
     *
     * @param array $matches
     * @return string
     */
    function searchLowerHexCallback( $matches ) {
        return 'U8' . bin2hex( $this->utf8LowerCallback( $matches ) );
    }

    /**
     * Fold a string for the search index: every multibyte sequence is
     * lower-cased and replaced by 'U8' plus its bytes in hex.
     *
     * With mbstring the whole string is lower-cased first; without it only
     * the multibyte sequences are mapped, and ASCII is left as given.
     *
     * @param string $string
     * @return string
     */
    function stripForSearch( $string ) {
        # MySQL fulltext index doesn't grok utf-8, so we
        # need to fold cases and convert to hex

        # In Language:: it just returns lowercase, maybe
        # all strtolower on stripped output or argument
        # should be removed and all stripForSearch
        # methods adjusted to that.

        wfProfileIn( "LanguageUtf8::stripForSearch" );
        if( function_exists( 'mb_strtolower' ) ) {
            $out = preg_replace_callback(
                "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
                array( $this, 'searchHexCallback' ),
                mb_strtolower( $string ) );
        } else {
            $out = preg_replace_callback(
                "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
                array( $this, 'searchLowerHexCallback' ),
                $string );
        }
        wfProfileOut( "LanguageUtf8::stripForSearch" );
        return $out;
    }

    /**
     * Name of the 8-bit encoding that checkTitleEncoding() converts from
     * when its input does not look like UTF-8.
     *
     * @return string
     */
    function fallback8bitEncoding() {
        # Windows codepage 1252 is a superset of iso 8859-1
        # override this to use a different source encoding to
        # translate incoming 8-bit URLs.
        return "windows-1252";
    }

    /**
     * Make sure a title string is UTF-8.
     *
     * Pure-ASCII input and input that is structurally well-formed UTF-8
     * is returned untouched; anything else is converted from
     * fallback8bitEncoding() to UTF-8 via $this->iconv() (not defined in
     * this class; presumably inherited from Language).
     *
     * @param string $s
     * @return string
     */
    function checkTitleEncoding( $s ) {
        if( is_array( $s ) ) {
            wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
        }
        # Check for non-UTF-8 URLs
        $ishigh = preg_match( '/[\x80-\xff]/', $s);
        if(!$ishigh) return $s;

        # Only checks lead byte / continuation byte structure.
        $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
            '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
        if( $isutf8 ) return $s;

        return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
    }

    /**
     * Return the first UTF-8 character (1 to 4 bytes) of a string, or ""
     * when the string is empty or does not start with a well-formed
     * sequence.
     *
     * @param string $s
     * @return string
     */
    function firstChar( $s ) {
        preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
            '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);

        return isset( $matches[1] ) ? $matches[1] : "";
    }

    /**
     * Crop a string from the beginning or end to a certain number of bytes.
     * (Bytes are used because our storage has limited byte lengths for some
     * columns in the database.) Only whole UTF-8 characters are kept, so
     * the result may be shorter than requested.
     *
     * A string that already fits is returned as is, without the ellipsis.
     *
     * @param string $string
     * @param int $length Maximum byte length, not counting the ellipsis.
     *                    If negative, snip from the beginning instead.
     * @param string $ellipsis Appended (or prepended, for negative $length)
     *                         when the string was actually cropped.
     * @return string
     */
    function truncate( $string, $length, $ellipsis = "" ) {
        if( $length == 0 ) {
            return $ellipsis;
        }
        if ( strlen( $string ) <= abs( $length ) ) {
            return $string;
        }
        if( $length > 0 ) {
            $string = substr( $string, 0, $length );
            $char = ord( $string[strlen( $string ) - 1] );
            if ($char >= 0xc0) {
                # We got the first byte only of a multibyte char; remove it.
                $string = substr( $string, 0, -1 );
            } elseif( $char >= 0x80 &&
                # The /s modifier lets (.*) span newlines; without it a
                # multi-line string never matches and the partial
                # character is left in place.
                preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
                    '[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
                # We chopped in the middle of a character; remove it
                $string = $m[1];
            }
            return $string . $ellipsis;
        } else {
            $string = substr( $string, $length );
            $char = ord( $string[0] );
            if( $char >= 0x80 && $char < 0xc0 ) {
                # We chopped in the middle of a character; remove the whole thing
                $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
            }
            return $ellipsis . $string;
        }
    }
}
170
171 } # ifdef MEDIAWIKI
172
173 ?>