comments
[lhc/web/wiklou.git] / languages / LanguageUtf8.php
1 <?php
2 #$Id$
3 if( defined( "MEDIAWIKI" ) ) {
4
# All UTF-8 based language files read and emit UTF-8.
$wgInputEncoding = "UTF-8";
$wgOutputEncoding = "UTF-8";

if ( !function_exists( 'mb_internal_encoding' ) ) {
	# No mbstring: hack our own case conversion routines from
	# character mapping tables.

	# Loading serialized arrays from the cache is faster than parsing code :P
	$key1 = "$wgDBname:utf8:upper";
	$key2 = "$wgDBname:utf8:lower";
	$wikiUpperChars = $wgMemc->get( $key1 );
	$wikiLowerChars = $wgMemc->get( $key2 );

	if ( empty( $wikiUpperChars ) || empty( $wikiLowerChars ) ) {
		# Cache miss on at least one table: load the definitions from
		# disk (presumably Utf8Case.php assigns both tables -- confirm)
		# and store them in the cache for the next request.
		require_once( "includes/Utf8Case.php" );
		$wgMemc->set( $key1, $wikiUpperChars );
		$wgMemc->set( $key2, $wikiLowerChars );
	}
} else {
	# mbstring is available: make its functions default to UTF-8.
	mb_internal_encoding('UTF-8');
}
23
24 # Base stuff useful to all UTF-8 based language files
25 class LanguageUtf8 extends Language {
26
27 # These two functions use mbstring library, if it is loaded
28 # or compiled and character mapping arrays otherwise.
29 # In case of language-specific character mismatch
30 # it should be dealt with in Language classes.
31
# Uppercase the first character of a UTF-8 string.
#
# With mbstring loaded this relies on the internal encoding having been
# set to UTF-8; otherwise the first character is looked up in the global
# $wikiUpperChars mapping table.
#
# @param string $string UTF-8 text
# @return string $string with its first character uppercased
function ucfirst( $string ) {
	if ( function_exists( 'mb_strtoupper' ) ) {
		return mb_strtoupper( mb_substr( $string, 0, 1 ) ) . mb_substr( $string, 1 );
	}

	global $wikiUpperChars;

	# This used to be a preg_replace() with the /e (eval) modifier, which
	# was deprecated in PHP 5.5 and removed in PHP 7.0. Matching the first
	# character and mapping it with strtr() gives the same result on every
	# PHP version without evaluating generated code.
	if ( !preg_match( '/^([a-z]|[\xc0-\xff][\x80-\xbf]*)/', $string, $matches ) ) {
		# Empty string, or a first character that has no case mapping
		return $string;
	}
	$first = $matches[1];
	return strtr( $first, $wikiUpperChars ) . substr( $string, strlen( $first ) );
}
43
# Lowercase the first character of a UTF-8 string.
#
# With mbstring loaded this relies on the internal encoding having been
# set to UTF-8; otherwise the first character is looked up in the global
# $wikiLowerChars mapping table.
#
# @param string $string UTF-8 text
# @return string $string with its first character lowercased
function lcfirst( $string ) {
	if ( function_exists( 'mb_strtolower' ) ) {
		return mb_strtolower( mb_substr( $string, 0, 1 ) ) . mb_substr( $string, 1 );
	}

	global $wikiLowerChars;

	# This used to be a preg_replace() with the /e (eval) modifier, which
	# was deprecated in PHP 5.5 and removed in PHP 7.0. Matching the first
	# character and mapping it with strtr() gives the same result on every
	# PHP version without evaluating generated code.
	if ( !preg_match( '/^([A-Z]|[\xc0-\xff][\x80-\xbf]*)/', $string, $matches ) ) {
		# Empty string, or a first character that has no case mapping
		return $string;
	}
	$first = $matches[1];
	return strtr( $first, $wikiLowerChars ) . substr( $string, strlen( $first ) );
}
55
# Fold a UTF-8 string into the ASCII-only form stored in the search index.
#
# Every multi-byte sequence is case-folded and replaced by the literal
# prefix 'U8' followed by the hex dump of its bytes.
#
# @param string $string UTF-8 text
# @return string text with each non-ASCII sequence hex-encoded
function stripForSearch( $string ) {
	# MySQL fulltext index doesn't grok utf-8, so we
	# need to fold cases and convert to hex

	# In Language:: it just returns lowercase, maybe
	# all strtolower on stripped output or argument
	# should be removed and all stripForSearch
	# methods adjusted to that.
	global $wikiLowerChars;

	wfProfileIn( "LanguageUtf8::stripForSearch" );

	$haveMb = function_exists( 'mb_strtolower' );
	if ( $haveMb ) {
		# mbstring folds the whole string, ASCII included, up front.
		$string = mb_strtolower( $string );
	}

	# This used to be a preg_replace() with the /e (eval) modifier, which
	# was deprecated in PHP 5.5 and removed in PHP 7.0. Splitting on the
	# same pattern with the delimiters captured yields an alternating
	# list: even indexes are plain runs, odd indexes are the multi-byte
	# sequences that need encoding.
	$pieces = preg_split( '/([\xc0-\xff][\x80-\xbf]*)/', $string, -1,
		PREG_SPLIT_DELIM_CAPTURE );

	$out = '';
	foreach ( $pieces as $index => $piece ) {
		if ( $index % 2 == 0 ) {
			$out .= $piece;
			continue;
		}
		if ( !$haveMb ) {
			# Without mbstring only the multi-byte sequences are folded,
			# one at a time, through the mapping table.
			$piece = strtr( $piece, $wikiLowerChars );
		}
		$out .= 'U8' . bin2hex( $piece );
	}

	wfProfileOut( "LanguageUtf8::stripForSearch" );
	return $out;
}
81
# Name of the 8-bit charset that incoming non-UTF-8 URLs are assumed to use.
#
# @return string charset name
function fallback8bitEncoding() {
	# Windows codepage 1252 is a superset of iso 8859-1.
	# Override this method to use a different source encoding
	# when translating incoming 8-bit URLs.
	$legacyCharset = "windows-1252";
	return $legacyCharset;
}
88
# Make sure a title taken from a URL is UTF-8.
#
# Strings without high bytes, and strings that already match the UTF-8
# byte pattern, are returned untouched; anything else is converted from
# the charset named by fallback8bitEncoding().
#
# @param string $s raw title text
# @return string $s itself, or the result of $this->iconv()
function checkTitleEncoding( $s ) {
	# Declared by the original code; not read anywhere in this method.
	global $wgInputEncoding;

	$utf8Pattern = '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
		'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/';

	# Pure 7-bit input needs no check at all; input containing high
	# bytes is accepted as-is when it is made only of UTF-8 sequences.
	if ( !preg_match( '/[\x80-\xff]/', $s ) || preg_match( $utf8Pattern, $s ) ) {
		return $s;
	}

	# Non-UTF-8 URL: convert from the legacy 8-bit charset.
	return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
}
102
# Return the first character of a string, where a character is a single
# ASCII byte or a 2-, 3- or 4-byte UTF-8 sequence.
#
# @param string $s UTF-8 text
# @return string the leading character, or "" when $s is empty or does
#                not begin with one of those sequences
function firstChar( $s ) {
	$leadingCharPattern = '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
		'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/';

	preg_match( $leadingCharPattern, $s, $found );

	if ( !isset( $found[1] ) ) {
		return "";
	}
	return $found[1];
}
109
110 # Crop a string from the beginning or end to a certain number of bytes.
111 # (Bytes are used because our storage has limited byte lengths for some
112 # columns in the database.) Multibyte charsets will need to make sure that
113 # only whole characters are included!
114 #
115 # $length does not include the optional ellipsis.
116 # If $length is negative, snip from the beginning
function truncate( $string, $length, $ellipsis = "" ) {
	# @param string $string   UTF-8 text to crop
	# @param int    $length   maximum number of bytes to keep; a negative
	#                         value keeps bytes from the end instead
	# @param string $ellipsis appended (or prepended, for negative $length)
	#                         only when something was actually cut off
	# @return string the cropped text, never ending or starting in the
	#                middle of a multi-byte character
	if ( $length == 0 ) {
		return $ellipsis;
	}
	if ( strlen( $string ) <= abs( $length ) ) {
		# Already short enough: nothing to cut, no ellipsis.
		return $string;
	}

	if ( $length > 0 ) {
		$string = substr( $string, 0, $length );
		$char = ord( $string[strlen( $string ) - 1] );
		if ( $char >= 0xc0 ) {
			# We got the first byte only of a multibyte char; remove it.
			$string = substr( $string, 0, -1 );
		} elseif ( $char >= 0x80 &&
			# The /s modifier lets '.' match newlines. Without it the
			# pattern could never match a string containing a line break,
			# so a half-cut 3- or 4-byte character was left in place.
			preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
				'[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
			# We chopped in the middle of a character; remove it
			$string = $m[1];
		}
		return $string . $ellipsis;
	}

	# Negative length: keep the tail of the string.
	$string = substr( $string, $length );
	$char = ord( $string[0] );
	if ( $char >= 0x80 && $char < 0xc0 ) {
		# We chopped in the middle of a character; remove the whole thing
		$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
	}
	return $ellipsis . $string;
}
147 }
148
149 } # ifdef MEDIAWIKI
150
151 ?>