segment() should only do segmentation, and let the caller do the conversion to hex.
[lhc/web/wiklou.git] / includes / ZhClient.php
1 <?php
2 /**
3 * Client for querying zhdaemon
4 *
5 * @package MediaWiki
6 * @version $Id$
7 */
8
/**
 * Socket client for zhdaemon.
 *
 * Requests are a one-line header followed by a payload; replies are a
 * "<status> <length>" line followed by <length> bytes of data (see query()).
 * Syntax is kept PHP 4 compatible, like the rest of this file.
 */
class ZhClient {
	var $mHost, $mPort, $mFP, $mConnected;

	/**
	 * Constructor: remember the endpoint and try to connect right away.
	 *
	 * @param string $host zhdaemon host
	 * @param int $port zhdaemon port
	 * @access private
	 */
	function __construct($host, $port) {
		$this->mHost = $host;
		$this->mPort = $port;
		$this->mConnected = $this->connect();
	}

	/**
	 * Old-style constructor, kept so that PHP 4 and any explicit
	 * ZhClient() calls keep working. PHP 8 no longer treats a method named
	 * after the class as a constructor, hence the delegation.
	 *
	 * @access private
	 */
	function ZhClient($host, $port) {
		$this->__construct($host, $port);
	}

	/**
	 * Check if connection to zhdaemon is successful
	 *
	 * @return bool
	 * @access public
	 */
	function isconnected() {
		return $this->mConnected;
	}

	/**
	 * Establish connection
	 *
	 * @return bool true when the socket could be opened
	 * @access private
	 */
	function connect() {
		$errno = 0;
		$errstr = '';
		wfSuppressWarnings();
		$this->mFP = fsockopen($this->mHost, $this->mPort, $errno, $errstr, 30);
		wfRestoreWarnings();
		return $this->mFP ? true : false;
	}

	/**
	 * Query the daemon and return the result
	 *
	 * @param string $request complete request (header line plus payload)
	 * @return mixed the reply payload as a string, or false on any failure
	 * @access private
	 */
	function query($request) {
		if(!$this->mConnected)
			return false;

		if(fwrite($this->mFP, $request) === false)
			return false;

		$header = fgets($this->mFP, 1024);
		if($header === false) {
			// connection dropped before a status line arrived
			return false;
		}

		// "<status> <length>", with the line terminator stripped
		$fields = explode(' ', trim($header));
		$status = $fields[0];
		$len = isset($fields[1]) ? $fields[1] : '';

		if($status == 'ERROR') {
			//$len is actually the error code...
			print "zhdaemon error $len<br />\n";
			return false;
		}
		if(!preg_match('/^[0-9]+$/', $len)) {
			// malformed header: we cannot know how much to read
			return false;
		}

		$remaining = intval($len);
		$data = '';
		while($remaining > 0 && !feof($this->mFP)) {
			$chunk = fread($this->mFP, $remaining);
			if($chunk === false || $chunk === '') {
				// read error or timeout: bail out instead of spinning
				break;
			}
			$data .= $chunk;
			$remaining -= strlen($chunk);
		}
		//data should be of the announced length. otherwise something is wrong
		if($remaining != 0)
			return false;
		return $data;
	}

	/**
	 * Convert the input to a different language variant
	 *
	 * @param string $text input text
	 * @param string $tolang language variant
	 * @return string the converted text, or $text unchanged when the
	 *                daemon could not be queried or returned nothing
	 * @access public
	 */
	function convert($text, $tolang) {
		$len = strlen($text);
		$result = $this->query("CONV $tolang $len\n$text");
		// compare explicitly: a reply of "0" is a valid conversion
		if($result === false || $result === '')
			return $text;
		return $result;
	}

	/**
	 * Convert the input to all possible variants
	 *
	 * The reply is "<infoline>|<data>", where <infoline> is a
	 * ';'-separated list describing consecutive slices of <data>.
	 *
	 * @param string $text input text
	 * @return array langcode => converted_string, or false on failure
	 * @access public
	 */
	function convertToAllVariants($text) {
		$len = strlen($text);
		$result = $this->query("CONV ALL $len\n$text");
		if($result === false || $result === '')
			return false;

		// limit 2: the converted text itself may contain '|'
		$parts = explode('|', $result, 2);
		if(count($parts) < 2)
			return false;
		list($infoline, $data) = $parts;

		$ret = array();
		$offset = 0;
		foreach(explode(';', $infoline) as $index => $entry) {
			// NOTE(review): entries are assumed to look like "<code> <length>";
			// confirm against the zhdaemon protocol. Entries holding only a
			// length keep the previous behaviour (positional key).
			$fields = explode(' ', trim($entry));
			if(count($fields) >= 2) {
				$code = $fields[0];
				$size = intval($fields[1]);
			} else {
				$code = $index;
				$size = intval($fields[0]);
			}
			$ret[strtolower((string)$code)] = substr($data, $offset, $size);
			// NOTE(review): the +1 presumes a one-byte separator between
			// slices, as the original code did - confirm.
			$offset += $size + 1;
		}
		return $ret;
	}

	/**
	 * Perform word segmentation
	 *
	 * @param string $text input text
	 * @return string segmented text
	 * @access public
	 */
	function segment($text) {
		$len = strlen($text);
		$result = $this->query("SEG $len\n$text");
		if($result === false || $result === '') {
			// fallback to character based segmentation
			$result = ZhClientFake::segment($text);
		}
		return $result;
	}

	/**
	 * Close the connection
	 *
	 * Safe to call when the connection was never established, and safe
	 * to call more than once.
	 *
	 * @access public
	 */
	function close() {
		if(is_resource($this->mFP)) {
			fclose($this->mFP);
		}
		$this->mFP = false;
		// make later queries fail cleanly instead of touching a closed socket
		$this->mConnected = false;
	}
}
145
146
/**
 * Drop-in replacement for ZhClient that needs no zhdaemon: conversion is
 * done with strtr() over the zh2* tables and segmentation is per-character.
 * Syntax is kept PHP 4 compatible, like the rest of this file.
 */
class ZhClientFake {
	// conversion tables passed to strtr()
	var $zh2TW, $zh2CN, $zh2SG, $zh2HK;

	/**
	 * Constructor: fetch the conversion tables from $wgMemc, loading them
	 * from includes/ZhConversion.php (and caching them) when any is missing.
	 */
	function __construct() {
		global $wgMemc, $wgDBname;
		$keyTW = "$wgDBname:zhConvert:tw";
		$keyCN = "$wgDBname:zhConvert:cn";
		$keySG = "$wgDBname:zhConvert:sg";
		$keyHK = "$wgDBname:zhConvert:hk";

		$this->zh2TW = $wgMemc->get($keyTW);
		$this->zh2CN = $wgMemc->get($keyCN);
		$this->zh2SG = $wgMemc->get($keySG);
		$this->zh2HK = $wgMemc->get($keyHK);

		if(empty($this->zh2TW) || empty($this->zh2CN) || empty($this->zh2SG) || empty($this->zh2HK)) {
			// Declared before the include on purpose: an include inside a
			// method runs in the method's scope, so tables assigned at the
			// top level of the included file would otherwise become locals
			// that a later "global" statement hides.
			// NOTE(review): presumes ZhConversion.php assigns plain $zh2*
			// variables - confirm.
			global $zh2TW, $zh2CN, $zh2HK, $zh2SG;
			require_once("includes/ZhConversion.php");
			$this->zh2TW = $zh2TW;
			$this->zh2CN = $zh2CN;
			$this->zh2HK = $zh2HK;
			$this->zh2SG = $zh2SG;
			$wgMemc->set($keyTW, $this->zh2TW);
			$wgMemc->set($keyCN, $this->zh2CN);
			$wgMemc->set($keySG, $this->zh2SG);
			$wgMemc->set($keyHK, $this->zh2HK);
		}
	}

	/**
	 * Old-style constructor, kept so that PHP 4 and any explicit
	 * ZhClientFake() calls keep working. PHP 8 no longer treats a method
	 * named after the class as a constructor, hence the delegation.
	 */
	function ZhClientFake() {
		$this->__construct();
	}

	/**
	 * There is no connection to lose, so this is always true.
	 *
	 * @return bool
	 * @access public
	 */
	function isconnected() {
		return true;
	}

	/**
	 * Convert to zh-tw
	 *
	 * @access private
	 */
	function zh2tw($text) {
		return strtr($text, $this->zh2TW);
	}

	/**
	 * Convert to zh-cn
	 *
	 * @access private
	 */
	function zh2cn($text) {
		return strtr($text, $this->zh2CN);
	}

	/**
	 * Convert to zh-sg: the zh-cn table first, then the zh-sg table
	 *
	 * @access private
	 */
	function zh2sg($text) {
		return strtr(strtr($text, $this->zh2CN), $this->zh2SG);
	}

	/**
	 * Convert to zh-hk: the zh-tw table first, then the zh-hk table
	 *
	 * @access private
	 */
	function zh2hk($text) {
		return strtr(strtr($text, $this->zh2TW), $this->zh2HK);
	}

	/**
	 * Convert the input to a different language variant
	 *
	 * @param string $text input text
	 * @param string $tolang language variant
	 * @return string the converted text; unknown variants return $text as is
	 * @access public
	 */
	function convert($text, $tolang) {
		switch($tolang) {
			case 'zh-cn':
				return $this->zh2cn($text);
			case 'zh-tw':
				return $this->zh2tw($text);
			case 'zh-sg':
				return $this->zh2sg($text);
			case 'zh-hk':
				return $this->zh2hk($text);
			default:
				return $text;
		}
	}

	/**
	 * Convert the input to all supported variants
	 *
	 * @param string $text input text
	 * @return array langcode => converted_string
	 * @access public
	 */
	function convertToAllVariants($text) {
		return array(
			'zh-cn' => $this->zh2cn($text),
			'zh-tw' => $this->zh2tw($text),
			'zh-sg' => $this->zh2sg($text),
			'zh-hk' => $this->zh2hk($text)
		);
	}

	/**
	 * Perform "fake" word segmentation, i.e. treating each character as a word
	 *
	 * A space is inserted in front of every multibyte sequence (a lead
	 * byte 0xC0-0xFF followed by continuation bytes 0x80-0xBF) and the
	 * text is lower-cased. Does not use $this; ZhClient::segment() calls
	 * it statically.
	 *
	 * @param string $text input text
	 * @return string segmented text
	 * @access public
	 */
	function segment($text) {
		/* adapted from LanguageZh_cn.stripForSearch()
		   here we will first separate the single characters,
		   and let the caller convert it to hex
		*/
		$multibyte = '/([\xc0-\xff][\x80-\xbf]*)/';

		if( function_exists( 'mb_strtolower' ) ) {
			// A plain backreference does what the old /e (eval) replacement
			// did; /e was removed in PHP 7.
			return preg_replace( $multibyte, ' $1', mb_strtolower( $text ) );
		}

		global $wikiLowerChars;
		// Without a usable case table the characters are passed through
		// untouched rather than being lost.
		$canFold = is_array( $wikiLowerChars );

		// With DELIM_CAPTURE the odd-numbered pieces are the multibyte
		// sequences; the even-numbered ones are the text in between.
		$pieces = preg_split( $multibyte, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
		$out = '';
		foreach( $pieces as $i => $piece ) {
			if( $i % 2 == 1 ) {
				$out .= ' ' . ( $canFold ? strtr( $piece, $wikiLowerChars ) : $piece );
			} else {
				$out .= $piece;
			}
		}
		return $out;
	}

	/**
	 * Close the fake connection
	 *
	 * @access public
	 */
	function close() { }
}
280
281 ?>