Treat each Chinese character as a single word when diffing, and glue them back together...
[lhc/web/wiklou.git] / languages / LanguageZh.php
1 <?php
2 require_once( "includes/ZhClient.php" );
3 require_once( "LanguageZh_cn.php");
4 require_once( "LanguageZh_tw.php");
5 require_once( "LanguageZh_sg.php");
6 require_once( "LanguageZh_hk.php");
7
/**
 * Language class that handles both Traditional and Simplified Chinese.
 *
 * Right now it only distinguishes zh_cn and zh_tw (actually, zh_cn and
 * non-zh_cn); support for zh_sg, zh_hk, etc. is to be added later.
 *
 * Conversion and word segmentation are delegated to $mZhClient, which is
 * either a ZhClient connected to the configured daemon or a ZhClientFake.
 */
class LanguageZh extends LanguageZh_cn {

	/** Cached preferred variant code (e.g. "zh-cn"); false until resolved. */
	var $mZhLanguageCode = false;

	/** Client object used for conversion/segmentation; false until constructed. */
	var $mZhClient = false;

	/**
	 * Constructor: resolves the preferred variant and picks a client.
	 *
	 * A ZhClient is used only when $wgUseZhdaemon is set and the client
	 * reports that it is connected; otherwise ZhClientFake is used.
	 */
	function LanguageZh() {
		global $wgUseZhdaemon, $wgZhdaemonHost, $wgZhdaemonPort;

		$this->mZhLanguageCode = $this->getPreferredVariant();

		if ( $wgUseZhdaemon ) {
			$client = new ZhClient( $wgZhdaemonHost, $wgZhdaemonPort );
			if ( $client->isconnected() ) {
				$this->mZhClient = $client;
			}
		}

		// fall back to the fake client
		if ( !$this->mZhClient ) {
			$this->mZhClient = new ZhClientFake();
		}
	}

	/**
	 * Get the preferred language variant.
	 *
	 * Logged-in users get their 'variant' user option. Anonymous users get
	 * the first "zh-xx" code found in the Accept-Language header, provided
	 * it is one of getVariants(); otherwise "zh-cn". The result is cached
	 * in $mZhLanguageCode.
	 *
	 * @return string variant code
	 */
	function getPreferredVariant() {
		global $wgUser;

		if ( $this->mZhLanguageCode ) {
			return $this->mZhLanguageCode;
		}

		if ( $wgUser->getID() != 0 ) {
			// logged-in user: use the stored preference
			$this->mZhLanguageCode = $wgUser->getOption( 'variant' );
			return $this->mZhLanguageCode;
		}

		// anonymous user: default to zh-cn unless the HTTP header names
		// a known zh- variant
		$this->mZhLanguageCode = 'zh-cn';

		// The header is optional (command-line runs, some bots), so do not
		// read it blindly.
		if ( isset( $_SERVER['HTTP_ACCEPT_LANGUAGE'] ) ) {
			$header = str_replace( '_', '-',
				strtolower( $_SERVER['HTTP_ACCEPT_LANGUAGE'] ) );
			$zh = strstr( $header, 'zh-' );
			if ( $zh !== false ) {
				$candidate = substr( $zh, 0, 5 );
				// The header is client-supplied: only accept codes we
				// actually support (e.g. "zh-hans" would yield "zh-ha").
				if ( in_array( $candidate, $this->getVariants(), true ) ) {
					$this->mZhLanguageCode = $candidate;
				}
			}
		}

		return $this->mZhLanguageCode;
	}

	/**
	 * Prepare text for diffing by putting a space in front of every
	 * multi-byte UTF-8 character, so that each one is treated as a
	 * separate word. This should give much better diff info.
	 *
	 * @param string $text
	 * @return string
	 */
	function segmentForDiff( $text ) {
		// A plain backreference is enough here; the /e (eval) modifier is
		// not needed and is unsupported on current PHP versions.
		return preg_replace(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			' $1', $text );
	}

	/**
	 * Undo segmentForDiff(): drop one space in front of every multi-byte
	 * UTF-8 character.
	 *
	 * @param string $text
	 * @return string
	 */
	function unsegmentForDiff( $text ) {
		return preg_replace(
			"/ ([\\xc0-\\xff][\\x80-\\xbf]*)/",
			'$1', $text );
	}

	/**
	 * Convert text to a variant through the client.
	 *
	 * @param string $text
	 * @param string|bool $toVariant target variant; false means the
	 *        preferred variant
	 * @return mixed whatever the client's convert() returns
	 */
	function autoConvert( $text, $toVariant = false ) {
		if ( !$toVariant ) {
			$toVariant = $this->getPreferredVariant();
		}
		$fname = "zhautoConvert";
		wfProfileIn( $fname );
		$converted = $this->mZhClient->convert( $text, $toVariant );
		wfProfileOut( $fname );
		return $converted;
	}

	/**
	 * Convert text to all variants through the client, falling back to
	 * Language::autoConvertToAllVariants() when the client returns a
	 * false-like result.
	 *
	 * @param string $text
	 * @return mixed
	 */
	function autoConvertToAllVariants( $text ) {
		$fname = "zhautoConvertToAll";
		wfProfileIn( $fname );
		$ret = $this->mZhClient->convertToAllVariants( $text );
		if ( !$ret ) {
			// fall back...
			$ret = Language::autoConvertToAllVariants( $text );
		}
		wfProfileOut( $fname );
		return $ret;
	}

	/**
	 * Convert a title; only titles having more than one character are
	 * converted, anything shorter is returned unchanged.
	 *
	 * @param string $text UTF-8 title text
	 * @return mixed
	 */
	function convertTitle( $text ) {
		if ( function_exists( 'mb_strlen' ) ) {
			// Name the encoding explicitly instead of relying on the
			// mbstring internal encoding setting.
			$len = mb_strlen( $text, 'UTF-8' );
		} else {
			// Count characters by counting every byte that is not a
			// UTF-8 continuation byte (0x80-0xBF).
			$len = preg_match_all( '/[\x00-\x7f\xc0-\xff]/', $text, $matches );
		}

		if ( $len > 1 ) {
			return $this->autoConvert( $text );
		}
		return $text;
	}

	/**
	 * @return array list of supported variant codes
	 */
	function getVariants() {
		return array( "zh-cn", "zh-tw", "zh-sg", "zh-hk" );
	}

	/**
	 * Get the variant to fall back to for a given variant:
	 * zh-cn <-> zh-sg and zh-tw <-> zh-hk.
	 *
	 * @param string $v variant code
	 * @return string|bool fallback variant code, or false if there is none
	 */
	function getVariantFallback( $v ) {
		switch ( $v ) {
			case 'zh-cn': return 'zh-sg';
			case 'zh-sg': return 'zh-cn';
			case 'zh-tw': return 'zh-hk';
			case 'zh-hk': return 'zh-tw';
		}
		return false;
	}

	/**
	 * Prepare a string for the search index: word segmentation through
	 * the client, conversion to zh-cn, then LanguageUtf8::stripForSearch().
	 *
	 * @param string $string
	 * @return mixed
	 */
	function stripForSearch( $string ) {
		$fname = "zhsegment";
		wfProfileIn( $fname );
		// always convert to zh-cn before indexing. it should be
		// better to use zh-cn for search, since conversion from
		// Traditional to Simplified is less ambiguous than the
		// other way around
		$t = $this->mZhClient->segment( $string );
		$t = $this->autoConvert( $t, 'zh-cn' );
		$t = LanguageUtf8::stripForSearch( $t );
		wfProfileOut( $fname );
		return $t;
	}

	/**
	 * Expand search terms to all variants.
	 *
	 * The terms are joined with '|', converted to all variants, the
	 * results joined and split on '|' again, and duplicates removed.
	 *
	 * @param array $termsArray
	 * @return array
	 */
	function convertForSearchResult( $termsArray ) {
		$terms = implode( '|', $termsArray );
		$terms = implode( '|', $this->autoConvertToAllVariants( $terms ) );
		return array_unique( explode( '|', $terms ) );
	}
}
143 ?>