Changing all the last references from LanguageUtf8 to Language, and finally removing it.
[lhc/web/wiklou.git] / languages / LanguageConverter.php
1 <?php
2 /**
3 * @package MediaWiki
4 * @subpackage Language
5 *
6 * @author Zhengzhu Feng <zhengzhu@gmail.com>
7 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License
8 */
9
10 class LanguageConverter {
11 var $mPreferredVariant='';
12 var $mMainLanguageCode;
13 var $mVariants, $mVariantFallbacks;
14 var $mTablesLoaded = false;
15 var $mUseFss = false;
16 var $mTables;
17 var $mFssObjects;
18 var $mTitleDisplay='';
19 var $mDoTitleConvert=true, $mDoContentConvert=true;
20 var $mCacheKey;
21 var $mLangObj;
22 var $mMarkup;
23 var $mFlags;
24 var $mUcfirst = false;
/**
 * Constructor
 *
 * Stores the configuration handed in by the caller, merges the caller's
 * markup and flag definitions over the built-in defaults, and switches on
 * the FSS code path when the fss_prep_replace() function is available.
 *
 * @param object $langobj the language object this converter works for
 *                        (kept in mLangObj)
 * @param string $maincode the main language code of this language
 * @param array $variants the supported variants of this language
 * @param array $variantfallbacks the fallback language of each variant
 * @param array $markup array defining the markup used for manual conversion
 * @param array $flags array defining the custom strings that maps to the flags
 * @access public
 */
function __construct($langobj, $maincode,
		$variants=array(),
		$variantfallbacks=array(),
		$markup=array(),
		$flags = array()) {
	global $wgDBname;

	// Built-in defaults; entries supplied by the caller win over these.
	$defaultMarkup = array(
		'begin'=>'-{',
		'flagsep'=>'|',
		'codesep'=>':',
		'varsep'=>';',
		'end'=>'}-'
	);
	$defaultFlags = array('A'=>'A', 'T'=>'T');

	$this->mLangObj = $langobj;
	$this->mMainLanguageCode = $maincode;
	$this->mVariants = $variants;
	$this->mVariantFallbacks = $variantfallbacks;

	// The cache key is per wiki database.
	$this->mCacheKey = $wgDBname . ":conversiontables";

	$this->mMarkup = array_merge($defaultMarkup, $markup);
	$this->mFlags = array_merge($defaultFlags, $flags);

	if ( function_exists( 'fss_prep_replace' ) ) {
		$this->mUseFss = true;
	}
}
55
/**
 * Get the list of variants this converter was constructed with.
 *
 * @return array the supported variant codes
 * @access public
 */
function getVariants() {
	$supported = $this->mVariants;
	return $supported;
}
62
/**
 * in case some variant is not defined in the markup, we need
 * to have some fallback. for example, in zh, normally people
 * will define zh-cn and zh-tw, but less so for zh-sg or zh-hk.
 * when zh-sg is preferred but not defined, we will pick zh-cn
 * in this case. right now this is only used by zh.
 *
 * @param string $v the language code of the variant
 * @return string the code of the fallback language or false if there is no fallback
 * @private
 */
function getVariantFallback($v) {
	// Look before indexing: a variant without a configured fallback must
	// yield false (as documented) instead of an undefined-index notice.
	if( !isset( $this->mVariantFallbacks[$v] ) ) {
		return false;
	}
	return $this->mVariantFallbacks[$v];
}
77
78
/**
 * get preferred language variants.
 *
 * Sources are consulted in this order: the value already stored on this
 * object, the 'variant' request parameter (only if it is a supported
 * variant), the logged-in user's 'variant' option, and finally the
 * Accept-Language HTTP header. If none applies the main language code
 * is returned.
 *
 * @param boolean $fromUser Get it from $wgUser's preferences
 * @return string the preferred language code
 * @access public
 */
function getPreferredVariant( $fromUser = true ) {
	global $wgUser, $wgRequest;

	if( $this->mPreferredVariant ) {
		return $this->mPreferredVariant;
	}

	// see if the preference is set in the request
	$req = $wgRequest->getText( 'variant' );
	if( in_array( $req, $this->mVariants ) ) {
		$this->mPreferredVariant = $req;
		return $req;
	}

	// get language variant preference from logged in users
	// Don't call this on stub objects because that causes infinite
	// recursion during initialisation
	if( $fromUser && $wgUser->isLoggedIn() ) {
		$this->mPreferredVariant = $wgUser->getOption( 'variant' );
		return $this->mPreferredVariant;
	}

	# FIXME rewrite code for parsing http header. The current code
	# is written specific for detecting zh- variants

	// see if some supported language variant is set in the
	// http header, but we don't set the mPreferredVariant
	// variable in case this is called before the user's
	// preference is loaded
	$pv = $this->mMainLanguageCode;
	if( array_key_exists( 'HTTP_ACCEPT_LANGUAGE', $_SERVER ) ) {
		$header = str_replace( '_', '-', strtolower( $_SERVER["HTTP_ACCEPT_LANGUAGE"] ) );
		$zh = strstr( $header, 'zh-' );
		if( $zh ) {
			// The header is client-controlled: only accept the five
			// characters found there when they name a variant this
			// converter actually supports, otherwise keep the main code.
			$candidate = substr( $zh, 0, 5 );
			if( in_array( $candidate, $this->mVariants ) ) {
				$pv = $candidate;
			}
		}
	}
	return $pv;
}
124
/**
 * dictionary-based conversion
 *
 * The text is split on HTML tags, HTML entities and (when a global
 * parser exists) parser place holders; only the pieces between those
 * are passed to translate(), the separators are copied through verbatim.
 *
 * @param string $text the text to be converted
 * @param string $toVariant the target language code; when false the
 *                          preferred variant is used
 * @return string the converted text, or $text unchanged when the target
 *                is not a supported variant
 * @private
 */
function autoConvert($text, $toVariant=false) {
	$fname="LanguageConverter::autoConvert";

	wfProfileIn( $fname );

	if(!$this->mTablesLoaded)
		$this->loadTables();

	if(!$toVariant)
		$toVariant = $this->getPreferredVariant();
	if(!in_array($toVariant, $this->mVariants)) {
		// close the profiling section on this exit path as well
		wfProfileOut( $fname );
		return $text;
	}

	/* we convert everything except:
	   1. html markups (anything between < and >)
	   2. html entities
	   3. place holders created by the parser
	*/
	global $wgParser;
	if (isset($wgParser)) {
		// quote the prefix so it is always matched literally
		$marker = '|' . preg_quote( $wgParser->UniqPrefix(), '/' ) . '[\-a-zA-Z0-9]+';
	} else {
		$marker = "";
	}

	// this one is needed when the text is inside an html markup
	$htmlfix = '|<[^>]+=\"[^(>=)]*$|^[^(<>=\")]*\"[^>]*>';

	$reg = '/<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . '/';

	// each element is array( piece, byte offset of piece in $text )
	$matches = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE);

	$m = array_shift($matches);
	$ret = $this->translate($m[0], $toVariant);
	$mstart = $m[1]+strlen($m[0]);
	foreach($matches as $m) {
		// copy the separator that sits between the previous piece and this one
		$ret .= substr($text, $mstart, $m[1]-$mstart);
		$ret .= $this->translate($m[0], $toVariant);
		$mstart = $m[1] + strlen($m[0]);
	}
	wfProfileOut( $fname );
	return $ret;
}
176
/**
 * Translate a string to a variant
 * Doesn't process markup or do any of that other stuff, for that use convert()
 *
 * Uses the prepared FSS object for the variant when FSS is enabled and
 * such an object exists, otherwise strtr() with the variant's table.
 * When there is no table for the variant the text is returned unchanged.
 *
 * @param string $text Text to convert
 * @param string $variant Variant language code
 * @return string Translated text
 */
function translate( $text, $variant ) {
	if( !$this->mTablesLoaded )
		$this->loadTables();

	if ( $this->mUseFss && isset( $this->mFssObjects[$variant] ) ) {
		return fss_exec_replace( $this->mFssObjects[$variant], $text );
	}

	// No FSS object for this variant (or FSS not in use): fall back to
	// strtr(), but never hand it a missing or empty table.
	if ( empty( $this->mTables[$variant] ) ) {
		return $text;
	}
	return strtr( $text, $this->mTables[$variant] );
}
194
/**
 * Run translate() on the text once per supported variant.
 *
 * @param string $text the text to be converted
 * @return array converted strings, keyed by variant code
 * @private
 */
function autoConvertToAllVariants($text) {
	$fname="LanguageConverter::autoConvertToAllVariants";
	wfProfileIn( $fname );

	if( !$this->mTablesLoaded ) {
		$this->loadTables();
	}

	$converted = array();
	foreach( $this->mVariants as $code ) {
		$converted[$code] = $this->translate( $text, $code );
	}

	wfProfileOut( $fname );
	return $converted;
}
215
/**
 * Convert text using a parser object for context
 *
 * Pages in the MediaWiki namespace whose title contains "Conversiontable"
 * are returned untouched, as is everything when $wgDisableLangConversion
 * is set. Otherwise the text goes through convert() and the resulting
 * display title is handed to the parser output.
 *
 * @param string $text the text to be converted
 * @param object $parser parser providing mTitle and mOutput
 * @return string the (possibly converted) text
 */
function parserConvert( $text, &$parser ) {
	global $wgDisableLangConversion;
	/* don't do anything if this is the conversion table */
	if ( $parser->mTitle->getNamespace() == NS_MEDIAWIKI &&
		 // getText() must be called; without the parentheses this read
		 // a non-existent property and the check could never match
		 strpos($parser->mTitle->getText(), "Conversiontable") !== false )
	{
		return $text;
	}

	if($wgDisableLangConversion)
		return $text;

	$text = $this->convert( $text );
	$parser->mOutput->setTitleText( $this->mTitleDisplay );
	return $text;
}
235
/**
 * convert text to different variants of a language. the automatic
 * conversion is done in autoConvert(). here we parse the text
 * marked with -{}-, which specifies special conversions of the
 * text that can not be accomplished in autoConvert()
 *
 * syntax of the markup:
 * -{code1:text1;code2:text2;...}- or
 * -{text}- in which case no conversion should take place for text
 *
 * A marked section may start with flags followed by the flag separator.
 * Flag 'T' stores the chosen text as the display title instead of
 * emitting it; flag 'A' additionally adds the rule to the in-memory
 * conversion tables.
 *
 * @param string $text text to be converted
 * @param bool $isTitle whether this conversion is for the article title
 * @return string converted text
 * @access public
 */
function convert( $text , $isTitle=false) {
	$mw =& MagicWord::get( 'notitleconvert' );
	if( $mw->matchAndRemove( $text ) )
		$this->mDoTitleConvert = false;

	$mw =& MagicWord::get( 'nocontentconvert' );
	if( $mw->matchAndRemove( $text ) ) {
		$this->mDoContentConvert = false;
	}

	// no conversion if redirecting
	$mw =& MagicWord::get( 'redirect' );
	if( $mw->matchStart( $text ))
		return $text;

	if( $isTitle ) {
		if( !$this->mDoTitleConvert ) {
			$this->mTitleDisplay = $text;
			return $text;
		}
		// a title set earlier (e.g. by a 'T' rule) wins
		if( !empty($this->mTitleDisplay))
			return $this->mTitleDisplay;

		global $wgRequest;
		$isredir = $wgRequest->getText( 'redirect', 'yes' );
		$action = $wgRequest->getText( 'action' );
		if ( $isredir == 'no' || $action == 'edit' ) {
			return $text;
		}
		else {
			$this->mTitleDisplay = $this->autoConvert($text);
			return $this->mTitleDisplay;
		}
	}

	if( !$this->mDoContentConvert )
		return $text;

	$plang = $this->getPreferredVariant();
	if( isset( $this->mVariantFallbacks[$plang] ) ) {
		$fallback = $this->mVariantFallbacks[$plang];
	} else {
		// This sounds... bad?
		$fallback = '';
	}

	// Everything before the first begin marker is plain text; each later
	// element starts with the inside of one marked section.
	$tarray = explode($this->mMarkup['begin'], $text);
	$tfirst = array_shift($tarray);
	$text = $this->autoConvert($tfirst);
	foreach($tarray as $txt) {
		$marked = explode($this->mMarkup['end'], $txt, 2);
		$flags = array();
		$tt = explode($this->mMarkup['flagsep'], $marked[0], 2);

		if(sizeof($tt) == 2) {
			$f = explode($this->mMarkup['varsep'], $tt[0]);
			foreach($f as $ff) {
				$ff = trim($ff);
				// $flags is a plain list of flag values, so duplicates
				// have to be detected with in_array(), not by key
				if(array_key_exists($ff, $this->mFlags) &&
				   !in_array($this->mFlags[$ff], $flags))
					$flags[] = $this->mFlags[$ff];
			}
			$rules = $tt[1];
		}
		else
			$rules = $marked[0];

		#FIXME: may cause trouble here...
		//strip &nbsp; since it interferes with the parsing, plus,
		//all spaces should be stripped in this tag anyway.
		$rules = str_replace('&nbsp;', '', $rules);

		$carray = $this->parseManualRule($rules, $flags);
		$disp = '';
		if(array_key_exists($plang, $carray))
			$disp = $carray[$plang];
		else if(array_key_exists($fallback, $carray))
			$disp = $carray[$fallback];
		if($disp) {
			if(in_array('T', $flags))
				$this->mTitleDisplay = $disp;
			else
				$text .= $disp;

			if(in_array('A', $flags)) {
				/* modify the conversion table for this session*/

				/* fill in the missing variants, if any,
				   with fallbacks */
				foreach($this->mVariants as $v) {
					if(array_key_exists($v, $carray))
						continue;
					$vf = $this->getVariantFallback($v);
					// only a real code can be looked up; "no fallback"
					// must not be used as an array key
					if(is_string($vf) && array_key_exists($vf, $carray))
						$carray[$v] = $carray[$vf];
				}

				foreach($this->mVariants as $vfrom) {
					if(!array_key_exists($vfrom, $carray))
						continue;
					$from = $carray[$vfrom];
					// an empty search string is useless as a table key
					// and must not reach strtr()
					if($from === '')
						continue;
					foreach($this->mVariants as $vto) {
						if($vfrom == $vto)
							continue;
						if(!array_key_exists($vto, $carray))
							continue;
						$this->mTables[$vto][$from] = $carray[$vto];
					}
				}
				if ( $this->mUseFss ) {
					// the prepared objects are built from mTables, rebuild
					$this->generateFssObjects();
				}
			}
		}
		else {
			// nothing applicable for this reader: emit the section content as is
			$text .= $marked[0];
		}
		// text following the end marker gets the automatic conversion
		if(array_key_exists(1, $marked))
			$text .= $this->autoConvert($marked[1]);
	}

	return $text;
}
374
/**
 * parse the manually marked conversion rule
 *
 * A rule without any variant separator is used verbatim for every
 * supported variant. Otherwise each "code<codesep>text" entry becomes
 * one element of the result; entries without a code separator are
 * skipped.
 *
 * @param string $rules the text of the rule
 * @param array $flags flags found on the rule (currently not used here)
 * @return array of the translation in each variant
 * @private
 */
function parseManualRule($rules, $flags=array()) {

	$choice = explode($this->mMarkup['varsep'], $rules);
	$carray = array();
	if(sizeof($choice) == 1) {
		/* a single choice */
		foreach($this->mVariants as $v)
			$carray[$v] = $choice[0];
	}
	else {
		foreach($choice as $c) {
			// split on the first separator only, so that a replacement
			// text which itself contains the separator is kept whole
			$v = explode($this->mMarkup['codesep'], $c, 2);
			if(sizeof($v) != 2) // syntax error, skip
				continue;
			$carray[trim($v[0])] = trim($v[1]);
		}
	}
	return $carray;
}
400
/**
 * if a language supports multiple variants, it is
 * possible that non-existing link in one variant
 * actually exists in another variant. this function
 * tries to find it. See e.g. LanguageZh.php
 *
 * The link text is converted to every variant and the first converted
 * form that resolves to an existing article replaces $nt (and $link,
 * unless language conversion is disabled). Converted forms are
 * remembered in a static list and not looked up twice, and outside the
 * category namespace the whole operation stops after 50 calls.
 *
 * @param string $link the name of the link
 * @param mixed $nt the title object of the link
 * @return null the input parameters may be modified upon return
 * @access public
 */
function findVariantLink( &$link, &$nt ) {
	static $count=0; //used to limit this operation
	static $cache=array();
	global $wgDisableLangConversion;

	// return value is not needed here; the call is kept as in the
	// original code
	$this->getPreferredVariant();

	$ns = is_object($nt) ? $nt->getNamespace() : 0;

	if( $count > 50 && $ns != NS_CATEGORY ) {
		return;
	}
	$count++;

	$candidates = $this->autoConvertToAllVariants($link);
	if( $candidates == false ) { //give up
		return;
	}

	foreach( $candidates as $candidate ) {
		if( isset($cache[$candidate]) ) {
			continue;
		}
		$cache[$candidate] = 1;

		$varnt = Title::newFromText( $candidate, $ns );
		if( !$varnt || !( $varnt->getArticleID() > 0 ) ) {
			continue;
		}

		$nt = $varnt;
		if( !$wgDisableLangConversion ) {
			$link = $candidate;
		}
		break;
	}
}
439
/**
 * returns language specific hash options: a '!' followed by the
 * preferred variant code
 *
 * @return string
 * @access public
 */
function getExtraHashOptions() {
	return '!' . $this->getPreferredVariant();
}
449
/**
 * get title text as defined in the body of the article text
 *
 * @return string the current value of mTitleDisplay
 * @access public
 */
function getParsedTitle() {
	$title = $this->mTitleDisplay;
	return $title;
}
458
/**
 * a write lock to the cache
 *
 * Tries up to 30 times, one second apart, to add the lock key (with an
 * expiry of 10) to $wgMemc.
 *
 * @return mixed the result of the last add() attempt; true-ish when the
 *               lock was obtained
 * @private
 */
function lockCache() {
	global $wgMemc;

	$lockKey = $this->mCacheKey . "lock";
	$success = false;
	$attempt = 0;
	while( $attempt < 30 ) {
		$success = $wgMemc->add( $lockKey, 1, 10 );
		if( $success ) {
			break;
		}
		sleep(1);
		$attempt++;
	}
	return $success;
}
474
/**
 * unlock cache by deleting the lock key from $wgMemc
 *
 * @private
 */
function unlockCache() {
	global $wgMemc;
	$lockKey = $this->mCacheKey . "lock";
	$wgMemc->delete( $lockKey );
}
484
485
/**
 * Load default conversion tables
 * This method must be implemented in derived class; this base
 * version only aborts with an error naming the concrete class.
 *
 * @private
 */
function loadDefaultTables() {
	$name = get_class( $this );
	wfDie( "Must implement loadDefaultTables() method in class $name" );
}
496
/**
 * load conversion tables either from the cache or the disk
 *
 * On a cache miss (or when $fromcache is false) the default tables are
 * loaded, overlaid with the tables parsed from the Conversiontable
 * messages, passed through postLoadTables() and written back to the
 * cache. When FSS is in use the FSS objects are (re)built in every case.
 *
 * @param bool $fromcache whether the cached copy may be used
 * @private
 */
function loadTables($fromcache=true) {
	global $wgMemc;
	if( $this->mTablesLoaded )
		return;
	$this->mTablesLoaded = true;
	if($fromcache) {
		$this->mTables = $wgMemc->get( $this->mCacheKey );
		if( !empty( $this->mTables ) ) {
			// Only mTables is stored in the cache, so the FSS objects
			// still have to be prepared before we are done.
			if ( $this->mUseFss ) {
				$this->generateFssObjects();
			}
			return;
		}
	}
	// not in cache, or we need a fresh reload.
	// we will first load the default tables
	// then update them using things in MediaWiki:Zhconversiontable/*
	$this->loadDefaultTables();
	if( !is_array( $this->mTables ) ) {
		$this->mTables = array();
	}
	foreach($this->mVariants as $var) {
		$cached = $this->parseCachedTable($var);
		// a variant without a default table starts from an empty one
		if( !isset( $this->mTables[$var] ) || !is_array( $this->mTables[$var] ) ) {
			$this->mTables[$var] = array();
		}
		$this->mTables[$var] = array_merge($this->mTables[$var], $cached);
	}

	$this->postLoadTables();

	if($this->lockCache()) {
		$wgMemc->set($this->mCacheKey, $this->mTables, 43200);
		$this->unlockCache();
	}
	if ( $this->mUseFss ) {
		$this->generateFssObjects();
	}
}
531
/**
 * Generate FSS objects. The FSS extension must be available.
 *
 * One object is prepared with fss_prep_replace() per entry of mTables
 * and stored in mFssObjects under the same key.
 */
function generateFssObjects() {
	foreach ( $this->mTables as $code => $pairs ) {
		$prepared = fss_prep_replace( $pairs );
		$this->mFssObjects[$code] = $prepared;
	}
}
540
/**
 * Hook for post processing after conversion tables are loaded.
 * Intentionally empty in this class.
 */
function postLoadTables() {
}
546
/**
 * Reload the conversion tables: drop the tables held in memory and load
 * them again without consulting the cache.
 *
 * @private
 */
function reloadTables() {
	if( $this->mTables ) {
		unset( $this->mTables );
	}
	$this->mTablesLoaded = false;
	$this->loadTables( false );
}
558
559
/**
 * parse the conversion table stored in the cache
 *
 * the tables should be in blocks of the following form:
 *
 *		-{
 *			word => word ;
 *			word => word ;
 *			...
 *		}-
 *
 * to make the tables more manageable, subpages are allowed
 * and will be parsed recursively if $recursive=true
 *
 * Each message is parsed at most once per request; asking for an
 * already parsed one yields an empty array.
 *
 * @param string $code variant code whose table is wanted
 * @param string $subpage subpage name below Conversiontable/$code, if any
 * @param bool $recursive whether linked subpages are parsed as well
 * @return array mapping of source string to replacement string
 * @private
 */
function parseCachedTable($code, $subpage='', $recursive=true) {
	global $wgMessageCache;
	static $parsed = array();

	if(!is_object($wgMessageCache))
		return array();

	$key = 'Conversiontable/'.$code;
	if($subpage)
		$key .= '/' . $subpage;

	if(array_key_exists($key, $parsed))
		return array();

	$txt = $wgMessageCache->get( $key, true, true, true );

	// get all subpage links of the form
	// [[MediaWiki:conversiontable/zh-xx/...|...]]
	$linkhead = $this->mLangObj->getNsText(NS_MEDIAWIKI) . ':Conversiontable';
	$subs = explode('[[', $txt);
	$sublinks = array();
	foreach( $subs as $sub ) {
		$link = explode(']]', $sub, 2);
		if(count($link) != 2)
			continue;
		$b = explode('|', $link[0]);
		$b = explode('/', trim($b[0]), 3);
		// a link target without any '/' carries no variant code and
		// cannot be one of our subpages
		if(count($b) < 2)
			continue;
		if(count($b)==3)
			$sublink = $b[2];
		else
			$sublink = '';

		if($b[0] == $linkhead && $b[1] == $code) {
			$sublinks[] = $sublink;
		}
	}

	// parse the mappings in this page
	$blocks = explode($this->mMarkup['begin'], $txt);
	array_shift($blocks);
	$ret = array();
	foreach($blocks as $block) {
		$mappings = explode($this->mMarkup['end'], $block, 2);
		$stripped = str_replace(array("'", '"', '*','#'), '', $mappings[0]);
		$table = explode( ';', $stripped );
		foreach( $table as $t ) {
			$m = explode( '=>', $t );
			if( count( $m ) != 2)
				continue;
			$from = trim($m[0]);
			// An empty source string must never become a table key:
			// the tables are handed to strtr(), which does not accept
			// an empty search string on older PHP versions.
			if( $from === '' )
				continue;
			// trim any trailing comments starting with '//'
			$tt = explode('//', $m[1], 2);
			$ret[$from] = trim($tt[0]);
		}
	}
	// mark as parsed before descending, so that a page linking back to
	// itself is not parsed again
	$parsed[$key] = true;

	// recursively parse the subpages
	if($recursive) {
		foreach($sublinks as $link) {
			$s = $this->parseCachedTable($code, $link, $recursive);
			$ret = array_merge($ret, $s);
		}
	}

	if ($this->mUcfirst) {
		// add a first-letter-uppercased twin of every mapping
		foreach ($ret as $k => $v) {
			$ret[Language::ucfirst($k)] = Language::ucfirst($v);
		}
	}
	return $ret;
}
650
/**
 * Enclose a string with the "no conversion" tag. This is used by
 * various functions in the Parser
 *
 * Text that already contains a begin or end marker anywhere, including
 * at the very start, is returned unchanged.
 *
 * @param string $text text to be tagged for no conversion
 * @return string the tagged text
 */
function markNoConversion($text) {
	# don't mark if already marked
	# (strpos() returns 0 for a match at the start, so the result has to
	# be compared against false explicitly)
	if(strpos($text, $this->mMarkup['begin']) !== false ||
	   strpos($text, $this->mMarkup['end']) !== false)
		return $text;

	$ret = $this->mMarkup['begin'] . $text . $this->mMarkup['end'];
	return $ret;
}
667
/**
 * convert the sorting key for category links. this should make different
 * keys that are variants of each other map to the same key
 *
 * This base implementation hands the key back unchanged.
 *
 * @param string $key the sorting key
 * @return string the key to use
 */
function convertCategoryKey( $key ) {
	$converted = $key;
	return $converted;
}
/**
 * hook to refresh the cache of conversion tables when
 * MediaWiki:conversiontable* is updated
 *
 * The tables are reloaded when the saved page is in the MediaWiki
 * namespace and its DB key is "Conversiontable/<variant>[/...]" for one
 * of the supported variants. Only $article is inspected; the remaining
 * parameters are accepted but not used.
 *
 * @return bool always true
 * @private
 */
function OnArticleSaveComplete($article, $user, $text, $summary, $isminor, $iswatch, $section) {
	$titleobj = $article->getTitle();
	if( $titleobj->getNamespace() != NS_MEDIAWIKI ) {
		return true;
	}

	$parts = explode( '/', $titleobj->getDBkey(), 3 );
	if( count( $parts ) > 1 && $parts[0] == 'Conversiontable' ) {
		$isOurVariant = in_array( $parts[1], $this->mVariants );
		if( $isOurVariant ) {
			$this->reloadTables();
		}
	}
	return true;
}
699 }
700
701 ?>