X-Git-Url: http://git.heureux-cyclage.org/?a=blobdiff_plain;f=languages%2FLanguage.php;h=34978774fcba8977b448280016aed2e41b043102;hb=86ddf7cb8a7dba0c9e59a92d754de803c080df7d;hp=4cd4ffa12fdb5766d6a4d334c378f835bd3e002e;hpb=2800ca2db7688802ff61ec38c6a4d7191c5060c3;p=lhc%2Fweb%2Fwiklou.git diff --git a/languages/Language.php b/languages/Language.php index 4cd4ffa12f..34978774fc 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -59,7 +59,6 @@ class Language { var $mNamespaceIds, $namespaceNames, $namespaceAliases; var $dateFormatStrings = array(); - var $minSearchLength; var $mExtendedSpecialPageAliases; /** @@ -1687,85 +1686,27 @@ class Language { function hasWordBreaks() { return true; } - + /** - * Some languages have special punctuation to strip out - * or characters which need to be converted for MySQL's - * indexing to grok it correctly. Make such changes here. - * + * Some languages such as Chinese require word segmentation, + * Specify such segmentation when overridden in derived class. + * * @param $string String * @return String */ - function stripForSearch( $string, $doStrip = true ) { - global $wgDBtype; - if ( $wgDBtype != 'mysql' || $doStrip == false ) { - return $string; - } - - wfProfileIn( __METHOD__ ); - - // MySQL fulltext index doesn't grok utf-8, so we - // need to fold cases and convert to hex - $out = preg_replace_callback( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - array( $this, 'stripForSearchCallback' ), - $this->lc( $string ) ); - - // And to add insult to injury, the default indexing - // ignores short words... Pad them so we can pass them - // through without reconfiguring the server... - $minLength = $this->minSearchLength(); - if( $minLength > 1 ) { - $n = $minLength-1; - $out = preg_replace( - "/\b(\w{1,$n})\b/", - "$1u800", - $out ); - } - - // Periods within things like hostnames and IP addresses - // are also important -- we want a search for "example.com" - // or "192.168.1.1" to work sanely. - // - // MySQL's search seems to ignore them, so you'd match on - // "example.wikipedia.com" and "192.168.83.1" as well. - $out = preg_replace( - "/(\w)\.(\w|\*)/u", - "$1u82e$2", - $out ); - - wfProfileOut( __METHOD__ ); - return $out; - } - - /** - * Armor a case-folded UTF-8 string to get through MySQL's - * fulltext search without being mucked up by funny charset - * settings or anything else of the sort. - */ - protected function stripForSearchCallback( $matches ) { - return 'u8' . bin2hex( $matches[1] ); + function wordSegmentation( $string ) { + return $string; } /** - * Check MySQL server's ft_min_word_len setting so we know - * if we need to pad short words... + * Some languages have special punctuation need to be normalized. + * Make such changes here. + * + * @param $string String + * @return String */ - protected function minSearchLength() { - if( is_null( $this->minSearchLength ) ) { - $sql = "show global variables like 'ft\\_min\\_word\\_len'"; - $dbr = wfGetDB( DB_SLAVE ); - $result = $dbr->query( $sql ); - $row = $result->fetchObject(); - $result->free(); - - if( $row && $row->Variable_name == 'ft_min_word_len' ) { - $this->minSearchLength = intval( $row->Value ); - } else { - $this->minSearchLength = 0; - } - } - return $this->minSearchLength; + function normalizeForSearch( $string ) { + return $string; } /** @@ -1778,7 +1719,7 @@ class Language { return $string; } - protected static function wordSegmentation( $string, $pattern ) { + protected static function insertSpace( $string, $pattern ) { $string = preg_replace( $pattern, " $1 ", $string ); $string = preg_replace( '/ +/', ' ', $string ); return $string; @@ -2223,6 +2164,7 @@ class Language { if ( strlen( $string ) <= abs( $length ) ) { return $string; } + $stringOriginal = $string; if( $length > 0 ) { $string = substr( $string, 0, $length ); $char = ord( $string[strlen( $string ) - 1] ); @@ -2236,7 +2178,8 @@ class Language { # We chopped in the middle of a character; remove it $string = $m[1]; } - return $string . $ellipsis; + $string = $string . $ellipsis; + } else { $string = substr( $string, $length ); $char = ord( $string[0] ); @@ -2244,7 +2187,13 @@ class Language { # We chopped in the middle of a character; remove the whole thing $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); } - return $ellipsis . $string; + $string = $ellipsis . $string; + } + # Do not truncate if the ellipsis actually make the string longer. Bug 22181 + if ( strlen( $string ) < strlen( $stringOriginal ) ) { + return $string; + } else { + return $stringOriginal; } }