From: Philip Tzou Date: Tue, 2 Feb 2010 15:09:01 +0000 (+0000) Subject: Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into... X-Git-Tag: 1.31.0-rc.0~37977 X-Git-Url: https://git.heureux-cyclage.org/?a=commitdiff_plain;h=d6b6766f3a4c3c0926e26ac3e2ed54c40cdf9b1c;p=lhc%2Fweb%2Fwiklou.git Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into wordSegmentation and normalizeForSearch. So the wordSegmentation could be called by search engines separately. --- diff --git a/includes/Title.php b/includes/Title.php index 0f2b353db2..563bd9ea0d 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -435,7 +435,7 @@ class Title { global $wgContLang; $lc = SearchEngine::legalSearchChars() . '&#;'; - $t = $wgContLang->stripForSearch( $title ); + $t = $wgContLang->normalizeForSearch( $title ); $t = preg_replace( "/[^{$lc}]+/", ' ', $t ); $t = $wgContLang->lc( $t ); diff --git a/includes/search/SearchIBM_DB2.php b/includes/search/SearchIBM_DB2.php index 7f8dfece34..d7587186c3 100644 --- a/includes/search/SearchIBM_DB2.php +++ b/includes/search/SearchIBM_DB2.php @@ -158,10 +158,10 @@ class SearchIBM_DB2 extends SearchEngine { if( is_array( $temp_terms )) { $temp_terms = array_unique( array_values( $temp_terms )); foreach( $temp_terms as $t ) - $q[] = $terms[1] . $wgContLang->stripForSearch( $t ); + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t ); } else - $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] ); + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] ); if (!empty($terms[3])) { $regexp = preg_quote( $terms[3], '/' ); diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php index 12ba2fa2ec..0c238be892 100644 --- a/includes/search/SearchMySQL.php +++ b/includes/search/SearchMySQL.php @@ -80,7 +80,7 @@ class SearchMySQL extends SearchEngine { // fulltext engine. // For Chinese this also inserts spaces between adjacent Han characters. $strippedVariants = array_map( - array( $wgContLang, 'stripForSearch' ), + array( $wgContLang, 'normalizeForSearch' ), $variants ); // Some languages such as Chinese force all variants to a canonical @@ -95,7 +95,7 @@ class SearchMySQL extends SearchEngine { $stripped = $this->normalizeText( $stripped ); if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { // Hack for Chinese: we need to toss in quotes for - // multiple-character phrases since stripForSearch() + // multiple-character phrases since normalizeForSearch() // added spaces between them to make word breaks. $stripped = '"' . trim( $stripped ) . '"'; } @@ -324,13 +324,16 @@ class SearchMySQL extends SearchEngine { global $wgContLang; wfProfileIn( __METHOD__ ); + + // Some languages such as Chinese require word segmentation + $out = $wgContLang->wordSegmentation( $string ); // MySQL fulltext index doesn't grok utf-8, so we // need to fold cases and convert to hex $out = preg_replace_callback( "/([\\xc0-\\xff][\\x80-\\xbf]*)/", array( $this, 'stripForSearchCallback' ), - $wgContLang->lc( $string ) ); + $wgContLang->lc( $out ) ); // And to add insult to injury, the default indexing // ignores short words... Pad them so we can pass them diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php index 520fe6f068..e4c5deee55 100644 --- a/includes/search/SearchOracle.php +++ b/includes/search/SearchOracle.php @@ -217,7 +217,7 @@ class SearchOracle extends SearchEngine { private function escapeTerm($t) { global $wgContLang; - $t = $wgContLang->stripForSearch($t); + $t = $wgContLang->normalizeForSearch($t); $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t; $t = preg_replace('/^"(.*)"$/', '($1)', $t); $t = preg_replace('/([-&|])/', '\\\\$1', $t); diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php index 53c093e7f7..54a4b5569b 100644 --- a/includes/search/SearchSqlite.php +++ b/includes/search/SearchSqlite.php @@ -92,7 +92,7 @@ class SearchSqlite extends SearchEngine { // fulltext engine. // For Chinese this also inserts spaces between adjacent Han characters. $strippedVariants = array_map( - array( $wgContLang, 'stripForSearch' ), + array( $wgContLang, 'normalizeForSearch' ), $variants ); // Some languages such as Chinese force all variants to a canonical @@ -106,7 +106,7 @@ class SearchSqlite extends SearchEngine { foreach( $strippedVariants as $stripped ) { if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { // Hack for Chinese: we need to toss in quotes for - // multiple-character phrases since stripForSearch() + // multiple-character phrases since normalizeForSearch() // added spaces between them to make word breaks. $stripped = '"' . trim( $stripped ) . '"'; } diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php index b9c2335151..e30c70e6d1 100644 --- a/includes/search/SearchUpdate.php +++ b/includes/search/SearchUpdate.php @@ -43,7 +43,7 @@ class SearchUpdate { } # Language-specific strip/conversion - $text = $wgContLang->stripForSearch( $this->mText ); + $text = $wgContLang->normalizeForSearch( $this->mText ); wfProfileIn( $fname.'-regexps' ); $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", diff --git a/languages/Language.php b/languages/Language.php index facd0edee6..21ad57a6fb 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1686,15 +1686,26 @@ class Language { function hasWordBreaks() { return true; } + + /** + * Some languages such as Chinese require word segmentation, + * Specify such segmentation when overridden in derived class. + * + * @param $string String + * @return String + */ + function wordSegmentation( $string ) { + return $string; + } /** - * Some languages have special punctuation to strip out. + * Some languages have special punctuation need to be normalized. * Make such changes here. * * @param $string String * @return String */ - function stripForSearch( $string, $doStrip = true ) { + function normalizeForSearch( $string ) { return $string; } @@ -1708,7 +1719,7 @@ class Language { return $string; } - protected static function wordSegmentation( $string, $pattern ) { + protected static function insertSpace( $string, $pattern ) { $string = preg_replace( $pattern, " $1 ", $string ); $string = preg_replace( '/ +/', ' ', $string ); return $string; diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php index f878cf3f2f..3c8b5fdb1f 100644 --- a/languages/classes/LanguageGan.php +++ b/languages/classes/LanguageGan.php @@ -135,9 +135,9 @@ class LanguageGan extends LanguageZh { } // word segmentation - function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) { - // LanguageZh::stripForSearch - return parent::stripForSearch( $string, $doStrip, $autoVariant ); + function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) { + // LanguageZh::normalizeForSearch + return parent::normalizeForSearch( $string, $autoVariant ); } function convertForSearchResult( $termsArray ) { diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php index 41b246f082..4a24260b00 100644 --- a/languages/classes/LanguageJa.php +++ b/languages/classes/LanguageJa.php @@ -6,30 +6,29 @@ * @ingroup Language */ class LanguageJa extends Language { - function stripForSearch( $string, $doStrip = true ) { + function wordSegmentation( $string ) { + // Strip known punctuation ? + // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f - $s = $string; - - if ( $doStrip == true ) { - // Strip known punctuation ? - // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f + // Space strings of like hiragana/katakana/kanji + $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f + $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff + $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' + . '|[\xe4-\xe8][\x80-\xbf]{2}' + . '|\xe9[\x80-\xa5][\x80-\xbf]' + . '|\xe9\xa6[\x80-\x99])'; + # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 + $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; + $s = self::insertSpace( $string, $reg ); + return $s; + } - // Space strings of like hiragana/katakana/kanji - $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f - $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff - $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' - . '|[\xe4-\xe8][\x80-\xbf]{2}' - . '|\xe9[\x80-\xa5][\x80-\xbf]' - . '|\xe9\xa6[\x80-\x99])'; - # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 - $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; - $s = self::wordSegmentation( $s, $reg ); - } + function normalizeForSearch( $string ) { // Double-width roman characters - $s = self::convertDoubleWidth( $s ); + $s = self::convertDoubleWidth( $string ); # Do general case folding and UTF-8 armoring - return parent::stripForSearch( $s, $doStrip ); + return parent::normalizeForSearch( $s ); } # Italic is not appropriate for Japanese script diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php index f00ac31e6e..6581d788e4 100644 --- a/languages/classes/LanguageYue.php +++ b/languages/classes/LanguageYue.php @@ -3,24 +3,29 @@ * @ingroup Language */ class LanguageYue extends Language { - function stripForSearch( $string, $doStrip = true ) { + function hasWordBreaks() { + return false; + } + + /** + * Eventually this should be a word segmentation; + * for now just treat each character as a word. + * @todo Fixme: only do this for Han characters... + */ + function wordSegmentation( $string ) { + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; + $s = self::insertSpace( $string, $reg ); + return $s; + } + + function normalizeForSearch( $string ) { wfProfileIn( __METHOD__ ); // Double-width roman characters $s = self::convertDoubleWidth( $string ); - - if ( $doStrip == true ) { - // eventually this should be a word segmentation; - // for now just treat each character as a word. - // @todo Fixme: only do this for Han characters... - $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; - $s = self::wordSegmentation( $s, $reg ); - } - $s = trim( $s ); + $s = parent::normalizeForSearch( $s ); - // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; } diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php index 6a004a080d..d89c4c47cb 100644 --- a/languages/classes/LanguageZh.php +++ b/languages/classes/LanguageZh.php @@ -170,8 +170,23 @@ class LanguageZh extends LanguageZh_hans { "\"$1\"", $text); } - // word segmentation - function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) { + /** + * word segmentation + */ + function wordSegmentation( $string ) { + // LanguageZh_hans::wordSegmentation + $s = parent::wordSegmentation( $string ); + return $s; + } + + /** + * auto convert to zh-hans and normalize special characters. + * + * @param $string String + * @param $autoVariant String, default to 'zh-hans' + * @return String + */ + function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) { wfProfileIn( __METHOD__ ); // always convert to zh-hans before indexing. it should be @@ -179,8 +194,8 @@ class LanguageZh extends LanguageZh_hans { // Traditional to Simplified is less ambiguous than the // other way around $s = $this->mConverter->autoConvert( $string, $autoVariant ); - // LanguageZh_hans::stripForSearch - $s = parent::stripForSearch( $s, $doStrip ); + // LanguageZh_hans::normalizeForSearch + $s = parent::normalizeForSearch( $s ); wfProfileOut( __METHOD__ ); return $s; diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php index 2f81960b16..5b03d73145 100644 --- a/languages/classes/LanguageZh_hans.php +++ b/languages/classes/LanguageZh_hans.php @@ -7,25 +7,26 @@ class LanguageZh_hans extends Language { function hasWordBreaks() { return false; } - - function stripForSearch( $string, $doStrip = true ) { + + /** + * Eventually this should be a word segmentation; + * for now just treat each character as a word. + * @todo Fixme: only do this for Han characters... + */ + function wordSegmentation( $string ) { + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; + $s = self::insertSpace( $string, $reg ); + return $s; + } + + function normalizeForSearch( $string ) { wfProfileIn( __METHOD__ ); // Double-width roman characters $s = self::convertDoubleWidth( $string ); - - if ( $doStrip == true ) { - // Eventually this should be a word segmentation; - // for now just treat each character as a word. - // @todo Fixme: only do this for Han characters... - $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; - $s = self::wordSegmentation( $s, $reg ); - } - $s = trim( $s ); + $s = parent::normalizeForSearch( $s ); - // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; }