From: Philip Tzou <philip@users.mediawiki.org>
Date: Tue, 2 Feb 2010 15:09:01 +0000 (+0000)
Subject: Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into... 
X-Git-Tag: 1.31.0-rc.0~37977
X-Git-Url: https://git.heureux-cyclage.org/?a=commitdiff_plain;h=d6b6766f3a4c3c0926e26ac3e2ed54c40cdf9b1c;p=lhc%2Fweb%2Fwiklou.git

Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into wordSegmentation and normalizeForSearch, so that wordSegmentation can be called by search engines separately.
---

diff --git a/includes/Title.php b/includes/Title.php
index 0f2b353db2..563bd9ea0d 100644
--- a/includes/Title.php
+++ b/includes/Title.php
@@ -435,7 +435,7 @@ class Title {
 		global $wgContLang;
 
 		$lc = SearchEngine::legalSearchChars() . '&#;';
-		$t = $wgContLang->stripForSearch( $title );
+		$t = $wgContLang->normalizeForSearch( $title );
 		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
 		$t = $wgContLang->lc( $t );
 
diff --git a/includes/search/SearchIBM_DB2.php b/includes/search/SearchIBM_DB2.php
index 7f8dfece34..d7587186c3 100644
--- a/includes/search/SearchIBM_DB2.php
+++ b/includes/search/SearchIBM_DB2.php
@@ -158,10 +158,10 @@ class SearchIBM_DB2 extends SearchEngine {
 				if( is_array( $temp_terms )) {
 					$temp_terms = array_unique( array_values( $temp_terms ));
 					foreach( $temp_terms as $t )
-						$q[] = $terms[1] . $wgContLang->stripForSearch( $t );
+						$q[] = $terms[1] . $wgContLang->normalizeForSearch( $t );
 				}
 				else
-					$q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] );
+					$q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] );
 
 				if (!empty($terms[3])) {
 					$regexp = preg_quote( $terms[3], '/' );
diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php
index 12ba2fa2ec..0c238be892 100644
--- a/includes/search/SearchMySQL.php
+++ b/includes/search/SearchMySQL.php
@@ -80,7 +80,7 @@ class SearchMySQL extends SearchEngine {
 				// fulltext engine.
 				// For Chinese this also inserts spaces between adjacent Han characters.
 				$strippedVariants = array_map(
-					array( $wgContLang, 'stripForSearch' ),
+					array( $wgContLang, 'normalizeForSearch' ),
 					$variants );
 				
 				// Some languages such as Chinese force all variants to a canonical
@@ -95,7 +95,7 @@ class SearchMySQL extends SearchEngine {
 					$stripped = $this->normalizeText( $stripped );
 					if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
 						// Hack for Chinese: we need to toss in quotes for
-						// multiple-character phrases since stripForSearch()
+						// multiple-character phrases since wordSegmentation()
 						// added spaces between them to make word breaks.
 						$stripped = '"' . trim( $stripped ) . '"';
 					}
@@ -324,13 +324,16 @@ class SearchMySQL extends SearchEngine {
 		global $wgContLang;
 
 		wfProfileIn( __METHOD__ );
+		
+		// Some languages such as Chinese require word segmentation
+		$out = $wgContLang->wordSegmentation( $string );
 
 		// MySQL fulltext index doesn't grok utf-8, so we
 		// need to fold cases and convert to hex
 		$out = preg_replace_callback(
 			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
 			array( $this, 'stripForSearchCallback' ),
-			$wgContLang->lc( $string ) );
+			$wgContLang->lc( $out ) );
 
 		// And to add insult to injury, the default indexing
 		// ignores short words... Pad them so we can pass them
diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php
index 520fe6f068..e4c5deee55 100644
--- a/includes/search/SearchOracle.php
+++ b/includes/search/SearchOracle.php
@@ -217,7 +217,7 @@ class SearchOracle extends SearchEngine {
 
 	private function escapeTerm($t) {
 		global $wgContLang;
-		$t = $wgContLang->stripForSearch($t);
+		$t = $wgContLang->normalizeForSearch($t);
 		$t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t;
 		$t = preg_replace('/^"(.*)"$/', '($1)', $t);
 		$t = preg_replace('/([-&|])/', '\\\\$1', $t);
diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php
index 53c093e7f7..54a4b5569b 100644
--- a/includes/search/SearchSqlite.php
+++ b/includes/search/SearchSqlite.php
@@ -92,7 +92,7 @@ class SearchSqlite extends SearchEngine {
 				// fulltext engine.
 				// For Chinese this also inserts spaces between adjacent Han characters.
 				$strippedVariants = array_map(
-					array( $wgContLang, 'stripForSearch' ),
+					array( $wgContLang, 'normalizeForSearch' ),
 					$variants );
 				
 				// Some languages such as Chinese force all variants to a canonical
@@ -106,7 +106,7 @@ class SearchSqlite extends SearchEngine {
 				foreach( $strippedVariants as $stripped ) {
 					if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
 						// Hack for Chinese: we need to toss in quotes for
-						// multiple-character phrases since stripForSearch()
+						// multiple-character phrases since normalizeForSearch()
 						// added spaces between them to make word breaks.
 						$stripped = '"' . trim( $stripped ) . '"';
 					}
diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php
index b9c2335151..e30c70e6d1 100644
--- a/includes/search/SearchUpdate.php
+++ b/includes/search/SearchUpdate.php
@@ -43,7 +43,7 @@ class SearchUpdate {
 		}
 
 		# Language-specific strip/conversion
-		$text = $wgContLang->stripForSearch( $this->mText );
+		$text = $wgContLang->normalizeForSearch( $this->mText );
 
 		wfProfileIn( $fname.'-regexps' );
 		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
diff --git a/languages/Language.php b/languages/Language.php
index facd0edee6..21ad57a6fb 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1686,15 +1686,26 @@ class Language {
 	function hasWordBreaks() {
 		return true;
 	}
+	
+	/**
+	 * Some languages such as Chinese require word segmentation.
+	 * Specify such segmentation by overriding this method in a derived class.
+	 * 
+	 * @param $string String
+	 * @return String
+	 */
+	function wordSegmentation( $string ) {
+		return $string;
+	}
 
 	/**
-	 * Some languages have special punctuation to strip out.
+	 * Some languages have special punctuation that needs to be normalized.
 	 * Make such changes here.
 	 *
 	 * @param $string String
 	 * @return String
 	 */
-	function stripForSearch( $string, $doStrip = true ) {
+	function normalizeForSearch( $string ) {
 		return $string;
 	}
 
@@ -1708,7 +1719,7 @@ class Language {
 		return $string;
 	}
 
-	protected static function wordSegmentation( $string, $pattern ) {
+	protected static function insertSpace( $string, $pattern ) {
 		$string = preg_replace( $pattern, " $1 ", $string );
 		$string = preg_replace( '/ +/', ' ', $string );
 		return $string;
diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php
index f878cf3f2f..3c8b5fdb1f 100644
--- a/languages/classes/LanguageGan.php
+++ b/languages/classes/LanguageGan.php
@@ -135,9 +135,9 @@ class LanguageGan extends LanguageZh {
 	}
 
 	// word segmentation
-	function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) {
-		// LanguageZh::stripForSearch
-		return parent::stripForSearch( $string, $doStrip, $autoVariant );
+	function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) {
+		// LanguageZh::normalizeForSearch
+		return parent::normalizeForSearch( $string, $autoVariant );
 	}
 
 	function convertForSearchResult( $termsArray ) {
diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php
index 41b246f082..4a24260b00 100644
--- a/languages/classes/LanguageJa.php
+++ b/languages/classes/LanguageJa.php
@@ -6,30 +6,29 @@
  * @ingroup Language
  */
 class LanguageJa extends Language {
-	function stripForSearch( $string, $doStrip = true ) {
+	function wordSegmentation( $string ) {
+		// Strip known punctuation ?
+		// $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
-		$s = $string;
-
-		if ( $doStrip == true ) {
-			// Strip known punctuation ?
-			// $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+		// Space strings of like hiragana/katakana/kanji
+		$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+		$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+		$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+			. '|[\xe4-\xe8][\x80-\xbf]{2}'
+			. '|\xe9[\x80-\xa5][\x80-\xbf]'
+			. '|\xe9\xa6[\x80-\x99])';
+			# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+		$reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+		$s = self::insertSpace( $string, $reg );
+		return $s;
+	}
 
-			// Space strings of like hiragana/katakana/kanji
-			$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-			$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-			$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-				. '|[\xe4-\xe8][\x80-\xbf]{2}'
-				. '|\xe9[\x80-\xa5][\x80-\xbf]'
-				. '|\xe9\xa6[\x80-\x99])';
-				# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-			$reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
-			$s = self::wordSegmentation( $s, $reg );
-		}
+	function normalizeForSearch( $string ) {
 		// Double-width roman characters
-		$s = self::convertDoubleWidth( $s );
+		$s = self::convertDoubleWidth( $string );
 		
 		# Do general case folding and UTF-8 armoring
-		return parent::stripForSearch( $s, $doStrip );
+		return parent::normalizeForSearch( $s );
 	}
 
 	# Italic is not appropriate for Japanese script
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php
index f00ac31e6e..6581d788e4 100644
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -3,24 +3,29 @@
  * @ingroup Language
  */
 class LanguageYue extends Language {
-	function stripForSearch( $string, $doStrip = true ) {
+	function hasWordBreaks() {
+		return false;
+	}
+
+	/**
+	 * Eventually this should be a word segmentation;
+	 * for now just treat each character as a word.
+	 * @todo Fixme: only do this for Han characters...
+	 */
+	function wordSegmentation( $string ) {
+		$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+		$s = self::insertSpace( $string, $reg );
+		return $s;
+	}
+	
+	function normalizeForSearch( $string ) {
 		wfProfileIn( __METHOD__ );
 
 		// Double-width roman characters
 		$s = self::convertDoubleWidth( $string );
-
-		if ( $doStrip == true ) {
-			// eventually this should be a word segmentation;
-			// for now just treat each character as a word.
-			// @todo Fixme: only do this for Han characters...
-			$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-			$s = self::wordSegmentation( $s, $reg );
-		}
-
 		$s = trim( $s );
+		$s = parent::normalizeForSearch( $s );
 
-		// Do general case folding and UTF-8 armoring
-		$s = parent::stripForSearch( $s, $doStrip );
 		wfProfileOut( __METHOD__ );
 		return $s;
 	}
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php
index 6a004a080d..d89c4c47cb 100644
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -170,8 +170,23 @@ class LanguageZh extends LanguageZh_hans {
 			"\"$1\"", $text);
 	}
 
-	// word segmentation
-	function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
+	/**
+	 * word segmentation
+	 */
+	function wordSegmentation( $string ) {
+		// LanguageZh_hans::wordSegmentation
+		$s = parent::wordSegmentation( $string );
+		return $s;
+	}
+
+	/**
+	 * auto convert to zh-hans and normalize special characters.
+	 *
+	 * @param $string String
+	 * @param $autoVariant String, default to 'zh-hans'
+	 * @return String
+	 */
+	function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) {
 		wfProfileIn( __METHOD__ );
 
 		// always convert to zh-hans before indexing. it should be
@@ -179,8 +194,8 @@ class LanguageZh extends LanguageZh_hans {
 		// Traditional to Simplified is less ambiguous than the
 		// other way around
 		$s = $this->mConverter->autoConvert( $string, $autoVariant );
-		// LanguageZh_hans::stripForSearch
-		$s = parent::stripForSearch( $s, $doStrip );
+		// LanguageZh_hans::normalizeForSearch
+		$s = parent::normalizeForSearch( $s );
 		wfProfileOut( __METHOD__ );
 		return $s;
 
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php
index 2f81960b16..5b03d73145 100644
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -7,25 +7,26 @@ class LanguageZh_hans extends Language {
 	function hasWordBreaks() {
 		return false;
 	}
-	
-	function stripForSearch( $string, $doStrip = true ) {
+
+	/**
+	 * Eventually this should be a word segmentation;
+	 * for now just treat each character as a word.
+	 * @todo Fixme: only do this for Han characters...
+	 */
+	function wordSegmentation( $string ) {
+		$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+		$s = self::insertSpace( $string, $reg );
+		return $s;
+	}
+
+	function normalizeForSearch( $string ) {
 		wfProfileIn( __METHOD__ );
 
 		// Double-width roman characters
 		$s = self::convertDoubleWidth( $string );
-
-		if ( $doStrip == true ) {
-			// Eventually this should be a word segmentation;
-			// for now just treat each character as a word.
-			// @todo Fixme: only do this for Han characters...
-			$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-			$s = self::wordSegmentation( $s, $reg );
-		}
-
 		$s = trim( $s );
+		$s = parent::normalizeForSearch( $s );
 
-		// Do general case folding and UTF-8 armoring
-		$s = parent::stripForSearch( $s, $doStrip );
 		wfProfileOut( __METHOD__ );
 		return $s;
 	}