Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into wordSegmentation() and normalizeForSearch().

author Philip Tzou <philip@users.mediawiki.org>

Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)

committer Philip Tzou <philip@users.mediawiki.org>

Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
author Philip Tzou <philip@users.mediawiki.org>
Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
committer Philip Tzou <philip@users.mediawiki.org>
Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
diff --git a/includes/Title.php b/includes/Title.php

index 0f2b353..563bd9e 100644 (file)
--- a/includes/Title.php
+++ b/includes/Title.php
@@ -435,7 +435,7 @@ class Title {
                 global $wgContLang;
  
                 $lc = SearchEngine::legalSearchChars() . '&#;';
-               $t = $wgContLang->stripForSearch( $title );
+               $t = $wgContLang->normalizeForSearch( $title );
                 $t = preg_replace( "/[^{$lc}]+/", ' ', $t );
                 $t = $wgContLang->lc( $t );
  
diff --git a/includes/search/SearchIBM_DB2.php b/includes/search/SearchIBM_DB2.php

index 7f8dfec..d758718 100644 (file)
--- a/includes/search/SearchIBM_DB2.php
+++ b/includes/search/SearchIBM_DB2.php
@@ -158,10 +158,10 @@ class SearchIBM_DB2 extends SearchEngine {
                                 if( is_array( $temp_terms )) {
                                         $temp_terms = array_unique( array_values( $temp_terms ));
                                         foreach( $temp_terms as $t )
-                                               $q[] = $terms[1] . $wgContLang->stripForSearch( $t );
+                                               $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t );
                                 }
                                 else
-                                       $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] );
+                                       $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] );
  
                                 if (!empty($terms[3])) {
                                         $regexp = preg_quote( $terms[3], '/' );
diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php

index 12ba2fa..0c238be 100644 (file)
--- a/includes/search/SearchMySQL.php
+++ b/includes/search/SearchMySQL.php
@@ -80,7 +80,7 @@ class SearchMySQL extends SearchEngine {
                                 // fulltext engine.
                                 // For Chinese this also inserts spaces between adjacent Han characters.
                                 $strippedVariants = array_map(
-                                       array( $wgContLang, 'stripForSearch' ),
+                                       array( $wgContLang, 'normalizeForSearch' ),
                                         $variants );
                                 
                                 // Some languages such as Chinese force all variants to a canonical
@@ -95,7 +95,7 @@ class SearchMySQL extends SearchEngine {
                                         $stripped = $this->normalizeText( $stripped );
                                         if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
                                                 // Hack for Chinese: we need to toss in quotes for
-                                               // multiple-character phrases since stripForSearch()
+                                               // multiple-character phrases since normalizeForSearch()
                                                 // added spaces between them to make word breaks.
                                                 $stripped = '"' . trim( $stripped ) . '"';
                                         }
@@ -324,13 +324,16 @@ class SearchMySQL extends SearchEngine {
                 global $wgContLang;
  
                 wfProfileIn( __METHOD__ );
+               
+               // Some languages such as Chinese require word segmentation
+               $out = $wgContLang->wordSegmentation( $string );
  
                 // MySQL fulltext index doesn't grok utf-8, so we
                 // need to fold cases and convert to hex
                 $out = preg_replace_callback(
                         "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
                         array( $this, 'stripForSearchCallback' ),
-                       $wgContLang->lc( $string ) );
+                       $wgContLang->lc( $out ) );
  
                 // And to add insult to injury, the default indexing
                 // ignores short words... Pad them so we can pass them
diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php

index 520fe6f..e4c5dee 100644 (file)
--- a/includes/search/SearchOracle.php
+++ b/includes/search/SearchOracle.php
@@ -217,7 +217,7 @@ class SearchOracle extends SearchEngine {
  
         private function escapeTerm($t) {
                 global $wgContLang;
-               $t = $wgContLang->stripForSearch($t);
+               $t = $wgContLang->normalizeForSearch($t);
                 $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t;
                 $t = preg_replace('/^"(.*)"$/', '($1)', $t);
                 $t = preg_replace('/([-&|])/', '\\\\$1', $t);
diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php

index 53c093e..54a4b55 100644 (file)
--- a/includes/search/SearchSqlite.php
+++ b/includes/search/SearchSqlite.php
@@ -92,7 +92,7 @@ class SearchSqlite extends SearchEngine {
                                 // fulltext engine.
                                 // For Chinese this also inserts spaces between adjacent Han characters.
                                 $strippedVariants = array_map(
-                                       array( $wgContLang, 'stripForSearch' ),
+                                       array( $wgContLang, 'normalizeForSearch' ),
                                         $variants );
                                 
                                 // Some languages such as Chinese force all variants to a canonical
@@ -106,7 +106,7 @@ class SearchSqlite extends SearchEngine {
                                 foreach( $strippedVariants as $stripped ) {
                                         if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
                                                 // Hack for Chinese: we need to toss in quotes for
-                                               // multiple-character phrases since stripForSearch()
+                                               // multiple-character phrases since normalizeForSearch()
                                                 // added spaces between them to make word breaks.
                                                 $stripped = '"' . trim( $stripped ) . '"';
                                         }
diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php

index b9c2335..e30c70e 100644 (file)
--- a/includes/search/SearchUpdate.php
+++ b/includes/search/SearchUpdate.php
@@ -43,7 +43,7 @@ class SearchUpdate {
                 }
  
                 # Language-specific strip/conversion
-               $text = $wgContLang->stripForSearch( $this->mText );
+               $text = $wgContLang->normalizeForSearch( $this->mText );
  
                 wfProfileIn( $fname.'-regexps' );
                 $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
diff --git a/languages/Language.php b/languages/Language.php

index facd0ed..21ad57a 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1686,15 +1686,26 @@ class Language {
         function hasWordBreaks() {
                 return true;
         }
+       
+       /**
+        * Some languages, such as Chinese, require word segmentation.
+        * Override this method in a derived class to specify such segmentation.
+        * 
+        * @param $string String
+        * @return String
+        */
+       function wordSegmentation( $string ) {
+               return $string;
+       }
  
         /**
-        * Some languages have special punctuation to strip out.
+        * Some languages have special punctuation that needs to be normalized.
          * Make such changes here.
          *
          * @param $string String
          * @return String
          */
-       function stripForSearch( $string, $doStrip = true ) {
+       function normalizeForSearch( $string ) {
                 return $string;
         }
  
@@ -1708,7 +1719,7 @@ class Language {
                 return $string;
         }
  
-       protected static function wordSegmentation( $string, $pattern ) {
+       protected static function insertSpace( $string, $pattern ) {
                 $string = preg_replace( $pattern, " $1 ", $string );
                 $string = preg_replace( '/ +/', ' ', $string );
                 return $string;
diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php

index f878cf3..3c8b5fd 100644 (file)
--- a/languages/classes/LanguageGan.php
+++ b/languages/classes/LanguageGan.php
@@ -135,9 +135,9 @@ class LanguageGan extends LanguageZh {
         }
  
         // word segmentation
-       function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) {
-               // LanguageZh::stripForSearch
-               return parent::stripForSearch( $string, $doStrip, $autoVariant );
+       function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) {
+               // LanguageZh::normalizeForSearch
+               return parent::normalizeForSearch( $string, $autoVariant );
         }
  
         function convertForSearchResult( $termsArray ) {
diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php

index 41b246f..4a24260 100644 (file)
--- a/languages/classes/LanguageJa.php
+++ b/languages/classes/LanguageJa.php
@@ -6,30 +6,29 @@
   * @ingroup Language
   */
  class LanguageJa extends Language {
-       function stripForSearch( $string, $doStrip = true ) {
+       function wordSegmentation( $string ) {
+               // Strip known punctuation ?
+               // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
  
-               $s = $string;
-
-               if ( $doStrip == true ) {
-                       // Strip known punctuation ?
-                       // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+               // Space strings of like hiragana/katakana/kanji
+               $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+               $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+               $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+                       . '|[\xe4-\xe8][\x80-\xbf]{2}'
+                       . '|\xe9[\x80-\xa5][\x80-\xbf]'
+                       . '|\xe9\xa6[\x80-\x99])';
+                       # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+               $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
  
-                       // Space strings of like hiragana/katakana/kanji
-                       $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-                       $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-                       $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-                               . '|[\xe4-\xe8][\x80-\xbf]{2}'
-                               . '|\xe9[\x80-\xa5][\x80-\xbf]'
-                               . '|\xe9\xa6[\x80-\x99])';
-                               # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-                       $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
+       function normalizeForSearch( $string ) {
                 // Double-width roman characters
-               $s = self::convertDoubleWidth( $s );
+               $s = self::convertDoubleWidth( $string );
                 
                 # Do general case folding and UTF-8 armoring
-               return parent::stripForSearch( $s, $doStrip );
+               return parent::normalizeForSearch( $s );
         }
  
         # Italic is not appropriate for Japanese script
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php

index f00ac31..6581d78 100644 (file)
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -3,24 +3,29 @@
   * @ingroup Language
   */
  class LanguageYue extends Language {
-       function stripForSearch( $string, $doStrip = true ) {
+       function hasWordBreaks() {
+               return false;
+       }
+
+       /**
+        * Eventually this should be a word segmentation;
+        * for now just treat each character as a word.
+        * @todo Fixme: only do this for Han characters...
+        */
+       function wordSegmentation( $string ) {
+               $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
+       
+       function normalizeForSearch( $string ) {
                 wfProfileIn( __METHOD__ );
  
                 // Double-width roman characters
                 $s = self::convertDoubleWidth( $string );
-
-               if ( $doStrip == true ) {
-                       // eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // @todo Fixme: only do this for Han characters...
-                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
-
                 $s = trim( $s );
+               $s = parent::normalizeForSearch( $s );
  
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s, $doStrip );
                 wfProfileOut( __METHOD__ );
                 return $s;
         }
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php

index 6a004a0..d89c4c4 100644 (file)
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -170,8 +170,23 @@ class LanguageZh extends LanguageZh_hans {
                         "\"$1\"", $text);
         }
  
-       // word segmentation
-       function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
+       /**
+        * word segmentation
+        */
+       function wordSegmentation( $string ) {
+               // LanguageZh_hans::wordSegmentation
+               $s = parent::wordSegmentation( $string );
+               return $s;
+       }
+
+       /**
+        * auto convert to zh-hans and normalize special characters.
+        *
+        * @param $string String
+        * @param $autoVariant String, default to 'zh-hans'
+        * @return String
+        */
+       function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) {
                 wfProfileIn( __METHOD__ );
  
                 // always convert to zh-hans before indexing. it should be
@@ -179,8 +194,8 @@ class LanguageZh extends LanguageZh_hans {
                 // Traditional to Simplified is less ambiguous than the
                 // other way around
                 $s = $this->mConverter->autoConvert( $string, $autoVariant );
-               // LanguageZh_hans::stripForSearch
-               $s = parent::stripForSearch( $s, $doStrip );
+               // LanguageZh_hans::normalizeForSearch
+               $s = parent::normalizeForSearch( $s );
                 wfProfileOut( __METHOD__ );
                 return $s;
  
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php

index 2f81960..5b03d73 100644 (file)
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -7,25 +7,26 @@ class LanguageZh_hans extends Language {
         function hasWordBreaks() {
                 return false;
         }
-       
-       function stripForSearch( $string, $doStrip = true ) {
+
+       /**
+        * Eventually this should be a word segmentation;
+        * for now just treat each character as a word.
+        * @todo Fixme: only do this for Han characters...
+        */
+       function wordSegmentation( $string ) {
+               $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
+
+       function normalizeForSearch( $string ) {
                 wfProfileIn( __METHOD__ );
  
                 // Double-width roman characters
                 $s = self::convertDoubleWidth( $string );
-
-               if ( $doStrip == true ) {
-                       // Eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // @todo Fixme: only do this for Han characters...
-                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
-
                 $s = trim( $s );
+               $s = parent::normalizeForSearch( $s );
  
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s, $doStrip );
                 wfProfileOut( __METHOD__ );
                 return $s;
         }
author	Philip Tzou <philip@users.mediawiki.org>
	Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
committer	Philip Tzou <philip@users.mediawiki.org>
	Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
includes/Title.php		patch \| blob \| history
includes/search/SearchIBM_DB2.php		patch \| blob \| history
includes/search/SearchMySQL.php		patch \| blob \| history
includes/search/SearchOracle.php		patch \| blob \| history
includes/search/SearchSqlite.php		patch \| blob \| history
includes/search/SearchUpdate.php		patch \| blob \| history
languages/Language.php		patch \| blob \| history
languages/classes/LanguageGan.php		patch \| blob \| history
languages/classes/LanguageJa.php		patch \| blob \| history
languages/classes/LanguageYue.php		patch \| blob \| history
languages/classes/LanguageZh.php		patch \| blob \| history
languages/classes/LanguageZh_hans.php		patch \| blob \| history