Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearch into...
author Philip Tzou <philip@users.mediawiki.org>
Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
committer Philip Tzou <philip@users.mediawiki.org>
Tue, 2 Feb 2010 15:09:01 +0000 (15:09 +0000)
12 files changed:
includes/Title.php
includes/search/SearchIBM_DB2.php
includes/search/SearchMySQL.php
includes/search/SearchOracle.php
includes/search/SearchSqlite.php
includes/search/SearchUpdate.php
languages/Language.php
languages/classes/LanguageGan.php
languages/classes/LanguageJa.php
languages/classes/LanguageYue.php
languages/classes/LanguageZh.php
languages/classes/LanguageZh_hans.php

index 0f2b353..563bd9e 100644 (file)
@@ -435,7 +435,7 @@ class Title {
                global $wgContLang;
 
                $lc = SearchEngine::legalSearchChars() . '&#;';
-               $t = $wgContLang->stripForSearch( $title );
+               $t = $wgContLang->normalizeForSearch( $title );
                $t = preg_replace( "/[^{$lc}]+/", ' ', $t );
                $t = $wgContLang->lc( $t );
 
index 7f8dfec..d758718 100644 (file)
@@ -158,10 +158,10 @@ class SearchIBM_DB2 extends SearchEngine {
                                if( is_array( $temp_terms )) {
                                        $temp_terms = array_unique( array_values( $temp_terms ));
                                        foreach( $temp_terms as $t )
-                                               $q[] = $terms[1] . $wgContLang->stripForSearch( $t );
+                                               $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t );
                                }
                                else
-                                       $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] );
+                                       $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] );
 
                                if (!empty($terms[3])) {
                                        $regexp = preg_quote( $terms[3], '/' );
index 12ba2fa..0c238be 100644 (file)
@@ -80,7 +80,7 @@ class SearchMySQL extends SearchEngine {
                                // fulltext engine.
                                // For Chinese this also inserts spaces between adjacent Han characters.
                                $strippedVariants = array_map(
-                                       array( $wgContLang, 'stripForSearch' ),
+                                       array( $wgContLang, 'normalizeForSearch' ),
                                        $variants );
                                
                                // Some languages such as Chinese force all variants to a canonical
@@ -95,7 +95,7 @@ class SearchMySQL extends SearchEngine {
                                        $stripped = $this->normalizeText( $stripped );
                                        if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
                                                // Hack for Chinese: we need to toss in quotes for
-                                               // multiple-character phrases since stripForSearch()
+                                               // multiple-character phrases since normalizeForSearch()
                                                // added spaces between them to make word breaks.
                                                $stripped = '"' . trim( $stripped ) . '"';
                                        }
@@ -324,13 +324,16 @@ class SearchMySQL extends SearchEngine {
                global $wgContLang;
 
                wfProfileIn( __METHOD__ );
+               
+               // Some languages such as Chinese require word segmentation
+               $out = $wgContLang->wordSegmentation( $string );
 
                // MySQL fulltext index doesn't grok utf-8, so we
                // need to fold cases and convert to hex
                $out = preg_replace_callback(
                        "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
                        array( $this, 'stripForSearchCallback' ),
-                       $wgContLang->lc( $string ) );
+                       $wgContLang->lc( $out ) );
 
                // And to add insult to injury, the default indexing
                // ignores short words... Pad them so we can pass them
index 520fe6f..e4c5dee 100644 (file)
@@ -217,7 +217,7 @@ class SearchOracle extends SearchEngine {
 
        private function escapeTerm($t) {
                global $wgContLang;
-               $t = $wgContLang->stripForSearch($t);
+               $t = $wgContLang->normalizeForSearch($t);
                $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t;
                $t = preg_replace('/^"(.*)"$/', '($1)', $t);
                $t = preg_replace('/([-&|])/', '\\\\$1', $t);
index 53c093e..54a4b55 100644 (file)
@@ -92,7 +92,7 @@ class SearchSqlite extends SearchEngine {
                                // fulltext engine.
                                // For Chinese this also inserts spaces between adjacent Han characters.
                                $strippedVariants = array_map(
-                                       array( $wgContLang, 'stripForSearch' ),
+                                       array( $wgContLang, 'normalizeForSearch' ),
                                        $variants );
                                
                                // Some languages such as Chinese force all variants to a canonical
@@ -106,7 +106,7 @@ class SearchSqlite extends SearchEngine {
                                foreach( $strippedVariants as $stripped ) {
                                        if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
                                                // Hack for Chinese: we need to toss in quotes for
-                                               // multiple-character phrases since stripForSearch()
+                                               // multiple-character phrases since normalizeForSearch()
                                                // added spaces between them to make word breaks.
                                                $stripped = '"' . trim( $stripped ) . '"';
                                        }
index b9c2335..e30c70e 100644 (file)
@@ -43,7 +43,7 @@ class SearchUpdate {
                }
 
                # Language-specific strip/conversion
-               $text = $wgContLang->stripForSearch( $this->mText );
+               $text = $wgContLang->normalizeForSearch( $this->mText );
 
                wfProfileIn( $fname.'-regexps' );
                $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
index facd0ed..21ad57a 100644 (file)
@@ -1686,15 +1686,26 @@ class Language {
        function hasWordBreaks() {
                return true;
        }
+       
+       /**
+        * Some languages, such as Chinese, require word segmentation.
+        * Specify such segmentation when overriding this in a derived class.
+        * 
+        * @param $string String
+        * @return String
+        */
+       function wordSegmentation( $string ) {
+               return $string;
+       }
 
        /**
-        * Some languages have special punctuation to strip out.
+        * Some languages have special punctuation that needs to be normalized.
         * Make such changes here.
         *
         * @param $string String
         * @return String
         */
-       function stripForSearch( $string, $doStrip = true ) {
+       function normalizeForSearch( $string ) {
                return $string;
        }
 
@@ -1708,7 +1719,7 @@ class Language {
                return $string;
        }
 
-       protected static function wordSegmentation( $string, $pattern ) {
+       protected static function insertSpace( $string, $pattern ) {
                $string = preg_replace( $pattern, " $1 ", $string );
                $string = preg_replace( '/ +/', ' ', $string );
                return $string;
index f878cf3..3c8b5fd 100644 (file)
@@ -135,9 +135,9 @@ class LanguageGan extends LanguageZh {
        }
 
        // word segmentation
-       function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) {
-               // LanguageZh::stripForSearch
-               return parent::stripForSearch( $string, $doStrip, $autoVariant );
+       function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) {
+               // LanguageZh::normalizeForSearch
+               return parent::normalizeForSearch( $string, $autoVariant );
        }
 
        function convertForSearchResult( $termsArray ) {
index 41b246f..4a24260 100644 (file)
@@ -6,30 +6,29 @@
  * @ingroup Language
  */
 class LanguageJa extends Language {
-       function stripForSearch( $string, $doStrip = true ) {
+       function wordSegmentation( $string ) {
+               // Strip known punctuation ?
+               // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
-               $s = $string;
-
-               if ( $doStrip == true ) {
-                       // Strip known punctuation ?
-                       // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+               // Insert spaces around runs of like hiragana/katakana/kanji
+               $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+               $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+               $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+                       . '|[\xe4-\xe8][\x80-\xbf]{2}'
+                       . '|\xe9[\x80-\xa5][\x80-\xbf]'
+                       . '|\xe9\xa6[\x80-\x99])';
+                       # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+               $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
 
-                       // Space strings of like hiragana/katakana/kanji
-                       $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-                       $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-                       $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-                               . '|[\xe4-\xe8][\x80-\xbf]{2}'
-                               . '|\xe9[\x80-\xa5][\x80-\xbf]'
-                               . '|\xe9\xa6[\x80-\x99])';
-                               # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-                       $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
+       function normalizeForSearch( $string ) {
                // Double-width roman characters
-               $s = self::convertDoubleWidth( $s );
+               $s = self::convertDoubleWidth( $string );
                
                # Do general case folding and UTF-8 armoring
-               return parent::stripForSearch( $s, $doStrip );
+               return parent::normalizeForSearch( $s );
        }
 
        # Italic is not appropriate for Japanese script
index f00ac31..6581d78 100644 (file)
@@ -3,24 +3,29 @@
  * @ingroup Language
  */
 class LanguageYue extends Language {
-       function stripForSearch( $string, $doStrip = true ) {
+       function hasWordBreaks() {
+               return false;
+       }
+
+       /**
+        * Eventually this should be a word segmentation;
+        * for now just treat each character as a word.
+        * @todo Fixme: only do this for Han characters...
+        */
+       function wordSegmentation( $string ) {
+               $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
+       
+       function normalizeForSearch( $string ) {
                wfProfileIn( __METHOD__ );
 
                // Double-width roman characters
                $s = self::convertDoubleWidth( $string );
-
-               if ( $doStrip == true ) {
-                       // eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // @todo Fixme: only do this for Han characters...
-                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
-
                $s = trim( $s );
+               $s = parent::normalizeForSearch( $s );
 
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
        }
index 6a004a0..d89c4c4 100644 (file)
@@ -170,8 +170,23 @@ class LanguageZh extends LanguageZh_hans {
                        "\"$1\"", $text);
        }
 
-       // word segmentation
-       function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
+       /**
+        * word segmentation
+        */
+       function wordSegmentation( $string ) {
+               // LanguageZh_hans::wordSegmentation
+               $s = parent::wordSegmentation( $string );
+               return $s;
+       }
+
+       /**
+        * Auto-convert to $autoVariant (zh-hans by default) and normalize special characters.
+        *
+        * @param $string String
+        * @param $autoVariant String, default to 'zh-hans'
+        * @return String
+        */
+       function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) {
                wfProfileIn( __METHOD__ );
 
                // always convert to zh-hans before indexing. it should be
@@ -179,8 +194,8 @@ class LanguageZh extends LanguageZh_hans {
                // Traditional to Simplified is less ambiguous than the
                // other way around
                $s = $this->mConverter->autoConvert( $string, $autoVariant );
-               // LanguageZh_hans::stripForSearch
-               $s = parent::stripForSearch( $s, $doStrip );
+               // LanguageZh_hans::normalizeForSearch
+               $s = parent::normalizeForSearch( $s );
                wfProfileOut( __METHOD__ );
                return $s;
 
index 2f81960..5b03d73 100644 (file)
@@ -7,25 +7,26 @@ class LanguageZh_hans extends Language {
        function hasWordBreaks() {
                return false;
        }
-       
-       function stripForSearch( $string, $doStrip = true ) {
+
+       /**
+        * Eventually this should be a word segmentation;
+        * for now just treat each character as a word.
+        * @todo Fixme: only do this for Han characters...
+        */
+       function wordSegmentation( $string ) {
+               $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+               $s = self::insertSpace( $string, $reg );
+               return $s;
+       }
+
+       function normalizeForSearch( $string ) {
                wfProfileIn( __METHOD__ );
 
                // Double-width roman characters
                $s = self::convertDoubleWidth( $string );
-
-               if ( $doStrip == true ) {
-                       // Eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // @todo Fixme: only do this for Han characters...
-                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
-                       $s = self::wordSegmentation( $s, $reg );
-               }
-
                $s = trim( $s );
+               $s = parent::normalizeForSearch( $s );
 
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
        }