From eff719b75da0ad1aa0484e0ec392a00ec93d8c16 Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Fri, 22 Jan 2010 20:36:26 +0000 Subject: [PATCH] Fixed r61214: moved MySQL munging to SearchEngine, updated calls. Can we kill $doStrip now? --- includes/db/Database.php | 12 ----- includes/db/DatabaseMysql.php | 80 ------------------------------ includes/search/SearchEngine.php | 12 +++++ includes/search/SearchMySQL.php | 85 ++++++++++++++++++++++++++++++-- languages/Language.php | 7 +-- 5 files changed, 95 insertions(+), 101 deletions(-) diff --git a/includes/db/Database.php b/includes/db/Database.php index 32c6ec72d6..7f856904a7 100644 --- a/includes/db/Database.php +++ b/includes/db/Database.php @@ -2366,18 +2366,6 @@ abstract class DatabaseBase { return "SearchMySQL"; } - /** - * When overridden in derived class, performs database-specific conversions - * on text to be used for searching or updating search index. - * Default implementation does nothing (simply returns $string). - * - * @param $string string: String to strip - * @return string - */ - public function stripForSearch( $string ) { - return $string; - } - /** * Allow or deny "big selects" for this session only. This is done by setting * the sql_big_selects session variable. diff --git a/includes/db/DatabaseMysql.php b/includes/db/DatabaseMysql.php index 63f267c7af..ea7ef5b977 100644 --- a/includes/db/DatabaseMysql.php +++ b/includes/db/DatabaseMysql.php @@ -7,8 +7,6 @@ * @see Database */ class DatabaseMysql extends DatabaseBase { - static $mMinSearchLength; - function getType() { return 'mysql'; } @@ -368,84 +366,6 @@ class DatabaseMysql extends DatabaseBase { public function unlockTables( $method ) { $this->query( "UNLOCK TABLES", $method ); } - - /** - * Converts some characters for MySQL's indexing to grok it correctly, - * and pads short words to overcome limitations. - */ - function stripForSearch( $string ) { - global $wgContLang; - - wfProfileIn( __METHOD__ ); - - // MySQL fulltext index doesn't grok utf-8, so we - // need to fold cases and convert to hex - $out = preg_replace_callback( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - array( $this, 'stripForSearchCallback' ), - $wgContLang->lc( $string ) ); - - // And to add insult to injury, the default indexing - // ignores short words... Pad them so we can pass them - // through without reconfiguring the server... - $minLength = $this->minSearchLength(); - if( $minLength > 1 ) { - $n = $minLength - 1; - $out = preg_replace( - "/\b(\w{1,$n})\b/", - "$1u800", - $out ); - } - - // Periods within things like hostnames and IP addresses - // are also important -- we want a search for "example.com" - // or "192.168.1.1" to work sanely. - // - // MySQL's search seems to ignore them, so you'd match on - // "example.wikipedia.com" and "192.168.83.1" as well. - $out = preg_replace( - "/(\w)\.(\w|\*)/u", - "$1u82e$2", - $out ); - - wfProfileOut( __METHOD__ ); - - return $out; - } - - /** - * Armor a case-folded UTF-8 string to get through MySQL's - * fulltext search without being mucked up by funny charset - * settings or anything else of the sort. - */ - protected function stripForSearchCallback( $matches ) { - return 'u8' . bin2hex( $matches[1] ); - } - - /** - * Check MySQL server's ft_min_word_len setting so we know - * if we need to pad short words... - * - * @return int - */ - protected function minSearchLength() { - if( is_null( self::$mMinSearchLength ) ) { - $sql = "show global variables like 'ft\\_min\\_word\\_len'"; - - // Even though this query is pretty fast, let's not overload the master - $dbr = wfGetDB( DB_SLAVE ); - $result = $dbr->query( $sql ); - $row = $result->fetchObject(); - $result->free(); - - if( $row && $row->Variable_name == 'ft_min_word_len' ) { - self::$mMinSearchLength = intval( $row->Value ); - } else { - self::$mMinSearchLength = 0; - } - } - return self::$mMinSearchLength; - } public function setBigSelects( $value = true ) { if ( $value === 'default' ) { diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php index 669d3de2a1..f4ca700d5d 100644 --- a/includes/search/SearchEngine.php +++ b/includes/search/SearchEngine.php @@ -47,6 +47,18 @@ class SearchEngine { return true; } + /** + * When overridden in derived class, performs database-specific conversions + * on text to be used for searching or updating search index. + * Default implementation does nothing (simply returns $string). + * + * @param $string string: String to process + * @return string + */ + public function normalizeText( $string ) { + return $string; + } + /** * Transform search term in cases when parts of the query came as different GET params (when supported) * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php index ab55caf9b5..12ba2fa2ec 100644 --- a/includes/search/SearchMySQL.php +++ b/includes/search/SearchMySQL.php @@ -28,6 +28,7 @@ */ class SearchMySQL extends SearchEngine { var $strictMatching = true; + static $mMinSearchLength; /** @todo document */ function __construct( $db ) { @@ -91,6 +92,7 @@ class SearchMySQL extends SearchEngine { if( count( $strippedVariants) > 1 ) $searchon .= '('; foreach( $strippedVariants as $stripped ) { + $stripped = $this->normalizeText( $stripped ); if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { // Hack for Chinese: we need to toss in quotes for // multiple-character phrases since stripForSearch() @@ -292,8 +294,8 @@ class SearchMySQL extends SearchEngine { array( 'si_page' ), array( 'si_page' => $id, - 'si_title' => $title, - 'si_text' => $text + 'si_title' => $this->normalizeText( $title ), + 'si_text' => $this->normalizeText( $text ) ), __METHOD__ ); } @@ -308,11 +310,88 @@ class SearchMySQL extends SearchEngine { $dbw = wfGetDB( DB_MASTER ); $dbw->update( 'searchindex', - array( 'si_title' => $title ), + array( 'si_title' => $this->normalizeText( $title ) ), array( 'si_page' => $id ), __METHOD__, array( $dbw->lowPriorityOption() ) ); } + + /** + * Converts some characters for MySQL's indexing to grok it correctly, + * and pads short words to overcome limitations. + */ + function normalizeText( $string ) { + global $wgContLang; + + wfProfileIn( __METHOD__ ); + + // MySQL fulltext index doesn't grok utf-8, so we + // need to fold cases and convert to hex + $out = preg_replace_callback( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + array( $this, 'stripForSearchCallback' ), + $wgContLang->lc( $string ) ); + + // And to add insult to injury, the default indexing + // ignores short words... Pad them so we can pass them + // through without reconfiguring the server... + $minLength = $this->minSearchLength(); + if( $minLength > 1 ) { + $n = $minLength - 1; + $out = preg_replace( + "/\b(\w{1,$n})\b/", + "$1u800", + $out ); + } + + // Periods within things like hostnames and IP addresses + // are also important -- we want a search for "example.com" + // or "192.168.1.1" to work sanely. + // + // MySQL's search seems to ignore them, so you'd match on + // "example.wikipedia.com" and "192.168.83.1" as well. + $out = preg_replace( + "/(\w)\.(\w|\*)/u", + "$1u82e$2", + $out ); + + wfProfileOut( __METHOD__ ); + + return $out; + } + + /** + * Armor a case-folded UTF-8 string to get through MySQL's + * fulltext search without being mucked up by funny charset + * settings or anything else of the sort. + */ + protected function stripForSearchCallback( $matches ) { + return 'u8' . bin2hex( $matches[1] ); + } + + /** + * Check MySQL server's ft_min_word_len setting so we know + * if we need to pad short words... + * + * @return int + */ + protected function minSearchLength() { + if( is_null( self::$mMinSearchLength ) ) { + $sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'"; + + $dbr = wfGetDB( DB_SLAVE ); + $result = $dbr->query( $sql ); + $row = $result->fetchObject(); + $result->free(); + + if( $row && $row->Variable_name == 'ft_min_word_len' ) { + self::$mMinSearchLength = intval( $row->Value ); + } else { + self::$mMinSearchLength = 0; + } + } + return self::$mMinSearchLength; + } } /** diff --git a/languages/Language.php b/languages/Language.php index 3ea96164d8..8c072f81c8 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1695,12 +1695,7 @@ class Language { * @return String */ function stripForSearch( $string, $doStrip = true ) { - if ( !$doStrip ) { - return $string; - } - - $dbr = wfGetDB( DB_SLAVE ); - return $dbr->stripForSearch( $string ); + return $string; } /** -- 2.20.1