includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $prefix = '';
  17         var $searchTerms = array();
  18         var $namespaces = array( NS_MAIN );
  19         var $showRedirects = false;
  20
  21         /**
  22          * Perform a full text search query and return a result set.
  23          * If title searches are not supported or disabled, return null.
  24          * STUB
  25          *
  26          * @param $term String: raw search term
  27          * @return SearchResultSet
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          * STUB
  37          *
  38          * @param $term String: raw search term
  39          * @return SearchResultSet
  40          */
  41         function searchTitle( $term ) {
  42                 return null;
  43         }
  44
  45         /** If this search backend can list/unlist redirects */
  46         function acceptListRedirects() {
  47                 return true;
  48         }
  49
  50         /**
  51          * Transform search term in cases when parts of the query came as different GET params (when supported)
  52          * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
  53          */
  54         function transformSearchTerm( $term ) {
  55                 return $term;
  56         }
  57
  58         /**
  59          * If an exact title match can be find, or a very slightly close match,
  60          * return the title. If no match, returns NULL.
  61          *
  62          * @param $searchterm String
  63          * @return Title
  64          */
  65         public static function getNearMatch( $searchterm ) {
  66                 global $wgContLang;
  67
  68                 $allSearchTerms = array($searchterm);
  69
  70                 if($wgContLang->hasVariants()){
  71                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  72                 }
  73
  74                 foreach($allSearchTerms as $term){
  75
  76                         # Exact match? No need to look further.
  77                         $title = Title::newFromText( $term );
  78                         if (is_null($title))
  79                                 return NULL;
  80
  81                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
  82                                 return $title;
  83                         }
  84
  85                         # See if it still otherwise has content is some sane sense
  86                         $article = MediaWiki::articleFromTitle( $title );
  87                         if( $article->hasViewableContent() ) {
  88                                 return $title;
  89                         }
  90
  91                         # Now try all lower case (i.e. first letter capitalized)
  92                         #
  93                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  94                         if ( $title && $title->exists() ) {
  95                                 return $title;
  96                         }
  97
  98                         # Now try capitalized string
  99                         #
 100                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
 101                         if ( $title && $title->exists() ) {
 102                                 return $title;
 103                         }
 104
 105                         # Now try all upper case
 106                         #
 107                         $title = Title::newFromText( $wgContLang->uc( $term ) );
 108                         if ( $title && $title->exists() ) {
 109                                 return $title;
 110                         }
 111
 112                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 113                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 114                         if ( $title && $title->exists() ) {
 115                                 return $title;
 116                         }
 117
 118                         // Give hooks a chance at better match variants
 119                         $title = null;
 120                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 121                                 return $title;
 122                         }
 123                 }
 124
 125                 $title = Title::newFromText( $searchterm );
 126
 127                 # Entering an IP address goes to the contributions page
 128                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 129                         || User::isIP( trim( $searchterm ) ) ) {
 130                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 131                 }
 132
 133
 134                 # Entering a user goes to the user page whether it's there or not
 135                 if ( $title->getNamespace() == NS_USER ) {
 136                         return $title;
 137                 }
 138
 139                 # Go to images that exist even if there's no local page.
 140                 # There may have been a funny upload, or it may be on a shared
 141                 # file repository such as Wikimedia Commons.
 142                 if( $title->getNamespace() == NS_FILE ) {
 143                         $image = wfFindFile( $title );
 144                         if( $image ) {
 145                                 return $title;
 146                         }
 147                 }
 148
 149                 # MediaWiki namespace? Page may be "implied" if not customized.
 150                 # Just return it, with caps forced as the message system likes it.
 151                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 152                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 153                 }
 154
 155                 # Quoted term? Try without the quotes...
 156                 $matches = array();
 157                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 158                         return SearchEngine::getNearMatch( $matches[1] );
 159                 }
 160
 161                 return NULL;
 162         }
 163
 164         public static function legalSearchChars() {
 165                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
 166         }
 167
 168         /**
 169          * Set the maximum number of results to return
 170          * and how many to skip before returning the first.
 171          *
 172          * @param $limit Integer
 173          * @param $offset Integer
 174          */
 175         function setLimitOffset( $limit, $offset = 0 ) {
 176                 $this->limit = intval( $limit );
 177                 $this->offset = intval( $offset );
 178         }
 179
 180         /**
 181          * Set which namespaces the search should include.
 182          * Give an array of namespace index numbers.
 183          *
 184          * @param $namespaces Array
 185          */
 186         function setNamespaces( $namespaces ) {
 187                 $this->namespaces = $namespaces;
 188         }
 189
 190         /**
 191          * Parse some common prefixes: all (search everything)
 192          * or namespace names
 193          *
 194          * @param $query String
 195          */
 196         function replacePrefixes( $query ){
 197                 global $wgContLang;
 198
 199                 if( strpos($query,':') === false )
 200                         return $query; // nothing to do
 201
 202                 $parsed = $query;
 203                 $allkeyword = wfMsgForContent('searchall').":";
 204                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 205                         $this->namespaces = null;
 206                         $parsed = substr($query,strlen($allkeyword));
 207                 } else if( strpos($query,':') !== false ) {
 208                         $prefix = substr($query,0,strpos($query,':'));
 209                         $index = $wgContLang->getNsIndex($prefix);
 210                         if($index !== false){
 211                                 $this->namespaces = array($index);
 212                                 $parsed = substr($query,strlen($prefix)+1);
 213                         }
 214                 }
 215                 if(trim($parsed) == '')
 216                         return $query; // prefix was the whole query
 217
 218                 return $parsed;
 219         }
 220
 221         /**
 222          * Make a list of searchable namespaces and their canonical names.
 223          * @return Array
 224          */
 225         public static function searchableNamespaces() {
 226                 global $wgContLang;
 227                 $arr = array();
 228                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 229                         if( $ns >= NS_MAIN ) {
 230                                 $arr[$ns] = $name;
 231                         }
 232                 }
 233                 return $arr;
 234         }
 235
 236         /**
 237          * Extract default namespaces to search from the given user's
 238          * settings, returning a list of index numbers.
 239          *
 240          * @param $user User
 241          * @return Array
 242          */
 243         public static function userNamespaces( $user ) {
 244                 $arr = Preferences::loadOldSearchNs( $user );
 245                 $searchableNamespaces = SearchEngine::searchableNamespaces();
 246
 247                 $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter
 248
 249                 return $arr;
 250         }
 251
 252         /**
 253          * Find snippet highlight settings for a given user
 254          *
 255          * @param $user User
 256          * @return Array contextlines, contextchars
 257          */
 258         public static function userHighlightPrefs( &$user ){
 259                 //$contextlines = $user->getOption( 'contextlines',  5 );
 260                 //$contextchars = $user->getOption( 'contextchars', 50 );
 261                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 262                 $contextchars = 75; // same as above.... :P
 263                 return array($contextlines, $contextchars);
 264         }
 265
 266         /**
 267          * An array of namespaces indexes to be searched by default
 268          *
 269          * @return Array
 270          */
 271         public static function defaultNamespaces(){
 272                 global $wgNamespacesToBeSearchedDefault;
 273
 274                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 275         }
 276
 277         /**
 278          * Get a list of namespace names useful for showing in tooltips
 279          * and preferences
 280          *
 281          * @param $namespaces Array
 282          */
 283         public static function namespacesAsText( $namespaces ){
 284                 global $wgContLang;
 285
 286                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 287                 foreach( $formatted as $key => $ns ){
 288                         if ( empty($ns) )
 289                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 290                 }
 291                 return $formatted;
 292         }
 293
 294         /**
 295          * An array of "project" namespaces indexes typically searched
 296          * by logged-in users
 297          *
 298          * @return Array
 299          */
 300         public static function projectNamespaces() {
 301                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 302
 303                 return array_keys( $wgNamespacesToBeSearchedProject, true );
 304         }
 305
 306         /**
 307          * An array of "project" namespaces indexes typically searched
 308          * by logged-in users in addition to the default namespaces
 309          *
 310          * @return Array
 311          */
 312         public static function defaultAndProjectNamespaces() {
 313                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 314
 315                 return array_keys( $wgNamespacesToBeSearchedDefault +
 316                         $wgNamespacesToBeSearchedProject, true);
 317         }
 318
 319         /**
 320          * Return a 'cleaned up' search string
 321          *
 322          * @param $text String
 323          * @return String
 324          */
 325         function filter( $text ) {
 326                 $lc = $this->legalSearchChars();
 327                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 328         }
 329         /**
 330          * Load up the appropriate search engine class for the currently
 331          * active database backend, and return a configured instance.
 332          *
 333          * @return SearchEngine
 334          */
 335         public static function create() {
 336                 global $wgSearchType;
 337                 $dbr = wfGetDB( DB_SLAVE );
 338                 if( $wgSearchType ) {
 339                         $class = $wgSearchType;
 340                 } else {
 341                         $class = $dbr->getSearchEngine();
 342                 }
 343                 $search = new $class( $dbr );
 344                 $search->setLimitOffset(0,0);
 345                 return $search;
 346         }
 347
 348         /**
 349          * Create or update the search index record for the given page.
 350          * Title and text should be pre-processed.
 351          * STUB
 352          *
 353          * @param $id Integer
 354          * @param $title String
 355          * @param $text String
 356          */
 357         function update( $id, $title, $text ) {
 358                 // no-op
 359         }
 360
 361         /**
 362          * Update a search index record's title only.
 363          * Title should be pre-processed.
 364          * STUB
 365          *
 366          * @param $id Integer
 367          * @param $title String
 368          */
 369         function updateTitle( $id, $title ) {
 370                 // no-op
 371         }
 372
 373         /**
 374          * Get OpenSearch suggestion template
 375          *
 376          * @return String
 377          */
 378         public static function getOpenSearchTemplate() {
 379                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 380                 if( $wgOpenSearchTemplate )     {
 381                         return $wgOpenSearchTemplate;
 382                 } else {
 383                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 384                         if( !$ns ) $ns = "0";
 385                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 386                 }
 387         }
 388
 389         /**
 390          * Get internal MediaWiki Suggest template
 391          *
 392          * @return String
 393          */
 394         public static function getMWSuggestTemplate() {
 395                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 396                 if($wgMWSuggestTemplate)
 397                         return $wgMWSuggestTemplate;
 398                 else
 399                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
 400         }
 401 }
 402
 403 /**
 404  * @ingroup Search
 405  */
 406 class SearchResultSet {
 407         /**
 408          * Fetch an array of regular expression fragments for matching
 409          * the search terms as parsed by this engine in a text extract.
 410          * STUB
 411          *
 412          * @return Array
 413          */
 414         function termMatches() {
 415                 return array();
 416         }
 417
 418         function numRows() {
 419                 return 0;
 420         }
 421
 422         /**
 423          * Return true if results are included in this result set.
 424          * STUB
 425          *
 426          * @return Boolean
 427          */
 428         function hasResults() {
 429                 return false;
 430         }
 431
 432         /**
 433          * Some search modes return a total hit count for the query
 434          * in the entire article database. This may include pages
 435          * in namespaces that would not be matched on the given
 436          * settings.
 437          *
 438          * Return null if no total hits number is supported.
 439          *
 440          * @return Integer
 441          */
 442         function getTotalHits() {
 443                 return null;
 444         }
 445
 446         /**
 447          * Some search modes return a suggested alternate term if there are
 448          * no exact hits. Returns true if there is one on this set.
 449          *
 450          * @return Boolean
 451          */
 452         function hasSuggestion() {
 453                 return false;
 454         }
 455
 456         /**
 457          * @return String: suggested query, null if none
 458          */
 459         function getSuggestionQuery(){
 460                 return null;
 461         }
 462
 463         /**
 464          * @return String: HTML highlighted suggested query, '' if none
 465          */
 466         function getSuggestionSnippet(){
 467                 return '';
 468         }
 469
 470         /**
 471          * Return information about how and from where the results were fetched,
 472          * should be useful for diagnostics and debugging
 473          *
 474          * @return String
 475          */
 476         function getInfo() {
 477                 return null;
 478         }
 479
 480         /**
 481          * Return a result set of hits on other (multiple) wikis associated with this one
 482          *
 483          * @return SearchResultSet
 484          */
 485         function getInterwikiResults() {
 486                 return null;
 487         }
 488
 489         /**
 490          * Check if there are results on other wikis
 491          *
 492          * @return Boolean
 493          */
 494         function hasInterwikiResults() {
 495                 return $this->getInterwikiResults() != null;
 496         }
 497
 498
 499         /**
 500          * Fetches next search result, or false.
 501          * STUB
 502          *
 503          * @return SearchResult
 504          */
 505         function next() {
 506                 return false;
 507         }
 508
 509         /**
 510          * Frees the result set, if applicable.
 511          */
 512         function free() {
 513                 // ...
 514         }
 515 }
 516
 517
 518 /**
 519  * @ingroup Search
 520  */
 521 class SearchResultTooMany {
 522         ## Some search engines may bail out if too many matches are found
 523 }
 524
 525
 526 /**
 527  * @todo Fixme: This class is horribly factored. It would probably be better to
 528  * have a useful base class to which you pass some standard information, then
 529  * let the fancy self-highlighters extend that.
 530  * @ingroup Search
 531  */
 532 class SearchResult {
 533         var $mRevision = null;
 534         var $mImage = null;
 535
 536         function __construct( $row ) {
 537                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 538                 if( !is_null($this->mTitle) ){
 539                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 540                         if( $this->mTitle->getNamespace() === NS_FILE )
 541                                 $this->mImage = wfFindFile( $this->mTitle );
 542                 }
 543         }
 544
 545         /**
 546          * Check if this is result points to an invalid title
 547          *
 548          * @return Boolean
 549          */
 550         function isBrokenTitle(){
 551                 if( is_null($this->mTitle) )
 552                         return true;
 553                 return false;
 554         }
 555
 556         /**
 557          * Check if target page is missing, happens when index is out of date
 558          *
 559          * @return Boolean
 560          */
 561         function isMissingRevision(){
 562                 return !$this->mRevision && !$this->mImage;
 563         }
 564
 565         /**
 566          * @return Title
 567          */
 568         function getTitle() {
 569                 return $this->mTitle;
 570         }
 571
 572         /**
 573          * @return Double or null if not supported
 574          */
 575         function getScore() {
 576                 return null;
 577         }
 578
 579         /**
 580          * Lazy initialization of article text from DB
 581          */
 582         protected function initText(){
 583                 if( !isset($this->mText) ){
 584                         if($this->mRevision != null)
 585                                 $this->mText = $this->mRevision->getText();
 586                         else // TODO: can we fetch raw wikitext for commons images?
 587                                 $this->mText = '';
 588
 589                 }
 590         }
 591
 592         /**
 593          * @param $terms Array: terms to highlight
 594          * @return String: highlighted text snippet, null (and not '') if not supported
 595          */
 596         function getTextSnippet($terms){
 597                 global $wgUser, $wgAdvancedSearchHighlighting;
 598                 $this->initText();
 599                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 600                 $h = new SearchHighlighter();
 601                 if( $wgAdvancedSearchHighlighting )
 602                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 603                 else
 604                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 605         }
 606
 607         /**
 608          * @param $terms Array: terms to highlight
 609          * @return String: highlighted title, '' if not supported
 610          */
 611         function getTitleSnippet($terms){
 612                 return '';
 613         }
 614
 615         /**
 616          * @param $terms Array: terms to highlight
 617          * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
 618          */
 619         function getRedirectSnippet($terms){
 620                 return '';
 621         }
 622
 623         /**
 624          * @return Title object for the redirect to this page, null if none or not supported
 625          */
 626         function getRedirectTitle(){
 627                 return null;
 628         }
 629
 630         /**
 631          * @return string highlighted relevant section name, null if none or not supported
 632          */
 633         function getSectionSnippet(){
 634                 return '';
 635         }
 636
 637         /**
 638          * @return Title object (pagename+fragment) for the section, null if none or not supported
 639          */
 640         function getSectionTitle(){
 641                 return null;
 642         }
 643
 644         /**
 645          * @return String: timestamp
 646          */
 647         function getTimestamp(){
 648                 if( $this->mRevision )
 649                         return $this->mRevision->getTimestamp();
 650                 else if( $this->mImage )
 651                         return $this->mImage->getTimestamp();
 652                 return '';
 653         }
 654
 655         /**
 656          * @return Integer: number of words
 657          */
 658         function getWordCount(){
 659                 $this->initText();
 660                 return str_word_count( $this->mText );
 661         }
 662
 663         /**
 664          * @return Integer: size in bytes
 665          */
 666         function getByteSize(){
 667                 $this->initText();
 668                 return strlen( $this->mText );
 669         }
 670
 671         /**
 672          * @return Boolean if hit has related articles
 673          */
 674         function hasRelated(){
 675                 return false;
 676         }
 677
 678         /**
 679          * @return String: interwiki prefix of the title (return iw even if title is broken)
 680          */
 681         function getInterwikiPrefix(){
 682                 return '';
 683         }
 684 }
 685
 686 /**
 687  * Highlight bits of wikitext
 688  *
 689  * @ingroup Search
 690  */
 691 class SearchHighlighter {
 692         var $mCleanWikitext = true;
 693
	/**
	 * @param $cleanupWikitext Boolean: value stored in $mCleanWikitext
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Old PHP 4-style constructor name, kept as a plain method so code
	 * calling parent::SearchHighlighter() keeps working. PHP 8 no longer
	 * treats a method named after its class as the constructor, so the
	 * real work lives in __construct().
	 *
	 * @param $cleanupWikitext Boolean
	 */
	function SearchHighlighter( $cleanupWikitext = true ) {
		$this->__construct( $cleanupWikitext );
	}
 697
 698         /**
 699          * Default implementation of wikitext highlighting
 700          *
 701          * @param $text String
 702          * @param $terms Array: terms to highlight (unescaped)
 703          * @param $contextlines Integer
 704          * @param $contextchars Integer
 705          * @return String
 706          */
 707         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 708                 global $wgLang, $wgContLang;
 709                 global $wgSearchHighlightBoundaries;
 710                 $fname = __METHOD__;
 711
 712                 if($text == '')
 713                         return '';
 714
 715                 // spli text into text + templates/links/tables
 716                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 717                 // first capture group is for detecting nested templates/links/tables/references
 718                 $endPatterns = array(
 719                         1 => '/(\{\{)|(\}\})/', // template
 720                         2 => '/(\[\[)|(\]\])/', // image
 721                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 722
 723                 // FIXME: this should prolly be a hook or something
 724                 if(function_exists('wfCite')){
 725                         $spat .= '|(<ref>)'; // references via cite extension
 726                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 727                 }
 728                 $spat .= '/';
 729                 $textExt = array(); // text extracts
 730                 $otherExt = array();  // other extracts
 731                 wfProfileIn( "$fname-split" );
 732                 $start = 0;
 733                 $textLen = strlen($text);
 734                 $count = 0; // sequence number to maintain ordering
 735                 while( $start < $textLen ){
 736                         // find start of template/image/table
 737                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 738                                 $epat = '';
 739                                 foreach($matches as $key => $val){
 740                                         if($key > 0 && $val[1] != -1){
 741                                                 if($key == 2){
 742                                                         // see if this is an image link
 743                                                         $ns = substr($val[0],2,-1);
 744                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 745                                                                 break;
 746
 747                                                 }
 748                                                 $epat = $endPatterns[$key];
 749                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 750                                                 $start = $val[1];
 751                                                 break;
 752                                         }
 753                                 }
 754                                 if( $epat ){
 755                                         // find end (and detect any nested elements)
 756                                         $level = 0;
 757                                         $offset = $start + 1;
 758                                         $found = false;
 759                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 760                                                 if( array_key_exists(2,$endMatches) ){
 761                                                         // found end
 762                                                         if($level == 0){
 763                                                                 $len = strlen($endMatches[2][0]);
 764                                                                 $off = $endMatches[2][1];
 765                                                                 $this->splitAndAdd( $otherExt, $count,
 766                                                                         substr( $text, $start, $off + $len  - $start ) );
 767                                                                 $start = $off + $len;
 768                                                                 $found = true;
 769                                                                 break;
 770                                                         } else{
 771                                                                 // end of nested element
 772                                                                 $level -= 1;
 773                                                         }
 774                                                 } else{
 775                                                         // nested
 776                                                         $level += 1;
 777                                                 }
 778                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 779                                         }
 780                                         if( ! $found ){
 781                                                 // couldn't find appropriate closing tag, skip
 782                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 783                                                 $start += strlen($matches[0][0]);
 784                                         }
 785                                         continue;
 786                                 }
 787                         }
 788                         // else: add as text extract
 789                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 790                         break;
 791                 }
 792
 793                 $all = $textExt + $otherExt; // these have disjunct key sets
 794
 795                 wfProfileOut( "$fname-split" );
 796
 797                 // prepare regexps
 798                 foreach( $terms as $index => $term ) {
 799                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 800                         if(preg_match('/[\x80-\xff]/', $term) ){
 801                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 802                         } else {
 803                                 $terms[$index] = $term;
 804                         }
 805                 }
 806                 $anyterm = implode( '|', $terms );
 807                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 808
 809                 // FIXME: a hack to scale contextchars, a correct solution
 810                 // would be to have contextchars actually be char and not byte
 811                 // length, and do proper utf-8 substrings and lengths everywhere,
 812                 // but PHP is making that very hard and unclean to implement :(
 813                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 814                 $contextchars = intval( $contextchars * $scale );
 815
 816                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 817                 $patPost = "($wgSearchHighlightBoundaries|$)";
 818
 819                 $pat1 = "/(".$phrase.")/ui";
 820                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 821
 822                 wfProfileIn( "$fname-extract" );
 823
 824                 $left = $contextlines;
 825
 826                 $snippets = array();
 827                 $offsets = array();
 828
 829                 // show beginning only if it contains all words
 830                 $first = 0;
 831                 $firstText = '';
 832                 foreach($textExt as $index => $line){
 833                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 834                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 835                                 $first = $index;
 836                                 break;
 837                         }
 838                 }
 839                 if( $firstText ){
 840                         $succ = true;
 841                         // check if first text contains all terms
 842                         foreach($terms as $term){
 843                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 844                                         $succ = false;
 845                                         break;
 846                                 }
 847                         }
 848                         if( $succ ){
 849                                 $snippets[$first] = $firstText;
 850                                 $offsets[$first] = 0;
 851                         }
 852                 }
 853                 if( ! $snippets ) {
 854                         // match whole query on text
 855                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 856                         // match whole query on templates/tables/images
 857                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 858                         // match any words on text
 859                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 860                         // match any words on templates/tables/images
 861                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 862
 863                         ksort($snippets);
 864                 }
 865
 866                 // add extra chars to each snippet to make snippets constant size
 867                 $extended = array();
 868                 if( count( $snippets ) == 0){
 869                         // couldn't find the target words, just show beginning of article
 870                         $targetchars = $contextchars * $contextlines;
 871                         $snippets[$first] = '';
 872                         $offsets[$first] = 0;
 873                 } else{
 874                         // if begin of the article contains the whole phrase, show only that !!
 875                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 876                             && $offsets[$first] < $contextchars * 2 ){
 877                                 $snippets = array ($first => $snippets[$first]);
 878                         }
 879
 880                         // calc by how much to extend existing snippets
 881                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 882                 }
 883
 884                 foreach($snippets as $index => $line){
 885                         $extended[$index] = $line;
 886                         $len = strlen($line);
 887                         if( $len < $targetchars - 20 ){
 888                                 // complete this line
 889                                 if($len < strlen( $all[$index] )){
 890                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 891                                         $len = strlen( $extended[$index] );
 892                                 }
 893
 894                                 // add more lines
 895                                 $add = $index + 1;
 896                                 while( $len < $targetchars - 20
 897                                        && array_key_exists($add,$all)
 898                                        && !array_key_exists($add,$snippets) ){
 899                                     $offsets[$add] = 0;
 900                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 901                                         $extended[$add] = $tt;
 902                                         $len += strlen( $tt );
 903                                         $add++;
 904                                 }
 905                         }
 906                 }
 907
 908                 //$snippets = array_map('htmlspecialchars', $extended);
 909                 $snippets = $extended;
 910                 $last = -1;
 911                 $extract = '';
 912                 foreach($snippets as $index => $line){
 913                         if($last == -1)
 914                                 $extract .= $line; // first line
 915                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 916                                 $extract .= " ".$line; // continous lines
 917                         else
 918                                 $extract .= '<b> ... </b>' . $line;
 919
 920                         $last = $index;
 921                 }
 922                 if( $extract )
 923                         $extract .= '<b> ... </b>';
 924
 925                 $processed = array();
 926                 foreach($terms as $term){
 927                         if( ! isset($processed[$term]) ){
 928                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 929                                 $extract = preg_replace( $pat3,
 930                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 931                                 $processed[$term] = true;
 932                         }
 933                 }
 934
 935                 wfProfileOut( "$fname-extract" );
 936
 937                 return $extract;
 938         }
 939
 940         /**
 941          * Split text into lines and add it to extracts array
 942          *
 943          * @param $extracts Array: index -> $line
 944          * @param $count Integer
 945          * @param $text String
 946          */
 947         function splitAndAdd(&$extracts, &$count, $text){
 948                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 949                 foreach($split as $line){
 950                         $tt = trim($line);
 951                         if( $tt )
 952                                 $extracts[$count++] = $tt;
 953                 }
 954         }
 955
 956         /**
 957          * Do manual case conversion for non-ascii chars
 958          *
 959          * @param $matches Array
 960          */
 961         function caseCallback($matches){
 962                 global $wgContLang;
 963                 if( strlen($matches[0]) > 1 ){
 964                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 965                 } else
 966                         return $matches[0];
 967         }
 968
 969         /**
 970          * Extract part of the text from start to end, but by
 971          * not chopping up words
 972          * @param $text String
 973          * @param $start Integer
 974          * @param $end Integer
 975          * @param $posStart Integer: (out) actual start position
 976          * @param $posEnd Integer: (out) actual end position
 977          * @return String
 978          */
 979         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 980                 global $wgContLang;
 981
 982                 if( $start != 0)
 983                         $start = $this->position( $text, $start, 1 );
 984                 if( $end >= strlen($text) )
 985                         $end = strlen($text);
 986                 else
 987                         $end = $this->position( $text, $end );
 988
 989                 if(!is_null($posStart))
 990                         $posStart = $start;
 991                 if(!is_null($posEnd))
 992                         $posEnd = $end;
 993
 994                 if($end > $start)
 995                         return substr($text, $start, $end-$start);
 996                 else
 997                         return '';
 998         }
 999
1000         /**
1001          * Find a nonletter near a point (index) in the text
1002          *
1003          * @param $text String
1004          * @param $point Integer
1005          * @param $offset Integer: offset to found index
1006          * @return Integer: nearest nonletter index, or beginning of utf8 char if none
1007          */
1008         function position($text, $point, $offset=0 ){
1009                 $tolerance = 10;
1010                 $s = max( 0, $point - $tolerance );
1011                 $l = min( strlen($text), $point + $tolerance ) - $s;
1012                 $m = array();
1013                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
1014                         return $m[0][1] + $s + $offset;
1015                 } else{
1016                         // check if point is on a valid first UTF8 char
1017                         $char = ord( $text[$point] );
1018                         while( $char >= 0x80 && $char < 0xc0 ) {
1019                                 // skip trailing bytes
1020                                 $point++;
1021                                 if($point >= strlen($text))
1022                                         return strlen($text);
1023                                 $char = ord( $text[$point] );
1024                         }
1025                         return $point;
1026
1027                 }
1028         }
1029
1030         /**
1031          * Search extracts for a pattern, and return snippets
1032          *
1033          * @param $pattern String: regexp for matching lines
1034          * @param $extracts Array: extracts to search
1035          * @param $linesleft Integer: number of extracts to make
1036          * @param $contextchars Integer: length of snippet
1037          * @param $out Array: map for highlighted snippets
1038          * @param $offsets Array: map of starting points of snippets
1039          * @protected
1040          */
1041         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1042                 if($linesleft == 0)
1043                         return; // nothing to do
1044                 foreach($extracts as $index => $line){
1045                         if( array_key_exists($index,$out) )
1046                                 continue; // this line already highlighted
1047
1048                         $m = array();
1049                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1050                                 continue;
1051
1052                         $offset = $m[0][1];
1053                         $len = strlen($m[0][0]);
1054                         if($offset + $len < $contextchars)
1055                                 $begin = 0;
1056                         elseif( $len > $contextchars)
1057                                 $begin = $offset;
1058                         else
1059                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1060
1061                         $end = $begin + $contextchars;
1062
1063                         $posBegin = $begin;
1064                         // basic snippet from this line
1065                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1066                         $offsets[$index] = $posBegin;
1067                         $linesleft--;
1068                         if($linesleft == 0)
1069                                 return;
1070                 }
1071         }
1072
1073         /**
1074          * Basic wikitext removal
1075          * @protected
1076          */
1077         function removeWiki($text) {
1078                 $fname = __METHOD__;
1079                 wfProfileIn( $fname );
1080
1081                 //$text = preg_replace("/'{2,5}/", "", $text);
1082                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1083                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1084                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1085                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1086                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1087                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1088                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1089                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1090                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1091                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1092                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1093                 $text = preg_replace("/'''''/", "", $text);
1094                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1095                 $text = preg_replace("/''/", "", $text);
1096
1097                 wfProfileOut( $fname );
1098                 return $text;
1099         }
1100
1101         /**
1102          * callback to replace [[target|caption]] kind of links, if
1103          * the target is category or image, leave it
1104          *
1105          * @param $matches Array
1106          */
1107         function linkReplace($matches){
1108                 $colon = strpos( $matches[1], ':' );
1109                 if( $colon === false )
1110                         return $matches[2]; // replace with caption
1111                 global $wgContLang;
1112                 $ns = substr( $matches[1], 0, $colon );
1113                 $index = $wgContLang->getNsIndex($ns);
1114                 if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
1115                         return $matches[0]; // return the whole thing
1116                 else
1117                         return $matches[2];
1118
1119         }
1120
1121         /**
1122      * Simple & fast snippet extraction, but gives completely unrelevant
1123      * snippets
1124      *
1125      * @param $text String
1126      * @param $terms Array
1127      * @param $contextlines Integer
1128      * @param $contextchars Integer
1129      * @return String
1130      */
1131     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1132         global $wgLang, $wgContLang;
1133         $fname = __METHOD__;
1134
1135         $lines = explode( "\n", $text );
1136
1137         $terms = implode( '|', $terms );
1138         $max = intval( $contextchars ) + 1;
1139         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1140
1141         $lineno = 0;
1142
1143         $extract = "";
1144         wfProfileIn( "$fname-extract" );
1145         foreach ( $lines as $line ) {
1146             if ( 0 == $contextlines ) {
1147                 break;
1148             }
1149             ++$lineno;
1150             $m = array();
1151             if ( ! preg_match( $pat1, $line, $m ) ) {
1152                 continue;
1153             }
1154             --$contextlines;
1155             $pre = $wgContLang->truncate( $m[1], -$contextchars );
1156
1157             if ( count( $m ) < 3 ) {
1158                 $post = '';
1159             } else {
1160                 $post = $wgContLang->truncate( $m[3], $contextchars );
1161             }
1162
1163             $found = $m[2];
1164
1165             $line = htmlspecialchars( $pre . $found . $post );
1166             $pat2 = '/(' . $terms . ")/i";
1167             $line = preg_replace( $pat2,
1168               "<span class='searchmatch'>\\1</span>", $line );
1169
1170             $extract .= "${line}\n";
1171         }
1172         wfProfileOut( "$fname-extract" );
1173
1174         return $extract;
1175     }
1176
1177 }
1178
/**
 * Placeholder search backend for database engines that have no supported
 * search implementation. It adds nothing to SearchEngine and overrides
 * nothing.
 *
 * @todo Fixme: dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
}