includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $searchTerms = array();
  17         var $namespaces = array( NS_MAIN );
  18         var $showRedirects = false;
  19
  20         /**
  21          * Perform a full text search query and return a result set.
  22          * If title searches are not supported or disabled, return null.
  23          *
  24          * @param string $term - Raw search term
  25          * @return SearchResultSet
  26          * @access public
  27          * @abstract
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          *
  37          * @param string $term - Raw search term
  38          * @return SearchResultSet
  39          * @access public
  40          * @abstract
  41          */
  42         function searchTitle( $term ) {
  43                 return null;
  44         }
  45
  46         /**
  47          * If an exact title match can be find, or a very slightly close match,
  48          * return the title. If no match, returns NULL.
  49          *
  50          * @param string $term
  51          * @return Title
  52          */
  53         public static function getNearMatch( $searchterm ) {
  54                 global $wgContLang;
  55
  56                 $allSearchTerms = array($searchterm);
  57
  58                 if($wgContLang->hasVariants()){
  59                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  60                 }
  61
  62                 foreach($allSearchTerms as $term){
  63
  64                         # Exact match? No need to look further.
  65                         $title = Title::newFromText( $term );
  66                         if (is_null($title))
  67                                 return NULL;
  68
  69                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal()
  70                              || $title->exists() ) {
  71                                 return $title;
  72                         }
  73
  74                         # Now try all lower case (i.e. first letter capitalized)
  75                         #
  76                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  77                         if ( $title && $title->exists() ) {
  78                                 return $title;
  79                         }
  80
  81                         # Now try capitalized string
  82                         #
  83                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  84                         if ( $title && $title->exists() ) {
  85                                 return $title;
  86                         }
  87
  88                         # Now try all upper case
  89                         #
  90                         $title = Title::newFromText( $wgContLang->uc( $term ) );
  91                         if ( $title && $title->exists() ) {
  92                                 return $title;
  93                         }
  94
  95                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
  96                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
  97                         if ( $title && $title->exists() ) {
  98                                 return $title;
  99                         }
 100
 101                         global $wgCapitalLinks, $wgContLang;
 102                         if( !$wgCapitalLinks ) {
 103                                 // Catch differs-by-first-letter-case-only
 104                                 $title = Title::newFromText( $wgContLang->ucfirst( $term ) );
 105                                 if ( $title && $title->exists() ) {
 106                                         return $title;
 107                                 }
 108                                 $title = Title::newFromText( $wgContLang->lcfirst( $term ) );
 109                                 if ( $title && $title->exists() ) {
 110                                         return $title;
 111                                 }
 112                         }
 113
 114                         // Give hooks a chance at better match variants
 115                         $title = null;
 116                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 117                                 return $title;
 118                         }
 119                 }
 120
 121                 $title = Title::newFromText( $searchterm );
 122
 123                 # Entering an IP address goes to the contributions page
 124                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 125                         || User::isIP( trim( $searchterm ) ) ) {
 126                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 127                 }
 128
 129
 130                 # Entering a user goes to the user page whether it's there or not
 131                 if ( $title->getNamespace() == NS_USER ) {
 132                         return $title;
 133                 }
 134
 135                 # Go to images that exist even if there's no local page.
 136                 # There may have been a funny upload, or it may be on a shared
 137                 # file repository such as Wikimedia Commons.
 138                 if( $title->getNamespace() == NS_IMAGE ) {
 139                         $image = wfFindFile( $title );
 140                         if( $image ) {
 141                                 return $title;
 142                         }
 143                 }
 144
 145                 # MediaWiki namespace? Page may be "implied" if not customized.
 146                 # Just return it, with caps forced as the message system likes it.
 147                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 148                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 149                 }
 150
 151                 # Quoted term? Try without the quotes...
 152                 $matches = array();
 153                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 154                         return SearchEngine::getNearMatch( $matches[1] );
 155                 }
 156
 157                 return NULL;
 158         }
 159
 160         public static function legalSearchChars() {
 161                 return "A-Za-z_'0-9\\x80-\\xFF\\-";
 162         }
 163
 164         /**
 165          * Set the maximum number of results to return
 166          * and how many to skip before returning the first.
 167          *
 168          * @param int $limit
 169          * @param int $offset
 170          * @access public
 171          */
 172         function setLimitOffset( $limit, $offset = 0 ) {
 173                 $this->limit = intval( $limit );
 174                 $this->offset = intval( $offset );
 175         }
 176
 177         /**
 178          * Set which namespaces the search should include.
 179          * Give an array of namespace index numbers.
 180          *
 181          * @param array $namespaces
 182          * @access public
 183          */
 184         function setNamespaces( $namespaces ) {
 185                 $this->namespaces = $namespaces;
 186         }
 187
 188         /**
 189          * Parse some common prefixes: all (search everything)
 190          * or namespace names
 191          *
 192          * @param string $query
 193          */
 194         function replacePrefixes( $query ){
 195                 global $wgContLang;
 196
 197                 if( strpos($query,':') === false )
 198                         return $query; // nothing to do
 199
 200                 $parsed = $query;
 201                 $allkeyword = wfMsgForContent('searchall').":";
 202                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 203                         $this->namespaces = null;
 204                         $parsed = substr($query,strlen($allkeyword));
 205                 } else if( strpos($query,':') !== false ) {
 206                         $prefix = substr($query,0,strpos($query,':'));
 207                         $index = $wgContLang->getNsIndex($prefix);
 208                         if($index !== false){
 209                                 $this->namespaces = array($index);
 210                                 $parsed = substr($query,strlen($prefix)+1);
 211                         }
 212                 }
 213                 if(trim($parsed) == '')
 214                         return $query; // prefix was the whole query
 215
 216                 return $parsed;
 217         }
 218
 219         /**
 220          * Make a list of searchable namespaces and their canonical names.
 221          * @return array
 222          */
 223         public static function searchableNamespaces() {
 224                 global $wgContLang;
 225                 $arr = array();
 226                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 227                         if( $ns >= NS_MAIN ) {
 228                                 $arr[$ns] = $name;
 229                         }
 230                 }
 231                 return $arr;
 232         }
 233
 234         /**
 235          * Extract default namespaces to search from the given user's
 236          * settings, returning a list of index numbers.
 237          *
 238          * @param User $user
 239          * @return array
 240          * @static
 241          */
 242         public static function userNamespaces( &$user ) {
 243                 $arr = array();
 244                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 245                         if( $user->getOption( 'searchNs' . $ns ) ) {
 246                                 $arr[] = $ns;
 247                         }
 248                 }
 249                 return $arr;
 250         }
 251
 252         /**
 253          * Find snippet highlight settings for a given user
 254          *
 255          * @param User $user
 256          * @return array contextlines, contextchars
 257          * @static
 258          */
 259         public static function userHighlightPrefs( &$user ){
 260                 //$contextlines = $user->getOption( 'contextlines',  5 );
 261                 //$contextchars = $user->getOption( 'contextchars', 50 );
 262                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 263                 $contextchars = 75; // same as above.... :P
 264                 return array($contextlines, $contextchars);
 265         }
 266
 267         /**
 268          * An array of namespaces indexes to be searched by default
 269          *
 270          * @return array
 271          * @static
 272          */
 273         public static function defaultNamespaces(){
 274                 global $wgNamespacesToBeSearchedDefault;
 275
 276                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 277         }
 278
 279         /**
 280          * Return a 'cleaned up' search string
 281          *
 282          * @return string
 283          * @access public
 284          */
 285         function filter( $text ) {
 286                 $lc = $this->legalSearchChars();
 287                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 288         }
 289         /**
 290          * Load up the appropriate search engine class for the currently
 291          * active database backend, and return a configured instance.
 292          *
 293          * @return SearchEngine
 294          */
 295         public static function create() {
 296                 global $wgDBtype, $wgSearchType;
 297                 if( $wgSearchType ) {
 298                         $class = $wgSearchType;
 299                 } elseif( $wgDBtype == 'mysql' ) {
 300                         $class = 'SearchMySQL';
 301                 } else if ( $wgDBtype == 'postgres' ) {
 302                         $class = 'SearchPostgres';
 303                 } else if ( $wgDBtype == 'oracle' ) {
 304                         $class = 'SearchOracle';
 305                 } else {
 306                         $class = 'SearchEngineDummy';
 307                 }
 308                 $search = new $class( wfGetDB( DB_SLAVE ) );
 309                 $search->setLimitOffset(0,0);
 310                 return $search;
 311         }
 312
 313         /**
 314          * Create or update the search index record for the given page.
 315          * Title and text should be pre-processed.
 316          *
 317          * @param int $id
 318          * @param string $title
 319          * @param string $text
 320          * @abstract
 321          */
 322         function update( $id, $title, $text ) {
 323                 // no-op
 324         }
 325
 326         /**
 327          * Update a search index record's title only.
 328          * Title should be pre-processed.
 329          *
 330          * @param int $id
 331          * @param string $title
 332          * @abstract
 333          */
 334         function updateTitle( $id, $title ) {
 335                 // no-op
 336         }
 337
 338         /**
 339          * Get OpenSearch suggestion template
 340          *
 341          * @return string
 342          * @static
 343          */
 344         public static function getOpenSearchTemplate() {
 345                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 346                 if($wgOpenSearchTemplate)
 347                         return $wgOpenSearchTemplate;
 348                 else{
 349                         $ns = implode(',',SearchEngine::defaultNamespaces());
 350                         if(!$ns) $ns = "0";
 351                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 352                 }
 353         }
 354
 355         /**
 356          * Get internal MediaWiki Suggest template
 357          *
 358          * @return string
 359          * @static
 360          */
 361         public static function getMWSuggestTemplate() {
 362                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 363                 if($wgMWSuggestTemplate)
 364                         return $wgMWSuggestTemplate;
 365                 else
 366                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 367         }
 368 }
 369
 370 /**
 371  * @ingroup Search
 372  */
 373 class SearchResultSet {
 374         /**
 375          * Fetch an array of regular expression fragments for matching
 376          * the search terms as parsed by this engine in a text extract.
 377          *
 378          * @return array
 379          * @access public
 380          * @abstract
 381          */
 382         function termMatches() {
 383                 return array();
 384         }
 385
 386         function numRows() {
 387                 return 0;
 388         }
 389
 390         /**
 391          * Return true if results are included in this result set.
 392          * @return bool
 393          * @abstract
 394          */
 395         function hasResults() {
 396                 return false;
 397         }
 398
 399         /**
 400          * Some search modes return a total hit count for the query
 401          * in the entire article database. This may include pages
 402          * in namespaces that would not be matched on the given
 403          * settings.
 404          *
 405          * Return null if no total hits number is supported.
 406          *
 407          * @return int
 408          * @access public
 409          */
 410         function getTotalHits() {
 411                 return null;
 412         }
 413
 414         /**
 415          * Some search modes return a suggested alternate term if there are
 416          * no exact hits. Returns true if there is one on this set.
 417          *
 418          * @return bool
 419          * @access public
 420          */
 421         function hasSuggestion() {
 422                 return false;
 423         }
 424
 425         /**
 426          * @return string suggested query, null if none
 427          */
 428         function getSuggestionQuery(){
 429                 return null;
 430         }
 431
 432         /**
 433          * @return string highlighted suggested query, '' if none
 434          */
 435         function getSuggestionSnippet(){
 436                 return '';
 437         }
 438
 439         /**
 440          * Return information about how and from where the results were fetched,
 441          * should be useful for diagnostics and debugging
 442          *
 443          * @return string
 444          */
 445         function getInfo() {
 446                 return null;
 447         }
 448
 449         /**
 450          * Return a result set of hits on other (multiple) wikis associated with this one
 451          *
 452          * @return SearchResultSet
 453          */
 454         function getInterwikiResults() {
 455                 return null;
 456         }
 457
 458         /**
 459          * Check if there are results on other wikis
 460          *
 461          * @return boolean
 462          */
 463         function hasInterwikiResults() {
 464                 return $this->getInterwikiResults() != null;
 465         }
 466
 467
 468         /**
 469          * Fetches next search result, or false.
 470          * @return SearchResult
 471          * @access public
 472          * @abstract
 473          */
 474         function next() {
 475                 return false;
 476         }
 477
 478         /**
 479          * Frees the result set, if applicable.
 480          * @ access public
 481          */
 482         function free() {
 483                 // ...
 484         }
 485 }
 486
 487
 488 /**
 489  * @ingroup Search
 490  */
 491 class SearchResultTooMany {
 492         ## Some search engines may bail out if too many matches are found
 493 }
 494
 495
 496 /**
 497  * @ingroup Search
 498  */
 499 class SearchResult {
 500         var $mRevision = null;
 501
 502         function SearchResult( $row ) {
 503                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 504                 if( !is_null($this->mTitle) )
 505                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 506         }
 507
 508         /**
 509          * Check if this is result points to an invalid title
 510          *
 511          * @return boolean
 512          * @access public
 513          */
 514         function isBrokenTitle(){
 515                 if( is_null($this->mTitle) )
 516                         return true;
 517                 return false;
 518         }
 519
 520         /**
 521          * Check if target page is missing, happens when index is out of date
 522          *
 523          * @return boolean
 524          * @access public
 525          */
 526         function isMissingRevision(){
 527                 if( !$this->mRevision )
 528                         return true;
 529                 return false;
 530         }
 531
 532         /**
 533          * @return Title
 534          * @access public
 535          */
 536         function getTitle() {
 537                 return $this->mTitle;
 538         }
 539
 540         /**
 541          * @return double or null if not supported
 542          */
 543         function getScore() {
 544                 return null;
 545         }
 546
 547         /**
 548          * Lazy initialization of article text from DB
 549          */
 550         protected function initText(){
 551                 if( !isset($this->mText) ){
 552                         $this->mText = $this->mRevision->getText();
 553                 }
 554         }
 555
 556         /**
 557          * @param array $terms terms to highlight
 558          * @return string highlighted text snippet, null (and not '') if not supported
 559          */
 560         function getTextSnippet($terms){
 561                 global $wgUser, $wgAdvancedSearchHighlighting;
 562                 $this->initText();
 563                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 564                 $h = new SearchHighlighter();
 565                 if( $wgAdvancedSearchHighlighting )
 566                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 567                 else
 568                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 569         }
 570
 571         /**
 572          * @param array $terms terms to highlight
 573          * @return string highlighted title, '' if not supported
 574          */
 575         function getTitleSnippet($terms){
 576                 return '';
 577         }
 578
 579         /**
 580          * @param array $terms terms to highlight
 581          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 582          */
 583         function getRedirectSnippet($terms){
 584                 return '';
 585         }
 586
 587         /**
 588          * @return Title object for the redirect to this page, null if none or not supported
 589          */
 590         function getRedirectTitle(){
 591                 return null;
 592         }
 593
 594         /**
 595          * @return string highlighted relevant section name, null if none or not supported
 596          */
 597         function getSectionSnippet(){
 598                 return '';
 599         }
 600
 601         /**
 602          * @return Title object (pagename+fragment) for the section, null if none or not supported
 603          */
 604         function getSectionTitle(){
 605                 return null;
 606         }
 607
 608         /**
 609          * @return string timestamp
 610          */
 611         function getTimestamp(){
 612                 return $this->mRevision->getTimestamp();
 613         }
 614
 615         /**
 616          * @return int number of words
 617          */
 618         function getWordCount(){
 619                 $this->initText();
 620                 return str_word_count( $this->mText );
 621         }
 622
 623         /**
 624          * @return int size in bytes
 625          */
 626         function getByteSize(){
 627                 $this->initText();
 628                 return strlen( $this->mText );
 629         }
 630
 631         /**
 632          * @return boolean if hit has related articles
 633          */
 634         function hasRelated(){
 635                 return false;
 636         }
 637
 638         /**
 639          * @return interwiki prefix of the title (return iw even if title is broken)
 640          */
 641         function getInterwikiPrefix(){
 642                 return '';
 643         }
 644 }
 645
 646 /**
 647  * Highlight bits of wikitext
 648  *
 649  * @ingroup Search
 650  */
 651 class SearchHighlighter {
 652         var $mCleanWikitext = true;
 653
 654         function SearchHighlighter($cleanupWikitext = true){
 655                 $this->mCleanWikitext = $cleanupWikitext;
 656         }
 657
 658         /**
 659          * Default implementation of wikitext highlighting
 660          *
 661          * @param string $text
 662          * @param array $terms Terms to highlight (unescaped)
 663          * @param int $contextlines
 664          * @param int $contextchars
 665          * @return string
 666          */
 667         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 668                 global $wgLang, $wgContLang;
 669                 global $wgSearchHighlightBoundaries;
 670                 $fname = __METHOD__;
 671
 672                 if($text == '')
 673                         return '';
 674
 675                 // spli text into text + templates/links/tables
 676                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 677                 // first capture group is for detecting nested templates/links/tables/references
 678                 $endPatterns = array(
 679                         1 => '/(\{\{)|(\}\})/', // template
 680                         2 => '/(\[\[)|(\]\])/', // image
 681                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 682
 683                 // FIXME: this should prolly be a hook or something
 684                 if(function_exists('wfCite')){
 685                         $spat .= '|(<ref>)'; // references via cite extension
 686                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 687                 }
 688                 $spat .= '/';
 689                 $textExt = array(); // text extracts
 690                 $otherExt = array();  // other extracts
 691                 wfProfileIn( "$fname-split" );
 692                 $start = 0;
 693                 $textLen = strlen($text);
 694                 $count = 0; // sequence number to maintain ordering
 695                 while( $start < $textLen ){
 696                         // find start of template/image/table
 697                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 698                                 $epat = '';
 699                                 foreach($matches as $key => $val){
 700                                         if($key > 0 && $val[1] != -1){
 701                                                 if($key == 2){
 702                                                         // see if this is an image link
 703                                                         $ns = substr($val[0],2,-1);
 704                                                         if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
 705                                                                 break;
 706
 707                                                 }
 708                                                 $epat = $endPatterns[$key];
 709                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 710                                                 $start = $val[1];
 711                                                 break;
 712                                         }
 713                                 }
 714                                 if( $epat ){
 715                                         // find end (and detect any nested elements)
 716                                         $level = 0;
 717                                         $offset = $start + 1;
 718                                         $found = false;
 719                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 720                                                 if( array_key_exists(2,$endMatches) ){
 721                                                         // found end
 722                                                         if($level == 0){
 723                                                                 $len = strlen($endMatches[2][0]);
 724                                                                 $off = $endMatches[2][1];
 725                                                                 $this->splitAndAdd( $otherExt, $count,
 726                                                                         substr( $text, $start, $off + $len  - $start ) );
 727                                                                 $start = $off + $len;
 728                                                                 $found = true;
 729                                                                 break;
 730                                                         } else{
 731                                                                 // end of nested element
 732                                                                 $level -= 1;
 733                                                         }
 734                                                 } else{
 735                                                         // nested
 736                                                         $level += 1;
 737                                                 }
 738                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 739                                         }
 740                                         if( ! $found ){
 741                                                 // couldn't find appropriate closing tag, skip
 742                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 743                                                 $start += strlen($matches[0][0]);
 744                                         }
 745                                         continue;
 746                                 }
 747                         }
 748                         // else: add as text extract
 749                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 750                         break;
 751                 }
 752
 753                 $all = $textExt + $otherExt; // these have disjunct key sets
 754
 755                 wfProfileOut( "$fname-split" );
 756
 757                 // prepare regexps
 758                 foreach( $terms as $index => $term ) {
 759                         $terms[$index] = preg_quote( $term, '/' );
 760                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 761                         if(preg_match('/[\x80-\xff]/', $term) ){
 762                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 763                         }
 764
 765
 766                 }
 767                 $anyterm = implode( '|', $terms );
 768                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 769
 770                 // FIXME: a hack to scale contextchars, a correct solution
 771                 // would be to have contextchars actually be char and not byte
 772                 // length, and do proper utf-8 substrings and lengths everywhere,
 773                 // but PHP is making that very hard and unclean to implement :(
 774                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 775                 $contextchars = intval( $contextchars * $scale );
 776
 777                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 778                 $patPost = "($wgSearchHighlightBoundaries|$)";
 779
 780                 $pat1 = "/(".$phrase.")/ui";
 781                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 782
 783                 wfProfileIn( "$fname-extract" );
 784
 785                 $left = $contextlines;
 786
 787                 $snippets = array();
 788                 $offsets = array();
 789
 790                 // show beginning only if it contains all words
 791                 $first = 0;
 792                 $firstText = '';
 793                 foreach($textExt as $index => $line){
 794                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 795                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 796                                 $first = $index;
 797                                 break;
 798                         }
 799                 }
 800                 if( $firstText ){
 801                         $succ = true;
 802                         // check if first text contains all terms
 803                         foreach($terms as $term){
 804                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 805                                         $succ = false;
 806                                         break;
 807                                 }
 808                         }
 809                         if( $succ ){
 810                                 $snippets[$first] = $firstText;
 811                                 $offsets[$first] = 0;
 812                         }
 813                 }
 814                 if( ! $snippets ) {
 815                         // match whole query on text
 816                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 817                         // match whole query on templates/tables/images
 818                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 819                         // match any words on text
 820                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 821                         // match any words on templates/tables/images
 822                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 823
 824                         ksort($snippets);
 825                 }
 826
 827                 // add extra chars to each snippet to make snippets constant size
 828                 $extended = array();
 829                 if( count( $snippets ) == 0){
 830                         // couldn't find the target words, just show beginning of article
 831                         $targetchars = $contextchars * $contextlines;
 832                         $snippets[$first] = '';
 833                         $offsets[$first] = 0;
 834                 } else{
 835                         // if begin of the article contains the whole phrase, show only that !!
 836                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 837                             && $offsets[$first] < $contextchars * 2 ){
 838                                 $snippets = array ($first => $snippets[$first]);
 839                         }
 840
 841                         // calc by how much to extend existing snippets
 842                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 843                 }
 844
 845                 foreach($snippets as $index => $line){
 846                         $extended[$index] = $line;
 847                         $len = strlen($line);
 848                         if( $len < $targetchars - 20 ){
 849                                 // complete this line
 850                                 if($len < strlen( $all[$index] )){
 851                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 852                                         $len = strlen( $extended[$index] );
 853                                 }
 854
 855                                 // add more lines
 856                                 $add = $index + 1;
 857                                 while( $len < $targetchars - 20
 858                                        && array_key_exists($add,$all)
 859                                        && !array_key_exists($add,$snippets) ){
 860                                     $offsets[$add] = 0;
 861                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 862                                         $extended[$add] = $tt;
 863                                         $len += strlen( $tt );
 864                                         $add++;
 865                                 }
 866                         }
 867                 }
 868
 869                 //$snippets = array_map('htmlspecialchars', $extended);
 870                 $snippets = $extended;
 871                 $last = -1;
 872                 $extract = '';
 873                 foreach($snippets as $index => $line){
 874                         if($last == -1)
 875                                 $extract .= $line; // first line
 876                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 877                                 $extract .= " ".$line; // continous lines
 878                         else
 879                                 $extract .= '<b> ... </b>' . $line;
 880
 881                         $last = $index;
 882                 }
 883                 if( $extract )
 884                         $extract .= '<b> ... </b>';
 885
 886                 $processed = array();
 887                 foreach($terms as $term){
 888                         if( ! isset($processed[$term]) ){
 889                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 890                                 $extract = preg_replace( $pat3,
 891                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 892                                 $processed[$term] = true;
 893                         }
 894                 }
 895
 896                 wfProfileOut( "$fname-extract" );
 897
 898                 return $extract;
 899         }
 900
 901         /**
 902          * Split text into lines and add it to extracts array
 903          *
 904          * @param array $extracts index -> $line
 905          * @param int $count
 906          * @param string $text
 907          */
 908         function splitAndAdd(&$extracts, &$count, $text){
 909                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 910                 foreach($split as $line){
 911                         $tt = trim($line);
 912                         if( $tt )
 913                                 $extracts[$count++] = $tt;
 914                 }
 915         }
 916
 917         /**
 918          * Do manual case conversion for non-ascii chars
 919          *
 920          * @param unknown_type $matches
 921          */
 922         function caseCallback($matches){
 923                 global $wgContLang;
 924                 if( strlen($matches[0]) > 1 ){
 925                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 926                 } else
 927                         return $matches[0];
 928         }
 929
 930         /**
 931          * Extract part of the text from start to end, but by
 932          * not chopping up words
 933          * @param string $text
 934          * @param int $start
 935          * @param int $end
 936          * @param int $posStart (out) actual start position
 937          * @param int $posEnd (out) actual end position
 938          * @return string
 939          */
 940         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 941                 global $wgContLang;
 942
 943                 if( $start != 0)
 944                         $start = $this->position( $text, $start, 1 );
 945                 if( $end >= strlen($text) )
 946                         $end = strlen($text);
 947                 else
 948                         $end = $this->position( $text, $end );
 949
 950                 if(!is_null($posStart))
 951                         $posStart = $start;
 952                 if(!is_null($posEnd))
 953                         $posEnd = $end;
 954
 955                 if($end > $start)
 956                         return substr($text, $start, $end-$start);
 957                 else
 958                         return '';
 959         }
 960
 961         /**
 962          * Find a nonletter near a point (index) in the text
 963          *
 964          * @param string $text
 965          * @param int $point
 966          * @param int $offset to found index
 967          * @return int nearest nonletter index, or beginning of utf8 char if none
 968          */
 969         function position($text, $point, $offset=0 ){
 970                 $tolerance = 10;
 971                 $s = max( 0, $point - $tolerance );
 972                 $l = min( strlen($text), $point + $tolerance ) - $s;
 973                 $m = array();
 974                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
 975                         return $m[0][1] + $s + $offset;
 976                 } else{
 977                         // check if point is on a valid first UTF8 char
 978                         $char = ord( $text[$point] );
 979                         while( $char >= 0x80 && $char < 0xc0 ) {
 980                                 // skip trailing bytes
 981                                 $point++;
 982                                 if($point >= strlen($text))
 983                                         return strlen($text);
 984                                 $char = ord( $text[$point] );
 985                         }
 986                         return $point;
 987
 988                 }
 989         }
 990
 991         /**
 992          * Search extracts for a pattern, and return snippets
 993          *
 994          * @param string $pattern regexp for matching lines
 995          * @param array $extracts extracts to search
 996          * @param int $linesleft number of extracts to make
 997          * @param int $contextchars length of snippet
 998          * @param array $out map for highlighted snippets
 999          * @param array $offsets map of starting points of snippets
1000          * @protected
1001          */
1002         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1003                 if($linesleft == 0)
1004                         return; // nothing to do
1005                 foreach($extracts as $index => $line){
1006                         if( array_key_exists($index,$out) )
1007                                 continue; // this line already highlighted
1008
1009                         $m = array();
1010                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1011                                 continue;
1012
1013                         $offset = $m[0][1];
1014                         $len = strlen($m[0][0]);
1015                         if($offset + $len < $contextchars)
1016                                 $begin = 0;
1017                         elseif( $len > $contextchars)
1018                                 $begin = $offset;
1019                         else
1020                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1021
1022                         $end = $begin + $contextchars;
1023
1024                         $posBegin = $begin;
1025                         // basic snippet from this line
1026                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1027                         $offsets[$index] = $posBegin;
1028                         $linesleft--;
1029                         if($linesleft == 0)
1030                                 return;
1031                 }
1032         }
1033
1034         /**
1035          * Basic wikitext removal
1036          * @protected
1037          */
1038         function removeWiki($text) {
1039                 $fname = __METHOD__;
1040                 wfProfileIn( $fname );
1041
1042                 //$text = preg_replace("/'{2,5}/", "", $text);
1043                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1044                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1045                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1046                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1047                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1048                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1049                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1050                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1051                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1052                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1053                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1054                 $text = preg_replace("/'''''/", "", $text);
1055                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1056                 $text = preg_replace("/''/", "", $text);
1057
1058                 wfProfileOut( $fname );
1059                 return $text;
1060         }
1061
1062         /**
1063          * callback to replace [[target|caption]] kind of links, if
1064          * the target is category or image, leave it
1065          *
1066          * @param array $matches
1067          */
1068         function linkReplace($matches){
1069                 $colon = strpos( $matches[1], ':' );
1070                 if( $colon === false )
1071                         return $matches[2]; // replace with caption
1072                 global $wgContLang;
1073                 $ns = substr( $matches[1], 0, $colon );
1074                 $index = $wgContLang->getNsIndex($ns);
1075                 if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
1076                         return $matches[0]; // return the whole thing
1077                 else
1078                         return $matches[2];
1079
1080         }
1081
1082         /**
1083      * Simple & fast snippet extraction, but gives completely unrelevant
1084      * snippets
1085      *
1086      * @param string $text
1087      * @param array $terms
1088      * @param int $contextlines
1089      * @param int $contextchars
1090      * @return string
1091      */
1092     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1093         global $wgLang, $wgContLang;
1094         $fname = __METHOD__;
1095
1096         $lines = explode( "\n", $text );
1097
1098         $terms = implode( '|', $terms );
1099         $terms = str_replace( '/', "\\/", $terms);
1100         $max = intval( $contextchars ) + 1;
1101         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1102
1103         $lineno = 0;
1104
1105         $extract = "";
1106         wfProfileIn( "$fname-extract" );
1107         foreach ( $lines as $line ) {
1108             if ( 0 == $contextlines ) {
1109                 break;
1110             }
1111             ++$lineno;
1112             $m = array();
1113             if ( ! preg_match( $pat1, $line, $m ) ) {
1114                 continue;
1115             }
1116             --$contextlines;
1117             $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
1118
1119             if ( count( $m ) < 3 ) {
1120                 $post = '';
1121             } else {
1122                 $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
1123             }
1124
1125             $found = $m[2];
1126
1127             $line = htmlspecialchars( $pre . $found . $post );
1128             $pat2 = '/(' . $terms . ")/i";
1129             $line = preg_replace( $pat2,
1130               "<span class='searchmatch'>\\1</span>", $line );
1131
1132             $extract .= "${line}\n";
1133         }
1134         wfProfileOut( "$fname-extract" );
1135
1136         return $extract;
1137     }
1138
1139 }
1140
/**
 * Do-nothing search engine: every method is an empty stub and
 * evaluates to null.
 *
 * @ingroup Search
 */
class SearchEngineDummy {
        /**
         * @param string $term ignored
         * @return null always
         */
        function search( $term ) {
                return null;
        }

        /**
         * @param mixed $l ignored
         * @param mixed $o ignored
         */
        function setLimitOffset($l, $o) {
                // intentionally empty
        }

        function legalSearchChars() {
                // intentionally empty
        }

        function update() {
                // intentionally empty
        }

        function setnamespaces() {
                // intentionally empty
        }

        function searchtitle() {
                // intentionally empty
        }

        function searchtext() {
                // intentionally empty
        }
}