includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $searchTerms = array();
  17         var $namespaces = array( NS_MAIN );
  18         var $showRedirects = false;
  19
  20         /**
  21          * Perform a full text search query and return a result set.
  22          * If title searches are not supported or disabled, return null.
  23          *
  24          * @param string $term - Raw search term
  25          * @return SearchResultSet
  26          * @access public
  27          * @abstract
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          *
  37          * @param string $term - Raw search term
  38          * @return SearchResultSet
  39          * @access public
  40          * @abstract
  41          */
  42         function searchTitle( $term ) {
  43                 return null;
  44         }
  45
  46         /**
  47          * If an exact title match can be find, or a very slightly close match,
  48          * return the title. If no match, returns NULL.
  49          *
  50          * @param string $term
  51          * @return Title
  52          */
  53         public static function getNearMatch( $searchterm ) {
  54                 global $wgContLang;
  55
  56                 $allSearchTerms = array($searchterm);
  57
  58                 if($wgContLang->hasVariants()){
  59                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  60                 }
  61
  62                 foreach($allSearchTerms as $term){
  63
  64                         # Exact match? No need to look further.
  65                         $title = Title::newFromText( $term );
  66                         if (is_null($title))
  67                                 return NULL;
  68
  69                         if ( $title->getNamespace() == NS_SPECIAL || $title->exists() ) {
  70                                 return $title;
  71                         }
  72
  73                         # Now try all lower case (i.e. first letter capitalized)
  74                         #
  75                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  76                         if ( $title->exists() ) {
  77                                 return $title;
  78                         }
  79
  80                         # Now try capitalized string
  81                         #
  82                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  83                         if ( $title->exists() ) {
  84                                 return $title;
  85                         }
  86
  87                         # Now try all upper case
  88                         #
  89                         $title = Title::newFromText( $wgContLang->uc( $term ) );
  90                         if ( $title->exists() ) {
  91                                 return $title;
  92                         }
  93
  94                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
  95                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
  96                         if ( $title->exists() ) {
  97                                 return $title;
  98                         }
  99
 100                         global $wgCapitalLinks, $wgContLang;
 101                         if( !$wgCapitalLinks ) {
 102                                 // Catch differs-by-first-letter-case-only
 103                                 $title = Title::newFromText( $wgContLang->ucfirst( $term ) );
 104                                 if ( $title->exists() ) {
 105                                         return $title;
 106                                 }
 107                                 $title = Title::newFromText( $wgContLang->lcfirst( $term ) );
 108                                 if ( $title->exists() ) {
 109                                         return $title;
 110                                 }
 111                         }
 112
 113                         // Give hooks a chance at better match variants
 114                         $title = null;
 115                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 116                                 return $title;
 117                         }
 118                 }
 119
 120                 $title = Title::newFromText( $searchterm );
 121
 122                 # Entering an IP address goes to the contributions page
 123                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 124                         || User::isIP( trim( $searchterm ) ) ) {
 125                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 126                 }
 127
 128
 129                 # Entering a user goes to the user page whether it's there or not
 130                 if ( $title->getNamespace() == NS_USER ) {
 131                         return $title;
 132                 }
 133
 134                 # Go to images that exist even if there's no local page.
 135                 # There may have been a funny upload, or it may be on a shared
 136                 # file repository such as Wikimedia Commons.
 137                 if( $title->getNamespace() == NS_IMAGE ) {
 138                         $image = wfFindFile( $title );
 139                         if( $image ) {
 140                                 return $title;
 141                         }
 142                 }
 143
 144                 # MediaWiki namespace? Page may be "implied" if not customized.
 145                 # Just return it, with caps forced as the message system likes it.
 146                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 147                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 148                 }
 149
 150                 # Quoted term? Try without the quotes...
 151                 $matches = array();
 152                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 153                         return SearchEngine::getNearMatch( $matches[1] );
 154                 }
 155
 156                 return NULL;
 157         }
 158
 159         public static function legalSearchChars() {
 160                 return "A-Za-z_'0-9\\x80-\\xFF\\-";
 161         }
 162
 163         /**
 164          * Set the maximum number of results to return
 165          * and how many to skip before returning the first.
 166          *
 167          * @param int $limit
 168          * @param int $offset
 169          * @access public
 170          */
 171         function setLimitOffset( $limit, $offset = 0 ) {
 172                 $this->limit = intval( $limit );
 173                 $this->offset = intval( $offset );
 174         }
 175
 176         /**
 177          * Set which namespaces the search should include.
 178          * Give an array of namespace index numbers.
 179          *
 180          * @param array $namespaces
 181          * @access public
 182          */
 183         function setNamespaces( $namespaces ) {
 184                 $this->namespaces = $namespaces;
 185         }
 186
 187         /**
 188          * Parse some common prefixes: all (search everything)
 189          * or namespace names
 190          *
 191          * @param string $query
 192          */
 193         function replacePrefixes( $query ){
 194                 global $wgContLang;
 195
 196                 if( strpos($query,':') === false )
 197                         return $query; // nothing to do
 198
 199                 $parsed = $query;
 200                 $allkeyword = wfMsgForContent('searchall').":";
 201                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 202                         $this->namespaces = null;
 203                         $parsed = substr($query,strlen($allkeyword));
 204                 } else if( strpos($query,':') !== false ) {
 205                         $prefix = substr($query,0,strpos($query,':'));
 206                         $index = $wgContLang->getNsIndex($prefix);
 207                         if($index !== false){
 208                                 $this->namespaces = array($index);
 209                                 $parsed = substr($query,strlen($prefix)+1);
 210                         }
 211                 }
 212                 if(trim($parsed) == '')
 213                         return $query; // prefix was the whole query
 214
 215                 return $parsed;
 216         }
 217
 218         /**
 219          * Make a list of searchable namespaces and their canonical names.
 220          * @return array
 221          */
 222         public static function searchableNamespaces() {
 223                 global $wgContLang;
 224                 $arr = array();
 225                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 226                         if( $ns >= NS_MAIN ) {
 227                                 $arr[$ns] = $name;
 228                         }
 229                 }
 230                 return $arr;
 231         }
 232
 233         /**
 234          * Extract default namespaces to search from the given user's
 235          * settings, returning a list of index numbers.
 236          *
 237          * @param User $user
 238          * @return array
 239          * @static
 240          */
 241         public static function userNamespaces( &$user ) {
 242                 $arr = array();
 243                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 244                         if( $user->getOption( 'searchNs' . $ns ) ) {
 245                                 $arr[] = $ns;
 246                         }
 247                 }
 248                 return $arr;
 249         }
 250
 251         /**
 252          * Find snippet highlight settings for a given user
 253          *
 254          * @param User $user
 255          * @return array contextlines, contextchars
 256          * @static
 257          */
 258         public static function userHighlightPrefs( &$user ){
 259                 //$contextlines = $user->getOption( 'contextlines',  5 );
 260                 //$contextchars = $user->getOption( 'contextchars', 50 );
 261                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 262                 $contextchars = 75; // same as above.... :P
 263                 return array($contextlines, $contextchars);
 264         }
 265
 266         /**
 267          * An array of namespaces indexes to be searched by default
 268          *
 269          * @return array
 270          * @static
 271          */
 272         public static function defaultNamespaces(){
 273                 global $wgNamespacesToBeSearchedDefault;
 274
 275                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 276         }
 277
 278         /**
 279          * Return a 'cleaned up' search string
 280          *
 281          * @return string
 282          * @access public
 283          */
 284         function filter( $text ) {
 285                 $lc = $this->legalSearchChars();
 286                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 287         }
 288         /**
 289          * Load up the appropriate search engine class for the currently
 290          * active database backend, and return a configured instance.
 291          *
 292          * @return SearchEngine
 293          */
 294         public static function create() {
 295                 global $wgDBtype, $wgSearchType;
 296                 if( $wgSearchType ) {
 297                         $class = $wgSearchType;
 298                 } elseif( $wgDBtype == 'mysql' ) {
 299                         $class = 'SearchMySQL';
 300                 } else if ( $wgDBtype == 'postgres' ) {
 301                         $class = 'SearchPostgres';
 302                 } else if ( $wgDBtype == 'oracle' ) {
 303                         $class = 'SearchOracle';
 304                 } else {
 305                         $class = 'SearchEngineDummy';
 306                 }
 307                 $search = new $class( wfGetDB( DB_SLAVE ) );
 308                 $search->setLimitOffset(0,0);
 309                 return $search;
 310         }
 311
 312         /**
 313          * Create or update the search index record for the given page.
 314          * Title and text should be pre-processed.
 315          *
 316          * @param int $id
 317          * @param string $title
 318          * @param string $text
 319          * @abstract
 320          */
 321         function update( $id, $title, $text ) {
 322                 // no-op
 323         }
 324
 325         /**
 326          * Update a search index record's title only.
 327          * Title should be pre-processed.
 328          *
 329          * @param int $id
 330          * @param string $title
 331          * @abstract
 332          */
 333         function updateTitle( $id, $title ) {
 334                 // no-op
 335         }
 336
 337         /**
 338          * Get OpenSearch suggestion template
 339          *
 340          * @return string
 341          * @static
 342          */
 343         public static function getOpenSearchTemplate() {
 344                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 345                 if($wgOpenSearchTemplate)
 346                         return $wgOpenSearchTemplate;
 347                 else{
 348                         $ns = implode(',',SearchEngine::defaultNamespaces());
 349                         if(!$ns) $ns = "0";
 350                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 351                 }
 352         }
 353
 354         /**
 355          * Get internal MediaWiki Suggest template
 356          *
 357          * @return string
 358          * @static
 359          */
 360         public static function getMWSuggestTemplate() {
 361                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 362                 if($wgMWSuggestTemplate)
 363                         return $wgMWSuggestTemplate;
 364                 else
 365                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 366         }
 367 }
 368
 369 /**
 370  * @ingroup Search
 371  */
 372 class SearchResultSet {
 373         /**
 374          * Fetch an array of regular expression fragments for matching
 375          * the search terms as parsed by this engine in a text extract.
 376          *
 377          * @return array
 378          * @access public
 379          * @abstract
 380          */
 381         function termMatches() {
 382                 return array();
 383         }
 384
 385         function numRows() {
 386                 return 0;
 387         }
 388
 389         /**
 390          * Return true if results are included in this result set.
 391          * @return bool
 392          * @abstract
 393          */
 394         function hasResults() {
 395                 return false;
 396         }
 397
 398         /**
 399          * Some search modes return a total hit count for the query
 400          * in the entire article database. This may include pages
 401          * in namespaces that would not be matched on the given
 402          * settings.
 403          *
 404          * Return null if no total hits number is supported.
 405          *
 406          * @return int
 407          * @access public
 408          */
 409         function getTotalHits() {
 410                 return null;
 411         }
 412
 413         /**
 414          * Some search modes return a suggested alternate term if there are
 415          * no exact hits. Returns true if there is one on this set.
 416          *
 417          * @return bool
 418          * @access public
 419          */
 420         function hasSuggestion() {
 421                 return false;
 422         }
 423
 424         /**
 425          * @return string suggested query, null if none
 426          */
 427         function getSuggestionQuery(){
 428                 return null;
 429         }
 430
 431         /**
 432          * @return string highlighted suggested query, '' if none
 433          */
 434         function getSuggestionSnippet(){
 435                 return '';
 436         }
 437
 438         /**
 439          * Return information about how and from where the results were fetched,
 440          * should be useful for diagnostics and debugging
 441          *
 442          * @return string
 443          */
 444         function getInfo() {
 445                 return null;
 446         }
 447
 448         /**
 449          * Return a result set of hits on other (multiple) wikis associated with this one
 450          *
 451          * @return SearchResultSet
 452          */
 453         function getInterwikiResults() {
 454                 return null;
 455         }
 456
 457         /**
 458          * Check if there are results on other wikis
 459          *
 460          * @return boolean
 461          */
 462         function hasInterwikiResults() {
 463                 return $this->getInterwikiResults() != null;
 464         }
 465
 466
 467         /**
 468          * Fetches next search result, or false.
 469          * @return SearchResult
 470          * @access public
 471          * @abstract
 472          */
 473         function next() {
 474                 return false;
 475         }
 476
 477         /**
 478          * Frees the result set, if applicable.
 479          * @ access public
 480          */
 481         function free() {
 482                 // ...
 483         }
 484 }
 485
 486
 487 /**
 488  * @ingroup Search
 489  */
 490 class SearchResultTooMany {
 491         ## Some search engines may bail out if too many matches are found
 492 }
 493
 494
 495 /**
 496  * @ingroup Search
 497  */
 498 class SearchResult {
 499         var $mRevision = null;
 500
 501         function SearchResult( $row ) {
 502                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 503                 if( !is_null($this->mTitle) )
 504                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 505         }
 506
 507         /**
 508          * Check if this is result points to an invalid title
 509          *
 510          * @return boolean
 511          * @access public
 512          */
 513         function isBrokenTitle(){
 514                 if( is_null($this->mTitle) )
 515                         return true;
 516                 return false;
 517         }
 518
 519         /**
 520          * Check if target page is missing, happens when index is out of date
 521          *
 522          * @return boolean
 523          * @access public
 524          */
 525         function isMissingRevision(){
 526                 if( !$this->mRevision )
 527                         return true;
 528                 return false;
 529         }
 530
 531         /**
 532          * @return Title
 533          * @access public
 534          */
 535         function getTitle() {
 536                 return $this->mTitle;
 537         }
 538
 539         /**
 540          * @return double or null if not supported
 541          */
 542         function getScore() {
 543                 return null;
 544         }
 545
 546         /**
 547          * Lazy initialization of article text from DB
 548          */
 549         protected function initText(){
 550                 if( !isset($this->mText) ){
 551                         $this->mText = $this->mRevision->getText();
 552                 }
 553         }
 554
 555         /**
 556          * @param array $terms terms to highlight
 557          * @return string highlighted text snippet, null (and not '') if not supported
 558          */
 559         function getTextSnippet($terms){
 560                 global $wgUser, $wgAdvancedSearchHighlighting;
 561                 $this->initText();
 562                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 563                 $h = new SearchHighlighter();
 564                 if( $wgAdvancedSearchHighlighting )
 565                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 566                 else
 567                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 568         }
 569
 570         /**
 571          * @param array $terms terms to highlight
 572          * @return string highlighted title, '' if not supported
 573          */
 574         function getTitleSnippet($terms){
 575                 return '';
 576         }
 577
 578         /**
 579          * @param array $terms terms to highlight
 580          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 581          */
 582         function getRedirectSnippet($terms){
 583                 return '';
 584         }
 585
 586         /**
 587          * @return Title object for the redirect to this page, null if none or not supported
 588          */
 589         function getRedirectTitle(){
 590                 return null;
 591         }
 592
 593         /**
 594          * @return string highlighted relevant section name, null if none or not supported
 595          */
 596         function getSectionSnippet(){
 597                 return '';
 598         }
 599
 600         /**
 601          * @return Title object (pagename+fragment) for the section, null if none or not supported
 602          */
 603         function getSectionTitle(){
 604                 return null;
 605         }
 606
 607         /**
 608          * @return string timestamp
 609          */
 610         function getTimestamp(){
 611                 return $this->mRevision->getTimestamp();
 612         }
 613
 614         /**
 615          * @return int number of words
 616          */
 617         function getWordCount(){
 618                 $this->initText();
 619                 return str_word_count( $this->mText );
 620         }
 621
 622         /**
 623          * @return int size in bytes
 624          */
 625         function getByteSize(){
 626                 $this->initText();
 627                 return strlen( $this->mText );
 628         }
 629
 630         /**
 631          * @return boolean if hit has related articles
 632          */
 633         function hasRelated(){
 634                 return false;
 635         }
 636
 637         /**
 638          * @return interwiki prefix of the title (return iw even if title is broken)
 639          */
 640         function getInterwikiPrefix(){
 641                 return '';
 642         }
 643 }
 644
 645 /**
 646  * Highlight bits of wikitext
 647  *
 648  * @ingroup Search
 649  */
 650 class SearchHighlighter {
 651         var $mCleanWikitext = true;
 652
 653         function SearchHighlighter($cleanupWikitext = true){
 654                 $this->mCleanWikitext = $cleanupWikitext;
 655         }
 656
 657         /**
 658          * Default implementation of wikitext highlighting
 659          *
 660          * @param string $text
 661          * @param array $terms Terms to highlight (unescaped)
 662          * @param int $contextlines
 663          * @param int $contextchars
 664          * @return string
 665          */
 666         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 667                 global $wgLang, $wgContLang;
 668                 global $wgSearchHighlightBoundaries;
 669                 $fname = __METHOD__;
 670
 671                 if($text == '')
 672                         return '';
 673
 674                 // spli text into text + templates/links/tables
 675                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 676                 // first capture group is for detecting nested templates/links/tables/references
 677                 $endPatterns = array(
 678                         1 => '/(\{\{)|(\}\})/', // template
 679                         2 => '/(\[\[)|(\]\])/', // image
 680                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 681
 682                 // FIXME: this should prolly be a hook or something
 683                 if(function_exists('wfCite')){
 684                         $spat .= '|(<ref>)'; // references via cite extension
 685                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 686                 }
 687                 $spat .= '/';
 688                 $textExt = array(); // text extracts
 689                 $otherExt = array();  // other extracts
 690                 wfProfileIn( "$fname-split" );
 691                 $start = 0;
 692                 $textLen = strlen($text);
 693                 $count = 0; // sequence number to maintain ordering
 694                 while( $start < $textLen ){
 695                         // find start of template/image/table
 696                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 697                                 $epat = '';
 698                                 foreach($matches as $key => $val){
 699                                         if($key > 0 && $val[1] != -1){
 700                                                 if($key == 2){
 701                                                         // see if this is an image link
 702                                                         $ns = substr($val[0],2,-1);
 703                                                         if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
 704                                                                 break;
 705
 706                                                 }
 707                                                 $epat = $endPatterns[$key];
 708                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 709                                                 $start = $val[1];
 710                                                 break;
 711                                         }
 712                                 }
 713                                 if( $epat ){
 714                                         // find end (and detect any nested elements)
 715                                         $level = 0;
 716                                         $offset = $start + 1;
 717                                         $found = false;
 718                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 719                                                 if( array_key_exists(2,$endMatches) ){
 720                                                         // found end
 721                                                         if($level == 0){
 722                                                                 $len = strlen($endMatches[2][0]);
 723                                                                 $off = $endMatches[2][1];
 724                                                                 $this->splitAndAdd( $otherExt, $count,
 725                                                                         substr( $text, $start, $off + $len  - $start ) );
 726                                                                 $start = $off + $len;
 727                                                                 $found = true;
 728                                                                 break;
 729                                                         } else{
 730                                                                 // end of nested element
 731                                                                 $level -= 1;
 732                                                         }
 733                                                 } else{
 734                                                         // nested
 735                                                         $level += 1;
 736                                                 }
 737                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 738                                         }
 739                                         if( ! $found ){
 740                                                 // couldn't find appropriate closing tag, skip
 741                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 742                                                 $start += strlen($matches[0][0]);
 743                                         }
 744                                         continue;
 745                                 }
 746                         }
 747                         // else: add as text extract
 748                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 749                         break;
 750                 }
 751
 752                 $all = $textExt + $otherExt; // these have disjunct key sets
 753
 754                 wfProfileOut( "$fname-split" );
 755
 756                 // prepare regexps
 757                 foreach( $terms as $index => $term ) {
 758                         $terms[$index] = preg_quote( $term, '/' );
 759                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 760                         if(preg_match('/[\x80-\xff]/', $term) ){
 761                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 762                         }
 763
 764
 765                 }
 766                 $anyterm = implode( '|', $terms );
 767                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 768
 769                 // FIXME: a hack to scale contextchars, a correct solution
 770                 // would be to have contextchars actually be char and not byte
 771                 // length, and do proper utf-8 substrings and lengths everywhere,
 772                 // but PHP is making that very hard and unclean to implement :(
 773                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 774                 $contextchars = intval( $contextchars * $scale );
 775
 776                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 777                 $patPost = "($wgSearchHighlightBoundaries|$)";
 778
 779                 $pat1 = "/(".$phrase.")/ui";
 780                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 781
 782                 wfProfileIn( "$fname-extract" );
 783
 784                 $left = $contextlines;
 785
 786                 $snippets = array();
 787                 $offsets = array();
 788
 789                 // show beginning only if it contains all words
 790                 $first = 0;
 791                 $firstText = '';
 792                 foreach($textExt as $index => $line){
 793                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 794                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 795                                 $first = $index;
 796                                 break;
 797                         }
 798                 }
 799                 if( $firstText ){
 800                         $succ = true;
 801                         // check if first text contains all terms
 802                         foreach($terms as $term){
 803                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 804                                         $succ = false;
 805                                         break;
 806                                 }
 807                         }
 808                         if( $succ ){
 809                                 $snippets[$first] = $firstText;
 810                                 $offsets[$first] = 0;
 811                         }
 812                 }
 813                 if( ! $snippets ) {
 814                         // match whole query on text
 815                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 816                         // match whole query on templates/tables/images
 817                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 818                         // match any words on text
 819                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 820                         // match any words on templates/tables/images
 821                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 822
 823                         ksort($snippets);
 824                 }
 825
 826                 // add extra chars to each snippet to make snippets constant size
 827                 $extended = array();
 828                 if( count( $snippets ) == 0){
 829                         // couldn't find the target words, just show beginning of article
 830                         $targetchars = $contextchars * $contextlines;
 831                         $snippets[$first] = '';
 832                         $offsets[$first] = 0;
 833                 } else{
 834                         // if begin of the article contains the whole phrase, show only that !!
 835                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 836                             && $offsets[$first] < $contextchars * 2 ){
 837                                 $snippets = array ($first => $snippets[$first]);
 838                         }
 839
 840                         // calc by how much to extend existing snippets
 841                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 842                 }
 843
 844                 foreach($snippets as $index => $line){
 845                         $extended[$index] = $line;
 846                         $len = strlen($line);
 847                         if( $len < $targetchars - 20 ){
 848                                 // complete this line
 849                                 if($len < strlen( $all[$index] )){
 850                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 851                                         $len = strlen( $extended[$index] );
 852                                 }
 853
 854                                 // add more lines
 855                                 $add = $index + 1;
 856                                 while( $len < $targetchars - 20
 857                                        && array_key_exists($add,$all)
 858                                        && !array_key_exists($add,$snippets) ){
 859                                     $offsets[$add] = 0;
 860                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 861                                         $extended[$add] = $tt;
 862                                         $len += strlen( $tt );
 863                                         $add++;
 864                                 }
 865                         }
 866                 }
 867
 868                 //$snippets = array_map('htmlspecialchars', $extended);
 869                 $snippets = $extended;
 870                 $last = -1;
 871                 $extract = '';
 872                 foreach($snippets as $index => $line){
 873                         if($last == -1)
 874                                 $extract .= $line; // first line
 875                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 876                                 $extract .= " ".$line; // continous lines
 877                         else
 878                                 $extract .= '<b> ... </b>' . $line;
 879
 880                         $last = $index;
 881                 }
 882                 if( $extract )
 883                         $extract .= '<b> ... </b>';
 884
 885                 $processed = array();
 886                 foreach($terms as $term){
 887                         if( ! isset($processed[$term]) ){
 888                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 889                                 $extract = preg_replace( $pat3,
 890                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 891                                 $processed[$term] = true;
 892                         }
 893                 }
 894
 895                 wfProfileOut( "$fname-extract" );
 896
 897                 return $extract;
 898         }
 899
 900         /**
 901          * Split text into lines and add it to extracts array
 902          *
 903          * @param array $extracts index -> $line
 904          * @param int $count
 905          * @param string $text
 906          */
 907         function splitAndAdd(&$extracts, &$count, $text){
 908                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 909                 foreach($split as $line){
 910                         $tt = trim($line);
 911                         if( $tt )
 912                                 $extracts[$count++] = $tt;
 913                 }
 914         }
 915
 916         /**
 917          * Do manual case conversion for non-ascii chars
 918          *
 919          * @param unknown_type $matches
 920          */
 921         function caseCallback($matches){
 922                 global $wgContLang;
 923                 if( strlen($matches[0]) > 1 ){
 924                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 925                 } else
 926                         return $matches[0];
 927         }
 928
 929         /**
 930          * Extract part of the text from start to end, but by
 931          * not chopping up words
 932          * @param string $text
 933          * @param int $start
 934          * @param int $end
 935          * @param int $posStart (out) actual start position
 936          * @param int $posEnd (out) actual end position
 937          * @return string
 938          */
 939         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 940                 global $wgContLang;
 941
 942                 if( $start != 0)
 943                         $start = $this->position( $text, $start, 1 );
 944                 if( $end >= strlen($text) )
 945                         $end = strlen($text);
 946                 else
 947                         $end = $this->position( $text, $end );
 948
 949                 if(!is_null($posStart))
 950                         $posStart = $start;
 951                 if(!is_null($posEnd))
 952                         $posEnd = $end;
 953
 954                 if($end > $start)
 955                         return substr($text, $start, $end-$start);
 956                 else
 957                         return '';
 958         }
 959
 960         /**
 961          * Find a nonletter near a point (index) in the text
 962          *
 963          * @param string $text
 964          * @param int $point
 965          * @param int $offset to found index
 966          * @return int nearest nonletter index, or beginning of utf8 char if none
 967          */
 968         function position($text, $point, $offset=0 ){
 969                 $tolerance = 10;
 970                 $s = max( 0, $point - $tolerance );
 971                 $l = min( strlen($text), $point + $tolerance ) - $s;
 972                 $m = array();
 973                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
 974                         return $m[0][1] + $s + $offset;
 975                 } else{
 976                         // check if point is on a valid first UTF8 char
 977                         $char = ord( $text[$point] );
 978                         while( $char >= 0x80 && $char < 0xc0 ) {
 979                                 // skip trailing bytes
 980                                 $point++;
 981                                 if($point >= strlen($text))
 982                                         return strlen($text);
 983                                 $char = ord( $text[$point] );
 984                         }
 985                         return $point;
 986
 987                 }
 988         }
 989
 990         /**
 991          * Search extracts for a pattern, and return snippets
 992          *
 993          * @param string $pattern regexp for matching lines
 994          * @param array $extracts extracts to search
 995          * @param int $linesleft number of extracts to make
 996          * @param int $contextchars length of snippet
 997          * @param array $out map for highlighted snippets
 998          * @param array $offsets map of starting points of snippets
 999          * @protected
1000          */
1001         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1002                 if($linesleft == 0)
1003                         return; // nothing to do
1004                 foreach($extracts as $index => $line){
1005                         if( array_key_exists($index,$out) )
1006                                 continue; // this line already highlighted
1007
1008                         $m = array();
1009                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1010                                 continue;
1011
1012                         $offset = $m[0][1];
1013                         $len = strlen($m[0][0]);
1014                         if($offset + $len < $contextchars)
1015                                 $begin = 0;
1016                         elseif( $len > $contextchars)
1017                                 $begin = $offset;
1018                         else
1019                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1020
1021                         $end = $begin + $contextchars;
1022
1023                         $posBegin = $begin;
1024                         // basic snippet from this line
1025                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1026                         $offsets[$index] = $posBegin;
1027                         $linesleft--;
1028                         if($linesleft == 0)
1029                                 return;
1030                 }
1031         }
1032
1033         /**
1034          * Basic wikitext removal
1035          * @protected
1036          */
1037         function removeWiki($text) {
1038                 $fname = __METHOD__;
1039                 wfProfileIn( $fname );
1040
1041                 //$text = preg_replace("/'{2,5}/", "", $text);
1042                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1043                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1044                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1045                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1046                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1047                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1048                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1049                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1050                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1051                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1052                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1053                 $text = preg_replace("/'''''/", "", $text);
1054                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1055                 $text = preg_replace("/''/", "", $text);
1056
1057                 wfProfileOut( $fname );
1058                 return $text;
1059         }
1060
1061         /**
1062          * callback to replace [[target|caption]] kind of links, if
1063          * the target is category or image, leave it
1064          *
1065          * @param array $matches
1066          */
1067         function linkReplace($matches){
1068                 $colon = strpos( $matches[1], ':' );
1069                 if( $colon === false )
1070                         return $matches[2]; // replace with caption
1071                 global $wgContLang;
1072                 $ns = substr( $matches[1], 0, $colon );
1073                 $index = $wgContLang->getNsIndex($ns);
1074                 if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
1075                         return $matches[0]; // return the whole thing
1076                 else
1077                         return $matches[2];
1078
1079         }
1080
1081         /**
1082      * Simple & fast snippet extraction, but gives completely unrelevant
1083      * snippets
1084      *
1085      * @param string $text
1086      * @param array $terms
1087      * @param int $contextlines
1088      * @param int $contextchars
1089      * @return string
1090      */
1091     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1092         global $wgLang, $wgContLang;
1093         $fname = __METHOD__;
1094
1095         $lines = explode( "\n", $text );
1096
1097         $terms = implode( '|', $terms );
1098         $terms = str_replace( '/', "\\/", $terms);
1099         $max = intval( $contextchars ) + 1;
1100         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1101
1102         $lineno = 0;
1103
1104         $extract = "";
1105         wfProfileIn( "$fname-extract" );
1106         foreach ( $lines as $line ) {
1107             if ( 0 == $contextlines ) {
1108                 break;
1109             }
1110             ++$lineno;
1111             $m = array();
1112             if ( ! preg_match( $pat1, $line, $m ) ) {
1113                 continue;
1114             }
1115             --$contextlines;
1116             $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
1117
1118             if ( count( $m ) < 3 ) {
1119                 $post = '';
1120             } else {
1121                 $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
1122             }
1123
1124             $found = $m[2];
1125
1126             $line = htmlspecialchars( $pre . $found . $post );
1127             $pat2 = '/(' . $terms . ")/i";
1128             $line = preg_replace( $pat2,
1129               "<span class='searchmatch'>\\1</span>", $line );
1130
1131             $extract .= "${line}\n";
1132         }
1133         wfProfileOut( "$fname-extract" );
1134
1135         return $extract;
1136     }
1137
1138 }
1139
/**
 * Do-nothing search engine: every method is an empty stub, and search()
 * always returns null.
 *
 * @ingroup Search
 */
class SearchEngineDummy {
        /**
         * @param string $term ignored
         * @return null always
         */
        function search( $term ) {
                return null;
        }

        /**
         * No-op.
         * @param mixed $l ignored
         * @param mixed $o ignored
         */
        function setLimitOffset($l, $o) {
        }

        /** No-op; returns nothing. */
        function legalSearchChars() {
        }

        /** No-op. */
        function update() {
        }

        /** No-op. */
        function setnamespaces() {
        }

        /** No-op; returns nothing. */
        function searchtitle() {
        }

        /** No-op; returns nothing. */
        function searchtext() {
        }
}