includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
   10  * Contains the base class for search engines, as used by special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $searchTerms = array();
  17         var $namespaces = array( NS_MAIN );
  18         var $showRedirects = false;
  19
  20         /**
  21          * Perform a full text search query and return a result set.
  22          * If title searches are not supported or disabled, return null.
  23          *
  24          * @param string $term - Raw search term
  25          * @return SearchResultSet
  26          * @access public
  27          * @abstract
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          *
  37          * @param string $term - Raw search term
  38          * @return SearchResultSet
  39          * @access public
  40          * @abstract
  41          */
  42         function searchTitle( $term ) {
  43                 return null;
  44         }
  45
  46         /**
  47          * If an exact title match can be find, or a very slightly close match,
  48          * return the title. If no match, returns NULL.
  49          *
  50          * @param string $term
  51          * @return Title
  52          */
  53         public static function getNearMatch( $searchterm ) {
  54                 global $wgContLang;
  55
  56                 $allSearchTerms = array($searchterm);
  57
  58                 if($wgContLang->hasVariants()){
  59                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  60                 }
  61
  62                 foreach($allSearchTerms as $term){
  63
  64                         # Exact match? No need to look further.
  65                         $title = Title::newFromText( $term );
  66                         if (is_null($title))
  67                                 return NULL;
  68
  69                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal()
  70                              || $title->exists() ) {
  71                                 return $title;
  72                         }
  73
  74                         # Now try all lower case (i.e. first letter capitalized)
  75                         #
  76                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  77                         if ( $title && $title->exists() ) {
  78                                 return $title;
  79                         }
  80
  81                         # Now try capitalized string
  82                         #
  83                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  84                         if ( $title && $title->exists() ) {
  85                                 return $title;
  86                         }
  87
  88                         # Now try all upper case
  89                         #
  90                         $title = Title::newFromText( $wgContLang->uc( $term ) );
  91                         if ( $title && $title->exists() ) {
  92                                 return $title;
  93                         }
  94
  95                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
  96                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
  97                         if ( $title && $title->exists() ) {
  98                                 return $title;
  99                         }
 100
 101                         // Give hooks a chance at better match variants
 102                         $title = null;
 103                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 104                                 return $title;
 105                         }
 106                 }
 107
 108                 $title = Title::newFromText( $searchterm );
 109
 110                 # Entering an IP address goes to the contributions page
 111                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 112                         || User::isIP( trim( $searchterm ) ) ) {
 113                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 114                 }
 115
 116
 117                 # Entering a user goes to the user page whether it's there or not
 118                 if ( $title->getNamespace() == NS_USER ) {
 119                         return $title;
 120                 }
 121
 122                 # Go to images that exist even if there's no local page.
 123                 # There may have been a funny upload, or it may be on a shared
 124                 # file repository such as Wikimedia Commons.
 125                 if( $title->getNamespace() == NS_IMAGE ) {
 126                         $image = wfFindFile( $title );
 127                         if( $image ) {
 128                                 return $title;
 129                         }
 130                 }
 131
 132                 # MediaWiki namespace? Page may be "implied" if not customized.
 133                 # Just return it, with caps forced as the message system likes it.
 134                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 135                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 136                 }
 137
 138                 # Quoted term? Try without the quotes...
 139                 $matches = array();
 140                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 141                         return SearchEngine::getNearMatch( $matches[1] );
 142                 }
 143
 144                 return NULL;
 145         }
 146
 147         public static function legalSearchChars() {
 148                 return "A-Za-z_'0-9\\x80-\\xFF\\-";
 149         }
 150
 151         /**
 152          * Set the maximum number of results to return
 153          * and how many to skip before returning the first.
 154          *
 155          * @param int $limit
 156          * @param int $offset
 157          * @access public
 158          */
 159         function setLimitOffset( $limit, $offset = 0 ) {
 160                 $this->limit = intval( $limit );
 161                 $this->offset = intval( $offset );
 162         }
 163
 164         /**
 165          * Set which namespaces the search should include.
 166          * Give an array of namespace index numbers.
 167          *
 168          * @param array $namespaces
 169          * @access public
 170          */
 171         function setNamespaces( $namespaces ) {
 172                 $this->namespaces = $namespaces;
 173         }
 174
 175         /**
 176          * Parse some common prefixes: all (search everything)
 177          * or namespace names
 178          *
 179          * @param string $query
 180          */
 181         function replacePrefixes( $query ){
 182                 global $wgContLang;
 183
 184                 if( strpos($query,':') === false )
 185                         return $query; // nothing to do
 186
 187                 $parsed = $query;
 188                 $allkeyword = wfMsgForContent('searchall').":";
 189                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 190                         $this->namespaces = null;
 191                         $parsed = substr($query,strlen($allkeyword));
 192                 } else if( strpos($query,':') !== false ) {
 193                         $prefix = substr($query,0,strpos($query,':'));
 194                         $index = $wgContLang->getNsIndex($prefix);
 195                         if($index !== false){
 196                                 $this->namespaces = array($index);
 197                                 $parsed = substr($query,strlen($prefix)+1);
 198                         }
 199                 }
 200                 if(trim($parsed) == '')
 201                         return $query; // prefix was the whole query
 202
 203                 return $parsed;
 204         }
 205
 206         /**
 207          * Make a list of searchable namespaces and their canonical names.
 208          * @return array
 209          */
 210         public static function searchableNamespaces() {
 211                 global $wgContLang;
 212                 $arr = array();
 213                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 214                         if( $ns >= NS_MAIN ) {
 215                                 $arr[$ns] = $name;
 216                         }
 217                 }
 218                 return $arr;
 219         }
 220
 221         /**
 222          * Extract default namespaces to search from the given user's
 223          * settings, returning a list of index numbers.
 224          *
 225          * @param User $user
 226          * @return array
 227          * @static
 228          */
 229         public static function userNamespaces( &$user ) {
 230                 $arr = array();
 231                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 232                         if( $user->getOption( 'searchNs' . $ns ) ) {
 233                                 $arr[] = $ns;
 234                         }
 235                 }
 236                 return $arr;
 237         }
 238
 239         /**
 240          * Find snippet highlight settings for a given user
 241          *
 242          * @param User $user
 243          * @return array contextlines, contextchars
 244          * @static
 245          */
 246         public static function userHighlightPrefs( &$user ){
 247                 //$contextlines = $user->getOption( 'contextlines',  5 );
 248                 //$contextchars = $user->getOption( 'contextchars', 50 );
 249                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 250                 $contextchars = 75; // same as above.... :P
 251                 return array($contextlines, $contextchars);
 252         }
 253
 254         /**
 255          * An array of namespaces indexes to be searched by default
 256          *
 257          * @return array
 258          * @static
 259          */
 260         public static function defaultNamespaces(){
 261                 global $wgNamespacesToBeSearchedDefault;
 262
 263                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 264         }
 265
 266         /**
 267          * Return a 'cleaned up' search string
 268          *
 269          * @return string
 270          * @access public
 271          */
 272         function filter( $text ) {
 273                 $lc = $this->legalSearchChars();
 274                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 275         }
 276         /**
 277          * Load up the appropriate search engine class for the currently
 278          * active database backend, and return a configured instance.
 279          *
 280          * @fixme Ask the database class for his default search class
 281          * instead of knowing about every backend here.
 282          * @return SearchEngine
 283          */
 284         public static function create() {
 285                 global $wgDBtype, $wgSearchType;
 286                 if( $wgSearchType ) {
 287                         $class = $wgSearchType;
 288                 } elseif( $wgDBtype == 'mysql' ) {
 289                         $class = 'SearchMySQL';
 290                 } else if ( $wgDBtype == 'postgres' ) {
 291                         $class = 'SearchPostgres';
 292                 } else if ( $wgDBtype == 'oracle' ) {
 293                         $class = 'SearchOracle';
 294                 } else {
 295                         $class = 'SearchEngineDummy';
 296                 }
 297                 $search = new $class( wfGetDB( DB_SLAVE ) );
 298                 $search->setLimitOffset(0,0);
 299                 return $search;
 300         }
 301
 302         /**
 303          * Create or update the search index record for the given page.
 304          * Title and text should be pre-processed.
 305          *
 306          * @param int $id
 307          * @param string $title
 308          * @param string $text
 309          * @abstract
 310          */
 311         function update( $id, $title, $text ) {
 312                 // no-op
 313         }
 314
 315         /**
 316          * Update a search index record's title only.
 317          * Title should be pre-processed.
 318          *
 319          * @param int $id
 320          * @param string $title
 321          * @abstract
 322          */
 323         function updateTitle( $id, $title ) {
 324                 // no-op
 325         }
 326
 327         /**
 328          * Get OpenSearch suggestion template
 329          *
 330          * @return string
 331          * @static
 332          */
 333         public static function getOpenSearchTemplate() {
 334                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 335                 if($wgOpenSearchTemplate)
 336                         return $wgOpenSearchTemplate;
 337                 else{
 338                         $ns = implode(',',SearchEngine::defaultNamespaces());
 339                         if(!$ns) $ns = "0";
 340                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 341                 }
 342         }
 343
 344         /**
 345          * Get internal MediaWiki Suggest template
 346          *
 347          * @return string
 348          * @static
 349          */
 350         public static function getMWSuggestTemplate() {
 351                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 352                 if($wgMWSuggestTemplate)
 353                         return $wgMWSuggestTemplate;
 354                 else
 355                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 356         }
 357 }
 358
 359 /**
 360  * @ingroup Search
 361  */
 362 class SearchResultSet {
 363         /**
 364          * Fetch an array of regular expression fragments for matching
 365          * the search terms as parsed by this engine in a text extract.
 366          *
 367          * @return array
 368          * @access public
 369          * @abstract
 370          */
 371         function termMatches() {
 372                 return array();
 373         }
 374
 375         function numRows() {
 376                 return 0;
 377         }
 378
 379         /**
 380          * Return true if results are included in this result set.
 381          * @return bool
 382          * @abstract
 383          */
 384         function hasResults() {
 385                 return false;
 386         }
 387
 388         /**
 389          * Some search modes return a total hit count for the query
 390          * in the entire article database. This may include pages
 391          * in namespaces that would not be matched on the given
 392          * settings.
 393          *
 394          * Return null if no total hits number is supported.
 395          *
 396          * @return int
 397          * @access public
 398          */
 399         function getTotalHits() {
 400                 return null;
 401         }
 402
 403         /**
 404          * Some search modes return a suggested alternate term if there are
 405          * no exact hits. Returns true if there is one on this set.
 406          *
 407          * @return bool
 408          * @access public
 409          */
 410         function hasSuggestion() {
 411                 return false;
 412         }
 413
 414         /**
 415          * @return string suggested query, null if none
 416          */
 417         function getSuggestionQuery(){
 418                 return null;
 419         }
 420
 421         /**
 422          * @return string highlighted suggested query, '' if none
 423          */
 424         function getSuggestionSnippet(){
 425                 return '';
 426         }
 427
 428         /**
 429          * Return information about how and from where the results were fetched,
 430          * should be useful for diagnostics and debugging
 431          *
 432          * @return string
 433          */
 434         function getInfo() {
 435                 return null;
 436         }
 437
 438         /**
 439          * Return a result set of hits on other (multiple) wikis associated with this one
 440          *
 441          * @return SearchResultSet
 442          */
 443         function getInterwikiResults() {
 444                 return null;
 445         }
 446
 447         /**
 448          * Check if there are results on other wikis
 449          *
 450          * @return boolean
 451          */
 452         function hasInterwikiResults() {
 453                 return $this->getInterwikiResults() != null;
 454         }
 455
 456
 457         /**
 458          * Fetches next search result, or false.
 459          * @return SearchResult
 460          * @access public
 461          * @abstract
 462          */
 463         function next() {
 464                 return false;
 465         }
 466
 467         /**
 468          * Frees the result set, if applicable.
 469          * @ access public
 470          */
 471         function free() {
 472                 // ...
 473         }
 474 }
 475
 476
 477 /**
 478  * @ingroup Search
 479  */
 480 class SearchResultTooMany {
 481         ## Some search engines may bail out if too many matches are found
 482 }
 483
 484
 485 /**
 486  * @fixme This class is horribly factored. It would probably be better to have
 487  * a useful base class to which you pass some standard information, then let
 488  * the fancy self-highlighters extend that.
 489  * @ingroup Search
 490  */
 491 class SearchResult {
 492         var $mRevision = null;
 493
 494         function SearchResult( $row ) {
 495                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 496                 if( !is_null($this->mTitle) )
 497                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 498         }
 499
 500         /**
 501          * Check if this is result points to an invalid title
 502          *
 503          * @return boolean
 504          * @access public
 505          */
 506         function isBrokenTitle(){
 507                 if( is_null($this->mTitle) )
 508                         return true;
 509                 return false;
 510         }
 511
 512         /**
 513          * Check if target page is missing, happens when index is out of date
 514          *
 515          * @return boolean
 516          * @access public
 517          */
 518         function isMissingRevision(){
 519                 if( !$this->mRevision )
 520                         return true;
 521                 return false;
 522         }
 523
 524         /**
 525          * @return Title
 526          * @access public
 527          */
 528         function getTitle() {
 529                 return $this->mTitle;
 530         }
 531
 532         /**
 533          * @return double or null if not supported
 534          */
 535         function getScore() {
 536                 return null;
 537         }
 538
 539         /**
 540          * Lazy initialization of article text from DB
 541          */
 542         protected function initText(){
 543                 if( !isset($this->mText) ){
 544                         $this->mText = $this->mRevision->getText();
 545                 }
 546         }
 547
 548         /**
 549          * @param array $terms terms to highlight
 550          * @return string highlighted text snippet, null (and not '') if not supported
 551          */
 552         function getTextSnippet($terms){
 553                 global $wgUser, $wgAdvancedSearchHighlighting;
 554                 $this->initText();
 555                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 556                 $h = new SearchHighlighter();
 557                 if( $wgAdvancedSearchHighlighting )
 558                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 559                 else
 560                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 561         }
 562
 563         /**
 564          * @param array $terms terms to highlight
 565          * @return string highlighted title, '' if not supported
 566          */
 567         function getTitleSnippet($terms){
 568                 return '';
 569         }
 570
 571         /**
 572          * @param array $terms terms to highlight
 573          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 574          */
 575         function getRedirectSnippet($terms){
 576                 return '';
 577         }
 578
 579         /**
 580          * @return Title object for the redirect to this page, null if none or not supported
 581          */
 582         function getRedirectTitle(){
 583                 return null;
 584         }
 585
 586         /**
 587          * @return string highlighted relevant section name, null if none or not supported
 588          */
 589         function getSectionSnippet(){
 590                 return '';
 591         }
 592
 593         /**
 594          * @return Title object (pagename+fragment) for the section, null if none or not supported
 595          */
 596         function getSectionTitle(){
 597                 return null;
 598         }
 599
 600         /**
 601          * @return string timestamp
 602          */
 603         function getTimestamp(){
 604                 return $this->mRevision->getTimestamp();
 605         }
 606
 607         /**
 608          * @return int number of words
 609          */
 610         function getWordCount(){
 611                 $this->initText();
 612                 return str_word_count( $this->mText );
 613         }
 614
 615         /**
 616          * @return int size in bytes
 617          */
 618         function getByteSize(){
 619                 $this->initText();
 620                 return strlen( $this->mText );
 621         }
 622
 623         /**
 624          * @return boolean if hit has related articles
 625          */
 626         function hasRelated(){
 627                 return false;
 628         }
 629
 630         /**
 631          * @return interwiki prefix of the title (return iw even if title is broken)
 632          */
 633         function getInterwikiPrefix(){
 634                 return '';
 635         }
 636 }
 637
 638 /**
 639  * Highlight bits of wikitext
 640  *
 641  * @ingroup Search
 642  */
 643 class SearchHighlighter {
 644         var $mCleanWikitext = true;
 645
 646         function SearchHighlighter($cleanupWikitext = true){
 647                 $this->mCleanWikitext = $cleanupWikitext;
 648         }
 649
 650         /**
 651          * Default implementation of wikitext highlighting
 652          *
 653          * @param string $text
 654          * @param array $terms Terms to highlight (unescaped)
 655          * @param int $contextlines
 656          * @param int $contextchars
 657          * @return string
 658          */
 659         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 660                 global $wgLang, $wgContLang;
 661                 global $wgSearchHighlightBoundaries;
 662                 $fname = __METHOD__;
 663
 664                 if($text == '')
 665                         return '';
 666
  667                 // split text into text + templates/links/tables
 668                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 669                 // first capture group is for detecting nested templates/links/tables/references
 670                 $endPatterns = array(
 671                         1 => '/(\{\{)|(\}\})/', // template
 672                         2 => '/(\[\[)|(\]\])/', // image
 673                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 674
 675                 // FIXME: this should prolly be a hook or something
 676                 if(function_exists('wfCite')){
 677                         $spat .= '|(<ref>)'; // references via cite extension
 678                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 679                 }
 680                 $spat .= '/';
 681                 $textExt = array(); // text extracts
 682                 $otherExt = array();  // other extracts
 683                 wfProfileIn( "$fname-split" );
 684                 $start = 0;
 685                 $textLen = strlen($text);
 686                 $count = 0; // sequence number to maintain ordering
 687                 while( $start < $textLen ){
 688                         // find start of template/image/table
 689                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 690                                 $epat = '';
 691                                 foreach($matches as $key => $val){
 692                                         if($key > 0 && $val[1] != -1){
 693                                                 if($key == 2){
 694                                                         // see if this is an image link
 695                                                         $ns = substr($val[0],2,-1);
 696                                                         if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
 697                                                                 break;
 698
 699                                                 }
 700                                                 $epat = $endPatterns[$key];
 701                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 702                                                 $start = $val[1];
 703                                                 break;
 704                                         }
 705                                 }
 706                                 if( $epat ){
 707                                         // find end (and detect any nested elements)
 708                                         $level = 0;
 709                                         $offset = $start + 1;
 710                                         $found = false;
 711                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 712                                                 if( array_key_exists(2,$endMatches) ){
 713                                                         // found end
 714                                                         if($level == 0){
 715                                                                 $len = strlen($endMatches[2][0]);
 716                                                                 $off = $endMatches[2][1];
 717                                                                 $this->splitAndAdd( $otherExt, $count,
 718                                                                         substr( $text, $start, $off + $len  - $start ) );
 719                                                                 $start = $off + $len;
 720                                                                 $found = true;
 721                                                                 break;
 722                                                         } else{
 723                                                                 // end of nested element
 724                                                                 $level -= 1;
 725                                                         }
 726                                                 } else{
 727                                                         // nested
 728                                                         $level += 1;
 729                                                 }
 730                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 731                                         }
 732                                         if( ! $found ){
 733                                                 // couldn't find appropriate closing tag, skip
 734                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 735                                                 $start += strlen($matches[0][0]);
 736                                         }
 737                                         continue;
 738                                 }
 739                         }
 740                         // else: add as text extract
 741                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 742                         break;
 743                 }
 744
 745                 $all = $textExt + $otherExt; // these have disjunct key sets
 746
 747                 wfProfileOut( "$fname-split" );
 748
 749                 // prepare regexps
 750                 foreach( $terms as $index => $term ) {
 751                         $terms[$index] = preg_quote( $term, '/' );
 752                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 753                         if(preg_match('/[\x80-\xff]/', $term) ){
 754                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 755                         }
 756
 757
 758                 }
 759                 $anyterm = implode( '|', $terms );
 760                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 761
 762                 // FIXME: a hack to scale contextchars, a correct solution
 763                 // would be to have contextchars actually be char and not byte
 764                 // length, and do proper utf-8 substrings and lengths everywhere,
 765                 // but PHP is making that very hard and unclean to implement :(
 766                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 767                 $contextchars = intval( $contextchars * $scale );
 768
 769                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 770                 $patPost = "($wgSearchHighlightBoundaries|$)";
 771
 772                 $pat1 = "/(".$phrase.")/ui";
 773                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 774
 775                 wfProfileIn( "$fname-extract" );
 776
 777                 $left = $contextlines;
 778
 779                 $snippets = array();
 780                 $offsets = array();
 781
 782                 // show beginning only if it contains all words
 783                 $first = 0;
 784                 $firstText = '';
 785                 foreach($textExt as $index => $line){
 786                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 787                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 788                                 $first = $index;
 789                                 break;
 790                         }
 791                 }
 792                 if( $firstText ){
 793                         $succ = true;
 794                         // check if first text contains all terms
 795                         foreach($terms as $term){
 796                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 797                                         $succ = false;
 798                                         break;
 799                                 }
 800                         }
 801                         if( $succ ){
 802                                 $snippets[$first] = $firstText;
 803                                 $offsets[$first] = 0;
 804                         }
 805                 }
 806                 if( ! $snippets ) {
 807                         // match whole query on text
 808                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 809                         // match whole query on templates/tables/images
 810                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 811                         // match any words on text
 812                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 813                         // match any words on templates/tables/images
 814                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 815
 816                         ksort($snippets);
 817                 }
 818
 819                 // add extra chars to each snippet to make snippets constant size
 820                 $extended = array();
 821                 if( count( $snippets ) == 0){
 822                         // couldn't find the target words, just show beginning of article
 823                         $targetchars = $contextchars * $contextlines;
 824                         $snippets[$first] = '';
 825                         $offsets[$first] = 0;
 826                 } else{
 827                         // if begin of the article contains the whole phrase, show only that !!
 828                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 829                             && $offsets[$first] < $contextchars * 2 ){
 830                                 $snippets = array ($first => $snippets[$first]);
 831                         }
 832
 833                         // calc by how much to extend existing snippets
 834                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 835                 }
 836
 837                 foreach($snippets as $index => $line){
 838                         $extended[$index] = $line;
 839                         $len = strlen($line);
 840                         if( $len < $targetchars - 20 ){
 841                                 // complete this line
 842                                 if($len < strlen( $all[$index] )){
 843                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 844                                         $len = strlen( $extended[$index] );
 845                                 }
 846
 847                                 // add more lines
 848                                 $add = $index + 1;
 849                                 while( $len < $targetchars - 20
 850                                        && array_key_exists($add,$all)
 851                                        && !array_key_exists($add,$snippets) ){
 852                                     $offsets[$add] = 0;
 853                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 854                                         $extended[$add] = $tt;
 855                                         $len += strlen( $tt );
 856                                         $add++;
 857                                 }
 858                         }
 859                 }
 860
 861                 //$snippets = array_map('htmlspecialchars', $extended);
 862                 $snippets = $extended;
 863                 $last = -1;
 864                 $extract = '';
 865                 foreach($snippets as $index => $line){
 866                         if($last == -1)
 867                                 $extract .= $line; // first line
 868                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 869                                 $extract .= " ".$line; // continous lines
 870                         else
 871                                 $extract .= '<b> ... </b>' . $line;
 872
 873                         $last = $index;
 874                 }
 875                 if( $extract )
 876                         $extract .= '<b> ... </b>';
 877
 878                 $processed = array();
 879                 foreach($terms as $term){
 880                         if( ! isset($processed[$term]) ){
 881                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 882                                 $extract = preg_replace( $pat3,
 883                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 884                                 $processed[$term] = true;
 885                         }
 886                 }
 887
 888                 wfProfileOut( "$fname-extract" );
 889
 890                 return $extract;
 891         }
 892
 893         /**
 894          * Split text into lines and add it to extracts array
 895          *
 896          * @param array $extracts index -> $line
 897          * @param int $count
 898          * @param string $text
 899          */
 900         function splitAndAdd(&$extracts, &$count, $text){
 901                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 902                 foreach($split as $line){
 903                         $tt = trim($line);
 904                         if( $tt )
 905                                 $extracts[$count++] = $tt;
 906                 }
 907         }
 908
 909         /**
 910          * Do manual case conversion for non-ascii chars
 911          *
 912          * @param unknown_type $matches
 913          */
 914         function caseCallback($matches){
 915                 global $wgContLang;
 916                 if( strlen($matches[0]) > 1 ){
 917                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 918                 } else
 919                         return $matches[0];
 920         }
 921
 922         /**
 923          * Extract part of the text from start to end, but by
 924          * not chopping up words
 925          * @param string $text
 926          * @param int $start
 927          * @param int $end
 928          * @param int $posStart (out) actual start position
 929          * @param int $posEnd (out) actual end position
 930          * @return string
 931          */
 932         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 933                 global $wgContLang;
 934
 935                 if( $start != 0)
 936                         $start = $this->position( $text, $start, 1 );
 937                 if( $end >= strlen($text) )
 938                         $end = strlen($text);
 939                 else
 940                         $end = $this->position( $text, $end );
 941
 942                 if(!is_null($posStart))
 943                         $posStart = $start;
 944                 if(!is_null($posEnd))
 945                         $posEnd = $end;
 946
 947                 if($end > $start)
 948                         return substr($text, $start, $end-$start);
 949                 else
 950                         return '';
 951         }
 952
 953         /**
 954          * Find a nonletter near a point (index) in the text
 955          *
 956          * @param string $text
 957          * @param int $point
 958          * @param int $offset to found index
 959          * @return int nearest nonletter index, or beginning of utf8 char if none
 960          */
 961         function position($text, $point, $offset=0 ){
 962                 $tolerance = 10;
 963                 $s = max( 0, $point - $tolerance );
 964                 $l = min( strlen($text), $point + $tolerance ) - $s;
 965                 $m = array();
 966                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
 967                         return $m[0][1] + $s + $offset;
 968                 } else{
 969                         // check if point is on a valid first UTF8 char
 970                         $char = ord( $text[$point] );
 971                         while( $char >= 0x80 && $char < 0xc0 ) {
 972                                 // skip trailing bytes
 973                                 $point++;
 974                                 if($point >= strlen($text))
 975                                         return strlen($text);
 976                                 $char = ord( $text[$point] );
 977                         }
 978                         return $point;
 979
 980                 }
 981         }
 982
 983         /**
 984          * Search extracts for a pattern, and return snippets
 985          *
 986          * @param string $pattern regexp for matching lines
 987          * @param array $extracts extracts to search
 988          * @param int $linesleft number of extracts to make
 989          * @param int $contextchars length of snippet
 990          * @param array $out map for highlighted snippets
 991          * @param array $offsets map of starting points of snippets
 992          * @protected
 993          */
 994         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
 995                 if($linesleft == 0)
 996                         return; // nothing to do
 997                 foreach($extracts as $index => $line){
 998                         if( array_key_exists($index,$out) )
 999                                 continue; // this line already highlighted
1000
1001                         $m = array();
1002                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1003                                 continue;
1004
1005                         $offset = $m[0][1];
1006                         $len = strlen($m[0][0]);
1007                         if($offset + $len < $contextchars)
1008                                 $begin = 0;
1009                         elseif( $len > $contextchars)
1010                                 $begin = $offset;
1011                         else
1012                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1013
1014                         $end = $begin + $contextchars;
1015
1016                         $posBegin = $begin;
1017                         // basic snippet from this line
1018                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1019                         $offsets[$index] = $posBegin;
1020                         $linesleft--;
1021                         if($linesleft == 0)
1022                                 return;
1023                 }
1024         }
1025
1026         /**
1027          * Basic wikitext removal
1028          * @protected
1029          */
1030         function removeWiki($text) {
1031                 $fname = __METHOD__;
1032                 wfProfileIn( $fname );
1033
1034                 //$text = preg_replace("/'{2,5}/", "", $text);
1035                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1036                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1037                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1038                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1039                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1040                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1041                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1042                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1043                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1044                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1045                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1046                 $text = preg_replace("/'''''/", "", $text);
1047                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1048                 $text = preg_replace("/''/", "", $text);
1049
1050                 wfProfileOut( $fname );
1051                 return $text;
1052         }
1053
1054         /**
1055          * callback to replace [[target|caption]] kind of links, if
1056          * the target is category or image, leave it
1057          *
1058          * @param array $matches
1059          */
1060         function linkReplace($matches){
1061                 $colon = strpos( $matches[1], ':' );
1062                 if( $colon === false )
1063                         return $matches[2]; // replace with caption
1064                 global $wgContLang;
1065                 $ns = substr( $matches[1], 0, $colon );
1066                 $index = $wgContLang->getNsIndex($ns);
1067                 if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
1068                         return $matches[0]; // return the whole thing
1069                 else
1070                         return $matches[2];
1071
1072         }
1073
1074         /**
1075      * Simple & fast snippet extraction, but gives completely unrelevant
1076      * snippets
1077      *
1078      * @param string $text
1079      * @param array $terms
1080      * @param int $contextlines
1081      * @param int $contextchars
1082      * @return string
1083      */
1084     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1085         global $wgLang, $wgContLang;
1086         $fname = __METHOD__;
1087
1088         $lines = explode( "\n", $text );
1089
1090         $terms = implode( '|', $terms );
1091         $terms = str_replace( '/', "\\/", $terms);
1092         $max = intval( $contextchars ) + 1;
1093         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1094
1095         $lineno = 0;
1096
1097         $extract = "";
1098         wfProfileIn( "$fname-extract" );
1099         foreach ( $lines as $line ) {
1100             if ( 0 == $contextlines ) {
1101                 break;
1102             }
1103             ++$lineno;
1104             $m = array();
1105             if ( ! preg_match( $pat1, $line, $m ) ) {
1106                 continue;
1107             }
1108             --$contextlines;
1109             $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
1110
1111             if ( count( $m ) < 3 ) {
1112                 $post = '';
1113             } else {
1114                 $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
1115             }
1116
1117             $found = $m[2];
1118
1119             $line = htmlspecialchars( $pre . $found . $post );
1120             $pat2 = '/(' . $terms . ")/i";
1121             $line = preg_replace( $pat2,
1122               "<span class='searchmatch'>\\1</span>", $line );
1123
1124             $extract .= "${line}\n";
1125         }
1126         wfProfileOut( "$fname-extract" );
1127
1128         return $extract;
1129     }
1130
1131 }
1132
/**
 * Placeholder search engine for use when a non-supported Database engine
 * is present. It adds nothing of its own, so every method behaves exactly
 * as inherited from SearchEngine.
 *
 * @fixme Dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
        // intentionally empty
}