includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Base search engine class; backend-specific engines override the methods marked as abstract
  11  * @ingroup Search
  12  */
  class SearchEngine {
      /** @var int Maximum number of results to return (see setLimitOffset()) */
      public $limit = 10;
      /** @var int Number of results to skip before the first one returned */
      public $offset = 0;
      /** @var string Not used by this base class; presumably a title prefix filter for backends that support it */
      public $prefix = '';
      /** @var array Not used by this base class; presumably filled in by concrete backends */
      public $searchTerms = array();
      /** @var array|null Namespace index numbers to search; replacePrefixes() sets null for "all" */
      public $namespaces = array( NS_MAIN );
      /** @var bool Not used by this base class; presumably whether redirects are listed */
      public $showRedirects = false;

      /**
       * Perform a full text search query and return a result set.
       * This base implementation always returns null; backends override it.
       *
       * @param string $term Raw search term
       * @return SearchResultSet|null
       * @abstract
       */
      public function searchText( $term ) {
          return null;
      }

      /**
       * Perform a title-only search query and return a result set.
       * If title searches are not supported or disabled, return null.
       *
       * @param string $term Raw search term
       * @return SearchResultSet|null
       * @abstract
       */
      public function searchTitle( $term ) {
          return null;
      }

      /**
       * Whether this search backend can list/unlist redirects.
       *
       * @return bool
       */
      public function acceptListRedirects() {
          return true;
      }

      /**
       * Transform the search term in cases when parts of the query came as
       * different GET params (when supported), e.g. for prefix queries:
       * search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
       *
       * The base implementation returns the term unchanged.
       *
       * @param string $term
       * @return string
       */
      public function transformSearchTerm( $term ) {
          return $term;
      }

      /**
       * If an exact title match can be found, or a very slightly close match,
       * return the title. If there is no match, return null.
       *
       * @param string $searchterm Raw term as entered by the user
       * @return Title|null
       */
      public static function getNearMatch( $searchterm ) {
          global $wgContLang;

          // The raw term always comes first, followed by its language variants.
          $allSearchTerms = array( $searchterm );
          if ( $wgContLang->hasVariants() ) {
              $allSearchTerms = array_merge(
                  $allSearchTerms,
                  $wgContLang->convertLinkToAllVariants( $searchterm )
              );
          }

          // Case transformations tried, in this order, once the exact title
          // turned out not to exist: all lower case (i.e. first letter
          // capitalized), capitalized words, all upper case, and
          // Word-Caps-Breaking-At-Word-Breaks for hyphenated names etc.
          $caseMethods = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

          foreach ( $allSearchTerms as $term ) {
              # Exact match? No need to look further.
              $title = Title::newFromText( $term );
              if ( is_null( $title ) ) {
                  return null;
              }

              if ( $title->getNamespace() == NS_SPECIAL
                  || $title->isExternal()
                  || $title->exists()
              ) {
                  return $title;
              }

              # See if it still otherwise has content in some sane sense
              $article = MediaWiki::articleFromTitle( $title );
              if ( $article->hasViewableContent() ) {
                  return $title;
              }

              foreach ( $caseMethods as $method ) {
                  $title = Title::newFromText( $wgContLang->$method( $term ) );
                  if ( $title && $title->exists() ) {
                      return $title;
                  }
              }

              // Give hooks a chance at better match variants; a hook aborts
              // processing (returns false) to hand back its own $title.
              $title = null;
              if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
                  return $title;
              }
          }

          // Cannot be null here: the first loop iteration already returned
          // for an unparsable raw term.
          $title = Title::newFromText( $searchterm );

          # Entering an IP address goes to the contributions page
          if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
              || User::isIP( trim( $searchterm ) )
          ) {
              return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
          }

          # Entering a user goes to the user page whether it's there or not
          if ( $title->getNamespace() == NS_USER ) {
              return $title;
          }

          # Go to images that exist even if there's no local page.
          # There may have been a funny upload, or it may be on a shared
          # file repository such as Wikimedia Commons.
          if ( $title->getNamespace() == NS_FILE ) {
              $image = wfFindFile( $title );
              if ( $image ) {
                  return $title;
              }
          }

          # MediaWiki namespace? Page may be "implied" if not customized.
          # Just return it, with caps forced as the message system likes it.
          if ( $title->getNamespace() == NS_MEDIAWIKI ) {
              return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
          }

          # Quoted term? Try without the quotes...
          $matches = array();
          if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
              return SearchEngine::getNearMatch( $matches[1] );
          }

          return null;
      }

      /**
       * Character-class body (for use inside a regex [...]) listing the
       * characters that filter() leaves untouched.
       *
       * @return string
       */
      public static function legalSearchChars() {
          return "A-Za-z_'.0-9\\x80-\\xFF\\-";
      }

      /**
       * Set the maximum number of results to return
       * and how many to skip before returning the first.
       * Both values are coerced with intval().
       *
       * @param int $limit
       * @param int $offset
       */
      public function setLimitOffset( $limit, $offset = 0 ) {
          $this->limit = intval( $limit );
          $this->offset = intval( $offset );
      }

      /**
       * Set which namespaces the search should include.
       * Give an array of namespace index numbers.
       *
       * @param array $namespaces
       */
      public function setNamespaces( $namespaces ) {
          $this->namespaces = $namespaces;
      }

      /**
       * Parse some common prefixes: the localized "all" keyword (search
       * everything) or a namespace name. A recognized prefix updates
       * $this->namespaces and is stripped from the returned query.
       *
       * If stripping the prefix would leave an empty query, the original
       * query is returned (the namespace selection is still applied).
       *
       * @param string $query
       * @return string Query with any recognized prefix removed
       */
      public function replacePrefixes( $query ) {
          global $wgContLang;

          $colonPos = strpos( $query, ':' );
          if ( $colonPos === false ) {
              return $query; // nothing to do
          }

          $parsed = $query;
          $allkeyword = wfMsgForContent( 'searchall' ) . ":";
          if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
              $this->namespaces = null;
              $parsed = substr( $query, strlen( $allkeyword ) );
          } else {
              // Everything before the first colon is a candidate namespace name
              $prefix = substr( $query, 0, $colonPos );
              $index = $wgContLang->getNsIndex( $prefix );
              if ( $index !== false ) {
                  $this->namespaces = array( $index );
                  $parsed = substr( $query, strlen( $prefix ) + 1 );
              }
          }

          if ( trim( $parsed ) == '' ) {
              return $query; // prefix was the whole query
          }

          return $parsed;
      }

      /**
       * Make a list of searchable namespaces and their names, i.e. every
       * content-language namespace whose index is NS_MAIN or higher.
       *
       * @return array Map of namespace index => name
       */
      public static function searchableNamespaces() {
          global $wgContLang;

          $arr = array();
          foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
              if ( $ns >= NS_MAIN ) {
                  $arr[$ns] = $name;
              }
          }
          return $arr;
      }

      /**
       * Extract default namespaces to search from the given user's
       * settings (options named 'searchNs<index>'), returning a list of
       * index numbers.
       *
       * @param User $user
       * @return array
       */
      public static function userNamespaces( &$user ) {
          $arr = array();
          foreach ( SearchEngine::searchableNamespaces() as $ns => $name ) {
              if ( $user->getOption( 'searchNs' . $ns ) ) {
                  $arr[] = $ns;
              }
          }
          return $arr;
      }

      /**
       * Find snippet highlight settings for a given user.
       *
       * The user's 'contextlines' / 'contextchars' options are deliberately
       * ignored; the values are hardcoded because the old defaults were poor.
       *
       * @param User $user Currently unused
       * @return array contextlines, contextchars
       */
      public static function userHighlightPrefs( &$user ) {
          $contextlines = 2;
          $contextchars = 75;
          return array( $contextlines, $contextchars );
      }

      /**
       * An array of namespace indexes to be searched by default, i.e. the
       * keys of $wgNamespacesToBeSearchedDefault whose value is true.
       *
       * @return array
       */
      public static function defaultNamespaces() {
          global $wgNamespacesToBeSearchedDefault;

          return array_keys( $wgNamespacesToBeSearchedDefault, true );
      }

      /**
       * Get a list of namespace names useful for showing in tooltips
       * and preferences. Empty names are replaced by the 'blanknamespace'
       * message.
       *
       * @param array $namespaces Values passed to the content language's getFormattedNsText()
       * @return array Formatted names, keyed like the input
       */
      public static function namespacesAsText( $namespaces ) {
          global $wgContLang;

          $formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
          foreach ( $formatted as $key => $ns ) {
              if ( empty( $ns ) ) {
                  $formatted[$key] = wfMsg( 'blanknamespace' );
              }
          }
          return $formatted;
      }

      /**
       * An array of "project" namespaces indexes typically searched
       * by logged-in users, i.e. the keys of
       * $wgNamespacesToBeSearchedProject whose value is true.
       *
       * @return array
       */
      public static function projectNamespaces() {
          global $wgNamespacesToBeSearchedProject;

          return array_keys( $wgNamespacesToBeSearchedProject, true );
      }

      /**
       * An array of "project" namespaces indexes typically searched
       * by logged-in users in addition to the default namespaces.
       *
       * Each configuration array is filtered for enabled entries on its own
       * before the two lists are combined. Combining the raw arrays with "+"
       * first would let a disabled entry in the default list hide the same
       * namespace being enabled in the project list, because "+" keeps the
       * left-hand value for duplicate keys.
       *
       * @return array Default namespaces first, then additional project ones
       */
      public static function defaultAndProjectNamespaces() {
          global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;

          $default = array_keys( $wgNamespacesToBeSearchedDefault, true );
          $project = array_keys( $wgNamespacesToBeSearchedProject, true );

          return array_values( array_unique( array_merge( $default, $project ) ) );
      }

      /**
       * Return a 'cleaned up' search string: every character outside
       * legalSearchChars() becomes a space, then the result is trimmed.
       *
       * @param string $text
       * @return string
       */
      public function filter( $text ) {
          // Called through $this so a subclass override of legalSearchChars() is honoured
          $lc = $this->legalSearchChars();
          return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
      }

      /**
       * Load up the appropriate search engine class for the currently
       * active database backend, and return a configured instance.
       * $wgSearchType, when set, overrides the class suggested by the database.
       *
       * @return SearchEngine
       */
      public static function create() {
          global $wgSearchType;

          $dbr = wfGetDB( DB_SLAVE );
          if ( $wgSearchType ) {
              $class = $wgSearchType;
          } else {
              $class = $dbr->getSearchEngine();
          }
          $search = new $class( $dbr );
          $search->setLimitOffset( 0, 0 );
          return $search;
      }

      /**
       * Create or update the search index record for the given page.
       * Title and text should be pre-processed.
       *
       * @param int $id
       * @param string $title
       * @param string $text
       * @abstract
       */
      public function update( $id, $title, $text ) {
          // no-op
      }

      /**
       * Update a search index record's title only.
       * Title should be pre-processed.
       *
       * @param int $id
       * @param string $title
       * @abstract
       */
      public function updateTitle( $id, $title ) {
          // no-op
      }

      /**
       * Get OpenSearch suggestion template: $wgOpenSearchTemplate when set,
       * otherwise an api.php URL restricted to the default namespaces.
       *
       * @return string
       */
      public static function getOpenSearchTemplate() {
          global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;

          if ( $wgOpenSearchTemplate ) {
              return $wgOpenSearchTemplate;
          }

          $ns = implode( '|', SearchEngine::defaultNamespaces() );
          if ( !$ns ) {
              // Empty list (or just "0") falls back to the main namespace
              $ns = "0";
          }
          return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
      }

      /**
       * Get internal MediaWiki Suggest template: $wgMWSuggestTemplate when
       * set, otherwise an api.php URL with {searchTerms} and {namespaces}
       * placeholders.
       *
       * @return string
       */
      public static function getMWSuggestTemplate() {
          global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;

          if ( $wgMWSuggestTemplate ) {
              return $wgMWSuggestTemplate;
          }
          return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
      }
  }
 414
 415 /**
 416  * @ingroup Search
 417  */
 class SearchResultSet {
     /**
      * Regular expression fragments matching the search terms, as parsed
      * by the engine, for use against a text extract.
      * The base class has no terms and yields an empty list.
      *
      * @return array
      * @abstract
      */
     public function termMatches() {
         return array();
     }

     /**
      * Number of rows in this set; always 0 in the base class.
      *
      * @return int
      */
     public function numRows() {
         return 0;
     }

     /**
      * Tell whether this result set carries any results.
      *
      * @return bool
      * @abstract
      */
     public function hasResults() {
         return false;
     }

     /**
      * Total hit count for the query across the entire article database,
      * for search modes that provide one. The count may include pages in
      * namespaces that the current settings would not match.
      *
      * @return int|null null when no total hit count is supported
      */
     public function getTotalHits() {
         return null;
     }

     /**
      * Some search modes offer a suggested alternate term when there are
      * no exact hits; report whether this set has one.
      *
      * @return bool
      */
     public function hasSuggestion() {
         return false;
     }

     /**
      * @return string|null Suggested query, null if none
      */
     public function getSuggestionQuery() {
         return null;
     }

     /**
      * @return string HTML highlighted suggested query, '' if none
      */
     public function getSuggestionSnippet() {
         return '';
     }

     /**
      * Information about how and from where the results were fetched,
      * meant for diagnostics and debugging.
      *
      * @return string|null The base class has nothing to report and returns null
      */
     public function getInfo() {
         return null;
     }

     /**
      * Result set of hits on other (multiple) wikis associated with this one.
      *
      * @return SearchResultSet|null
      */
     public function getInterwikiResults() {
         return null;
     }

     /**
      * Check whether there are results on other wikis.
      *
      * @return boolean
      */
     public function hasInterwikiResults() {
         $interwiki = $this->getInterwikiResults();
         // Loose comparison on purpose: mirrors the historical "!= null" check
         return $interwiki != null;
     }

     /**
      * Fetch the next search result, or false when there is none.
      *
      * @return SearchResult|false
      * @abstract
      */
     public function next() {
         return false;
     }

     /**
      * Free the result set, if applicable. Nothing to release here.
      */
     public function free() {
         // ...
     }
 }
 531
 532
 533 /**
 534  * @ingroup Search
 535  */
 class SearchResultTooMany {
     // Empty marker class with no members of its own.
     // Some search engines may bail out if too many matches are found;
     // presumably an instance of this class signals that case (confirm against callers).
 }
 539
 540
 541 /**
 542  * @fixme This class is horribly factored. It would probably be better to have
 543  * a useful base class to which you pass some standard information, then let
 544  * the fancy self-highlighters extend that.
 545  * @ingroup Search
 546  */
 class SearchResult {
     /** @var Revision|null Revision looked up for the title, if any */
     public $mRevision = null;
     /** @var mixed File found for titles in NS_FILE (result of wfFindFile()), if any */
     public $mImage = null;
     /** @var Title|null Title this result points to */
     public $mTitle = null;
     /** @var string|null Page text, loaded lazily by initText(); null until then */
     public $mText = null;

     /**
      * Build a result from a row carrying page_namespace and page_title.
      *
      * @param object $row
      */
     public function __construct( $row ) {
         $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
         if ( !is_null( $this->mTitle ) ) {
             $this->mRevision = Revision::newFromTitle( $this->mTitle );
             if ( $this->mTitle->getNamespace() === NS_FILE ) {
                 $this->mImage = wfFindFile( $this->mTitle );
             }
         }
     }

     /**
      * Check if this result points to an invalid title.
      *
      * @return boolean
      */
     public function isBrokenTitle() {
         return is_null( $this->mTitle );
     }

     /**
      * Check if target page is missing, happens when index is out of date.
      * A result counts as missing only when it has neither a revision nor a file.
      *
      * @return boolean
      */
     public function isMissingRevision() {
         return !$this->mRevision && !$this->mImage;
     }

     /**
      * @return Title
      */
     public function getTitle() {
         return $this->mTitle;
     }

     /**
      * @return double or null if not supported
      */
     public function getScore() {
         return null;
     }

     /**
      * Lazy initialization of article text from DB.
      * Without a revision the text is the empty string.
      */
     protected function initText() {
         if ( !isset( $this->mText ) ) {
             if ( $this->mRevision != null ) {
                 $this->mText = $this->mRevision->getText();
             } else {
                 // TODO: can we fetch raw wikitext for commons images?
                 $this->mText = '';
             }
         }
     }

     /**
      * Highlight the given terms in the page text, using the advanced or the
      * simple highlighter depending on $wgAdvancedSearchHighlighting.
      *
      * @param array $terms terms to highlight
      * @return string highlighted text snippet, null (and not '') if not supported
      */
     public function getTextSnippet( $terms ) {
         global $wgUser, $wgAdvancedSearchHighlighting;

         $this->initText();
         list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
         $h = new SearchHighlighter();
         if ( $wgAdvancedSearchHighlighting ) {
             return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
         }
         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
     }

     /**
      * @param array $terms terms to highlight
      * @return string highlighted title, '' if not supported
      */
     public function getTitleSnippet( $terms ) {
         return '';
     }

     /**
      * @param array $terms terms to highlight
      * @return string highlighted redirect name (redirect to this page), '' if none or not supported
      */
     public function getRedirectSnippet( $terms ) {
         return '';
     }

     /**
      * @return Title object for the redirect to this page, null if none or not supported
      */
     public function getRedirectTitle() {
         return null;
     }

     /**
      * @return string highlighted relevant section name, '' if none or not supported
      */
     public function getSectionSnippet() {
         return '';
     }

     /**
      * @return Title object (pagename+fragment) for the section, null if none or not supported
      */
     public function getSectionTitle() {
         return null;
     }

     /**
      * Timestamp of the revision, falling back to the file's timestamp.
      *
      * @return string timestamp, '' when neither a revision nor a file is available
      */
     public function getTimestamp() {
         if ( $this->mRevision ) {
             return $this->mRevision->getTimestamp();
         } else if ( $this->mImage ) {
             return $this->mImage->getTimestamp();
         }
         return '';
     }

     /**
      * @return int number of words, as counted by str_word_count()
      */
     public function getWordCount() {
         $this->initText();
         return str_word_count( $this->mText );
     }

     /**
      * @return int size in bytes
      */
     public function getByteSize() {
         $this->initText();
         return strlen( $this->mText );
     }

     /**
      * @return boolean if hit has related articles
      */
     public function hasRelated() {
         return false;
     }

     /**
      * @return interwiki prefix of the title (return iw even if title is broken)
      */
     public function getInterwikiPrefix() {
         return '';
     }
 }
 703
 704 /**
 705  * Highlight bits of wikitext
 706  *
 707  * @ingroup Search
 708  */
 709 class SearchHighlighter {
 710         var $mCleanWikitext = true;
 711
 712         function SearchHighlighter($cleanupWikitext = true){
 713                 $this->mCleanWikitext = $cleanupWikitext;
 714         }
 715
 716         /**
 717          * Default implementation of wikitext highlighting
 718          *
 719          * @param string $text
 720          * @param array $terms Terms to highlight (unescaped)
 721          * @param int $contextlines
 722          * @param int $contextchars
 723          * @return string
 724          */
 725         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 726                 global $wgLang, $wgContLang;
 727                 global $wgSearchHighlightBoundaries;
 728                 $fname = __METHOD__;
 729
 730                 if($text == '')
 731                         return '';
 732
 733                 // spli text into text + templates/links/tables
 734                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 735                 // first capture group is for detecting nested templates/links/tables/references
 736                 $endPatterns = array(
 737                         1 => '/(\{\{)|(\}\})/', // template
 738                         2 => '/(\[\[)|(\]\])/', // image
 739                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 740
 741                 // FIXME: this should prolly be a hook or something
 742                 if(function_exists('wfCite')){
 743                         $spat .= '|(<ref>)'; // references via cite extension
 744                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 745                 }
 746                 $spat .= '/';
 747                 $textExt = array(); // text extracts
 748                 $otherExt = array();  // other extracts
 749                 wfProfileIn( "$fname-split" );
 750                 $start = 0;
 751                 $textLen = strlen($text);
 752                 $count = 0; // sequence number to maintain ordering
 753                 while( $start < $textLen ){
 754                         // find start of template/image/table
 755                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 756                                 $epat = '';
 757                                 foreach($matches as $key => $val){
 758                                         if($key > 0 && $val[1] != -1){
 759                                                 if($key == 2){
 760                                                         // see if this is an image link
 761                                                         $ns = substr($val[0],2,-1);
 762                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 763                                                                 break;
 764
 765                                                 }
 766                                                 $epat = $endPatterns[$key];
 767                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 768                                                 $start = $val[1];
 769                                                 break;
 770                                         }
 771                                 }
 772                                 if( $epat ){
 773                                         // find end (and detect any nested elements)
 774                                         $level = 0;
 775                                         $offset = $start + 1;
 776                                         $found = false;
 777                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 778                                                 if( array_key_exists(2,$endMatches) ){
 779                                                         // found end
 780                                                         if($level == 0){
 781                                                                 $len = strlen($endMatches[2][0]);
 782                                                                 $off = $endMatches[2][1];
 783                                                                 $this->splitAndAdd( $otherExt, $count,
 784                                                                         substr( $text, $start, $off + $len  - $start ) );
 785                                                                 $start = $off + $len;
 786                                                                 $found = true;
 787                                                                 break;
 788                                                         } else{
 789                                                                 // end of nested element
 790                                                                 $level -= 1;
 791                                                         }
 792                                                 } else{
 793                                                         // nested
 794                                                         $level += 1;
 795                                                 }
 796                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 797                                         }
 798                                         if( ! $found ){
 799                                                 // couldn't find appropriate closing tag, skip
 800                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 801                                                 $start += strlen($matches[0][0]);
 802                                         }
 803                                         continue;
 804                                 }
 805                         }
 806                         // else: add as text extract
 807                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 808                         break;
 809                 }
 810
 811                 $all = $textExt + $otherExt; // these have disjunct key sets
 812
 813                 wfProfileOut( "$fname-split" );
 814
 815                 // prepare regexps
 816                 foreach( $terms as $index => $term ) {
 817                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 818                         if(preg_match('/[\x80-\xff]/', $term) ){
 819                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 820                         } else {
 821                                 $terms[$index] = $term;
 822                         }
 823                 }
 824                 $anyterm = implode( '|', $terms );
 825                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 826
 827                 // FIXME: a hack to scale contextchars, a correct solution
 828                 // would be to have contextchars actually be char and not byte
 829                 // length, and do proper utf-8 substrings and lengths everywhere,
 830                 // but PHP is making that very hard and unclean to implement :(
 831                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 832                 $contextchars = intval( $contextchars * $scale );
 833
 834                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 835                 $patPost = "($wgSearchHighlightBoundaries|$)";
 836
 837                 $pat1 = "/(".$phrase.")/ui";
 838                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 839
 840                 wfProfileIn( "$fname-extract" );
 841
 842                 $left = $contextlines;
 843
 844                 $snippets = array();
 845                 $offsets = array();
 846
 847                 // show beginning only if it contains all words
 848                 $first = 0;
 849                 $firstText = '';
 850                 foreach($textExt as $index => $line){
 851                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 852                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 853                                 $first = $index;
 854                                 break;
 855                         }
 856                 }
 857                 if( $firstText ){
 858                         $succ = true;
 859                         // check if first text contains all terms
 860                         foreach($terms as $term){
 861                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 862                                         $succ = false;
 863                                         break;
 864                                 }
 865                         }
 866                         if( $succ ){
 867                                 $snippets[$first] = $firstText;
 868                                 $offsets[$first] = 0;
 869                         }
 870                 }
 871                 if( ! $snippets ) {
 872                         // match whole query on text
 873                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 874                         // match whole query on templates/tables/images
 875                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 876                         // match any words on text
 877                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 878                         // match any words on templates/tables/images
 879                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 880
 881                         ksort($snippets);
 882                 }
 883
 884                 // add extra chars to each snippet to make snippets constant size
 885                 $extended = array();
 886                 if( count( $snippets ) == 0){
 887                         // couldn't find the target words, just show beginning of article
 888                         $targetchars = $contextchars * $contextlines;
 889                         $snippets[$first] = '';
 890                         $offsets[$first] = 0;
 891                 } else{
 892                         // if begin of the article contains the whole phrase, show only that !!
 893                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 894                             && $offsets[$first] < $contextchars * 2 ){
 895                                 $snippets = array ($first => $snippets[$first]);
 896                         }
 897
 898                         // calc by how much to extend existing snippets
 899                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 900                 }
 901
 902                 foreach($snippets as $index => $line){
 903                         $extended[$index] = $line;
 904                         $len = strlen($line);
 905                         if( $len < $targetchars - 20 ){
 906                                 // complete this line
 907                                 if($len < strlen( $all[$index] )){
 908                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 909                                         $len = strlen( $extended[$index] );
 910                                 }
 911
 912                                 // add more lines
 913                                 $add = $index + 1;
 914                                 while( $len < $targetchars - 20
 915                                        && array_key_exists($add,$all)
 916                                        && !array_key_exists($add,$snippets) ){
 917                                     $offsets[$add] = 0;
 918                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 919                                         $extended[$add] = $tt;
 920                                         $len += strlen( $tt );
 921                                         $add++;
 922                                 }
 923                         }
 924                 }
 925
 926                 //$snippets = array_map('htmlspecialchars', $extended);
 927                 $snippets = $extended;
 928                 $last = -1;
 929                 $extract = '';
 930                 foreach($snippets as $index => $line){
 931                         if($last == -1)
 932                                 $extract .= $line; // first line
 933                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 934                                 $extract .= " ".$line; // continous lines
 935                         else
 936                                 $extract .= '<b> ... </b>' . $line;
 937
 938                         $last = $index;
 939                 }
 940                 if( $extract )
 941                         $extract .= '<b> ... </b>';
 942
 943                 $processed = array();
 944                 foreach($terms as $term){
 945                         if( ! isset($processed[$term]) ){
 946                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 947                                 $extract = preg_replace( $pat3,
 948                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 949                                 $processed[$term] = true;
 950                         }
 951                 }
 952
 953                 wfProfileOut( "$fname-extract" );
 954
 955                 return $extract;
 956         }
 957
 958         /**
 959          * Split text into lines and add it to extracts array
 960          *
 961          * @param array $extracts index -> $line
 962          * @param int $count
 963          * @param string $text
 964          */
 965         function splitAndAdd(&$extracts, &$count, $text){
 966                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 967                 foreach($split as $line){
 968                         $tt = trim($line);
 969                         if( $tt )
 970                                 $extracts[$count++] = $tt;
 971                 }
 972         }
 973
 974         /**
 975          * Do manual case conversion for non-ascii chars
 976          *
 977          * @param unknown_type $matches
 978          */
 979         function caseCallback($matches){
 980                 global $wgContLang;
 981                 if( strlen($matches[0]) > 1 ){
 982                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 983                 } else
 984                         return $matches[0];
 985         }
 986
 987         /**
 988          * Extract part of the text from start to end, but by
 989          * not chopping up words
 990          * @param string $text
 991          * @param int $start
 992          * @param int $end
 993          * @param int $posStart (out) actual start position
 994          * @param int $posEnd (out) actual end position
 995          * @return string
 996          */
 997         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 998                 global $wgContLang;
 999
1000                 if( $start != 0)
1001                         $start = $this->position( $text, $start, 1 );
1002                 if( $end >= strlen($text) )
1003                         $end = strlen($text);
1004                 else
1005                         $end = $this->position( $text, $end );
1006
1007                 if(!is_null($posStart))
1008                         $posStart = $start;
1009                 if(!is_null($posEnd))
1010                         $posEnd = $end;
1011
1012                 if($end > $start)
1013                         return substr($text, $start, $end-$start);
1014                 else
1015                         return '';
1016         }
1017
1018         /**
1019          * Find a nonletter near a point (index) in the text
1020          *
1021          * @param string $text
1022          * @param int $point
1023          * @param int $offset to found index
1024          * @return int nearest nonletter index, or beginning of utf8 char if none
1025          */
1026         function position($text, $point, $offset=0 ){
1027                 $tolerance = 10;
1028                 $s = max( 0, $point - $tolerance );
1029                 $l = min( strlen($text), $point + $tolerance ) - $s;
1030                 $m = array();
1031                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
1032                         return $m[0][1] + $s + $offset;
1033                 } else{
1034                         // check if point is on a valid first UTF8 char
1035                         $char = ord( $text[$point] );
1036                         while( $char >= 0x80 && $char < 0xc0 ) {
1037                                 // skip trailing bytes
1038                                 $point++;
1039                                 if($point >= strlen($text))
1040                                         return strlen($text);
1041                                 $char = ord( $text[$point] );
1042                         }
1043                         return $point;
1044
1045                 }
1046         }
1047
1048         /**
1049          * Search extracts for a pattern, and return snippets
1050          *
1051          * @param string $pattern regexp for matching lines
1052          * @param array $extracts extracts to search
1053          * @param int $linesleft number of extracts to make
1054          * @param int $contextchars length of snippet
1055          * @param array $out map for highlighted snippets
1056          * @param array $offsets map of starting points of snippets
1057          * @protected
1058          */
1059         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1060                 if($linesleft == 0)
1061                         return; // nothing to do
1062                 foreach($extracts as $index => $line){
1063                         if( array_key_exists($index,$out) )
1064                                 continue; // this line already highlighted
1065
1066                         $m = array();
1067                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1068                                 continue;
1069
1070                         $offset = $m[0][1];
1071                         $len = strlen($m[0][0]);
1072                         if($offset + $len < $contextchars)
1073                                 $begin = 0;
1074                         elseif( $len > $contextchars)
1075                                 $begin = $offset;
1076                         else
1077                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1078
1079                         $end = $begin + $contextchars;
1080
1081                         $posBegin = $begin;
1082                         // basic snippet from this line
1083                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1084                         $offsets[$index] = $posBegin;
1085                         $linesleft--;
1086                         if($linesleft == 0)
1087                                 return;
1088                 }
1089         }
1090
1091         /**
1092          * Basic wikitext removal
1093          * @protected
1094          */
1095         function removeWiki($text) {
1096                 $fname = __METHOD__;
1097                 wfProfileIn( $fname );
1098
1099                 //$text = preg_replace("/'{2,5}/", "", $text);
1100                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1101                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1102                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1103                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1104                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1105                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1106                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1107                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1108                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1109                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1110                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1111                 $text = preg_replace("/'''''/", "", $text);
1112                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1113                 $text = preg_replace("/''/", "", $text);
1114
1115                 wfProfileOut( $fname );
1116                 return $text;
1117         }
1118
1119         /**
1120          * callback to replace [[target|caption]] kind of links, if
1121          * the target is category or image, leave it
1122          *
1123          * @param array $matches
1124          */
1125         function linkReplace($matches){
1126                 $colon = strpos( $matches[1], ':' );
1127                 if( $colon === false )
1128                         return $matches[2]; // replace with caption
1129                 global $wgContLang;
1130                 $ns = substr( $matches[1], 0, $colon );
1131                 $index = $wgContLang->getNsIndex($ns);
1132                 if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
1133                         return $matches[0]; // return the whole thing
1134                 else
1135                         return $matches[2];
1136
1137         }
1138
1139         /**
1140      * Simple & fast snippet extraction, but gives completely unrelevant
1141      * snippets
1142      *
1143      * @param string $text
1144      * @param array $terms
1145      * @param int $contextlines
1146      * @param int $contextchars
1147      * @return string
1148      */
1149     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1150         global $wgLang, $wgContLang;
1151         $fname = __METHOD__;
1152
1153         $lines = explode( "\n", $text );
1154
1155         $terms = implode( '|', $terms );
1156         $max = intval( $contextchars ) + 1;
1157         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1158
1159         $lineno = 0;
1160
1161         $extract = "";
1162         wfProfileIn( "$fname-extract" );
1163         foreach ( $lines as $line ) {
1164             if ( 0 == $contextlines ) {
1165                 break;
1166             }
1167             ++$lineno;
1168             $m = array();
1169             if ( ! preg_match( $pat1, $line, $m ) ) {
1170                 continue;
1171             }
1172             --$contextlines;
1173             $pre = $wgContLang->truncate( $m[1], -$contextchars );
1174
1175             if ( count( $m ) < 3 ) {
1176                 $post = '';
1177             } else {
1178                 $post = $wgContLang->truncate( $m[3], $contextchars );
1179             }
1180
1181             $found = $m[2];
1182
1183             $line = htmlspecialchars( $pre . $found . $post );
1184             $pat2 = '/(' . $terms . ")/i";
1185             $line = preg_replace( $pat2,
1186               "<span class='searchmatch'>\\1</span>", $line );
1187
1188             $extract .= "${line}\n";
1189         }
1190         wfProfileOut( "$fname-extract" );
1191
1192         return $extract;
1193     }
1194
1195 }
1196
/**
 * Dummy class to be used when non-supported Database engine is present.
 *
 * Declares nothing of its own: every property and method is inherited
 * unchanged from SearchEngine.
 *
 * @fixme Dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
        // no-op
}