WARNING: HUGE COMMIT
[lhc/web/wiklou.git] / includes / SearchEngine.php
1 <?php
2 /**
3 * @defgroup Search Search
4 *
5 * @file
6 * @ingroup Search
7 */
8
/**
 * Base search engine class.
 *
 * Stores the query settings shared by search backends (limit, offset,
 * namespaces) and provides static helpers for near-match lookup, namespace
 * selection and backend creation. The search and index-update methods
 * defined here are no-op stubs that return null / do nothing.
 * @ingroup Search
 */
class SearchEngine {
	var $limit = 10;
	var $offset = 0;
	var $searchTerms = array();
	var $namespaces = array( NS_MAIN );
	var $showRedirects = false;

	/**
	 * Perform a full text search query and return a result set.
	 * If text searches are not supported or disabled, return null.
	 *
	 * @param string $term - Raw search term
	 * @return SearchResultSet
	 * @access public
	 * @abstract
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query and return a result set.
	 * If title searches are not supported or disabled, return null.
	 *
	 * @param string $term - Raw search term
	 * @return SearchResultSet
	 * @access public
	 * @abstract
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * If an exact title match can be found, or a very slightly close match,
	 * return the title. If no match, returns null.
	 *
	 * The raw term and (when the content language has variants) each of its
	 * variants are tried in turn: first as-is, then in several letter-case
	 * forms, then via the 'SearchGetNearMatch' hook. If none of those hit,
	 * a few special cases are applied to the raw term (IP addresses, user
	 * pages, existing files, MediaWiki-namespace messages, quoted terms).
	 *
	 * @param string $searchterm
	 * @return Title
	 */
	public static function getNearMatch( $searchterm ) {
		global $wgContLang, $wgCapitalLinks;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge( $allSearchTerms,
				$wgContLang->convertLinkToAllVariants( $searchterm ) );
		}

		foreach ( $allSearchTerms as $term ) {

			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			if ( $title->getNamespace() == NS_SPECIAL || $title->exists() ) {
				return $title;
			}

			# Now try all lower case (i.e. first letter capitalized)
			$title = self::existingTitleFromText( $wgContLang->lc( $term ) );
			if ( $title ) {
				return $title;
			}

			# Now try capitalized string
			$title = self::existingTitleFromText( $wgContLang->ucwords( $term ) );
			if ( $title ) {
				return $title;
			}

			# Now try all upper case
			$title = self::existingTitleFromText( $wgContLang->uc( $term ) );
			if ( $title ) {
				return $title;
			}

			# Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
			$title = self::existingTitleFromText( $wgContLang->ucwordbreaks( $term ) );
			if ( $title ) {
				return $title;
			}

			if ( !$wgCapitalLinks ) {
				// Catch differs-by-first-letter-case-only
				$title = self::existingTitleFromText( $wgContLang->ucfirst( $term ) );
				if ( $title ) {
					return $title;
				}
				$title = self::existingTitleFromText( $wgContLang->lcfirst( $term ) );
				if ( $title ) {
					return $title;
				}
			}

			// Give hooks a chance at better match variants.
			// A hook that returns false ends the lookup with whatever it
			// stored in $title (possibly still null).
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		// Cannot be null here: $searchterm is the first entry tried by the
		// loop above, which returns early when it is not a valid title.
		$title = Title::newFromText( $searchterm );

		# Entering an IP address goes to the contributions page
		if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
			|| User::isIP( trim( $searchterm ) ) ) {
			return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_IMAGE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return SearchEngine::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Build a title from text and return it only if it names an existing page.
	 *
	 * Title::newFromText() can return null (getNearMatch() checks for that
	 * on the raw term), so the result is tested before it is dereferenced.
	 *
	 * @param string $text
	 * @return Title the existing title, or null if invalid or nonexistent
	 */
	private static function existingTitleFromText( $text ) {
		$title = Title::newFromText( $text );
		if ( !is_null( $title ) && $title->exists() ) {
			return $title;
		}
		return null;
	}

	/**
	 * Character-class body (for use inside a regex [...] set) listing the
	 * characters filter() keeps in a search string.
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return
	 * and how many to skip before returning the first.
	 * Both values are coerced to integers.
	 *
	 * @param int $limit
	 * @param int $offset
	 * @access public
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 * Give an array of namespace index numbers.
	 *
	 * @param array $namespaces
	 * @access public
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse some common prefixes: all (search everything)
	 * or namespace names.
	 *
	 * A recognised prefix updates $this->namespaces (null for the "all"
	 * keyword, a single-element array for a namespace name) and is stripped
	 * from the returned query. If stripping would leave an empty query, the
	 * original query is returned instead; note that $this->namespaces has
	 * already been updated in that case.
	 *
	 * @param string $query
	 * @return string query with any recognised prefix removed
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		if ( strpos( $query, ':' ) === false ) {
			return $query; // nothing to do
		}

		$parsed = $query;
		$allkeyword = wfMsgForContent( 'searchall' ) . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} else {
			// A colon is known to be present (checked above)
			$prefix = substr( $query, 0, strpos( $query, ':' ) );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, strlen( $prefix ) + 1 );
			}
		}
		if ( trim( $parsed ) == '' ) {
			return $query; // prefix was the whole query
		}

		return $parsed;
	}

	/**
	 * Make a list of searchable namespaces and their canonical names.
	 * Namespaces with an index below NS_MAIN are left out.
	 *
	 * @return array namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}
		return $arr;
	}

	/**
	 * Extract default namespaces to search from the given user's
	 * settings, returning a list of index numbers.
	 *
	 * A namespace is included when the user's 'searchNs<index>' option
	 * is set to a true value.
	 *
	 * @param User $user
	 * @return array
	 * @static
	 */
	public static function userNamespaces( &$user ) {
		$arr = array();
		foreach ( SearchEngine::searchableNamespaces() as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}
		return $arr;
	}

	/**
	 * Find snippet highlight settings for a given user.
	 *
	 * The values are currently hardcoded; $user is not consulted.
	 *
	 * @param User $user
	 * @return array contextlines, contextchars
	 * @static
	 */
	public static function userHighlightPrefs( &$user ) {
		//$contextlines = $user->getOption( 'contextlines', 5 );
		//$contextchars = $user->getOption( 'contextchars', 50 );
		$contextlines = 2; // Hardcode this. Old defaults sucked. :)
		$contextchars = 75; // same as above.... :P
		return array( $contextlines, $contextchars );
	}

	/**
	 * An array of namespace indexes to be searched by default,
	 * i.e. the keys of $wgNamespacesToBeSearchedDefault whose value is true.
	 *
	 * @return array
	 * @static
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Return a 'cleaned up' search string: every character outside
	 * legalSearchChars() is replaced by a space and the result is trimmed.
	 *
	 * @param string $text
	 * @return string
	 * @access public
	 */
	function filter( $text ) {
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Load up the appropriate search engine class for the currently
	 * active database backend, and return a configured instance.
	 *
	 * $wgSearchType, when set, names the class to use; otherwise the class
	 * is chosen from $wgDBtype, falling back to SearchEngineDummy.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgDBtype, $wgSearchType;
		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} elseif ( $wgDBtype == 'mysql' ) {
			$class = 'SearchMySQL';
		} elseif ( $wgDBtype == 'postgres' ) {
			$class = 'SearchPostgres';
		} elseif ( $wgDBtype == 'oracle' ) {
			$class = 'SearchOracle';
		} else {
			$class = 'SearchEngineDummy';
		}
		$search = new $class( wfGetDB( DB_SLAVE ) );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 * @abstract
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 * @abstract
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get OpenSearch suggestion template.
	 *
	 * Returns $wgOpenSearchTemplate when configured, otherwise an api.php
	 * URL restricted to the default namespaces ("0" if there are none).
	 *
	 * @return string
	 * @static
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}
		$ns = implode( ',', SearchEngine::defaultNamespaces() );
		if ( !$ns ) {
			$ns = "0";
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}

	/**
	 * Get internal MediaWiki Suggest template.
	 *
	 * Returns $wgMWSuggestTemplate when configured, otherwise an api.php
	 * URL with a {namespaces} placeholder.
	 *
	 * @return string
	 * @static
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
	}
}
368
/**
 * Default (empty) search result set; every accessor reports "nothing".
 * @ingroup Search
 */
class SearchResultSet {
	/**
	 * Regular expression fragments that match the search terms, as this
	 * engine parsed them, inside a text extract.
	 *
	 * @return array
	 * @access public
	 * @abstract
	 */
	public function termMatches() {
		return array();
	}

	/**
	 * @return int row count; 0 in this base implementation
	 */
	public function numRows() {
		return 0;
	}

	/**
	 * Whether this result set includes any results.
	 *
	 * @return bool
	 * @abstract
	 */
	public function hasResults() {
		return false;
	}

	/**
	 * Total hit count for the query over the entire article database,
	 * for search modes that provide one. The count may include pages in
	 * namespaces the current settings would not match.
	 *
	 * @return int null when no total hit count is supported
	 * @access public
	 */
	public function getTotalHits() {
		return null;
	}

	/**
	 * Whether a suggested alternate term is available on this set; some
	 * search modes offer one when there are no exact hits.
	 *
	 * @return bool
	 * @access public
	 */
	public function hasSuggestion() {
		return false;
	}

	/**
	 * @return string suggested query, null if none
	 */
	public function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return string highlighted suggested query, '' if none
	 */
	public function getSuggestionSnippet() {
		return '';
	}

	/**
	 * Diagnostic information on how and from where the results were
	 * fetched.
	 *
	 * @return string
	 */
	public function getInfo() {
		return null;
	}

	/**
	 * Result set of hits on other (multiple) wikis associated with this one.
	 *
	 * @return SearchResultSet
	 */
	public function getInterwikiResults() {
		return null;
	}

	/**
	 * Check if there are results on other wikis.
	 *
	 * @return boolean
	 */
	public function hasInterwikiResults() {
		$interwiki = $this->getInterwikiResults();
		// Loose comparison kept on purpose
		return $interwiki != null;
	}

	/**
	 * Fetch the next search result, or false when there is none.
	 *
	 * @return SearchResult
	 * @access public
	 * @abstract
	 */
	public function next() {
		return false;
	}

	/**
	 * Free the result set, if applicable.
	 * @access public
	 */
	public function free() {
		// nothing to release here
	}
}
485
486
/**
 * Marker class with no members: some search engines may bail out if too
 * many matches are found.
 * @ingroup Search
 */
class SearchResultTooMany {
}
493
494
/**
 * A single search hit: a page title plus the revision whose text is used
 * for snippets and size information.
 * @ingroup Search
 */
class SearchResult {
	var $mRevision = null;
	// Declared explicitly (they used to be created dynamically)
	var $mTitle = null;
	var $mText = null;

	/**
	 * @param object $row result row providing page_namespace and page_title
	 */
	function __construct( $row ) {
		$this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
		if ( !is_null( $this->mTitle ) ) {
			$this->mRevision = Revision::newFromTitle( $this->mTitle );
		}
	}

	/**
	 * Legacy PHP 4-style constructor name, kept so that explicit calls
	 * (e.g. parent::SearchResult( $row )) continue to work. PHP 8 no longer
	 * treats a method named after the class as the constructor, which is
	 * why the real work lives in __construct().
	 *
	 * @param object $row
	 */
	function SearchResult( $row ) {
		$this->__construct( $row );
	}

	/**
	 * Check if this result points to an invalid title
	 *
	 * @return boolean
	 * @access public
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * Check if target page is missing, happens when index is out of date
	 *
	 * @return boolean
	 * @access public
	 */
	function isMissingRevision() {
		return !$this->mRevision;
	}

	/**
	 * @return Title
	 * @access public
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return double or null if not supported
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazy initialization of article text from the revision.
	 * When the revision is missing (see isMissingRevision()) the text is
	 * set to the empty string instead of dereferencing null.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			$this->mText = $this->mRevision ? $this->mRevision->getText() : '';
		}
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted text snippet, null (and not '') if not supported
	 */
	function getTextSnippet( $terms ) {
		global $wgUser, $wgAdvancedSearchHighlighting;
		$this->initText();
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		} else {
			return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
		}
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted title, '' if not supported
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted redirect name (redirect to this page), '' if none or not supported
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title object for the redirect to this page, null if none or not supported
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string highlighted relevant section name, '' if none or not supported
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title object (pagename+fragment) for the section, null if none or not supported
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * @return string timestamp of the revision, or null if the revision is missing
	 */
	function getTimestamp() {
		if ( !$this->mRevision ) {
			return null;
		}
		return $this->mRevision->getTimestamp();
	}

	/**
	 * @return int number of words
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return int size in bytes
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return boolean if hit has related articles
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return interwiki prefix of the title (return iw even if title is broken)
	 */
	function getInterwikiPrefix() {
		return '';
	}
}
644
/**
 * Highlight bits of wikitext
 *
 * Produces HTML snippets in which matched search terms are wrapped in
 * <span class='searchmatch'>.
 *
 * @ingroup Search
 */
class SearchHighlighter {
	var $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext whether splitAndAdd() strips wiki markup
	 *   (via removeWiki()) before extracts are stored
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Legacy PHP 4-style constructor name, kept so that explicit calls
	 * continue to work. PHP 8 no longer treats a method named after the
	 * class as the constructor, which is why __construct() exists.
	 *
	 * @param bool $cleanupWikitext
	 */
	function SearchHighlighter( $cleanupWikitext = true ) {
		$this->__construct( $cleanupWikitext );
	}

	/**
	 * Default implementation of wikitext highlighting
	 *
	 * The text is split into plain-text extracts and "other" extracts
	 * (templates, image links, tables and, when wfCite exists, references).
	 * Snippets are picked from lines matching the terms, padded towards a
	 * common size, HTML-escaped, joined, and finally the terms are wrapped
	 * in highlight spans.
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight (unescaped)
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string HTML
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' )
			return '';

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table

		// FIXME: this should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_IMAGE )
								break;

						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			$terms[$index] = preg_quote( $term, '/' );
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// FIXME: a hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list gives an empty $anyterm; use a neutral scale
		// then instead of dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				// ($all may lack $index when the text produced no extracts at all)
				if ( isset( $all[$index] ) && $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// The snippets are page text going into HTML output, so escape them
		// (as highlightSimple() does) before any markup is added. The
		// continuity test below still measures the unescaped lengths, which
		// are the ones $offsets and $all refer to.
		// Known limitation: highlighting runs on the escaped text, so terms
		// containing HTML special characters are not highlighted.
		$last = -1;
		$extract = '';
		foreach ( $extended as $index => $line ) {
			$escaped = htmlspecialchars( $line );
			if ( $last == -1 ) {
				$extract .= $escaped; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $extended[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $escaped; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $escaped;
			}

			$last = $index;
		}
		if ( $extract )
			$extract .= '<b> ... </b>';

		$processed = array();
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * Each line is trimmed; lines that are empty after trimming are skipped.
	 * When mCleanWikitext is set the text is passed through removeWiki()
	 * first.
	 *
	 * @param array $extracts index -> $line
	 * @param int $count next free index, incremented for every line added
	 * @param string $text
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// Compare against '' explicitly: a line consisting of "0" is
			// falsy in PHP but is real content.
			if ( $tt !== '' )
				$extracts[$count++] = $tt;
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * Callback for preg_replace_callback(): a multi-byte match is replaced
	 * by a character class holding its lower- and upper-case forms;
	 * single-byte matches are returned unchanged.
	 *
	 * @param array $matches
	 * @return string
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else
			return $matches[0];
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * $posStart / $posEnd are only written when the caller passed a
	 * non-null variable.
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int $posStart (out) actual start position
	 * @param int $posEnd (out) actual end position
	 * @return string
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 )
			$start = $this->position( $text, $start, 1 );
		if ( $end >= strlen( $text ) )
			$end = strlen( $text );
		else
			$end = $this->position( $text, $end );

		if ( !is_null( $posStart ) )
			$posStart = $start;
		if ( !is_null( $posEnd ) )
			$posEnd = $end;

		if ( $end > $start )
			return substr( $text, $start, $end - $start );
		else
			return '';
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks at the window of up to 10 bytes on either side of $point and
	 * uses the first nonletter found in that window.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset to found index
	 * @return int index of the nonletter found (plus $offset), or beginning
	 *   of utf8 char if none
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// Nothing to inspect at or past the end of the string; return
			// the point unchanged rather than reading an invalid offset.
			if ( $point >= strlen( $text ) )
				return $point;
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) )
					return strlen( $text );
				$char = ord( $text[$point] );
			}
			return $point;
		}
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * @param string $pattern regexp for matching lines
	 * @param array $extracts extracts to search
	 * @param int $linesleft number of extracts to make
	 * @param int $contextchars length of snippet
	 * @param array $out map for highlighted snippets
	 * @param array $offsets map of starting points of snippets
	 * @protected
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 )
			return; // nothing to do
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) )
				continue; // this line already highlighted

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
				continue;

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars )
				$begin = 0;
			elseif ( $len > $contextchars )
				$begin = $offset;
			else
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 )
				return;
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Strips templates, unwraps links (see linkReplace() for piped links),
	 * removes tags and bold/italic quote markup.
	 *
	 * @param string $text
	 * @return string
	 * @protected
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches
	 * @return string
	 */
	function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false )
			return $matches[2]; // replace with caption
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_IMAGE || $index == NS_CATEGORY ) )
			return $matches[0]; // return the whole thing
		else
			return $matches[2];
	}

	/**
	 * Simple & fast snippet extraction, but gives completely unrelevant
	 * snippets
	 *
	 * Unlike highlightText(), the terms are inserted into the regular
	 * expression as-is (only '/' is escaped).
	 *
	 * @param string $text
	 * @param array $terms
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string HTML, one line per snippet
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$terms = str_replace( '/', "\\/", $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			$pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2,
				"<span class='searchmatch'>\\1</span>", $line );

			$extract .= $line . "\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

}
1139
/**
 * No-op search engine; SearchEngine::create() falls back to it when no
 * backend class matches the configured database type.
 *
 * Method signatures accept the same calls as their SearchEngine
 * counterparts so this object can stand in for a real backend.
 * @ingroup Search
 */
class SearchEngineDummy {
	/**
	 * @param string $term
	 * @return null always; nothing is searched
	 */
	function search( $term ) {
		return null;
	}

	/**
	 * Accepted and ignored. $o is optional, matching
	 * SearchEngine::setLimitOffset(), so single-argument calls work.
	 *
	 * @param int $l limit
	 * @param int $o offset
	 */
	function setLimitOffset( $l, $o = 0 ) {}

	/** No character set is defined for the dummy engine; returns null. */
	function legalSearchChars() {}

	/** Index updates are ignored. */
	function update() {}

	/** Index title updates are ignored. */
	function updateTitle() {}

	/** Namespace selection is ignored. */
	function setNamespaces() {}

	/** @return null always */
	function searchTitle() {}

	/** @return null always */
	function searchText() {}

	/**
	 * No prefixes are interpreted; the query is returned unchanged.
	 *
	 * @param string $query
	 * @return string
	 */
	function replacePrefixes( $query ) {
		return $query;
	}
}
1152 function searchtext() {}
1153 }