Make use of Title::isSpecialPage() in various places
[lhc/web/wiklou.git] / includes / search / SearchEngine.php
1 <?php
2 /**
3 * Basic search engine
4 *
5 * @file
6 * @ingroup Search
7 */
8
9 /**
10 * @defgroup Search Search
11 */
12
/**
 * Base search engine.
 *
 * Holds the common search settings (limit, offset, namespaces, feature
 * data), implements the "near match" title lookup and a handful of static
 * helpers, and provides stub query/update methods that return null or do
 * nothing in this class.
 *
 * @ingroup Search
 */
class SearchEngine {
    public $limit = 10;
    public $offset = 0;
    public $prefix = '';
    public $searchTerms = array();
    public $namespaces = array( NS_MAIN );
    public $showRedirects = false;

    /// Feature values
    protected $features = array();

    /**
     * @var DatabaseBase
     */
    protected $db;

    /**
     * @param $db DatabaseBase|null: connection to use; when omitted (or
     *   falsy) a DB_SLAVE connection is obtained through wfGetDB()
     */
    public function __construct( $db = null ) {
        $this->db = $db ? $db : wfGetDB( DB_SLAVE );
    }

    /**
     * Perform a full text search query and return a result set.
     * If text searches are not supported or disabled, return null.
     * STUB: this implementation always returns null.
     *
     * @param $term String: raw search term
     * @return SearchResultSet|null
     */
    public function searchText( $term ) {
        return null;
    }

    /**
     * Perform a title-only search query and return a result set.
     * If title searches are not supported or disabled, return null.
     * STUB: this implementation always returns null.
     *
     * @param $term String: raw search term
     * @return SearchResultSet|null
     */
    public function searchTitle( $term ) {
        return null;
    }

    /**
     * Whether this search backend can list/unlist redirects.
     *
     * @deprecated since 1.18 Call supports( 'list-redirects' );
     * @return Boolean
     */
    public function acceptListRedirects() {
        return $this->supports( 'list-redirects' );
    }

    /**
     * Tell whether a named feature is supported. Here only
     * 'list-redirects' is; everything else (including
     * 'title-suffix-filter') is reported as unsupported.
     *
     * @since 1.18
     * @param $feature String
     * @return Boolean
     */
    public function supports( $feature ) {
        // Loose comparison on purpose: it mirrors the semantics of a
        // switch statement on $feature.
        return $feature == 'list-redirects';
    }

    /**
     * Way to pass custom data for engines. The value is stored in
     * $this->features under the feature name.
     *
     * @since 1.18
     * @param $feature String
     * @param $data Mixed
     */
    public function setFeatureData( $feature, $data ) {
        $this->features[$feature] = $data;
    }

    /**
     * Convert text before it is used for searching or for updating the
     * search index. This implementation only asks the content language to
     * segment the string by word.
     *
     * @param $string string: String to process
     * @return string
     */
    public function normalizeText( $string ) {
        global $wgContLang;

        // Some languages such as Chinese require word segmentation
        $segmented = $wgContLang->segmentByWord( $string );
        return $segmented;
    }

    /**
     * Transform search term in cases when parts of the query came as
     * different GET params (when supported), e.g. for prefix queries:
     * search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
     *
     * This implementation returns the term untouched.
     *
     * @param $term String
     * @return String
     */
    public function transformSearchTerm( $term ) {
        return $term;
    }

    /**
     * If an exact title match can be found, or a very slightly close
     * match, return the title. If no match, returns NULL.
     *
     * The 'SearchGetNearMatchComplete' hook is run on the outcome before
     * it is returned.
     *
     * @param $searchterm String
     * @return Title|null
     */
    public static function getNearMatch( $searchterm ) {
        $match = self::getNearMatchInternal( $searchterm );

        wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$match ) );
        return $match;
    }

    /**
     * Do a near match (see SearchEngine::getNearMatch) and wrap it into a
     * SearchResultSet.
     *
     * @param $searchterm string
     * @return SearchResultSet
     */
    public static function getNearMatchResultSet( $searchterm ) {
        $match = self::getNearMatch( $searchterm );
        return new SearchNearMatchResultSet( $match );
    }

    /**
     * Really find the title match.
     *
     * Tries the term and all of its language variants, first as given and
     * then in several capitalisations, and finally falls back to a few
     * namespace-specific rules applied to the original term.
     *
     * @param $searchterm String
     * @return Title|null
     */
    private static function getNearMatchInternal( $searchterm ) {
        global $wgContLang;

        $candidates = array( $searchterm );
        if ( $wgContLang->hasVariants() ) {
            $candidates = array_merge(
                $candidates,
                $wgContLang->autoConvertToAllVariants( $searchterm )
            );
        }

        $hookTitle = null;
        if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$hookTitle ) ) ) {
            return $hookTitle;
        }

        $context = new RequestContext;

        // Content-language methods tried, in this order, after the literal
        // term failed: all lower case (i.e. first letter capitalized),
        // capitalized string, all upper case, and
        // Word-Caps-Breaking-At-Word-Breaks (for hyphenated names etc).
        $caseMethods = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

        foreach ( $candidates as $term ) {
            # Exact match? No need to look further.
            $title = Title::newFromText( $term );
            if ( is_null( $title ) ) {
                return null;
            }

            if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
                return $title;
            }

            # See if it still otherwise has content is some sane sense
            $context->setTitle( $title );
            $article = Article::newFromTitle( $title, $context );
            if ( $article->hasViewableContent() ) {
                return $title;
            }

            foreach ( $caseMethods as $method ) {
                $title = Title::newFromText( $wgContLang->$method( $term ) );
                if ( $title && $title->exists() ) {
                    return $title;
                }
            }

            // Give hooks a chance at better match variants
            $title = null;
            if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
                return $title;
            }
        }

        $title = Title::newFromText( $searchterm );
        $ns = $title->getNamespace();

        # Entering an IP address goes to the contributions page
        $isUserIp = ( $ns == NS_USER && User::isIP( $title->getText() ) );
        if ( $isUserIp || User::isIP( trim( $searchterm ) ) ) {
            return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
        }

        # Entering a user goes to the user page whether it's there or not
        if ( $ns == NS_USER ) {
            return $title;
        }

        # Go to images that exist even if there's no local page.
        # There may have been a funny upload, or it may be on a shared
        # file repository such as Wikimedia Commons.
        if ( $ns == NS_FILE && wfFindFile( $title ) ) {
            return $title;
        }

        # MediaWiki namespace? Page may be "implied" if not customized.
        # Just return it, with caps forced as the message system likes it.
        if ( $ns == NS_MEDIAWIKI ) {
            return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
        }

        # Quoted term? Try without the quotes...
        $quoted = array();
        if ( preg_match( '/^"([^"]+)"$/', $searchterm, $quoted ) ) {
            return self::getNearMatch( $quoted[1] );
        }

        return null;
    }

    /**
     * Character-class body (for use inside [...] in a regular expression)
     * listing the characters filter() keeps.
     *
     * @return String
     */
    public static function legalSearchChars() {
        return "A-Za-z_'.0-9\\x80-\\xFF\\-";
    }

    /**
     * Set the maximum number of results to return
     * and how many to skip before returning the first.
     * Both values are converted to integers.
     *
     * @param $limit Integer
     * @param $offset Integer
     */
    public function setLimitOffset( $limit, $offset = 0 ) {
        $this->limit = (int)$limit;
        $this->offset = (int)$offset;
    }

    /**
     * Set which namespaces the search should include.
     * Give an array of namespace index numbers.
     *
     * @param $namespaces Array
     */
    public function setNamespaces( $namespaces ) {
        $this->namespaces = $namespaces;
    }

    /**
     * Parse some common prefixes: all (search everything)
     * or namespace names.
     *
     * A recognised prefix updates $this->namespaces and is stripped from
     * the returned query, unless nothing but whitespace would remain, in
     * which case the full query is returned. The
     * 'SearchEngineReplacePrefixesComplete' hook always runs before
     * returning.
     *
     * @param $query String
     * @return String
     */
    public function replacePrefixes( $query ) {
        global $wgContLang;

        $parsed = $query;
        $colonPos = strpos( $query, ':' );

        // Without a colon there is no prefix to look at.
        if ( $colonPos !== false ) {
            $allKeyword = wfMsgForContent( 'searchall' ) . ":";
            $allLength = strlen( $allKeyword );

            if ( strncmp( $query, $allKeyword, $allLength ) == 0 ) {
                $this->namespaces = null;
                $parsed = substr( $query, $allLength );
            } else {
                $prefix = substr( $query, 0, $colonPos );
                $index = $wgContLang->getNsIndex( $prefix );
                if ( $index !== false ) {
                    $this->namespaces = array( $index );
                    $parsed = substr( $query, strlen( $prefix ) + 1 );
                }
            }

            if ( trim( $parsed ) == '' ) {
                $parsed = $query; // prefix was the whole query
            }
        }

        wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

        return $parsed;
    }

    /**
     * Make a list of searchable namespaces and their canonical names:
     * every content-language namespace with an index of NS_MAIN or above,
     * as possibly altered by the 'SearchableNamespaces' hook.
     *
     * @return Array
     */
    public static function searchableNamespaces() {
        global $wgContLang;

        $searchable = array();
        foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
            if ( $ns < NS_MAIN ) {
                continue;
            }
            $searchable[$ns] = $name;
        }

        wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
        return $searchable;
    }

    /**
     * Extract default namespaces to search from the given user's
     * settings, returning a list of index numbers.
     *
     * @param $user User
     * @return Array
     */
    public static function userNamespaces( $user ) {
        global $wgSearchEverythingOnlyLoggedIn;

        // The "search everything" preference may be restricted to
        // logged-in users.
        $searchEverything = false;
        if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
            $searchEverything = $user->getOption( 'searcheverything' );
        }

        // searcheverything overrides other options
        if ( $searchEverything ) {
            return array_keys( self::searchableNamespaces() );
        }

        $preferred = Preferences::loadOldSearchNs( $user );
        $searchableIndexes = array_keys( self::searchableNamespaces() );

        // Filter out anything that is not searchable
        return array_intersect( $preferred, $searchableIndexes );
    }

    /**
     * Find snippet highlight settings for all users
     *
     * @return Array contextlines, contextchars
     */
    public static function userHighlightPrefs() {
        // Both values are hardcoded: 2 context lines of 75 characters.
        return array( 2, 75 );
    }

    /**
     * An array of namespaces indexes to be searched by default
     *
     * @return Array
     */
    public static function defaultNamespaces() {
        global $wgNamespacesToBeSearchedDefault;

        $enabled = array_keys( $wgNamespacesToBeSearchedDefault, true );
        return $enabled;
    }

    /**
     * Get a list of namespace names useful for showing in tooltips
     * and preferences. Empty names are replaced with the 'blanknamespace'
     * message.
     *
     * @param $namespaces Array
     * @return Array
     */
    public static function namespacesAsText( $namespaces ) {
        global $wgContLang;

        $names = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
        foreach ( $names as $key => $name ) {
            if ( !$name ) {
                $names[$key] = wfMsg( 'blanknamespace' );
            }
        }
        return $names;
    }

    /**
     * Return the help namespaces to be shown on Special:Search
     *
     * @return Array
     */
    public static function helpNamespaces() {
        global $wgNamespacesToBeSearchedHelp;

        $enabled = array_keys( $wgNamespacesToBeSearchedHelp, true );
        return $enabled;
    }

    /**
     * Return a 'cleaned up' search string: every character outside
     * legalSearchChars() becomes a space and the result is trimmed.
     *
     * @param $text String
     * @return String
     */
    public function filter( $text ) {
        // Called through $this so that an overriding class' character set
        // is honoured.
        $legal = $this->legalSearchChars();
        $cleaned = preg_replace( "/[^{$legal}]/", " ", $text );
        return trim( $cleaned );
    }

    /**
     * Load up the appropriate search engine class for the currently
     * active database backend, and return a configured instance.
     *
     * $wgSearchType, when set, names the class to use; otherwise the
     * DB_SLAVE connection is asked for its search engine class.
     *
     * @return SearchEngine
     */
    public static function create() {
        global $wgSearchType;

        $dbr = null;
        $class = $wgSearchType;
        if ( !$class ) {
            $dbr = wfGetDB( DB_SLAVE );
            $class = $dbr->getSearchEngine();
        }

        $engine = new $class( $dbr );
        $engine->setLimitOffset( 0, 0 );
        return $engine;
    }

    /**
     * Create or update the search index record for the given page.
     * Title and text should be pre-processed.
     * STUB
     *
     * @param $id Integer
     * @param $title String
     * @param $text String
     */
    public function update( $id, $title, $text ) {
        // no-op
    }

    /**
     * Update a search index record's title only.
     * Title should be pre-processed.
     * STUB
     *
     * @param $id Integer
     * @param $title String
     */
    public function updateTitle( $id, $title ) {
        // no-op
    }

    /**
     * Get OpenSearch suggestion template: $wgOpenSearchTemplate when set,
     * otherwise an api.php URL on $wgCanonicalServer restricted to the
     * default namespaces.
     *
     * @return String
     */
    public static function getOpenSearchTemplate() {
        global $wgOpenSearchTemplate, $wgCanonicalServer;

        if ( $wgOpenSearchTemplate ) {
            return $wgOpenSearchTemplate;
        }

        $ns = implode( '|', self::defaultNamespaces() );
        if ( !$ns ) {
            $ns = "0";
        }
        return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
    }

    /**
     * Get internal MediaWiki Suggest template: $wgMWSuggestTemplate when
     * set, otherwise an api.php URL on $wgServer.
     *
     * @return String
     */
    public static function getMWSuggestTemplate() {
        global $wgMWSuggestTemplate, $wgServer;

        if ( $wgMWSuggestTemplate ) {
            return $wgMWSuggestTemplate;
        }
        return $wgServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
    }
}
500
/**
 * Base result set. Every method here is a default that reports "nothing":
 * no terms, no rows, no suggestion and no interwiki results.
 *
 * @ingroup Search
 */
class SearchResultSet {
    /**
     * Fetch an array of regular expression fragments for matching
     * the search terms as parsed by this engine in a text extract.
     * STUB
     *
     * @return Array
     */
    public function termMatches() {
        return array();
    }

    /**
     * Number of rows in this set. STUB: always 0 here.
     *
     * @return Integer
     */
    public function numRows() {
        return 0;
    }

    /**
     * Return true if results are included in this result set.
     * STUB
     *
     * @return Boolean
     */
    public function hasResults() {
        return false;
    }

    /**
     * Some search modes return a total hit count for the query
     * in the entire article database. This may include pages
     * in namespaces that would not be matched on the given
     * settings.
     *
     * Return null if no total hits number is supported.
     *
     * @return Integer|null
     */
    public function getTotalHits() {
        return null;
    }

    /**
     * Some search modes return a suggested alternate term if there are
     * no exact hits. Returns true if there is one on this set.
     *
     * @return Boolean
     */
    public function hasSuggestion() {
        return false;
    }

    /**
     * @return String|null: suggested query, null if none
     */
    public function getSuggestionQuery() {
        return null;
    }

    /**
     * @return String: HTML highlighted suggested query, '' if none
     */
    public function getSuggestionSnippet() {
        return '';
    }

    /**
     * Return information about how and from where the results were
     * fetched, should be useful for diagnostics and debugging.
     * This implementation has nothing to report and returns null.
     *
     * @return String|null
     */
    public function getInfo() {
        return null;
    }

    /**
     * Return a result set of hits on other (multiple) wikis associated
     * with this one. This implementation returns null.
     *
     * @return SearchResultSet|null
     */
    public function getInterwikiResults() {
        return null;
    }

    /**
     * Check if there are results on other wikis, i.e. whether
     * getInterwikiResults() gives back something other than null
     * (compared loosely).
     *
     * @return Boolean
     */
    public function hasInterwikiResults() {
        $interwiki = $this->getInterwikiResults();
        return $interwiki != null;
    }

    /**
     * Fetches next search result, or false.
     * STUB: always false here.
     *
     * @return SearchResult|false
     */
    public function next() {
        return false;
    }

    /**
     * Frees the result set, if applicable. Nothing to free here.
     */
    public function free() {
        // ...
    }
}
613
/**
 * This class is used for different SQL-based search engines shipped with
 * MediaWiki. It wraps a database result object (or false when there is
 * none) together with the terms that were searched for.
 *
 * @ingroup Search
 */
class SqlSearchResultSet extends SearchResultSet {

    /**
     * Underlying database result; false means "no result", in which case
     * numRows(), next() and free() all return false.
     */
    protected $mResultSet;

    /**
     * Terms handed to the constructor, returned as-is by termMatches().
     * Declared explicitly instead of being created as a dynamic property;
     * kept public because that was its effective visibility before.
     */
    public $mTerms;

    /**
     * @param $resultSet Mixed: database result object, or false
     * @param $terms Array: value to report from termMatches()
     */
    function __construct( $resultSet, $terms ) {
        $this->mResultSet = $resultSet;
        $this->mTerms = $terms;
    }

    /**
     * @return Array: the terms given at construction time
     */
    function termMatches() {
        return $this->mTerms;
    }

    /**
     * @return Integer|false: row count of the wrapped result, or false
     *   when there is no result
     */
    function numRows() {
        if ( $this->mResultSet === false ) {
            return false;
        }

        return $this->mResultSet->numRows();
    }

    /**
     * Fetch the next row and wrap it in a SearchResult.
     *
     * @return SearchResult|false: false when there is no result or no
     *   further row
     */
    function next() {
        if ( $this->mResultSet === false ) {
            return false;
        }

        $row = $this->mResultSet->fetchObject();
        if ( $row === false ) {
            return false;
        }

        return SearchResult::newFromRow( $row );
    }

    /**
     * Free the wrapped result.
     *
     * @return false|null: false when there is no result to free
     */
    function free() {
        if ( $this->mResultSet === false ) {
            return false;
        }

        $this->mResultSet->free();
    }
}
655
/**
 * Marker class with no members.
 * Some search engines may bail out if too many matches are found.
 *
 * @ingroup Search
 */
class SearchResultTooMany {
}
662
663
/**
 * A single search hit: a title plus, when available, its current revision
 * and (for the file namespace) its file object.
 *
 * @todo FIXME: This class is horribly factored. It would probably be better to
 * have a useful base class to which you pass some standard information, then
 * let the fancy self-highlighters extend that.
 * @ingroup Search
 */
class SearchResult {

    /**
     * @var Revision
     */
    var $mRevision = null;

    /**
     * File object found by wfFindFile() for titles in NS_FILE, else null
     */
    var $mImage = null;

    /**
     * @var Title
     */
    var $mTitle;

    /**
     * Revision text, loaded lazily by initText()
     *
     * @var String
     */
    var $mText;

    /**
     * Return a new SearchResult and initializes it with a title.
     *
     * @param $title Title
     * @return SearchResult
     */
    public static function newFromTitle( $title ) {
        $result = new self();
        $result->initFromTitle( $title );
        return $result;
    }

    /**
     * Return a new SearchResult and initializes it with a row.
     *
     * @param $row object
     * @return SearchResult
     */
    public static function newFromRow( $row ) {
        $result = new self();
        $result->initFromRow( $row );
        return $result;
    }

    /**
     * @param $row object|null: database row to initialize from; prefer
     *   the newFromRow()/newFromTitle() factories
     */
    public function __construct( $row = null ) {
        if ( !is_null( $row ) ) {
            // Backwards compatibility with pre-1.17 callers
            $this->initFromRow( $row );
        }
    }

    /**
     * Initialize from a database row. Makes a Title from the row's
     * page_namespace and page_title and passes that to initFromTitle.
     *
     * @param $row object
     */
    protected function initFromRow( $row ) {
        $this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
    }

    /**
     * Initialize from a Title and if possible initializes a corresponding
     * Revision and File.
     *
     * @param $title Title
     */
    protected function initFromTitle( $title ) {
        $this->mTitle = $title;
        if ( is_null( $this->mTitle ) ) {
            return;
        }

        $this->mRevision = Revision::newFromTitle( $this->mTitle );
        if ( $this->mTitle->getNamespace() === NS_FILE ) {
            $this->mImage = wfFindFile( $this->mTitle );
        }
    }

    /**
     * Check if this is result points to an invalid title
     * (i.e. no title is set).
     *
     * @return Boolean
     */
    function isBrokenTitle() {
        return is_null( $this->mTitle );
    }

    /**
     * Check if target page is missing, happens when index is out of date.
     * True when neither a revision nor a file was found.
     *
     * @return Boolean
     */
    function isMissingRevision() {
        return !$this->mRevision && !$this->mImage;
    }

    /**
     * @return Title
     */
    function getTitle() {
        return $this->mTitle;
    }

    /**
     * @return Double or null if not supported
     */
    function getScore() {
        return null;
    }

    /**
     * Lazy initialization of article text from DB. Falls back to an empty
     * string when there is no revision.
     */
    protected function initText() {
        if ( isset( $this->mText ) ) {
            return;
        }

        if ( $this->mRevision != null ) {
            $this->mText = $this->mRevision->getText();
        } else {
            // TODO: can we fetch raw wikitext for commons images?
            $this->mText = '';
        }
    }

    /**
     * Build a highlighted snippet of the page text, using the advanced
     * highlighter when $wgAdvancedSearchHighlighting is on and the simple
     * one otherwise.
     *
     * @param $terms Array: terms to highlight
     * @return String: highlighted text snippet, null (and not '') if not supported
     */
    function getTextSnippet( $terms ) {
        global $wgAdvancedSearchHighlighting;

        $this->initText();
        // userHighlightPrefs() takes no arguments; the settings are the
        // same for every user.
        list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
        $h = new SearchHighlighter();
        if ( $wgAdvancedSearchHighlighting ) {
            return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
        }
        return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
    }

    /**
     * @param $terms Array: terms to highlight
     * @return String: highlighted title, '' if not supported
     */
    function getTitleSnippet( $terms ) {
        return '';
    }

    /**
     * @param $terms Array: terms to highlight
     * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
     */
    function getRedirectSnippet( $terms ) {
        return '';
    }

    /**
     * @return Title object for the redirect to this page, null if none or not supported
     */
    function getRedirectTitle() {
        return null;
    }

    /**
     * @return string highlighted relevant section name, '' if none or not supported
     */
    function getSectionSnippet() {
        return '';
    }

    /**
     * @return Title object (pagename+fragment) for the section, null if none or not supported
     */
    function getSectionTitle() {
        return null;
    }

    /**
     * Timestamp of the revision if there is one, else of the file if
     * there is one, else ''.
     *
     * @return String: timestamp
     */
    function getTimestamp() {
        if ( $this->mRevision ) {
            return $this->mRevision->getTimestamp();
        }
        if ( $this->mImage ) {
            return $this->mImage->getTimestamp();
        }
        return '';
    }

    /**
     * @return Integer: number of words, as counted by str_word_count()
     */
    function getWordCount() {
        $this->initText();
        return str_word_count( $this->mText );
    }

    /**
     * @return Integer: size in bytes
     */
    function getByteSize() {
        $this->initText();
        return strlen( $this->mText );
    }

    /**
     * @return Boolean if hit has related articles
     */
    function hasRelated() {
        return false;
    }

    /**
     * @return String: interwiki prefix of the title (return iw even if title is broken)
     */
    function getInterwikiPrefix() {
        return '';
    }
}
/**
 * A SearchResultSet wrapper for SearchEngine::getNearMatch: holds at most
 * one result, the matched title.
 *
 * @ingroup Search
 */
class SearchNearMatchResultSet extends SearchResultSet {
    /**
     * Whether next() has already handed out the single result
     */
    private $fetched = false;

    /**
     * The matched Title, or null when nothing matched. Declared
     * explicitly instead of being created as a dynamic property; kept
     * public because that was its effective visibility before.
     */
    public $result;

    /**
     * @param $match mixed Title if matched, else null
     */
    public function __construct( $match ) {
        $this->result = $match;
    }

    /**
     * Overrides the parent stub so that callers of hasResults(), and
     * numRows() below, see the match.
     *
     * @return Boolean: true when a match was given to the constructor
     */
    public function hasResults() {
        return (bool)$this->result;
    }

    /**
     * Older name of hasResults(), kept so existing callers keep working.
     *
     * @return Boolean
     */
    public function hasResult() {
        return $this->hasResults();
    }

    /**
     * @return Integer: 1 when there is a match, 0 otherwise
     */
    public function numRows() {
        return $this->hasResults() ? 1 : 0;
    }

    /**
     * Hand out the match once, wrapped in a SearchResult; false on every
     * later call or when there is no match.
     *
     * @return SearchResult|false
     */
    public function next() {
        if ( $this->fetched || !$this->result ) {
            return false;
        }
        $this->fetched = true;
        return SearchResult::newFromTitle( $this->result );
    }
}
908
909 /**
910 * Highlight bits of wikitext
911 *
912 * @ingroup Search
913 */
914 class SearchHighlighter {
915 var $mCleanWikitext = true;
916
/**
 * @param $cleanupWikitext Boolean: stored in $mCleanWikitext; when true,
 *   splitAndAdd() runs text through removeWiki() before splitting it
 */
function __construct( $cleanupWikitext = true ) {
    $this->mCleanWikitext = $cleanupWikitext;
}
920
/**
 * Default implementation of wikitext highlighting.
 *
 * Works in three stages: split the text into plain-text lines and
 * "other" lines (templates, file links, tables and, when the Cite
 * extension is present, references); pick the lines that match the terms
 * and pad them towards a constant size; finally glue the pieces together
 * and wrap the matched terms in <span class='searchmatch'>.
 *
 * @param $text String
 * @param $terms Array: terms to highlight (unescaped); they are inserted
 *   into regular expressions as-is
 * @param $contextlines Integer
 * @param $contextchars Integer
 * @return String
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
    global $wgContLang;
    global $wgSearchHighlightBoundaries;
    $fname = __METHOD__;

    if ( $text == '' ) {
        return '';
    }

    // split text into text + templates/links/tables
    $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
    // first capture group is for detecting nested templates/links/tables/references
    $endPatterns = array(
        1 => '/(\{\{)|(\}\})/', // template
        2 => '/(\[\[)|(\]\])/', // image
        3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

    // @todo FIXME: This should prolly be a hook or something
    if ( function_exists( 'wfCite' ) ) {
        $spat .= '|(<ref>)'; // references via cite extension
        $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
    }
    $spat .= '/';
    $textExt = array(); // text extracts
    $otherExt = array(); // other extracts
    wfProfileIn( "$fname-split" );
    $start = 0;
    $textLen = strlen( $text );
    $count = 0; // sequence number to maintain ordering
    while ( $start < $textLen ) {
        // find start of template/image/table
        if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
            $epat = '';
            foreach ( $matches as $key => $val ) {
                // with PREG_OFFSET_CAPTURE, groups that did not take part
                // in the match carry offset -1
                if ( $key > 0 && $val[1] != - 1 ) {
                    if ( $key == 2 ) {
                        // see if this is an image link
                        $ns = substr( $val[0], 2, - 1 );
                        if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
                            break;
                        }
                    }
                    $epat = $endPatterns[$key];
                    $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                    $start = $val[1];
                    break;
                }
            }
            if ( $epat ) {
                // find end (and detect any nested elements)
                $level = 0;
                $offset = $start + 1;
                $found = false;
                while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                    if ( array_key_exists( 2, $endMatches ) ) {
                        // found end
                        if ( $level == 0 ) {
                            $len = strlen( $endMatches[2][0] );
                            $off = $endMatches[2][1];
                            $this->splitAndAdd( $otherExt, $count,
                                substr( $text, $start, $off + $len - $start ) );
                            $start = $off + $len;
                            $found = true;
                            break;
                        } else {
                            // end of nested element
                            $level -= 1;
                        }
                    } else {
                        // nested
                        $level += 1;
                    }
                    $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                }
                if ( ! $found ) {
                    // couldn't find appropriate closing tag, skip
                    $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                    $start += strlen( $matches[0][0] );
                }
                continue;
            }
        }
        // else: add as text extract
        $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
        break;
    }

    $all = $textExt + $otherExt; // these have disjunct key sets

    wfProfileOut( "$fname-split" );

    // prepare regexps
    foreach ( $terms as $index => $term ) {
        // manually do upper/lowercase stuff for utf-8 since PHP won't do it
        if ( preg_match( '/[\x80-\xff]/', $term ) ) {
            $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
        } else {
            $terms[$index] = $term;
        }
    }
    $anyterm = implode( '|', $terms );
    $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

    // @todo FIXME: A hack to scale contextchars, a correct solution
    // would be to have contextchars actually be char and not byte
    // length, and do proper utf-8 substrings and lengths everywhere,
    // but PHP is making that very hard and unclean to implement :(
    //
    // $anyterm is empty when no (or only empty) terms were given; do not
    // scale in that case instead of dividing by zero.
    $anytermChars = mb_strlen( $anyterm );
    $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
    $contextchars = intval( $contextchars * $scale );

    $patPre = "(^|$wgSearchHighlightBoundaries)";
    $patPost = "($wgSearchHighlightBoundaries|$)";

    $pat1 = "/(" . $phrase . ")/ui";
    $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

    wfProfileIn( "$fname-extract" );

    $left = $contextlines;

    $snippets = array();
    $offsets = array();

    // show beginning only if it contains all words
    $first = 0;
    $firstText = '';
    foreach ( $textExt as $index => $line ) {
        // the first non-empty line not starting with ';' or ':'
        if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
            $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
            $first = $index;
            break;
        }
    }
    if ( $firstText ) {
        $succ = true;
        // check if first text contains all terms
        foreach ( $terms as $term ) {
            if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                $succ = false;
                break;
            }
        }
        if ( $succ ) {
            $snippets[$first] = $firstText;
            $offsets[$first] = 0;
        }
    }
    if ( ! $snippets ) {
        // match whole query on text
        $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
        // match whole query on templates/tables/images
        $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
        // match any words on text
        $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
        // match any words on templates/tables/images
        $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

        ksort( $snippets );
    }

    // add extra chars to each snippet to make snippets constant size
    $extended = array();
    if ( count( $snippets ) == 0 ) {
        // couldn't find the target words, just show beginning of article
        if ( array_key_exists( $first, $all ) ) {
            $targetchars = $contextchars * $contextlines;
            $snippets[$first] = '';
            $offsets[$first] = 0;
        }
    } else {
        // if begin of the article contains the whole phrase, show only that !!
        if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
            && $offsets[$first] < $contextchars * 2 ) {
            $snippets = array ( $first => $snippets[$first] );
        }

        // calc by how much to extend existing snippets
        $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
    }

    foreach ( $snippets as $index => $line ) {
        $extended[$index] = $line;
        $len = strlen( $line );
        if ( $len < $targetchars - 20 ) {
            // complete this line
            if ( $len < strlen( $all[$index] ) ) {
                $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
                $len = strlen( $extended[$index] );
            }

            // add more lines
            $add = $index + 1;
            while ( $len < $targetchars - 20
                && array_key_exists( $add, $all )
                && !array_key_exists( $add, $snippets ) ) {
                $offsets[$add] = 0;
                $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                $extended[$add] = $tt;
                $len += strlen( $tt );
                $add++;
            }
        }
    }

    // $snippets = array_map('htmlspecialchars', $extended);
    $snippets = $extended;
    $last = - 1;
    $extract = '';
    foreach ( $snippets as $index => $line ) {
        if ( $last == - 1 ) {
            $extract .= $line; // first line
        } elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
            $extract .= " " . $line; // continuous lines
        } else {
            $extract .= '<b> ... </b>' . $line;
        }

        $last = $index;
    }
    if ( $extract ) {
        $extract .= '<b> ... </b>';
    }

    // highlight each distinct term once
    $processed = array();
    foreach ( $terms as $term ) {
        if ( ! isset( $processed[$term] ) ) {
            $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
            $extract = preg_replace( $pat3,
                "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
            $processed[$term] = true;
        }
    }

    wfProfileOut( "$fname-extract" );

    return $extract;
}
1164
/**
 * Split text into lines and add it to extracts array.
 *
 * The text is first passed through removeWiki() when $mCleanWikitext is
 * set. Each line is trimmed; lines that are empty after trimming are
 * skipped, every other line is appended under the next sequence number.
 *
 * @param $extracts Array: index -> $line (modified in place)
 * @param $count Integer: next sequence number (advanced in place)
 * @param $text String
 */
function splitAndAdd( &$extracts, &$count, $text ) {
    $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
    foreach ( $split as $line ) {
        $tt = trim( $line );
        // Compare against '' rather than testing truthiness, so that a
        // line consisting of "0" is kept.
        if ( $tt !== '' ) {
            $extracts[$count++] = $tt;
        }
    }
}
1180
/**
 * Do manual case conversion for non-ascii chars.
 *
 * A match longer than one byte is turned into a bracketed character class
 * holding its lower-case and upper-case forms; a single-byte match is
 * returned unchanged.
 *
 * @param $matches Array: match array from preg_replace_callback
 * @return String
 */
function caseCallback( $matches ) {
    global $wgContLang;

    $char = $matches[0];
    if ( strlen( $char ) <= 1 ) {
        return $char;
    }

    return '[' . $wgContLang->lc( $char ) . $wgContLang->uc( $char ) . ']';
}
1193
1194 /**
1195 * Extract part of the text from start to end, but by
1196 * not chopping up words
1197 * @param $text String
1198 * @param $start Integer
1199 * @param $end Integer
1200 * @param $posStart Integer: (out) actual start position
1201 * @param $posEnd Integer: (out) actual end position
1202 * @return String
1203 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	// Cuts $text between $start and $end after moving both boundaries with
	// position(), so the cut does not fall in the middle of a word.
	// A $start of 0 is kept as is; an $end at or past the end of the string
	// is pinned to the string length.
	$length = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = ( $end >= $length ) ? $length : $this->position( $text, $end );

	// The out-parameters are only written when the caller handed in a
	// non-null variable.
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
1222
1223 /**
1224 * Find a nonletter near a point (index) in the text
1225 *
1226 * @param $text String
1227 * @param $point Integer
1228 * @param $offset Integer: offset to found index
1229 * @return Integer: index of the first nonletter found within 10 bytes of $point (plus $offset), or $point advanced past any UTF-8 continuation bytes if none
1230 */
function position( $text, $point, $offset = 0 ) {
	// Looks for a word boundary around byte index $point.
	//
	// The window [$point - 10, $point + 10) is scanned for the first space or
	// punctuation byte; when one is found its index plus $offset is returned.
	// Otherwise $point itself is returned, moved forward past any UTF-8
	// continuation bytes (0x80-0xBF) so that it never lands inside a
	// multi-byte character. The fallback result is always within
	// [0, strlen( $text )].
	$tolerance = 10;
	$length = strlen( $text );

	$s = max( 0, $point - $tolerance );
	$l = min( $length, $point + $tolerance ) - $s;
	$m = array();
	if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
		return $m[0][1] + $s + $offset;
	}

	// No boundary in the window. Clamp $point before indexing the string:
	// reading $text[$point] at or past the end raises an "uninitialized
	// string offset" warning and would hand back an index beyond the text.
	if ( $point >= $length ) {
		return $length;
	}
	if ( $point < 0 ) {
		$point = 0;
	}

	// Skip trailing (continuation) bytes until a valid first UTF-8 byte.
	while ( $point < $length ) {
		$char = ord( $text[$point] );
		if ( $char < 0x80 || $char >= 0xc0 ) {
			return $point;
		}
		$point++;
	}
	return $length;
}
1252
1253 /**
1254 * Search extracts for a pattern, and return snippets
1255 *
1256 * @param $pattern String: regexp for matching lines
1257 * @param $extracts Array: extracts to search
1258 * @param $linesleft Integer: number of extracts to make
1259 * @param $contextchars Integer: length of snippet
1260 * @param $out Array: map for highlighted snippets
1261 * @param $offsets Array: map of starting points of snippets
1262 * @protected
1263 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	// Walks $extracts and, for each line matching $pattern that has no entry
	// in $out yet, stores a snippet of about $contextchars bytes in
	// $out[$index] and the snippet's start position in $offsets[$index].
	// $linesleft is decremented per snippet and the walk stops at zero.
	// $contextchars is taken by reference but is not modified here.
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		// this line already highlighted
		if ( array_key_exists( $index, $out ) ) {
			continue;
		}

		$m = array();
		if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
			continue;
		}

		$matchOffset = $m[0][1];
		$matchLength = strlen( $m[0][0] );

		// Where the snippet starts: at the line start when the match ends
		// within the first $contextchars bytes, at the match itself when the
		// match alone is longer than the snippet, otherwise so that the
		// match sits in the middle of the snippet.
		$begin = 0;
		if ( $matchOffset + $matchLength >= $contextchars ) {
			$begin = ( $matchLength > $contextchars )
				? $matchOffset
				: $matchOffset + intval( ( $matchLength - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() reports the adjusted start
		$snippetStart = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
		$offsets[$index] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
1295
1296 /**
1297 * Basic wikitext removal
1298 * @protected
1299 */
function removeWiki( $text ) {
	// Strips basic wikitext from $text with a fixed sequence of regex
	// replacements. The order matters, so the rules are applied one by one.
	wfProfileIn( __METHOD__ );

	// Templates and plain links.
	$beforeLinks = array(
		// {{...}} without a pipe: dropped entirely
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		// {{name|rest}}: keep what follows the first pipe
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		// [[target]]: keep the target text
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	);
	foreach ( $beforeLinks as $regex => $replacement ) {
		$text = preg_replace( $regex, $replacement, $text );
	}

	// [[target|caption]]: linkReplace() decides what is kept.
	$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

	// Tags and quote-based bold/italic markup are removed.
	$afterLinks = array(
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	);
	foreach ( $afterLinks as $regex ) {
		$text = preg_replace( $regex, "", $text );
	}

	wfProfileOut( __METHOD__ );
	return $text;
}
1323
1324 /**
1325 * callback to replace [[target|caption]] kind of links, if
1326 * the target is category or image, leave it
1327 *
1328 * @param $matches Array
1329 */
function linkReplace( $matches ) {
	// preg_replace_callback handler for [[target|caption]] links.
	// $matches[1] is the target part and $matches[2] the caption. The whole
	// link ($matches[0]) is kept only when the text before the first colon of
	// the target resolves to the file or category namespace; in every other
	// case the caption is returned.
	global $wgContLang;

	$target = $matches[1];
	$colonPos = strpos( $target, ':' );

	if ( $colonPos !== false ) {
		$nsIndex = $wgContLang->getNsIndex( substr( $target, 0, $colonPos ) );
		if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		}
	}

	return $matches[2]; // replace with caption
}
1343
1344 /**
1345 * Simple & fast snippet extraction, but gives completely irrelevant
1346 * snippets
1347 *
1348 * @param $text String
1349 * @param $terms Array
1350 * @param $contextlines Integer
1351 * @param $contextchars Integer
1352 * @return String
1353 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	// Builds an HTML extract from the first $contextlines lines of $text that
	// contain one of $terms. Each such line is cut to roughly $contextchars
	// characters either side of the match, HTML-escaped, and every term
	// occurrence is wrapped in <span class='searchmatch'>. The processed
	// lines are returned joined, each followed by "\n".
	global $wgContLang;
	$fname = __METHOD__;

	$lines = explode( "\n", $text );

	// The terms go into the patterns verbatim (no preg_quote), joined as an
	// alternation. NOTE(review): presumably callers pass regex fragments -
	// confirm before adding any quoting here.
	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// Highlight pattern; it does not depend on the line, so build it once.
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	wfProfileIn( "$fname-extract" );
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = array();
		if ( ! preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;

		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

		// Test the trailing group itself: a count() check against 3 would
		// still let a missing $m[3] through.
		if ( isset( $m[3] ) ) {
			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
		} else {
			$post = '';
		}

		$found = $m[2];

		// NOTE(review): highlighting runs on the already-escaped text, so a
		// term can also match inside an entity such as &amp;.
		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2,
			"<span class='searchmatch'>\\1</span>", $line );

		$extract .= $line . "\n";
	}
	wfProfileOut( "$fname-extract" );

	return $extract;
}
1400
1401 }
1402
1403 /**
1404 * Dummy class to be used when non-supported Database engine is present.
1405 * @todo FIXME: Dummy class should probably try something at least mildly useful,
1406 * such as a LIKE search through titles.
1407 * @ingroup Search
1408 */
class SearchEngineDummy extends SearchEngine {
	// Intentionally empty: nothing is overridden, so every call falls
	// through to the SearchEngine base implementation.
}