Merge "Type hint against LinkTarget in WatchedItemStore"
[lhc/web/wiklou.git] / includes / search / SearchHighlighter.php
1 <?php
2 /**
3 * Basic search engine highlighting
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Search
22 */
23
24 use MediaWiki\MediaWikiServices;
25
/**
 * Highlight bits of wikitext
 *
 * Produces short HTML snippets of a page's text for search result listings.
 * Three strategies are offered: highlightText() (context-aware, skips
 * templates/tables/image links where possible), highlightSimple() (per-line
 * regex match) and highlightNone() (just the first lines of the text).
 *
 * @ingroup Search
 */
class SearchHighlighter {
	/**
	 * @var bool Whether extracts are run through removeWiki() (which also
	 *  HTML-escapes them) before being added to the snippet.
	 */
	protected $mCleanWikitext = true;

	/**
	 * @warning If you pass false to this constructor, then
	 *  the caller is responsible for HTML escaping.
	 * @param bool $cleanupWikitext
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
	 *
	 * The text is first split into plain-text extracts and "other" extracts
	 * (templates, image links, tables and, when the Cite class exists,
	 * references). Snippets are then picked from those extracts, padded to a
	 * roughly constant size, joined, and finally every term is wrapped in a
	 * <span class='searchmatch'> element.
	 *
	 * @param string $text
	 * @param string[] $terms Terms to highlight (not html escaped but
	 *  regex escaped via SearchDatabase::regexTerm())
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgSearchHighlightBoundaries;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = [
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

		// @todo FIXME: This should prolly be a hook or something
		// instead of hardcoding a class name from the Cite extension
		if ( class_exists( 'Cite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = []; // text extracts
		$otherExt = []; // other extracts
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					// with PREG_OFFSET_CAPTURE, groups that did not take part have offset -1
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if (
								MediaWikiServices::getInstance()->getContentLanguage()->
								getNsIndex( $ns ) != NS_FILE
							) {
								break;
							}

						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 is only present when the closing delimiter matched
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					[ $this, 'caseCallback' ],
					$terms[$index]
				);
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list gives an empty $anyterm; use a neutral scale
		// instead of dividing by zero.
		$scale = $anyterm === '' ? 1 : strlen( $anyterm ) / mb_strlen( $anyterm );
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		$left = $contextlines;

		$snippets = [];
		$offsets = [];

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		// compare against '' so that a first line consisting of "0" is not ignored
		if ( $firstText !== '' ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = [];
		// only read inside the loop below, which does not run when $snippets stays empty
		$targetchars = 0;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = [ $first => $snippets[$first] ];
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// $snippets = array_map( 'htmlspecialchars', $extended );
		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		// compare against '' so that an extract of "0" still gets its trailing ellipsis
		if ( $extract !== '' ) {
			$extract .= '<b> ... </b>';
		}

		$processed = [];
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * Each line is trimmed; lines that are empty after trimming are dropped.
	 * When wikitext cleanup is enabled the text goes through removeWiki()
	 * before being split.
	 *
	 * @param array &$extracts Index -> $line
	 * @param int &$count Next free index; incremented for every line added
	 * @param string $text
	 */
	public function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// strict comparison: a plain truthiness test would also drop the line "0"
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * Multi-byte characters are turned into a two-alternative character
	 * class "[<lower><upper>]"; single-byte characters are returned as is.
	 *
	 * @param array $matches
	 * @return string
	 */
	public function caseCallback( $matches ) {
		if ( strlen( $matches[0] ) > 1 ) {
			$contLang = MediaWikiServices::getInstance()->getContentLanguage();
			return '[' . $contLang->lc( $matches[0] ) .
				$contLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * The out parameters are only written when the caller passes a
	 * variable that is not null.
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int|null &$posStart (out) actual start position
	 * @param int|null &$posEnd (out) actual end position
	 * @return string
	 */
	public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks at a window of 10 bytes either side of $point and uses the first
	 * nonletter found in that window.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset Offset to found index
	 * @return int Nearest nonletter index, or beginning of utf8 char if none
	 */
	public function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = [];

		if ( preg_match(
			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
			substr( $text, $s, $l ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// a point at or past the end has no byte to inspect
			if ( $point >= strlen( $text ) ) {
				return strlen( $text );
			}
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}

			return $point;

		}
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * Stops as soon as $linesleft reaches zero. Extracts whose index is
	 * already present in $out are skipped.
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search
	 * @param int &$linesleft Number of extracts to make
	 * @param int &$contextchars Length of snippet
	 * @param array &$out Map for highlighted snippets
	 * @param array &$offsets Map of starting points of snippets
	 * @protected
	 */
	public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = [];
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match ends within the first $contextchars bytes: start at the line start
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than the snippet: start at the match
				$begin = $offset;
			} else {
				// centre the match inside the snippet window
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Strips simple templates, links, tags and quote markup with regular
	 * expressions, then HTML-escapes the result.
	 *
	 * @protected
	 * @param string $text
	 * @return string
	 */
	public function removeWiki( $text ) {
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback(
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
			[ $this, 'linkReplace' ],
			$text
		);
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		// Note, the previous /<\/?[^>]+>/ is insufficient
		// for XSS safety as the HTML tag can span multiple
		// search results (T144845).
		$text = Sanitizer::escapeHtmlAllowEntities( $text );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches
	 * @return string
	 */
	public function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		$ns = substr( $matches[1], 0, $colon );
		$index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets
	 *
	 * Used when $wgAdvancedSearchHighlighting is false.
	 *
	 * @param string $text
	 * @param string[] $terms Escaped for regex by SearchDatabase::regexTerm()
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		// "u" makes "." and the {0,$max} quantifier count characters rather
		// than bytes, so the trailing context never ends inside a UTF-8 sequence
		$pat1 = "/(.*)($terms)(.{0,$max})/ui";

		$extract = "";
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		foreach ( $lines as $line ) {
			if ( $contextlines == 0 ) {
				break;
			}
			$m = [];
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/ui";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			$extract .= $line . "\n";
		}

		return $extract;
	}

	/**
	 * Returns the first few lines of the text
	 *
	 * The result is HTML-escaped and line breaks are turned into <br>.
	 *
	 * @param string $text
	 * @param int $contextlines Max number of returned lines
	 * @param int $contextchars Average number of characters per line
	 * @return string
	 */
	public function highlightNone( $text, $contextlines, $contextchars ) {
		$match = [];
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		$text = str_replace( "\n\n", "\n", $text ); // remove empty lines
		preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

		// Trim and limit to max number of bytes. mb_strcut() moves the cut
		// back to a character boundary, so the string handed to
		// htmlspecialchars() never ends in a partial UTF-8 sequence.
		$text = htmlspecialchars(
			mb_strcut( trim( $match[0] ), 0, $contextlines * $contextchars, 'UTF-8' )
		);
		return str_replace( "\n", '<br>', $text );
	}
}