Merge "Add .pipeline/ with dev image variant"
[lhc/web/wiklou.git] / includes / search / SearchHighlighter.php
1 <?php
2 /**
3 * Basic search engine highlighting
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Search
22 */
23
24 use MediaWiki\MediaWikiServices;
25
26 /**
27 * Highlight bits of wikitext
28 *
29 * @ingroup Search
30 */
class SearchHighlighter {
	const DEFAULT_CONTEXT_LINES = 2;
	const DEFAULT_CONTEXT_CHARS = 75;

	/**
	 * @var bool When true, every extract is passed through removeWiki(),
	 *  which strips basic wiki markup and HTML-escapes the result.
	 */
	protected $mCleanWikitext = true;

	/**
	 * @warning If you pass false to this constructor, then
	 *  the caller is responsible for HTML escaping.
	 * @param bool $cleanupWikitext
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
	 *
	 * The text is first split into plain-text lines and "other" lines
	 * (templates, file links, tables and, if Cite is loaded, references).
	 * Snippets are then picked around matches of the whole phrase or of any
	 * single term, padded towards a common size, joined with ellipses and
	 * finally the terms are wrapped in <span class='searchmatch'>.
	 *
	 * @param string $text
	 * @param string[] $terms Terms to highlight (not html escaped but
	 *   regex escaped via SearchDatabase::regexTerm())
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText(
		$text,
		$terms,
		$contextlines = self::DEFAULT_CONTEXT_LINES,
		$contextchars = self::DEFAULT_CONTEXT_CHARS
	) {
		global $wgSearchHighlightBoundaries;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = [
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

		// @todo FIXME: This should prolly be a hook or something
		// instead of hardcoding the name of the Cite extension
		if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = []; // text extracts
		$otherExt = []; // other extracts
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					// with PREG_OFFSET_CAPTURE, groups that did not take part
					// in the match are reported with offset -1
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if (
								MediaWikiServices::getInstance()->getContentLanguage()->
								getNsIndex( $ns ) != NS_FILE
							) {
								// not a file link: $epat stays empty, so the
								// remainder of the text is added as plain text below
								break;
							}

						}
						$epat = $endPatterns[$key];
						// everything before the construct is plain text
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat !== '' ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 is the closing token; it is only present in
						// the result when it (and not the opening token) matched
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					[ $this, 'caseCallback' ],
					$term
				);
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list yields an empty $anyterm; avoid dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		$left = $contextlines;

		$snippets = [];
		$offsets = [];

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			// skip lines starting with ';' or ':'
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText !== '' ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = [];
		// only read inside the loop below, which does not run when no snippet exists
		$targetchars = 0;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = [ $first => $snippets[$first] ];
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// $snippets = array_map( 'htmlspecialchars', $extended );
		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract !== '' ) {
			$extract .= '<b> ... </b>';
		}

		$processed = [];
		foreach ( $terms as $term ) {
			// each distinct term is highlighted only once
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * The text is cleaned with removeWiki() first when wikitext cleanup is
	 * enabled. Each line is trimmed; lines that are empty after trimming
	 * are skipped and do not consume a sequence number.
	 *
	 * @param array &$extracts Index -> $line
	 * @param int &$count Next free sequence number, advanced per added line
	 * @param string $text
	 */
	public function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// compare against '' explicitly so that a line consisting of "0" is kept
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * Multi-byte characters are turned into a regex character class holding
	 * their lower- and upper-case forms; single-byte characters are returned
	 * unchanged.
	 *
	 * @param array $matches
	 * @return string
	 */
	public function caseCallback( $matches ) {
		if ( strlen( $matches[0] ) > 1 ) {
			$contLang = MediaWikiServices::getInstance()->getContentLanguage();
			return '[' . $contLang->lc( $matches[0] ) .
				$contLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * @param string $text
	 * @param int $start Byte offset; adjusted via position() unless it is 0
	 * @param int $end Byte offset; clamped to the text length, otherwise
	 *   adjusted via position()
	 * @param int|null &$posStart (out) actual start position; only written
	 *   when the caller passed a non-null value
	 * @param int|null &$posEnd (out) actual end position; only written
	 *   when the caller passed a non-null value
	 * @return string Empty string if the adjusted range is empty
	 */
	public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			// offset 1: begin just after the boundary character that was found
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks at a window of 10 bytes either side of $point and returns the
	 * position of the first separator character inside that window.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset Offset to found index
	 * @return int Index of a nonletter in the window (plus $offset), or
	 *   beginning of utf8 char if none
	 */
	public function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = [];

		if ( preg_match(
			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
			substr( $text, $s, $l ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// nothing to inspect at or beyond the end of the string
			if ( $point >= strlen( $text ) ) {
				return strlen( $text );
			}
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}

			return $point;

		}
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * Lines that already have an entry in $out are skipped. For each newly
	 * matching line one snippet of roughly $contextchars bytes is stored and
	 * $linesleft is decremented; processing stops once it reaches 0.
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search
	 * @param int &$linesleft Number of extracts to make
	 * @param int &$contextchars Length of snippet
	 * @param array &$out Map for highlighted snippets
	 * @param array &$offsets Map of starting points of snippets
	 * @protected
	 */
	public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = [];
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match ends within the first $contextchars bytes: start at the beginning
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than the snippet: start at the match
				$begin = $offset;
			} else {
				// centre the snippet window on the match
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			// must be non-null for extract() to report the adjusted start
			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Strips simple templates, unwraps links, removes tags and quote-based
	 * bold/italic markup, then HTML-escapes the result.
	 *
	 * @protected
	 * @param string $text
	 * @return mixed
	 */
	public function removeWiki( $text ) {
		// {{template}} without parameters: drop entirely
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		// {{template|params}}: keep what follows the first pipe
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		// [[target]]: keep the target
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		// [[target|caption]]: decided per link by linkReplace()
		$text = preg_replace_callback(
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
			[ $this, 'linkReplace' ],
			$text
		);
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		// Note, the previous /<\/?[^>]+>/ is insufficient
		// for XSS safety as the HTML tag can span multiple
		// search results (T144845).
		$text = Sanitizer::escapeHtmlAllowEntities( $text );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches
	 * @return string
	 */
	public function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		$ns = substr( $matches[1], 0, $colon );
		$index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets
	 *
	 * Used when $wgAdvancedSearchHighlighting is false.
	 *
	 * NOTE(review): terms are highlighted after the line has been
	 * HTML-escaped, so a term can also match inside an entity produced by
	 * the escaping (e.g. "amp" in "&amp;") — left unchanged here.
	 *
	 * @param string $text
	 * @param string[] $terms Escaped for regex by SearchDatabase::regexTerm()
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightSimple(
		$text,
		$terms,
		$contextlines = self::DEFAULT_CONTEXT_LINES,
		$contextchars = self::DEFAULT_CONTEXT_CHARS
	) {
		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$lineno = 0;

		$extract = "";
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		foreach ( $lines as $line ) {
			if ( $contextlines == 0 ) {
				break;
			}
			++$lineno;
			$m = [];
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			// "{$var}" instead of "${var}", which is deprecated as of PHP 8.2
			$extract .= "{$line}\n";
		}

		return $extract;
	}

	/**
	 * Returns the first few lines of the text
	 *
	 * Blank lines are removed, at most $contextlines lines are kept, and the
	 * result is limited to $contextlines * $contextchars bytes without
	 * splitting a multi-byte character. The output is HTML-escaped and lines
	 * are joined with <br>.
	 *
	 * @param string $text
	 * @param int $contextlines Max number of returned lines
	 * @param int $contextchars Average number of characters per line
	 * @return string
	 */
	public function highlightNone(
		$text,
		$contextlines = self::DEFAULT_CONTEXT_LINES,
		$contextchars = self::DEFAULT_CONTEXT_CHARS
	) {
		// the value is interpolated into a regex quantifier, so force a sane integer
		$maxLines = max( 0, (int)$contextlines );

		$match = [];
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		// remove empty lines; collapse whole runs of newlines, not just pairs
		$text = preg_replace( "/\n{2,}/", "\n", $text );
		preg_match( "/^(.*\n){0,$maxLines}/", $text, $match );

		// Trim and limit to max number of chars. mb_strcut() honours the
		// same byte limit as substr() but never cuts a UTF-8 sequence in
		// half, which would leave htmlspecialchars() with invalid input.
		$limited = mb_strcut( trim( $match[0] ), 0, $maxLines * $contextchars, 'UTF-8' );
		$text = htmlspecialchars( $limited );
		return str_replace( "\n", '<br>', $text );
	}
}