includes/search/SearchHighlighter.php

   1 <?php
   2 /**
   3  * Basic search engine highlighting
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Search
  22  */
  23
  24 use MediaWiki\MediaWikiServices;
  25
  26 /**
  27  * Highlight bits of wikitext
  28  *
  29  * @ingroup Search
  30  */
  31 class SearchHighlighter {
  32         protected $mCleanWikitext = true;
  33
  34         /**
  35          * @warning If you pass false to this constructor, then
  36          *  the caller is responsible for HTML escaping.
  37          * @param bool $cleanupWikitext
  38          */
  39         function __construct( $cleanupWikitext = true ) {
  40                 $this->mCleanWikitext = $cleanupWikitext;
  41         }
  42
  43         /**
  44          * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
  45          *
  46          * @param string $text
  47          * @param string[] $terms Terms to highlight (not html escaped but
  48          *   regex escaped via SearchDatabase::regexTerm())
  49          * @param int $contextlines
  50          * @param int $contextchars
  51          * @return string
  52          */
  53         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
  54                 global $wgSearchHighlightBoundaries;
  55
  56                 if ( $text == '' ) {
  57                         return '';
  58                 }
  59
  60                 // spli text into text + templates/links/tables
  61                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
  62                 // first capture group is for detecting nested templates/links/tables/references
  63                 $endPatterns = [
  64                         1 => '/(\{\{)|(\}\})/', // template
  65                         2 => '/(\[\[)|(\]\])/', // image
  66                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table
  67
  68                 // @todo FIXME: This should prolly be a hook or something
  69                 // instead of hardcoding a class name from the Cite extension
  70                 if ( class_exists( 'Cite' ) ) {
  71                         $spat .= '|(<ref>)'; // references via cite extension
  72                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
  73                 }
  74                 $spat .= '/';
  75                 $textExt = []; // text extracts
  76                 $otherExt = []; // other extracts
  77                 $start = 0;
  78                 $textLen = strlen( $text );
  79                 $count = 0; // sequence number to maintain ordering
  80                 while ( $start < $textLen ) {
  81                         // find start of template/image/table
  82                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
  83                                 $epat = '';
  84                                 foreach ( $matches as $key => $val ) {
  85                                         if ( $key > 0 && $val[1] != -1 ) {
  86                                                 if ( $key == 2 ) {
  87                                                         // see if this is an image link
  88                                                         $ns = substr( $val[0], 2, -1 );
  89                                                         if (
  90                                                                 MediaWikiServices::getInstance()->getContentLanguage()->
  91                                                                 getNsIndex( $ns ) != NS_FILE
  92                                                         ) {
  93                                                                 break;
  94                                                         }
  95
  96                                                 }
  97                                                 $epat = $endPatterns[$key];
  98                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
  99                                                 $start = $val[1];
 100                                                 break;
 101                                         }
 102                                 }
 103                                 if ( $epat ) {
 104                                         // find end (and detect any nested elements)
 105                                         $level = 0;
 106                                         $offset = $start + 1;
 107                                         $found = false;
 108                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
 109                                                 if ( array_key_exists( 2, $endMatches ) ) {
 110                                                         // found end
 111                                                         if ( $level == 0 ) {
 112                                                                 $len = strlen( $endMatches[2][0] );
 113                                                                 $off = $endMatches[2][1];
 114                                                                 $this->splitAndAdd( $otherExt, $count,
 115                                                                         substr( $text, $start, $off + $len - $start ) );
 116                                                                 $start = $off + $len;
 117                                                                 $found = true;
 118                                                                 break;
 119                                                         } else {
 120                                                                 // end of nested element
 121                                                                 $level -= 1;
 122                                                         }
 123                                                 } else {
 124                                                         // nested
 125                                                         $level += 1;
 126                                                 }
 127                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
 128                                         }
 129                                         if ( !$found ) {
 130                                                 // couldn't find appropriate closing tag, skip
 131                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
 132                                                 $start += strlen( $matches[0][0] );
 133                                         }
 134                                         continue;
 135                                 }
 136                         }
 137                         // else: add as text extract
 138                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
 139                         break;
 140                 }
 141
 142                 $all = $textExt + $otherExt; // these have disjunct key sets
 143
 144                 // prepare regexps
 145                 foreach ( $terms as $index => $term ) {
 146                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 147                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
 148                                 $terms[$index] = preg_replace_callback(
 149                                         '/./us',
 150                                         [ $this, 'caseCallback' ],
 151                                         $terms[$index]
 152                                 );
 153                         } else {
 154                                 $terms[$index] = $term;
 155                         }
 156                 }
 157                 $anyterm = implode( '|', $terms );
 158                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
 159                 // @todo FIXME: A hack to scale contextchars, a correct solution
 160                 // would be to have contextchars actually be char and not byte
 161                 // length, and do proper utf-8 substrings and lengths everywhere,
 162                 // but PHP is making that very hard and unclean to implement :(
 163                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
 164                 $contextchars = intval( $contextchars * $scale );
 165
 166                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 167                 $patPost = "($wgSearchHighlightBoundaries|$)";
 168
 169                 $pat1 = "/(" . $phrase . ")/ui";
 170                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
 171
 172                 $left = $contextlines;
 173
 174                 $snippets = [];
 175                 $offsets = [];
 176
 177                 // show beginning only if it contains all words
 178                 $first = 0;
 179                 $firstText = '';
 180                 foreach ( $textExt as $index => $line ) {
 181                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
 182                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 183                                 $first = $index;
 184                                 break;
 185                         }
 186                 }
 187                 if ( $firstText ) {
 188                         $succ = true;
 189                         // check if first text contains all terms
 190                         foreach ( $terms as $term ) {
 191                                 if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
 192                                         $succ = false;
 193                                         break;
 194                                 }
 195                         }
 196                         if ( $succ ) {
 197                                 $snippets[$first] = $firstText;
 198                                 $offsets[$first] = 0;
 199                         }
 200                 }
 201                 if ( !$snippets ) {
 202                         // match whole query on text
 203                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
 204                         // match whole query on templates/tables/images
 205                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
 206                         // match any words on text
 207                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
 208                         // match any words on templates/tables/images
 209                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
 210
 211                         ksort( $snippets );
 212                 }
 213
 214                 // add extra chars to each snippet to make snippets constant size
 215                 $extended = [];
 216                 if ( count( $snippets ) == 0 ) {
 217                         // couldn't find the target words, just show beginning of article
 218                         if ( array_key_exists( $first, $all ) ) {
 219                                 $targetchars = $contextchars * $contextlines;
 220                                 $snippets[$first] = '';
 221                                 $offsets[$first] = 0;
 222                         }
 223                 } else {
 224                         // if begin of the article contains the whole phrase, show only that !!
 225                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
 226                                 && $offsets[$first] < $contextchars * 2 ) {
 227                                 $snippets = [ $first => $snippets[$first] ];
 228                         }
 229
 230                         // calc by how much to extend existing snippets
 231                         $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
 232                 }
 233
 234                 foreach ( $snippets as $index => $line ) {
 235                         $extended[$index] = $line;
 236                         $len = strlen( $line );
 237                         if ( $len < $targetchars - 20 ) {
 238                                 // complete this line
 239                                 if ( $len < strlen( $all[$index] ) ) {
 240                                         $extended[$index] = $this->extract(
 241                                                 $all[$index],
 242                                                 $offsets[$index],
 243                                                 $offsets[$index] + $targetchars,
 244                                                 $offsets[$index]
 245                                         );
 246                                         $len = strlen( $extended[$index] );
 247                                 }
 248
 249                                 // add more lines
 250                                 $add = $index + 1;
 251                                 while ( $len < $targetchars - 20
 252                                                 && array_key_exists( $add, $all )
 253                                                 && !array_key_exists( $add, $snippets ) ) {
 254                                         $offsets[$add] = 0;
 255                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 256                                         $extended[$add] = $tt;
 257                                         $len += strlen( $tt );
 258                                         $add++;
 259                                 }
 260                         }
 261                 }
 262
 263                 // $snippets = array_map( 'htmlspecialchars', $extended );
 264                 $snippets = $extended;
 265                 $last = -1;
 266                 $extract = '';
 267                 foreach ( $snippets as $index => $line ) {
 268                         if ( $last == -1 ) {
 269                                 $extract .= $line; // first line
 270                         } elseif ( $last + 1 == $index
 271                                 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
 272                         ) {
 273                                 $extract .= " " . $line; // continous lines
 274                         } else {
 275                                 $extract .= '<b> ... </b>' . $line;
 276                         }
 277
 278                         $last = $index;
 279                 }
 280                 if ( $extract ) {
 281                         $extract .= '<b> ... </b>';
 282                 }
 283
 284                 $processed = [];
 285                 foreach ( $terms as $term ) {
 286                         if ( !isset( $processed[$term] ) ) {
 287                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
 288                                 $extract = preg_replace( $pat3,
 289                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 290                                 $processed[$term] = true;
 291                         }
 292                 }
 293
 294                 return $extract;
 295         }
 296
 297         /**
 298          * Split text into lines and add it to extracts array
 299          *
 300          * @param array &$extracts Index -> $line
 301          * @param int &$count
 302          * @param string $text
 303          */
 304         function splitAndAdd( &$extracts, &$count, $text ) {
 305                 $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
 306                 foreach ( $split as $line ) {
 307                         $tt = trim( $line );
 308                         if ( $tt ) {
 309                                 $extracts[$count++] = $tt;
 310                         }
 311                 }
 312         }
 313
 314         /**
 315          * Do manual case conversion for non-ascii chars
 316          *
 317          * @param array $matches
 318          * @return string
 319          */
 320         function caseCallback( $matches ) {
 321                 if ( strlen( $matches[0] ) > 1 ) {
 322                         $contLang = MediaWikiServices::getInstance()->getContentLanguage();
 323                         return '[' . $contLang->lc( $matches[0] ) .
 324                                 $contLang->uc( $matches[0] ) . ']';
 325                 } else {
 326                         return $matches[0];
 327                 }
 328         }
 329
 330         /**
 331          * Extract part of the text from start to end, but by
 332          * not chopping up words
 333          * @param string $text
 334          * @param int $start
 335          * @param int $end
 336          * @param int|null &$posStart (out) actual start position
 337          * @param int|null &$posEnd (out) actual end position
 338          * @return string
 339          */
 340         function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
 341                 if ( $start != 0 ) {
 342                         $start = $this->position( $text, $start, 1 );
 343                 }
 344                 if ( $end >= strlen( $text ) ) {
 345                         $end = strlen( $text );
 346                 } else {
 347                         $end = $this->position( $text, $end );
 348                 }
 349
 350                 if ( !is_null( $posStart ) ) {
 351                         $posStart = $start;
 352                 }
 353                 if ( !is_null( $posEnd ) ) {
 354                         $posEnd = $end;
 355                 }
 356
 357                 if ( $end > $start ) {
 358                         return substr( $text, $start, $end - $start );
 359                 } else {
 360                         return '';
 361                 }
 362         }
 363
 364         /**
 365          * Find a nonletter near a point (index) in the text
 366          *
 367          * @param string $text
 368          * @param int $point
 369          * @param int $offset Offset to found index
 370          * @return int Nearest nonletter index, or beginning of utf8 char if none
 371          */
 372         function position( $text, $point, $offset = 0 ) {
 373                 $tolerance = 10;
 374                 $s = max( 0, $point - $tolerance );
 375                 $l = min( strlen( $text ), $point + $tolerance ) - $s;
 376                 $m = [];
 377
 378                 if ( preg_match(
 379                         '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
 380                         substr( $text, $s, $l ),
 381                         $m,
 382                         PREG_OFFSET_CAPTURE
 383                 ) ) {
 384                         return $m[0][1] + $s + $offset;
 385                 } else {
 386                         // check if point is on a valid first UTF8 char
 387                         $char = ord( $text[$point] );
 388                         while ( $char >= 0x80 && $char < 0xc0 ) {
 389                                 // skip trailing bytes
 390                                 $point++;
 391                                 if ( $point >= strlen( $text ) ) {
 392                                         return strlen( $text );
 393                                 }
 394                                 $char = ord( $text[$point] );
 395                         }
 396
 397                         return $point;
 398
 399                 }
 400         }
 401
 402         /**
 403          * Search extracts for a pattern, and return snippets
 404          *
 405          * @param string $pattern Regexp for matching lines
 406          * @param array $extracts Extracts to search
 407          * @param int &$linesleft Number of extracts to make
 408          * @param int &$contextchars Length of snippet
 409          * @param array &$out Map for highlighted snippets
 410          * @param array &$offsets Map of starting points of snippets
 411          * @protected
 412          */
 413         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
 414                 if ( $linesleft == 0 ) {
 415                         return; // nothing to do
 416                 }
 417                 foreach ( $extracts as $index => $line ) {
 418                         if ( array_key_exists( $index, $out ) ) {
 419                                 continue; // this line already highlighted
 420                         }
 421
 422                         $m = [];
 423                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
 424                                 continue;
 425                         }
 426
 427                         $offset = $m[0][1];
 428                         $len = strlen( $m[0][0] );
 429                         if ( $offset + $len < $contextchars ) {
 430                                 $begin = 0;
 431                         } elseif ( $len > $contextchars ) {
 432                                 $begin = $offset;
 433                         } else {
 434                                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
 435                         }
 436
 437                         $end = $begin + $contextchars;
 438
 439                         $posBegin = $begin;
 440                         // basic snippet from this line
 441                         $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
 442                         $offsets[$index] = $posBegin;
 443                         $linesleft--;
 444                         if ( $linesleft == 0 ) {
 445                                 return;
 446                         }
 447                 }
 448         }
 449
 450         /**
 451          * Basic wikitext removal
 452          * @protected
 453          * @param string $text
 454          * @return mixed
 455          */
 456         function removeWiki( $text ) {
 457                 $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
 458                 $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
 459                 $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
 460                 $text = preg_replace_callback(
 461                         "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
 462                         [ $this, 'linkReplace' ],
 463                         $text
 464                 );
 465                 $text = preg_replace( "/<\/?[^>]+>/", "", $text );
 466                 $text = preg_replace( "/'''''/", "", $text );
 467                 $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
 468                 $text = preg_replace( "/''/", "", $text );
 469
 470                 // Note, the previous /<\/?[^>]+>/ is insufficient
 471                 // for XSS safety as the HTML tag can span multiple
 472                 // search results (T144845).
 473                 $text = Sanitizer::escapeHtmlAllowEntities( $text );
 474                 return $text;
 475         }
 476
 477         /**
 478          * callback to replace [[target|caption]] kind of links, if
 479          * the target is category or image, leave it
 480          *
 481          * @param array $matches
 482          * @return string
 483          */
 484         function linkReplace( $matches ) {
 485                 $colon = strpos( $matches[1], ':' );
 486                 if ( $colon === false ) {
 487                         return $matches[2]; // replace with caption
 488                 }
 489                 $ns = substr( $matches[1], 0, $colon );
 490                 $index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
 491                 if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
 492                         return $matches[0]; // return the whole thing
 493                 } else {
 494                         return $matches[2];
 495                 }
 496         }
 497
 498         /**
 499          * Simple & fast snippet extraction, but gives completely unrelevant
 500          * snippets
 501          *
 502          * Used when $wgAdvancedSearchHighlighting is false.
 503          *
 504          * @param string $text
 505          * @param string[] $terms Escaped for regex by SearchDatabase::regexTerm()
 506          * @param int $contextlines
 507          * @param int $contextchars
 508          * @return string
 509          */
 510         public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
 511                 $lines = explode( "\n", $text );
 512
 513                 $terms = implode( '|', $terms );
 514                 $max = intval( $contextchars ) + 1;
 515                 $pat1 = "/(.*)($terms)(.{0,$max})/i";
 516
 517                 $lineno = 0;
 518
 519                 $extract = "";
 520                 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
 521                 foreach ( $lines as $line ) {
 522                         if ( $contextlines == 0 ) {
 523                                 break;
 524                         }
 525                         ++$lineno;
 526                         $m = [];
 527                         if ( !preg_match( $pat1, $line, $m ) ) {
 528                                 continue;
 529                         }
 530                         --$contextlines;
 531                         // truncate function changes ... to relevant i18n message.
 532                         $pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );
 533
 534                         if ( count( $m ) < 3 ) {
 535                                 $post = '';
 536                         } else {
 537                                 $post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
 538                         }
 539
 540                         $found = $m[2];
 541
 542                         $line = htmlspecialchars( $pre . $found . $post );
 543                         $pat2 = '/(' . $terms . ")/i";
 544                         $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );
 545
 546                         $extract .= "${line}\n";
 547                 }
 548
 549                 return $extract;
 550         }
 551
 552         /**
 553          * Returns the first few lines of the text
 554          *
 555          * @param string $text
 556          * @param int $contextlines Max number of returned lines
 557          * @param int $contextchars Average number of characters per line
 558          * @return string
 559          */
 560         public function highlightNone( $text, $contextlines, $contextchars ) {
 561                 $match = [];
 562                 $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
 563                 $text = str_replace( "\n\n", "\n", $text ); // remove empty lines
 564                 preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
 565
 566                 // Trim and limit to max number of chars
 567                 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
 568                 return str_replace( "\n", '<br>', $text );
 569         }
 570 }