includes/search/SearchHighlighter.php

   1 <?php
   2 /**
   3  * Basic search engine highlighting
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Search
  22  */
  23
  24 /**
  25  * Highlight bits of wikitext
  26  *
  27  * @ingroup Search
  28  */
  29 class SearchHighlighter {
  30         protected $mCleanWikitext = true;
  31
  32         function __construct( $cleanupWikitext = true ) {
  33                 $this->mCleanWikitext = $cleanupWikitext;
  34         }
  35
  36         /**
  37          * Default implementation of wikitext highlighting
  38          *
  39          * @param string $text
  40          * @param array $terms Terms to highlight (unescaped)
  41          * @param int $contextlines
  42          * @param int $contextchars
  43          * @return string
  44          */
  45         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
  46                 global $wgContLang, $wgSearchHighlightBoundaries;
  47
  48                 if ( $text == '' ) {
  49                         return '';
  50                 }
  51
  52                 // spli text into text + templates/links/tables
  53                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
  54                 // first capture group is for detecting nested templates/links/tables/references
  55                 $endPatterns = array(
  56                         1 => '/(\{\{)|(\}\})/', // template
  57                         2 => '/(\[\[)|(\]\])/', // image
  58                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
  59
  60                 // @todo FIXME: This should prolly be a hook or something
  61                 if ( function_exists( 'wfCite' ) ) {
  62                         $spat .= '|(<ref>)'; // references via cite extension
  63                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
  64                 }
  65                 $spat .= '/';
  66                 $textExt = array(); // text extracts
  67                 $otherExt = array(); // other extracts
  68                 $start = 0;
  69                 $textLen = strlen( $text );
  70                 $count = 0; // sequence number to maintain ordering
  71                 while ( $start < $textLen ) {
  72                         // find start of template/image/table
  73                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
  74                                 $epat = '';
  75                                 foreach ( $matches as $key => $val ) {
  76                                         if ( $key > 0 && $val[1] != - 1 ) {
  77                                                 if ( $key == 2 ) {
  78                                                         // see if this is an image link
  79                                                         $ns = substr( $val[0], 2, - 1 );
  80                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
  81                                                                 break;
  82                                                         }
  83
  84                                                 }
  85                                                 $epat = $endPatterns[$key];
  86                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
  87                                                 $start = $val[1];
  88                                                 break;
  89                                         }
  90                                 }
  91                                 if ( $epat ) {
  92                                         // find end (and detect any nested elements)
  93                                         $level = 0;
  94                                         $offset = $start + 1;
  95                                         $found = false;
  96                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
  97                                                 if ( array_key_exists( 2, $endMatches ) ) {
  98                                                         // found end
  99                                                         if ( $level == 0 ) {
 100                                                                 $len = strlen( $endMatches[2][0] );
 101                                                                 $off = $endMatches[2][1];
 102                                                                 $this->splitAndAdd( $otherExt, $count,
 103                                                                         substr( $text, $start, $off + $len - $start ) );
 104                                                                 $start = $off + $len;
 105                                                                 $found = true;
 106                                                                 break;
 107                                                         } else {
 108                                                                 // end of nested element
 109                                                                 $level -= 1;
 110                                                         }
 111                                                 } else {
 112                                                         // nested
 113                                                         $level += 1;
 114                                                 }
 115                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
 116                                         }
 117                                         if ( !$found ) {
 118                                                 // couldn't find appropriate closing tag, skip
 119                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
 120                                                 $start += strlen( $matches[0][0] );
 121                                         }
 122                                         continue;
 123                                 }
 124                         }
 125                         // else: add as text extract
 126                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
 127                         break;
 128                 }
 129
 130                 $all = $textExt + $otherExt; // these have disjunct key sets
 131
 132                 // prepare regexps
 133                 foreach ( $terms as $index => $term ) {
 134                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 135                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
 136                                 $terms[$index] = preg_replace_callback(
 137                                         '/./us',
 138                                         array( $this, 'caseCallback' ),
 139                                         $terms[$index]
 140                                 );
 141                         } else {
 142                                 $terms[$index] = $term;
 143                         }
 144                 }
 145                 $anyterm = implode( '|', $terms );
 146                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
 147
 148                 // @todo FIXME: A hack to scale contextchars, a correct solution
 149                 // would be to have contextchars actually be char and not byte
 150                 // length, and do proper utf-8 substrings and lengths everywhere,
 151                 // but PHP is making that very hard and unclean to implement :(
 152                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
 153                 $contextchars = intval( $contextchars * $scale );
 154
 155                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 156                 $patPost = "($wgSearchHighlightBoundaries|$)";
 157
 158                 $pat1 = "/(" . $phrase . ")/ui";
 159                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
 160
 161                 $left = $contextlines;
 162
 163                 $snippets = array();
 164                 $offsets = array();
 165
 166                 // show beginning only if it contains all words
 167                 $first = 0;
 168                 $firstText = '';
 169                 foreach ( $textExt as $index => $line ) {
 170                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
 171                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 172                                 $first = $index;
 173                                 break;
 174                         }
 175                 }
 176                 if ( $firstText ) {
 177                         $succ = true;
 178                         // check if first text contains all terms
 179                         foreach ( $terms as $term ) {
 180                                 if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
 181                                         $succ = false;
 182                                         break;
 183                                 }
 184                         }
 185                         if ( $succ ) {
 186                                 $snippets[$first] = $firstText;
 187                                 $offsets[$first] = 0;
 188                         }
 189                 }
 190                 if ( !$snippets ) {
 191                         // match whole query on text
 192                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
 193                         // match whole query on templates/tables/images
 194                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
 195                         // match any words on text
 196                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
 197                         // match any words on templates/tables/images
 198                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
 199
 200                         ksort( $snippets );
 201                 }
 202
 203                 // add extra chars to each snippet to make snippets constant size
 204                 $extended = array();
 205                 if ( count( $snippets ) == 0 ) {
 206                         // couldn't find the target words, just show beginning of article
 207                         if ( array_key_exists( $first, $all ) ) {
 208                                 $targetchars = $contextchars * $contextlines;
 209                                 $snippets[$first] = '';
 210                                 $offsets[$first] = 0;
 211                         }
 212                 } else {
 213                         // if begin of the article contains the whole phrase, show only that !!
 214                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
 215                                 && $offsets[$first] < $contextchars * 2 ) {
 216                                 $snippets = array( $first => $snippets[$first] );
 217                         }
 218
 219                         // calc by how much to extend existing snippets
 220                         $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
 221                 }
 222
 223                 foreach ( $snippets as $index => $line ) {
 224                         $extended[$index] = $line;
 225                         $len = strlen( $line );
 226                         if ( $len < $targetchars - 20 ) {
 227                                 // complete this line
 228                                 if ( $len < strlen( $all[$index] ) ) {
 229                                         $extended[$index] = $this->extract(
 230                                                 $all[$index],
 231                                                 $offsets[$index],
 232                                                 $offsets[$index] + $targetchars,
 233                                                 $offsets[$index]
 234                                         );
 235                                         $len = strlen( $extended[$index] );
 236                                 }
 237
 238                                 // add more lines
 239                                 $add = $index + 1;
 240                                 while ( $len < $targetchars - 20
 241                                                 && array_key_exists( $add, $all )
 242                                                 && !array_key_exists( $add, $snippets ) ) {
 243                                         $offsets[$add] = 0;
 244                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 245                                         $extended[$add] = $tt;
 246                                         $len += strlen( $tt );
 247                                         $add++;
 248                                 }
 249                         }
 250                 }
 251
 252                 // $snippets = array_map( 'htmlspecialchars', $extended );
 253                 $snippets = $extended;
 254                 $last = - 1;
 255                 $extract = '';
 256                 foreach ( $snippets as $index => $line ) {
 257                         if ( $last == - 1 ) {
 258                                 $extract .= $line; // first line
 259                         } elseif ( $last + 1 == $index
 260                                 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
 261                         ) {
 262                                 $extract .= " " . $line; // continous lines
 263                         } else {
 264                                 $extract .= '<b> ... </b>' . $line;
 265                         }
 266
 267                         $last = $index;
 268                 }
 269                 if ( $extract ) {
 270                         $extract .= '<b> ... </b>';
 271                 }
 272
 273                 $processed = array();
 274                 foreach ( $terms as $term ) {
 275                         if ( !isset( $processed[$term] ) ) {
 276                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
 277                                 $extract = preg_replace( $pat3,
 278                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 279                                 $processed[$term] = true;
 280                         }
 281                 }
 282
 283                 return $extract;
 284         }
 285
 286         /**
 287          * Split text into lines and add it to extracts array
 288          *
 289          * @param array $extracts Index -> $line
 290          * @param int $count
 291          * @param string $text
 292          */
 293         function splitAndAdd( &$extracts, &$count, $text ) {
 294                 $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
 295                 foreach ( $split as $line ) {
 296                         $tt = trim( $line );
 297                         if ( $tt ) {
 298                                 $extracts[$count++] = $tt;
 299                         }
 300                 }
 301         }
 302
 303         /**
 304          * Do manual case conversion for non-ascii chars
 305          *
 306          * @param array $matches
 307          * @return string
 308          */
 309         function caseCallback( $matches ) {
 310                 global $wgContLang;
 311                 if ( strlen( $matches[0] ) > 1 ) {
 312                         return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
 313                 } else {
 314                         return $matches[0];
 315                 }
 316         }
 317
 318         /**
 319          * Extract part of the text from start to end, but by
 320          * not chopping up words
 321          * @param string $text
 322          * @param int $start
 323          * @param int $end
 324          * @param int $posStart (out) actual start position
 325          * @param int $posEnd (out) actual end position
 326          * @return string
 327          */
 328         function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
 329                 if ( $start != 0 ) {
 330                         $start = $this->position( $text, $start, 1 );
 331                 }
 332                 if ( $end >= strlen( $text ) ) {
 333                         $end = strlen( $text );
 334                 } else {
 335                         $end = $this->position( $text, $end );
 336                 }
 337
 338                 if ( !is_null( $posStart ) ) {
 339                         $posStart = $start;
 340                 }
 341                 if ( !is_null( $posEnd ) ) {
 342                         $posEnd = $end;
 343                 }
 344
 345                 if ( $end > $start ) {
 346                         return substr( $text, $start, $end - $start );
 347                 } else {
 348                         return '';
 349                 }
 350         }
 351
 352         /**
 353          * Find a nonletter near a point (index) in the text
 354          *
 355          * @param string $text
 356          * @param int $point
 357          * @param int $offset Offset to found index
 358          * @return int Nearest nonletter index, or beginning of utf8 char if none
 359          */
 360         function position( $text, $point, $offset = 0 ) {
 361                 $tolerance = 10;
 362                 $s = max( 0, $point - $tolerance );
 363                 $l = min( strlen( $text ), $point + $tolerance ) - $s;
 364                 $m = array();
 365
 366                 if ( preg_match(
 367                         '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
 368                         substr( $text, $s, $l ),
 369                         $m,
 370                         PREG_OFFSET_CAPTURE
 371                 ) ) {
 372                         return $m[0][1] + $s + $offset;
 373                 } else {
 374                         // check if point is on a valid first UTF8 char
 375                         $char = ord( $text[$point] );
 376                         while ( $char >= 0x80 && $char < 0xc0 ) {
 377                                 // skip trailing bytes
 378                                 $point++;
 379                                 if ( $point >= strlen( $text ) ) {
 380                                         return strlen( $text );
 381                                 }
 382                                 $char = ord( $text[$point] );
 383                         }
 384
 385                         return $point;
 386
 387                 }
 388         }
 389
 390         /**
 391          * Search extracts for a pattern, and return snippets
 392          *
 393          * @param string $pattern Regexp for matching lines
 394          * @param array $extracts Extracts to search
 395          * @param int $linesleft Number of extracts to make
 396          * @param int $contextchars Length of snippet
 397          * @param array $out Map for highlighted snippets
 398          * @param array $offsets Map of starting points of snippets
 399          * @protected
 400          */
 401         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
 402                 if ( $linesleft == 0 ) {
 403                         return; // nothing to do
 404                 }
 405                 foreach ( $extracts as $index => $line ) {
 406                         if ( array_key_exists( $index, $out ) ) {
 407                                 continue; // this line already highlighted
 408                         }
 409
 410                         $m = array();
 411                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
 412                                 continue;
 413                         }
 414
 415                         $offset = $m[0][1];
 416                         $len = strlen( $m[0][0] );
 417                         if ( $offset + $len < $contextchars ) {
 418                                 $begin = 0;
 419                         } elseif ( $len > $contextchars ) {
 420                                 $begin = $offset;
 421                         } else {
 422                                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
 423                         }
 424
 425                         $end = $begin + $contextchars;
 426
 427                         $posBegin = $begin;
 428                         // basic snippet from this line
 429                         $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
 430                         $offsets[$index] = $posBegin;
 431                         $linesleft--;
 432                         if ( $linesleft == 0 ) {
 433                                 return;
 434                         }
 435                 }
 436         }
 437
 438         /**
 439          * Basic wikitext removal
 440          * @protected
 441          * @param string $text
 442          * @return mixed
 443          */
 444         function removeWiki( $text ) {
 445                 $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
 446                 $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
 447                 $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
 448                 $text = preg_replace_callback(
 449                         "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
 450                         array( $this, 'linkReplace' ),
 451                         $text
 452                 );
 453                 $text = preg_replace( "/<\/?[^>]+>/", "", $text );
 454                 $text = preg_replace( "/'''''/", "", $text );
 455                 $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
 456                 $text = preg_replace( "/''/", "", $text );
 457
 458                 return $text;
 459         }
 460
 461         /**
 462          * callback to replace [[target|caption]] kind of links, if
 463          * the target is category or image, leave it
 464          *
 465          * @param array $matches
 466          * @return string
 467          */
 468         function linkReplace( $matches ) {
 469                 $colon = strpos( $matches[1], ':' );
 470                 if ( $colon === false ) {
 471                         return $matches[2]; // replace with caption
 472                 }
 473                 global $wgContLang;
 474                 $ns = substr( $matches[1], 0, $colon );
 475                 $index = $wgContLang->getNsIndex( $ns );
 476                 if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
 477                         return $matches[0]; // return the whole thing
 478                 } else {
 479                         return $matches[2];
 480                 }
 481         }
 482
 483         /**
 484          * Simple & fast snippet extraction, but gives completely unrelevant
 485          * snippets
 486          *
 487          * @param string $text
 488          * @param array $terms
 489          * @param int $contextlines
 490          * @param int $contextchars
 491          * @return string
 492          */
 493         public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
 494                 global $wgContLang;
 495
 496                 $lines = explode( "\n", $text );
 497
 498                 $terms = implode( '|', $terms );
 499                 $max = intval( $contextchars ) + 1;
 500                 $pat1 = "/(.*)($terms)(.{0,$max})/i";
 501
 502                 $lineno = 0;
 503
 504                 $extract = "";
 505                 foreach ( $lines as $line ) {
 506                         if ( 0 == $contextlines ) {
 507                                 break;
 508                         }
 509                         ++$lineno;
 510                         $m = array();
 511                         if ( !preg_match( $pat1, $line, $m ) ) {
 512                                 continue;
 513                         }
 514                         --$contextlines;
 515                         // truncate function changes ... to relevant i18n message.
 516                         $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
 517
 518                         if ( count( $m ) < 3 ) {
 519                                 $post = '';
 520                         } else {
 521                                 $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
 522                         }
 523
 524                         $found = $m[2];
 525
 526                         $line = htmlspecialchars( $pre . $found . $post );
 527                         $pat2 = '/(' . $terms . ")/i";
 528                         $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );
 529
 530                         $extract .= "${line}\n";
 531                 }
 532
 533                 return $extract;
 534         }
 535
 536         /**
 537          * Returns the first few lines of the text
 538          *
 539          * @param string $text
 540          * @param int $contextlines Max number of returned lines
 541          * @param int $contextchars Average number of characters per line
 542          * @return string
 543          */
 544         public function highlightNone( $text, $contextlines, $contextchars ) {
 545                 $match = array();
 546                 $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
 547                 $text = str_replace( "\n\n", "\n", $text ); // remove empty lines
 548                 preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
 549                 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) ); // trim and limit to max number of chars
 550                 return str_replace( "\n", '<br>', $text );
 551         }
 552 }