includes/content/WikiTextStructure.php

   1 <?php
   2
   3 use HtmlFormatter\HtmlFormatter;
   4
   5 /**
   6  * Explores the structure of parsed wikitext: headings, opening text, main text and auxiliary text.
   7  */
   8 class WikiTextStructure {
   9         /**
  10          * @var string
  11          */
  12         private $openingText;
  13         /**
  14          * @var string
  15          */
  16         private $allText;
  17         /**
  18          * @var string[]
  19          */
  20         private $auxText = [];
  21         /**
  22          * @var ParserOutput
  23          */
  24         private $parserOutput;
  25
  26         /**
  27          * @var string[] selectors to elements that are excluded entirely from search
  28          */
  29         private $excludedElementSelectors = [
  30                 // "it looks like you don't have javascript enabled..." – do not need to index
  31                 'audio', 'video',
  32                 // CSS stylesheets aren't content
  33                 'style',
  34                 // The [1] for references
  35                 'sup.reference',
  36                 // The ↑ next to references in the references section
  37                 '.mw-cite-backlink',
  38                 // Headings are already indexed in their own field.
  39                 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  40                 // Collapsed fields are hidden by default so we don't want them showing up.
  41                 '.autocollapse',
  42                 // Content explicitly decided to be not searchable by editors such
  43                 // as custom navigation templates.
  44                 '.navigation-not-searchable'
  45         ];
  46
  47         /**
  48          * @var string[] selectors to elements that are considered auxiliary to article text for search
  49          */
  50         private $auxiliaryElementSelectors = [
  51                 // Thumbnail captions aren't really part of the text proper
  52                 '.thumbcaption',
  53                 // Neither are tables
  54                 'table',
  55                 // Common style for "See also:".
  56                 '.rellink',
  57                 // Common style for calling out helpful links at the top of the article.
  58                 '.dablink',
  59                 // New class users can use to mark stuff as auxiliary to searches.
  60                 '.searchaux',
  61         ];
  62
  63         /**
  64          * @param ParserOutput $parserOutput
  65          */
  66         public function __construct( ParserOutput $parserOutput ) {
  67                 $this->parserOutput = $parserOutput;
  68         }
  69
  70         /**
  71          * Get headings on the page.
  72          * @return string[]
  73          * First strip out things that look like references.  We can't use HTML filtering because
  74          * the references come back as <sup> tags without a class.  To keep from breaking stuff like
  75          *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
  76          * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
  77          * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
  78          * or something.  Whatever.  So we only strip things that look like <sup> tags wrapping a
  79          * reference.  And since the data looks like:
  80          *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
  81          * we can not really use HtmlFormatter as we have no suitable selector.
  82          */
  83         public function headings() {
  84                 $headings = [];
  85                 $ignoredHeadings = $this->getIgnoredHeadings();
  86                 foreach ( $this->parserOutput->getSections() as $heading ) {
  87                         $heading = $heading[ 'line' ];
  88
  89                         // Some wikis wrap the brackets in a span:
  90                         // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
  91                         $heading = preg_replace( '/<\/?span>/', '', $heading );
  92                         // Normalize [] so the following regexp would work.
  93                         $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
  94                         $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
  95
  96                         // Strip tags from the heading or else we'll display them (escaped) in search results
  97                         $heading = trim( Sanitizer::stripAllTags( $heading ) );
  98
  99                         // Note that we don't take the level of the heading into account - all headings are equal.
 100                         // Except the ones we ignore.
 101                         if ( !in_array( $heading, $ignoredHeadings ) ) {
 102                                 $headings[] = $heading;
 103                         }
 104                 }
 105                 return $headings;
 106         }
 107
 108         /**
 109          * Parse a message content into an array. This function is generally used to
 110          * parse settings stored as i18n messages (see search-ignored-headings).
 111          *
 112          * @param string $message
 113          * @return string[]
 114          */
 115         public static function parseSettingsInMessage( $message ) {
 116                 $lines = explode( "\n", $message );
 117                 $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
 118                 $lines = array_map( 'trim', $lines );          // Remove extra spaces
 119                 $lines = array_filter( $lines );               // Remove empty lines
 120                 return $lines;
 121         }
 122
 123         /**
 124          * Get list of heading to ignore.
 125          * @return string[]
 126          */
 127         private function getIgnoredHeadings() {
 128                 static $ignoredHeadings = null;
 129                 if ( $ignoredHeadings === null ) {
 130                         $ignoredHeadings = [];
 131                         $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
 132                         if ( $source->isBlank() ) {
 133                                 // Try old version too, just in case
 134                                 $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
 135                         }
 136                         if ( !$source->isDisabled() ) {
 137                                 $lines = self::parseSettingsInMessage( $source->plain() );
 138                                 $ignoredHeadings = $lines;               // Now we just have headings!
 139                         }
 140                 }
 141                 return $ignoredHeadings;
 142         }
 143
 144         /**
 145          * Extract parts of the text - opening, main and auxiliary.
 146          */
 147         private function extractWikitextParts() {
 148                 if ( !is_null( $this->allText ) ) {
 149                         return;
 150                 }
 151                 $text = $this->parserOutput->getText( [
 152                         'enableSectionEditTokens' => false,
 153                         'allowTOC' => false,
 154                 ] );
 155                 if ( strlen( $text ) == 0 ) {
 156                         $this->allText = "";
 157                         // empty text - nothing to seek here
 158                         return;
 159                 }
 160                 $opening = null;
 161
 162                 $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
 163
 164                 // Add extra spacing around break tags so text crammed together like<br>this
 165                 // doesn't make one word.
 166                 $text = str_replace( '<br', "\n<br", $text );
 167
 168                 $formatter = new HtmlFormatter( $text );
 169
 170                 // Strip elements from the page that we never want in the search text.
 171                 $formatter->remove( $this->excludedElementSelectors );
 172                 $formatter->filterContent();
 173
 174                 // Strip elements from the page that are auxiliary text.  These will still be
 175                 // searched but matches will be ranked lower and non-auxiliary matches will be
 176                 // preferred in highlighting.
 177                 $formatter->remove( $this->auxiliaryElementSelectors );
 178                 $auxiliaryElements = $formatter->filterContent();
 179                 $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 180                 foreach ( $auxiliaryElements as $auxiliaryElement ) {
 181                         $this->auxText[] =
 182                                 trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
 183                 }
 184         }
 185
 186         /**
 187          * Get text before first heading.
 188          * @param string $text
 189          * @return string|null
 190          */
 191         private function extractHeadingBeforeFirstHeading( $text ) {
 192                 $matches = [];
 193                 if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
 194                         // There isn't a first heading so we interpret this as the article
 195                         // being entirely without heading.
 196                         return null;
 197                 }
 198                 $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
 199                 if ( !$text ) {
 200                         // There isn't any text before the first heading so we declare there isn't
 201                         // a first heading.
 202                         return null;
 203                 }
 204
 205                 $formatter = new HtmlFormatter( $text );
 206                 $formatter->remove( $this->excludedElementSelectors );
 207                 $formatter->remove( $this->auxiliaryElementSelectors );
 208                 $formatter->filterContent();
 209                 $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 210
 211                 if ( !$text ) {
 212                         // There isn't any text after filtering before the first heading so we declare
 213                         // that there isn't a first heading.
 214                         return null;
 215                 }
 216
 217                 return $text;
 218         }
 219
 220         /**
 221          * Get opening text
 222          * @return string
 223          */
 224         public function getOpeningText() {
 225                 $this->extractWikitextParts();
 226                 return $this->openingText;
 227         }
 228
 229         /**
 230          * Get main text
 231          * @return string
 232          */
 233         public function getMainText() {
 234                 $this->extractWikitextParts();
 235                 return $this->allText;
 236         }
 237
 238         /**
 239          * Get auxiliary text
 240          * @return string[]
 241          */
 242         public function getAuxiliaryText() {
 243                 $this->extractWikitextParts();
 244                 return $this->auxText;
 245         }
 246
 247         /**
 248          * Get the defaultsort property
 249          * @return string|null
 250          */
 251         public function getDefaultSort() {
 252                 return $this->parserOutput->getProperty( 'defaultsort' );
 253         }
 254 }