includes/content/WikiTextStructure.php

   1 <?php
   2
   3 use HtmlFormatter\HtmlFormatter;
   4
   5 /**
   6  * Class allowing to explore structure of parsed wikitext.
   7  */
   8 class WikiTextStructure {
   9         /**
  10          * @var string
  11          */
  12         private $openingText;
  13         /**
  14          * @var string
  15          */
  16         private $allText;
  17         /**
  18          * @var string[]
  19          */
  20         private $auxText = [];
  21         /**
  22          * @var ParserOutput
  23          */
  24         private $parserOutput;
  25
  26         /**
  27          * @var string[] selectors to elements that are excluded entirely from search
  28          */
  29         private $excludedElementSelectors = [
  30                 'audio', 'video',       // "it looks like you don't have javascript enabled..."
  31                                         // do not need to index
  32                 'sup.reference',        // The [1] for references
  33                 '.mw-cite-backlink',    // The ↑ next to references in the references section
  34                 'h1', 'h2', 'h3',       // Headings are already indexed in their own field.
  35                 'h5', 'h6', 'h4',
  36                 '.autocollapse',        // Collapsed fields are hidden by default so we don't want them
  37                                                                 // showing up.
  38         ];
  39
  40         /**
  41          * @var string[] selectors to elements that are considered auxiliary to article text for search
  42          */
  43         private $auxiliaryElementSelectors = [
  44                 '.thumbcaption',        // Thumbnail captions aren't really part of the text proper
  45                 'table',                // Neither are tables
  46                 '.rellink',             // Common style for "See also:".
  47                 '.dablink',             // Common style for calling out helpful links at the top
  48                                                                 // of the article.
  49                 '.searchaux',           // New class users can use to mark stuff as auxiliary to searches.
  50         ];
  51
  52         /**
  53          * WikiTextStructure constructor.
  54          * @param ParserOutput $parserOutput
  55          */
  56         public function __construct( ParserOutput $parserOutput ) {
  57                 $this->parserOutput = $parserOutput;
  58         }
  59
  60         /**
  61          * Get headings on the page.
  62          * @return string[]
  63          * First strip out things that look like references.  We can't use HTML filtering because
  64          * the references come back as <sup> tags without a class.  To keep from breaking stuff like
  65          *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
  66          * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
  67          * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
  68          * or something.  Whatever.  So we only strip things that look like <sup> tags wrapping a
  69          * reference.  And since the data looks like:
  70          *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
  71          * we can not really use HtmlFormatter as we have no suitable selector.
  72          */
  73         public function headings() {
  74                 $headings = [];
  75                 $ignoredHeadings = $this->getIgnoredHeadings();
  76                 foreach ( $this->parserOutput->getSections() as $heading ) {
  77                         $heading = $heading[ 'line' ];
  78
  79                         // Some wikis wrap the brackets in a span:
  80                         // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
  81                         $heading = preg_replace( '/<\/?span>/', '', $heading );
  82                         // Normalize [] so the following regexp would work.
  83                         $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
  84                         $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
  85
  86                         // Strip tags from the heading or else we'll display them (escaped) in search results
  87                         $heading = trim( Sanitizer::stripAllTags( $heading ) );
  88
  89                         // Note that we don't take the level of the heading into account - all headings are equal.
  90                         // Except the ones we ignore.
  91                         if ( !in_array( $heading, $ignoredHeadings ) ) {
  92                                 $headings[] = $heading;
  93                         }
  94                 }
  95                 return $headings;
  96         }
  97
  98         /**
  99          * Parse a message content into an array. This function is generally used to
 100          * parse settings stored as i18n messages (see search-ignored-headings).
 101          *
 102          * @param string $message
 103          * @return string[]
 104          */
 105         public static function parseSettingsInMessage( $message ) {
 106                 $lines = explode( "\n", $message );
 107                 $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
 108                 $lines = array_map( 'trim', $lines );          // Remove extra spaces
 109                 $lines = array_filter( $lines );               // Remove empty lines
 110                 return $lines;
 111         }
 112
 113         /**
 114          * Get list of heading to ignore.
 115          * @return string[]
 116          */
 117         private function getIgnoredHeadings() {
 118                 static $ignoredHeadings = null;
 119                 if ( $ignoredHeadings === null ) {
 120                         $ignoredHeadings = [];
 121                         $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
 122                         if ( $source->isBlank() ) {
 123                                 // Try old version too, just in case
 124                                 $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
 125                         }
 126                         if ( !$source->isDisabled() ) {
 127                                 $lines = self::parseSettingsInMessage( $source->plain() );
 128                                 $ignoredHeadings = $lines;               // Now we just have headings!
 129                         }
 130                 }
 131                 return $ignoredHeadings;
 132         }
 133
 134         /**
 135          * Extract parts of the text - opening, main and auxiliary.
 136          */
 137         private function extractWikitextParts() {
 138                 if ( !is_null( $this->allText ) ) {
 139                         return;
 140                 }
 141                 $this->parserOutput->setEditSectionTokens( false );
 142                 $this->parserOutput->setTOCEnabled( false );
 143                 $text = $this->parserOutput->getText();
 144                 if ( strlen( $text ) == 0 ) {
 145                         $this->allText = "";
 146                         // empty text - nothing to seek here
 147                         return;
 148                 }
 149                 $opening = null;
 150
 151                 $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
 152
 153                 // Add extra spacing around break tags so text crammed together like<br>this
 154                 // doesn't make one word.
 155                 $text = str_replace( '<br', "\n<br", $text );
 156
 157                 $formatter = new HtmlFormatter( $text );
 158
 159                 // Strip elements from the page that we never want in the search text.
 160                 $formatter->remove( $this->excludedElementSelectors );
 161                 $formatter->filterContent();
 162
 163                 // Strip elements from the page that are auxiliary text.  These will still be
 164                 // searched but matches will be ranked lower and non-auxiliary matches will be
 165                 // preferred in highlighting.
 166                 $formatter->remove( $this->auxiliaryElementSelectors );
 167                 $auxiliaryElements = $formatter->filterContent();
 168                 $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 169                 foreach ( $auxiliaryElements as $auxiliaryElement ) {
 170                         $this->auxText[] =
 171                                 trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
 172                 }
 173         }
 174
 175         /**
 176          * Get text before first heading.
 177          * @param string $text
 178          * @return string|null
 179          */
 180         private function extractHeadingBeforeFirstHeading( $text ) {
 181                 $matches = [];
 182                 if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
 183                         // There isn't a first heading so we interpret this as the article
 184                         // being entirely without heading.
 185                         return null;
 186                 }
 187                 $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
 188                 if ( !$text ) {
 189                         // There isn't any text before the first heading so we declare there isn't
 190                         // a first heading.
 191                         return null;
 192                 }
 193
 194                 $formatter = new HtmlFormatter( $text );
 195                 $formatter->remove( $this->excludedElementSelectors );
 196                 $formatter->remove( $this->auxiliaryElementSelectors );
 197                 $formatter->filterContent();
 198                 $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 199
 200                 if ( !$text ) {
 201                         // There isn't any text after filtering before the first heading so we declare
 202                         // that there isn't a first heading.
 203                         return null;
 204                 }
 205
 206                 return $text;
 207         }
 208
 209         /**
 210          * Get opening text
 211          * @return string
 212          */
 213         public function getOpeningText() {
 214                 $this->extractWikitextParts();
 215                 return $this->openingText;
 216         }
 217
 218         /**
 219          * Get main text
 220          * @return string
 221          */
 222         public function getMainText() {
 223                 $this->extractWikitextParts();
 224                 return $this->allText;
 225         }
 226
 227         /**
 228          * Get auxiliary text
 229          * @return string[]
 230          */
 231         public function getAuxiliaryText() {
 232                 $this->extractWikitextParts();
 233                 return $this->auxText;
 234         }
 235
 236         /**
 237          * Get the defaultsort property
 238          * @return string|null
 239          */
 240         public function getDefaultSort() {
 241                 return $this->parserOutput->getProperty( 'defaultsort' );
 242         }
 243 }