Make content handlers assemble content for search
[lhc/web/wiklou.git] / includes / content / WikiTextStructure.php
1 <?php
2
3 use HtmlFormatter\HtmlFormatter;
4 use MediaWiki\Logger\LoggerFactory;
5
/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes the pieces a search indexer is interested in:
 * categories, outgoing links, templates, headings, and the page text split into
 * opening, main and auxiliary parts. Text extraction is done lazily on the first
 * call to one of the get*Text() methods and the result is kept on the instance.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text before the first heading, null if there is none
	 *  (or if the text has not been extracted yet)
	 */
	private $openingText;
	/**
	 * @var string|null Main text of the page, null until the text has been extracted
	 */
	private $allText;
	/**
	 * @var string[] Text of each element matched by the auxiliary selectors
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		'audio', 'video',       // "it looks like you don't have javascript enabled..."
								// do not need to index
		'sup.reference',        // The [1] for references
		'.mw-cite-backlink',    // The ↑ next to references in the references section
		'h1', 'h2', 'h3',       // Headings are already indexed in their own field.
		'h5', 'h6', 'h4',
		'.autocollapse',        // Collapsed fields are hidden by default so we don't want them
								// showing up.
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		'.thumbcaption',        // Thumbnail captions aren't really part of the text proper
		'table',                // Neither are tables
		'.rellink',             // Common style for "See also:".
		'.dablink',             // Common style for calling out helpful links at the top
								// of the article.
		'.searchaux',           // New class users can use to mark stuff as auxiliary to searches.
	];

	/**
	 * WikiTextStructure constructor.
	 * @param ParserOutput $parserOutput Parsed page to explore
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get categories in the text.
	 *
	 * Category names that cannot be turned into a valid title are skipped.
	 *
	 * @return string[] Category titles, without the namespace prefix
	 */
	public function categories() {
		$categories = [];
		foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
			// Both Category::newFromName() and Category::getTitle() are documented to
			// return false on failure; chaining them blindly would be a fatal error.
			$category = Category::newFromName( $key );
			$title = $category ? $category->getTitle() : false;
			if ( !$title ) {
				continue;
			}
			$categories[] = $title->getText();
		}
		return $categories;
	}

	/**
	 * Get outgoing links.
	 * @return string[] Prefixed DB keys of every linked title
	 */
	public function outgoingLinks() {
		$outgoingLinks = [];
		foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
			foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
				$outgoingLinks[] =
					Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
			}
		}
		return $outgoingLinks;
	}

	/**
	 * Get templates in the text.
	 *
	 * Only templates whose title is valid and exists are reported.
	 *
	 * @return string[] Prefixed text of every used template
	 */
	public function templates() {
		$templates = [];
		foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
			foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
				$templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
				if ( $templateTitle && $templateTitle->exists() ) {
					$templates[] = $templateTitle->getPrefixedText();
				}
			}
		}
		return $templates;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
	 * reference. And since the data looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings, minus the ignored ones
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $section ) {
			$heading = $section['line'];

			// Some wikis wrap the brackets in a span:
			// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore.
			// Strict comparison: with loose comparison numeric-looking headings such as
			// "1e3" and "1000" would be treated as the same heading.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Everything from a "#" to the end of the line is a comment. Lines that are
	 * empty once comments and surrounding whitespace are removed are dropped.
	 * The returned array keeps the original line indexes as keys.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
		$lines = array_map( 'trim', $lines ); // Remove extra spaces
		// Remove empty lines. Compare against '' explicitly: a bare array_filter()
		// drops every falsy value and would also lose a line consisting of "0".
		$lines = array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
		return $lines;
	}

	/**
	 * Get list of headings to ignore.
	 *
	 * The list is read from an i18n message in the content language and kept in a
	 * function-level static, so the message is parsed only once per request.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			// FIXME: will be renamed in next patches to search-ignored-headings
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			$ignoredHeadings = [];
			if ( !$source->isDisabled() ) {
				// Every remaining line of the message is a heading to ignore.
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing if the text has already been extracted. Note that this turns
	 * edit section tokens and the table of contents off on the parser output.
	 */
	private function extractWikitextParts() {
		if ( $this->allText !== null ) {
			// Already extracted.
			return;
		}
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( strlen( $text ) == 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 * @param string $text Rendered HTML of the page
	 * @return string|null Plain text preceding the first heading, or null if the page
	 *  has no heading or nothing is left before it after filtering
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		// Match the start of the opening tag only, so headings carrying attributes
		// (e.g. <h2 class="...">) are recognized as well as bare <h2>.
		if ( !preg_match( '/<h[1-6]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		// Explicit emptiness checks here and below: a plain truthiness test would
		// also discard an opening text that is just "0".
		if ( $text === false || $text === '' ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null Text before the first heading, null if there is none
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}
}