Add an "editor" extension type
[lhc/web/wiklou.git] / includes / content / WikiTextStructure.php
1 <?php
2
3 use HtmlFormatter\HtmlFormatter;
4
5 /**
6 * Class allowing to explore structure of parsed wikitext.
7 */
class WikiTextStructure {
	/**
	 * @var string|null Text found before the first heading. Null until
	 *  extractWikitextParts() has run, and also when no such text exists.
	 */
	private $openingText;
	/**
	 * @var string|null Main text with excluded and auxiliary elements removed.
	 *  Null until extractWikitextParts() has run.
	 */
	private $allText;
	/**
	 * @var string[] Text of each auxiliary element that was pulled out of the main text
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references
		'sup.reference',
		// The ↑ next to references in the references section
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable'
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parsed page whose structure is explored
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
	 * reference. And since the data looks like:
	 *  Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings, in section order, minus the ignored ones
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $heading ) {
			$heading = $heading[ 'line' ];

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. The comparison is strict so that numeric-looking
			// headings are matched literally ("1000" must not be ignored because of "1e3").
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Everything from a "#" to the end of its line is treated as a comment and dropped,
	 * surrounding whitespace is trimmed, and lines that end up empty are removed.
	 * The keys of the returned array are the original zero-based line indexes.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
		$lines = array_map( 'trim', $lines ); // Remove extra spaces
		// Remove empty lines. Compare against '' explicitly: a bare array_filter()
		// would also throw away a legitimate "0" entry because it is falsy.
		return array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
	}

	/**
	 * Get list of heading to ignore.
	 *
	 * Read from the search-ignored-headings message in the content language, falling
	 * back to cirrussearch-ignored-headings when the former is blank. The result is
	 * kept in a function-level static, so it is computed once and then reused by
	 * every later call, whichever instance makes it.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( $source->isBlank() ) {
				// Try old version too, just in case
				$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			}
			if ( !$source->isDisabled() ) {
				$lines = self::parseSettingsInMessage( $source->plain() );
				$ignoredHeadings = $lines; // Now we just have headings!
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Fills $openingText, $allText and $auxText. The work is done only once:
	 * a non-null $allText marks the extraction as already performed.
	 */
	private function extractWikitextParts() {
		if ( !is_null( $this->allText ) ) {
			return;
		}
		$text = $this->parserOutput->getText( [
			'enableSectionEditTokens' => false,
			'allowTOC' => false,
		] );
		if ( strlen( $text ) === 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		// Each element returned by the second filtering pass becomes its own
		// auxiliary text entry.
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text Rendered HTML of the page
	 * @return string|null Plain text preceding the first heading, or null when the page
	 *  has no heading or nothing but filtered-out content precedes the first one
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		// Match the opening of a heading tag whether or not it carries attributes
		// (<h2> as well as <h2 class="...">).
		if ( !preg_match( '/<h[1-6][\s>]/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset at which the first heading tag starts.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		if ( !$text ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( !$text ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null Null when the page has no opening text
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the defaultsort property
	 * @return string|null
	 */
	public function getDefaultSort() {
		return $this->parserOutput->getProperty( 'defaultsort' );
	}
}