Merge "Added a separate error message for mkdir failures"
[lhc/web/wiklou.git] / includes / content / WikiTextStructure.php
1 <?php
2
3 use HtmlFormatter\HtmlFormatter;
4
/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes the pieces of the rendered page that are
 * handled separately for search: the headings, the opening text (everything
 * before the first heading), the main text and the auxiliary text (captions,
 * tables, hatnotes and similar). The text parts are extracted lazily on first
 * access and then kept on the instance.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text found before the first heading, null when there is none
	 */
	private $openingText;
	/**
	 * @var string|null Main text of the page; null until the text parts have been extracted
	 */
	private $allText;
	/**
	 * @var string[] Text of each auxiliary element that was taken out of the main text
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// The [1] for references
		'sup.reference',
		// The ↑ next to references in the references section
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable'
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parsed page whose structure is explored
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
	 * reference. And since the data looks like:
	 *  Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings in section order, without the ignored ones
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $heading ) {
			$heading = $heading[ 'line' ];

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore.
			// The comparison is strict: both sides are strings, and a loose comparison would
			// treat numeric-looking headings such as "1.0" and "1" as the same heading.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one entry. Everything from a "#" to the end of the line is a
	 * comment, surrounding whitespace is trimmed and lines that end up empty are
	 * dropped. The keys of the returned array are the original zero-based line
	 * indexes, so they are not necessarily consecutive.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );
		// Remove empty lines. Compare against '' explicitly: a bare array_filter() would
		// also throw away a line consisting of "0", which is a legitimate entry.
		return array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
	}

	/**
	 * Get list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message in the content
	 * language, falling back to the older 'cirrussearch-ignored-headings' message
	 * when the former is blank. The result is cached in a static local variable,
	 * so the messages are only consulted on the first call.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( $source->isBlank() ) {
				// Try old version too, just in case
				$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			}
			if ( !$source->isDisabled() ) {
				// After parsing, every remaining line is one heading to ignore.
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Fills $openingText, $allText and $auxText. Does nothing when the parts were
	 * already extracted, which is detected by $allText being set.
	 */
	private function extractWikitextParts() {
		if ( !is_null( $this->allText ) ) {
			return;
		}
		// Switch off edit section tokens and the table of contents on the parser output
		// before asking it for the text; presumably so neither shows up in the indexed text.
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( strlen( $text ) === 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text. These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * The text is cut at the first attribute-less <h1>..<h6> tag, then excluded and
	 * auxiliary elements are removed and the remaining markup is stripped.
	 *
	 * @param string $text Rendered HTML of the page
	 * @return string|null Plain text before the first heading, or null when there is
	 *  no heading at all or nothing is left in front of it
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		// Compare against the empty string rather than testing truthiness, otherwise
		// a text consisting of just "0" would wrongly be treated as missing.
		if ( (string)$text === '' ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 *
	 * @return string|null Text before the first heading, null when there is none
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 *
	 * @return string Page text without the excluded and auxiliary elements
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 *
	 * @return string[] One entry per auxiliary element removed from the main text
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the defaultsort property
	 *
	 * NOTE(review): the value is handed through from ParserOutput::getProperty()
	 * unchanged; confirm what that returns when the property is not set, as it
	 * may not be the null declared here.
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		return $this->parserOutput->getProperty( 'defaultsort' );
	}
}