ResourceLoaderLanguageDataModule: Clean up useless methods and fragile state
[lhc/web/wiklou.git] / includes / HtmlFormatter.php
<?php
/**
 * Performs transformations of HTML by wrapping around libxml2 and working
 * around its countless bugs.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */
/**
 * Removes or flattens parts of an HTML string by way of a DOMDocument.
 *
 * Typical usage: construct with the HTML, register selectors with remove()
 * and/or tag names with flatten(), call filterContent(), then read the
 * result back with getText().
 */
class HtmlFormatter {
	/**
	 * @var DOMDocument|null Created on first use by getDoc()
	 */
	private $doc;

	/** @var string HTML handed to the constructor */
	private $html;
	/** @var string[] Selectors registered through remove() */
	private $itemsToRemove = array();
	/** @var string[] Tag names (undelimited regexes) registered through flatten() */
	private $elementsToFlatten = array();
	/** @var bool Whether img, audio and video tags are added to the removal list */
	protected $removeMedia = false;

	/**
	 * Constructor
	 *
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document by surrounding it with
	 * a doctype and html/head/body tags
	 * @param string $html
	 * @return string
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree.
	 * getText() calls it before flattening tags; the base implementation returns
	 * its input untouched.
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Returns the DOM for the HTML given to the constructor, parsing it on the
	 * first call and reusing the same document afterwards.
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// Workaround for bug that caused spaces before references
			// to disappear during processing:
			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			//
			// Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Collect parse errors internally while loading, remembering the
			// caller's settings so that they can be put back afterwards.
			$previousErrorMode = libxml_use_internal_errors( true );
			$loader = libxml_disable_entity_loader();
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			libxml_disable_entity_loader( $loader );
			// Nobody reads the errors collected during loadHTML(); drop them and
			// restore whatever mode was active before instead of forcing it off.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousErrorMode );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.class
	 *   .<class>
	 *   #<id>
	 *
	 * Selectors accumulate across calls; they are only parsed and applied by
	 * filterContent().
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 *
	 * Class names are compared against the whitespace-separated tokens of the
	 * class attribute, both for ".class" and for "tag.class" selectors.
	 *
	 * When nothing has been registered for removal the DOM is not built at all.
	 *
	 * @return array of removed DOMElements
	 */
	public function filterContent() {
		wfProfileIn( __METHOD__ );
		$removals = $this->parseItemsToRemove();

		// parseItemsToRemove() always returns its four type keys, so the array
		// itself is never empty; look for a non-empty list inside it instead.
		if ( !array_filter( $removals ) ) {
			wfProfileOut( __METHOD__ );
			return array();
		}

		$doc = $this->getDoc();

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop, so every pass below first queues its
		// matches and only then removes them.

		// Tags
		$queue = array();
		foreach ( $removals['TAG'] as $tagName ) {
			foreach ( $doc->getElementsByTagName( $tagName ) as $node ) {
				if ( $node ) {
					$queue[] = $node;
				}
			}
		}
		$removed = $this->removeElements( $queue );

		// Elements with named IDs
		$queue = array();
		foreach ( $removals['ID'] as $id ) {
			$node = $doc->getElementById( $id );
			if ( $node ) {
				$queue[] = $node;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $queue ) );

		// CSS classes
		if ( $removals['CLASS'] ) {
			// Fetch every element carrying a class attribute once and compare
			// class tokens in PHP. This keeps the class name out of XPath and
			// regex syntax and stops "foo" from matching "foo-bar".
			$candidates = array();
			$xpath = new DOMXPath( $doc );
			foreach ( $xpath->query( '//*[@class]' ) as $element ) {
				$candidates[] = array( $element, $this->getClassNames( $element ) );
			}

			$queue = array();
			foreach ( $removals['CLASS'] as $className ) {
				foreach ( $candidates as $candidate ) {
					if ( in_array( $className, $candidate[1], true ) ) {
						$queue[] = $candidate[0];
					}
				}
			}
			$removed = array_merge( $removed, $this->removeElements( $queue ) );
		}

		// Tags with CSS classes
		foreach ( $removals['TAG_CLASS'] as $selector ) {
			$classNames = explode( '.', $selector );
			$tagName = array_shift( $classNames );
			$classNames = array_filter( $classNames, 'strlen' );
			if ( !$classNames ) {
				// Something like "div." names no class, so there is nothing to match
				continue;
			}

			$queue = array();
			foreach ( $doc->getElementsByTagName( $tagName ) as $element ) {
				// Every class named in the selector has to be present on the element
				if ( !array_diff( $classNames, $this->getClassNames( $element ) ) ) {
					$queue[] = $element;
				}
			}
			$removed = array_merge( $removed, $this->removeElements( $queue ) );
		}

		wfProfileOut( __METHOD__ );
		return $removed;
	}

	/**
	 * Splits the class attribute of an element into individual class names
	 * @param DOMElement $element
	 * @return string[] Empty when the attribute is missing or blank
	 */
	private function getClassNames( DOMElement $element ) {
		return preg_split( '/\s+/', $element->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY );
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 * @return array of removed elements
	 */
	private function removeElements( $elements ) {
		// Copy a node list into a plain array before touching the tree, so that
		// the removals below cannot disturb the iteration.
		$list = $elements instanceof DOMNodeList
			? iterator_to_array( $elements, false )
			: $elements;

		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Skip nodes that are already detached
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		wfProfileIn( __METHOD__ );
		static $replacements;
		if ( !$replacements ) {
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		// The four markup entities are escaped once more first, so that the
		// entity decoding below leaves them as entities.
		$html = $replacements->replace( $html );
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built, the HTML given to the constructor is used as is.
	 *
	 * @param DOMElement|string|null $element Element, or ID of element, to get HTML from;
	 *   null to get it from the whole tree. An ID that is not found also yields the whole tree.
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		wfProfileIn( __METHOD__ );

		if ( $this->doc ) {
			wfProfileIn( __METHOD__ . '-dom' );
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				// Make the requested element the only child of <body>. The children
				// are copied to an array first because nodes must not be removed
				// from a node list while it is being iterated.
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				foreach ( iterator_to_array( $body->childNodes, false ) as $child ) {
					$body->removeChild( $child );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();
			wfProfileOut( __METHOD__ . '-dom' );

			wfProfileIn( __METHOD__ . '-fixes' );
			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				//
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				$html = str_replace( '&#13;', '', $html );
			}
			// Strip comments plus everything up to <body> and from </body> onwards
			$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
			wfProfileOut( __METHOD__ . '-fixes' );
		} else {
			$html = $this->html;
		}
		$html = $this->onHtmlReady( $html );

		wfProfileIn( __METHOD__ . '-flatten' );
		if ( $this->elementsToFlatten ) {
			// Drop opening and closing tags of the listed elements, keeping their content
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}
		wfProfileOut( __METHOD__ . '-flatten' );

		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Classifies a selector as CLASS, ID, TAG_CLASS or TAG.
	 *
	 * @param string $selector CSS selector to parse
	 * @param string &$type Set to the selector type
	 * @param string &$rawName Set to the name without its leading "." or "#";
	 *   TAG and TAG_CLASS selectors are passed through unchanged
	 * @return bool Always true; unrecognised selectors raise an exception instead
	 * @throws MWException If the selector fits none of the supported forms
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPosition = strpos( $selector, '.' );

		if ( $dotPosition === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPosition !== false ) {
			// A dot anywhere but at the start: tag.class
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS selectors into an internal representation suitable for processing
	 * @return array Lists of names keyed by 'ID', 'TAG', 'CLASS' and 'TAG_CLASS';
	 *   all four keys are always present
	 */
	protected function parseItemsToRemove() {
		wfProfileIn( __METHOD__ );
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		wfProfileOut( __METHOD__ );
		return $removals;
	}
}