Merge "SpecialMovepage: Convert form to use OOUI controls"
[lhc/web/wiklou.git] / includes / HtmlFormatter.php
1 <?php
2 /**
3 * Performs transformations of HTML by wrapping around libxml2 and working
4 * around its countless bugs.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 */
class HtmlFormatter {
	/**
	 * Lazily-built DOM for $html; stays null until getDoc() is first called.
	 * @var DOMDocument
	 */
	private $doc;

	/** @var string Raw HTML passed to the constructor */
	private $html;

	/** @var string[] CSS-style selectors queued by remove() */
	private $itemsToRemove = array();

	/** @var string[] Tag names / undelimited regex fragments queued by flatten() */
	private $elementsToFlatten = array();

	/** @var bool Whether img/audio/video tags are added to the removal list */
	protected $removeMedia = false;

	/**
	 * Constructor
	 *
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html
	 * @return string
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Returns the DOM for the HTML given to the constructor, parsing it on first use.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			// DOMDocument::loadHTML apparently isn't very good with encodings, so
			// convert input to ASCII by encoding everything above 128 as entities.
			if ( function_exists( 'mb_convert_encoding' ) ) {
				$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
			} else {
				$html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
					return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
				}, $this->html );
			}

			// Workaround for bug that caused spaces before references
			// to disappear during processing:
			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			//
			// Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Buffer libxml parse errors instead of emitting warnings, remembering the
			// caller's setting so that it can be put back afterwards.
			$previousErrorMode = libxml_use_internal_errors( true );

			// libxml >= 2.9.0 does not load external entities by default, and
			// libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only
			// toggle the loader on older libxml versions.
			$toggleEntityLoader = LIBXML_VERSION < 20900;
			if ( $toggleEntityLoader ) {
				$loader = libxml_disable_entity_loader();
			}

			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );

			if ( $toggleEntityLoader ) {
				libxml_disable_entity_loader( $loader );
			}

			// Drop the errors buffered by this parse, then restore the previous mode
			// rather than unconditionally switching internal error handling off.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousErrorMode );

			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.class
	 *   .<class>
	 *   #<id>
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 * @return array Array of removed DOMElements
	 */
	public function filterContent() {
		$removals = $this->parseItemsToRemove();

		// Bail out early (without building the DOM) if every removal list is empty
		$nothingToRemove = true;
		foreach ( $removals as $names ) {
			if ( $names ) {
				$nothingToRemove = false;
				break;
			}
		}
		if ( $nothingToRemove ) {
			return array();
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = array();
		foreach ( $removals['TAG'] as $tagToRemove ) {
			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = array();
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = array();
		$xpath = new DOMXPath( $doc );
		if ( $removals['CLASS'] ) {
			// Collect every element carrying a class attribute once and match the
			// class tokens in PHP. This avoids interpolating the class name into an
			// XPath string or a regex, and makes ".foo" match only the exact token
			// "foo" (not e.g. "foo-bar", which a \b-delimited regex would accept).
			$classedElements = array();
			foreach ( $xpath->query( '//*[@class]' ) as $element ) {
				$classedElements[] = $element;
			}

			foreach ( $removals['CLASS'] as $classToRemove ) {
				/** @var $element DOMElement */
				foreach ( $classedElements as $element ) {
					$classes = preg_split( '/\s+/', trim( $element->getAttribute( 'class' ) ) );
					if ( in_array( $classToRemove, $classes, true ) && $element->parentNode ) {
						$domElemsToRemove[] = $element;
					}
				}
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// Tags with CSS Classes
		// Note: this matches only elements whose class attribute is exactly the given
		// class name; only the first class after the tag name is considered.
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);
			$removed = array_merge( $removed, $this->removeElements( $elements ) );
		}

		return $removed;
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 * @return array Array of removed elements
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy into a plain array first: removing nodes while iterating a live
			// DOMNodeList confuses the iterator.
			$list = array();
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Skip nodes already detached (e.g. removed along with an ancestor)
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		static $replacements;
		if ( !$replacements ) {
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		// Double-escape the markup-significant entities so that the decoding step
		// below turns them back into single-escaped entities instead of raw characters.
		$html = $replacements->replace( $html );

		if ( function_exists( 'mb_convert_encoding' ) ) {
			// Just in case the conversion in getDoc() above used named
			// entities that aren't known to html_entity_decode().
			$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		} else {
			$html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
		}
		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built (getDoc() was not called), the original HTML string is
	 * used as the starting point instead of a serialized DOM.
	 *
	 * @param DOMElement|string|null $element Element, or ID of element, to get HTML from;
	 *   null to get it from the whole tree
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				// Replace the whole body content with just the requested element
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				$nodesArray = array();
				foreach ( $body->childNodes as $node ) {
					$nodesArray[] = $node;
				}
				foreach ( $nodesArray as $nodeArray ) {
					$body->removeChild( $nodeArray );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();

			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				//
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				$html = str_replace( '&#13;', '', $html );
			}
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML()
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		if ( $this->elementsToFlatten ) {
			// Strip the opening and closing tags of flattened elements, keeping their content
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}

		return $html;
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 * @param string $selector CSS selector to parse
	 * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string $rawName The raw name of the selector
	 * @return bool Always true; unrecognised selectors raise an exception instead
	 * @throws MWException If the selector is not one of the supported forms
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPos = strpos( $selector, '.' );

		if ( $dotPos === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPos !== false ) {
			// A dot somewhere after the first character: "tag.class"
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 * @return array Map of selector type (ID, TAG, CLASS, TAG_CLASS) to list of raw names
	 */
	protected function parseItemsToRemove() {
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		return $removals;
	}
}