Move HtmlFormatter from MobileFrontend
[lhc/web/wiklou.git] / includes / HtmlFormatter.php
1 <?php
2 /**
3 * Performs transformations of HTML by wrapping around libxml2 and working
4 * around its countless bugs.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 */
class HtmlFormatter {
	/**
	 * DOM built lazily from $html by getDoc(); stays unset until then.
	 * @var DOMDocument
	 */
	private $doc;

	/** @var string HTML passed to the constructor */
	private $html;
	/** @var array Selectors queued by remove() */
	private $itemsToRemove = array();
	/** @var array Tag names (or undelimited regexes) queued by flatten() */
	private $elementsToFlatten = array();
	/** @var bool When true, img/audio/video tags are added to the removal list */
	protected $removeMedia = false;

	/**
	 * Constructor
	 *
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html
	 * @return string
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Returns the DOM for the HTML given to the constructor, parsing it on first use.
	 *
	 * The libxml "internal errors" flag is switched on while parsing and put back
	 * to whatever the caller had before, instead of being forced to false.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			$html = str_replace( ' <', '&#32;<', $html );

			$previousSetting = libxml_use_internal_errors( true );
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			// Drop the parse errors collected above, then restore the caller's setting
			libxml_clear_errors();
			libxml_use_internal_errors( $previousSetting );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove
	 * @param Array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 * @param Array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove
	 *
	 * Works on the DOM from getDoc() and removes, in this order: elements by tag
	 * name, by ID, by CSS class, and by "tag.class" selectors.
	 */
	public function filterContent() {
		wfProfileIn( __METHOD__ );
		$removals = $this->parseItemsToRemove();

		// parseItemsToRemove() as implemented in this class always returns its four
		// type keys, so this only triggers if an overriding implementation returns
		// an empty value.
		if ( !$removals ) {
			wfProfileOut( __METHOD__ );
			return;
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = array();
		foreach ( $removals['TAG'] as $tagToRemove ) {
			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}

		$this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = array();
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$this->removeElements( $domElemsToRemove );

		// CSS Classes
		$domElemsToRemove = array();
		$xpath = new DOMXpath( $doc );
		if ( $removals['CLASS'] ) {
			// The class names are compared in PHP against the whitespace-separated
			// tokens of the attribute, so they never get interpolated into an XPath
			// expression or a regex, and "foo" does not match class="foo-bar".
			$elements = $xpath->query( '//*[@class]' );
			if ( $elements !== false ) {
				/** @var $element DOMElement */
				foreach ( $elements as $element ) {
					$classes = preg_split(
						'/\s+/',
						$element->getAttribute( 'class' ),
						-1,
						PREG_SPLIT_NO_EMPTY
					);
					if ( array_intersect( $classes, $removals['CLASS'] ) && $element->parentNode ) {
						$domElemsToRemove[] = $element;
					}
				}
			}
		}
		$this->removeElements( $domElemsToRemove );

		// Tags with CSS Classes
		// Note: the class attribute has to equal the given class exactly, and only
		// the first class after the tag name is looked at.
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);

			// query() gives false for an expression it cannot evaluate
			if ( $elements !== false ) {
				$this->removeElements( $elements );
			}
		}

		wfProfileOut( __METHOD__ );
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy first: nodes are not removed while iterating the DOMNodeList itself
			$list = array();
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Skip nodes that are already detached
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		wfProfileIn( __METHOD__ );
		static $replacements;
		if ( !$replacements ) {
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		// Double-escape the entities listed above before the conversion below
		$html = $replacements->replace( $html );
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML
	 *
	 * If the DOM has been built (see getDoc()), the HTML is serialized from it;
	 * otherwise the original HTML string is used and $element is ignored.
	 * Comments and everything outside of <body> are stripped, onHtmlReady() is
	 * applied, and finally the tags registered through flatten() are removed.
	 *
	 * @param DOMElement|string|null $element Element (or ID of element) to get HTML from,
	 *   or null to get it from the whole tree
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		wfProfileIn( __METHOD__ );

		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				// Without a <body> there is nothing to re-root, serialize the document as is
				if ( $body ) {
					// Collect first, then remove, to avoid changing childNodes while iterating
					$nodesArray = array();
					foreach ( $body->childNodes as $node ) {
						$nodesArray[] = $node;
					}
					foreach ( $nodesArray as $nodeArray ) {
						$body->removeChild( $nodeArray );
					}
					// Make the requested element the only child of <body>
					$body->appendChild( $element );
				}
			}
			$html = $this->doc->saveHTML();
			$html = $this->fixLibXML( $html );
		} else {
			$html = $this->html;
		}
		if ( wfIsWindows() ) {
			$html = str_replace( '&#13;', '', $html );
		}
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		if ( $this->elementsToFlatten ) {
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}

		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Classifies a selector as 'CLASS' (".name"), 'ID' ("#name"), 'TAG_CLASS'
	 * ("tag.name") or 'TAG' ("tag"). For CLASS and ID the leading character is
	 * stripped from the name, the other types keep the selector unchanged.
	 *
	 * @param string $selector CSS selector to parse
	 * @param string &$type Set to the selector type
	 * @param string &$rawName Set to the name to look for
	 * @return bool Whether the selector was successfully recognised
	 * @throws MWException If the selector is not of a supported form
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPosition = strpos( $selector, '.' );

		if ( $dotPosition === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPosition !== false ) {
			// A dot somewhere after the first character
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false
			&& strpos( $selector, ']' ) === false )
		{
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS selectors into an internal representation suitable for processing
	 * @return array Lists of names keyed by 'ID', 'TAG', 'CLASS' and 'TAG_CLASS'
	 */
	protected function parseItemsToRemove() {
		wfProfileIn( __METHOD__ );
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		wfProfileOut( __METHOD__ );
		return $removals;
	}
}