includes/HtmlFormatter.php

   1 <?php
   2 /**
   3  * Performs transformations of HTML by wrapping around libxml2 and working
   4  * around its countless bugs.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with this program; if not, write to the Free Software Foundation, Inc.,
  18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19  * http://www.gnu.org/copyleft/gpl.html
  20  *
  21  * @file
  22  */
  23 class HtmlFormatter {
  24         /**
  25          * @var DOMDocument
  26          */
  27         private $doc;
  28
  29         private $html;
  30         private $itemsToRemove = array();
  31         private $elementsToFlatten = array();
  32         protected $removeMedia = false;
  33
  34         /**
  35          * Constructor
  36          *
  37          * @param string $html Text to process
  38          */
  39         public function __construct( $html ) {
  40                 $this->html = $html;
  41         }
  42
  43         /**
  44          * Turns a chunk of HTML into a proper document
  45          * @param string $html
  46          * @return string
  47          */
  48         public static function wrapHTML( $html ) {
  49                 return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
  50         }
  51
  52         /**
  53          * Override this in descendant class to modify HTML after it has been converted from DOM tree
  54          * @param string $html HTML to process
  55          * @return string Processed HTML
  56          */
  57         protected function onHtmlReady( $html ) {
  58                 return $html;
  59         }
  60
  61         /**
  62          * @return DOMDocument DOM to manipulate
  63          */
  64         public function getDoc() {
  65                 if ( !$this->doc ) {
  66                         // DOMDocument::loadHTML apparently isn't very good with encodings, so
  67                         // convert input to ASCII by encoding everything above 128 as entities.
  68                         if ( function_exists( 'mb_convert_encoding' ) ) {
  69                                 $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
  70                         } else {
  71                                 $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
  72                                         return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
  73                                 }, $this->html );
  74                         }
  75
  76                         // Workaround for bug that caused spaces before references
  77                         // to disappear during processing:
  78                         // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
  79                         //
  80                         // Please replace with a better fix if one can be found.
  81                         $html = str_replace( ' <', '&#32;<', $html );
  82
  83                         libxml_use_internal_errors( true );
  84                         $loader = libxml_disable_entity_loader();
  85                         $this->doc = new DOMDocument();
  86                         $this->doc->strictErrorChecking = false;
  87                         $this->doc->loadHTML( $html );
  88                         libxml_disable_entity_loader( $loader );
  89                         libxml_use_internal_errors( false );
  90                         $this->doc->encoding = 'UTF-8';
  91                 }
  92                 return $this->doc;
  93         }
  94
  95         /**
  96          * Sets whether images/videos/sounds should be removed from output
  97          * @param bool $flag
  98          */
  99         public function setRemoveMedia( $flag = true ) {
 100                 $this->removeMedia = $flag;
 101         }
 102
 103         /**
 104          * Adds one or more selector of content to remove. A subset of CSS selector
 105          * syntax is supported:
 106          *
 107          *   <tag>
 108          *   <tag>.class
 109          *   .<class>
 110          *   #<id>
 111          *
 112          * @param array|string $selectors Selector(s) of stuff to remove
 113          */
 114         public function remove( $selectors ) {
 115                 $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
 116         }
 117
 118         /**
 119          * Adds one or more element name to the list to flatten (remove tag, but not its content)
 120          * Can accept undelimited regexes
 121          *
 122          * Note this interface may fail in surprising unexpected ways due to usage of regexes,
 123          * so should not be relied on for HTML markup security measures.
 124          *
 125          * @param array|string $elements Name(s) of tag(s) to flatten
 126          */
 127         public function flatten( $elements ) {
 128                 $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
 129         }
 130
 131         /**
 132          * Instructs the formatter to flatten all tags
 133          */
 134         public function flattenAllTags() {
 135                 $this->flatten( '[?!]?[a-z0-9]+' );
 136         }
 137
 138         /**
 139          * Removes content we've chosen to remove.  The text of the removed elements can be
 140          * extracted with the getText method.
 141          * @return array Array of removed DOMElements
 142          */
 143         public function filterContent() {
 144                 $removals = $this->parseItemsToRemove();
 145
 146                 // Bail out early if nothing to do
 147                 if ( array_reduce( $removals,
 148                         function ( $carry, $item ) {
 149                                 return $carry && !$item;
 150                         },
 151                         true
 152                 ) ) {
 153                         return array();
 154                 }
 155
 156                 $doc = $this->getDoc();
 157
 158                 // Remove tags
 159
 160                 // You can't remove DOMNodes from a DOMNodeList as you're iterating
 161                 // over them in a foreach loop. It will seemingly leave the internal
 162                 // iterator on the foreach out of wack and results will be quite
 163                 // strange. Though, making a queue of items to remove seems to work.
 164                 $domElemsToRemove = array();
 165                 foreach ( $removals['TAG'] as $tagToRemove ) {
 166                         $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
 167                         foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
 168                                 if ( $tagToRemoveNode ) {
 169                                         $domElemsToRemove[] = $tagToRemoveNode;
 170                                 }
 171                         }
 172                 }
 173                 $removed = $this->removeElements( $domElemsToRemove );
 174
 175                 // Elements with named IDs
 176                 $domElemsToRemove = array();
 177                 foreach ( $removals['ID'] as $itemToRemove ) {
 178                         $itemToRemoveNode = $doc->getElementById( $itemToRemove );
 179                         if ( $itemToRemoveNode ) {
 180                                 $domElemsToRemove[] = $itemToRemoveNode;
 181                         }
 182                 }
 183                 $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 184
 185                 // CSS Classes
 186                 $domElemsToRemove = array();
 187                 $xpath = new DOMXPath( $doc );
 188                 foreach ( $removals['CLASS'] as $classToRemove ) {
 189                         $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
 190
 191                         /** @var $element DOMElement */
 192                         foreach ( $elements as $element ) {
 193                                 $classes = $element->getAttribute( 'class' );
 194                                 if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) {
 195                                         $domElemsToRemove[] = $element;
 196                                 }
 197                         }
 198                 }
 199                 $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 200
 201                 // Tags with CSS Classes
 202                 foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
 203                         $parts = explode( '.', $classToRemove );
 204
 205                         $elements = $xpath->query(
 206                                 '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
 207                         );
 208                         $removed = array_merge( $removed, $this->removeElements( $elements ) );
 209                 }
 210
 211                 return $removed;
 212         }
 213
 214         /**
 215          * Removes a list of elelments from DOMDocument
 216          * @param array|DOMNodeList $elements
 217          * @return array Array of removed elements
 218          */
 219         private function removeElements( $elements ) {
 220                 $list = $elements;
 221                 if ( $elements instanceof DOMNodeList ) {
 222                         $list = array();
 223                         foreach ( $elements as $element ) {
 224                                 $list[] = $element;
 225                         }
 226                 }
 227                 /** @var $element DOMElement */
 228                 foreach ( $list as $element ) {
 229                         if ( $element->parentNode ) {
 230                                 $element->parentNode->removeChild( $element );
 231                         }
 232                 }
 233                 return $list;
 234         }
 235
 236         /**
 237          * libxml in its usual pointlessness converts many chars to entities - this function
 238          * perfoms a reverse conversion
 239          * @param string $html
 240          * @return string
 241          */
 242         private function fixLibXML( $html ) {
 243                 static $replacements;
 244                 if ( !$replacements ) {
 245                         // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
 246                         // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
 247                         $replacements = new ReplacementArray( array(
 248                                 '&quot;' => '&amp;quot;',
 249                                 '&amp;' => '&amp;amp;',
 250                                 '&lt;' => '&amp;lt;',
 251                                 '&gt;' => '&amp;gt;',
 252                         ) );
 253                 }
 254                 $html = $replacements->replace( $html );
 255
 256                 if ( function_exists( 'mb_convert_encoding' ) ) {
 257                         // Just in case the conversion in getDoc() above used named
 258                         // entities that aren't known to html_entity_decode().
 259                         $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
 260                 } else {
 261                         $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
 262                 }
 263                 return $html;
 264         }
 265
 266         /**
 267          * Performs final transformations and returns resulting HTML.  Note that if you want to call this
 268          * both without an element and with an element you should call it without an element first.  If you
 269          * specify the $element in the method it'll change the underlying dom and you won't be able to get
 270          * it back.
 271          *
 272          * @param DOMElement|string|null $element ID of element to get HTML from or
 273          *   false to get it from the whole tree
 274          * @return string Processed HTML
 275          */
 276         public function getText( $element = null ) {
 277
 278                 if ( $this->doc ) {
 279                         if ( $element !== null && !( $element instanceof DOMElement ) ) {
 280                                 $element = $this->doc->getElementById( $element );
 281                         }
 282                         if ( $element ) {
 283                                 $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
 284                                 $nodesArray = array();
 285                                 foreach ( $body->childNodes as $node ) {
 286                                         $nodesArray[] = $node;
 287                                 }
 288                                 foreach ( $nodesArray as $nodeArray ) {
 289                                         $body->removeChild( $nodeArray );
 290                                 }
 291                                 $body->appendChild( $element );
 292                         }
 293                         $html = $this->doc->saveHTML();
 294
 295                         $html = $this->fixLibXml( $html );
 296                         if ( wfIsWindows() ) {
 297                                 // Cleanup for CRLF misprocessing of unknown origin on Windows.
 298                                 //
 299                                 // If this error continues in the future, please track it down in the
 300                                 // XML code paths if possible and fix there.
 301                                 $html = str_replace( '&#13;', '', $html );
 302                         }
 303                 } else {
 304                         $html = $this->html;
 305                 }
 306                 // Remove stuff added by wrapHTML()
 307                 $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
 308                 $html = $this->onHtmlReady( $html );
 309
 310                 if ( $this->elementsToFlatten ) {
 311                         $elements = implode( '|', $this->elementsToFlatten );
 312                         $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
 313                 }
 314
 315                 return $html;
 316         }
 317
 318         /**
 319          * Helper function for parseItemsToRemove(). This function extracts the selector type
 320          * and the raw name of a selector from a CSS-style selector string and assigns those
 321          * values to parameters passed by reference. For example, if given '#toc' as the
 322          * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
 323          * @param string $selector CSS selector to parse
 324          * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
 325          * @param string $rawName The raw name of the selector
 326          * @return bool Whether the selector was successfully recognised
 327          * @throws MWException
 328          */
 329         protected function parseSelector( $selector, &$type, &$rawName ) {
 330                 if ( strpos( $selector, '.' ) === 0 ) {
 331                         $type = 'CLASS';
 332                         $rawName = substr( $selector, 1 );
 333                 } elseif ( strpos( $selector, '#' ) === 0 ) {
 334                         $type = 'ID';
 335                         $rawName = substr( $selector, 1 );
 336                 } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
 337                         $type = 'TAG_CLASS';
 338                         $rawName = $selector;
 339                 } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
 340                         $type = 'TAG';
 341                         $rawName = $selector;
 342                 } else {
 343                         throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
 344                 }
 345
 346                 return true;
 347         }
 348
 349         /**
 350          * Transforms CSS-style selectors into an internal representation suitable for
 351          * processing by filterContent()
 352          * @return array
 353          */
 354         protected function parseItemsToRemove() {
 355                 $removals = array(
 356                         'ID' => array(),
 357                         'TAG' => array(),
 358                         'CLASS' => array(),
 359                         'TAG_CLASS' => array(),
 360                 );
 361
 362                 foreach ( $this->itemsToRemove as $itemToRemove ) {
 363                         $type = '';
 364                         $rawName = '';
 365                         if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
 366                                 $removals[$type][] = $rawName;
 367                         }
 368                 }
 369
 370                 if ( $this->removeMedia ) {
 371                         $removals['TAG'][] = 'img';
 372                         $removals['TAG'][] = 'audio';
 373                         $removals['TAG'][] = 'video';
 374                 }
 375
 376                 return $removals;
 377         }
 378 }