X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=blobdiff_plain;f=includes%2FHtmlFormatter.php;h=9bae8b5fefc8cdf531519966b6d992611d9a469c;hp=574977574950752ea2e39c6d42490dd8da11d60e;hb=ba00b23a1d6fea3f05c617d3df73d60ab28dfdf4;hpb=a00cef7d9f71fa2f53e683b7630d9ce90915d6be diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php index 5749775749..9bae8b5fef 100644 --- a/includes/HtmlFormatter.php +++ b/includes/HtmlFormatter.php @@ -1,7 +1,7 @@ html = $html; - } - - /** - * Turns a chunk of HTML into a proper document - * @param string $html - * @return string - */ - public static function wrapHTML( $html ) { - return '' . $html . ''; - } - - /** - * Override this in descendant class to modify HTML after it has been converted from DOM tree - * @param string $html HTML to process - * @return string Processed HTML - */ - protected function onHtmlReady( $html ) { - return $html; - } - - /** - * @return DOMDocument DOM to manipulate - */ - public function getDoc() { - if ( !$this->doc ) { - // DOMDocument::loadHTML apparently isn't very good with encodings, so - // convert input to ASCII by encoding everything above 128 as entities. - if ( function_exists( 'mb_convert_encoding' ) ) { - $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); - } else { - $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) { - return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';'; - }, $this->html ); - } - - // Workaround for bug that caused spaces before references - // to disappear during processing: https://phabricator.wikimedia.org/T55086 - // TODO: Please replace with a better fix if one can be found. - $html = str_replace( ' <', ' <', $html ); - - libxml_use_internal_errors( true ); - $loader = libxml_disable_entity_loader(); - $this->doc = new DOMDocument(); - $this->doc->strictErrorChecking = false; - $this->doc->loadHTML( $html ); - libxml_disable_entity_loader( $loader ); - libxml_use_internal_errors( false ); - $this->doc->encoding = 'UTF-8'; - } - return $this->doc; - } - - /** - * Sets whether images/videos/sounds should be removed from output - * @param bool $flag - */ - public function setRemoveMedia( $flag = true ) { - $this->removeMedia = $flag; - } - - /** - * Adds one or more selector of content to remove. A subset of CSS selector - * syntax is supported: - * - * - * .class - * . - * # - * - * @param array|string $selectors Selector(s) of stuff to remove - */ - public function remove( $selectors ) { - $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors ); - } - - /** - * Adds one or more element name to the list to flatten (remove tag, but not its content) - * Can accept undelimited regexes - * - * Note this interface may fail in surprising unexpected ways due to usage of regexes, - * so should not be relied on for HTML markup security measures. - * - * @param array|string $elements Name(s) of tag(s) to flatten - */ - public function flatten( $elements ) { - $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements ); - } - - /** - * Instructs the formatter to flatten all tags - */ - public function flattenAllTags() { - $this->flatten( '[?!]?[a-z0-9]+' ); - } - - /** - * Removes content we've chosen to remove. The text of the removed elements can be - * extracted with the getText method. - * @return array Array of removed DOMElements - */ - public function filterContent() { - $removals = $this->parseItemsToRemove(); - - // Bail out early if nothing to do - if ( array_reduce( $removals, - function ( $carry, $item ) { - return $carry && !$item; - }, - true - ) ) { - return []; - } - - $doc = $this->getDoc(); - - // Remove tags - - // You can't remove DOMNodes from a DOMNodeList as you're iterating - // over them in a foreach loop. It will seemingly leave the internal - // iterator on the foreach out of wack and results will be quite - // strange. Though, making a queue of items to remove seems to work. - $domElemsToRemove = []; - foreach ( $removals['TAG'] as $tagToRemove ) { - $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove ); - foreach ( $tagToRemoveNodes as $tagToRemoveNode ) { - if ( $tagToRemoveNode ) { - $domElemsToRemove[] = $tagToRemoveNode; - } - } - } - $removed = $this->removeElements( $domElemsToRemove ); - - // Elements with named IDs - $domElemsToRemove = []; - foreach ( $removals['ID'] as $itemToRemove ) { - $itemToRemoveNode = $doc->getElementById( $itemToRemove ); - if ( $itemToRemoveNode ) { - $domElemsToRemove[] = $itemToRemoveNode; - } - } - $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); - - // CSS Classes - $domElemsToRemove = []; - $xpath = new DOMXPath( $doc ); - foreach ( $removals['CLASS'] as $classToRemove ) { - $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' ); - - /** @var $element DOMElement */ - foreach ( $elements as $element ) { - $classes = $element->getAttribute( 'class' ); - if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) { - $domElemsToRemove[] = $element; - } - } - } - $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); - - // Tags with CSS Classes - foreach ( $removals['TAG_CLASS'] as $classToRemove ) { - $parts = explode( '.', $classToRemove ); - - $elements = $xpath->query( - '//' . $parts[0] . '[@class="' . $parts[1] . '"]' - ); - $removed = array_merge( $removed, $this->removeElements( $elements ) ); - } - - return $removed; - } - - /** - * Removes a list of elelments from DOMDocument - * @param array|DOMNodeList $elements - * @return array Array of removed elements - */ - private function removeElements( $elements ) { - $list = $elements; - if ( $elements instanceof DOMNodeList ) { - $list = []; - foreach ( $elements as $element ) { - $list[] = $element; - } - } - /** @var $element DOMElement */ - foreach ( $list as $element ) { - if ( $element->parentNode ) { - $element->parentNode->removeChild( $element ); - } - } - return $list; - } - - /** - * libxml in its usual pointlessness converts many chars to entities - this function - * perfoms a reverse conversion - * @param string $html - * @return string - */ - private function fixLibXML( $html ) { - static $replacements; - if ( !$replacements ) { - // We don't include rules like '"' => '&quot;' because entities had already been - // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE! - $replacements = new ReplacementArray( [ - '"' => '&quot;', - '&' => '&amp;', - '<' => '&lt;', - '>' => '&gt;', - ] ); - } - $html = $replacements->replace( $html ); - - if ( function_exists( 'mb_convert_encoding' ) ) { - // Just in case the conversion in getDoc() above used named - // entities that aren't known to html_entity_decode(). - $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); - } else { - $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' ); - } - return $html; - } - - /** - * Performs final transformations and returns resulting HTML. Note that if you want to call this - * both without an element and with an element you should call it without an element first. If you - * specify the $element in the method it'll change the underlying dom and you won't be able to get - * it back. - * - * @param DOMElement|string|null $element ID of element to get HTML from or - * false to get it from the whole tree - * @return string Processed HTML - */ - public function getText( $element = null ) { - - if ( $this->doc ) { - if ( $element !== null && !( $element instanceof DOMElement ) ) { - $element = $this->doc->getElementById( $element ); - } - if ( $element ) { - $body = $this->doc->getElementsByTagName( 'body' )->item( 0 ); - $nodesArray = []; - foreach ( $body->childNodes as $node ) { - $nodesArray[] = $node; - } - foreach ( $nodesArray as $nodeArray ) { - $body->removeChild( $nodeArray ); - } - $body->appendChild( $element ); - } - $html = $this->doc->saveHTML(); - - $html = $this->fixLibXML( $html ); - if ( wfIsWindows() ) { - // Cleanup for CRLF misprocessing of unknown origin on Windows. - // If this error continues in the future, please track it down in the - // XML code paths if possible and fix there. - $html = str_replace( ' ', '', $html ); - } - } else { - $html = $this->html; - } - // Remove stuff added by wrapHTML() - $html = preg_replace( '/|^.*?|<\/body>.*$/s', '', $html ); - $html = $this->onHtmlReady( $html ); - - if ( $this->elementsToFlatten ) { - $elements = implode( '|', $this->elementsToFlatten ); - $html = preg_replace( "#]*>#is", '', $html ); - } - - return $html; - } - - /** - * Helper function for parseItemsToRemove(). This function extracts the selector type - * and the raw name of a selector from a CSS-style selector string and assigns those - * values to parameters passed by reference. For example, if given '#toc' as the - * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName. - * @param string $selector CSS selector to parse - * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG) - * @param string $rawName The raw name of the selector - * @return bool Whether the selector was successfully recognised - * @throws MWException - */ - protected function parseSelector( $selector, &$type, &$rawName ) { - if ( strpos( $selector, '.' ) === 0 ) { - $type = 'CLASS'; - $rawName = substr( $selector, 1 ); - } elseif ( strpos( $selector, '#' ) === 0 ) { - $type = 'ID'; - $rawName = substr( $selector, 1 ); - } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) { - $type = 'TAG_CLASS'; - $rawName = $selector; - } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) { - $type = 'TAG'; - $rawName = $selector; - } else { - throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" ); - } - - return true; - } - - /** - * Transforms CSS-style selectors into an internal representation suitable for - * processing by filterContent() - * @return array - */ - protected function parseItemsToRemove() { - $removals = [ - 'ID' => [], - 'TAG' => [], - 'CLASS' => [], - 'TAG_CLASS' => [], - ]; - - foreach ( $this->itemsToRemove as $itemToRemove ) { - $type = ''; - $rawName = ''; - if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) { - $removals[$type][] = $rawName; - } - } - - if ( $this->removeMedia ) { - $removals['TAG'][] = 'img'; - $removals['TAG'][] = 'audio'; - $removals['TAG'][] = 'video'; - } - - return $removals; - } +class HtmlFormatter extends HtmlFormatter\HtmlFormatter { }