X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FHtmlFormatter.php;h=2d81b28242da4dea4bb34d008c36f89ba2ae4b22;hb=a2ea9f8b9289f197844e3282c2ac39e59c549996;hp=ccbfba82119d67db66f82fa535f6e067b477dd9e;hpb=d78b4eeff8cdf47ba6f5da1257c4a6e9de688316;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php index ccbfba8211..2d81b28242 100644 --- a/includes/HtmlFormatter.php +++ b/includes/HtmlFormatter.php @@ -63,13 +63,19 @@ class HtmlFormatter { */ public function getDoc() { if ( !$this->doc ) { - $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); + // DOMDocument::loadHTML apparently isn't very good with encodings, so + // convert input to ASCII by encoding everything above 128 as entities. + if ( function_exists( 'mb_convert_encoding' ) ) { + $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); + } else { + $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) { + return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';'; + }, $this->html ); + } // Workaround for bug that caused spaces before references - // to disappear during processing: - // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086 - // - // Please replace with a better fix if one can be found. + // to disappear during processing: https://phabricator.wikimedia.org/T55086 + // TODO: Please replace with a better fix if one can be found. 
$html = str_replace( ' <', '&#32;<', $html ); libxml_use_internal_errors( true ); @@ -133,7 +139,6 @@ class HtmlFormatter { * @return array Array of removed DOMElements */ public function filterContent() { - wfProfileIn( __METHOD__ ); $removals = $this->parseItemsToRemove(); // Bail out early if nothing to do @@ -143,7 +148,6 @@ class HtmlFormatter { }, true ) ) { - wfProfileOut( __METHOD__ ); return array(); } @@ -178,7 +182,7 @@ class HtmlFormatter { // CSS Classes $domElemsToRemove = array(); - $xpath = new DOMXpath( $doc ); + $xpath = new DOMXPath( $doc ); foreach ( $removals['CLASS'] as $classToRemove ) { $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' ); @@ -202,7 +206,6 @@ class HtmlFormatter { $removed = array_merge( $removed, $this->removeElements( $elements ) ); } - wfProfileOut( __METHOD__ ); return $removed; } @@ -235,7 +238,6 @@ class HtmlFormatter { * @return string */ private function fixLibXML( $html ) { - wfProfileIn( __METHOD__ ); static $replacements; if ( !$replacements ) { // We don't include rules like '"' => '&quot;' because entities had already been @@ -248,8 +250,14 @@ class HtmlFormatter { ) ); } $html = $replacements->replace( $html ); - $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); - wfProfileOut( __METHOD__ ); + + if ( function_exists( 'mb_convert_encoding' ) ) { + // Just in case the conversion in getDoc() above used named + // entities that aren't known to html_entity_decode(). + $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); + } else { + $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' ); + } return $html; } @@ -264,10 +272,8 @@ class HtmlFormatter { * @return string Processed HTML */ public function getText( $element = null ) { - wfProfileIn( __METHOD__ ); if ( $this->doc ) { - wfProfileIn( __METHOD__ . 
'-dom' ); if ( $element !== null && !( $element instanceof DOMElement ) ) { $element = $this->doc->getElementById( $element ); } @@ -283,18 +289,14 @@ class HtmlFormatter { $body->appendChild( $element ); } $html = $this->doc->saveHTML(); - wfProfileOut( __METHOD__ . '-dom' ); - wfProfileIn( __METHOD__ . '-fixes' ); $html = $this->fixLibXml( $html ); if ( wfIsWindows() ) { // Cleanup for CRLF misprocessing of unknown origin on Windows. - // // If this error continues in the future, please track it down in the // XML code paths if possible and fix there. $html = str_replace( '&#13;', '', $html ); } - wfProfileOut( __METHOD__ . '-fixes' ); } else { $html = $this->html; } @@ -302,14 +304,11 @@ class HtmlFormatter { $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html ); $html = $this->onHtmlReady( $html ); - wfProfileIn( __METHOD__ . '-flatten' ); if ( $this->elementsToFlatten ) { $elements = implode( '|', $this->elementsToFlatten ); $html = preg_replace( "#</?(?:$elements)\\b[^>]*>#is", '', $html ); } - wfProfileOut( __METHOD__ . '-flatten' ); - wfProfileOut( __METHOD__ ); return $html; } @@ -322,6 +321,7 @@ class HtmlFormatter { * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG) * @param string $rawName The raw name of the selector * @return bool Whether the selector was successfully recognised + * @throws MWException */ protected function parseSelector( $selector, &$type, &$rawName ) { if ( strpos( $selector, '.' ) === 0 ) { @@ -349,7 +349,6 @@ class HtmlFormatter { * @return array */ protected function parseItemsToRemove() { - wfProfileIn( __METHOD__ ); $removals = array( 'ID' => array(), 'TAG' => array(), @@ -371,7 +370,6 @@ class HtmlFormatter { $removals['TAG'][] = 'video'; } - wfProfileOut( __METHOD__ ); return $removals; } }