Merge "Switch Special:FileDuplicateSearch to OOUI"
[lhc/web/wiklou.git] / includes / HtmlFormatter.php
index 221cefb..206f0f7 100644 (file)
@@ -27,8 +27,8 @@ class HtmlFormatter {
        private $doc;
 
        private $html;
-       private $itemsToRemove = array();
-       private $elementsToFlatten = array();
+       private $itemsToRemove = [];
+       private $elementsToFlatten = [];
        protected $removeMedia = false;
 
        /**
@@ -63,21 +63,13 @@ class HtmlFormatter {
         */
        public function getDoc() {
                if ( !$this->doc ) {
-                       // DOMDocument::loadHTML apparently isn't very good with encodings, so
+                       // DOMDocument::loadHTML isn't very good with encodings, so
                        // convert input to ASCII by encoding everything above 128 as entities.
-                       if ( function_exists( 'mb_convert_encoding' ) ) {
-                               $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
-                       } else {
-                               $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
-                                       return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
-                               }, $this->html );
-                       }
+                       $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
 
                        // Workaround for bug that caused spaces before references
-                       // to disappear during processing:
-                       // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
-                       //
-                       // Please replace with a better fix if one can be found.
+                       // to disappear during processing: https://phabricator.wikimedia.org/T55086
+                       // TODO: Please replace with a better fix if one can be found.
                        $html = str_replace( ' <', '&#32;<', $html );
 
                        libxml_use_internal_errors( true );
@@ -150,7 +142,7 @@ class HtmlFormatter {
                        },
                        true
                ) ) {
-                       return array();
+                       return [];
                }
 
                $doc = $this->getDoc();
@@ -161,7 +153,7 @@ class HtmlFormatter {
                // over them in a foreach loop. It will seemingly leave the internal
                // iterator on the foreach out of whack and results will be quite
                // strange. Though, making a queue of items to remove seems to work.
-               $domElemsToRemove = array();
+               $domElemsToRemove = [];
                foreach ( $removals['TAG'] as $tagToRemove ) {
                        $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
                        foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
@@ -173,7 +165,7 @@ class HtmlFormatter {
                $removed = $this->removeElements( $domElemsToRemove );
 
                // Elements with named IDs
-               $domElemsToRemove = array();
+               $domElemsToRemove = [];
                foreach ( $removals['ID'] as $itemToRemove ) {
                        $itemToRemoveNode = $doc->getElementById( $itemToRemove );
                        if ( $itemToRemoveNode ) {
@@ -183,7 +175,7 @@ class HtmlFormatter {
                $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 
                // CSS Classes
-               $domElemsToRemove = array();
+               $domElemsToRemove = [];
                $xpath = new DOMXPath( $doc );
                foreach ( $removals['CLASS'] as $classToRemove ) {
                        $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
@@ -219,7 +211,7 @@ class HtmlFormatter {
        private function removeElements( $elements ) {
                $list = $elements;
                if ( $elements instanceof DOMNodeList ) {
-                       $list = array();
+                       $list = [];
                        foreach ( $elements as $element ) {
                                $list[] = $element;
                        }
@@ -244,22 +236,19 @@ class HtmlFormatter {
                if ( !$replacements ) {
                        // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
                        // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
-                       $replacements = new ReplacementArray( array(
+                       $replacements = new ReplacementArray( [
                                '&quot;' => '&amp;quot;',
                                '&amp;' => '&amp;amp;',
                                '&lt;' => '&amp;lt;',
                                '&gt;' => '&amp;gt;',
-                       ) );
+                       ] );
                }
                $html = $replacements->replace( $html );
 
-               if ( function_exists( 'mb_convert_encoding' ) ) {
-                       // Just in case the conversion in getDoc() above used named
-                       // entities that aren't known to html_entity_decode().
-                       $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
-               } else {
-                       $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
-               }
+               // Just in case the conversion in getDoc() above used named
+               // entities that aren't known to html_entity_decode().
+               $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+
                return $html;
        }
 
@@ -281,7 +270,7 @@ class HtmlFormatter {
                        }
                        if ( $element ) {
                                $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
-                               $nodesArray = array();
+                               $nodesArray = [];
                                foreach ( $body->childNodes as $node ) {
                                        $nodesArray[] = $node;
                                }
@@ -292,10 +281,9 @@ class HtmlFormatter {
                        }
                        $html = $this->doc->saveHTML();
 
-                       $html = $this->fixLibXml( $html );
+                       $html = $this->fixLibXML( $html );
                        if ( wfIsWindows() ) {
                                // Cleanup for CRLF misprocessing of unknown origin on Windows.
-                               //
                                // If this error continues in the future, please track it down in the
                                // XML code paths if possible and fix there.
                                $html = str_replace( '&#13;', '', $html );
@@ -352,12 +340,12 @@ class HtmlFormatter {
         * @return array
         */
        protected function parseItemsToRemove() {
-               $removals = array(
-                       'ID' => array(),
-                       'TAG' => array(),
-                       'CLASS' => array(),
-                       'TAG_CLASS' => array(),
-               );
+               $removals = [
+                       'ID' => [],
+                       'TAG' => [],
+                       'CLASS' => [],
+                       'TAG_CLASS' => [],
+               ];
 
                foreach ( $this->itemsToRemove as $itemToRemove ) {
                        $type = '';