Merge "Special:Newpages feed now shows first revision instead of latest revision"
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
index 069b460..b29a3af 100644 (file)
@@ -75,7 +75,7 @@ class BalanceSets {
                self::HTML_NAMESPACE => [
                        'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
                        'frame' => true,
-                       'plaintext' => true, 'isindex' => true,
+                       'plaintext' => true,
                        'xmp' => true, 'iframe' => true, 'noembed' => true,
                        'noscript' => true, 'script' => true,
                        'title' => true
@@ -119,9 +119,9 @@ class BalanceSets {
                        'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
                        'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
                        'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
-                       'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
+                       'input' => true, 'li' => true, 'link' => true,
                        'listing' => true, 'main' => true, 'marquee' => true,
-                       'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
+                       'menu' => true, 'meta' => true, 'nav' => true,
                        'noembed' => true, 'noframes' => true, 'noscript' => true,
                        'object' => true, 'ol' => true, 'p' => true, 'param' => true,
                        'plaintext' => true, 'pre' => true, 'script' => true,
@@ -156,7 +156,8 @@ class BalanceSets {
 
        public static $impliedEndTagsSet = [
                self::HTML_NAMESPACE => [
-                       'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
+                       'dd' => true, 'dt' => true, 'li' => true,
+                       'menuitem' => true, 'optgroup' => true,
                        'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
                        'rt' => true, 'rtc' => true
                ]
@@ -328,7 +329,7 @@ class BalanceElement {
        /**
         * Parent of this element, or the string "flat" if this element has
         * already been flattened into its parent.
-        * @var string|null $parent
+        * @var BalanceElement|string|null $parent
         */
        public $parent;
 
@@ -337,7 +338,7 @@ class BalanceElement {
         * child will be an actual BalanceElement object; the rest will
         * be strings, representing either text nodes or flattened
         * BalanceElement objects.
-        * @var array $children
+        * @var BalanceElement[]|string[] $children
         */
        public $children;
 
@@ -465,6 +466,7 @@ class BalanceElement {
         * in its parent by that string.
         *
         * @param array $config Balancer configuration; see Balancer::__construct().
+        * @return string
         *
         * @see __toString()
         */
@@ -497,6 +499,16 @@ class BalanceElement {
                                        $this->attribs = [ 'class' => "mw-empty-elt" ];
                                }
                                $blank = false;
+                       } elseif (
+                               $this->isA( BalanceSets::$extraLinefeedSet ) &&
+                               count( $this->children ) > 0 &&
+                               substr( $this->children[0], 0, 1 ) == "\n"
+                       ) {
+                               // Double the linefeed after pre/listing/textarea
+                               // according to the (old) HTML5 fragment serialization
+                               // algorithm (see https://github.com/whatwg/html/issues/944)
+                               // to ensure this will round-trip.
+                               array_unshift( $this->children, "\n" );
                        }
                        $flat = $blank ? '' : "{$this}";
                } else {
@@ -528,15 +540,6 @@ class BalanceElement {
                                $out .= "{$elt}";
                        }
                        $out .= "</{$this->localName}>";
-                       if (
-                               $this->isA( BalanceSets::$extraLinefeedSet ) &&
-                               $out[$len] === "\n"
-                       ) {
-                               // Double the linefeed after pre/listing/textarea
-                               // according to the HTML5 fragment serialization algorithm.
-                               $out = substr( $out, 0, $len + 1 ) .
-                                       substr( $out, $len );
-                       }
                } else {
                        $out = "<{$this->localName}{$encAttribs} />";
                        Assert::invariant(
@@ -653,7 +656,7 @@ class BalanceElement {
 class BalanceStack implements IteratorAggregate {
        /**
         * Backing storage for the stack.
-        * @var array $elements
+        * @var BalanceElement[] $elements
         */
        private $elements = [];
        /**
@@ -717,6 +720,7 @@ class BalanceStack implements IteratorAggregate {
        /**
         * Insert text at the appropriate place for inserting a node.
         * @param string $value
+        * @param bool $isComment
         * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
         */
        public function insertText( $value, $isComment = false ) {
@@ -906,7 +910,7 @@ class BalanceStack implements IteratorAggregate {
        /**
         * Return an iterator over this stack which visits the current node
         * first, and the root node last.
-        * @return Iterator
+        * @return \Iterator
         */
        public function getIterator() {
                return new ReverseArrayIterator( $this->elements );
@@ -1080,6 +1084,8 @@ class BalanceStack implements IteratorAggregate {
        /**
         * Foster parent the given $elt in the stack of open elements.
         * @param BalanceElement|string $elt
+        * @return BalanceElement|string
+        *
         * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
         */
        private function fosterParent( $elt ) {
@@ -1197,7 +1203,7 @@ class BalanceStack implements IteratorAggregate {
                        $furthestBlock = null;
                        $furthestBlockIndex = -1;
                        $stackLength = $this->length();
-                       for ( $i = $index+1; $i < $stackLength; $i++ ) {
+                       for ( $i = $index + 1; $i < $stackLength; $i++ ) {
                                if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
                                        $furthestBlock = $this->node( $i );
                                        $furthestBlockIndex = $i;
@@ -1219,7 +1225,7 @@ class BalanceStack implements IteratorAggregate {
 
                        // Let the common ancestor be the element immediately above
                        // the formatting element in the stack of open elements.
-                       $ancestor = $this->node( $index-1 );
+                       $ancestor = $this->node( $index - 1 );
 
                        // Let a bookmark note the position of the formatting
                        // element in the list of active formatting elements
@@ -1406,6 +1412,7 @@ class BalanceActiveFormattingElements {
        private $noahTableStack = [ [] ];
 
        public function __destruct() {
+               $next = null;
                for ( $node = $this->head; $node; $node = $next ) {
                        $next = $node->nextAFE;
                        $node->prevAFE = $node->nextAFE = $node->nextNoah = null;
@@ -1520,6 +1527,7 @@ class BalanceActiveFormattingElements {
 
        /**
         * Determine whether an element is in the list of formatting elements.
+        * @param BalanceElement $elt
         * @return boolean
         */
        public function isInList( BalanceElement $elt ) {
@@ -1529,6 +1537,8 @@ class BalanceActiveFormattingElements {
        /**
         * Find the element $elt in the list and remove it.
         * Used when parsing &lt;a&gt; in body mode.
+        *
+        * @param BalanceElement $elt
         */
        public function remove( BalanceElement $elt ) {
                if ( $this->head !== $elt && !$elt->prevAFE ) {
@@ -1597,6 +1607,9 @@ class BalanceActiveFormattingElements {
 
        /**
         * Find element $a in the list and replace it with element $b
+        *
+        * @param BalanceElement $a
+        * @param BalanceElement $b
         */
        public function replace( BalanceElement $a, BalanceElement $b ) {
                if ( $this->head !== $a && !$a->prevAFE ) {
@@ -1628,6 +1641,9 @@ class BalanceActiveFormattingElements {
 
        /**
         * Find $a in the list and insert $b after it.
+        *
+        * @param BalanceElement $a
+        * @param BalanceElement $b
         */
        public function insertAfter( BalanceElement $a, BalanceElement $b ) {
                if ( $this->head !== $a && !$a->prevAFE ) {
@@ -1756,7 +1772,7 @@ class BalanceActiveFormattingElements {
  *   and escaped.
  * - All null characters are assumed to have been removed.
  * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- *   <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
+ *   <frame>, <plaintext>, <xmp>, <iframe>,
  *   <noembed>, <noscript>, <script>, <title>.  As a result,
  *   further simplifications can be made:
  *   - `frameset-ok` is not tracked.
@@ -1778,9 +1794,12 @@ class BalanceActiveFormattingElements {
  */
 class Balancer {
        private $parseMode;
+       /** @var \Iterator */
        private $bitsIterator;
        private $allowedHtmlElements;
+       /** @var BalanceActiveFormattingElements */
        private $afe;
+       /** @var BalanceStack */
        private $stack;
        private $strict;
        private $allowComments;
@@ -1795,12 +1814,17 @@ class Balancer {
        private $inRCDATA;
        private $inRAWTEXT;
 
+       /** @var callable|null */
+       private $processingCallback;
+       /** @var array */
+       private $processingArgs;
+
        /**
         * Valid HTML5 comments.
         * Regex borrowed from Tim Starling's "remex-html" project.
         */
        const VALID_COMMENT_REGEX = "~ !--
-               (                             # 1. Comment match detector
+               (                           # 1. Comment match detector
                        > | -> | # Invalid short close
                        (                         # 2. Comment contents
                                (?:
@@ -1815,15 +1839,15 @@ class Balancer {
                        (                         # 3. Comment close
                                --> |   # Normal close
                                --!> |  # Comment end bang
-                               (                     # 4. Indicate matches requiring EOF
-                                       --! |   # EOF in comment end bang state
-                                       -- |    # EOF in comment end state
-                                       -  |    # EOF in comment end dash state
-                                               # EOF in comment state
+                               (                       # 4. Indicate matches requiring EOF
+                                       --! |                   # EOF in comment end bang state
+                                       -- |                    # EOF in comment end state
+                                       -  |                    # EOF in comment end dash state
+                                       (?#nothing)             # EOF in comment state
                                )
                        )
                )
-               ([^<]*) \z                    # 5. Non-tag text after the comment
+               ([^<]*) \z                  # 5. Non-tag text after the comment
                ~xs";
 
        /**
@@ -1843,7 +1867,9 @@ class Balancer {
         *         provide historical compatibility with the old "tidy"
         *         program: <p>-wrapping is done to the children of
         *         <body> and <blockquote> elements, and empty elements
-        *         are removed.
+        *         are removed.  The <pre>/<listing>/<textarea> serialization
+        *         is also tweaked to allow lossless round trips.
+        *         (See: https://github.com/whatwg/html/issues/944)
         *     'allowComments': boolean, defaults to true.
         *         When true, allows HTML comments in the input.
         *         The Sanitizer generally strips all comments, so if you
@@ -1865,7 +1891,7 @@ class Balancer {
                        $bad = array_uintersect_assoc(
                                $this->allowedHtmlElements,
                                BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE],
-                               function( $a, $b ) {
+                               function ( $a, $b ) {
                                        // Ignore the values (just intersect the keys) by saying
                                        // all values are equal to each other.
                                        return 0;
@@ -1975,6 +2001,7 @@ class Balancer {
                // Some hoops we have to jump through
                $adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
 
+               // The spec calls this the "tree construction dispatcher".
                $isForeign = true;
                if (
                        $this->stack->length() === 0 ||
@@ -2015,6 +2042,9 @@ class Balancer {
                if ( $token === 'text' ) {
                        $this->stack->insertText( $value );
                        return true;
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                } elseif ( $token === 'tag' ) {
                        switch ( $value ) {
                        case 'font':
@@ -2086,7 +2116,7 @@ class Balancer {
                                return $this->insertToken( $token, $value, $attribs, $selfClose );
                        }
                        // "Any other start tag"
-                       $adjusted = ( $this->fragmentContext && $this->stack->length()===1 ) ?
+                       $adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
                                $this->fragmentContext : $this->stack->currentNode;
                        $this->stack->insertForeignElement(
                                $adjusted->namespaceURI, $value, $attribs
@@ -2126,7 +2156,7 @@ class Balancer {
                if (
                        $this->allowComments &&
                        !( $this->inRCDATA || $this->inRAWTEXT ) &&
-                       preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
+                       preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
                        // verify EOF condition where necessary
                        ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
                ) {
@@ -2212,7 +2242,7 @@ class Balancer {
 
        private function switchMode( $mode ) {
                Assert::parameter(
-                       substr( $mode, -4 )==='Mode', '$mode', 'should end in Mode'
+                       substr( $mode, -4 ) === 'Mode', '$mode', 'should end in Mode'
                );
                $oldMode = $this->parseMode;
                $this->parseMode = $mode;
@@ -2237,8 +2267,8 @@ class Balancer {
                                switch ( $node->localName ) {
                                case 'select':
                                        $stackLength = $this->stack->length();
-                                       for ( $j = $i + 1; $j < $stackLength-1; $j++ ) {
-                                               $ancestor = $this->stack->node( $stackLength-$j-1 );
+                                       for ( $j = $i + 1; $j < $stackLength - 1; $j++ ) {
+                                               $ancestor = $this->stack->node( $stackLength - $j - 1 );
                                                if ( $ancestor->isHtmlNamed( 'template' ) ) {
                                                        break;
                                                }
@@ -2446,7 +2476,6 @@ class Balancer {
                        case 'header':
                        case 'hgroup':
                        case 'main':
-                       case 'menu':
                        case 'nav':
                        case 'ol':
                        case 'p':
@@ -2459,6 +2488,16 @@ class Balancer {
                                $this->stack->insertHTMLElement( $value, $attribs );
                                return true;
 
+                       case 'menu':
+                               if ( $this->stack->inButtonScope( "p" ) ) {
+                                       $this->inBodyMode( 'endtag', 'p' );
+                               }
+                               if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+                                       $this->stack->pop();
+                               }
+                               $this->stack->insertHTMLElement( $value, $attribs );
+                               return true;
+
                        case 'h1':
                        case 'h2':
                        case 'h3':
@@ -2582,7 +2621,7 @@ class Balancer {
                        case 'tt':
                        case 'u':
                                $this->afe->reconstruct( $this->stack );
-                               $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
+                               $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
                                return true;
 
                        case 'nobr':
@@ -2591,7 +2630,7 @@ class Balancer {
                                        $this->inBodyMode( 'endtag', 'nobr' );
                                        $this->afe->reconstruct( $this->stack );
                                }
-                               $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
+                               $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
                                return true;
 
                        case 'applet':
@@ -2634,7 +2673,6 @@ class Balancer {
                                // (hence we don't need to examine the tag's "type" attribute)
                                return true;
 
-                       case 'menuitem':
                        case 'param':
                        case 'source':
                        case 'track':
@@ -2646,6 +2684,9 @@ class Balancer {
                                if ( $this->stack->inButtonScope( 'p' ) ) {
                                        $this->inBodyMode( 'endtag', 'p' );
                                }
+                               if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+                                       $this->stack->pop();
+                               }
                                $this->stack->insertHTMLElement( $value, $attribs );
                                $this->stack->pop();
                                return true;
@@ -2654,8 +2695,6 @@ class Balancer {
                                // warts!
                                return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
 
-                       // OMITTED: <isindex>
-
                        case 'textarea':
                                $this->stack->insertHTMLElement( $value, $attribs );
                                $this->ignoreLinefeed = true;
@@ -2693,6 +2732,14 @@ class Balancer {
                                $this->stack->insertHTMLElement( $value, $attribs );
                                return true;
 
+                       case 'menuitem':
+                               if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+                                       $this->stack->pop();
+                               }
+                               $this->afe->reconstruct( $this->stack );
+                               $this->stack->insertHTMLElement( $value, $attribs );
+                               return true;
+
                        case 'rb':
                        case 'rtc':
                                if ( $this->stack->inScope( 'ruby' ) ) {