Support <textarea> tags in Balancer.
author C. Scott Ananian <cscott@cscott.net>
Fri, 15 Jul 2016 22:46:14 +0000 (18:46 -0400)
committer Tim Starling <tstarling@wikimedia.org>
Thu, 21 Jul 2016 03:37:10 +0000 (03:37 +0000)
Change-Id: I63c2fd1c343362e49cf3b5a258fc98489744ad68

includes/tidy/Balancer.php
tests/phpunit/includes/tidy/BalancerTest.php

index 37807ba..b2d6ba1 100644 (file)
@@ -75,7 +75,7 @@ class BalanceSets {
                self::HTML_NAMESPACE => [
                        'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
                        'frame' => true,
-                       'plaintext' => true, 'isindex' => true, 'textarea' => true,
+                       'plaintext' => true, 'isindex' => true,
                        'xmp' => true, 'iframe' => true, 'noembed' => true,
                        'noscript' => true, 'script' => true,
                        'title' => true
@@ -92,6 +92,12 @@ class BalanceSets {
                ]
        ];
 
+       public static $extraLinefeedSet = [
+               self::HTML_NAMESPACE => [
+                       'pre' => true, 'textarea' => true, 'listing' => true,
+               ]
+       ];
+
        public static $headingSet = [
                self::HTML_NAMESPACE => [
                        'h1' => true, 'h2' => true, 'h3' => true,
@@ -513,11 +519,21 @@ class BalanceElement {
                }
                if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
                        $out = "<{$this->localName}{$encAttribs}>";
+                       $len = strlen( $out );
                        // flatten children
                        foreach ( $this->children as $elt ) {
                                $out .= "{$elt}";
                        }
                        $out .= "</{$this->localName}>";
+                       if (
+                               $this->isA( BalanceSets::$extraLinefeedSet ) &&
+                               $out[$len] === "\n"
+                       ) {
+                               // Double the linefeed after pre/listing/textarea
+                               // according to the HTML5 fragment serialization algorithm.
+                               $out = substr( $out, 0, $len + 1 ) .
+                                       substr( $out, $len );
+                       }
                } else {
                        $out = "<{$this->localName}{$encAttribs} />";
                        Assert::invariant(
@@ -1740,18 +1756,19 @@ class BalanceActiveFormattingElements {
  * - The document is never in "quirks mode".
  * - All occurrences of < and > have been entity escaped, so we
  *   can parse tags by simply splitting on those two characters.
+ *   (This also simplifies the handling of < inside <textarea>.)
  *   The character < must not appear inside comments.
  *   Similarly, all attributes have been "cleaned" and are double-quoted
  *   and escaped.
  * - All null characters are assumed to have been removed.
- * - We don't alter linefeeds after <pre>/<listing>.
  * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- *   <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
+ *   <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
  *   <noembed>, <noscript>, <script>, <title>.  As a result,
  *   further simplifications can be made:
  *   - `frameset-ok` is not tracked.
  *   - `head element pointer` is not tracked (but presumed non-null)
- *   - Tokenizer has only a single mode.
+ *   - Tokenizer has only a single mode. (<textarea> wants RCDATA and
+ *     <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
  *
  *   We generally mark places where we omit cases from the spec due to
  *   disallowed elements with a comment: `# OMITTED: <element-name>`.
@@ -1775,11 +1792,14 @@ class Balancer {
        private $tidyCompat;
        private $allowComments;
 
-       private $textIntegrationMode = false;
+       private $textIntegrationMode;
        private $pendingTableText;
        private $originalInsertionMode;
        private $fragmentContext;
        private $formElementPointer;
+       private $ignoreLinefeed;
+       private $inRCDATA;
+       private $inRAWTEXT;
 
        /**
         * Valid HTML5 comments.
@@ -1890,6 +1910,11 @@ class Balancer {
                $this->processingCallback = $processingCallback;
                $this->processingArgs = $processingArgs;
 
+               $this->textIntegrationMode =
+                       $this->ignoreLinefeed =
+                       $this->inRCDATA =
+                       $this->inRAWTEXT = false;
+
                # The stack is constructed with an <html> element already on it.
                # Set this up as a fragment parsed with <body> as the context.
                $this->fragmentContext =
@@ -1942,6 +1967,19 @@ class Balancer {
                        # Don't actually inject the empty string as a text token.
                        return true;
                }
+               // Support pre/listing/textarea by suppressing initial linefeed
+               if ( $this->ignoreLinefeed ) {
+                       $this->ignoreLinefeed = false;
+                       if ( $token === 'text' ) {
+                               if ( $value[0] === "\n" ) {
+                                       if ( $value === "\n" ) {
+                                               # Nothing would be left, don't inject the empty string.
+                                               return true;
+                                       }
+                                       $value = substr( $value, 1 );
+                               }
+                       }
+               }
                // Some hoops we have to jump through
                $adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
 
@@ -2095,6 +2133,7 @@ class Balancer {
                # are stripped in the Sanitizer) but may be generated by extensions.
                if (
                        $this->allowComments &&
+                       !( $this->inRCDATA || $this->inRAWTEXT ) &&
                        preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
                        /* verify EOF condition where necessary */
                        ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
@@ -2129,6 +2168,22 @@ class Balancer {
                        $slash = $t = $attribStr = $brace = $rest = null;
                }
                $goodtag = $t;
+               if ( $this->inRCDATA ) {
+                       if ( $slash && $t === $this->inRCDATA ) {
+                               $this->inRCDATA = false;
+                       } else {
+                               // No tags allowed; this emulates the "rcdata" tokenizer mode.
+                               $goodtag = false;
+                       }
+               }
+               if ( $this->inRAWTEXT ) {
+                       if ( $slash && $t === $this->inRAWTEXT ) {
+                               $this->inRAWTEXT = false;
+                       } else {
+                               // No tags allowed, no entity-escaping done.
+                               $goodtag = false;
+                       }
+               }
                $sanitize = $this->allowedHtmlElements !== null;
                if ( $sanitize ) {
                        $goodtag = $t && isset( $this->allowedHtmlElements[$t] );
@@ -2155,6 +2210,8 @@ class Balancer {
                if ( $goodtag ) {
                        $rest = str_replace( '>', '&gt;', $rest );
                        $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+               } elseif ( $this->inRAWTEXT ) {
+                       $this->insertToken( 'text', "<$x" );
                } else {
                        # bad tag; serialize entire thing as text.
                        $this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
@@ -2260,7 +2317,7 @@ class Balancer {
 
        private function parseRawText( $value, $attribs = null ) {
                $this->stack->insertHTMLElement( $value, $attribs );
-               // XXX switch tokenizer to rawtext state?
+               $this->inRAWTEXT = $value;
                $this->originalInsertionMode = $this->switchMode( 'inTextMode' );
                return true;
        }
@@ -2431,9 +2488,8 @@ class Balancer {
                                        $this->inBodyMode( 'endtag', 'p' );
                                }
                                $this->stack->insertHTMLElement( $value, $attribs );
-                               # As described in "simplifications" above:
-                               # 1. We don't touch the next token, even if it's a linefeed.
-                               # 2. OMITTED: frameset_ok
+                               $this->ignoreLinefeed = true;
+                               # OMITTED: frameset_ok
                                return true;
 
                        case 'form':
@@ -2607,7 +2663,14 @@ class Balancer {
                                return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
 
                        # OMITTED: <isindex>
-                       # OMITTED: <textarea>
+
+                       case 'textarea':
+                               $this->stack->insertHTMLElement( $value, $attribs );
+                               $this->ignoreLinefeed = true;
+                               $this->inRCDATA = $value; // emulate rcdata tokenizer mode
+                               # OMITTED: frameset_ok
+                               return true;
+
                        # OMITTED: <xmp>
                        # OMITTED: <iframe>
                        # OMITTED: <noembed>
index aa43ac7..f2e41bd 100644 (file)
@@ -99,7 +99,6 @@ class BalancerTest extends MediaWikiTestCase {
                                        isset( $case['document']['props']['tags']['script'] ) ||
                                        isset( $case['document']['props']['tags']['svg script'] ) ||
                                        isset( $case['document']['props']['tags']['svg title'] ) ||
-                                       isset( $case['document']['props']['tags']['textarea'] ) ||
                                        isset( $case['document']['props']['tags']['title'] ) ||
                                        isset( $case['document']['props']['tags']['xmp'] )
                                ) {