Support tokenizing simple HTML comments in the Balancer.
authorC. Scott Ananian <cscott@cscott.net>
Fri, 15 Jul 2016 16:36:35 +0000 (12:36 -0400)
committerTim Starling <tstarling@wikimedia.org>
Thu, 21 Jul 2016 03:27:53 +0000 (03:27 +0000)
Change-Id: Ib780595b13b7145e99867d16e3c225e6b2b91884

includes/tidy/Balancer.php
tests/phpunit/includes/tidy/BalancerTest.php

index 0fa96bd..37807ba 100644 (file)
@@ -43,6 +43,11 @@ use \Sanitizer;
 # as soon as possible (usually as soon as the tag is closed) to reduce
 # its memory footprint.
 
+# We've been gradually lifting some of these restrictions to handle
+# non-sanitized output generated by extensions, but we shortcut the tokenizer
+# for speed (primarily by splitting on `<`) and so rely on syntactic
+# well-formedness.
+
 # On the other hand, I've been pretty careful to note with comments in the
 # code the places where this implementation omits features of the spec or
 # depends on the MediaWiki Sanitizer.  Perhaps in the future we'll want to
@@ -676,19 +681,29 @@ class BalanceStack implements IteratorAggregate {
                return $out;
        }
 
+       /**
+        * Insert a comment at the appropriate place for inserting a node.
+        * @param string $value Content of the comment.
+        * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
+        */
+       public function insertComment( $value ) {
+               // Just another type of text node, except for tidy p-wrapping.
+               return $this->insertText( '<!--' . $value . '-->', true );
+       }
+
        /**
         * Insert text at the appropriate place for inserting a node.
         * @param string $value
         * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
         */
-       public function insertText( $value ) {
+       public function insertText( $value, $isComment = false ) {
                if (
                        $this->fosterParentMode &&
                        $this->currentNode->isA( BalanceSets::$tableSectionRowSet )
                ) {
                        $this->fosterParent( $value );
                } elseif (
-                       $this->tidyCompat &&
+                       $this->tidyCompat && !$isComment &&
                        $this->currentNode->isA( BalanceSets::$tidyPWrapSet )
                ) {
                        $this->insertHTMLELement( 'mw:p-wrap', [] );
@@ -1725,9 +1740,10 @@ class BalanceActiveFormattingElements {
  * - The document is never in "quirks mode".
  * - All occurrences of < and > have been entity escaped, so we
  *   can parse tags by simply splitting on those two characters.
+ *   The character < must not appear inside comments.
  *   Similarly, all attributes have been "cleaned" and are double-quoted
  *   and escaped.
- * - All comments and null characters are assumed to have been removed.
+ * - All null characters are assumed to have been removed.
  * - We don't alter linefeeds after <pre>/<listing>.
  * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
  *   <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
@@ -1757,6 +1773,7 @@ class Balancer {
        private $stack;
        private $strict;
        private $tidyCompat;
+       private $allowComments;
 
        private $textIntegrationMode = false;
        private $pendingTableText;
@@ -1764,6 +1781,37 @@ class Balancer {
        private $fragmentContext;
        private $formElementPointer;
 
+       /**
+        * Valid HTML5 comments.
+        * Regex borrowed from Tim Starling's "remex-html" project.
+        */
+       const VALID_COMMENT_REGEX = "~ !--
+               (                             # 1. Comment match detector
+                       > | -> | # Invalid short close
+                       (                         # 2. Comment contents
+                               (?:
+                                       (?! --> )
+                                       (?! --!> )
+                                       (?! --! \z )
+                                       (?! -- \z )
+                                       (?! - \z )
+                                       .
+                               )*+
+                       )
+                       (                         # 3. Comment close
+                               --> |   # Normal close
+                               --!> |  # Comment end bang
+                               (                     # 4. Indicate matches requiring EOF
+                                       --! |   # EOF in comment end bang state
+                                       -- |    # EOF in comment end state
+                                       -  |    # EOF in comment end dash state
+                                               # EOF in comment state
+                               )
+                       )
+               )
+               ([^<]*) \z                    # 5. Non-tag text after the comment
+               ~xs";
+
        /**
         * Create a new Balancer.
         * @param array $config Balancer configuration.  Includes:
@@ -1782,16 +1830,23 @@ class Balancer {
         *         program: <p>-wrapping is done to the children of
         *         <body> and <blockquote> elements, and empty elements
         *         are removed.
+        *     'allowComments': boolean, defaults to true.
+        *         When true, allows HTML comments in the input.
+        *         The Sanitizer generally strips all comments, so if you
+        *         are running on sanitized output you can set this to
+        *         false to get a bit more performance.
         */
        public function __construct( array $config = [] ) {
                $config = $config + [
                        'strict' => false,
                        'allowedHtmlElements' => null,
                        'tidyCompat' => false,
+                       'allowComments' => true,
                ];
                $this->allowedHtmlElements = $config['allowedHtmlElements'];
                $this->strict = $config['strict'];
                $this->tidyCompat = $config['tidyCompat'];
+               $this->allowComments = $config['allowComments'];
                if ( $this->allowedHtmlElements !== null ) {
                        # Sanity check!
                        $bad = array_uintersect_assoc(
@@ -2030,12 +2085,26 @@ class Balancer {
 
        /**
         * Grab the next "token" from $bitsIterator.  This is either a open/close
-        * tag or text, depending on whether the Sanitizer approves.
+        * tag or text or a comment, depending on whether the Sanitizer approves.
         */
        private function advance() {
                $x = $this->bitsIterator->current();
                $this->bitsIterator->next();
                $regs = [];
+               # Handle comments.  These won't be generated by mediawiki (they
+               # are stripped in the Sanitizer) but may be generated by extensions.
+               if (
+                       $this->allowComments &&
+                       preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
+                       /* verify EOF condition where necessary */
+                       ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
+               ) {
+                       $contents = $regs[2][0];
+                       $rest = $regs[5][0];
+                       $this->insertToken( 'comment', $contents );
+                       $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+                       return;
+               }
                # $slash: Does the current element start with a '/'?
                # $t: Current element name
                # $attribStr: String between element name and >
@@ -2272,6 +2341,9 @@ class Balancer {
                                // ignore any other end tag
                                return true;
                        }
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
 
                // If not handled above
@@ -2782,6 +2854,9 @@ class Balancer {
                                }
                        }
                        return true;
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                } else {
                        Assert::invariant( false, "Bad token type: $token" );
                }
@@ -2885,6 +2960,9 @@ class Balancer {
                                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
                        }
                        // Fall through for "anything else" clause.
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
                // This is the "anything else" case:
                $this->stack->fosterParentMode = true;
@@ -3012,6 +3090,9 @@ class Balancer {
                        // Fall through for "anything else".
                } elseif ( $token === 'eof' ) {
                        return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
 
                // Anything else
@@ -3289,6 +3370,9 @@ class Balancer {
                        case 'template':
                                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
                        }
+               } elseif ( $token === 'comment' ) {
+                       $this->stack->insertComment( $value );
+                       return true;
                }
                // anything else: just ignore the token
                return true;
@@ -3320,7 +3404,7 @@ class Balancer {
        }
 
        private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
-               if ( $token === 'text' ) {
+               if ( $token === 'text' || $token === 'comment' ) {
                        return $this->inBodyMode( $token, $value, $attribs, $selfclose );
                } elseif ( $token === 'eof' ) {
                        if ( $this->stack->indexOf( 'template' ) < 0 ) {
index 213982a..aa43ac7 100644 (file)
@@ -15,6 +15,7 @@ class BalancerTest extends MediaWikiTestCase {
                        'strict' => false, /* not strict */
                        'allowedHtmlElements' => null, /* no sanitization */
                        'tidyCompat' => false, /* standard parser */
+                       'allowComments' => true, /* comment parsing */
                ] );
        }
 
@@ -70,9 +71,12 @@ class BalancerTest extends MediaWikiTestCase {
                                // Normalize case of SVG attributes.
                                $html = str_replace( 'foreignObject', 'foreignobject', $html );
 
-                               if ( isset( $case['document']['props']['comment'] ) ) {
-                                       // Skip tests which include HTML comments, which
-                                       // the balancer requires to have been stripped.
+                               if (
+                                       isset( $case['document']['props']['comment'] ) &&
+                                       preg_match( ',<!--[^>]*<,', $html )
+                               ) {
+                                       // Skip tests which include HTML comments containing
+                                       // the < character, which we don't support.
                                        continue;
                                }
                                if ( strpos( $case['data'], '<![CDATA[' ) !== false ) {