From bd2589168209128e6901abe487151970e1d97ace Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Fri, 15 Jul 2016 12:36:35 -0400 Subject: [PATCH] Support tokenizing simple HTML comments in the Balancer. Change-Id: Ib780595b13b7145e99867d16e3c225e6b2b91884 --- includes/tidy/Balancer.php | 94 ++++++++++++++++++-- tests/phpunit/includes/tidy/BalancerTest.php | 10 ++- 2 files changed, 96 insertions(+), 8 deletions(-) diff --git a/includes/tidy/Balancer.php b/includes/tidy/Balancer.php index 0fa96bd37a..37807ba25a 100644 --- a/includes/tidy/Balancer.php +++ b/includes/tidy/Balancer.php @@ -43,6 +43,11 @@ use \Sanitizer; # as soon as possible (usually as soon as the tag is closed) to reduce # its memory footprint. +# We've been gradually lifting some of these restrictions to handle +# non-sanitized output generated by extensions, but we shortcut the tokenizer +# for speed (primarily by splitting on `<`) and so rely on syntactic +# well-formedness. + # On the other hand, I've been pretty careful to note with comments in the # code the places where this implementation omits features of the spec or # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to @@ -676,19 +681,29 @@ class BalanceStack implements IteratorAggregate { return $out; } + /** + * Insert a comment at the appropriate place for inserting a node. + * @param string $value Content of the comment. + * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment + */ + public function insertComment( $value ) { + // Just another type of text node, except for tidy p-wrapping. + return $this->insertText( '', true ); + } + /** * Insert text at the appropriate place for inserting a node. * @param string $value * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node */ - public function insertText( $value ) { + public function insertText( $value, $isComment = false ) { if ( $this->fosterParentMode && $this->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) { $this->fosterParent( $value ); } elseif ( - $this->tidyCompat && + $this->tidyCompat && !$isComment && $this->currentNode->isA( BalanceSets::$tidyPWrapSet ) ) { $this->insertHTMLELement( 'mw:p-wrap', [] ); @@ -1725,9 +1740,10 @@ class BalanceActiveFormattingElements { * - The document is never in "quirks mode". * - All occurrences of < and > have been entity escaped, so we * can parse tags by simply splitting on those two characters. + * The character < must not appear inside comments. * Similarly, all attributes have been "cleaned" and are double-quoted * and escaped. - * - All comments and null characters are assumed to have been removed. + * - All null characters are assumed to have been removed. * - We don't alter linefeeds after
/.
  * - The following elements are disallowed: , , , ,
  *   , , <isindex>, <textarea>, <xmp>, <iframe>,
@@ -1757,6 +1773,7 @@ class Balancer {
 	private $stack;
 	private $strict;
 	private $tidyCompat;
+	private $allowComments;
 
 	private $textIntegrationMode = false;
 	private $pendingTableText;
@@ -1764,6 +1781,37 @@ class Balancer {
 	private $fragmentContext;
 	private $formElementPointer;
 
+	/**
+	 * Valid HTML5 comments.
+	 * Regex borrowed from Tim Starling's "remex-html" project.
+	 */
+	const VALID_COMMENT_REGEX = "~ !--
+		(                             # 1. Comment match detector
+			> | -> | # Invalid short close
+			(                         # 2. Comment contents
+				(?:
+					(?! --> )
+					(?! --!> )
+					(?! --! \z )
+					(?! -- \z )
+					(?! - \z )
+					.
+				)*+
+			)
+			(                         # 3. Comment close
+				--> |   # Normal close
+				--!> |  # Comment end bang
+				(                     # 4. Indicate matches requiring EOF
+					--! |   # EOF in comment end bang state
+					-- |    # EOF in comment end state
+					-  |    # EOF in comment end dash state
+					        # EOF in comment state
+				)
+			)
+		)
+		([^<]*) \z                    # 5. Non-tag text after the comment
+		~xs";
+
 	/**
 	 * Create a new Balancer.
 	 * @param array $config Balancer configuration.  Includes:
@@ -1782,16 +1830,23 @@ class Balancer {
 	 *         program: <p>-wrapping is done to the children of
 	 *         <body> and <blockquote> elements, and empty elements
 	 *         are removed.
+	 *     'allowComments': boolean, defaults to true.
+	 *         When true, allows HTML comments in the input.
+	 *         The Sanitizer generally strips all comments, so if you
+	 *         are running on sanitized output you can set this to
+	 *         false to get a bit more performance.
 	 */
 	public function __construct( array $config = [] ) {
 		$config = $config + [
 			'strict' => false,
 			'allowedHtmlElements' => null,
 			'tidyCompat' => false,
+			'allowComments' => true,
 		];
 		$this->allowedHtmlElements = $config['allowedHtmlElements'];
 		$this->strict = $config['strict'];
 		$this->tidyCompat = $config['tidyCompat'];
+		$this->allowComments = $config['allowComments'];
 		if ( $this->allowedHtmlElements !== null ) {
 			# Sanity check!
 			$bad = array_uintersect_assoc(
@@ -2030,12 +2085,26 @@ class Balancer {
 
 	/**
 	 * Grab the next "token" from $bitsIterator.  This is either a open/close
-	 * tag or text, depending on whether the Sanitizer approves.
+	 * tag or text or a comment, depending on whether the Sanitizer approves.
 	 */
 	private function advance() {
 		$x = $this->bitsIterator->current();
 		$this->bitsIterator->next();
 		$regs = [];
+		# Handle comments.  These won't be generated by mediawiki (they
+		# are stripped in the Sanitizer) but may be generated by extensions.
+		if (
+			$this->allowComments &&
+			preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
+			/* verify EOF condition where necessary */
+			( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
+		) {
+			$contents = $regs[2][0];
+			$rest = $regs[5][0];
+			$this->insertToken( 'comment', $contents );
+			$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+			return;
+		}
 		# $slash: Does the current element start with a '/'?
 		# $t: Current element name
 		# $attribStr: String between element name and >
@@ -2272,6 +2341,9 @@ class Balancer {
 				// ignore any other end tag
 				return true;
 			}
+		} elseif ( $token === 'comment' ) {
+			$this->stack->insertComment( $value );
+			return true;
 		}
 
 		// If not handled above
@@ -2782,6 +2854,9 @@ class Balancer {
 				}
 			}
 			return true;
+		} elseif ( $token === 'comment' ) {
+			$this->stack->insertComment( $value );
+			return true;
 		} else {
 			Assert::invariant( false, "Bad token type: $token" );
 		}
@@ -2885,6 +2960,9 @@ class Balancer {
 				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
 			}
 			// Fall through for "anything else" clause.
+		} elseif ( $token === 'comment' ) {
+			$this->stack->insertComment( $value );
+			return true;
 		}
 		// This is the "anything else" case:
 		$this->stack->fosterParentMode = true;
@@ -3012,6 +3090,9 @@ class Balancer {
 			// Fall through for "anything else".
 		} elseif ( $token === 'eof' ) {
 			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+		} elseif ( $token === 'comment' ) {
+			$this->stack->insertComment( $value );
+			return true;
 		}
 
 		// Anything else
@@ -3289,6 +3370,9 @@ class Balancer {
 			case 'template':
 				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
 			}
+		} elseif ( $token === 'comment' ) {
+			$this->stack->insertComment( $value );
+			return true;
 		}
 		// anything else: just ignore the token
 		return true;
@@ -3320,7 +3404,7 @@ class Balancer {
 	}
 
 	private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
-		if ( $token === 'text' ) {
+		if ( $token === 'text' || $token === 'comment' ) {
 			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
 		} elseif ( $token === 'eof' ) {
 			if ( $this->stack->indexOf( 'template' ) < 0 ) {
diff --git a/tests/phpunit/includes/tidy/BalancerTest.php b/tests/phpunit/includes/tidy/BalancerTest.php
index 213982ad90..aa43ac7605 100644
--- a/tests/phpunit/includes/tidy/BalancerTest.php
+++ b/tests/phpunit/includes/tidy/BalancerTest.php
@@ -15,6 +15,7 @@ class BalancerTest extends MediaWikiTestCase {
 			'strict' => false, /* not strict */
 			'allowedHtmlElements' => null, /* no sanitization */
 			'tidyCompat' => false, /* standard parser */
+			'allowComments' => true, /* comment parsing */
 		] );
 	}
 
@@ -70,9 +71,12 @@ class BalancerTest extends MediaWikiTestCase {
 				// Normalize case of SVG attributes.
 				$html = str_replace( 'foreignObject', 'foreignobject', $html );
 
-				if ( isset( $case['document']['props']['comment'] ) ) {
-					// Skip tests which include HTML comments, which
-					// the balancer requires to have been stripped.
+				if (
+					isset( $case['document']['props']['comment'] ) &&
+					preg_match( ',<!--[^>]*<,', $html )
+				) {
+					// Skip tests which include HTML comments containing
+					// the < character, which we don't support.
 					continue;
 				}
 				if ( strpos( $case['data'], '<![CDATA[' ) !== false ) {
-- 
2.20.1