self::HTML_NAMESPACE => [
'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
'frame' => true,
- 'plaintext' => true, 'isindex' => true,
+ 'plaintext' => true,
'xmp' => true, 'iframe' => true, 'noembed' => true,
'noscript' => true, 'script' => true,
'title' => true
'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
- 'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
+ 'input' => true, 'li' => true, 'link' => true,
'listing' => true, 'main' => true, 'marquee' => true,
- 'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
+ 'menu' => true, 'meta' => true, 'nav' => true,
'noembed' => true, 'noframes' => true, 'noscript' => true,
'object' => true, 'ol' => true, 'p' => true, 'param' => true,
'plaintext' => true, 'pre' => true, 'script' => true,
public static $impliedEndTagsSet = [
self::HTML_NAMESPACE => [
- 'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
+ 'dd' => true, 'dt' => true, 'li' => true,
+ 'menuitem' => true, 'optgroup' => true,
'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
'rt' => true, 'rtc' => true
]
/**
* Parent of this element, or the string "flat" if this element has
* already been flattened into its parent.
- * @var string|null $parent
+ * @var BalanceElement|string|null $parent
*/
public $parent;
* child will be an actual BalanceElement object; the rest will
* be strings, representing either text nodes or flattened
* BalanceElement objects.
- * @var array $children
+ * @var BalanceElement[]|string[] $children
*/
public $children;
* in its parent by that string.
*
* @param array $config Balancer configuration; see Balancer::__construct().
+ * @return string
*
* @see __toString()
*/
$this->attribs = [ 'class' => "mw-empty-elt" ];
}
$blank = false;
+ } elseif (
+ $this->isA( BalanceSets::$extraLinefeedSet ) &&
+ count( $this->children ) > 0 &&
+ substr( $this->children[0], 0, 1 ) == "\n"
+ ) {
+ // Double the linefeed after pre/listing/textarea
+ // according to the (old) HTML5 fragment serialization
+ // algorithm (see https://github.com/whatwg/html/issues/944)
+ // to ensure this will round-trip.
+ array_unshift( $this->children, "\n" );
}
$flat = $blank ? '' : "{$this}";
} else {
$out .= "{$elt}";
}
$out .= "</{$this->localName}>";
- if (
- $this->isA( BalanceSets::$extraLinefeedSet ) &&
- $out[$len] === "\n"
- ) {
- // Double the linefeed after pre/listing/textarea
- // according to the HTML5 fragment serialization algorithm.
- $out = substr( $out, 0, $len + 1 ) .
- substr( $out, $len );
- }
} else {
$out = "<{$this->localName}{$encAttribs} />";
Assert::invariant(
class BalanceStack implements IteratorAggregate {
/**
* Backing storage for the stack.
- * @var array $elements
+ * @var BalanceElement[] $elements
*/
private $elements = [];
/**
/**
* Insert text at the appropriate place for inserting a node.
* @param string $value
+ * @param bool $isComment
* @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
*/
public function insertText( $value, $isComment = false ) {
/**
* Return an iterator over this stack which visits the current node
* first, and the root node last.
- * @return Iterator
+ * @return \Iterator
*/
public function getIterator() {
return new ReverseArrayIterator( $this->elements );
/**
* Foster parent the given $elt in the stack of open elements.
* @param BalanceElement|string $elt
+ * @return BalanceElement|string
+ *
* @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
*/
private function fosterParent( $elt ) {
private $noahTableStack = [ [] ];
public function __destruct() {
+ $next = null;
for ( $node = $this->head; $node; $node = $next ) {
$next = $node->nextAFE;
$node->prevAFE = $node->nextAFE = $node->nextNoah = null;
/**
* Determine whether an element is in the list of formatting elements.
+ * @param BalanceElement $elt
* @return boolean
*/
public function isInList( BalanceElement $elt ) {
/**
* Find the element $elt in the list and remove it.
* Used when parsing <a> in body mode.
+ *
+ * @param BalanceElement $elt
*/
public function remove( BalanceElement $elt ) {
if ( $this->head !== $elt && !$elt->prevAFE ) {
/**
* Find element $a in the list and replace it with element $b
+ *
+ * @param BalanceElement $a
+ * @param BalanceElement $b
*/
public function replace( BalanceElement $a, BalanceElement $b ) {
if ( $this->head !== $a && !$a->prevAFE ) {
/**
* Find $a in the list and insert $b after it.
+ *
+ * @param BalanceElement $a
+ * @param BalanceElement $b
*/
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
if ( $this->head !== $a && !$a->prevAFE ) {
* and escaped.
* - All null characters are assumed to have been removed.
* - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- * <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
+ * <frame>, <plaintext>, <xmp>, <iframe>,
* <noembed>, <noscript>, <script>, <title>. As a result,
* further simplifications can be made:
* - `frameset-ok` is not tracked.
*/
class Balancer {
private $parseMode;
+ /** @var \Iterator */
private $bitsIterator;
private $allowedHtmlElements;
+ /** @var BalanceActiveFormattingElements */
private $afe;
+ /** @var BalanceStack */
private $stack;
private $strict;
private $allowComments;
private $inRCDATA;
private $inRAWTEXT;
+ /** @var callable|null */
+ private $processingCallback;
+ /** @var array */
+ private $processingArgs;
+
/**
* Valid HTML5 comments.
* Regex borrowed from Tim Starling's "remex-html" project.
*/
const VALID_COMMENT_REGEX = "~ !--
- ( # 1. Comment match detector
+ ( # 1. Comment match detector
> | -> | # Invalid short close
( # 2. Comment contents
(?:
( # 3. Comment close
--> | # Normal close
--!> | # Comment end bang
- ( # 4. Indicate matches requiring EOF
- --! | # EOF in comment end bang state
- -- | # EOF in comment end state
- - | # EOF in comment end dash state
- # EOF in comment state
+ ( # 4. Indicate matches requiring EOF
+ --! | # EOF in comment end bang state
+ -- | # EOF in comment end state
+ - | # EOF in comment end dash state
+ (?#nothing) # EOF in comment state
)
)
)
- ([^<]*) \z # 5. Non-tag text after the comment
+ ([^<]*) \z # 5. Non-tag text after the comment
~xs";
/**
* provide historical compatibility with the old "tidy"
* program: <p>-wrapping is done to the children of
* <body> and <blockquote> elements, and empty elements
- * are removed.
+ * are removed. The <pre>/<listing>/<textarea> serialization
+ * is also tweaked to allow lossless round trips.
+ * (See: https://github.com/whatwg/html/issues/944)
* 'allowComments': boolean, defaults to true.
* When true, allows HTML comments in the input.
* The Sanitizer generally strips all comments, so if you
$bad = array_uintersect_assoc(
$this->allowedHtmlElements,
BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE],
- function( $a, $b ) {
+ function ( $a, $b ) {
// Ignore the values (just intersect the keys) by saying
// all values are equal to each other.
return 0;
// Some hoops we have to jump through
$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
+ // The spec calls this the "tree construction dispatcher".
$isForeign = true;
if (
$this->stack->length() === 0 ||
if ( $token === 'text' ) {
$this->stack->insertText( $value );
return true;
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
} elseif ( $token === 'tag' ) {
switch ( $value ) {
case 'font':
case 'header':
case 'hgroup':
case 'main':
- case 'menu':
case 'nav':
case 'ol':
case 'p':
$this->stack->insertHTMLElement( $value, $attribs );
return true;
+ case 'menu':
+ if ( $this->stack->inButtonScope( "p" ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
case 'h1':
case 'h2':
case 'h3':
case 'tt':
case 'u':
$this->afe->reconstruct( $this->stack );
- $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
+ $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
return true;
case 'nobr':
$this->inBodyMode( 'endtag', 'nobr' );
$this->afe->reconstruct( $this->stack );
}
- $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
+ $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
return true;
case 'applet':
// (hence we don't need to examine the tag's "type" attribute)
return true;
- case 'menuitem':
case 'param':
case 'source':
case 'track':
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
// warts!
return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
- // OMITTED: <isindex>
-
case 'textarea':
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
$this->stack->insertHTMLElement( $value, $attribs );
return true;
+ case 'menuitem':
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
case 'rb':
case 'rtc':
if ( $this->stack->inScope( 'ruby' ) ) {