self::HTML_NAMESPACE => [
'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
'frame' => true,
- 'plaintext' => true, 'isindex' => true,
+ 'plaintext' => true,
'xmp' => true, 'iframe' => true, 'noembed' => true,
'noscript' => true, 'script' => true,
'title' => true
'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
- 'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
+ 'input' => true, 'li' => true, 'link' => true,
'listing' => true, 'main' => true, 'marquee' => true,
- 'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
+ 'menu' => true, 'meta' => true, 'nav' => true,
'noembed' => true, 'noframes' => true, 'noscript' => true,
'object' => true, 'ol' => true, 'p' => true, 'param' => true,
'plaintext' => true, 'pre' => true, 'script' => true,
public static $impliedEndTagsSet = [
self::HTML_NAMESPACE => [
- 'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
+ 'dd' => true, 'dt' => true, 'li' => true,
+ 'menuitem' => true, 'optgroup' => true,
'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
'rt' => true, 'rtc' => true
]
$this->attribs = [ 'class' => "mw-empty-elt" ];
}
$blank = false;
+ } elseif (
+ $this->isA( BalanceSets::$extraLinefeedSet ) &&
+ count( $this->children ) > 0 &&
+ substr( $this->children[0], 0, 1 ) == "\n"
+ ) {
+ // Double the linefeed after pre/listing/textarea
+ // according to the (old) HTML5 fragment serialization
+ // algorithm (see https://github.com/whatwg/html/issues/944)
+ // to ensure this will round-trip.
+ array_unshift( $this->children, "\n" );
}
$flat = $blank ? '' : "{$this}";
} else {
$out .= "{$elt}";
}
$out .= "</{$this->localName}>";
- if (
- $this->isA( BalanceSets::$extraLinefeedSet ) &&
- $out[$len] === "\n"
- ) {
- // Double the linefeed after pre/listing/textarea
- // according to the HTML5 fragment serialization algorithm.
- $out = substr( $out, 0, $len + 1 ) .
- substr( $out, $len );
- }
} else {
$out = "<{$this->localName}{$encAttribs} />";
Assert::invariant(
private $noahTableStack = [ [] ];
public function __destruct() {
+ $next = null;
for ( $node = $this->head; $node; $node = $next ) {
$next = $node->nextAFE;
$node->prevAFE = $node->nextAFE = $node->nextNoah = null;
* and escaped.
* - All null characters are assumed to have been removed.
* - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- * <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
+ * <frame>, <plaintext>, <xmp>, <iframe>,
* <noembed>, <noscript>, <script>, <title>. As a result,
* further simplifications can be made:
* - `frameset-ok` is not tracked.
* Regex borrowed from Tim Starling's "remex-html" project.
*/
const VALID_COMMENT_REGEX = "~ !--
- ( # 1. Comment match detector
+ ( # 1. Comment match detector
> | -> | # Invalid short close
( # 2. Comment contents
(?:
( # 3. Comment close
--> | # Normal close
--!> | # Comment end bang
- ( # 4. Indicate matches requiring EOF
- --! | # EOF in comment end bang state
- -- | # EOF in comment end state
- - | # EOF in comment end dash state
- # EOF in comment state
+ ( # 4. Indicate matches requiring EOF
+ --! | # EOF in comment end bang state
+ -- | # EOF in comment end state
+ - | # EOF in comment end dash state
+ (?#nothing) # EOF in comment state
)
)
)
- ([^<]*) \z # 5. Non-tag text after the comment
+ ([^<]*) \z # 5. Non-tag text after the comment
~xs";
/**
* provide historical compatibility with the old "tidy"
* program: <p>-wrapping is done to the children of
* <body> and <blockquote> elements, and empty elements
- * are removed.
+ * are removed. The <pre>/<listing>/<textarea> serialization
+ * is also tweaked to allow lossless round trips.
+ * (See: https://github.com/whatwg/html/issues/944)
* 'allowComments': boolean, defaults to true.
* When true, allows HTML comments in the input.
* The Sanitizer generally strips all comments, so if you
$bad = array_uintersect_assoc(
$this->allowedHtmlElements,
BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE],
- function( $a, $b ) {
+ function ( $a, $b ) {
// Ignore the values (just intersect the keys) by saying
// all values are equal to each other.
return 0;
// Some hoops we have to jump through
$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
+ // The spec calls this the "tree construction dispatcher".
$isForeign = true;
if (
$this->stack->length() === 0 ||
if ( $token === 'text' ) {
$this->stack->insertText( $value );
return true;
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
} elseif ( $token === 'tag' ) {
switch ( $value ) {
case 'font':
case 'header':
case 'hgroup':
case 'main':
- case 'menu':
case 'nav':
case 'ol':
case 'p':
$this->stack->insertHTMLElement( $value, $attribs );
return true;
+ case 'menu':
+ if ( $this->stack->inButtonScope( "p" ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
case 'h1':
case 'h2':
case 'h3':
// (hence we don't need to examine the tag's "type" attribute)
return true;
- case 'menuitem':
case 'param':
case 'source':
case 'track':
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
// warts!
return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
- // OMITTED: <isindex>
-
case 'textarea':
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
$this->stack->insertHTMLElement( $value, $attribs );
return true;
+ case 'menuitem':
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
case 'rb':
case 'rtc':
if ( $this->stack->inScope( 'ruby' ) ) {