X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2Ftidy%2FBalancer.php;h=069b460fd8abf5bedcf16944f0e90f9796237899;hb=531ed101ccd14dc7e2cf2858a67b2523ef6a79ff;hp=b2d6ba1bcfa374078769286b71e3e1e75fe42512;hpb=5c1b3768ed022978edb038df14e0f15874eea7b4;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/tidy/Balancer.php b/includes/tidy/Balancer.php index b2d6ba1bcf..069b460fd8 100644 --- a/includes/tidy/Balancer.php +++ b/includes/tidy/Balancer.php @@ -32,31 +32,31 @@ use \IteratorAggregate; use \ReverseArrayIterator; use \Sanitizer; -# A note for future librarization[1] -- this file is a good candidate -# for splitting into an independent library, except that it is currently -# highly optimized for MediaWiki use. It only implements the portions -# of the HTML5 tree builder used by tags supported by MediaWiki, and -# does not contain a true tokenizer pass, instead relying on -# comment stripping, attribute normalization, and escaping done by -# the MediaWiki Sanitizer. It also deliberately avoids building -# a true DOM in memory, instead serializing elements to an output string -# as soon as possible (usually as soon as the tag is closed) to reduce -# its memory footprint. - -# We've been gradually lifting some of these restrictions to handle -# non-sanitized output generated by extensions, but we shortcut the tokenizer -# for speed (primarily by splitting on `<`) and so rely on syntactic -# well-formedness. - -# On the other hand, I've been pretty careful to note with comments in the -# code the places where this implementation omits features of the spec or -# depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to -# implement the missing pieces and make this a standalone PHP HTML5 parser. -# In order to do so, some sort of MediaWiki-specific API will need -# to be added to (a) allow the Balancer to bypass the tokenizer, -# and (b) support on-the-fly flattening instead of DOM node creation. 
- -# [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki +// A note for future librarization[1] -- this file is a good candidate +// for splitting into an independent library, except that it is currently +// highly optimized for MediaWiki use. It only implements the portions +// of the HTML5 tree builder used by tags supported by MediaWiki, and +// does not contain a true tokenizer pass, instead relying on +// comment stripping, attribute normalization, and escaping done by +// the MediaWiki Sanitizer. It also deliberately avoids building +// a true DOM in memory, instead serializing elements to an output string +// as soon as possible (usually as soon as the tag is closed) to reduce +// its memory footprint. + +// We've been gradually lifting some of these restrictions to handle +// non-sanitized output generated by extensions, but we shortcut the tokenizer +// for speed (primarily by splitting on `<`) and so rely on syntactic +// well-formedness. + +// On the other hand, I've been pretty careful to note with comments in the +// code the places where this implementation omits features of the spec or +// depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to +// implement the missing pieces and make this a standalone PHP HTML5 parser. +// In order to do so, some sort of MediaWiki-specific API will need +// to be added to (a) allow the Balancer to bypass the tokenizer, +// and (b) support on-the-fly flattening instead of DOM node creation. + +// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki /** * Utility constants and sets for the HTML5 tree building algorithm. @@ -464,20 +464,23 @@ class BalanceElement { * by the HTML serialization specification, and replace this node * in its parent by that string. * + * @param array $config Balancer configuration; see Balancer::__construct(). 
+ * * @see __toString() */ - public function flatten( $tidyCompat = false ) { + public function flatten( array $config ) { Assert::parameter( $this->parent !== null, '$this', 'must be a child' ); Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' ); $idx = array_search( $this, $this->parent->children, true ); Assert::parameter( $idx !== false, '$this', 'must be a child of its parent' ); + $tidyCompat = $config['tidyCompat']; if ( $tidyCompat ) { $blank = true; foreach ( $this->children as $elt ) { if ( !is_string( $elt ) ) { - $elt = $elt->flatten( $tidyCompat ); + $elt = $elt->flatten( $config ); } if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) { $blank = false; @@ -500,7 +503,7 @@ class BalanceElement { $flat = "{$this}"; } $this->parent->children[$idx] = $flat; - $this->parent = 'flat'; # for assertion checking + $this->parent = 'flat'; // for assertion checking return $flat; } @@ -544,7 +547,7 @@ class BalanceElement { return $out; } - # Utility functions on BalanceElements. + // Utility functions on BalanceElements. /** * Determine if $this represents a specific HTML tag, is a member of @@ -561,7 +564,7 @@ class BalanceElement { return isset( $set[$this->namespaceURI] ) && isset( $set[$this->namespaceURI][$this->localName] ); } else { - # assume this is an HTML element name. + // assume this is an HTML element name. return $this->isHtml() && $this->localName === $set; } } @@ -661,9 +664,11 @@ class BalanceStack implements IteratorAggregate { */ public $fosterParentMode = false; /** - * Tidy compatibility mode, determines behavior of body/blockquote + * Configuration options governing flattening. + * @var array $config + * @see Balancer::__construct() */ - public $tidyCompat = false; + private $config; /** * Reference to the current element */ @@ -672,14 +677,16 @@ class BalanceStack implements IteratorAggregate { /** * Create a new BalanceStack with a single BalanceElement on it, * representing the root <html> node. 
+ * @param array $config Balancer configuration; see Balancer::__construct(). */ - public function __construct() { - # always a root element on the stack + public function __construct( array $config ) { + // always a root element on the stack array_push( $this->elements, new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] ) ); $this->currentNode = $this->elements[0]; + $this->config = $config; } /** @@ -692,7 +699,7 @@ class BalanceStack implements IteratorAggregate { $out = ''; foreach ( $this->elements[0]->children as $elt ) { $out .= is_string( $elt ) ? $elt : - $elt->flatten( $this->tidyCompat ); + $elt->flatten( $this->config ); } return $out; } @@ -719,10 +726,10 @@ class BalanceStack implements IteratorAggregate { ) { $this->fosterParent( $value ); } elseif ( - $this->tidyCompat && !$isComment && + $this->config['tidyCompat'] && !$isComment && $this->currentNode->isA( BalanceSets::$tidyPWrapSet ) ) { - $this->insertHTMLELement( 'mw:p-wrap', [] ); + $this->insertHTMLElement( 'mw:p-wrap', [] ); return $this->insertText( $value ); } else { $this->currentNode->appendChild( $value ); @@ -970,7 +977,7 @@ class BalanceStack implements IteratorAggregate { $this->currentNode = null; } if ( !$elt->isHtmlNamed( 'mw:p-wrap' ) ) { - $elt->flatten( $this->tidyCompat ); + $elt->flatten( $this->config ); } } @@ -980,7 +987,6 @@ class BalanceStack implements IteratorAggregate { * @param int $idx */ public function popTo( $idx ) { - $length = count( $this->elements ); for ( $length = count( $this->elements ); $length > $idx; $length-- ) { $this->pop(); } @@ -1045,7 +1051,7 @@ class BalanceStack implements IteratorAggregate { // otherwise, it will eventually serialize when the parent // is serialized, we just hold onto the memory for its // tree of objects a little longer. 
- $elt->flatten( $this->tidyCompat ); + $elt->flatten( $this->config ); } Assert::postcondition( array_search( $elt, $this->elements, true ) === false, @@ -1069,7 +1075,7 @@ class BalanceStack implements IteratorAggregate { } } - # Fostering and adoption. + // Fostering and adoption. /** * Foster parent the given $elt in the stack of open elements. @@ -1086,7 +1092,7 @@ class BalanceStack implements IteratorAggregate { $parent = $this->elements[$lastTemplate]; } elseif ( $lastTable >= 0 ) { $parent = $this->elements[$lastTable]->parent; - # Assume all tables have parents, since we're not running scripts! + // Assume all tables have parents, since we're not running scripts! Assert::invariant( $parent !== null, "All tables should have parents" ); @@ -1095,7 +1101,7 @@ class BalanceStack implements IteratorAggregate { $parent = $this->elements[0]; // the `html` element. } - if ( $this->tidyCompat ) { + if ( $this->config['tidyCompat'] ) { if ( is_string( $elt ) ) { // We're fostering text: do we need a p-wrapper? if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) { @@ -1151,34 +1157,28 @@ class BalanceStack implements IteratorAggregate { return true; // no more handling required } - // Let outer loop counter be zero. - $outer = 0; - // Outer loop: If outer loop counter is greater than or // equal to eight, then abort these steps. - while ( $outer < 8 ) { - // Increment outer loop counter by one. - $outer++; - + for ( $outer = 0; $outer < 8; $outer++ ) { // Let the formatting element be the last element in the list // of active formatting elements that: is between the end of // the list and the last scope marker in the list, if any, or // the start of the list otherwise, and has the same tag name // as the token. - $fmtelt = $afe->findElementByTag( $tag ); + $fmtElt = $afe->findElementByTag( $tag ); // If there is no such node, then abort these steps and instead // act as described in the "any other end tag" entry below. 
- if ( !$fmtelt ) { + if ( !$fmtElt ) { return false; // false means handle by the default case } // Otherwise, if there is such a node, but that node is not in // the stack of open elements, then this is a parse error; // remove the element from the list, and abort these steps. - $index = $this->indexOf( $fmtelt ); + $index = $this->indexOf( $fmtElt ); if ( $index < 0 ) { - $afe->remove( $fmtelt ); + $afe->remove( $fmtElt ); return true; // true means no more handling required } @@ -1186,7 +1186,7 @@ class BalanceStack implements IteratorAggregate { // the stack of open elements, but the element is not in scope, // then this is a parse error; ignore the token, and abort // these steps. - if ( !$this->inScope( $fmtelt ) ) { + if ( !$this->inScope( $fmtElt ) ) { return true; } @@ -1194,13 +1194,13 @@ class BalanceStack implements IteratorAggregate { // open elements that is lower in the stack than the formatting // element, and is an element in the special category. There // might not be one. - $furthestblock = null; - $furthestblockindex = -1; - $stacklen = $this->length(); - for ( $i = $index+1; $i < $stacklen; $i++ ) { + $furthestBlock = null; + $furthestBlockIndex = -1; + $stackLength = $this->length(); + for ( $i = $index+1; $i < $stackLength; $i++ ) { if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) { - $furthestblock = $this->node( $i ); - $furthestblockindex = $i; + $furthestBlock = $this->node( $i ); + $furthestBlockIndex = $i; break; } } @@ -1211,140 +1211,134 @@ class BalanceStack implements IteratorAggregate { // up to and including the formatting element, and remove the // formatting element from the list of active formatting // elements. - if ( !$furthestblock ) { - $this->popTag( $fmtelt ); - $afe->remove( $fmtelt ); - return true; - } else { - // Let the common ancestor be the element immediately above - // the formatting element in the stack of open elements. 
- $ancestor = $this->node( $index-1 ); - - // Let a bookmark note the position of the formatting - // element in the list of active formatting elements - // relative to the elements on either side of it in the - // list. - $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] ); - $afe->insertAfter( $fmtelt, $BOOKMARK ); - - // Let node and last node be the furthest block. - $node = $furthestblock; - $lastnode = $furthestblock; - $nodeindex = $furthestblockindex; - $isAFE = false; - - // Let inner loop counter be zero. - $inner = 0; - - while ( true ) { - - // Increment inner loop counter by one. - $inner++; - - // Let node be the element immediately above node in - // the stack of open elements, or if node is no longer - // in the stack of open elements (e.g. because it got - // removed by this algorithm), the element that was - // immediately above node in the stack of open elements - // before node was removed. - $node = $this->node( --$nodeindex ); - - // If node is the formatting element, then go - // to the next step in the overall algorithm. - if ( $node === $fmtelt ) break; - - // If the inner loop counter is greater than three and node - // is in the list of active formatting elements, then remove - // node from the list of active formatting elements. - $isAFE = $afe->isInList( $node ); - if ( $inner > 3 && $isAFE ) { - $afe->remove( $node ); - $isAFE = false; - } - - // If node is not in the list of active formatting - // elements, then remove node from the stack of open - // elements and then go back to the step labeled inner - // loop. - if ( !$isAFE ) { - // Don't flatten here, since we're about to relocate - // parts of this $node. 
- $this->removeElement( $node, false ); - continue; - } - - // Create an element for the token for which the - // element node was created with common ancestor as - // the intended parent, replace the entry for node - // in the list of active formatting elements with an - // entry for the new element, replace the entry for - // node in the stack of open elements with an entry for - // the new element, and let node be the new element. - $newelt = new BalanceElement( - $node->namespaceURI, $node->localName, $node->attribs ); - $afe->replace( $node, $newelt ); - $this->replaceAt( $nodeindex, $newelt ); - $node = $newelt; - - // If last node is the furthest block, then move the - // aforementioned bookmark to be immediately after the - // new node in the list of active formatting elements. - if ( $lastnode === $furthestblock ) { - $afe->remove( $BOOKMARK ); - $afe->insertAfter( $newelt, $BOOKMARK ); - } - - // Insert last node into node, first removing it from - // its previous parent node if any. - $node->appendChild( $lastnode ); - - // Let last node be node. - $lastnode = $node; - } - - // If the common ancestor node is a table, tbody, tfoot, - // thead, or tr element, then, foster parent whatever last - // node ended up being in the previous step, first removing - // it from its previous parent node if any. - if ( - $this->fosterParentMode && - $ancestor->isA( BalanceSets::$tableSectionRowSet ) - ) { - $this->fosterParent( $lastnode ); - } else { - // Otherwise, append whatever last node ended up being in - // the previous step to the common ancestor node, first - // removing it from its previous parent node if any. - $ancestor->appendChild( $lastnode ); + if ( !$furthestBlock ) { + $this->popTag( $fmtElt ); + $afe->remove( $fmtElt ); + return true; + } + + // Let the common ancestor be the element immediately above + // the formatting element in the stack of open elements. 
+ $ancestor = $this->node( $index-1 ); + + // Let a bookmark note the position of the formatting + // element in the list of active formatting elements + // relative to the elements on either side of it in the + // list. + $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] ); + $afe->insertAfter( $fmtElt, $BOOKMARK ); + + // Let node and last node be the furthest block. + $node = $furthestBlock; + $lastNode = $furthestBlock; + $nodeIndex = $furthestBlockIndex; + $isAFE = false; + + // Inner loop + for ( $inner = 1; true; $inner++ ) { + // Let node be the element immediately above node in + // the stack of open elements, or if node is no longer + // in the stack of open elements (e.g. because it got + // removed by this algorithm), the element that was + // immediately above node in the stack of open elements + // before node was removed. + $node = $this->node( --$nodeIndex ); + + // If node is the formatting element, then go + // to the next step in the overall algorithm. + if ( $node === $fmtElt ) break; + + // If the inner loop counter is greater than three and node + // is in the list of active formatting elements, then remove + // node from the list of active formatting elements. + $isAFE = $afe->isInList( $node ); + if ( $inner > 3 && $isAFE ) { + $afe->remove( $node ); + $isAFE = false; + } + + // If node is not in the list of active formatting + // elements, then remove node from the stack of open + // elements and then go back to the step labeled inner + // loop. + if ( !$isAFE ) { + // Don't flatten here, since we're about to relocate + // parts of this $node. + $this->removeElement( $node, false ); + continue; } // Create an element for the token for which the - // formatting element was created, with furthest block - // as the intended parent. 
- $newelt2 = new BalanceElement( - $fmtelt->namespaceURI, $fmtelt->localName, $fmtelt->attribs ); + // element node was created with common ancestor as + // the intended parent, replace the entry for node + // in the list of active formatting elements with an + // entry for the new element, replace the entry for + // node in the stack of open elements with an entry for + // the new element, and let node be the new element. + $newElt = new BalanceElement( + $node->namespaceURI, $node->localName, $node->attribs ); + $afe->replace( $node, $newElt ); + $this->replaceAt( $nodeIndex, $newElt ); + $node = $newElt; + + // If last node is the furthest block, then move the + // aforementioned bookmark to be immediately after the + // new node in the list of active formatting elements. + if ( $lastNode === $furthestBlock ) { + $afe->remove( $BOOKMARK ); + $afe->insertAfter( $newElt, $BOOKMARK ); + } + + // Insert last node into node, first removing it from + // its previous parent node if any. + $node->appendChild( $lastNode ); + + // Let last node be node. + $lastNode = $node; + } + + // If the common ancestor node is a table, tbody, tfoot, + // thead, or tr element, then, foster parent whatever last + // node ended up being in the previous step, first removing + // it from its previous parent node if any. + if ( + $this->fosterParentMode && + $ancestor->isA( BalanceSets::$tableSectionRowSet ) + ) { + $this->fosterParent( $lastNode ); + } else { + // Otherwise, append whatever last node ended up being in + // the previous step to the common ancestor node, first + // removing it from its previous parent node if any. + $ancestor->appendChild( $lastNode ); + } - // Take all of the child nodes of the furthest block and - // append them to the element created in the last step. - $newelt2->adoptChildren( $furthestblock ); + // Create an element for the token for which the + // formatting element was created, with furthest block + // as the intended parent. 
+ $newElt2 = new BalanceElement( + $fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs ); - // Append that new element to the furthest block. - $furthestblock->appendChild( $newelt2 ); + // Take all of the child nodes of the furthest block and + // append them to the element created in the last step. + $newElt2->adoptChildren( $furthestBlock ); - // Remove the formatting element from the list of active - // formatting elements, and insert the new element into the - // list of active formatting elements at the position of - // the aforementioned bookmark. - $afe->remove( $fmtelt ); - $afe->replace( $BOOKMARK, $newelt2 ); + // Append that new element to the furthest block. + $furthestBlock->appendChild( $newElt2 ); - // Remove the formatting element from the stack of open - // elements, and insert the new element into the stack of - // open elements immediately below the position of the - // furthest block in that stack. - $this->removeElement( $fmtelt ); - $this->insertAfter( $furthestblock, $newelt2 ); - } + // Remove the formatting element from the list of active + // formatting elements, and insert the new element into the + // list of active formatting elements at the position of + // the aforementioned bookmark. + $afe->remove( $fmtElt ); + $afe->replace( $BOOKMARK, $newElt2 ); + + // Remove the formatting element from the stack of open + // elements, and insert the new element into the stack of + // open elements immediately below the position of the + // furthest block in that stack. 
+ $this->removeElement( $fmtElt ); + $this->insertAfter( $furthestBlock, $newElt2 ); } return true; @@ -1677,11 +1671,11 @@ class BalanceActiveFormattingElements { // Loop backward through the list until we find a marker or an // open element - $foundit = false; + $foundIt = false; while ( $entry->prevAFE ) { $entry = $entry->prevAFE; if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) { - $foundit = true; + $foundIt = true; break; } } @@ -1690,7 +1684,7 @@ class BalanceActiveFormattingElements { // the first element if we didn't find a marker or open element), // recreating formatting elements and pushing them back onto the list // of open elements. - if ( $foundit ) { + if ( $foundIt ) { $entry = $entry->nextAFE; } do { @@ -1771,7 +1765,7 @@ class BalanceActiveFormattingElements { *