Fix changes list misaligned arrow
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 // A note for future librarization[1] -- this file is a good candidate
36 // for splitting into an independent library, except that it is currently
37 // highly optimized for MediaWiki use. It only implements the portions
38 // of the HTML5 tree builder used by tags supported by MediaWiki, and
39 // does not contain a true tokenizer pass, instead relying on
40 // comment stripping, attribute normalization, and escaping done by
41 // the MediaWiki Sanitizer. It also deliberately avoids building
42 // a true DOM in memory, instead serializing elements to an output string
43 // as soon as possible (usually as soon as the tag is closed) to reduce
44 // its memory footprint.
45
46 // We've been gradually lifting some of these restrictions to handle
47 // non-sanitized output generated by extensions, but we shortcut the tokenizer
48 // for speed (primarily by splitting on `<`) and so rely on syntactic
49 // well-formedness.
50
51 // On the other hand, I've been pretty careful to note with comments in the
52 // code the places where this implementation omits features of the spec or
53 // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
54 // implement the missing pieces and make this a standalone PHP HTML5 parser.
55 // In order to do so, some sort of MediaWiki-specific API will need
56 // to be added to (a) allow the Balancer to bypass the tokenizer,
57 // and (b) support on-the-fly flattening instead of DOM node creation.
58
59 // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
60
/**
 * Namespace URIs and tag-name lookup tables for the HTML5 tree building
 * algorithm.
 *
 * Every table is a two-level associative array: the outer key is a
 * namespace URI, the inner key is a lower-cased tag name, and the value
 * is always true.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/** HTML tags flagged as unsupported (the consumers live outside this class). */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'frame' => true,
			'plaintext' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'script' => true,
			'title' => true
		]
	];

	/** HTML tags treated as empty elements (no children, no end tag). */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/** HTML tags whose leading linefeed gets special treatment when serialized. */
	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true,
			'textarea' => true,
			'listing' => true,
		]
	];

	/** The six HTML heading tags. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true
		]
	];

	/**
	 * Tags in the "special" category, per namespace.
	 * NOTE(review): presumably mirrors the HTML5 spec's "special" list --
	 * confirm against the spec before editing.
	 */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'li' => true, 'link' => true, 'listing' => true, 'main' => true,
			'marquee' => true, 'menu' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true, 'object' => true,
			'ol' => true, 'p' => true, 'param' => true, 'plaintext' => true,
			'pre' => true, 'script' => true, 'section' => true, 'select' => true,
			'source' => true, 'style' => true, 'summary' => true, 'table' => true,
			'tbody' => true, 'td' => true, 'template' => true, 'textarea' => true,
			'tfoot' => true, 'th' => true, 'thead' => true, 'title' => true,
			'tr' => true, 'track' => true, 'ul' => true, 'wbr' => true,
			'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** The HTML tags address, div and p. */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/** The HTML table tag together with its section and row tags. */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/** HTML tags closed by the ordinary "generate implied end tags" step. */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'menuitem' => true,
			'optgroup' => true, 'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/** HTML tags closed when implied end tags are generated "thoroughly". */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true,
			'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	/** The two HTML table cell tags. */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/** Tags delimiting a "table context". */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/** Tags delimiting a "table body context". */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/** Tags delimiting a "table row context". */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Form-associated elements.
	 * @see https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	 */
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true,
			'keygen' => true, 'object' => true, 'output' => true,
			'select' => true, 'textarea' => true, 'img' => true
		]
	];

	/**
	 * Scope-boundary tags for the plain "in scope" test. The list-item and
	 * button variants returned by the accessors below extend this table.
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Cache for inListItemScopeSet(); null until first requested. */
	private static $inListItemScopeSet = null;

	/**
	 * Get the "list item scope" table: $inScopeSet plus the HTML tags
	 * ol and ul. Built on first use and cached afterwards.
	 *
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		// Arrays are copied by value, so $inScopeSet itself is not modified.
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['ol'] = true;
		$scope[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $scope;
		return self::$inListItemScopeSet;
	}

	/** Cache for inButtonScopeSet(); null until first requested. */
	private static $inButtonScopeSet = null;

	/**
	 * Get the "button scope" table: $inScopeSet plus the HTML tag button.
	 * Built on first use and cached afterwards.
	 *
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		// Arrays are copied by value, so $inScopeSet itself is not modified.
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $scope;
		return self::$inButtonScopeSet;
	}

	/** Scope-boundary tags for the "in table scope" test. */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/**
	 * Tags that do NOT end "select scope"; the scope set proper is the
	 * complement of this table, hence "inverted".
	 */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true
		]
	];

	/** MathML tags that are text integration points. */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true
		]
	];

	/** SVG tags that are HTML integration points. */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	/** Tags whose direct text content gets paragraph-wrapped, for tidy compatibility. */
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the element
			// at the bottom of the stack is really <html>. Using the
			// "adjusted current node" everywhere would also work, but
			// listing <html> here is simpler.
			'html' => true,
		],
	];

	/** Tags regarded as inline for tidy-compatible paragraph wrapping. */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true,
			'applet' => true, 'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true, 'br' => true,
			'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true, 'label' => true,
			'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true,
			'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true,
			'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true,
			'sub' => true, 'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true, 'var' => true,
		],
	];
}
299
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Set to the empty string by the constructor and not read anywhere in
	 * this class. Declared explicitly so the constructor does not create a
	 * dynamic property.
	 * NOTE(review): possibly vestigial -- confirm there are no external
	 * readers before removing it.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element, or the string "flat" if this element has
	 * already been flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var BalanceElement[]|string[] $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes. The new element
	 * has no parent and no children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element and clear its parent link.
	 * @param BalanceElement $elt Must currently be a child of $this.
	 */
	private function removeChild( BalanceElement $elt ) {
		if ( $this->parent === 'flat' ) {
			// The message serializes $this (and its whole subtree), so it
			// is only built once the precondition is known to have failed.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it. If $b is
	 * an element that already has a parent, it is detached from that
	 * parent first.
	 * @param BalanceElement $a Existing child of $this.
	 * @param BalanceElement|string $b Element or text to insert.
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( !is_string( $b ) ) {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				$b->parent->removeChild( $b );
				// If $b was an earlier sibling of $a under this same parent,
				// detaching it shifted $a one slot to the left, so the
				// position has to be looked up again. When $a and $b are
				// the same element the lookup fails and the original
				// position is still the right one.
				$shifted = array_search( $a, $this->children, true );
				if ( $shifted !== false ) {
					$idx = $shifted;
				}
			}
			$b->parent = $this;
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
	}

	/**
	 * Append $elt to the end of the list of children. If $elt is an
	 * element that already has a parent, it is detached from that parent
	 * first.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Same guard as insertBefore(): a flattened element has the string
		// 'flat' as its parent, which cannot be asked to remove a child.
		Assert::parameter( $elt->parent !== 'flat', '$elt', "Can't be flat" );
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this, leaving $elt with
	 * no children.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations: with the parent link cleared,
				// appendChild() skips removeChild(), and the donor's child
				// list is emptied wholesale below.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string. Afterwards $this->parent is the
	 * string 'flat'.
	 *
	 * When $config['tidyCompat'] is set, remaining element children are
	 * flattened first, an 'mw:p-wrap' element is renamed to 'p' (or
	 * dropped entirely if it contains only whitespace), blank tr/li
	 * elements without attributes get the 'mw-empty-elt' class, and the
	 * leading linefeed of pre/textarea/listing content is doubled.
	 *
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 * @return string The serialized form which replaced this node.
	 *
	 * @see __toString()
	 */
	public function flatten( array $config ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		$tidyCompat = $config['tidyCompat'];
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					// Also replaces the child by its string form inside
					// $this->children.
					$elt = $elt->flatten( $config );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				// $blank is left alone here, so a whitespace-only wrapper
				// serializes to the empty string below.
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Blank elements other than the p-wrapper are still emitted.
				$blank = false;
			} elseif (
				$this->isA( BalanceSets::$extraLinefeedSet ) &&
				count( $this->children ) > 0 &&
				substr( $this->children[0], 0, 1 ) === "\n"
			) {
				// Double the linefeed after pre/listing/textarea
				// according to the (old) HTML5 fragment serialization
				// algorithm (see https://github.com/whatwg/html/issues/944)
				// to ensure this will round-trip.
				array_unshift( $this->children, "\n" );
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; // for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * Attribute values are passed through Sanitizer::encodeAttribute();
	 * attribute names are emitted unchanged. Members of
	 * BalanceSets::$emptyElementSet are written as a single self-closed
	 * tag and must not have children.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// Children that are still elements are serialized through their
			// own __toString() by the string interpolation.
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	// Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			// Identity comparison, not structural equality.
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			// assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification: either a member of
	 * BalanceSets::$htmlIntegrationPointSet, or a MathML annotation-xml
	 * element whose 'encoding' attribute is (case-insensitively)
	 * 'text/html' or 'application/xhtml+xml'.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm: the serialization of
	 * the namespace, the tag name, and the attributes sorted by name. The
	 * key is computed once and cached, and $this->attribs keeps its order.
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			// Sort a copy so that attribute order does not affect the key.
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
641
642 /**
643 * The "stack of open elements" as defined in the HTML5 tree builder
644 * spec. This contains methods to ensure that content (start tags, text)
645 * is inserted at the correct place in the output string, and to
646 * flatten BalanceElements as they are closed to avoid holding onto
647 * a complete DOM tree for the document in memory.
648 *
649 * The stack defines a PHP iterator to traverse it in "reverse order",
650 * that is, the most-recently-added element is visited first in a
651 * foreach loop.
652 *
653 * @ingroup Parser
654 * @since 1.27
655 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
656 */
657 class BalanceStack implements IteratorAggregate {
658 /**
659 * Backing storage for the stack.
660 * @var BalanceElement[] $elements
661 */
662 private $elements = [];
663 /**
664 * Foster parent mode determines how nodes are inserted into the
665 * stack.
666 * @var bool $fosterParentMode
667 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
668 */
669 public $fosterParentMode = false;
670 /**
671 * Configuration options governing flattening.
672 * @var array $config
673 * @see Balancer::__construct()
674 */
675 private $config;
676 /**
677 * Reference to the current element
678 */
679 public $currentNode;
680
/**
 * Set up a stack of open elements holding exactly one entry: a fresh
 * root &lt;html&gt; element, which also becomes the current node.
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
	// The root <html> element always sits at the bottom of the stack.
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
	$this->config = $config;
}
695
/**
 * Build the tree builder's output: the concatenation of all children of
 * the root &lt;html&gt; node, without the enclosing html tags. Children
 * that are still elements are flattened in the process.
 * @return string
 */
public function getOutput() {
	$root = $this->elements[0];
	$html = '';
	foreach ( $root->children as $child ) {
		if ( is_string( $child ) ) {
			$html .= $child;
		} else {
			$html .= $child->flatten( $this->config );
		}
	}
	return $html;
}
710
/**
 * Insert a comment at the appropriate place for inserting a node.
 * The comment is handled as text wrapped in comment delimiters and
 * flagged as a comment, which exempts it from tidy p-wrapping.
 * @param string $value Content of the comment.
 * @return null Whatever insertText() returns; it yields no value.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	$markup = '<!--' . $value . '-->';
	return $this->insertText( $markup, true );
}
721
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * In foster parent mode with a table/section/row as current node the
 * text is handed to fosterParent(). Otherwise, in tidy-compatible mode,
 * non-comment text landing directly in a member of
 * BalanceSets::$tidyPWrapSet first gets an 'mw:p-wrap' element opened
 * around it. In every other case the text is appended to the current
 * node.
 *
 * @param string $value
 * @param bool $isComment True if $value is a serialized comment.
 * @return null No value is returned.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	$target = $this->currentNode;
	if (
		$this->fosterParentMode &&
		$target->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
		return;
	}
	$wrapInParagraph = $this->config['tidyCompat'] && !$isComment &&
		$target->isA( BalanceSets::$tidyPWrapSet );
	if ( !$wrapInParagraph ) {
		$target->appendChild( $value );
		return;
	}
	// Open the wrapper, then retry with the wrapper as the current node.
	$this->insertHTMLElement( 'mw:p-wrap', [] );
	return $this->insertText( $value );
}
745
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in array form (the
 *   BalanceElement constructor requires an array).
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$elt = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $elt );
}
760
/**
 * Create an element in the HTML namespace, insert it at the appropriate
 * place, and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNs = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNs, $tag, $attribs );
}
774
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an 'mw:p-wrap' wrapper and $elt is not in
 * BalanceSets::$tidyInlineSet, the wrapper is popped first. In foster
 * parent mode with a table/section/row as current node the element is
 * placed by fosterParent(); otherwise it is appended to the current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element that was pushed (the return value
 *   of fosterParent() when foster parenting was used).
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The failure messages serialize $elt, so they are only built once an
	// invariant is known to be violated rather than on every insertion.
	if ( $elt->parent === null ) {
		Assert::invariant( false, "$elt must be in tree" );
	}
	if ( $elt->parent === 'flat' ) {
		Assert::invariant( false, "$elt must not have been previous flattened" );
	}
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
804
/**
 * Check whether the stack has $tag in scope, with
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
814
/**
 * Check whether the stack has $tag in button scope, with
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
824
/**
 * Check whether the stack has $tag in list item scope, with
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
834
/**
 * Check whether the stack has $tag in table scope, with
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
844
/**
 * Check whether the stack has $tag in select scope.
 *
 * Select scope is bounded by every element *except* those in
 * BalanceSets::$inInvertedSelectScopeSet, so inSpecificScope() cannot
 * be reused and the walk is done by hand.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	$passThrough = BalanceSets::$inInvertedSelectScopeSet;
	// Iterating $this visits the current node first.
	foreach ( $this as $openElt ) {
		if ( $openElt->isA( $tag ) ) {
			return true;
		}
		if ( $openElt->isA( $passThrough ) ) {
			continue;
		}
		// Anything else ends the scope.
		return false;
	}
	return false;
}
864
/**
 * Check whether the stack has $tag in the scope bounded by $set.
 *
 * The stack is walked from the current node towards the root. The walk
 * succeeds at the first element matching $tag and fails at the first
 * element matching $set; an element matching both counts as a success.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	foreach ( $this as $openElt ) {
		$isTarget = $openElt->isA( $tag );
		if ( $isTarget || $openElt->isA( $set ) ) {
			return $isTarget;
		}
	}
	return false;
}
883
/**
 * Generate implied end tags: keep popping the current node for as long
 * as it belongs to the implied-end-tag set and is not the excluded tag.
 * @param string|null $butnot HTML tag name that must not be popped, or
 *   null for no exclusion.
 * @param bool $thorough True to use
 *   BalanceSets::$thoroughImpliedEndTagsSet instead of
 *   BalanceSets::$impliedEndTagsSet.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$implied = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$implied = BalanceSets::$impliedEndTagsSet;
	}
	while (
		$this->currentNode &&
		!( $butnot !== null && $this->currentNode->isHtmlNamed( $butnot ) ) &&
		$this->currentNode->isA( $implied )
	) {
		$this->pop();
	}
}
904
/**
 * Return the adjusted current node: $fragmentContext when one is given
 * and only a single element is on the stack, otherwise the current node.
 * @param mixed $fragmentContext The fragment context, or a falsy value
 *   if there is none. NOTE(review): presumably a BalanceElement, like
 *   the current node -- confirm against callers.
 * @return mixed Either $fragmentContext or $this->currentNode.
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
914
/**
 * Get an iterator over the stack that yields the current node first
 * and the root node last.
 * @return \Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
923
/**
 * Look up the BalanceElement stored at stack position $idx; position 0
 * is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$entry = $this->elements[ $idx ];
	return $entry;
}
933
/**
 * Put $elt at position $idx of the BalanceStack in place of the element
 * currently stored there. If that position is the top of the stack,
 * $elt also becomes the current node. Neither element may already have
 * been flattened.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$outgoing = $this->elements[$idx];
	Assert::precondition(
		$outgoing->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$topIdx = count( $this->elements ) - 1;
	if ( $idx === $topIdx ) {
		$this->currentNode = $elt;
	}
}
953
/**
 * Find the stack position of the topmost entry matching the given
 * BalanceElement, set, or HTML tag name string.
 * @param BalanceElement|array|string $tag
 * @return int The position (0 is the root), or -1 if nothing matches.
 */
public function indexOf( $tag ) {
	// Search from the top of the stack down to the root.
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
968
/**
 * Count the elements that are currently on the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
976
/**
 * Take the current node off the BalanceStack and flatten it, unless it
 * is a `mw:p-wrap` element, which is left unflattened. The current node
 * becomes the new top of the stack, or null once the stack is empty.
 */
public function pop() {
	$popped = array_pop( $this->elements );

	$remaining = count( $this->elements );
	$this->currentNode = $remaining ? $this->elements[ $remaining - 1 ] : null;

	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->config );
}
992
/**
 * Pop every node from the top of the BalanceStack down to and including
 * the one at position $idx, flattening them in the process.
 * @param int $idx
 */
public function popTo( $idx ) {
	// Number of entries sitting at position $idx or above it.
	$excess = count( $this->elements ) - $idx;
	while ( $excess-- > 0 ) {
		$this->pop();
	}
}
1003
/**
 * Pop elements off the stack until the first element with the given
 * HTML tag name (or matching the given set) has itself been popped.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Test before popping: pop() replaces the current node.
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			return;
		}
	}
}
1019
/**
 * Pop elements off the stack until the current node is in the given
 * set; that matching element itself is *not* popped.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// The bottom entry (the <html> element) is never popped, so at most
	// count - 1 entries may be removed.
	$poppable = count( $this->elements ) - 1;
	while ( $poppable > 0 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$poppable--;
	}
}
1034
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process.
 *
 * If $elt was the topmost entry, the current node is moved to the new
 * top of the stack, or set to null when the stack is left empty.
 *
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );

	$remaining = count( $this->elements );
	if ( $idx === $remaining ) {
		// The removed element was the top of the stack. Mirror pop():
		// fall back to null rather than reading offset -1 when nothing
		// is left.
		$this->currentNode = $remaining > 0 ?
			$this->elements[$remaining - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->config );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
1070
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * If $a is the topmost entry, $b is pushed and becomes the current
 * node; otherwise $b is spliced in directly above $a.
 *
 * @param BalanceElement $a Element that must already be in the stack.
 * @param BalanceElement $b Element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() reports a miss as -1 (never false), so that is the
	// value the assertion has to reject.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1086
1087 // Fostering and adoption.
1088
/**
 * Foster parent the given $elt (an element or a text string) using the
 * stack of open elements to pick the insertion point.
 *
 * In 'tidyCompat' mode, fostered text whose target is in the p-wrap set
 * is inserted through a new `mw:p-wrap` element instead, and a fostered
 * `mw:p-wrap` element is dropped in favour of an existing `mw:p-wrap`
 * that already sits immediately before the insertion point.
 *
 * @param BalanceElement|string $elt
 * @return BalanceElement|string The node that ended up being used:
 *   $elt itself, or the re-used p-wrapper.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	// Element that $elt must be inserted in front of, if any.
	$sibling = null;

	// A template wins when it exists and is nearer the top of the
	// stack than any table.
	$templateWins = $templateIdx >= 0 &&
		( $tableIdx < 0 || $templateIdx > $tableIdx );
	if ( $templateWins ) {
		$target = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$sibling = $this->elements[$tableIdx];
		$target = $sibling->parent;
		// Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$target !== null, "All tables should have parents"
		);
	} else {
		$target = $this->elements[0]; // the `html` element.
	}

	if ( $this->config['tidyCompat'] ) {
		if ( is_string( $elt ) ) {
			// Fostered text: wrap it in a p-wrapper when the target asks for one.
			if ( $target->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: merge with one directly before the
			// insertion point, if there is one.
			$pos = $sibling ?
				array_search( $sibling, $target->children, true ) :
				count( $target->children );
			$preceding = $pos > 0 ? $target->children[$pos - 1] : '';
			if (
				$preceding instanceof BalanceElement &&
				$preceding->isHtmlNamed( 'mw:p-wrap' )
			) {
				return $preceding; // Re-use existing p-wrapper.
			}
		}
	}

	if ( $sibling ) {
		$target->insertBefore( $sibling, $elt );
	} else {
		$target->appendChild( $elt );
	}
	return $elt;
}
1147
1148 /**
1149 * Run the "adoption agency algorithm" (AAA) for the given subject
1150 * tag name.
1151 * @param string $tag The subject tag name.
1152 * @param BalanceActiveFormattingElements $afe The current
1153 * active formatting elements list.
1154 * @return bool True if the adoption agency algorithm "did something", false
1155 * if more processing is required by the caller.
1156 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
1157 */
1158 public function adoptionAgency( $tag, $afe ) {
1159 // If the current node is an HTML element whose tag name is subject,
1160 // and the current node is not in the list of active formatting
1161 // elements, then pop the current node off the stack of open
1162 // elements and abort these steps.
1163 if (
1164 $this->currentNode->isHtmlNamed( $tag ) &&
1165 !$afe->isInList( $this->currentNode )
1166 ) {
1167 $this->pop();
1168 return true; // no more handling required
1169 }
1170
1171 // Outer loop: If outer loop counter is greater than or
1172 // equal to eight, then abort these steps.
1173 for ( $outer = 0; $outer < 8; $outer++ ) {
1174 // Let the formatting element be the last element in the list
1175 // of active formatting elements that: is between the end of
1176 // the list and the last scope marker in the list, if any, or
1177 // the start of the list otherwise, and has the same tag name
1178 // as the token.
1179 $fmtElt = $afe->findElementByTag( $tag );
1180
1181 // If there is no such node, then abort these steps and instead
1182 // act as described in the "any other end tag" entry below.
1183 if ( !$fmtElt ) {
1184 return false; // false means handle by the default case
1185 }
1186
1187 // Otherwise, if there is such a node, but that node is not in
1188 // the stack of open elements, then this is a parse error;
1189 // remove the element from the list, and abort these steps.
1190 $index = $this->indexOf( $fmtElt );
1191 if ( $index < 0 ) {
1192 $afe->remove( $fmtElt );
1193 return true; // true means no more handling required
1194 }
1195
1196 // Otherwise, if there is such a node, and that node is also in
1197 // the stack of open elements, but the element is not in scope,
1198 // then this is a parse error; ignore the token, and abort
1199 // these steps.
1200 if ( !$this->inScope( $fmtElt ) ) {
1201 return true;
1202 }
1203
1204 // Let the furthest block be the topmost node in the stack of
1205 // open elements that is lower in the stack than the formatting
1206 // element, and is an element in the special category. There
1207 // might not be one.
1208 $furthestBlock = null;
1209 $furthestBlockIndex = -1;
1210 $stackLength = $this->length();
1211 for ( $i = $index + 1; $i < $stackLength; $i++ ) {
1212 if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
1213 $furthestBlock = $this->node( $i );
1214 $furthestBlockIndex = $i;
1215 break;
1216 }
1217 }
1218
1219 // If there is no furthest block, then the UA must skip the
1220 // subsequent steps and instead just pop all the nodes from the
1221 // bottom of the stack of open elements, from the current node
1222 // up to and including the formatting element, and remove the
1223 // formatting element from the list of active formatting
1224 // elements.
1225 if ( !$furthestBlock ) {
1226 $this->popTag( $fmtElt );
1227 $afe->remove( $fmtElt );
1228 return true;
1229 }
1230
1231 // Let the common ancestor be the element immediately above
1232 // the formatting element in the stack of open elements.
1233 $ancestor = $this->node( $index - 1 );
1234
1235 // Let a bookmark note the position of the formatting
1236 // element in the list of active formatting elements
1237 // relative to the elements on either side of it in the
1238 // list.
1239 $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] ); // placeholder entry; replaced by the new element near the end of this iteration
1240 $afe->insertAfter( $fmtElt, $BOOKMARK );
1241
1242 // Let node and last node be the furthest block.
1243 $node = $furthestBlock;
1244 $lastNode = $furthestBlock;
1245 $nodeIndex = $furthestBlockIndex;
1246 $isAFE = false;
1247
1248 // Inner loop
1249 for ( $inner = 1; true; $inner++ ) {
1250 // Let node be the element immediately above node in
1251 // the stack of open elements, or if node is no longer
1252 // in the stack of open elements (e.g. because it got
1253 // removed by this algorithm), the element that was
1254 // immediately above node in the stack of open elements
1255 // before node was removed.
1256 $node = $this->node( --$nodeIndex ); // one position closer to the root (position 0)
1257
1258 // If node is the formatting element, then go
1259 // to the next step in the overall algorithm.
1260 if ( $node === $fmtElt ) break; // exits the inner loop only
1261
1262 // If the inner loop counter is greater than three and node
1263 // is in the list of active formatting elements, then remove
1264 // node from the list of active formatting elements.
1265 $isAFE = $afe->isInList( $node );
1266 if ( $inner > 3 && $isAFE ) {
1267 $afe->remove( $node );
1268 $isAFE = false;
1269 }
1270
1271 // If node is not in the list of active formatting
1272 // elements, then remove node from the stack of open
1273 // elements and then go back to the step labeled inner
1274 // loop.
1275 if ( !$isAFE ) {
1276 // Don't flatten here, since we're about to relocate
1277 // parts of this $node.
1278 $this->removeElement( $node, false );
1279 continue;
1280 }
1281
1282 // Create an element for the token for which the
1283 // element node was created with common ancestor as
1284 // the intended parent, replace the entry for node
1285 // in the list of active formatting elements with an
1286 // entry for the new element, replace the entry for
1287 // node in the stack of open elements with an entry for
1288 // the new element, and let node be the new element.
1289 $newElt = new BalanceElement(
1290 $node->namespaceURI, $node->localName, $node->attribs );
1291 $afe->replace( $node, $newElt );
1292 $this->replaceAt( $nodeIndex, $newElt );
1293 $node = $newElt;
1294
1295 // If last node is the furthest block, then move the
1296 // aforementioned bookmark to be immediately after the
1297 // new node in the list of active formatting elements.
1298 if ( $lastNode === $furthestBlock ) {
1299 $afe->remove( $BOOKMARK );
1300 $afe->insertAfter( $newElt, $BOOKMARK );
1301 }
1302
1303 // Insert last node into node, first removing it from
1304 // its previous parent node if any.
1305 $node->appendChild( $lastNode );
1306
1307 // Let last node be node.
1308 $lastNode = $node;
1309 }
1310
1311 // If the common ancestor node is a table, tbody, tfoot,
1312 // thead, or tr element, then, foster parent whatever last
1313 // node ended up being in the previous step, first removing
1314 // it from its previous parent node if any.
1315 if (
1316 $this->fosterParentMode &&
1317 $ancestor->isA( BalanceSets::$tableSectionRowSet )
1318 ) {
1319 $this->fosterParent( $lastNode );
1320 } else {
1321 // Otherwise, append whatever last node ended up being in
1322 // the previous step to the common ancestor node, first
1323 // removing it from its previous parent node if any.
1324 $ancestor->appendChild( $lastNode );
1325 }
1326
1327 // Create an element for the token for which the
1328 // formatting element was created, with furthest block
1329 // as the intended parent.
1330 $newElt2 = new BalanceElement(
1331 $fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );
1332
1333 // Take all of the child nodes of the furthest block and
1334 // append them to the element created in the last step.
1335 $newElt2->adoptChildren( $furthestBlock );
1336
1337 // Append that new element to the furthest block.
1338 $furthestBlock->appendChild( $newElt2 );
1339
1340 // Remove the formatting element from the list of active
1341 // formatting elements, and insert the new element into the
1342 // list of active formatting elements at the position of
1343 // the aforementioned bookmark.
1344 $afe->remove( $fmtElt );
1345 $afe->replace( $BOOKMARK, $newElt2 );
1346
1347 // Remove the formatting element from the stack of open
1348 // elements, and insert the new element into the stack of
1349 // open elements immediately below the position of the
1350 // furthest block in that stack.
1351 $this->removeElement( $fmtElt );
1352 $this->insertAfter( $furthestBlock, $newElt2 );
1353 }
1354
1355 return true;
1356 }
1357
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of the elements, root first, separated
 * by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the legacy (array, separator) order is
	// deprecated in PHP 7.4 and rejected by PHP 8.
	return implode( ' ', $names );
}
1370 }
1371
/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** Next entry (toward the tail) in the active formatting elements list, or null. */
	public $nextAFE;

	/** Previous entry (toward the head) in the active formatting elements list, or null. */
	public $prevAFE;

	/**
	 * Declared because BalanceActiveFormattingElements::__destruct() assigns
	 * null to this property on every list node, markers included; without
	 * the declaration that assignment creates a dynamic property.
	 */
	public $nextNoah;
}
1382
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is a doubly-linked list threaded through the $prevAFE and
 * $nextAFE properties of its entries (BalanceElement and BalanceMarker
 * objects). Elements that share a "Noah key" are additionally chained
 * through $nextNoah.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry of the list from its neighbours and drop the
	 * head, tail and Noah table references held by this object.
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = $node->nextNoah = null;
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the end of the list and open a fresh, empty
	 * Noah table for the segment that follows it.
	 */
	public function insertMarker() {
		$this->appendNode( new BalanceMarker );
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the
		// earliest one (the head of the bucket).
		$noahKey = $elt->getNoahKey();
		$top = count( $this->noahTableStack ) - 1;
		if ( isset( $this->noahTableStack[$top][$noahKey] ) ) {
			$oldest = $this->noahTableStack[$top][$noahKey];
			$copies = 0;
			for ( $member = $oldest; $member; $member = $member->nextNoah ) {
				$copies++;
			}
			if ( $copies >= 3 ) {
				$this->remove( $oldest );
			}
		}
		$this->addToNoahList( $elt );

		// Add to the main AFE list
		$this->appendNode( $elt );
	}

	/**
	 * Link $node in as the new last entry of the main list.
	 * @param BalanceElement|BalanceMarker $node
	 */
	private function appendNode( $node ) {
		if ( $this->tail ) {
			$this->tail->nextAFE = $node;
			$node->prevAFE = $this->tail;
		} else {
			$this->head = $node;
		}
		$this->tail = $node;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The matching element, or null if none.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element counts as listed when it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 *
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the bucket for its Noah key in the Noah table of the
	 * current (last) list segment, creating the bucket if necessary.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from the bucket for its Noah key in the Noah table of
	 * the current (last) list segment. Does nothing if that table has no
	 * bucket for the key or the bucket does not contain $elt.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment: nothing to
			// unlink, and nothing to dereference.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt heads the bucket: promote its successor or drop the bucket.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
			return;
		}
		// Otherwise look for the member whose successor is $elt.
		while ( $noahElt->nextNoah ) {
			if ( $noahElt->nextNoah === $elt ) {
				// Found it, unlink
				$noahElt->nextNoah = $elt->nextNoah;
				$elt->nextNoah = null;
				return;
			}
			$noahElt = $noahElt->nextNoah;
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging.
	 * One line is emitted per entry, head first; broken back-links and a
	 * wrong tail pointer are reported inline.
	 * @return string
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1750
1751 /**
1752 * An implementation of the tree building portion of the HTML5 parsing
1753 * spec.
1754 *
1755 * This is used to balance and tidy output so that the result can
1756 * always be cleanly serialized/deserialized by an HTML5 parser. It
1757 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1758 * a number of constraints which are not enforced by the HTML5 parsing
1759 * process. But the result will be free of gross errors: misnested or
1760 * unclosed tags, for example, and will be unchanged by spec-compliant
1761 * parsing followed by serialization.
1762 *
1763 * The tree building stage is structured as a state machine.
1764 * When comparing the implementation to
1765 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1766 * note that each state is implemented as a function with a
1767 * name ending in `Mode` (because the HTML spec refers to them
1768 * as insertion modes). The current insertion mode is held by
1769 * the $parseMode property.
1770 *
1771 * The following simplifications have been made:
1772 * - We handle body content only (ie, we start `in body`.)
1773 * - The document is never in "quirks mode".
1774 * - All occurrences of < and > have been entity escaped, so we
1775 * can parse tags by simply splitting on those two characters.
1776 * (This also simplifies the handling of < inside <textarea>.)
1777 * The character < must not appear inside comments.
1778 * Similarly, all attributes have been "cleaned" and are double-quoted
1779 * and escaped.
1780 * - All null characters are assumed to have been removed.
1781 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1782 * <frame>, <plaintext>, <xmp>, <iframe>,
1783 * <noembed>, <noscript>, <script>, <title>. As a result,
1784 * further simplifications can be made:
1785 * - `frameset-ok` is not tracked.
1786 * - `head element pointer` is not tracked (but presumed non-null)
1787 * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
1788 * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
1789 *
1790 * We generally mark places where we omit cases from the spec due to
1791 * disallowed elements with a comment: `// OMITTED: <element-name>`.
1792 *
1793 * The HTML spec keeps a flag during the parsing process to track
1794 * whether or not a "parse error" has been encountered. We don't
1795 * bother to track that flag, we just implement the error-handling
1796 * process as specified.
1797 *
1798 * @ingroup Parser
1799 * @since 1.27
1800 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1801 */
1802 class Balancer {
1803 private $parseMode;
1804 /** @var \Iterator */
1805 private $bitsIterator;
1806 private $allowedHtmlElements;
1807 /** @var BalanceActiveFormattingElements */
1808 private $afe;
1809 /** @var BalanceStack */
1810 private $stack;
1811 private $strict;
1812 private $allowComments;
1813 private $config;
1814
1815 private $textIntegrationMode;
1816 private $pendingTableText;
1817 private $originalInsertionMode;
1818 private $fragmentContext;
1819 private $formElementPointer;
1820 private $ignoreLinefeed;
1821 private $inRCDATA;
1822 private $inRAWTEXT;
1823
1824 /** @var callable|null */
1825 private $processingCallback;
1826 /** @var array */
1827 private $processingArgs;
1828
1829 /**
1830 * Valid HTML5 comments.
1831 * Regex borrowed from Tim Starling's "remex-html" project.
1832 */
1833 const VALID_COMMENT_REGEX = "~ !--
1834 ( # 1. Comment match detector
1835 > | -> | # Invalid short close
1836 ( # 2. Comment contents
1837 (?:
1838 (?! --> )
1839 (?! --!> )
1840 (?! --! \z )
1841 (?! -- \z )
1842 (?! - \z )
1843 .
1844 )*+
1845 )
1846 ( # 3. Comment close
1847 --> | # Normal close
1848 --!> | # Comment end bang
1849 ( # 4. Indicate matches requiring EOF
1850 --! | # EOF in comment end bang state
1851 -- | # EOF in comment end state
1852 - | # EOF in comment end dash state
1853 (?#nothing) # EOF in comment state
1854 )
1855 )
1856 )
1857 ([^<]*) \z # 5. Non-tag text after the comment
1858 ~xs";
1859
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.  The <pre>/<listing>/<textarea> serialization
 *         is also tweaked to allow lossless round trips.
 *         (See: https://github.com/whatwg/html/issues/944)
 *     'allowComments': boolean, defaults to true.
 *         When true, allows HTML comments in the input.
 *         The Sanitizer generally strips all comments, so if you
 *         are running on sanitized output you can set this to
 *         false to get a bit more performance.
 * @throws ParameterAssertionException if 'allowedHtmlElements' names an
 *   element listed in the HTML-namespace unsupported set.
 */
public function __construct( array $config = [] ) {
	// Caller-supplied keys win; the right-hand array only fills gaps.
	$this->config = $config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check!  Only the keys (tag names) matter, so intersect
		// by key and ignore the values.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy (array, separator) order is
			// deprecated in PHP 7.4 and rejected by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1917
/**
 * Balance the HTML fragment given by $text and return the result,
 * subject to the caveats listed in the class description.  The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * The input is split on '<'; the first piece is always plain text and
 * every following piece is handed to advance() as a potential tag.
 *
 * @param string $text The markup to be balanced
 * @param callable $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Fresh per-run parser state.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack( $this->config );
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	// All tokenizer-emulation flags start out cleared.
	$this->inRAWTEXT = false;
	$this->inRCDATA = false;
	$this->ignoreLinefeed = false;
	$this->textIntegrationMode = false;

	// The stack is constructed with an <html> element already on it.
	// Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// The form element pointer is the nearest <form> at or above the
	// context element, if there is one.
	$this->formElementPointer = null;
	$ancestor = $this->fragmentContext;
	while ( $ancestor != null ) {
		if ( $ancestor->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $ancestor;
			break;
		}
		$ancestor = $ancestor->parent;
	}

	// Everything before the first '<' is text, never a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Each remaining piece begins right after a '<'.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Drop references to the parse structures before returning so that
	// their memory can be reclaimed.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;

	return $balanced;
}
1974
/**
 * Pass a token to the tree builder.
 *
 * Callers in this class pass $token as one of the strings "tag",
 * "endtag", "text", "comment" or "eof".  The token is routed either to
 * insertForeignToken() or to the handler method named by the current
 * parse mode (the spec's "tree construction dispatcher").
 *
 * @param string $token Token type
 * @param string|null $value Tag name, text, or comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a tag token was written as self-closing
 * @return bool|null False when the tag is in the unsupported set (and
 *   strict mode is off), true when an empty text token is skipped,
 *   otherwise whatever the selected handler returns.
 */
private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $value === '' ) {
			// Don't actually inject the empty string as a text token.
			return true;
		}
	} elseif (
		( $token === 'tag' || $token === 'endtag' ) &&
		isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] )
	) {
		// As described in "simplifications" above, these tags are
		// not supported in the balancer.
		Assert::invariant(
			!$this->strict,
			"Unsupported $token <$value> found."
		);
		return false;
	}

	// Support pre/listing/textarea by suppressing initial linefeed.
	// The flag is consumed by the very next token, whatever it is.
	if ( $this->ignoreLinefeed ) {
		$this->ignoreLinefeed = false;
		if ( $token === 'text' && $value[0] === "\n" ) {
			if ( $value === "\n" ) {
				// Nothing would be left, don't inject the empty string.
				return true;
			}
			$value = substr( $value, 1 );
		}
	}

	// Some hoops we have to jump through
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	// The spec calls this the "tree construction dispatcher": decide
	// whether the token is handled by the HTML rules or the foreign ones.
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$isForeign = false;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and every start tag except <mglyph>/<malignmark>,
		// is handled by the HTML rules here.
		$useHtmlRules = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
		$isForeign = !$useHtmlRules;
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$isForeign = false;
	} elseif (
		$adjusted->isHtmlIntegrationPoint() &&
		( $token === 'tag' || $token === 'text' )
	) {
		$isForeign = false;
	} else {
		$isForeign = true;
	}

	if ( $isForeign ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
	}
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfClose );
}
2047
/**
 * Handle a token using the rules for parsing tokens in foreign content.
 * insertToken() routes tokens here when its dispatcher decides the
 * adjusted current node calls for foreign handling.
 *
 * @param string $token Token type ("text", "comment", "tag" or "endtag")
 * @param string|null $value Tag name, text, or comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a tag token was written as self-closing
 * @return bool|null True when the token was consumed here, or the result
 *   of reprocessing it with the HTML rules.  Token types not listed
 *   above fall off the end of the function (null).
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'font':
				// Per the HTML spec, <font> breaks out of foreign content
				// only when it carries a color, face or size attribute.
				// A bare <font> is an ordinary foreign start tag.
				if (
					!isset( $attribs['color'] ) &&
					!isset( $attribs['face'] ) &&
					!isset( $attribs['size'] )
				) {
					break;
				}
				// otherwise, fall through
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
				if ( $this->fragmentContext ) {
					// Fragment case: treat as "any other start tag" below.
					break;
				}
				// Pop at least one element, then keep popping until the
				// current node is one where HTML parsing can resume.
				while ( true ) {
					$this->stack->pop();
					$node = $this->stack->currentNode;
					if (
						$node->isMathmlTextIntegrationPoint() ||
						$node->isHtmlIntegrationPoint() ||
						$node->isHtml()
					) {
						break;
					}
				}
				// Reprocess the start tag from the new position.
				return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// "Any other start tag": the new element takes the namespace of
		// the adjusted current node.
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfClose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfClose );
			} elseif ( $i === 0 ) {
				// Reached the bottom of the stack without a match:
				// the end tag is ignored.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
2152
/**
 * Grab the next "token" from $bitsIterator.  This is either a open/close
 * tag or text or a comment, depending on whether the Sanitizer approves.
 *
 * Each piece from the iterator is the text that followed a '<' in the
 * input.  A piece that is not accepted as a comment or tag is emitted
 * as text with the '<' restored: escaped as '&lt;' normally, verbatim
 * while in RAWTEXT mode.
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	// Handle comments.  These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		// (with PREG_OFFSET_CAPTURE a group that did not participate in
		// the match reports a negative offset)
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		// Narrow the verdict rather than recomputing it from $t, so a tag
		// already rejected by the RCDATA/RAWTEXT checks above stays
		// rejected even when its name is on the allowed list.
		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The attribute string is passed by reference so the callback
			// can rewrite it in place.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		// The tree builder may still refuse the tag (e.g. unsupported
		// elements), in which case it is serialized as text below.
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodTag ) {
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2249
/**
 * Make $mode the current parse mode.
 *
 * @param string $mode Name of the handler method; must end in "Mode"
 * @return string The parse mode that was in effect before the switch
 */
private function switchMode( $mode ) {
	$previous = $this->parseMode;
	$hasModeSuffix = substr( $mode, -4 ) === 'Mode';
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );
	$this->parseMode = $mode;
	return $previous;
}
2258
/**
 * Switch to parse mode $mode, then feed the given token through
 * insertToken() again so that it is handled under the new mode.
 *
 * @param string $mode Name of the handler method; must end in "Mode"
 * @param string $token Token type
 * @param string|null $value Tag name, text, or comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a tag token was written as self-closing
 * @return bool|null Whatever insertToken() returns for the token
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$outcome = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $outcome;
}
2263
/**
 * Choose the parse mode that matches the current stack of open elements.
 *
 * The stack is scanned from the current node downwards.  Iteration keys
 * are positions counted from the bottom of the stack, so key 0 is the
 * last element visited; when a fragment context is set, it stands in
 * for that bottom element.
 */
private function resetInsertionMode() {
	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			switch ( $node->localName ) {
				case 'select':
					// Look at the ancestors of this <select>: the elements
					// strictly below position $i, nearest first, stopping
					// before the bottom element.  A <table> found before
					// any <template> means the select is inside a table.
					// (The previous index arithmetic treated $i as a
					// distance from the top of the stack, so the wrong
					// elements -- or none at all -- were examined.)
					foreach ( $this->stack as $j => $ancestor ) {
						if ( $j >= $i ) {
							continue;
						}
						if ( $j === 0 ) {
							break;
						}
						if ( $ancestor->isHtmlNamed( 'template' ) ) {
							break;
						}
						if ( $ancestor->isHtmlNamed( 'table' ) ) {
							$this->switchMode( 'inSelectInTableMode' );
							return;
						}
					}
					$this->switchMode( 'inSelectMode' );
					return;
				case 'tr':
					$this->switchMode( 'inRowMode' );
					return;
				case 'tbody':
				case 'tfoot':
				case 'thead':
					$this->switchMode( 'inTableBodyMode' );
					return;
				case 'caption':
					$this->switchMode( 'inCaptionMode' );
					return;
				case 'colgroup':
					$this->switchMode( 'inColumnGroupMode' );
					return;
				case 'table':
					$this->switchMode( 'inTableMode' );
					return;
				case 'template':
					// Resume the most recently pushed template mode.
					$this->switchMode(
						array_slice( $this->templateInsertionModes, -1 )[0]
					);
					return;
				case 'body':
					$this->switchMode( 'inBodyMode' );
					return;
				// OMITTED: <frameset>
				// OMITTED: <html>
				// OMITTED: <head>
				default:
					if ( !$last ) {
						// OMITTED: <head>
						if ( $node->isA( BalanceSets::$tableCellSet ) ) {
							$this->switchMode( 'inCellMode' );
							return;
						}
					}
			}
		}
		if ( $last ) {
			// Nothing on the stack selected a mode.
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2333
/**
 * Finish the parse.
 *
 * Most of the spec's "stop parsing" steps are inapplicable, other than
 * step 2: "pop all the nodes off the stack of open elements".  We keep
 * the top-most <html> element on the stack, though.
 */
private function stopParsing() {
	// Release the AFE list before popping.  Otherwise the element objects
	// would stay live during serialization, potentially using O(N^2)
	// memory.  Popping the stack never results in reconstructing the
	// active formatting elements, so the list is no longer needed.
	$this->afe = null;

	$keep = 1;
	$this->stack->popTo( $keep );
}
2346
/**
 * Open a raw-text element: insert it, record its name in $inRAWTEXT,
 * and switch to 'inTextMode' while remembering the mode to return to.
 *
 * @param string $value Tag name of the element
 * @param array|null $attribs Attributes of the element
 * @return bool Always true
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	$this->inRAWTEXT = $value;
	$previousMode = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $previousMode;
	return true;
}
2353
/**
 * Handler for the 'inTextMode' parse mode.
 *
 * Text is appended to the current node.  An end tag or EOF pops the
 * current node and restores the saved original insertion mode; EOF is
 * then reprocessed under that mode.  Any other token is ignored.
 *
 * @param string $token Token type
 * @param string|null $value Tag name, text, or comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a tag token was written as self-closing
 * @return bool|null True, or for EOF the result of reprocessing it
 */
private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token !== 'eof' && $token !== 'endtag' ) {
		// Nothing to do for other token types.
		return true;
	}

	// Both remaining cases close the element that is being read.
	$this->stack->pop();
	if ( $token === 'eof' ) {
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfClose
		);
	}
	$this->switchMode( $this->originalInsertionMode );
	return true;
}
2370
/**
 * Handler implementing the "in head" rules.  Other handlers in this
 * class call it directly for the tags that those rules process.
 *
 * Tokens not consumed here are followed by a synthetic </head> and are
 * then reprocessed through insertToken().
 *
 * @param string $token Token type
 * @param string|null $value Tag name, text, or comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a tag token was written as self-closing
 * @return bool|null True when consumed, else the reprocessing result
 */
private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted here; anything after it is
		// handled at the bottom of the function.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$whitespace = $matches[0];
			$this->stack->insertText( $whitespace );
			$value = substr( $value, strlen( $whitespace ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'meta':
				// OMITTED: in a full HTML parser, this might change the encoding.
				// falls through
			// OMITTED: <html>
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
				// Inserted and immediately popped again.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;
			// OMITTED: <title>
			// OMITTED: <noscript>
			case 'noframes':
			case 'style':
				return $this->parseRawText( $value, $attribs );
			// OMITTED: <script>
			case 'template':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				// OMITTED: frameset_ok
				$this->switchMode( 'inTemplateMode' );
				$this->templateInsertionModes[] = $this->parseMode;
				return true;
			// OMITTED: <head>
		}
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
			// OMITTED: <head>
			// OMITTED: <body>
			// OMITTED: <html>
			case 'br':
				break; // handle at the bottom of the function
			case 'template':
				if ( $this->stack->indexOf( $value ) < 0 ) {
					return true; // Ignore the token.
				}
				$this->stack->generateImpliedEndTags( null, true /* thorough */ );
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				array_pop( $this->templateInsertionModes );
				$this->resetInsertionMode();
				return true;
			default:
				// ignore any other end tag
				return true;
		}
	}

	// Anything that reaches this point was not handled above:
	// act as if a </head> had been seen, then redo the token.
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
2440
/**
 * Handler for the 'inBodyMode' parse mode.
 *
 * Start and end tags are delegated to inBodyStartTag() and
 * inBodyEndTag(); text, comments and EOF are handled inline.  Any other
 * token type trips an invariant assertion.
 *
 * @param string $token Token type
 * @param string|null $value Tag name, text, or comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a tag token was written as self-closing
 * @return bool|null True when consumed, or the result of the handler the
 *   token was forwarded to
 */
private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		return $this->inBodyStartTag( $value, $attribs, $selfClose );
	}
	if ( $token === 'endtag' ) {
		return $this->inBodyEndTag( $value, $attribs, $selfClose );
	}
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		// An open <template> gets to see EOF first.
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
		}
		$this->stopParsing();
		return true;
	}
	Assert::invariant( false, "Bad token type: $token" );
}

/**
 * If a <p> element is open in button scope, close it by processing a
 * </p> end tag with the in-body rules.
 */
private function closePInButtonScope() {
	if ( $this->stack->inButtonScope( 'p' ) ) {
		$this->inBodyMode( 'endtag', 'p' );
	}
}

/**
 * In-body handling of a start tag.
 *
 * @param string $value Tag name
 * @param array|null $attribs Attributes of the tag
 * @param bool $selfClose Whether the tag was written as self-closing
 * @return bool|null True when consumed, or the result of the handler the
 *   tag was forwarded to
 */
private function inBodyStartTag( $value, $attribs, $selfClose ) {
	switch ( $value ) {
		// OMITTED: <html>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		// OMITTED: <script>
		case 'style':
		case 'template':
		// OMITTED: <title>
			// These are processed by the in-head rules.
			return $this->inHeadMode( 'tag', $value, $attribs, $selfClose );
		// OMITTED: <body>
		// OMITTED: <frameset>

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
			$this->closePInButtonScope();
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'menu':
			$this->closePInButtonScope();
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			$this->closePInButtonScope();
			// A heading directly inside another heading closes the
			// outer one first.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'pre':
		case 'listing':
			$this->closePInButtonScope();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->ignoreLinefeed = true;
			// OMITTED: frameset_ok
			return true;

		case 'form':
			if (
				$this->formElementPointer &&
				$this->stack->indexOf( 'template' ) < 0
			) {
				return true; // in a form, not in a template.
			}
			$this->closePInButtonScope();
			$formElement = $this->stack->insertHTMLElement( $value, $attribs );
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$this->formElementPointer = $formElement;
			}
			return true;

		case 'li':
			// OMITTED: frameset_ok
			// Close an open <li>, unless a special element other than
			// address/div/p is found on the stack first.
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'li' ) ) {
					$this->inBodyMode( 'endtag', 'li' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			$this->closePInButtonScope();
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'dd':
		case 'dt':
			// OMITTED: frameset_ok
			// Same idea as <li>, for an open <dd> or <dt>.
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'dd' ) ) {
					$this->inBodyMode( 'endtag', 'dd' );
					break;
				}
				if ( $node->isHtmlNamed( 'dt' ) ) {
					$this->inBodyMode( 'endtag', 'dt' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			$this->closePInButtonScope();
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		// OMITTED: <plaintext>

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Close the open button, then redo this start tag.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( 'tag', $value, $attribs, $selfClose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$openAnchor = $this->afe->findElementByTag( 'a' );
			if ( $openAnchor ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $openAnchor ) ) {
					$this->afe->remove( $openAnchor );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $openAnchor, false );
				}
			}
			// Falls through
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			$this->afe->reconstruct( $this->stack );
			$formatting = $this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->push( $formatting );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$formatting = $this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->push( $formatting );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			return true;

		case 'table':
			// The document is never in "quirks mode"; see simplifications
			// above.
			$this->closePInButtonScope();
			$this->stack->insertHTMLElement( $value, $attribs );
			// OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
		case 'input':
			// Inserted and immediately popped again.
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			// OMITTED: frameset_ok
			// (hence we don't need to examine <input>'s "type" attribute)
			return true;

		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'hr':
			$this->closePInButtonScope();
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			// warts!  <image> is handled as if it had been <img>.
			return $this->inBodyMode( 'tag', 'img', $attribs, $selfClose );

		case 'textarea':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->ignoreLinefeed = true;
			$this->inRCDATA = $value; // emulate rcdata tokenizer mode
			// OMITTED: frameset_ok
			return true;

		// OMITTED: <xmp>
		// OMITTED: <iframe>
		// OMITTED: <noembed>
		// OMITTED: <noscript>

		case 'select':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			// The select mode depends on whether we are currently in
			// one of the table-related parse modes.
			$tableModes = [
				'inTableMode',
				'inCaptionMode',
				'inTableBodyMode',
				'inRowMode',
				'inCellMode',
			];
			$withinTable = in_array( $this->parseMode, $tableModes, true );
			$this->switchMode( $withinTable ? 'inSelectInTableMode' : 'inSelectMode' );
			return true;

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'menuitem':
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags( 'rtc' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
			$this->afe->reconstruct( $this->stack );
			// We skip the spec's "adjust MathML attributes" and
			// "adjust foreign attributes" steps, since the browser will
			// do this later when it parses the output and it doesn't affect
			// balancing.
			$this->stack->insertForeignElement(
				BalanceSets::MATHML_NAMESPACE, $value, $attribs
			);
			if ( $selfClose ) {
				// emit explicit </math> tag.
				$this->stack->pop();
			}
			return true;

		case 'svg':
			$this->afe->reconstruct( $this->stack );
			// We skip the spec's "adjust SVG attributes" and
			// "adjust foreign attributes" steps, since the browser will
			// do this later when it parses the output and it doesn't affect
			// balancing.
			$this->stack->insertForeignElement(
				BalanceSets::SVG_NAMESPACE, $value, $attribs
			);
			if ( $selfClose ) {
				// emit explicit </svg> tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
	}

	// Handle any other start tag here
	$this->afe->reconstruct( $this->stack );
	$this->stack->insertHTMLElement( $value, $attribs );
	return true;
}

/**
 * In-body handling of an end tag.
 *
 * @param string $value Tag name
 * @param array|null $attribs Attributes passed along with the token
 * @param bool $selfClose Self-closing flag passed along with the token
 * @return bool|null True when consumed or ignored, or the result of the
 *   handler the tag was forwarded to
 */
private function inBodyEndTag( $value, $attribs, $selfClose ) {
	switch ( $value ) {
		// </body>,</html> are unsupported.

		case 'template':
			return $this->inHeadMode( 'endtag', $value, $attribs, $selfClose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Ignore if there is not a matching open tag
			if ( !$this->stack->inScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			return true;

		case 'form':
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				// Outside a template the form pointer decides which
				// element is closed, and the pointer is always cleared.
				$openForm = $this->formElementPointer;
				$this->formElementPointer = null;
				if ( !$openForm || !$this->stack->inScope( $openForm ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				// Don't flatten yet if we're removing a <form> element
				// out-of-order. (eg. `<form><div></form>`)
				$flatten = ( $this->stack->currentNode === $openForm );
				$this->stack->removeElement( $openForm, $flatten );
			} else {
				if ( !$this->stack->inScope( 'form' ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( 'form' );
			}
			return true;

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No <p> to close: open an empty one, then redo the
				// end tag so that it closes it.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( 'endtag', $value, $attribs, $selfClose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
			if ( !$this->stack->inListItemScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'dd':
		case 'dt':
			if ( !$this->stack->inScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			// Any open heading in scope is closed, whatever its level.
			if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$headingSet );
			return true;

		case 'sarcasm':
			// Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; // If we did something, we're done.
			}
			break; // Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( !$this->stack->inScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			return true;

		case 'br':
			// Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
	}

	// Any other end tag goes here: close the nearest open element with
	// this name, unless a special element is encountered first.
	foreach ( $this->stack as $i => $node ) {
		if ( $node->isHtmlNamed( $value ) ) {
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTo( $i ); // including $i
			break;
		} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
			return true; // ignore this close token.
		}
	}
	return true;
}
2973
/**
 * Handle one token while the parser is in the "in table" insertion mode.
 *
 * Table-structure tags are handled directly. Everything that is not
 * matched below is handed to inBodyMode() with the stack's
 * fosterParentMode flag switched on for the duration of the call.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or 'comment'
 * @param string $value Tag name for 'tag'/'endtag' tokens; the content
 *   for 'text' and 'comment' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched to the handlers this method
 *   delegates to
 * @return mixed True when the token was dealt with here, otherwise the
 *   result of the handler the token was forwarded to
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		}
		if ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Start with an empty text buffer and remember the mode that
			// inTableTextMode has to switch back to afterwards.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess(
				'inTableTextMode', $token, $value, $attribs, $selfClose
			);
		}
		// Any other text ends up in the "anything else" code below.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
				$this->afe->insertMarker();
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCaptionMode' );
				return true;

			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				// Same steps for all four; only the follow-up mode differs.
				$nextMode = ( $value === 'colgroup' )
					? 'inColumnGroupMode'
					: 'inTableBodyMode';
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( $nextMode );
				return true;

			case 'col':
			case 'td':
			case 'th':
			case 'tr':
				// Act as if the missing parent start tag had been seen,
				// then process the real token again.
				$impliedParent = ( $value === 'col' ) ? 'colgroup' : 'tbody';
				$this->inTableMode( 'tag', $impliedParent, [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );

			case 'table':
				if ( $this->stack->inTableScope( $value ) ) {
					// Close the open table first, then reprocess this tag.
					$this->inTableMode( 'endtag', $value );
					return $this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true; // No table in table scope: drop the tag.

			case 'style':
			// OMITTED: <script>
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );

			case 'input':
				if (
					isset( $attribs['type'] ) &&
					strcasecmp( $attribs['type'], 'hidden' ) === 0
				) {
					// A hidden input is inserted and popped again at once.
					$this->stack->insertHTMLElement( $value, $attribs );
					$this->stack->pop();
					return true;
				}
				break; // Every other input is "anything else".

			case 'form':
				if (
					!$this->formElementPointer &&
					$this->stack->indexOf( 'template' ) < 0
				) {
					$form = $this->stack->insertHTMLElement( $value, $attribs );
					$this->formElementPointer = $form;
					$this->stack->popTag( $form );
				}
				// Otherwise the token is dropped.
				return true;
		}
		// Unlisted start tags continue with "anything else".
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'table':
				if ( $this->stack->inTableScope( $value ) ) {
					$this->stack->popTag( $value );
					$this->resetInsertionMode();
				}
				return true;

			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				return true; // These end tags are dropped.

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Unlisted end tags continue with "anything else".
	}

	// "Anything else": in-body rules with the foster parenting flag set.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
3083
/**
 * Handle one token while text inside a table is being collected.
 *
 * Text tokens are appended to $this->pendingTableText. The first token of
 * any other type flushes that buffer, restores the insertion mode saved in
 * $this->originalInsertionMode and is then processed again in that mode.
 *
 * @param string $token Token type
 * @param string $value Text content for 'text' tokens; otherwise passed on
 *   unchanged when the token is reprocessed
 * @param array|null $attribs Passed on unchanged when the token is reprocessed
 * @param bool $selfClose Passed on unchanged when the token is reprocessed
 * @return mixed True for text tokens, otherwise the result of
 *   switchModeAndReprocess()
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: take the buffered text and empty the buffer.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';

	if ( !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered ) ) {
		// Nothing but whitespace was collected; insert it as plain text.
		$this->stack->insertText( $buffered );
	} else {
		// Same treatment as the "anything else" case of inTableMode.
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
3105
/**
 * Helper for inCaptionMode: close the open caption, if there is one.
 *
 * @return bool False when no 'caption' is in table scope (nothing is
 *   changed); true after the caption has been popped and the mode has
 *   been switched to inTableMode
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
3117
/**
 * Handle one token while the parser is in the "in caption" insertion mode.
 *
 * Table-structure start tags and the </table> end tag close the caption
 * through endCaption() and, if that succeeded, are processed again.
 * Tokens not listed here are handled by inBodyMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with here, otherwise the
 *   result of inBodyMode()
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Becomes true for tokens that implicitly end the caption and then
	// have to be processed again.
	$closesCaption = false;

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				$closesCaption = true;
				break;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'caption':
				$this->endCaption();
				return true;

			case 'table':
				$closesCaption = true;
				break;

			case 'body':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				return true; // These end tags are dropped.
		}
	}

	if ( $closesCaption ) {
		if ( $this->endCaption() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	// "Anything else" uses the in-body rules.
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
3164
/**
 * Handle one token while the parser is in the "in column group"
 * insertion mode.
 *
 * Tokens that are not handled explicitly close the current colgroup (when
 * the current node is one) and are then processed again; if the current
 * node is not a colgroup they are dropped.
 *
 * @param string $token Token type
 * @param string $value Tag name, text or comment content, depending on $token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with here, otherwise the
 *   result of the handler the token was forwarded to
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted right away; only the remainder
		// of the text (if any) continues to "anything else".
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // Nothing left to handle.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			// OMITTED: <html>
			case 'col':
				// Inserted and popped again immediately.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'colgroup':
				if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
					$this->stack->pop();
					$this->switchMode( 'inTableMode' );
				}
				// Otherwise the end tag is dropped.
				return true;

			case 'col':
				return true; // Dropped.

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	}

	// "Anything else"
	if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		// Close the colgroup, then let the new mode look at the token.
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}
	return true; // Dropped.
}
3215
/**
 * Helper for inTableBodyMode: close the open table section, if any.
 *
 * @return bool False when none of 'tbody', 'thead' or 'tfoot' is in table
 *   scope (nothing is changed); true after the stack has been cleared to
 *   the table body context, the section has been popped and the mode has
 *   been switched to inTableMode
 */
private function endSection() {
	$sectionOpen = false;
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
		if ( $this->stack->inTableScope( $sectionName ) ) {
			$sectionOpen = true;
			break;
		}
	}
	if ( !$sectionOpen ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Handle one token while the parser is in the "in table body" insertion
 * mode.
 *
 * Tokens that are not handled explicitly are passed to inTableMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with here, otherwise the
 *   result of inTableMode()
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Becomes true for tokens that implicitly end the current section and
	// then have to be processed again.
	$closesSection = false;

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'tr':
				$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inRowMode' );
				return true;

			case 'th':
			case 'td':
				// Act as if a <tr> start tag had been seen, then process
				// the cell again.
				$this->inTableBodyMode( 'tag', 'tr', [] );
				$this->insertToken( $token, $value, $attribs, $selfClose );
				return true;

			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$closesSection = true;
				break;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'table':
				$closesSection = true;
				break;

			case 'tbody':
			case 'tfoot':
			case 'thead':
				if ( $this->stack->inTableScope( $value ) ) {
					$this->endSection();
				}
				return true;

			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'td':
			case 'th':
			case 'tr':
				return true; // These end tags are dropped.
		}
	}

	if ( $closesSection ) {
		if ( $this->endSection() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	// "Anything else" uses the in-table rules.
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
3282
/**
 * Helper for inRowMode: close the open table row, if there is one.
 *
 * @return bool False when no 'tr' is in table scope (nothing is changed);
 *   true after the stack has been cleared to the table row context, the
 *   row has been popped and the mode has been switched to inTableBodyMode
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Handle one token while the parser is in the "in row" insertion mode.
 *
 * Tokens that are not handled explicitly are passed to inTableMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with here, otherwise the
 *   result of inTableMode()
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Becomes true for tokens that implicitly end the current row and
	// then have to be processed again.
	$closesRow = false;

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'th':
			case 'td':
				$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCellMode' );
				$this->afe->insertMarker();
				return true;

			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				$closesRow = true;
				break;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'tr':
				$this->endRow();
				return true;

			case 'table':
				$closesRow = true;
				break;

			case 'tbody':
			case 'tfoot':
			case 'thead':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // No such section open: drop the tag.
				}
				$closesRow = true;
				break;

			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'td':
			case 'th':
				return true; // These end tags are dropped.
		}
	}

	if ( $closesRow ) {
		if ( $this->endRow() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	// "Anything else" uses the in-table rules.
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
3348
/**
 * Helper for inCellMode: close the open table cell, if there is one.
 *
 * 'td' is tried before 'th'; the first one found in table scope is closed
 * by feeding the matching end tag to inCellMode().
 *
 * @return bool True when a cell end tag was issued, false when neither
 *   'td' nor 'th' is in table scope
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Handle one token while the parser is in the "in cell" insertion mode.
 *
 * Tokens that are not handled explicitly are passed to inBodyMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with here, otherwise the
 *   result of inBodyMode()
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// Close the cell first; reprocess only if one was open.
				if ( $this->endCell() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'td':
			case 'th':
			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				if ( $this->stack->inTableScope( $value ) ) {
					$isCellTag = ( $value === 'td' || $value === 'th' );
					$this->stack->generateImpliedEndTags();
					// A cell end tag pops to that cell; any other tag
					// pops to whichever cell element is open.
					$this->stack->popTag(
						$isCellTag ? $value : BalanceSets::$tableCellSet
					);
					$this->afe->clearToMarker();
					$this->switchMode( 'inRowMode' );
					if ( !$isCellTag ) {
						// The table-level end tag still has to be
						// handled by the new mode.
						$this->insertToken( $token, $value, $attribs, $selfClose );
					}
				}
				return true;

			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
				return true; // These end tags are dropped.
		}
	}

	// "Anything else" uses the in-body rules.
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
3414
/**
 * Handle one token while the parser is in the "in select" insertion mode.
 *
 * Tags that are not listed below are dropped.
 *
 * @param string $token Token type
 * @param string $value Tag name, text or comment content, depending on $token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with (or dropped) here,
 *   otherwise the result of the handler the token was forwarded to
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
			// OMITTED: <html>
			case 'option':
			case 'optgroup':
				// An open option is closed by either tag; an open
				// optgroup only by another optgroup.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				if (
					$value === 'optgroup' &&
					$this->stack->currentNode->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'select':
				// A nested <select> start tag acts like </select>.
				$this->inSelectMode( 'endtag', $value );
				return true;

			case 'input':
			case 'keygen':
			case 'textarea':
				if ( $this->stack->inSelectScope( 'select' ) ) {
					// Close the select, then process the tag again.
					$this->inSelectMode( 'endtag', 'select' );
					return $this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true; // Dropped (fragment case).

			case 'script':
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'optgroup':
				// An option sitting directly on top of an optgroup is
				// closed together with that optgroup.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$depth = $this->stack->length();
					if (
						$depth >= 2 &&
						$this->stack->node( $depth - 2 )->isHtmlNamed( 'optgroup' )
					) {
						$this->stack->pop();
					}
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				return true;

			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				return true;

			case 'select':
				if ( $this->stack->inSelectScope( $value ) ) {
					$this->stack->popTag( $value );
					$this->resetInsertionMode();
				}
				// Otherwise nothing to close (fragment case).
				return true;

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	}

	// Anything else is dropped.
	return true;
}
3490
/**
 * Handle one token while the parser is in the "in select in table"
 * insertion mode.
 *
 * Table-structure start tags, and table-structure end tags whose element
 * is in table scope, close the select and are then processed again.
 * Everything else is handled by inSelectMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when an end tag was dropped, otherwise the result of
 *   the handler the token was forwarded to
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' || $token === 'endtag' ) {
		switch ( $value ) {
			case 'caption':
			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
			case 'td':
			case 'th':
				if ( $token === 'endtag' && !$this->stack->inTableScope( $value ) ) {
					return true; // No such element open: drop the end tag.
				}
				// Act as if </select> had been seen, then reprocess.
				$this->inSelectInTableMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
	}

	// "Anything else" uses the in-select rules.
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
3515
/**
 * Handle one token while the parser is in the "in template" insertion
 * mode.
 *
 * Start tags pick the insertion mode that fits them and are processed
 * again in that mode. The only end tag acted upon is </template>.
 *
 * @param string $token Token type: 'text', 'comment', 'eof', 'tag' or
 *   'endtag'; anything else trips an assertion
 * @param string $value Tag name, text or comment content, depending on $token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded untouched
 * @return mixed True when the token was dealt with (or dropped) here,
 *   otherwise the result of the handler the token was forwarded to
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			$this->stopParsing();
			return true;
		}
		// A template is still open: close it, drop its insertion mode
		// and let the recomputed mode see the EOF again.
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfClose );
		return true;
	}

	if ( $token === 'tag' ) {
		// Mode used for start tags that are not listed below.
		$targetMode = 'inBodyMode';
		switch ( $value ) {
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			// OMITTED: <script>
			case 'style':
			case 'template':
			// OMITTED: <title>
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );

			case 'caption':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$targetMode = 'inTableMode';
				break;

			case 'col':
				$targetMode = 'inColumnGroupMode';
				break;

			case 'tr':
				$targetMode = 'inTableBodyMode';
				break;

			case 'td':
			case 'th':
				$targetMode = 'inRowMode';
				break;
		}
		return $this->switchModeAndReprocess(
			$targetMode, $token, $value, $attribs, $selfClose
		);
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
			default:
				return true; // Every other end tag is dropped.
		}
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3582 }