Use {{int:}} on MediaWiki:Blockedtext and MediaWiki:Autoblockedtext
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26
27 namespace MediaWiki\Tidy;
28
29 use ExplodeIterator;
30 use IteratorAggregate;
31 use ReverseArrayIterator;
32 use Sanitizer;
33 use Wikimedia\Assert\Assert;
34 use Wikimedia\Assert\ParameterAssertionException;
35
36 // A note for future librarization[1] -- this file is a good candidate
37 // for splitting into an independent library, except that it is currently
38 // highly optimized for MediaWiki use. It only implements the portions
39 // of the HTML5 tree builder used by tags supported by MediaWiki, and
40 // does not contain a true tokenizer pass, instead relying on
41 // comment stripping, attribute normalization, and escaping done by
42 // the MediaWiki Sanitizer. It also deliberately avoids building
43 // a true DOM in memory, instead serializing elements to an output string
44 // as soon as possible (usually as soon as the tag is closed) to reduce
45 // its memory footprint.
46
47 // We've been gradually lifting some of these restrictions to handle
48 // non-sanitized output generated by extensions, but we shortcut the tokenizer
49 // for speed (primarily by splitting on `<`) and so rely on syntactic
50 // well-formedness.
51
52 // On the other hand, I've been pretty careful to note with comments in the
53 // code the places where this implementation omits features of the spec or
54 // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
55 // implement the missing pieces and make this a standalone PHP HTML5 parser.
56 // In order to do so, some sort of MediaWiki-specific API will need
57 // to be added to (a) allow the Balancer to bypass the tokenizer,
58 // and (b) support on-the-fly flattening instead of DOM node creation.
59
60 // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
61
62 /**
63 * Utility constants and sets for the HTML5 tree building algorithm.
64 * Sets are associative arrays indexed first by namespace and then by
65 * lower-cased tag name.
66 *
67 * @ingroup Parser
68 * @since 1.27
69 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/**
	 * HTML tags listed as unsupported.
	 * @var array
	 */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'frame' => true,
			'plaintext' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'script' => true,
			'title' => true
		]
	];

	/**
	 * HTML elements which are serialized in self-closed form and must
	 * not have children (see BalanceElement::__toString()).
	 * @var array
	 */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true,
			'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true,
			'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true,
			'img' => true, 'input' => true,
			'keygen' => true, 'link' => true,
			'meta' => true, 'param' => true,
			'source' => true, 'track' => true,
			'wbr' => true
		]
	];

	/**
	 * HTML elements for which a leading linefeed is doubled when
	 * flattening in tidy-compatible mode (see BalanceElement::flatten()).
	 * @var array
	 */
	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true,
			'textarea' => true,
			'listing' => true,
		]
	];

	/**
	 * The HTML heading elements, h1 through h6.
	 * @var array
	 */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true
		]
	];

	/**
	 * The "special" elements, across the HTML, SVG and MathML namespaces.
	 * @var array
	 */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true,
			'area' => true, 'article' => true,
			'aside' => true, 'base' => true,
			'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true,
			'br' => true, 'button' => true,
			'caption' => true, 'center' => true,
			'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true,
			'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true,
			'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true,
			'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true,
			'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true,
			'head' => true, 'header' => true,
			'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true,
			'img' => true, 'input' => true,
			'li' => true, 'link' => true,
			'listing' => true, 'main' => true,
			'marquee' => true, 'menu' => true,
			'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true,
			'noscript' => true, 'object' => true,
			'ol' => true, 'p' => true,
			'param' => true, 'plaintext' => true,
			'pre' => true, 'script' => true,
			'section' => true, 'select' => true,
			'source' => true, 'style' => true,
			'summary' => true, 'table' => true,
			'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true,
			'tfoot' => true, 'th' => true,
			'thead' => true, 'title' => true,
			'tr' => true, 'track' => true,
			'ul' => true, 'wbr' => true,
			'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true,
			'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/**
	 * The HTML elements address, div and p.
	 * @var array
	 */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/**
	 * The table element together with its section and row elements.
	 * BalanceStack consults this set when deciding whether to foster
	 * parent inserted content.
	 * @var array
	 */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/**
	 * Elements closed by BalanceStack::generateImpliedEndTags() in its
	 * default (non-thorough) mode.
	 * @var array
	 */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true,
			'li' => true, 'menuitem' => true,
			'optgroup' => true, 'option' => true,
			'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true,
			'rtc' => true
		]
	];

	/**
	 * Elements closed by BalanceStack::generateImpliedEndTags() when
	 * asked to generate end tags thoroughly.
	 * @var array
	 */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true,
			'dd' => true, 'dt' => true,
			'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true,
			'rb' => true, 'rp' => true,
			'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true,
			'tfoot' => true, 'th' => true,
			'thead' => true, 'tr' => true
		]
	];

	/**
	 * The table cell elements, td and th.
	 * @var array
	 */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/**
	 * Elements which delimit a "table context".
	 * @var array
	 */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Elements which delimit a "table body context".
	 * @var array
	 */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Elements which delimit a "table row context".
	 * @var array
	 */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Form-associated elements.
	 * @var array
	 * @see https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	 */
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true,
			'input' => true, 'keygen' => true,
			'object' => true, 'output' => true,
			'select' => true, 'textarea' => true,
			'img' => true
		]
	];

	/**
	 * Boundary elements for the plain "in scope" test; this is also the
	 * base from which the list item and button scope sets are derived.
	 * @var array
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true,
			'html' => true, 'marquee' => true,
			'object' => true, 'table' => true,
			'td' => true, 'template' => true,
			'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true,
			'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/**
	 * Lazily-built cache for inListItemScopeSet().
	 * @var array|null
	 */
	private static $inListItemScopeSet = null;

	/**
	 * Return the boundary set for the "in list item scope" test: a copy
	 * of $inScopeSet with the HTML ol and ul elements added. The set is
	 * built on first use and cached.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$derived = self::$inScopeSet;
		$derived[self::HTML_NAMESPACE]['ol'] = true;
		$derived[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $derived;
		return self::$inListItemScopeSet;
	}

	/**
	 * Lazily-built cache for inButtonScopeSet().
	 * @var array|null
	 */
	private static $inButtonScopeSet = null;

	/**
	 * Return the boundary set for the "in button scope" test: a copy of
	 * $inScopeSet with the HTML button element added. The set is built
	 * on first use and cached.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$derived = self::$inScopeSet;
		$derived[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $derived;
		return self::$inButtonScopeSet;
	}

	/**
	 * Boundary elements for the "in table scope" test.
	 * @var array
	 */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/**
	 * The elements which do *not* end a "select scope" search; any
	 * element outside this set acts as a boundary
	 * (see BalanceStack::inSelectScope()).
	 * @var array
	 */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true
		]
	];

	/**
	 * MathML text integration points.
	 * @var array
	 */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true,
			'mn' => true, 'ms' => true,
			'mtext' => true
		]
	];

	/**
	 * SVG elements which are HTML integration points.
	 * @var array
	 */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		]
	];

	/**
	 * For tidy compatibility: elements directly inside which text gets
	 * wrapped in a paragraph.
	 * @var array
	 */
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	/**
	 * For tidy compatibility: elements treated as inline, which may be
	 * inserted without closing an open paragraph wrapper.
	 * @var array
	 */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true,
			'acronym' => true, 'applet' => true,
			'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true,
			'br' => true, 'button' => true,
			'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true,
			'font' => true, 'i' => true,
			'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true,
			'label' => true, 'legend' => true,
			'map' => true, 'object' => true,
			'param' => true, 'q' => true,
			'rb' => true, 'rbc' => true,
			'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true,
			's' => true, 'samp' => true,
			'select' => true, 'small' => true,
			'span' => true, 'strike' => true,
			'strong' => true, 'sub' => true,
			'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true,
			'var' => true,
		],
	];
}
300
301 /**
302 * A BalanceElement is a simplified version of a DOM Node. The main
303 * difference is that we only keep BalanceElements around for nodes
304 * currently on the BalanceStack of open elements. As soon as an
305 * element is closed, with some minor exceptions relating to the
306 * tree builder "adoption agency algorithm", the element and all its
307 * children are serialized to a string using the flatten() method.
308 * This keeps our memory usage low.
309 *
310 * @ingroup Parser
311 * @since 1.27
312 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Parent of this element, or the string "flat" if this element has
	 * already been flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var BalanceElement[]|string[] $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 * @var BalanceElement|null $nextAFE
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 * @var BalanceElement|null $prevAFE
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 * @var BalanceElement|null $nextNoah
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes. The new
	 * element has no parent and no children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element, leaving the child
	 * without a parent.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		if ( $this->parent === 'flat' ) {
			// The message is built only on failure: interpolating $this
			// serializes this element and all of its children.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 * If $b is an element which already has a parent, it is first
	 * removed from that parent.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
		} else {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				$b->parent->removeChild( $b );
			}
			array_splice( $this->children, $idx, 0, [ $b ] );
			$b->parent = $this;
		}
	}

	/**
	 * Append $elt to the end of the list of children.
	 * If $elt is an element which already has a parent, it is first
	 * removed from that parent.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this, leaving $elt
	 * with no children.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * When $config['tidyCompat'] is set, 'mw:p-wrap' elements are renamed
	 * to 'p' (and dropped entirely if they contain only whitespace),
	 * attribute-less blank 'tr' and 'li' elements get the class
	 * 'mw-empty-elt', and a leading linefeed in pre/textarea/listing
	 * content is doubled.
	 *
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 * @return string
	 *
	 * @see __toString()
	 */
	public function flatten( array $config ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		$tidyCompat = $config['tidyCompat'];
		if ( $tidyCompat ) {
			// Flatten any remaining element children, and note whether
			// this element contains anything other than whitespace.
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $config );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				// $blank is left untouched here, so a whitespace-only
				// paragraph wrapper serializes to the empty string below.
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Blank elements other than p-wrap are still emitted.
				$blank = false;
			} elseif (
				$this->isA( BalanceSets::$extraLinefeedSet ) &&
				count( $this->children ) > 0 &&
				substr( $this->children[0], 0, 1 ) === "\n"
			) {
				// Double the linefeed after pre/listing/textarea
				// according to the (old) HTML5 fragment serialization
				// algorithm (see https://github.com/whatwg/html/issues/944)
				// to ensure this will round-trip.
				array_unshift( $this->children, "\n" );
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; // for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * Attribute values are encoded with Sanitizer::encodeAttribute();
	 * attribute names are emitted as-is. Elements in the empty element
	 * set are emitted in self-closed form and must have no children.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	// Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			// assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification. Besides the members of the
	 * HTML integration point set, this covers MathML annotation-xml
	 * elements whose encoding attribute is (case-insensitively)
	 * 'text/html' or 'application/xhtml+xml'.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm. The key is the
	 * serialization of the namespace, local name and key-sorted
	 * attributes; it is computed on first use and then cached, and
	 * $this->attribs itself is left in its original order.
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
642
643 /**
644 * The "stack of open elements" as defined in the HTML5 tree builder
645 * spec. This contains methods to ensure that content (start tags, text)
646 * are inserted at the correct place in the output string, and to
647 * flatten BalanceElements as they are closed to avoid holding onto
648 * a complete DOM tree for the document in memory.
649 *
650 * The stack defines a PHP iterator to traverse it in "reverse order",
651 * that is, the most-recently-added element is visited first in a
652 * foreach loop.
653 *
654 * @ingroup Parser
655 * @since 1.27
656 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
657 */
658 class BalanceStack implements IteratorAggregate {
659 /**
660 * Backing storage for the stack.
661 * @var BalanceElement[] $elements
662 */
663 private $elements = [];
664 /**
665 * Foster parent mode determines how nodes are inserted into the
666 * stack.
667 * @var bool $fosterParentMode
668 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
669 */
670 public $fosterParentMode = false;
671 /**
672 * Configuration options governing flattening.
673 * @var array $config
674 * @see Balancer::__construct()
675 */
676 private $config;
677 /**
678 * Reference to the current element
679 */
680 public $currentNode;
681
/**
 * Set up a new BalanceStack whose only entry is a BalanceElement
 * standing in for the root &lt;html&gt; node; that element also becomes
 * the current node.
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
	$this->config = $config;
	// The root <html> element always sits at the bottom of the stack.
	$this->elements[] = new BalanceElement(
		BalanceSets::HTML_NAMESPACE, 'html', []
	);
	$this->currentNode = $this->elements[0];
}
696
/**
 * Produce the tree builder's output: the concatenated serialization
 * of every child of the root &lt;html&gt; node. Children which are still
 * elements are flattened on the way.
 * @return string
 */
public function getOutput() {
	// The enclosing '<html>....</html>' is deliberately left out.
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( is_string( $child ) ) {
			$pieces[] = $child;
		} else {
			$pieces[] = $child->flatten( $this->config );
		}
	}
	return implode( '', $pieces );
}
711
/**
 * Insert a comment at the appropriate place for inserting a node.
 * The value is wrapped in comment delimiters and handed to insertText().
 * @param string $value Content of the comment.
 * @return null The result of insertText(), which produces no value.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	// A comment is handled like a text node, but is flagged so that
	// tidy-style p-wrapping is not applied to it.
	$markup = '<!--' . $value . '-->';
	return $this->insertText( $markup, true );
}
722
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * The text is foster parented when foster parent mode is on and the
 * current node is a table, table section or row. Otherwise, in
 * tidy-compatible mode, non-comment text directly inside a member of
 * the p-wrap set first opens an 'mw:p-wrap' element and is then
 * inserted into it. In all remaining cases the text is appended to
 * the current node.
 *
 * @param string $value
 * @param bool $isComment
 * @return null No value is produced on any path.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	$node = $this->currentNode;
	if (
		$this->fosterParentMode &&
		$node->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
		return;
	}
	$needsPWrap = $this->config['tidyCompat'] && !$isComment &&
		$node->isA( BalanceSets::$tidyPWrapSet );
	if ( !$needsPWrap ) {
		$node->appendChild( $value );
		return;
	}
	// Open the paragraph wrapper, then retry with it as current node.
	$this->insertHTMLElement( 'mw:p-wrap', [] );
	return $this->insertText( $value );
}
746
/**
 * Create a BalanceElement in the given namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, passed through to
 *   the BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$elt = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $elt );
}
761
/**
 * Create an element in the HTML namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, passed through to
 *   insertForeignElement().
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNs = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNs, $tag, $attribs );
}
775
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an 'mw:p-wrap' element and $elt is not in the
 * tidy inline set, the wrapper is popped first. The element is foster
 * parented when foster parent mode is on and the current node is a
 * table, table section or row; otherwise it is appended to the
 * current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The inserted element (as returned by
 *   fosterParent() when foster parenting applies).
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The failure messages interpolate $elt, which serializes the element
	// (including encoding all of its attributes). Build them only when an
	// invariant is actually violated, rather than on every insertion.
	if ( $elt->parent === null ) {
		Assert::invariant( false, "$elt must be in tree" );
	}
	if ( $elt->parent === 'flat' ) {
		Assert::invariant( false, "$elt must not have been previous flattened" );
	}
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
805
/**
 * Check whether $tag is in scope on the stack, using the default
 * scope boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
815
/**
 * Check whether $tag is in button scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
825
/**
 * Check whether $tag is in list item scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
835
/**
 * Check whether $tag is in table scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
845
/**
 * Check whether $tag is in select scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	// inSpecificScope() can't express this test: here every element
	// *outside* the given set ends the search, so walk the stack by hand.
	$transparent = BalanceSets::$inInvertedSelectScopeSet;
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			return true;
		} elseif ( !$node->isA( $transparent ) ) {
			break;
		}
	}
	return false;
}
865
/**
 * Check whether $tag is in the scope delimited by $set: walk the
 * stack from the current node, succeeding at the first element
 * matching $tag and failing at the first element matching $set.
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	$found = false;
	foreach ( $this as $node ) {
		$found = $node->isA( $tag );
		// Stop at a match, or at a scope boundary (without a match).
		if ( $found || $node->isA( $set ) ) {
			break;
		}
	}
	return $found;
}
884
/**
 * Generate implied end tags: keep popping the current node for as
 * long as it belongs to the implied end tag set.
 * @param string|null $butnot Name of an HTML element at which to stop
 *   even though it is in the set, or null for no such exception.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$closable = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$closable = BalanceSets::$impliedEndTagsSet;
	}
	while ( $this->currentNode ) {
		$node = $this->currentNode;
		$excluded = $butnot !== null && $node->isHtmlNamed( $butnot );
		if ( $excluded || !$node->isA( $closable ) ) {
			return;
		}
		$this->pop();
	}
}
905
/**
 * Return the adjusted current node: $fragmentContext when it is
 * truthy and only one element is on the stack, otherwise the
 * current node.
 * @param mixed $fragmentContext The fragment context, returned as-is.
 *   NOTE(review): previously documented as a string; presumably a
 *   BalanceElement or null -- confirm against callers.
 * @return mixed $fragmentContext or the current node
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
915
/**
 * Return an iterator over this stack which starts at the current
 * node and finishes at the root node.
 * @return \Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
924
/**
 * Fetch the BalanceElement stored at stack position $idx; position 0
 * is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$elt = $this->elements[ $idx ];
	return $elt;
}
934
/**
 * Put $elt in the BalanceStack at position $idx, in place of the
 * element currently there. Neither element may have been flattened.
 * If $idx is the top of the stack, $elt becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$outgoing = $this->elements[$idx];
	Assert::precondition(
		$outgoing->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $top === $idx ) {
		$this->currentNode = $elt;
	}
}
954
955 /**
956 * Return the position of the given BalanceElement, set, or
957 * HTML tag name string in the BalanceStack.
958 * @param BalanceElement|array|string $tag
959 * @return int
960 */
public function indexOf( $tag ) {
	// Scan from the current node (end of the array) toward the root,
	// so the match nearest the current node wins.
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	// Not found. Note that this is -1, not false.
	return -1;
}
969
970 /**
971 * Return the number of elements currently in the BalanceStack.
972 * @return int
973 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
977
978 /**
979 * Remove the current node from the BalanceStack, flattening it
980 * in the process.
981 */
public function pop() {
	$removed = array_pop( $this->elements );
	// Re-point the cached current node at the new top of the stack,
	// or null when the stack is now empty.
	$remaining = count( $this->elements );
	$this->currentNode = $remaining > 0 ? $this->elements[$remaining - 1] : null;
	// mw:p-wrap elements are deliberately left unflattened here.
	// NOTE(review): presumably so a p-wrapper can still be reused or
	// merged after it leaves the stack -- confirm against fosterParent().
	if ( $removed->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$removed->flatten( $this->config );
}
993
994 /**
995 * Remove all nodes up to and including position $idx from the
996 * BalanceStack, flattening them in the process.
997 * @param int $idx
998 */
public function popTo( $idx ) {
	// Number of entries at positions >= $idx; nothing happens when
	// $idx is at or beyond the end of the stack.
	$toPop = count( $this->elements ) - $idx;
	while ( $toPop > 0 ) {
		$this->pop();
		$toPop--;
	}
}
1004
1005 /**
1006 * Pop elements off the stack up to and including the first
1007 * element with the specified HTML tagname (or matching the given
1008 * set).
1009 * @param BalanceElement|array|string $tag
1010 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Test before popping: the matching element itself is popped
		// too, and then we stop.
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			return;
		}
	}
}
1020
1021 /**
1022 * Pop elements off the stack *not including* the first element
1023 * in the specified set.
1024 * @param BalanceElement|array|string $set
1025 */
public function clearToContext( $set ) {
	// The bottom entry (the <html> element) is never popped, hence the
	// lower bound of 1 rather than 0.
	$remaining = count( $this->elements );
	while ( $remaining > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$remaining--;
	}
}
1035
1036 /**
1037 * Remove the given $elt from the BalanceStack, optionally
1038 * flattening it in the process.
1039 * @param BalanceElement $elt The element to remove.
1040 * @param bool $flatten Whether to flatten the removed element.
1041 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	// Neither $elt nor its parent may have been flattened already.
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);

	// Locate $elt by identity and cut it out of the stack.
	$pos = array_search( $elt, $this->elements, true );
	Assert::parameter( $pos !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $pos, 1 );

	// If $elt was the top entry, the entry beneath it becomes current.
	$newLength = count( $this->elements );
	if ( $pos === $newLength ) {
		$this->currentNode = $this->elements[$pos - 1];
	}

	if ( $flatten ) {
		// Serialize $elt into its parent right away. Without this it
		// would still be serialized when the parent is, but its tree of
		// objects would stay in memory until then.
		$elt->flatten( $this->config );
	}

	$stillPresent = array_search( $elt, $this->elements, true ) !== false;
	Assert::postcondition(
		!$stillPresent,
		'$elt should no longer be in open elements stack'
	);
}
1071
1072 /**
1073 * Find $a in the BalanceStack and insert $b after it.
1074 * @param BalanceElement $a
1075 * @param BalanceElement $b
1076 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() reports "not found" as -1 (never false), so the check
	// must be on the sign of the index. Without it a missing $a would
	// silently insert $b at position 0 of the stack.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		// $a is the current node, so $b becomes the new current node.
		$this->elements[] = $b;
		$this->currentNode = $b;
	} else {
		// Splice $b into the slot immediately after $a.
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1087
1088 // Fostering and adoption.
1089
1090 /**
1091 * Foster parent the given $elt in the stack of open elements.
1092 * @param BalanceElement|string $elt
1093 * @return BalanceElement|string
1094 *
1095 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
1096 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	$before = null;

	// Pick the foster parent: the nearest <template> if it sits above
	// the nearest <table> (or there is no table), else the parent of the
	// nearest <table> (inserting just before that table), else the
	// bottom-most stack entry.
	if ( $templateIdx >= 0 && ( $tableIdx < 0 || $templateIdx > $tableIdx ) ) {
		$parent = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$before = $this->elements[$tableIdx];
		$parent = $before->parent;
		// Scripts never run here, so a table without a parent is a bug.
		Assert::invariant(
			$parent !== null, "All tables should have parents"
		);
	} else {
		$parent = $this->elements[0]; // the `html` element.
	}

	if ( $this->config['tidyCompat'] ) {
		if ( is_string( $elt ) ) {
			// Fostered text: wrap it in a p-wrapper when the parent is
			// one of the elements whose children get <p>-wrapped.
			if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: if the sibling it would land after is
			// already a p-wrapper, hand that one back instead.
			if ( $before ) {
				$pos = array_search( $before, $parent->children, true );
			} else {
				$pos = count( $parent->children );
			}
			$previous = $pos > 0 ? $parent->children[$pos - 1] : '';
			if (
				$previous instanceof BalanceElement &&
				$previous->isHtmlNamed( 'mw:p-wrap' )
			) {
				return $previous;
			}
		}
	}

	// Default placement: before the table when there is one, otherwise
	// as the last child of the chosen parent.
	if ( $before ) {
		$parent->insertBefore( $before, $elt );
	} else {
		$parent->appendChild( $elt );
	}
	return $elt;
}
1148
1149 /**
1150 * Run the "adoption agency algorithm" (AAA) for the given subject
1151 * tag name.
1152 * @param string $tag The subject tag name.
1153 * @param BalanceActiveFormattingElements $afe The current
1154 * active formatting elements list.
1155 * @return bool True if the adoption agency algorithm "did something", false
1156 * if more processing is required by the caller.
1157 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
1158 */
public function adoptionAgency( $tag, $afe ) {
	// Shortcut: if the current node is a $tag element that is not
	// tracked in the active formatting elements list, closing it is
	// nothing more than a pop.
	$current = $this->currentNode;
	if ( $current->isHtmlNamed( $tag ) && !$afe->isInList( $current ) ) {
		$this->pop();
		return true; // fully handled
	}

	// Outer loop: the algorithm stops after at most eight passes.
	for ( $pass = 0; $pass < 8; $pass++ ) {
		// The formatting element is the most recent $tag entry in the
		// active formatting elements list, looking back no further than
		// the last marker.
		$formatting = $afe->findElementByTag( $tag );
		if ( !$formatting ) {
			// Nothing to adopt: the caller must fall back to its
			// "any other end tag" handling.
			return false;
		}

		// Listed but no longer on the open elements stack: parse error.
		// Drop the list entry and stop.
		$formattingIdx = $this->indexOf( $formatting );
		if ( $formattingIdx < 0 ) {
			$afe->remove( $formatting );
			return true; // fully handled
		}

		// On the stack but not in scope: parse error, ignore the token.
		if ( !$this->inScope( $formatting ) ) {
			return true;
		}

		// The furthest block is the first element *below* the
		// formatting element on the stack that belongs to the special
		// category. There may be none.
		$furthestBlock = null;
		$furthestBlockIdx = -1;
		for ( $pos = $formattingIdx + 1, $depth = $this->length(); $pos < $depth; $pos++ ) {
			$candidate = $this->node( $pos );
			if ( $candidate->isA( BalanceSets::$specialSet ) ) {
				$furthestBlock = $candidate;
				$furthestBlockIdx = $pos;
				break;
			}
		}

		// Without a furthest block the rest is skipped: pop everything
		// from the current node up to and including the formatting
		// element, and forget the formatting element.
		if ( !$furthestBlock ) {
			$this->popTag( $formatting );
			$afe->remove( $formatting );
			return true;
		}

		// The common ancestor sits directly above the formatting
		// element on the stack.
		$commonAncestor = $this->node( $formattingIdx - 1 );

		// The bookmark is a placeholder entry that remembers where the
		// formatting element sat in the active formatting elements list.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $formatting, $bookmark );

		// Both the walking position and "last node" start at the
		// furthest block.
		$lastNode = $furthestBlock;
		$walkIdx = $furthestBlockIdx;
		$innerCount = 0;

		// Inner loop: walk up the stack from the furthest block toward
		// the formatting element.
		while ( true ) {
			$innerCount++;

			// Step to the entry directly above. When the previous entry
			// was removed from the stack this index still lands on the
			// entry that used to be above it.
			$walkIdx--;
			$node = $this->node( $walkIdx );

			// Reaching the formatting element ends the inner loop.
			if ( $node === $formatting ) {
				break;
			}

			// After the third iteration, entries that are still in the
			// active formatting elements list are dropped from it.
			$listed = $afe->isInList( $node );
			if ( $listed && $innerCount > 3 ) {
				$afe->remove( $node );
				$listed = false;
			}

			// Entries not (or no longer) in the list leave the stack and
			// the walk continues. They are not flattened, because parts
			// of them are about to be moved elsewhere.
			if ( !$listed ) {
				$this->removeElement( $node, false );
				continue;
			}

			// Clone the entry (same namespace, name and attributes) and
			// substitute the clone for it in both the active formatting
			// elements list and the stack; carry on with the clone.
			$clone = new BalanceElement(
				$node->namespaceURI, $node->localName, $node->attribs );
			$afe->replace( $node, $clone );
			$this->replaceAt( $walkIdx, $clone );
			$node = $clone;

			// While last node is still the furthest block, the bookmark
			// moves to just after the clone in the list.
			if ( $lastNode === $furthestBlock ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $clone, $bookmark );
			}

			// Re-parent last node under the clone, which then becomes
			// the new last node.
			$node->appendChild( $lastNode );
			$lastNode = $node;
		}

		// Attach whatever last node ended up being: foster-parent it
		// when foster parenting is on and the common ancestor is a
		// table/section/row element, otherwise append it to the common
		// ancestor.
		if (
			$this->fosterParentMode &&
			$commonAncestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $lastNode );
		} else {
			$commonAncestor->appendChild( $lastNode );
		}

		// Build a fresh copy of the formatting element, move all of the
		// furthest block's children into it, and make it the furthest
		// block's child.
		$replacement = new BalanceElement(
			$formatting->namespaceURI, $formatting->localName, $formatting->attribs );
		$replacement->adoptChildren( $furthestBlock );
		$furthestBlock->appendChild( $replacement );

		// In the active formatting elements list the old formatting
		// element goes away and the copy takes the bookmark's place.
		$afe->remove( $formatting );
		$afe->replace( $bookmark, $replacement );

		// On the stack the old formatting element goes away and the copy
		// is inserted directly below the furthest block.
		$this->removeElement( $formatting );
		$this->insertAfter( $furthestBlock, $replacement );
	}

	return true;
}
1358
1359 /**
1360 * Return the contents of the open elements stack as a string for
1361 * debugging.
1362 * @return string
1363 */
public function __toString() {
	// Local names of the open elements, root first, separated by spaces.
	$names = array_map( function ( $elt ) {
		return $elt->localName;
	}, $this->elements );
	return implode( ' ', $names );
}
1371 }
1372
1373 /**
1374 * A pseudo-element used as a marker in the list of active formatting elements
1375 *
1376 * @ingroup Parser
1377 * @since 1.27
1378 */
class BalanceMarker {
	/**
	 * The entry following this marker in the list of active formatting
	 * elements, or null when the marker is the last entry.
	 */
	public $nextAFE = null;

	/**
	 * The entry preceding this marker in the list of active formatting
	 * elements, or null when the marker is the first entry.
	 */
	public $prevAFE = null;
}
1383
1384 /**
1385 * The list of active formatting elements, which is used to handle
1386 * mis-nested formatting element tags in the HTML5 tree builder
1387 * specification.
1388 *
1389 * @ingroup Parser
1390 * @since 1.27
1391 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
1392 */
class BalanceActiveFormattingElements {
	/**
	 * The last (most recent) entry in the list.
	 * @var BalanceElement|BalanceMarker|null
	 */
	private $tail;

	/**
	 * The first (least recent) entry in the list.
	 * @var BalanceElement|BalanceMarker|null
	 */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 *
	 * @var array[]
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry from its neighbours and drop this object's own
	 * references to the list.
	 * NOTE(review): presumably done to break the reference cycles between
	 * list entries so they can be freed promptly -- confirm.
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// BalanceMarker declares no $nextNoah property; assigning one
			// would create a dynamic property on the marker.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the list and open a new, empty Noah table for
	 * the list segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if the current segment already holds
		// three elements with the same Noah key, remove the earliest of
		// them before adding this one.
		$noahKey = $elt->getNoahKey();
		$table = $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( isset( $table[$noahKey] ) ) {
			$earliest = $table[$noahKey];
			$count = 0;
			for ( $member = $earliest; $member; $member = $member->nextNoah ) {
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $earliest );
			}
		}
		$this->addToNoahList( $elt );

		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag
	 * @return BalanceElement|null
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element is in the list when it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 *
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the bucket for its Noah key in the Noah table of the
	 * current (last) list segment, creating the bucket if needed.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from the bucket for its Noah key in the Noah table of
	 * the current (last) list segment. Does nothing if $elt is not found
	 * there.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment, so there is
			// nothing to unlink.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt heads the bucket: promote its successor, or drop the
			// bucket when it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging.
	 * Also reports inconsistent back-links and a wrong tail pointer.
	 * @return string
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1749
1750 /**
1751 * An implementation of the tree building portion of the HTML5 parsing
1752 * spec.
1753 *
1754 * This is used to balance and tidy output so that the result can
1755 * always be cleanly serialized/deserialized by an HTML5 parser. It
1756 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1757 * a number of constraints which are not enforced by the HTML5 parsing
1758 * process. But the result will be free of gross errors: misnested or
1759 * unclosed tags, for example, and will be unchanged by spec-compliant
1760 * parsing followed by serialization.
1761 *
1762 * The tree building stage is structured as a state machine.
1763 * When comparing the implementation to
1764 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1765 * note that each state is implemented as a function with a
1766 * name ending in `Mode` (because the HTML spec refers to them
1767 * as insertion modes). The current insertion mode is held by
1768 * the $parseMode property.
1769 *
1770 * The following simplifications have been made:
1771 * - We handle body content only (ie, we start `in body`.)
1772 * - The document is never in "quirks mode".
1773 * - All occurrences of < and > have been entity escaped, so we
1774 * can parse tags by simply splitting on those two characters.
1775 * (This also simplifies the handling of < inside <textarea>.)
1776 * The character < must not appear inside comments.
1777 * Similarly, all attributes have been "cleaned" and are double-quoted
1778 * and escaped.
1779 * - All null characters are assumed to have been removed.
1780 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1781 * <frame>, <plaintext>, <xmp>, <iframe>,
1782 * <noembed>, <noscript>, <script>, <title>. As a result,
1783 * further simplifications can be made:
1784 * - `frameset-ok` is not tracked.
1785 * - `head element pointer` is not tracked (but presumed non-null)
1786 * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
1787 * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
1788 *
1789 * We generally mark places where we omit cases from the spec due to
1790 * disallowed elements with a comment: `// OMITTED: <element-name>`.
1791 *
1792 * The HTML spec keeps a flag during the parsing process to track
1793 * whether or not a "parse error" has been encountered. We don't
1794 * bother to track that flag, we just implement the error-handling
1795 * process as specified.
1796 *
1797 * @ingroup Parser
1798 * @since 1.27
1799 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1800 */
1801 class Balancer {
1802 private $parseMode;
1803 /** @var \Iterator */
1804 private $bitsIterator;
1805 private $allowedHtmlElements;
1806 /** @var BalanceActiveFormattingElements */
1807 private $afe;
1808 /** @var BalanceStack */
1809 private $stack;
1810 private $strict;
1811 private $allowComments;
1812 private $config;
1813
1814 private $textIntegrationMode;
1815 private $pendingTableText;
1816 private $originalInsertionMode;
1817 private $fragmentContext;
1818 private $formElementPointer;
1819 private $ignoreLinefeed;
1820 private $inRCDATA;
1821 private $inRAWTEXT;
1822
1823 /** @var callable|null */
1824 private $processingCallback;
1825 /** @var array */
1826 private $processingArgs;
1827
1828 /**
1829 * Valid HTML5 comments.
1830 * Regex borrowed from Tim Starling's "remex-html" project.
1831 */
1832 const VALID_COMMENT_REGEX = "~ !--
1833 ( # 1. Comment match detector
1834 > | -> | # Invalid short close
1835 ( # 2. Comment contents
1836 (?:
1837 (?! --> )
1838 (?! --!> )
1839 (?! --! \z )
1840 (?! -- \z )
1841 (?! - \z )
1842 .
1843 )*+
1844 )
1845 ( # 3. Comment close
1846 --> | # Normal close
1847 --!> | # Comment end bang
1848 ( # 4. Indicate matches requiring EOF
1849 --! | # EOF in comment end bang state
1850 -- | # EOF in comment end state
1851 - | # EOF in comment end dash state
1852 (?#nothing) # EOF in comment state
1853 )
1854 )
1855 )
1856 ([^<]*) \z # 5. Non-tag text after the comment
1857 ~xs";
1858
1859 /**
1860 * Create a new Balancer.
1861 * @param array $config Balancer configuration. Includes:
1862 * 'strict' : boolean, defaults to false.
1863 * When true, enforces syntactic constraints on input:
1864 * all non-tag '<' must be escaped, all attributes must be
1865 * separated by a single space and double-quoted. This is
1866 * consistent with the output of the Sanitizer.
1867 * 'allowedHtmlElements' : array, defaults to null.
1868 * When present, the keys of this associative array give
1869 * the acceptable HTML tag names. When not present, no
1870 * tag sanitization is done.
1871 * 'tidyCompat' : boolean, defaults to false.
1872 * When true, the serialization algorithm is tweaked to
1873 * provide historical compatibility with the old "tidy"
1874 * program: <p>-wrapping is done to the children of
1875 * <body> and <blockquote> elements, and empty elements
1876 * are removed. The <pre>/<listing>/<textarea> serialization
1877 * is also tweaked to allow lossless round trips.
1878 * (See: https://github.com/whatwg/html/issues/944)
1879 * 'allowComments': boolean, defaults to true.
1880 * When true, allows HTML comments in the input.
1881 * The Sanitizer generally strips all comments, so if you
1882 * are running on sanitized output you can set this to
1883 * false to get a bit more performance.
1884 */
public function __construct( array $config = [] ) {
	// Fill in the default for every option the caller did not supply.
	$config += [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->config = $config;
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check: none of the allowed tag names may be one of the
		// elements this balancer does not support. Both arrays are keyed
		// by tag name, so only the keys are intersected.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1916
/**
 * Return a balanced HTML string for the HTML fragment given by $text,
 * subject to the caveats listed in the class description. The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * The input is split on '<'; the leading piece is treated as text and
 * every following piece is handed to advance() as a potential tag. All
 * per-run parser state is (re)initialised on entry, and the larger
 * structures are released again before returning.
 *
 * @param string $text The markup to be balanced
 * @param callable|null $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Fresh parser state for this run.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack( $this->config );
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	// Tokenizer-emulation flags all start out cleared.
	$this->inRAWTEXT = false;
	$this->inRCDATA = false;
	$this->ignoreLinefeed = false;
	$this->textIntegrationMode = false;

	// The stack is constructed with an <html> element already on it.
	// Parse the input as a fragment whose context element is <body>.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// The form element pointer is the nearest <form> found by walking
	// up from the context element, if there is one.
	$this->formElementPointer = null;
	$ancestor = $this->fragmentContext;
	while ( $ancestor != null ) {
		if ( $ancestor->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $ancestor;
			break;
		}
		$ancestor = $ancestor->parent;
	}

	// Everything before the first '<' is plain text, never a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Each remaining piece begins just after a '<'.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;
	return $balanced;
}
1973
/**
 * Pass a token to the tree builder. The $token will be one of the
 * strings "tag", "endtag", "text", "comment" or "eof".
 *
 * This is the spec's "tree construction dispatcher": it decides whether
 * the token is handled by the rules for foreign content or by the
 * handler method for the current insertion mode.
 *
 * @param string $token Token type
 * @param string|null $value Tag name, text or comment contents; null
 *   for "eof"
 * @param array|null $attribs Attributes, for "tag" tokens
 * @param bool $selfClose Whether a "tag" token was written as
 *   self-closing
 * @return bool|null False if the tag is one the balancer does not
 *   support; otherwise whatever the selected handler returns.
 */
private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' || $token === 'endtag' ) {
		// Validate tags against $unsupportedSet. As described in
		// "simplifications" above, these tags are not supported in
		// the balancer; in strict mode they are an error.
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		// Never inject the empty string as a text token.
		return true;
	}

	// Support pre/listing/textarea: a linefeed directly after the start
	// tag is dropped. The flag is consumed by whichever token comes next.
	if ( $this->ignoreLinefeed ) {
		$this->ignoreLinefeed = false;
		if ( $token === 'text' && $value[0] === "\n" ) {
			if ( $value === "\n" ) {
				// Nothing would be left over.
				return true;
			}
			$value = substr( $value, 1 );
		}
	}

	// Some hoops we have to jump through
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	// Work out whether the foreign-content rules apply.
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useForeignRules = false;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and start tags other than <mglyph>/<malignmark>, are
		// processed as HTML here.
		$handledAsHtml = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
		$useForeignRules = !$handledAsHtml;
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useForeignRules = false;
	} elseif (
		$adjusted->isHtmlIntegrationPoint() &&
		( $token === 'tag' || $token === 'text' )
	) {
		$useForeignRules = false;
	} else {
		$useForeignRules = true;
	}

	if ( !$useForeignRules ) {
		// The current insertion mode is stored as a method name.
		$handler = $this->parseMode;
		return $this->$handler( $token, $value, $attribs, $selfClose );
	}
	return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
}
2046
/**
 * Process a token using the rules for parsing tokens in foreign
 * (non-HTML) content.
 *
 * @param string $token Token type ("text", "comment", "tag", "endtag")
 * @param string|null $value Tag name, text or comment contents
 * @param array|null $attribs Attributes, for "tag" tokens
 * @param bool $selfClose Whether a "tag" token was written as
 *   self-closing
 * @return bool|null True once the token is consumed, or the result of
 *   reprocessing it as HTML. Any other token type, or an end-tag scan
 *   that finishes without a match, falls off the end and yields null.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'font':
			// Only a <font> carrying a color, face or size attribute
			// is one of the HTML "breakout" tags. A <font> without
			// them is an ordinary foreign element.
			if ( !isset( $attribs['color'] )
				&& !isset( $attribs['face'] )
				&& !isset( $attribs['size'] )
			) {
				break;
			}
			// otherwise, fall through
		case 'b':
		case 'big':
		case 'blockquote':
		case 'body':
		case 'br':
		case 'center':
		case 'code':
		case 'dd':
		case 'div':
		case 'dl':
		case 'dt':
		case 'em':
		case 'embed':
		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
		case 'head':
		case 'hr':
		case 'i':
		case 'img':
		case 'li':
		case 'listing':
		case 'menu':
		case 'meta':
		case 'nobr':
		case 'ol':
		case 'p':
		case 'pre':
		case 'ruby':
		case 's':
		case 'small':
		case 'span':
		case 'strong':
		case 'strike':
		case 'sub':
		case 'sup':
		case 'table':
		case 'tt':
		case 'u':
		case 'ul':
		case 'var':
			if ( $this->fragmentContext ) {
				// Fragment case: treat as "any other start tag".
				break;
			}
			// Pop out of the foreign content until the current node
			// is an integration point or an HTML element, then
			// reprocess the token from there.
			while ( true ) {
				$this->stack->pop();
				$node = $this->stack->currentNode;
				if (
					$node->isMathmlTextIntegrationPoint() ||
					$node->isHtmlIntegrationPoint() ||
					$node->isHtml()
				) {
					break;
				}
			}
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// "Any other start tag": insert it in the namespace of the
		// adjusted current node.
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfClose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		// Scan the stack from the current node downwards for an
		// element with a matching local name.
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfClose );
			} elseif ( $i === 0 ) {
				// Reached the bottom of the stack; ignore the tag.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
2151
/**
 * Grab the next "token" from $bitsIterator. This is either a open/close
 * tag or text or a comment, depending on whether the Sanitizer approves.
 *
 * Each piece taken from the iterator is the input following one '<'.
 * The piece is classified as a comment, a usable tag followed by text,
 * or plain text, and the resulting tokens are passed to insertToken().
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	// Handle comments. These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		// Only narrow $goodTag here. A tag already rejected by the
		// RCDATA/RAWTEXT checks must stay rejected even when its name
		// is whitelisted, otherwise markup inside such content would
		// be tokenized as tags.
		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The callback receives the attribute string by reference
			// and may rewrite it.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		// The tree builder may still reject the tag, in which case
		// it is serialized as text below.
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodTag ) {
		// Escape any stray '>' in the text following the tag.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		// Raw text is passed through without any escaping.
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2248
/**
 * Change the current insertion mode.
 *
 * @param string $mode Name of the handler method for the new mode; it
 *   must end in "Mode"
 * @return string The insertion mode that was in effect before the switch
 */
private function switchMode( $mode ) {
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previousMode = $this->parseMode;
	$this->parseMode = $mode;
	return $previousMode;
}
2257
/**
 * Change the insertion mode, then run the given token through the tree
 * builder again so that it is handled under the new mode.
 *
 * @param string $mode Name of the new insertion mode
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Token attributes
 * @param bool $selfClose Whether the token is self-closing
 * @return bool|null The result of insertToken() for the reprocessed token
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$handled = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $handled;
}
2262
/**
 * Choose the insertion mode that matches the current contents of the
 * stack of open elements ("reset the insertion mode appropriately").
 *
 * The stack is walked from the current node down to the root. The first
 * HTML element that determines a mode wins. At the root the fragment
 * context element, if any, stands in for the root node, and the mode
 * falls back to "in body".
 */
private function resetInsertionMode() {
	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			// Bottom of the stack: judge the fragment context (when
			// there is one) in place of the root element.
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			switch ( $node->localName ) {
			case 'select':
				// Look at the ancestors of this <select>, nearest
				// first: a <table> reached before any <template>
				// means the select is inside a table. Ancestors sit
				// below the select on the stack, i.e. at indices
				// smaller than $i; index 0 (the root) is skipped.
				// NOTE(review): assumes BalanceStack::node() takes an
				// index counted from the bottom of the stack, as the
				// foreach keys are -- confirm against BalanceStack.
				for ( $j = $i - 1; $j > 0; $j-- ) {
					$ancestor = $this->stack->node( $j );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$this->switchMode( 'inSelectInTableMode' );
						return;
					}
				}
				$this->switchMode( 'inSelectMode' );
				return;
			case 'tr':
				$this->switchMode( 'inRowMode' );
				return;
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$this->switchMode( 'inTableBodyMode' );
				return;
			case 'caption':
				$this->switchMode( 'inCaptionMode' );
				return;
			case 'colgroup':
				$this->switchMode( 'inColumnGroupMode' );
				return;
			case 'table':
				$this->switchMode( 'inTableMode' );
				return;
			case 'template':
				// Use the most recently pushed template insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			case 'body':
				$this->switchMode( 'inBodyMode' );
				return;
			// OMITTED: <frameset>
			// OMITTED: <html>
			// OMITTED: <head>
			default:
				if ( !$last ) {
					// OMITTED: <head>
					if ( $node->isA( BalanceSets::$tableCellSet ) ) {
						$this->switchMode( 'inCellMode' );
						return;
					}
				}
			}
		}
		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2332
/**
 * Finish the parse by unwinding the stack of open elements.
 *
 * Of the spec's "stop parsing" steps only "pop all the nodes off the
 * stack of open elements" applies here, with one difference: the
 * top-most <html> element is left on the stack.
 */
private function stopParsing() {
	// Drop the list of active formatting elements before popping.
	// Otherwise the element objects stay live during serialization,
	// potentially using O(N^2) memory. This is safe because popping the
	// stack never results in reconstructing the active formatting
	// elements.
	$this->afe = null;

	// Number of elements to leave on the stack (just the <html> root).
	$keep = 1;
	$this->stack->popTo( $keep );
}
2345
/**
 * Insert an HTML element whose contents are raw text, and switch to
 * text mode until its end tag is seen.
 *
 * @param string $value Tag name of the raw text element
 * @param array|null $attribs Attributes of the element
 * @return bool Always true
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// advance() treats everything up to the matching end tag as text
	// while this is set.
	$this->inRAWTEXT = $value;
	// Remember the mode to return to once the element is closed.
	$previousMode = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $previousMode;
	return true;
}
2352
/**
 * Handler for the "text" insertion mode, used while inside a raw text
 * element.
 *
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Token attributes
 * @param bool $selfClose Whether the token is self-closing
 * @return bool|null True, or for "eof" the result of reprocessing the
 *   token in the original insertion mode
 */
private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $token ) {
	case 'text':
		$this->stack->insertText( $value );
		return true;

	case 'endtag':
		// Close the raw text element and resume the previous mode.
		$this->stack->pop();
		$this->switchMode( $this->originalInsertionMode );
		return true;

	case 'eof':
		// Input ended inside the element: close it, then let the
		// original mode deal with the end of file.
		$this->stack->pop();
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfClose
		);

	default:
		// Any other token is accepted and ignored.
		return true;
	}
}
2369
/**
 * Handler for the supported subset of the "in head" insertion mode.
 *
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Token attributes
 * @param bool $selfClose Whether the token is self-closing
 * @return bool|null True when the token was consumed here; otherwise
 *   the result of reprocessing it after a synthetic </head>
 */
private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; only a non-whitespace
		// remainder is handed on to the code at the bottom.
		$leading = [];
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'template':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			$this->switchMode( 'inTemplateMode' );
			$this->templateInsertionModes[] = $this->parseMode;
			return true;

		// OMITTED: <title>
		// OMITTED: <noscript>
		// OMITTED: <script>
		case 'noframes':
		case 'style':
			return $this->parseRawText( $value, $attribs );

		// OMITTED: <html>
		// OMITTED: <head>
		case 'meta':
			// OMITTED: in a full HTML parser, this might change the encoding.
			// falls through
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
			// These elements have no content: insert, then close at once.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		// OMITTED: </head>
		// OMITTED: </body>
		// OMITTED: </html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) >= 0 ) {
				$this->stack->generateImpliedEndTags( null, true /* thorough */ );
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				array_pop( $this->templateInsertionModes );
				$this->resetInsertionMode();
			}
			// With no open <template> the token is simply ignored.
			return true;
		}
		if ( $value !== 'br' ) {
			// ignore any other end tag
			return true;
		}
		// </br> is handled at the bottom of the function.
	}

	// If not handled above
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	// Then redo this one
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
2439
/**
 * Handler for the "in body" insertion mode.
 *
 * @param string $token Token type ("text", "comment", "eof", "tag" or
 *   "endtag"); anything else fails an invariant assertion
 * @param string|null $value Tag name, text or comment contents
 * @param array|null $attribs Attributes, for "tag" tokens
 * @param bool $selfClose Whether a "tag" token was written as
 *   self-closing
 * @return bool|null True once the token has been consumed or ignored,
 *   or the result of the handler the token was delegated to
 */
private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		// An open <template> gets to handle the end of file first.
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
		}
		$this->stopParsing();
		return true;
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		// OMITTED: <script>
		// OMITTED: <title>
		// OMITTED: <body>
		// OMITTED: <frameset>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		case 'style':
		case 'template':
			// Shared with the "in head" rules.
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		// Block-level containers: close an open <p> first.
		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'menu':
			if ( $this->stack->inButtonScope( "p" ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// A <menuitem> left open is closed by the new <menu>.
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// Headings do not nest: a heading that is the current
			// node is closed first.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'pre':
		case 'listing':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			// A newline directly after the start tag is dropped.
			$this->ignoreLinefeed = true;
			// OMITTED: frameset_ok
			return true;

		case 'form':
			if (
				$this->formElementPointer &&
				$this->stack->indexOf( 'template' ) < 0
			) {
				return true; // in a form, not in a template.
			}
			if ( $this->stack->inButtonScope( "p" ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$formElement = $this->stack->insertHTMLElement( $value, $attribs );
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$this->formElementPointer = $formElement;
			}
			return true;

		case 'li':
			// OMITTED: frameset_ok
			// Close an open <li>, unless a special element other
			// than address/div/p is met first.
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'li' ) ) {
					$this->inBodyMode( 'endtag', 'li' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'dd':
		case 'dt':
			// OMITTED: frameset_ok
			// Same idea as <li>, for an open <dd> or <dt>.
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'dd' ) ) {
					$this->inBodyMode( 'endtag', 'dd' );
					break;
				} elseif ( $node->isHtmlNamed( 'dt' ) ) {
					$this->inBodyMode( 'endtag', 'dt' );
					break;
				} elseif (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		// OMITTED: <plaintext>

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Buttons do not nest: close the open one, then
				// process this tag again.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$openAnchor = $this->afe->findElementByTag( 'a' );
			if ( $openAnchor ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $openAnchor ) ) {
					$this->afe->remove( $openAnchor );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $openAnchor, false );
				}
			}
			// Falls through
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			// Formatting elements are also recorded in the list of
			// active formatting elements.
			$this->afe->reconstruct( $this->stack );
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			return true;

		case 'table':
			// The document is never in "quirks mode"; see simplifications
			// above.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			// OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
		case 'input':
			// Elements without content: insert, then close at once.
			// OMITTED: frameset_ok
			// (hence the "type" attribute of <input> does not need
			// to be examined)
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'hr':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			// warts! <image> is handled as though it were <img>.
			return $this->inBodyMode( $token, 'img', $attribs, $selfClose );

		case 'textarea':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->ignoreLinefeed = true;
			$this->inRCDATA = $value; // emulate rcdata tokenizer mode
			// OMITTED: frameset_ok
			return true;

		// OMITTED: <xmp>
		// OMITTED: <iframe>
		// OMITTED: <noembed>
		// OMITTED: <noscript>

		case 'select':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			// Which select mode applies depends on whether a
			// table-related mode is active.
			switch ( $this->parseMode ) {
			case 'inTableMode':
			case 'inCaptionMode':
			case 'inTableBodyMode':
			case 'inRowMode':
			case 'inCellMode':
				$selectMode = 'inSelectInTableMode';
				break;
			default:
				$selectMode = 'inSelectMode';
			}
			$this->switchMode( $selectMode );
			return true;

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'menuitem':
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags( 'rtc' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
		case 'svg':
			// We skip the spec's "adjust MathML/SVG attributes" and
			// "adjust foreign attributes" steps, since the browser will
			// do this later when it parses the output and it doesn't affect
			// balancing.
			$foreignNamespace = ( $value === 'math' ) ?
				BalanceSets::MATHML_NAMESPACE : BalanceSets::SVG_NAMESPACE;
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertForeignElement(
				$foreignNamespace, $value, $attribs
			);
			if ( $selfClose ) {
				// emit explicit </math> or </svg> tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		// </body>,</html> are unsupported.

		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Only act when there is a matching open tag in scope.
			if ( $this->stack->inScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
			}
			return true;

		case 'form':
			if ( $this->stack->indexOf( 'template' ) >= 0 ) {
				// Inside a <template> the form element pointer is
				// not involved.
				if ( $this->stack->inScope( 'form' ) ) {
					$this->stack->generateImpliedEndTags();
					$this->stack->popTag( 'form' );
				}
				return true;
			}
			$openform = $this->formElementPointer;
			$this->formElementPointer = null;
			if ( $openform && $this->stack->inScope( $openform ) ) {
				$this->stack->generateImpliedEndTags();
				// Don't flatten yet if we're removing a <form> element
				// out-of-order. (eg. `<form><div></form>`)
				$flatten = ( $this->stack->currentNode === $openform );
				$this->stack->removeElement( $openform, $flatten );
			}
			return true;

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No <p> to close: open an empty one, then process
				// this end tag again.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
			if ( $this->stack->inListItemScope( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
			}
			return true;

		case 'dd':
		case 'dt':
			if ( $this->stack->inScope( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
			}
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			// Any heading end tag closes whichever heading is open.
			if ( $this->stack->inScope( BalanceSets::$headingSet ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$headingSet );
			}
			return true;

		case 'sarcasm':
			// Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; // If we did something, we're done.
			}
			break; // Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( $this->stack->inScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
			}
			return true;

		case 'br':
			// Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here: scan from the current node down
		// for a matching element, giving up at the first special one.
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $i ); // including $i
				return true;
			}
			if ( $node->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
2972
/**
 * Process one token in the "in table" insertion mode.
 *
 * Table-structure tags are handled directly.  Everything that is not
 * recognized here is forwarded to inBodyMode() with foster parenting
 * enabled on the stack for the duration of that call.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or
 *   'comment'.
 * @param string $value Tag name for 'tag'/'endtag' tokens, otherwise
 *   the text or comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged
 *   whenever the token is forwarded or reprocessed.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of the handler it was forwarded to.
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Start buffering character data in inTableTextMode, and
			// remember the current mode so that it can be resumed.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode',
				$token, $value, $attribs, $selfClose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			// Clear the stack back to a table context first, exactly as
			// the colgroup and tbody/tfoot/thead cases do.  The HTML5
			// "in table" rules require this before the caption is
			// inserted; without it an element left open by foster
			// parenting would become the caption's parent.
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Act as if a <colgroup> start tag had been seen, then
			// reprocess the <col> in the resulting mode.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Act as if a <tbody> start tag had been seen, then
			// reprocess the current token.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// A nested <table> start tag closes the open table; the
			// token is then reprocessed.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfClose );

		case 'style':
		// OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'input':
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			// Only <input type="hidden"> is inserted in place; it is
			// popped again immediately.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'form':
			if (
				$this->formElementPointer ||
				$this->stack->indexOf( 'template' ) >= 0
			) {
				return true; // ignore this token
			}
			$this->formElementPointer =
				$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->popTag( $this->formElementPointer );
			return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		// OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case: handle the token as in body
	// mode, with foster parenting switched on for that call only.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
3082
/**
 * Process one token in the "in table text" insertion mode.
 *
 * Text tokens are accumulated in $this->pendingTableText.  The first
 * non-text token flushes that buffer and is then reprocessed in the
 * mode saved in $this->originalInsertionMode.
 *
 * @param string $token Token type.
 * @param string $value Text contents, or the value of the non-text
 *   token that ends the run.
 * @param array|null $attribs Attributes, passed through on reprocess.
 * @param bool $selfClose Self-closing flag, passed through on reprocess.
 * @return bool True for a buffered text token; otherwise the result
 *   of reprocessing the token in the saved mode.
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		// Keep collecting until something other than text arrives.
		$this->pendingTableText .= $value;
		return true;
	}

	// Take ownership of the buffered text and reset the buffer.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';

	$whitespaceOnly = !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
	if ( $whitespaceOnly ) {
		// Nothing but whitespace: insert it where we are.
		$this->stack->insertText( $buffered );
	} else {
		// Mirrors the "anything else" handling of inTableMode.
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
3104
/**
 * Helper for inCaptionMode: close the open caption, if there is one.
 *
 * @return bool False when no 'caption' is in table scope (nothing is
 *   changed); true once the caption has been popped and the parser
 *   has been switched to inTableMode.
 */
private function endCaption() {
	$captionOpen = $this->stack->inTableScope( 'caption' );
	if ( $captionOpen ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
3116
/**
 * Process one token in the "in caption" insertion mode.
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of inBodyMode().
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'caption':
			$this->endCaption();
			return true;

		case 'table':
			// Close the caption first; only then is the end tag
			// reprocessed.
			$captionClosed = $this->endCaption();
			if ( $captionClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;

		// OMITTED: <html>
		case 'tr':
		case 'thead':
		case 'th':
		case 'tfoot':
		case 'td':
		case 'tbody':
		case 'colgroup':
		case 'col':
		case 'body':
			// These end tags are dropped.
			return true;
		}
		// Any other end tag: "anything else" below.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'tr':
		case 'thead':
		case 'th':
		case 'tfoot':
		case 'td':
		case 'tbody':
		case 'colgroup':
		case 'col':
		case 'caption':
			// Table-structure start tags close the caption and are
			// then reprocessed, provided a caption was actually open.
			$captionClosed = $this->endCaption();
			if ( $captionClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// Any other start tag: "anything else" below.
	}

	// Anything else is handled as in body mode.
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
3163
/**
 * Process one token in the "in column group" insertion mode.
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of the handler it was forwarded to.
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'col':
			return true; // Ignore the token.
		case 'colgroup':
			// Only meaningful while the colgroup is the current node;
			// otherwise the token is ignored.
			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				$this->stack->pop();
				$this->switchMode( 'inTableMode' );
			}
			return true;
		}
		// Other end tags: "anything else" below.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		// OMITTED: <html>
		case 'col':
			// Insert the element and pop it again right away.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		// Other start tags: "anything else" below.
	} elseif ( $token === 'text' ) {
		// Leading whitespace is inserted here; whatever follows it is
		// treated as "anything else".
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$leading = $matches[0];
			$this->stack->insertText( $leading );
			$value = substr( $value, strlen( $leading ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // Nothing left to handle.
		}
	}

	// Anything else: close the colgroup (if it is the current node)
	// and reprocess the token; otherwise ignore the token.
	$colgroupIsCurrent = $this->stack->currentNode->isHtmlNamed( 'colgroup' );
	if ( $colgroupIsCurrent ) {
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}
	return true;
}
3214
/**
 * Helper for inTableBodyMode: close the open table section, if any.
 *
 * @return bool False when none of 'tbody', 'thead' or 'tfoot' is in
 *   table scope (nothing is changed); true once the section has been
 *   popped and the parser has been switched to inTableMode.
 */
private function endSection() {
	$sectionOpen = false;
	// Checked in this order; the first hit ends the search.
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
		if ( $this->stack->inTableScope( $sectionName ) ) {
			$sectionOpen = true;
			break;
		}
	}
	if ( !$sectionOpen ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Process one token in the "in table body" insertion mode.
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of inTableMode().
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'thead':
		case 'tfoot':
		case 'tbody':
			// Ignored unless the named section is in table scope.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;

		case 'table':
			// Close the section, then reprocess the end tag.
			$sectionClosed = $this->endSection();
			if ( $sectionClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;

		// OMITTED: <body>
		// OMITTED: <html>
		case 'tr':
		case 'th':
		case 'td':
		case 'colgroup':
		case 'col':
		case 'caption':
			return true; // Ignore the token.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'td':
		case 'th':
			// Act as if a <tr> start tag had been seen first, then
			// reprocess the cell.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfClose );
			return true;

		case 'tr':
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;

		case 'thead':
		case 'tfoot':
		case 'tbody':
		case 'colgroup':
		case 'col':
		case 'caption':
			// Close the section, then reprocess the start tag.
			$sectionClosed = $this->endSection();
			if ( $sectionClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}

	// Anything else is handled as in table mode.
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
3281
/**
 * Helper for inRowMode: close the open table row, if there is one.
 *
 * @return bool False when no 'tr' is in table scope (nothing is
 *   changed); true once the row has been popped and the parser has
 *   been switched to inTableBodyMode.
 */
private function endRow() {
	$rowOpen = $this->stack->inTableScope( 'tr' );
	if ( $rowOpen ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Process one token in the "in row" insertion mode.
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of inTableMode().
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			// Close the row, then reprocess the end tag.
			$rowClosed = $this->endRow();
			if ( $rowClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;

		case 'tr':
			$this->endRow();
			return true;

		case 'thead':
		case 'tfoot':
		case 'tbody':
			// The named section must be in table scope, and the row
			// must close, before the end tag is reprocessed.
			if ( $this->stack->inTableScope( $value ) && $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;

		// OMITTED: <body>
		// OMITTED: <html>
		case 'th':
		case 'td':
		case 'colgroup':
		case 'col':
		case 'caption':
			return true; // Ignore the token.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'td':
		case 'th':
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;

		case 'tr':
		case 'thead':
		case 'tfoot':
		case 'tbody':
		case 'colgroup':
		case 'col':
		case 'caption':
			// Close the row, then reprocess the start tag.
			$rowClosed = $this->endRow();
			if ( $rowClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}

	// Anything else is handled as in table mode.
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
3347
/**
 * Helper for inCellMode: close the open table cell, if there is one.
 *
 * 'td' is tried before 'th'; the first one found in table scope is
 * closed by feeding the matching end tag to inCellMode().
 *
 * @return bool True when a cell end tag was processed, false when
 *   neither 'td' nor 'th' is in table scope.
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Process one token in the "in cell" insertion mode.
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of inBodyMode().
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'th':
		case 'td':
			// Ignored unless that cell is in table scope.
			if ( !$this->stack->inTableScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			$this->switchMode( 'inRowMode' );
			return true;

		// OMITTED: <body>
		// OMITTED: <html>
		case 'colgroup':
		case 'col':
		case 'caption':
			return true;

		case 'tr':
		case 'thead':
		case 'tfoot':
		case 'tbody':
		case 'table':
			// Ignored unless the named element is in table scope;
			// otherwise close the cell and reprocess the end tag.
			if ( !$this->stack->inTableScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$tableCellSet );
			$this->afe->clearToMarker();
			$this->switchMode( 'inRowMode' );
			$this->insertToken( $token, $value, $attribs, $selfClose );
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'tr':
		case 'thead':
		case 'th':
		case 'tfoot':
		case 'td':
		case 'tbody':
		case 'colgroup':
		case 'col':
		case 'caption':
			// Close the cell, then reprocess the start tag.
			$cellClosed = $this->endCell();
			if ( $cellClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}

	// Anything else is handled as in body mode.
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
3413
/**
 * Process one token in the "in select" insertion mode.
 *
 * Tokens that match none of the cases below are ignored.
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of the handler it was forwarded to.
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'select':
			// Ignored in the fragment case (no select in select scope).
			if ( $this->stack->inSelectScope( $value ) ) {
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
			}
			return true;

		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			return true;

		case 'optgroup':
			// An option sitting directly on top of an optgroup is
			// closed first ...
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$depth = $this->stack->length();
				if (
					$depth >= 2 &&
					$this->stack->node( $depth - 2 )->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
			}
			// ... and then the optgroup itself, if it is now current.
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'template':
		case 'script':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'textarea':
		case 'keygen':
		case 'input':
			// Ignored in the fragment case; otherwise the select is
			// closed and the token reprocessed.
			if ( $this->stack->inSelectScope( 'select' ) ) {
				$this->inSelectMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;

		case 'select':
			// A nested <select> start tag is treated as an end tag.
			$this->inSelectMode( 'endtag', $value );
			return true;

		case 'optgroup':
			// Close an open option, then an open optgroup, before
			// inserting the new optgroup.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		// OMITTED: <html>
		case 'option':
			// Close an open option before inserting the new one.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;
		}
	}

	// Anything else: the token is simply dropped.
	return true;
}
3489
/**
 * Process one token in the "in select in table" insertion mode.
 *
 * Table-structure start and end tags close the select and are then
 * reprocessed; everything else is handled by inSelectMode().
 *
 * @param string $token Token type.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when an end tag was ignored here; otherwise the
 *   result of the handler the token was forwarded to.
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' || $token === 'endtag' ) {
		switch ( $value ) {
		case 'th':
		case 'td':
		case 'tr':
		case 'thead':
		case 'tfoot':
		case 'tbody':
		case 'table':
		case 'caption':
			// An end tag only counts when its element is in table
			// scope; otherwise it is ignored.
			if ( $token === 'endtag' && !$this->stack->inTableScope( $value ) ) {
				return true;
			}
			// Close the select, then reprocess the table tag.
			$this->inSelectInTableMode( 'endtag', 'select' );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
	}

	// Anything else is handled as in select mode.
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
3514
/**
 * Process one token in the "in template" insertion mode.
 *
 * @param string $token Token type: 'text', 'comment', 'eof', 'tag' or
 *   'endtag'; any other value trips an invariant assertion.
 * @param string $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes of a start tag, if any.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here;
 *   otherwise the result of the handler it was forwarded to.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'endtag' ) {
		// Only </template> is meaningful here; other end tags are
		// ignored.
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	if ( $token === 'tag' ) {
		// Head-level tags go to inHeadMode; every other start tag
		// selects a mode to switch to before being reprocessed.
		switch ( $value ) {
		// OMITTED: <script>
		// OMITTED: <title>
		case 'template':
		case 'style':
		case 'noframes':
		case 'meta':
		case 'link':
		case 'bgsound':
		case 'basefont':
		case 'base':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'thead':
		case 'tfoot':
		case 'tbody':
		case 'colgroup':
		case 'caption':
			$nextMode = 'inTableMode';
			break;

		case 'col':
			$nextMode = 'inColumnGroupMode';
			break;

		case 'tr':
			$nextMode = 'inTableBodyMode';
			break;

		case 'th':
		case 'td':
			$nextMode = 'inRowMode';
			break;

		default:
			$nextMode = 'inBodyMode';
			break;
		}
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfClose
		);
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) >= 0 ) {
			// A template is still open: close it, restore the
			// insertion mode, and let the new mode see the EOF.
			$this->stack->popTag( 'template' );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			$this->insertToken( $token, $value, $attribs, $selfClose );
		} else {
			$this->stopParsing();
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3581 }