Merge "maintenance: Document secondary purpose of --server"
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26
27 namespace MediaWiki\Tidy;
28
29 use ExplodeIterator;
30 use IteratorAggregate;
31 use ReverseArrayIterator;
32 use Sanitizer;
33 use Wikimedia\Assert\Assert;
34 use Wikimedia\Assert\ParameterAssertionException;
35
36 // A note for future librarization[1] -- this file is a good candidate
37 // for splitting into an independent library, except that it is currently
38 // highly optimized for MediaWiki use. It only implements the portions
39 // of the HTML5 tree builder used by tags supported by MediaWiki, and
40 // does not contain a true tokenizer pass, instead relying on
41 // comment stripping, attribute normalization, and escaping done by
42 // the MediaWiki Sanitizer. It also deliberately avoids building
43 // a true DOM in memory, instead serializing elements to an output string
44 // as soon as possible (usually as soon as the tag is closed) to reduce
45 // its memory footprint.
46
47 // We've been gradually lifting some of these restrictions to handle
48 // non-sanitized output generated by extensions, but we shortcut the tokenizer
49 // for speed (primarily by splitting on `<`) and so rely on syntactic
50 // well-formedness.
51
52 // On the other hand, I've been pretty careful to note with comments in the
53 // code the places where this implementation omits features of the spec or
54 // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
55 // implement the missing pieces and make this a standalone PHP HTML5 parser.
56 // In order to do so, some sort of MediaWiki-specific API will need
57 // to be added to (a) allow the Balancer to bypass the tokenizer,
58 // and (b) support on-the-fly flattening instead of DOM node creation.
59
60 // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
61
/**
 * Constants and tag-name lookup tables shared by the HTML5 tree
 * building code in this file.
 *
 * Every table is an associative array keyed first by namespace URI and
 * then by lower-cased tag name; the stored value is always `true`, so
 * membership is tested with isset() (see BalanceElement::isA()).
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/**
	 * HTML tags listed as unsupported.
	 * @var array
	 */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'frame' => true,
			'plaintext' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'script' => true,
			'title' => true
		]
	];

	/**
	 * Elements that BalanceElement::__toString() serializes as a single
	 * self-closed tag and that must not have children.
	 * @var array
	 */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/**
	 * Elements for which BalanceElement::flatten() doubles a leading
	 * linefeed in tidy-compatible mode.
	 * @var array
	 */
	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true,
			'textarea' => true,
			'listing' => true,
		]
	];

	/**
	 * The HTML heading elements.
	 * @var array
	 */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
			'h6' => true
		]
	];

	/**
	 * Elements in the "special" category, across the HTML, SVG and MathML
	 * namespaces.
	 * @var array
	 */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'li' => true, 'link' => true, 'listing' => true, 'main' => true,
			'marquee' => true, 'menu' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true, 'object' => true,
			'ol' => true, 'p' => true, 'param' => true, 'plaintext' => true,
			'pre' => true, 'script' => true, 'section' => true, 'select' => true,
			'source' => true, 'style' => true, 'summary' => true, 'table' => true,
			'tbody' => true, 'td' => true, 'template' => true, 'textarea' => true,
			'tfoot' => true, 'th' => true, 'thead' => true, 'title' => true,
			'tr' => true, 'track' => true, 'ul' => true, 'wbr' => true,
			'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/**
	 * The HTML tags address, div and p.
	 * @var array
	 */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/**
	 * Table, table section and table row elements. When the current node
	 * is one of these and foster parent mode is on, BalanceStack foster
	 * parents inserted text and elements.
	 * @var array
	 */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/**
	 * Elements popped by BalanceStack::generateImpliedEndTags() in its
	 * default (non-thorough) mode.
	 * @var array
	 */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'menuitem' => true,
			'optgroup' => true, 'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/**
	 * Elements popped by BalanceStack::generateImpliedEndTags() when it is
	 * asked to work thoroughly.
	 * @var array
	 */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true,
			'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	/**
	 * The table cell elements.
	 * @var array
	 */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/**
	 * Elements delimiting a table context.
	 * @var array
	 */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Elements delimiting a table body context.
	 * @var array
	 */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Elements delimiting a table row context.
	 * @var array
	 */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Form-associated elements.
	 * @see https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	 * @var array
	 */
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true,
			'keygen' => true, 'object' => true, 'output' => true,
			'select' => true, 'textarea' => true, 'img' => true
		]
	];

	/**
	 * Boundary elements used by BalanceStack::inScope(); also the base
	 * from which the list item and button scope sets are derived.
	 * @var array
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/**
	 * Lazily built cache for inListItemScopeSet().
	 * @var array|null
	 */
	private static $inListItemScopeSet = null;

	/**
	 * Get the list item scope boundary set: $inScopeSet plus the HTML
	 * `ol` and `ul` elements. Built on first use and cached.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$scope = self::$inScopeSet;
		// Array union appends the new keys after the inherited ones.
		$scope[self::HTML_NAMESPACE] += [ 'ol' => true, 'ul' => true ];
		self::$inListItemScopeSet = $scope;
		return $scope;
	}

	/**
	 * Lazily built cache for inButtonScopeSet().
	 * @var array|null
	 */
	private static $inButtonScopeSet = null;

	/**
	 * Get the button scope boundary set: $inScopeSet plus the HTML
	 * `button` element. Built on first use and cached.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE] += [ 'button' => true ];
		self::$inButtonScopeSet = $scope;
		return $scope;
	}

	/**
	 * Boundary elements used by BalanceStack::inTableScope().
	 * @var array
	 */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/**
	 * The elements which do NOT terminate the search in
	 * BalanceStack::inSelectScope(); any other element does.
	 * @var array
	 */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true
		]
	];

	/**
	 * MathML text integration points, as tested by
	 * BalanceElement::isMathmlTextIntegrationPoint().
	 * @var array
	 */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true
		]
	];

	/**
	 * SVG elements which are HTML integration points, as tested by
	 * BalanceElement::isHtmlIntegrationPoint().
	 * @var array
	 */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	/**
	 * For tidy compatibility: when the current node is one of these,
	 * BalanceStack::insertText() wraps non-comment text in a p-wrap
	 * element.
	 * @var array
	 */
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	/**
	 * For tidy compatibility: elements which may be inserted without
	 * closing an open p-wrap element (see BalanceStack::insertElement()).
	 * @var array
	 */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true,
			'applet' => true, 'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true, 'br' => true,
			'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true, 'label' => true,
			'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true,
			'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true,
			'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true,
			'sub' => true, 'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true, 'var' => true,
			// Those defined in tidy.conf
			'video' => true, 'audio' => true, 'bdi' => true,
			'data' => true, 'time' => true, 'mark' => true,
		],
	];
}
303
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Initialized to the empty string by the constructor. It is not read
	 * by any method of this class; it is declared explicitly so that the
	 * constructor's assignment does not create a dynamic property.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element, or the string "flat" if this element has
	 * already been flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var BalanceElement[]|string[] $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element and clear its parent link.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		// The message is deliberately a constant: interpolating $this here
		// would serialize the whole subtree on every call, even when the
		// precondition holds.
		Assert::precondition(
			$this->parent !== 'flat', "Can't removeChild after flattening."
		);
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 *
	 * If $b is an element which currently has a parent, it is first
	 * detached from that parent.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
			return;
		}
		Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
		if ( $b === $a ) {
			// Inserting a node before itself leaves the tree unchanged.
			return;
		}
		$oldParent = $b->parent;
		if ( $oldParent !== null ) {
			$oldParent->removeChild( $b );
			if ( $oldParent === $this ) {
				// $b was one of our own children, so removing it may have
				// shifted $a; look up the position of $a again.
				$idx = array_search( $a, $this->children, true );
			}
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
		$b->parent = $this;
	}

	/**
	 * Append $elt to the end of the list of children.
	 *
	 * If $elt is an element which currently has a parent, it is first
	 * detached from that parent.
	 *
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			$this->children[] = $elt;
			return;
		}
		// Same check as insertBefore(): without it a flattened element
		// would cause a method call on the string 'flat' below.
		Assert::parameter( $elt->parent !== 'flat', '$elt', "Can't be flat" );
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		$this->children[] = $elt;
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * In tidy-compatible mode (`$config['tidyCompat']`), element children
	 * are flattened first, an `mw:p-wrap` element is renamed to `p` (or
	 * dropped entirely if it contains only whitespace), an attribute-less
	 * blank `tr` or `li` gets the `mw-empty-elt` class, and a leading
	 * linefeed inside pre/textarea/listing is doubled.
	 *
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 * @return string
	 *
	 * @see __toString()
	 */
	public function flatten( array $config ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		$tidyCompat = $config['tidyCompat'];
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					// flatten() also replaces the child by its string form
					// in $this->children.
					$elt = $elt->flatten( $config );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only blank p-wrap elements are dropped from the output.
				$blank = false;
			} elseif (
				$this->isA( BalanceSets::$extraLinefeedSet ) &&
				count( $this->children ) > 0 &&
				substr( $this->children[0], 0, 1 ) === "\n"
			) {
				// Double the linefeed after pre/listing/textarea
				// according to the (old) HTML5 fragment serialization
				// algorithm (see https://github.com/whatwg/html/issues/944)
				// to ensure this will round-trip.
				array_unshift( $this->children, "\n" );
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; // for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * Attribute values are encoded with Sanitizer::encodeAttribute();
	 * attribute names are emitted as-is.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	// Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			// assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		// A MathML annotation-xml element counts when its encoding
		// attribute names an HTML media type (compared case-insensitively).
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm.
	 *
	 * The key is built from the namespace, the local name and the
	 * attributes sorted by name, so attribute order does not affect it.
	 * It is computed once and cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
645
646 /**
647 * The "stack of open elements" as defined in the HTML5 tree builder
648 * spec. This contains methods to ensure that content (start tags, text)
649 * are inserted at the correct place in the output string, and to
650 * flatten BalanceElements as they are closed to avoid holding onto
651 * a complete DOM tree for the document in memory.
652 *
653 * The stack defines a PHP iterator to traverse it in "reverse order",
654 * that is, the most-recently-added element is visited first in a
655 * foreach loop.
656 *
657 * @ingroup Parser
658 * @since 1.27
659 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
660 */
661 class BalanceStack implements IteratorAggregate {
662 /**
663 * Backing storage for the stack.
664 * @var BalanceElement[] $elements
665 */
666 private $elements = [];
667 /**
668 * Foster parent mode determines how nodes are inserted into the
669 * stack.
670 * @var bool $fosterParentMode
671 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
672 */
673 public $fosterParentMode = false;
674 /**
675 * Configuration options governing flattening.
676 * @var array $config
677 * @see Balancer::__construct()
678 */
679 private $config;
680 /**
681 * Reference to the current element
682 */
683 public $currentNode;
684
/**
 * Create a new BalanceStack whose only entry is a BalanceElement
 * representing the root &lt;html&gt; node; that element also becomes
 * the current node.
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
	$this->config = $config;
	// The stack always has a root <html> element at position 0.
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
}
699
/**
 * Build the output of the tree builder: the concatenation of all the
 * children of the root &lt;html&gt; node. Children which are still
 * BalanceElements are flattened along the way.
 * @return string
 */
public function getOutput() {
	// The outer '<html>....</html>' is not part of the output.
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( !is_string( $child ) ) {
			$child = $child->flatten( $this->config );
		}
		$pieces[] = $child;
	}
	return implode( '', $pieces );
}
714
/**
 * Insert a comment at the appropriate place for inserting a node.
 *
 * The value is wrapped in comment delimiters and handed to insertText()
 * flagged as a comment, which exempts it from tidy p-wrapping.
 *
 * @param string $value Content of the comment.
 * @return null Whatever insertText() returns, which carries no value.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	$comment = '<!--' . $value . '-->';
	return $this->insertText( $comment, true );
}
725
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * Three cases, tried in order:
 *  - in foster parent mode with a table/section/row as current node,
 *    the text is handed to fosterParent();
 *  - in tidy-compatible mode, non-comment text whose current node is in
 *    the p-wrap set gets an `mw:p-wrap` element opened around it first;
 *  - otherwise the text is appended to the current node.
 *
 * @param string $value
 * @param bool $isComment True if $value is a serialized comment.
 * @return null No meaningful value is returned.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	$node = $this->currentNode;
	if ( $this->fosterParentMode && $node->isA( BalanceSets::$tableSectionRowSet ) ) {
		$this->fosterParent( $value );
		return;
	}
	$needsWrapper = $this->config['tidyCompat'] && !$isComment &&
		$node->isA( BalanceSets::$tidyPWrapSet );
	if ( $needsWrapper ) {
		// Open the wrapper, then retry against the new current node.
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		return $this->insertText( $value );
	}
	$node->appendChild( $value );
}
749
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in the array form
 *   required by the BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$element = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $element );
}
764
/**
 * Create an element in the HTML namespace, insert it at the appropriate
 * place, and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in the array form
 *   required by the BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$namespace = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $namespace, $tag, $attribs );
}
778
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an `mw:p-wrap` element and $elt is not in the
 * tidy inline set, the wrapper is popped first. The element is then
 * either foster parented (foster parent mode with a table/section/row
 * as current node) or appended to the current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element now on top of the stack. In the
 *   foster parenting case this is whatever fosterParent() returned.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages are constants on purpose: interpolating $elt would
	// serialize the element on every insertion, even when the invariants
	// hold.
	Assert::invariant(
		$elt->parent !== null,
		'Inserted element must be in tree'
	);
	Assert::invariant(
		$elt->parent !== 'flat',
		'Inserted element must not have been previously flattened'
	);
	$this->elements[] = $elt;
	$this->currentNode = $elt;
	return $elt;
}
808
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
818
/**
 * Determine if the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
828
/**
 * Determine if the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
838
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
848
/**
 * Determine if the stack has $tag in select scope.
 *
 * The stack is walked from the current node towards the root. The
 * search succeeds on the first element matching $tag and gives up at
 * the first element which is neither a match nor one of the elements
 * in BalanceSets::$inInvertedSelectScopeSet.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	// inSpecificScope() can't express this, because here the listed
	// tags are the ones which do *not* end the search.
	$transparent = BalanceSets::$inInvertedSelectScopeSet;
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			return true;
		}
		if ( !$node->isA( $transparent ) ) {
			break;
		}
	}
	return false;
}
868
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * The stack is walked from the current node towards the root. The
 * search succeeds on the first element matching $tag and gives up at
 * the first non-matching element which matches $set.
 *
 * @param BalanceElement|array|string $tag What to look for.
 * @param BalanceElement|array|string $set The scope boundary.
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	$found = false;
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			$found = true;
			break;
		}
		if ( $node->isA( $set ) ) {
			break;
		}
	}
	return $found;
}
887
/**
 * Generate implied end tags: pop the current node for as long as it is
 * a member of the implied end tags set and is not the excluded tag.
 * @param string|null $butnot Name of an HTML tag at which to stop, or
 *   null for no exclusion.
 * @param bool $thorough True if we should generate end tags thoroughly,
 *   i.e. use the larger BalanceSets::$thoroughImpliedEndTagsSet.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	$impliedSet = BalanceSets::$impliedEndTagsSet;
	if ( $thorough ) {
		$impliedSet = BalanceSets::$thoroughImpliedEndTagsSet;
	}
	while (
		$this->currentNode &&
		!( $butnot !== null && $this->currentNode->isHtmlNamed( $butnot ) ) &&
		$this->currentNode->isA( $impliedSet )
	) {
		$this->pop();
	}
}
908
/**
 * Return the adjusted current node: $fragmentContext when one is given
 * and only a single element is on the stack, the current node otherwise.
 * @param BalanceElement|null $fragmentContext NOTE(review): presumably a
 *   BalanceElement, since it is returned in place of the current node;
 *   confirm against callers.
 * @return BalanceElement|null Either $fragmentContext or the current node.
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
918
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last. This is what makes a foreach loop
 * over the stack run in "reverse order".
 * @return \Iterator
 */
public function getIterator() {
	$reversed = new ReverseArrayIterator( $this->elements );
	return $reversed;
}
927
/**
 * Return the BalanceElement stored at position $idx of the stack;
 * position 0 holds the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$element = $this->elements[ $idx ];
	return $element;
}
937
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * If that position is the top of the stack, $elt also becomes the
 * current node. Neither the old nor the new element may already have
 * been flattened.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
957
/**
 * Find the position in the BalanceStack of the element closest to
 * the top which matches the given BalanceElement, set, or HTML tag
 * name string.
 * @param BalanceElement|array|string $tag
 * @return int The position of the match, or -1 if nothing matches.
 */
public function indexOf( $tag ) {
	// Scan from the top of the stack (highest index) towards the root.
	$i = count( $this->elements );
	while ( $i-- > 0 ) {
		if ( $this->elements[$i]->isA( $tag ) ) {
			return $i;
		}
	}
	return -1;
}
972
/**
 * Count the elements currently held in the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
980
/**
 * Take the current node off the BalanceStack and flatten it.
 * The element below it (if any) becomes the current node.
 * Elements named `mw:p-wrap` are removed without being flattened.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining > 0 ? $this->elements[$remaining - 1] : null;
	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->config );
}
996
/**
 * Pop (and thereby flatten) every node from the top of the
 * BalanceStack down to and including the one at position $idx.
 * @param int $idx
 */
public function popTo( $idx ) {
	// Each pop() shrinks the stack by one; stop once only the
	// elements below position $idx remain.
	while ( count( $this->elements ) > $idx ) {
		$this->pop();
	}
}
1007
/**
 * Pop elements off the stack until the first element with the
 * specified HTML tag name (or matching the given set) has been
 * popped as well. If nothing matches, the stack is emptied.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Test before popping: pop() moves $this->currentNode.
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			return;
		}
	}
}
1023
/**
 * Pop elements off the stack until the current node is in the
 * specified set. The matching element itself is *not* popped.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Stop when a single element is left: the <html> element at the
	// bottom of the stack is never popped off.
	while ( count( $this->elements ) > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
	}
}
1038
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process.
 *
 * If $elt was the current node, whatever element is on top of the
 * stack afterwards becomes the current node (null if the stack is
 * now empty, mirroring pop()).
 *
 * @param BalanceElement $elt The element to remove. It must be in the
 *   stack, and neither it nor its parent may have been flattened.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	$remaining = count( $this->elements );
	if ( $idx === $remaining ) {
		// We removed the top of the stack, so the current node changes.
		// Guard against the stack having become empty instead of
		// reading the nonexistent offset -1.
		$this->currentNode = $remaining > 0 ?
			$this->elements[$remaining - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->config );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
1074
/**
 * Find $a in the BalanceStack and insert $b after it.
 * If $a is the current node, $b becomes the new current node.
 * @param BalanceElement $a Element to look for; must be in the stack.
 * @param BalanceElement $b Element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() reports "not found" as -1 (never false), so that is
	// the value to reject; otherwise $b would be spliced in at
	// position 0.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1090
1091 // Fostering and adoption.
1092
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is the last `template` element if it is above the
 * last `table` in the stack (or there is no table), otherwise the
 * parent of the last `table` (inserting before that table), otherwise
 * the root element.
 *
 * @param BalanceElement|string $elt Element or text to foster parent.
 * @return BalanceElement|string The node which was inserted; in
 *   `tidyCompat` mode this may instead be an existing `mw:p-wrap`
 *   element which is being re-used.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	// The node to insert in front of, if any.
	$sibling = null;

	$templateWins = $templateIdx >= 0 &&
		( $tableIdx < 0 || $templateIdx > $tableIdx );
	if ( $templateWins ) {
		$target = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$target = $this->elements[$tableIdx]->parent;
		// Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$target !== null, "All tables should have parents"
		);
		$sibling = $this->elements[$tableIdx];
	} else {
		// Fall back to the `html` element.
		$target = $this->elements[0];
	}

	if ( $this->config['tidyCompat'] ) {
		if ( is_string( $elt ) ) {
			// Fostered text may need to go inside a p-wrapper.
			if ( $target->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// A fostered p-wrapper may be merged with the p-wrapper
			// which immediately precedes the insertion point.
			if ( $sibling ) {
				$pos = array_search( $sibling, $target->children, true );
			} else {
				$pos = count( $target->children );
			}
			$preceding = $pos > 0 ? $target->children[$pos - 1] : '';
			if (
				$preceding instanceof BalanceElement &&
				$preceding->isHtmlNamed( 'mw:p-wrap' )
			) {
				// Re-use existing p-wrapper.
				return $preceding;
			}
		}
	}

	if ( $sibling ) {
		$target->insertBefore( $sibling, $elt );
	} else {
		$target->appendChild( $elt );
	}
	return $elt;
}
1151
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *   active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something",
 *   false if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
	// Shortcut: when the current node is an HTML element named like
	// the subject and it is not in the list of active formatting
	// elements, just pop it off the stack of open elements.
	$current = $this->currentNode;
	if ( $current->isHtmlNamed( $tag ) && !$afe->isInList( $current ) ) {
		$this->pop();
		// no more handling required
		return true;
	}

	// Outer loop: give up after eight rounds.
	$outerCount = 0;
	while ( $outerCount < 8 ) {
		$outerCount++;

		// The formatting element is the last element in the list of
		// active formatting elements which sits between the end of
		// the list and the last scope marker (or the start of the
		// list if there is no marker) and has the subject's tag name.
		$fmtElt = $afe->findElementByTag( $tag );

		// No such element: the caller must act as described in the
		// "any other end tag" entry.
		if ( !$fmtElt ) {
			// false means handle by the default case
			return false;
		}

		// The element is in the list but not in the stack of open
		// elements: parse error; drop it from the list and stop.
		$fmtIndex = $this->indexOf( $fmtElt );
		if ( $fmtIndex < 0 ) {
			$afe->remove( $fmtElt );
			// true means no more handling required
			return true;
		}

		// The element is in the stack of open elements but not in
		// scope: parse error; ignore the token and stop.
		if ( !$this->inScope( $fmtElt ) ) {
			return true;
		}

		// The furthest block is the topmost node in the stack of open
		// elements which is lower in the stack than the formatting
		// element and is in the special category. There might not be
		// one.
		$furthestBlock = null;
		$furthestBlockIndex = -1;
		$depth = $this->length();
		for ( $i = $fmtIndex + 1; $i < $depth; $i++ ) {
			$candidate = $this->node( $i );
			if ( $candidate->isA( BalanceSets::$specialSet ) ) {
				$furthestBlock = $candidate;
				$furthestBlockIndex = $i;
				break;
			}
		}

		// Without a furthest block, skip the remaining steps: pop
		// everything from the current node up to and including the
		// formatting element, and remove the formatting element from
		// the list of active formatting elements.
		if ( !$furthestBlock ) {
			$this->popTag( $fmtElt );
			$afe->remove( $fmtElt );
			return true;
		}

		// The common ancestor is the element immediately above the
		// formatting element in the stack of open elements.
		$ancestor = $this->node( $fmtIndex - 1 );

		// A bookmark records the position of the formatting element
		// in the list of active formatting elements, relative to the
		// elements on either side of it.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $fmtElt, $bookmark );

		// Node and last node both start out as the furthest block.
		$node = $furthestBlock;
		$lastNode = $furthestBlock;
		$nodeIndex = $furthestBlockIndex;

		// Inner loop; the counter is 1 during the first pass.
		$innerCount = 0;
		while ( true ) {
			$innerCount++;

			// Move node to the element immediately above it in the
			// stack of open elements. When the previous node was
			// removed from the stack, the slot above its old position
			// is exactly the element that used to be above it.
			$nodeIndex--;
			$node = $this->node( $nodeIndex );

			// Reaching the formatting element ends the inner loop.
			if ( $node === $fmtElt ) {
				break;
			}

			// After more than three passes, a node which is in the
			// list of active formatting elements is dropped from it.
			$inList = $afe->isInList( $node );
			if ( $inList && $innerCount > 3 ) {
				$afe->remove( $node );
				$inList = false;
			}

			// A node which is not in the list of active formatting
			// elements is removed from the stack of open elements,
			// and the inner loop starts over.
			if ( !$inList ) {
				// Don't flatten here, since we're about to relocate
				// parts of this $node.
				$this->removeElement( $node, false );
				continue;
			}

			// Create a new element for the token that node was
			// created for, and put it in place of node both in the
			// list of active formatting elements and in the stack of
			// open elements. Node is now the new element.
			$clone = new BalanceElement(
				$node->namespaceURI, $node->localName, $node->attribs );
			$afe->replace( $node, $clone );
			$this->replaceAt( $nodeIndex, $clone );
			$node = $clone;

			// While last node is still the furthest block, the
			// bookmark moves to just after the new node in the list
			// of active formatting elements.
			if ( $lastNode === $furthestBlock ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $clone, $bookmark );
			}

			// Insert last node into node, first removing it from
			// its previous parent node if any.
			$node->appendChild( $lastNode );

			// Last node is now node.
			$lastNode = $node;
		}

		// If the common ancestor is a table, tbody, tfoot, thead, or
		// tr element, foster parent whatever last node ended up
		// being; otherwise append it to the common ancestor. Either
		// way it is first removed from its previous parent, if any.
		if (
			$this->fosterParentMode &&
			$ancestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $lastNode );
		} else {
			$ancestor->appendChild( $lastNode );
		}

		// Create an element for the token for which the formatting
		// element was created, with the furthest block as the
		// intended parent.
		$replacement = new BalanceElement(
			$fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );

		// Move all children of the furthest block into that new
		// element, then append the new element to the furthest block.
		$replacement->adoptChildren( $furthestBlock );
		$furthestBlock->appendChild( $replacement );

		// In the list of active formatting elements, the formatting
		// element goes away and the new element takes the position
		// of the bookmark.
		$afe->remove( $fmtElt );
		$afe->replace( $bookmark, $replacement );

		// In the stack of open elements, the formatting element goes
		// away and the new element is inserted immediately below the
		// furthest block.
		$this->removeElement( $fmtElt );
		$this->insertAfter( $furthestBlock, $replacement );
	}

	return true;
}
1361
/**
 * Describe the open elements stack for debugging: the local names of
 * its elements, root first, separated by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $element ) {
		$names[] = $element->localName;
	}
	return implode( ' ', $names );
}
1374 }
1375
/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * Markers are linked into the same doubly-linked list as the formatting
 * elements, so they carry the same two link properties.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** The entry which follows this marker in the list, if any */
	public $nextAFE = null;

	/** The entry which precedes this marker in the list, if any */
	public $prevAFE = null;
}
1386
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is a doubly-linked list threaded through the `prevAFE` and
 * `nextAFE` properties of its entries (BalanceElement and BalanceMarker
 * objects). Elements are additionally chained into "Noah's Ark" buckets
 * through their `nextNoah` property.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry of the list from its neighbours and drop the
	 * list's own references.
	 * NOTE(review): presumably this is done so that reference cycles
	 * between entries do not keep them alive -- confirm.
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers have no Noah chain; don't create a property on them.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the list and open a new, empty Noah table for
	 * the segment of the list which follows it.
	 */
	public function insertMarker() {
		$this->appendToList( new BalanceMarker );
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is already in the list.
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element after the last marker, then drop the earliest
		// one before adding the new one.
		$noahKey = $elt->getNoahKey();
		$table = $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( isset( $table[$noahKey] ) ) {
			$earliest = $table[$noahKey];
			$count = 1;
			for ( $member = $earliest; $member->nextNoah; $member = $member->nextNoah ) {
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $earliest );
			}
		}
		$this->addToNoahList( $elt );

		// Add to the main AFE list
		$this->appendToList( $elt );
	}

	/**
	 * Link $entry in as the new last entry of the main list.
	 * @param BalanceElement|BalanceMarker $entry
	 */
	private function appendToList( $entry ) {
		if ( $this->tail ) {
			$this->tail->nextAFE = $entry;
			$entry->prevAFE = $this->tail;
		} else {
			$this->head = $entry;
		}
		$this->tail = $entry;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$cursor = $this->tail;
		while ( $cursor && !( $cursor instanceof BalanceMarker ) ) {
			// Unlink the element
			$earlier = $cursor->prevAFE;
			$cursor->prevAFE = null;
			if ( $earlier ) {
				$earlier->nextAFE = null;
			}
			$cursor->nextNoah = null;
			$cursor = $earlier;
		}
		if ( $cursor ) {
			// We finished on a marker: unlink it and pop its segment
			// off the Noah table stack
			$earlier = $cursor->prevAFE;
			if ( $earlier ) {
				$earlier->nextAFE = null;
			}
			$cursor = $earlier;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$cursor ) {
			$this->head = null;
		}
		$this->tail = $cursor;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The element, or null if there is none.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element is in the list if it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 *
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is not in the list.
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to its bucket in the Noah table of the current
	 * (most recent) segment of the list.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its Noah bucket.
	 *
	 * The table of the most recent segment is consulted first, since
	 * that is where the element normally lives; older segments are only
	 * searched if it is not found there. An element which is in no
	 * bucket at all is silently ignored.
	 *
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$key = $elt->getNoahKey();
		for ( $depth = count( $this->noahTableStack ) - 1; $depth >= 0; $depth-- ) {
			if ( !isset( $this->noahTableStack[$depth][$key] ) ) {
				continue;
			}
			$noahElt = $this->noahTableStack[$depth][$key];
			if ( $noahElt === $elt ) {
				// $elt is the first member of the bucket
				if ( $elt->nextNoah ) {
					$this->noahTableStack[$depth][$key] = $elt->nextNoah;
					$elt->nextNoah = null;
				} else {
					unset( $this->noahTableStack[$depth][$key] );
				}
				return;
			}
			// Walk the rest of the bucket looking for $elt
			while ( $noahElt->nextNoah ) {
				if ( $noahElt->nextNoah === $elt ) {
					// Found it, unlink
					$noahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					return;
				}
				$noahElt = $noahElt->nextNoah;
			}
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per list entry, head first, with a
	 *   diagnostic note wherever the list links are inconsistent.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1752
1753 /**
1754 * An implementation of the tree building portion of the HTML5 parsing
1755 * spec.
1756 *
1757 * This is used to balance and tidy output so that the result can
1758 * always be cleanly serialized/deserialized by an HTML5 parser. It
1759 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1760 * a number of constraints which are not enforced by the HTML5 parsing
1761 * process. But the result will be free of gross errors: misnested or
1762 * unclosed tags, for example, and will be unchanged by spec-compliant
1763 * parsing followed by serialization.
1764 *
1765 * The tree building stage is structured as a state machine.
1766 * When comparing the implementation to
1767 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1768 * note that each state is implemented as a function with a
1769 * name ending in `Mode` (because the HTML spec refers to them
1770 * as insertion modes). The current insertion mode is held by
1771 * the $parseMode property.
1772 *
1773 * The following simplifications have been made:
1774 * - We handle body content only (ie, we start `in body`.)
1775 * - The document is never in "quirks mode".
1776 * - All occurrences of < and > have been entity escaped, so we
1777 * can parse tags by simply splitting on those two characters.
1778 * (This also simplifies the handling of < inside <textarea>.)
1779 * The character < must not appear inside comments.
1780 * Similarly, all attributes have been "cleaned" and are double-quoted
1781 * and escaped.
1782 * - All null characters are assumed to have been removed.
1783 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1784 * <frame>, <plaintext>, <xmp>, <iframe>,
1785 * <noembed>, <noscript>, <script>, <title>. As a result,
1786 * further simplifications can be made:
1787 * - `frameset-ok` is not tracked.
1788 * - `head element pointer` is not tracked (but presumed non-null)
1789 * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
1790 * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
1791 *
1792 * We generally mark places where we omit cases from the spec due to
1793 * disallowed elements with a comment: `// OMITTED: <element-name>`.
1794 *
1795 * The HTML spec keeps a flag during the parsing process to track
1796 * whether or not a "parse error" has been encountered. We don't
1797 * bother to track that flag, we just implement the error-handling
1798 * process as specified.
1799 *
1800 * @ingroup Parser
1801 * @since 1.27
1802 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1803 */
1804 class Balancer {
1805 private $parseMode;
1806 /** @var \Iterator */
1807 private $bitsIterator;
1808 private $allowedHtmlElements;
1809 /** @var BalanceActiveFormattingElements */
1810 private $afe;
1811 /** @var BalanceStack */
1812 private $stack;
1813 private $strict;
1814 private $allowComments;
1815 private $config;
1816
1817 private $textIntegrationMode;
1818 private $pendingTableText;
1819 private $originalInsertionMode;
1820 private $fragmentContext;
1821 private $formElementPointer;
1822 private $ignoreLinefeed;
1823 private $inRCDATA;
1824 private $inRAWTEXT;
1825
1826 /** @var callable|null */
1827 private $processingCallback;
1828 /** @var array */
1829 private $processingArgs;
1830
1831 /**
1832 * Valid HTML5 comments.
1833 * Regex borrowed from Tim Starling's "remex-html" project.
1834 */
1835 const VALID_COMMENT_REGEX = "~ !--
1836 ( # 1. Comment match detector
1837 > | -> | # Invalid short close
1838 ( # 2. Comment contents
1839 (?:
1840 (?! --> )
1841 (?! --!> )
1842 (?! --! \z )
1843 (?! -- \z )
1844 (?! - \z )
1845 .
1846 )*+
1847 )
1848 ( # 3. Comment close
1849 --> | # Normal close
1850 --!> | # Comment end bang
1851 ( # 4. Indicate matches requiring EOF
1852 --! | # EOF in comment end bang state
1853 -- | # EOF in comment end state
1854 - | # EOF in comment end dash state
1855 (?#nothing) # EOF in comment state
1856 )
1857 )
1858 )
1859 ([^<]*) \z # 5. Non-tag text after the comment
1860 ~xs";
1861
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration. Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted. This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names. When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed. The <pre>/<listing>/<textarea> serialization
 *         is also tweaked to allow lossless round trips.
 *         (See: https://github.com/whatwg/html/issues/944)
 *     'allowComments': boolean, defaults to true.
 *         When true, allows HTML comments in the input.
 *         The Sanitizer generally strips all comments, so if you
 *         are running on sanitized output you can set this to
 *         false to get a bit more performance.
 * @throws ParameterAssertionException If 'allowedHtmlElements' names
 *   an element which the balancer does not support.
 */
public function __construct( array $config = [] ) {
	// Caller-supplied keys win; missing keys get their defaults.
	$this->config = $config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check! Both arrays are keyed by tag name and the
		// values are irrelevant, so intersecting the keys yields the
		// allowed elements which are in fact unsupported.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1919
/**
 * Balance the HTML fragment in $text and return the resulting markup,
 * subject to the caveats listed in the class description.  The result
 * will typically be idempotent -- that is, running the output through
 * the balancer again would result in no change.
 *
 * @param string $text The markup to be balanced
 * @param callable|null $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Per-run parser state.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack( $this->config );
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	// Tokenizer-emulation flags all start out cleared.
	$this->inRAWTEXT = false;
	$this->inRCDATA = false;
	$this->ignoreLinefeed = false;
	$this->textIntegrationMode = false;

	// The stack is constructed with an <html> element already on it.
	// Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// Point the form element pointer at the nearest <form> found by
	// walking up from the context element, if there is one.
	$this->formElementPointer = null;
	$ancestor = $this->fragmentContext;
	while ( $ancestor != null ) {
		if ( $ancestor->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $ancestor;
			break;
		}
		$ancestor = $ancestor->parent;
	}

	// The input was split on '<', so the first piece is text, not a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Every remaining piece begins just after a '<'.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;

	return $balanced;
}
1976
/**
 * Pass a token to the tree builder.  Callers in this class pass
 * "tag", "endtag", "text", "comment" or "eof" as the $token type.
 *
 * This is what the spec calls the "tree construction dispatcher": it
 * decides whether the token is handled by the foreign-content rules
 * or by the handler for the current insertion mode.
 *
 * @param string $token Token type
 * @param string|null $value Tag name, text or comment contents (null
 *   for 'eof')
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a start tag was written as '<x/>'
 * @return bool False when the tag is in the unsupported set; otherwise
 *   whatever the selected handler returns
 */
private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );
	if ( $isTagToken ) {
		// validate tags against $unsupportedSet
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			// As described in "simplifications" above, these tags are
			// not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		// Don't actually inject the empty string as a text token.
		return true;
	}

	// Support pre/listing/textarea by suppressing initial linefeed.
	// The flag is consumed by whichever token comes next.
	if ( $this->ignoreLinefeed ) {
		$this->ignoreLinefeed = false;
		if ( $token === 'text' && $value[0] === "\n" ) {
			if ( $value === "\n" ) {
				// Nothing would be left, don't inject the empty string.
				return true;
			}
			$value = substr( $value, 1 );
		}
	}

	// Some hoops we have to jump through
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	// Decide between the HTML insertion-mode handlers and the foreign
	// content rules.
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		$useHtmlRules = ( $token === 'text' ) || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useHtmlRules = true;
	} else {
		$useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
	}
	// The current insertion mode is stored as the name of its handler.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfClose );
}
2049
/**
 * Handle a token according to the rules for foreign (non-HTML
 * namespace) content.  Called from insertToken() when the dispatcher
 * decides the token is not to be processed by an HTML insertion mode.
 *
 * @param string $token 'text', 'comment', 'tag' or 'endtag'
 * @param string $value Text or comment contents, or the tag name
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a start tag was written as '<x/>'
 * @return bool|null True when handled here, or the result of
 *   reprocessing the token as HTML.  Any other token type falls off
 *   the end and yields null.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'font':
				if (
					!isset( $attribs['color'] ) &&
					!isset( $attribs['face'] ) &&
					!isset( $attribs['size'] )
				) {
					// A <font> with none of these attributes is
					// treated like "any other start tag".
					break;
				}
				// A <font> with color, face or size is one of the HTML
				// tags which break out of foreign content: fall through
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
				if ( $this->fragmentContext ) {
					// Fragment case: handle as "any other start tag".
					break;
				}
				// Pop at least one element, then keep popping until the
				// current node is one HTML content can be inserted into.
				while ( true ) {
					$this->stack->pop();
					$node = $this->stack->currentNode;
					if (
						$node->isMathmlTextIntegrationPoint() ||
						$node->isHtmlIntegrationPoint() ||
						$node->isHtml()
					) {
						break;
					}
				}
				// Reprocess the same token through the dispatcher.
				return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// "Any other start tag"
		// The new element takes the namespace of the adjusted current
		// node: the fragment context when only one element is on the
		// stack, otherwise the current node.
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfClose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfClose );
			} elseif ( $i === 0 ) {
				// Reached stack entry 0 without a match: ignore the tag.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
2154
/**
 * Grab the next "token" from $bitsIterator.  This is either a open/close
 * tag or text or a comment, depending on whether the Sanitizer approves.
 *
 * Each piece taken from the iterator is the input that followed one
 * '<'.  It is split into a tag (or comment) and the text after it, and
 * both are passed to insertToken().  A piece which does not form an
 * acceptable tag is emitted as text instead.
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	// Handle comments.  These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		// Narrow, never widen: a tag already rejected because we are
		// inside RCDATA/RAWTEXT content must stay rejected even if its
		// name is on the allowed list.
		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The callback receives the attribute string by reference
			// so that it can rewrite it in place.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		if ( $sanitize ) {
			$attribs = Sanitizer::decodeTagAttributes( $attribStr );
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		} else {
			$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		}
		// The tree builder may still refuse the tag.
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodTag ) {
		// Escape stray '>' in the text following the tag (once is enough).
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		// Raw text is passed through verbatim, restoring the '<' that
		// the input was split on.
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2251
/**
 * Change the current insertion mode.
 *
 * @param string $mode Name of the handler method for the new mode;
 *   it has to end in "Mode"
 * @return string The insertion mode which was active before the call
 */
private function switchMode( $mode ) {
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previousMode = $this->parseMode;
	$this->parseMode = $mode;

	return $previousMode;
}
2260
/**
 * Change the insertion mode, then send the given token through the
 * dispatcher again so that it is handled under the new mode.
 *
 * @param string $mode Name of the new insertion mode
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a start tag was self-closed
 * @return bool Whatever insertToken() returns for the token
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$handled = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $handled;
}
2265
/**
 * Choose the insertion mode which matches the current contents of the
 * stack of open elements.
 *
 * The stack is walked until an element which determines the mode is
 * found.  When stack entry 0 is reached, the fragment context element
 * (if any) is examined in its place, and 'inBodyMode' is the result if
 * that does not determine a mode either.
 */
private function resetInsertionMode() {
	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			// Entry 0 is the last node considered; in the fragment
			// case the context element stands in for it.
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			switch ( $node->localName ) {
				case 'select':
					// Look for a <table> among the other open elements,
					// stopping at a <template>.
					// NOTE(review): the index arithmetic below depends on
					// what keys iterating the stack yields for $i, which
					// is defined outside this section -- confirm that it
					// visits the ancestors of the <select>.
					$stackLength = $this->stack->length();
					for ( $j = $i + 1; $j < $stackLength - 1; $j++ ) {
						$ancestor = $this->stack->node( $stackLength - $j - 1 );
						if ( $ancestor->isHtmlNamed( 'template' ) ) {
							break;
						}
						if ( $ancestor->isHtmlNamed( 'table' ) ) {
							$this->switchMode( 'inSelectInTableMode' );
							return;
						}
					}
					$this->switchMode( 'inSelectMode' );
					return;
				case 'tr':
					$this->switchMode( 'inRowMode' );
					return;
				case 'tbody':
				case 'tfoot':
				case 'thead':
					$this->switchMode( 'inTableBodyMode' );
					return;
				case 'caption':
					$this->switchMode( 'inCaptionMode' );
					return;
				case 'colgroup':
					$this->switchMode( 'inColumnGroupMode' );
					return;
				case 'table':
					$this->switchMode( 'inTableMode' );
					return;
				case 'template':
					// Use the most recently pushed template insertion mode.
					$this->switchMode(
						array_slice( $this->templateInsertionModes, -1 )[0]
					);
					return;
				case 'body':
					$this->switchMode( 'inBodyMode' );
					return;
				// OMITTED: <frameset>
				// OMITTED: <html>
				// OMITTED: <head>
				default:
					// Table cells only select a mode when they are not
					// the last node considered.
					if ( !$last ) {
						// OMITTED: <head>
						if ( $node->isA( BalanceSets::$tableCellSet ) ) {
							$this->switchMode( 'inCellMode' );
							return;
						}
					}
			}
		}
		if ( $last ) {
			// Nothing on the stack decided the mode.
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2335
/**
 * Finish the parse.
 *
 * Of the steps the spec lists for "stop parsing", only step 2 ("pop
 * all the nodes off the stack of open elements") applies here, and
 * even then the top-most <html> element is left on the stack.
 */
private function stopParsing() {
	// Drop the list of active formatting elements before unwinding the
	// stack; otherwise the element objects stay live during
	// serialization, potentially using O(N^2) memory.  This is safe
	// because popping the stack never reconstructs the active
	// formatting elements.
	$this->afe = null;

	// Unwind the stack down to the top-most <html> element.
	$this->stack->popTo( 1 );
}
2348
/**
 * Open a raw text element: insert it, note its name in $inRAWTEXT so
 * that advance() treats what follows as text until the matching end
 * tag, and switch to 'inTextMode', remembering the mode to go back to.
 *
 * @param string $value Tag name of the raw text element
 * @param array|null $attribs Attributes of the element
 * @return bool Always true
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	$this->inRAWTEXT = $value;

	$previousMode = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $previousMode;

	return true;
}
2355
/**
 * Insertion mode handler used inside a raw text element (see
 * parseRawText()).
 *
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a start tag was self-closed
 * @return bool True, or the result of reprocessing an 'eof' token
 *   under the original insertion mode
 */
private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'eof' || $token === 'endtag' ) {
		// Either way the element is finished: close it and go back to
		// the mode which was active before it was opened.
		$this->stack->pop();
		if ( $token === 'endtag' ) {
			$this->switchMode( $this->originalInsertionMode );
			return true;
		}
		// End of input still has to be seen by the original mode.
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfClose
		);
	}

	// Every other token type is ignored.
	return true;
}
2372
/**
 * Handler for the tokens covered by the spec's "in head" insertion
 * mode.  Other handlers (e.g. inBodyMode()) delegate
 * specific tags to this method.
 *
 * @param string $token Token type
 * @param string|null $value Token value (tag name or text)
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a start tag was self-closed
 * @return bool True when the token was handled or ignored here;
 *   otherwise the result of reprocessing it through insertToken()
 */
private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; whatever follows it is
		// handled at the bottom of the function.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$this->stack->insertText( $matches[0] );
			$value = substr( $value, strlen( $matches[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
		// Fall through to handle non-whitespace below.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'meta':
				// OMITTED: in a full HTML parser, this might change the encoding.
				// falls through
			// OMITTED: <html>
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
				// These are inserted and immediately popped again.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;
			// OMITTED: <title>
			// OMITTED: <noscript>
			case 'noframes':
			case 'style':
				return $this->parseRawText( $value, $attribs );
			// OMITTED: <script>
			case 'template':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				// OMITTED: frameset_ok
				$this->switchMode( 'inTemplateMode' );
				// Record the mode just entered on the stack of template
				// insertion modes.
				$this->templateInsertionModes[] = $this->parseMode;
				return true;
			// OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			// OMITTED: <head>
			// OMITTED: <body>
			// OMITTED: <html>
			case 'br':
				break; // handle at the bottom of the function
			case 'template':
				if ( $this->stack->indexOf( $value ) < 0 ) {
					return true; // Ignore the token.
				}
				$this->stack->generateImpliedEndTags( null, true /* thorough */ );
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				array_pop( $this->templateInsertionModes );
				$this->resetInsertionMode();
				return true;
			default:
				// ignore any other end tag
				return true;
		}
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	// If not handled above
	// Note that 'head' is not one of the end tags listed above, so this
	// call ends up in the "ignore any other end tag" branch.
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	// Then redo this one
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
2442
/**
 * Handler for the "in body" insertion mode.  This is the mode which
 * balance() starts in.
 *
 * Cases marked OMITTED are parts of the spec which this
 * implementation leaves out.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or
 *   'comment'; anything else trips an invariant
 * @param string|null $value Tag name, or text/comment contents
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfClose Whether a start tag was self-closed
 * @return bool True when the token was handled or ignored here, or the
 *   result of whichever handler the token was delegated to
 */
private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'eof' ) {
		// Open templates get to see the end of input first.
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
		}
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			// OMITTED: <html>
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			// OMITTED: <script>
			case 'style':
			case 'template':
			// OMITTED: <title>
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
			// OMITTED: <body>
			// OMITTED: <frameset>

			case 'address':
			case 'article':
			case 'aside':
			case 'blockquote':
			case 'center':
			case 'details':
			case 'dialog':
			case 'dir':
			case 'div':
			case 'dl':
			case 'fieldset':
			case 'figcaption':
			case 'figure':
			case 'footer':
			case 'header':
			case 'hgroup':
			case 'main':
			case 'nav':
			case 'ol':
			case 'p':
			case 'section':
			case 'summary':
			case 'ul':
				// An open <p> in button scope is closed first, by
				// feeding this handler a synthetic </p>.
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'menu':
				if ( $this->stack->inButtonScope( "p" ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				// Headings do not nest: an open heading is popped first.
				if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'pre':
			case 'listing':
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				// insertToken() drops a linefeed at the start of the
				// next token while this flag is set.
				$this->ignoreLinefeed = true;
				// OMITTED: frameset_ok
				return true;

			case 'form':
				if (
					$this->formElementPointer &&
					$this->stack->indexOf( 'template' ) < 0
				) {
					return true; // in a form, not in a template.
				}
				if ( $this->stack->inButtonScope( "p" ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$elt = $this->stack->insertHTMLElement( $value, $attribs );
				// The form element pointer is only set outside templates.
				if ( $this->stack->indexOf( 'template' ) < 0 ) {
					$this->formElementPointer = $elt;
				}
				return true;

			case 'li':
				// OMITTED: frameset_ok
				// Close an open <li>, giving up at the first special
				// element other than those in $addressDivPSet.
				foreach ( $this->stack as $node ) {
					if ( $node->isHtmlNamed( 'li' ) ) {
						$this->inBodyMode( 'endtag', 'li' );
						break;
					}
					if (
						$node->isA( BalanceSets::$specialSet ) &&
						!$node->isA( BalanceSets::$addressDivPSet )
					) {
						break;
					}
				}
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'dd':
			case 'dt':
				// OMITTED: frameset_ok
				// Same as for <li>, but closing an open <dd> or <dt>.
				foreach ( $this->stack as $node ) {
					if ( $node->isHtmlNamed( 'dd' ) ) {
						$this->inBodyMode( 'endtag', 'dd' );
						break;
					}
					if ( $node->isHtmlNamed( 'dt' ) ) {
						$this->inBodyMode( 'endtag', 'dt' );
						break;
					}
					if (
						$node->isA( BalanceSets::$specialSet ) &&
						!$node->isA( BalanceSets::$addressDivPSet )
					) {
						break;
					}
				}
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			// OMITTED: <plaintext>

			case 'button':
				// Buttons do not nest: close the open one, then
				// reprocess this start tag.
				if ( $this->stack->inScope( 'button' ) ) {
					$this->inBodyMode( 'endtag', 'button' );
					return $this->insertToken( $token, $value, $attribs, $selfClose );
				}
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'a':
				// An <a> still in the list of active formatting
				// elements is closed before the new one is opened.
				$activeElement = $this->afe->findElementByTag( 'a' );
				if ( $activeElement ) {
					$this->inBodyMode( 'endtag', 'a' );
					if ( $this->afe->isInList( $activeElement ) ) {
						$this->afe->remove( $activeElement );
						// Don't flatten here, since when we fall
						// through below we might foster parent
						// the new <a> tag inside this one.
						$this->stack->removeElement( $activeElement, false );
					}
				}
				// Falls through
			case 'b':
			case 'big':
			case 'code':
			case 'em':
			case 'font':
			case 'i':
			case 's':
			case 'small':
			case 'strike':
			case 'strong':
			case 'tt':
			case 'u':
				// Formatting elements are also recorded in the list of
				// active formatting elements.
				$this->afe->reconstruct( $this->stack );
				$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
				return true;

			case 'nobr':
				$this->afe->reconstruct( $this->stack );
				if ( $this->stack->inScope( 'nobr' ) ) {
					$this->inBodyMode( 'endtag', 'nobr' );
					$this->afe->reconstruct( $this->stack );
				}
				$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
				return true;

			case 'applet':
			case 'marquee':
			case 'object':
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				// OMITTED: frameset_ok
				return true;

			case 'table':
				// The document is never in "quirks mode"; see simplifications
				// above.
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				// OMITTED: frameset_ok
				$this->switchMode( 'inTableMode' );
				return true;

			case 'area':
			case 'br':
			case 'embed':
			case 'img':
			case 'keygen':
			case 'wbr':
				// Inserted and immediately popped again.
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				// OMITTED: frameset_ok
				return true;

			case 'input':
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				// OMITTED: frameset_ok
				// (hence we don't need to examine the tag's "type" attribute)
				return true;

			case 'param':
			case 'source':
			case 'track':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'hr':
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'image':
				// warts!
				// <image> is processed as if it had been <img>.
				return $this->inBodyMode( $token, 'img', $attribs, $selfClose );

			case 'textarea':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->ignoreLinefeed = true;
				$this->inRCDATA = $value; // emulate rcdata tokenizer mode
				// OMITTED: frameset_ok
				return true;

			// OMITTED: <xmp>
			// OMITTED: <iframe>
			// OMITTED: <noembed>
			// OMITTED: <noscript>

			case 'select':
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				// Which "in select" mode applies depends on the mode
				// that was current when the <select> was seen.
				switch ( $this->parseMode ) {
					case 'inTableMode':
					case 'inCaptionMode':
					case 'inTableBodyMode':
					case 'inRowMode':
					case 'inCellMode':
						$this->switchMode( 'inSelectInTableMode' );
						return true;
					default:
						$this->switchMode( 'inSelectMode' );
						return true;
				}

			case 'optgroup':
			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->inBodyMode( 'endtag', 'option' );
				}
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'menuitem':
				if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
					$this->stack->pop();
				}
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'rb':
			case 'rtc':
				if ( $this->stack->inScope( 'ruby' ) ) {
					$this->stack->generateImpliedEndTags();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'rp':
			case 'rt':
				if ( $this->stack->inScope( 'ruby' ) ) {
					$this->stack->generateImpliedEndTags( 'rtc' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'math':
				$this->afe->reconstruct( $this->stack );
				// We skip the spec's "adjust MathML attributes" and
				// "adjust foreign attributes" steps, since the browser will
				// do this later when it parses the output and it doesn't affect
				// balancing.
				$this->stack->insertForeignElement(
					BalanceSets::MATHML_NAMESPACE, $value, $attribs
				);
				if ( $selfClose ) {
					// emit explicit </math> tag.
					$this->stack->pop();
				}
				return true;

			case 'svg':
				$this->afe->reconstruct( $this->stack );
				// We skip the spec's "adjust SVG attributes" and
				// "adjust foreign attributes" steps, since the browser will
				// do this later when it parses the output and it doesn't affect
				// balancing.
				$this->stack->insertForeignElement(
					BalanceSets::SVG_NAMESPACE, $value, $attribs
				);
				if ( $selfClose ) {
					// emit explicit </svg> tag.
					$this->stack->pop();
				}
				return true;

			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <frame>
			case 'head':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// Ignore table tags if we're not inTableMode
				return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			// </body>,</html> are unsupported.

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );

			case 'address':
			case 'article':
			case 'aside':
			case 'blockquote':
			case 'button':
			case 'center':
			case 'details':
			case 'dialog':
			case 'dir':
			case 'div':
			case 'dl':
			case 'fieldset':
			case 'figcaption':
			case 'figure':
			case 'footer':
			case 'header':
			case 'hgroup':
			case 'listing':
			case 'main':
			case 'menu':
			case 'nav':
			case 'ol':
			case 'pre':
			case 'section':
			case 'summary':
			case 'ul':
				// Ignore if there is not a matching open tag
				if ( !$this->stack->inScope( $value ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				return true;

			case 'form':
				if ( $this->stack->indexOf( 'template' ) < 0 ) {
					// Outside templates, </form> closes the element the
					// form element pointer refers to, and clears the
					// pointer.
					$openform = $this->formElementPointer;
					$this->formElementPointer = null;
					if ( !$openform || !$this->stack->inScope( $openform ) ) {
						return true;
					}
					$this->stack->generateImpliedEndTags();
					// Don't flatten yet if we're removing a <form> element
					// out-of-order. (eg. `<form><div></form>`)
					$flatten = ( $this->stack->currentNode === $openform );
					$this->stack->removeElement( $openform, $flatten );
				} else {
					if ( !$this->stack->inScope( 'form' ) ) {
						return true;
					}
					$this->stack->generateImpliedEndTags();
					$this->stack->popTag( 'form' );
				}
				return true;

			case 'p':
				// A </p> without an open <p> first gets an empty <p>
				// inserted, and is then reprocessed to close it.
				if ( !$this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'tag', 'p', [] );
					return $this->insertToken( $token, $value, $attribs, $selfClose );
				}
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
				return true;

			case 'li':
				if ( !$this->stack->inListItemScope( $value ) ) {
					return true; // ignore
				}
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
				return true;

			case 'dd':
			case 'dt':
				if ( !$this->stack->inScope( $value ) ) {
					return true; // ignore
				}
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
				return true;

			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				// Any heading end tag closes whichever heading is open.
				if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
					return true; // ignore
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$headingSet );
				return true;

			case 'sarcasm':
				// Take a deep breath, then:
				break;

			case 'a':
			case 'b':
			case 'big':
			case 'code':
			case 'em':
			case 'font':
			case 'i':
			case 'nobr':
			case 's':
			case 'small':
			case 'strike':
			case 'strong':
			case 'tt':
			case 'u':
				if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
					return true; // If we did something, we're done.
				}
				break; // Go to the "any other end tag" case.

			case 'applet':
			case 'marquee':
			case 'object':
				if ( !$this->stack->inScope( $value ) ) {
					return true; // ignore
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				return true;

			case 'br':
				// Turn </br> into <br>
				return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here
		// Close the first matching element found, unless a special
		// element is encountered before it.
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $i ); // including $i
				break;
			} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} else {
		// No explicit return here: an unknown token type is reported
		// through the invariant check.
		Assert::invariant( false, "Bad token type: $token" );
	}
}
2975
/**
 * Process one token while the tree builder is in the "in table"
 * insertion mode.
 *
 * Table-structure tags are handled here.  Every token that is not
 * explicitly handled is passed to inBodyMode() with the stack's
 * fosterParentMode flag raised for the duration of that call.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or 'comment'
 * @param string $value Tag name for 'tag'/'endtag' tokens; the content
 *   for 'text' and 'comment' tokens
 * @param array|null $attribs Attributes of a start tag, keyed by attribute
 *   name (only 'type' is inspected here)
 * @param bool $selfClose Forwarded unchanged to whichever handler the
 *   token is delegated to
 * @return bool True when the token was handled here; otherwise the
 *   result of the handler the token was delegated to
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Start buffering: inTableTextMode collects the text and decides
			// what to do with it when the next non-text token arrives.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode',
				$token, $value, $attribs, $selfClose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
				// Pop back to the table context first, exactly as the
				// colgroup and tbody/tfoot/thead cases below do.  Without
				// this, anything left open on the stack by foster-parented
				// content is still open when the caption is inserted.
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->afe->insertMarker();
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCaptionMode' );
				return true;
			case 'colgroup':
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inColumnGroupMode' );
				return true;
			case 'col':
				// Act as if a <colgroup> start tag had been seen, then
				// reprocess the <col> in the new mode.
				$this->inTableMode( 'tag', 'colgroup', [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inTableBodyMode' );
				return true;
			case 'td':
			case 'th':
			case 'tr':
				// Act as if a <tbody> start tag had been seen, then
				// reprocess the token in the new mode.
				$this->inTableMode( 'tag', 'tbody', [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			case 'table':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore this tag.
				}
				// A nested <table> start tag closes the open table first
				// and is then reprocessed.
				$this->inTableMode( 'endtag', $value );
				return $this->insertToken( $token, $value, $attribs, $selfClose );

			case 'style':
			// OMITTED: <script>
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );

			case 'input':
				if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
					break; // Handle this as "everything else"
				}
				// Hidden inputs are inserted in place and immediately popped.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'form':
				if (
					$this->formElementPointer ||
					$this->stack->indexOf( 'template' ) >= 0
				) {
					return true; // ignore this token
				}
				$this->formElementPointer =
					$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->popTag( $this->formElementPointer );
				return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'table':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore.
				}
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
				return true;
			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				return true; // Ignore the token.
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case: handle the token with the in-body
	// rules while foster parenting is enabled on the stack.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
3085
/**
 * Handler for the "in table text" insertion mode.
 *
 * Text tokens are accumulated in $this->pendingTableText.  The first
 * non-text token flushes that buffer and is then reprocessed in the mode
 * saved in $this->originalInsertionMode.
 *
 * @param string $token Token type
 * @param string $value Text for 'text' tokens; otherwise forwarded as-is
 * @param array|null $attribs Forwarded when the token is reprocessed
 * @param bool $selfClose Forwarded when the token is reprocessed
 * @return bool True for buffered text; otherwise the result of
 *   reprocessing the token in the saved mode
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		// Keep collecting; nothing is emitted until a non-text token shows up.
		$this->pendingTableText .= $value;
		return true;
	}

	// Take ownership of the buffered text and reset the buffer.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';

	$hasNonWhitespace = preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered ) === 1;
	if ( !$hasNonWhitespace ) {
		// Whitespace-only text may stay where it is.
		$this->stack->insertText( $buffered );
	} else {
		// Mirrors the "anything else" branch of inTableMode: in-body rules
		// with foster parenting enabled.
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
3107
/**
 * Helper for inCaptionMode: close the currently open <caption>.
 *
 * @return bool False (and nothing is changed) when no caption is in
 *   table scope; true after the caption has been popped and the mode
 *   switched back to inTableMode
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
3119
/**
 * Handler for the "in caption" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Forwarded to delegated handlers
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled here; otherwise the result of inBodyMode()
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Set when the token implicitly ends the caption and must then be
	// reprocessed in the mode endCaption() switches to.
	$closeThenReprocess = false;

	if ( $token === 'tag' ) {
		$tableStructureTags = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		$closeThenReprocess = in_array( $value, $tableStructureTags, true );
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		// <html> is omitted from this list by this implementation.
		$ignoredEndTags = [
			'body', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			// Ignore the token
			return true;
		}
		$closeThenReprocess = ( $value === 'table' );
	}

	if ( $closeThenReprocess ) {
		// Only reprocess if there really was a caption to close.
		if ( $this->endCaption() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
3166
/**
 * Handler for the "in column group" insertion mode.
 *
 * @param string $token Token type: 'text', 'tag', 'endtag', 'eof' or 'comment'
 * @param string $value Tag name, or the content of a text/comment token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled here; otherwise the result of the
 *   handler the token was delegated to
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $token ) {
		case 'text':
			// Leading whitespace is inserted as-is; only what follows it
			// is subject to the "anything else" rule below.
			if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leadingSpace ) ) {
				$this->stack->insertText( $leadingSpace[0] );
				$value = substr( $value, strlen( $leadingSpace[0] ) );
			}
			if ( strlen( $value ) === 0 ) {
				return true; // All text handled.
			}
			break;

		case 'tag':
			// <html> start tags are omitted by this implementation.
			if ( $value === 'col' ) {
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;
			}
			if ( $value === 'template' ) {
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
			}
			break;

		case 'endtag':
			if ( $value === 'colgroup' ) {
				if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
					$this->stack->pop();
					$this->switchMode( 'inTableMode' );
				}
				// Otherwise the token is simply ignored.
				return true;
			}
			if ( $value === 'col' ) {
				return true; // Ignore the token.
			}
			if ( $value === 'template' ) {
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
			}
			break;

		case 'eof':
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );

		case 'comment':
			$this->stack->insertComment( $value );
			return true;
	}

	// Anything else: close the colgroup (if it is the current node) and
	// reprocess the token in the resulting mode.
	if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}
	return true; // Ignore the token.
}
3217
/**
 * Helper for inTableBodyMode: close the open table section.
 *
 * @return bool False (and nothing is changed) when none of tbody, thead
 *   or tfoot is in table scope; true after the stack has been cleared to
 *   the table body context, the current node popped and the mode switched
 *   to inTableMode
 */
private function endSection() {
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
		if ( $this->stack->inTableScope( $sectionName ) ) {
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->pop();
			$this->switchMode( 'inTableMode' );
			return true;
		}
	}
	return false;
}
/**
 * Handler for the "in table body" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled here; otherwise the result of inTableMode()
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'tr' ) {
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;
		}
		if ( $value === 'th' || $value === 'td' ) {
			// Open an attribute-less <tr> first, then reprocess the cell.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfClose );
			return true;
		}
		$sectionEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ];
		if ( in_array( $value, $sectionEnders, true ) ) {
			// Close the current section, then reprocess the token.
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'table' ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// Only acted on when that very section is in table scope.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		}
		// <body> and <html> are omitted from this list by this implementation.
		$ignoredEndTags = [ 'caption', 'col', 'colgroup', 'td', 'th', 'tr' ];
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
3284
/**
 * Helper for inRowMode: close the open <tr>.
 *
 * @return bool False (and nothing is changed) when no tr is in table
 *   scope; true after the stack has been cleared to the table row
 *   context, the current node popped and the mode switched to
 *   inTableBodyMode
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Handler for the "in row" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled here; otherwise the result of inTableMode()
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Set when the token implicitly ends the row and must then be
	// reprocessed in the mode endRow() switches to.
	$closeRowThenReprocess = false;

	if ( $token === 'tag' ) {
		if ( $value === 'th' || $value === 'td' ) {
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;
		}
		$rowEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr',
		];
		$closeRowThenReprocess = in_array( $value, $rowEnders, true );
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'tr' ) {
			$this->endRow();
			return true;
		}
		// <body> and <html> are omitted from this list by this implementation.
		if ( in_array( $value, [ 'caption', 'col', 'colgroup', 'td', 'th' ], true ) ) {
			return true; // Ignore the token.
		}
		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// A section end tag only counts when that section is in scope.
			if ( !$this->stack->inTableScope( $value ) ) {
				return true;
			}
			$closeRowThenReprocess = true;
		} else {
			$closeRowThenReprocess = ( $value === 'table' );
		}
	}

	if ( $closeRowThenReprocess ) {
		if ( $this->endRow() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
3350
/**
 * Helper for inCellMode: close the open table cell by feeding the
 * matching end tag back into inCellMode.
 *
 * <td> is tried before <th>.
 *
 * @return bool True if a td or th was in table scope and its end tag
 *   was processed; false if neither was
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Handler for the "in cell" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Forwarded to delegated handlers
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled here; otherwise the result of inBodyMode()
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		$cellEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( in_array( $value, $cellEnders, true ) ) {
			// Close the open cell, then reprocess the start tag.
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		$isCellTag = ( $value === 'td' || $value === 'th' );
		$isTablePart = in_array(
			$value, [ 'table', 'tbody', 'tfoot', 'thead', 'tr' ], true
		);

		if ( $isCellTag || $isTablePart ) {
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				// An explicit cell end tag pops that cell; an enclosing
				// table-part end tag pops whichever cell is open.
				$this->stack->popTag( $isCellTag ? $value : BalanceSets::$tableCellSet );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				if ( $isTablePart ) {
					// The end tag itself still has to be handled.
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
			}
			return true;
		}

		// <body> and <html> are omitted from this list by this implementation.
		if ( in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ) {
			return true;
		}
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
3416
/**
 * Handler for the "in select" insertion mode.
 *
 * Tokens that match none of the cases below are ignored.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or 'comment'
 * @param string $value Tag name, or the content of a text/comment token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled or ignored here; otherwise the result
 *   of the handler the token was delegated to
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $token ) {
		case 'text':
			$this->stack->insertText( $value );
			return true;

		case 'comment':
			$this->stack->insertComment( $value );
			return true;

		case 'eof':
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );

		case 'tag':
			// <html> start tags are omitted by this implementation.
			if ( $value === 'option' || $value === 'optgroup' ) {
				// A new option/optgroup implicitly closes an open option;
				// a new optgroup additionally closes an open optgroup.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				if (
					$value === 'optgroup' &&
					$this->stack->currentNode->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;
			}
			if ( $value === 'select' ) {
				$this->inSelectMode( 'endtag', $value ); // treat it like endtag
				return true;
			}
			if ( in_array( $value, [ 'input', 'keygen', 'textarea' ], true ) ) {
				if ( !$this->stack->inSelectScope( 'select' ) ) {
					return true; // ignore token (fragment case)
				}
				// Close the select, then reprocess the token.
				$this->inSelectMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			if ( $value === 'script' || $value === 'template' ) {
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
			}
			break;

		case 'endtag':
			if ( $value === 'optgroup' ) {
				// An option sitting directly inside an optgroup is closed
				// together with that optgroup.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$depth = $this->stack->length();
					if (
						$depth >= 2 &&
						$this->stack->node( $depth - 2 )->isHtmlNamed( 'optgroup' )
					) {
						$this->stack->pop();
					}
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				return true;
			}
			if ( $value === 'option' ) {
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				return true;
			}
			if ( $value === 'select' ) {
				if ( $this->stack->inSelectScope( $value ) ) {
					$this->stack->popTag( $value );
					$this->resetInsertionMode();
				}
				// Not in select scope: fragment case, nothing to do.
				return true;
			}
			if ( $value === 'template' ) {
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
			}
			break;
	}

	// anything else: just ignore the token
	return true;
}
3492
/**
 * Handler for the "in select in table" insertion mode.
 *
 * Table-structure start and end tags close the open select and are then
 * reprocessed; every other token is handled by inSelectMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param array|null $attribs Forwarded to delegated handlers
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when an end tag is ignored; otherwise the result of
 *   the handler the token was delegated to
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	$isStartTag = ( $token === 'tag' );
	$isEndTag = ( $token === 'endtag' );
	$tableTags = [
		'caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th',
	];

	if ( ( $isStartTag || $isEndTag ) && in_array( $value, $tableTags, true ) ) {
		// An end tag for something that is not in table scope is dropped.
		if ( $isEndTag && !$this->stack->inTableScope( $value ) ) {
			return true;
		}
		// Close the select, then reprocess the token.
		$this->inSelectInTableMode( 'endtag', 'select' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}

	// anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
3517
/**
 * Handler for the "in template" insertion mode.
 *
 * @param string $token Token type: 'text', 'comment', 'eof', 'tag' or 'endtag'
 * @param string $value Tag name, or the content of a text/comment token
 * @param array|null $attribs Forwarded to delegated handlers
 * @param bool $selfClose Forwarded to delegated handlers
 * @return bool True when handled or ignored here; otherwise the result
 *   of the handler the token was delegated to
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) >= 0 ) {
			// Unwind the still-open template, then let the resulting
			// mode see the EOF again.
			$this->stack->popTag( 'template' );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			$this->insertToken( $token, $value, $attribs, $selfClose );
		} else {
			$this->stopParsing();
		}
		return true;
	}

	if ( $token === 'tag' ) {
		// <script> and <title> are omitted from this list by this implementation.
		$headTags = [
			'base', 'basefont', 'bgsound', 'link', 'meta',
			'noframes', 'style', 'template',
		];
		if ( in_array( $value, $headTags, true ) ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}

		// Mode in which a given start tag is reprocessed; tags not listed
		// here are reprocessed in inBodyMode.
		$modeForTag = [
			'caption' => 'inTableMode',
			'colgroup' => 'inTableMode',
			'tbody' => 'inTableMode',
			'tfoot' => 'inTableMode',
			'thead' => 'inTableMode',
			'col' => 'inColumnGroupMode',
			'tr' => 'inTableBodyMode',
			'td' => 'inRowMode',
			'th' => 'inRowMode',
		];
		$nextMode = isset( $modeForTag[$value] ) ? $modeForTag[$value] : 'inBodyMode';
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfClose
		);
	}

	if ( $token === 'endtag' ) {
		// Only </template> is acted on; every other end tag is ignored.
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3584 }