df9bb9674314e6393b0ee86e8b2ea89627caa276
[lhc/web/wiklou.git] / includes / tidy / RemexCompatMunger.php
1 <?php
2
3 namespace MediaWiki\Tidy;
4
5 use RemexHtml\HTMLData;
6 use RemexHtml\Serializer\Serializer;
7 use RemexHtml\Serializer\SerializerNode;
8 use RemexHtml\Tokenizer\Attributes;
9 use RemexHtml\Tokenizer\PlainAttributes;
10 use RemexHtml\TreeBuilder\TreeBuilder;
11 use RemexHtml\TreeBuilder\TreeHandler;
12 use RemexHtml\TreeBuilder\Element;
13
14 /**
15 * @internal
16 */
17 class RemexCompatMunger implements TreeHandler {
/**
 * Names of the elements that insertElement() treats as inline. An element
 * whose name is not a key of this array is handled as a block (unless it
 * is one of the metadata elements). The values are always true so that
 * membership can be tested with isset().
 *
 * @var array
 */
private static $onlyInlineElements = [
	'a' => true,
	'abbr' => true,
	'acronym' => true,
	'applet' => true,
	'b' => true,
	'basefont' => true,
	'bdo' => true,
	'big' => true,
	'br' => true,
	'button' => true,
	'cite' => true,
	'code' => true,
	'del' => true,
	'dfn' => true,
	'em' => true,
	'font' => true,
	'i' => true,
	'iframe' => true,
	'img' => true,
	'input' => true,
	'ins' => true,
	'kbd' => true,
	'label' => true,
	'legend' => true,
	'map' => true,
	'object' => true,
	'param' => true,
	'q' => true,
	'rb' => true,
	'rbc' => true,
	'rp' => true,
	'rt' => true,
	'rtc' => true,
	'ruby' => true,
	's' => true,
	'samp' => true,
	'select' => true,
	'small' => true,
	'span' => true,
	'strike' => true,
	'strong' => true,
	'sub' => true,
	'sup' => true,
	'textarea' => true,
	'tt' => true,
	'u' => true,
	'var' => true,
	// Those defined in tidy.conf
	'video' => true,
	'audio' => true,
	'bdi' => true,
	'data' => true,
	'time' => true,
	'mark' => true,
];
74
/**
 * Elements that this class classifies as "metadata": inserting one of them
 * neither starts a p-wrapper nor ends an enclosing p-wrapper. These are
 * typically elements that a browser does not render visibly.
 *
 * The list is intentionally incomplete; it only covers the tags that are
 * likely to be encountered in practice. Values are always true so that
 * membership can be tested with isset().
 *
 * @var array
 */
private static $metadataElements = [
	'style' => true,
	'script' => true,
	'link' => true,
	'meta' => true,
];
89
/**
 * Names of the formatting elements that may be marked as splittable.
 * insertElement() sets isSplittable on an element from this list when its
 * parent is a p-wrapper or is itself splittable. Values are always true so
 * that membership can be tested with isset().
 *
 * @var array
 */
private static $formattingElements = [
	'a' => true,
	'b' => true,
	'big' => true,
	'code' => true,
	'em' => true,
	'font' => true,
	'i' => true,
	'nobr' => true,
	's' => true,
	'small' => true,
	'strike' => true,
	'strong' => true,
	'tt' => true,
	'u' => true,
];
106
/** @var Serializer The downstream serializer to which tree events are forwarded */
private $serializer;

/** @var bool Whether messages given to trace() are passed on to wfDebug() */
private $trace;
112
/**
 * @param Serializer $serializer The serializer which receives the munged
 *   tree construction events
 * @param bool $trace If true, trace() messages are passed to wfDebug()
 */
public function __construct( Serializer $serializer, $trace = false ) {
	$this->trace = $trace;
	$this->serializer = $serializer;
}
121
/**
 * Start the document in the serializer, then attach munger data to the
 * serializer's root node and flag that node as needing p-wrapping.
 */
public function startDocument( $fragmentNamespace, $fragmentName ) {
	$this->serializer->startDocument( $fragmentNamespace, $fragmentName );

	$rootData = new RemexMungerData;
	$this->serializer->getRootNode()->snData = $rootData;
	$rootData->needsPWrapping = true;
}
128
/**
 * Forward the end-of-document event to the serializer unchanged.
 */
public function endDocument( $pos ) {
	$this->serializer->endDocument( $pos );
}
132
/**
 * Work out which serializer nodes an insertion should actually target.
 *
 * - ROOT: the parent is the serializer's root node and there is no
 *   reference node.
 * - BEFORE: the reference node is the node of $refElement and the parent
 *   is that node's parent.
 * - Otherwise: the node of $refElement is used, except that the insertion
 *   is diverted to the end of its chain of clones if it has been cloned,
 *   or else to its currently open child p-wrapper if it has one. The same
 *   node is returned as both parent and reference.
 *
 * @param int $preposition
 * @param Element|null $refElement
 * @return array A two-element list: [ parent node, reference node ]
 */
private function getParentForInsert( $preposition, $refElement ) {
	if ( $preposition === TreeBuilder::ROOT ) {
		return [ $this->serializer->getRootNode(), null ];
	}

	$target = $refElement->userData;
	if ( $preposition === TreeBuilder::BEFORE ) {
		return [ $this->serializer->getParentNode( $target ), $target ];
	}

	$targetData = $target->snData;
	if ( $targetData->currentCloneElement ) {
		// Walk the clone links until we reach an element with no clone
		$firstData = $targetData;
		do {
			$refElement = $targetData->currentCloneElement;
			$target = $refElement->userData;
			$targetData = $target->snData;
		} while ( $targetData->currentCloneElement );
		// Remember the end of the chain on the element we started from,
		// so that the next lookup is a single hop
		$firstData->currentCloneElement = $refElement;
	} elseif ( $targetData->childPElement ) {
		// Divert into the open p-wrapper
		$target = $targetData->childPElement->userData;
	}
	return [ $target, $target ];
}
159
/**
 * Create an mw:p-wrap element under the given node and register it as
 * that node's currently open child p-wrapper.
 *
 * @param SerializerNode $parent The node to wrap under (the wrap base)
 * @param int $sourceStart
 * @return SerializerNode The serializer node of the new p-wrapper
 */
private function insertPWrapper( SerializerNode $parent, $sourceStart ) {
	$wrapperElement = new Element( HTMLData::NS_HTML, 'mw:p-wrap', new PlainAttributes );
	// Inserting downstream is what populates $wrapperElement->userData
	$this->serializer->insertElement( TreeBuilder::UNDER, $parent, $wrapperElement,
		false, $sourceStart, 0 );
	$wrapperNode = $wrapperElement->userData;

	$wrapperData = new RemexMungerData;
	$wrapperData->isPWrapper = true;
	$wrapperData->wrapBaseNode = $parent;
	$wrapperNode->snData = $wrapperData;

	// Later insertions under $parent will be diverted into the wrapper
	$parent->snData->childPElement = $wrapperElement;
	return $wrapperNode;
}
178
/**
 * Insert text, creating a p-wrapper or splitting the tag stack first when
 * the insertion is UNDER a node that requires it.
 *
 * The text in question is the $length bytes of $text starting at offset
 * $start. Text consisting only of tab, LF, FF, CR and space is "blank":
 * blank text does not trigger p-wrapping and does not count towards the
 * parent's nonblank node count.
 *
 * @param int $preposition
 * @param Element|null $refElement
 * @param string $text
 * @param int $start
 * @param int $length
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function characters( $preposition, $refElement, $text, $start, $length,
	$sourceStart, $sourceLength
) {
	$isBlank = strspn( $text, "\t\n\f\r ", $start, $length ) === $length;

	list( $parent, $refNode ) = $this->getParentForInsert( $preposition, $refElement );
	$parentData = $parent->snData;

	if ( $preposition === TreeBuilder::UNDER ) {
		if ( !$isBlank && $parentData->needsPWrapping ) {
			// Bare non-blank text under body/blockquote gets a p-wrapper
			$refNode = $this->insertPWrapper( $refNode, $sourceStart );
			$parentData = $refNode->snData;
		} elseif ( $parentData->isSplittable && !$parentData->ancestorPNode ) {
			// Splittable parent in block mode: split the tag stack and
			// insert under the clones instead
			$refNode = $this->splitTagStack( $refNode, true, $sourceStart );
			$parentData = $refNode->snData;
		}
	}

	if ( !$isBlank ) {
		// The (possibly new) parent now has non-whitespace content
		$parentData->nonblankNodeCount++;
	}
	$this->serializer->characters( $preposition, $refNode, $text, $start,
		$length, $sourceStart, $sourceLength );
}
208
/**
 * Pass a message, prefixed with "[RCM] ", to wfDebug() if tracing was
 * enabled in the constructor. Does nothing otherwise.
 *
 * @param string $msg
 */
private function trace( $msg ) {
	if ( !$this->trace ) {
		return;
	}
	wfDebug( "[RCM] $msg" );
}
214
/**
 * Insert or reparent an element. Create p-wrappers or split the tag stack
 * as necessary.
 *
 * Consider the following insertion locations. The parent may be:
 *
 *   - A: A body or blockquote (!!needsPWrapping)
 *   - B: A p-wrapper (!!isPWrapper)
 *   - C: A descendant of a p-wrapper (!!ancestorPNode)
 *     - CS: With splittable formatting elements in the stack region up to
 *       the p-wrapper
 *     - CU: With one or more unsplittable elements in the stack region up
 *       to the p-wrapper
 *   - D: Not a descendant of a p-wrapper (!ancestorPNode)
 *     - DS: With splittable formatting elements in the stack region up to
 *       the body or blockquote
 *     - DU: With one or more unsplittable elements in the stack region up
 *       to the body or blockquote
 *
 * And consider that we may insert two types of element:
 *   - b: block
 *   - i: inline
 *
 * We handle the insertion as follows:
 *
 *   - A/i: Create a p-wrapper, insert under it
 *   - A/b: Insert as normal
 *   - B/i: Insert as normal
 *   - B/b: Close the p-wrapper, insert under the body/blockquote (wrap
 *     base) instead
 *   - C/i: Insert as normal
 *   - CS/b: Split the tag stack, insert the block under cloned formatting
 *     elements which have the wrap base (the parent of the p-wrap) as
 *     their ultimate parent.
 *   - CU/b: Disable the p-wrap, by reparenting the currently open child
 *     of the p-wrap under the p-wrap's parent. Then insert the block as
 *     normal.
 *   - D/b: Insert as normal
 *   - DS/i: Split the tag stack, creating a new p-wrapper as the ultimate
 *     parent of the formatting elements thus cloned. The parent of the
 *     p-wrapper is the body or blockquote.
 *   - DU/i: Insert as normal
 *
 * Metadata elements (style, script, link, meta) are checked first and are
 * always inserted as normal, whatever the parent is.
 *
 * After the element has been inserted downstream, its munger data is
 * initialised from the data of the node it ended up under: splittability,
 * the ancestor p-wrapper, the wrap base, and whether the element itself
 * needs p-wrapping (body, blockquote and html do).
 *
 * FIXME: fostering ($preposition == BEFORE) is mostly done by inserting as
 * normal, the full algorithm is not followed.
 *
 * @param int $preposition
 * @param Element|SerializerNode|null $refElement
 * @param Element $element
 * @param bool $void
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function insertElement( $preposition, $refElement, Element $element, $void,
	$sourceStart, $sourceLength
) {
	list( $parent, $newRef ) = $this->getParentForInsert( $preposition, $refElement );
	$parentData = $parent->snData;
	$elementName = $element->htmlName;

	$inline = isset( self::$onlyInlineElements[$elementName] );
	$under = $preposition === TreeBuilder::UNDER;

	if ( isset( self::$metadataElements[$elementName] ) ) {
		// The element is a metadata element, that we allow to appear in
		// both inline and block contexts.
		$this->trace( 'insert metadata' );
	} elseif ( $under && $parentData->isPWrapper && !$inline ) {
		// [B/b] The element is non-inline and the parent is a p-wrapper,
		// close the parent and insert into its parent instead
		$this->trace( 'insert B/b' );
		$parent = $this->serializer->getParentNode( $parent );
		$parentData = $parent->snData;
		// Cancel the diversion into the p-wrapper
		$parentData->childPElement = null;
		$newRef = $refElement->userData;
	} elseif ( $under && $parentData->isSplittable
		&& (bool)$parentData->ancestorPNode !== $inline
	) {
		// [CS/b, DS/i] The parent is splittable and the current element is
		// inline in block context, or if the current element is a block
		// under a p-wrapper, split the tag stack.
		$this->trace( $inline ? 'insert DS/i' : 'insert CS/b' );
		$newRef = $this->splitTagStack( $newRef, $inline, $sourceStart );
		$parent = $newRef;
		$parentData = $parent->snData;
	} elseif ( $under && $parentData->needsPWrapping && $inline ) {
		// [A/i] If the element is inline and we are in body/blockquote,
		// we need to create a p-wrapper
		$this->trace( 'insert A/i' );
		$newRef = $this->insertPWrapper( $newRef, $sourceStart );
		$parent = $newRef;
		$parentData = $parent->snData;
	} elseif ( $parentData->ancestorPNode && !$inline ) {
		// [CU/b] If the element is non-inline and (despite attempting to
		// split above) there is still an ancestor p-wrap, disable that
		// p-wrap
		$this->trace( 'insert CU/b' );
		$this->disablePWrapper( $parent, $sourceStart );
	} else {
		// [A/b, B/i, C/i, D/b, DU/i] insert as normal
		$this->trace( 'insert normal' );
	}

	// An element with element children is a non-blank element
	$parentData->nonblankNodeCount++;

	// Insert the element downstream and so initialise its userData
	$this->serializer->insertElement( $preposition, $newRef,
		$element, $void, $sourceStart, $sourceLength );

	// Initialise snData, reusing existing data if the node already has
	// some (as is the case when an existing node is being reparented)
	if ( !$element->userData->snData ) {
		$elementData = $element->userData->snData = new RemexMungerData;
	} else {
		$elementData = $element->userData->snData;
	}
	if ( ( $parentData->isPWrapper || $parentData->isSplittable )
		&& isset( self::$formattingElements[$elementName] )
	) {
		$elementData->isSplittable = true;
	}
	if ( $parentData->isPWrapper ) {
		$elementData->ancestorPNode = $parent;
	} elseif ( $parentData->ancestorPNode ) {
		$elementData->ancestorPNode = $parentData->ancestorPNode;
	}
	if ( $parentData->wrapBaseNode ) {
		$elementData->wrapBaseNode = $parentData->wrapBaseNode;
	} elseif ( $parentData->needsPWrapping ) {
		$elementData->wrapBaseNode = $parent;
	}
	if ( $elementName === 'body'
		|| $elementName === 'blockquote'
		|| $elementName === 'html'
	) {
		$elementData->needsPWrapping = true;
	}
}
356
/**
 * Clone the nodes in a range of the tag stack and return the innermost
 * clone, which becomes the new parent for the pending insertion.
 *
 * The range runs from $parentNode up to, but not including, its ancestor
 * p-wrapper if it has one, or its wrap base otherwise. The clones are
 * attached under a newly created p-wrapper when $inline is true, or
 * directly under the wrap base when it is false. Nodes of the original
 * range which have no non-blank content are removed afterwards.
 *
 * @param SerializerNode $parentNode
 * @param bool $inline
 * @param int $pos The source position
 * @return SerializerNode
 * @throws \Exception If the serializer's root node is reached before the
 *   end of the clone range
 */
private function splitTagStack( SerializerNode $parentNode, $inline, $pos ) {
	$startData = $parentNode->snData;
	$wrapBase = $startData->wrapBaseNode;
	$pWrap = $startData->ancestorPNode;
	// Stop at the enclosing p-wrapper if there is one, else at the wrap base
	$cloneEnd = $pWrap ? $pWrap : $wrapBase;

	// Collect the nodes to clone, innermost first
	$root = $this->serializer->getRootNode();
	$chain = [];
	$emptyNodes = [];
	for ( $cursor = $parentNode; $cursor !== $cloneEnd; $cursor = $above ) {
		$above = $this->serializer->getParentNode( $cursor );
		if ( $above === $root ) {
			throw new \Exception( 'Did not find end of clone range' );
		}
		$chain[] = $cursor;
		if ( $cursor->snData->nonblankNodeCount === 0 ) {
			// This node will be removed below, so its parent loses a
			// non-blank child
			$emptyNodes[] = $cursor;
			$above->snData->nonblankNodeCount--;
		}
	}

	if ( $inline ) {
		$pWrap = $this->insertPWrapper( $wrapBase, $pos );
		$attachPoint = $pWrap;
	} else {
		if ( $pWrap ) {
			// End the p-wrap which was open, cancel the diversion
			$wrapBase->snData->childPElement = null;
		}
		$pWrap = null;
		$attachPoint = $wrapBase;
	}

	// Recreate the chain outermost first, nesting each clone in the last
	foreach ( array_reverse( $chain ) as $oldNode ) {
		$oldData = $oldNode->snData;
		$cloneElement = new Element( $oldNode->namespace, $oldNode->name, $oldNode->attrs );
		$this->serializer->insertElement( TreeBuilder::UNDER, $attachPoint,
			$cloneElement, false, $pos, 0 );
		// Insertions aimed at the old node will now be diverted to the clone
		$oldData->currentCloneElement = $cloneElement;

		$cloneNode = $cloneElement->userData;
		$cloneData = new RemexMungerData;
		$cloneNode->snData = $cloneData;
		if ( $pWrap ) {
			$cloneData->ancestorPNode = $pWrap;
		}
		$cloneData->isSplittable = true;
		$cloneData->wrapBaseNode = $wrapBase;
		$cloneData->isPWrapper = $oldData->isPWrapper;

		$attachPoint->snData->nonblankNodeCount++;

		$attachPoint = $cloneNode;
	}

	foreach ( $emptyNodes as $emptyNode ) {
		// removeNode() takes an Element, so wrap the node in a fake one
		$fakeElement = new Element( $emptyNode->namespace, $emptyNode->name, $emptyNode->attrs );
		$fakeElement->userData = $emptyNode;
		$this->serializer->removeNode( $fakeElement, $pos );
	}
	return $attachPoint;
}
434
/**
 * Find the ancestor of $node (or $node itself) which is a direct child of
 * the p-wrapper, and reparent it under the p-wrapper's parent, so that it
 * ends up after the p-wrapper.
 *
 * Nothing is done if the p-wrapper is not the last child of its parent.
 *
 * @param SerializerNode $node A node which has an ancestor p-wrapper
 * @param int $sourceStart
 */
private function disablePWrapper( SerializerNode $node, $sourceStart ) {
	$pWrapNode = $node->snData->ancestorPNode;
	$wrapParent = $this->serializer->getParentNode( $pWrapNode );
	if ( $this->serializer->getLastChild( $wrapParent ) !== $pWrapNode ) {
		// Fostering or something? Abort!
		return;
	}

	// Climb from $node to the child of the p-wrapper, detaching every node
	// on the way from the p-wrapper
	$victim = $node;
	$victim->snData->ancestorPNode = null;
	while ( ( $above = $this->serializer->getParentNode( $victim ) ) !== $pWrapNode ) {
		$victim = $above;
		$victim->snData->ancestorPNode = null;
	}

	// The serializer reparents via insertElement(), which needs an Element,
	// so make a fake one that points at the existing node
	$victimElement = new Element( $victim->namespace, $victim->name, $victim->attrs );
	$victimElement->userData = $victim;
	$this->serializer->insertElement( TreeBuilder::UNDER, $wrapParent, $victimElement,
		false, $sourceStart, 0 );

	// The p-wrapper has lost a non-blank child
	$pWrapNode->snData->nonblankNodeCount--;

	// Cancel the diversion so that no more elements are inserted under this p-wrap
	$wrapParent->snData->childPElement = null;
}
469
/**
 * End an element. If the element still has an open child p-wrapper, that
 * is ended first (with a source length of zero). The element's link to
 * its serializer node and munger data is cleared afterwards.
 *
 * @param Element $element
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function endTag( Element $element, $sourceStart, $sourceLength ) {
	$openPWrap = $element->userData->snData->childPElement;
	if ( $openPWrap ) {
		$this->endTag( $openPWrap, $sourceStart, 0 );
	}

	$this->serializer->endTag( $element, $sourceStart, $sourceLength );

	$element->userData->snData = null;
	$element->userData = null;
}
479
/**
 * Forward a doctype to the serializer unchanged.
 */
public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
	$this->serializer->doctype( $name, $public, $system, $quirks,
		$sourceStart, $sourceLength );
}
484
/**
 * Insert a comment. The reference node is resolved in the same way as for
 * other insertions, so the comment follows any diversion into a clone or
 * an open p-wrapper, but a comment never creates a p-wrapper itself.
 */
public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
	// Only the reference node of the [ parent, reference ] pair is needed
	$refNode = $this->getParentForInsert( $preposition, $refElement )[1];
	$this->serializer->comment( $preposition, $refNode, $text,
		$sourceStart, $sourceLength );
}
490
/**
 * Forward a parse error to the serializer unchanged.
 */
public function error( $text, $pos ) {
	$this->serializer->error( $text, $pos );
}
494
/**
 * Forward an attribute merge to the serializer unchanged.
 */
public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
	$this->serializer->mergeAttributes( $element, $attrs, $sourceStart );
}
498
/**
 * Forward a node removal to the serializer unchanged.
 */
public function removeNode( Element $element, $sourceStart ) {
	$this->serializer->removeNode( $element, $sourceStart );
}
502
/**
 * Insert $newParent under $element and move all the existing children of
 * $element so that they become children of $newParent.
 *
 * If $element has an open child p-wrapper, the operation is applied to
 * the p-wrapper instead of to $element.
 *
 * @param Element $element
 * @param Element $newParent
 * @param int $sourceStart
 */
public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
	$oldNode = $element->userData;
	$openPWrap = $oldNode->snData->childPElement;
	if ( $openPWrap ) {
		// Reparent under the p-wrapper instead, so that e.g.
		//   <blockquote><mw:p-wrap>...</mw:p-wrap></blockquote>
		// becomes
		//   <blockquote><mw:p-wrap><i>...</i></mw:p-wrap></blockquote>
		//
		// A formatting element must not become the parent of the p-wrap.
		// If we carried on here, the insertElement() of the <i> below
		// would be diverted into the p-wrapper, causing infinite
		// recursion (T178632)
		$this->reparentChildren( $openPWrap, $newParent, $sourceStart );
		return;
	}

	// Detach the children before inserting the new parent
	$movedChildren = $oldNode->children;
	$oldNode->children = [];
	$this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );

	$newParentNode = $newParent->userData;
	$newParentId = $newParentNode->id;
	foreach ( $movedChildren as $child ) {
		// Only object children carry a parentId to update
		if ( is_object( $child ) ) {
			$this->trace( "reparent <{$child->name}>" );
			$child->parentId = $newParentId;
		}
	}
	$newParentNode->children = $movedChildren;
}
532 }