API: Go back to using the good old str_replace() hacks rather than Title methods...
[lhc/web/wiklou.git] / includes / HTMLDiff.php
1 <?php
2 /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * or see http://www.gnu.org/
18 */
19
20 /**
21 * Any element in the DOM tree of an HTML document.
22 */
23 class Node {
24
25 public $parent;
26
27 protected $parentTree;
28
29 public $whiteBefore = false;
30
31 public $whiteAfter = false;
32
33 function __construct($parent) {
34 $this->parent = $parent;
35 }
36
37 public function getParentTree() {
38 if (!isset($this->parentTree)) {
39 if (!is_null($this->parent)) {
40 $this->parentTree = $this->parent->getParentTree();
41 $this->parentTree[] = $this->parent;
42 } else {
43 $this->parentTree = array();
44 }
45 }
46 return $this->parentTree;
47 }
48
49 public function getLastCommonParent(Node $other) {
50 $result = new LastCommonParentResult();
51
52 $myParents = $this->getParentTree();
53 $otherParents = $other->getParentTree();
54
55 $i = 1;
56 $isSame = true;
57 $nbMyParents = count($myParents);
58 $nbOtherParents = count($otherParents);
59 while ($isSame && $i < $nbMyParents && $i < $nbOtherParents) {
60 if (!$myParents[$i]->openingTag === $otherParents[$i]->openingTag) {
61 $isSame = false;
62 } else {
63 // After a while, the index i-1 must be the last common parent
64 $i++;
65 }
66 }
67
68 $result->lastCommonParentDepth = $i - 1;
69 $result->parent = $myParents[$i - 1];
70
71 if (!$isSame || $nbMyParents > $nbOtherParents) {
72 // Not all tags matched, or all tags matched but
73 // there are tags left in this tree
74 $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($myParents[$i]);
75 $result->splittingNeeded = true;
76 } else if ($nbMyParents <= $nbOtherParents) {
77 $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($this);
78 }
79 return $result;
80 }
81
82 public function setParent($parent) {
83 $this->parent = $parent;
84 unset($this->parentTree);
85 }
86
87 public function inPre() {
88 $tree = $this->getParentTree();
89 foreach ($tree as &$ancestor) {
90 if ($ancestor->isPre()) {
91 return true;
92 }
93 }
94 return false;
95 }
96 }
97
98 /**
99 * Node that can contain other nodes. Represents an HTML tag.
100 */
101 class TagNode extends Node {
102
103 public $children = array();
104
105 public $qName;
106
107 public $attributes = array();
108
109 public $openingTag;
110
111 function __construct($parent, $qName, /*array*/ $attributes) {
112 parent::__construct($parent);
113 $this->qName = strtolower($qName);
114 foreach($attributes as $key => &$value){
115 $this->attributes[strtolower($key)] = $value;
116 }
117 return $this->openingTag = Xml::openElement($this->qName, $this->attributes);
118 }
119
120 public function addChildAbsolute(Node $node, $index) {
121 array_splice($this->children, $index, 0, array($node));
122 }
123
124 public function getIndexOf(Node $child) {
125 // don't trust array_search with objects
126 foreach ($this->children as $key => &$value){
127 if ($value === $child) {
128 return $key;
129 }
130 }
131 return null;
132 }
133
134 public function getNbChildren() {
135 return count($this->children);
136 }
137
138 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
139 $nodes = array();
140
141 $allDeleted = false;
142 $somethingDeleted = false;
143 $hasNonDeletedDescendant = false;
144
145 if (empty($this->children)) {
146 return $nodes;
147 }
148
149 foreach ($this->children as &$child) {
150 $allDeleted_local = false;
151 $somethingDeleted_local = false;
152 $childrenChildren = $child->getMinimalDeletedSet($id, $allDeleted_local, $somethingDeleted_local);
153 if ($somethingDeleted_local) {
154 $nodes = array_merge($nodes, $childrenChildren);
155 $somethingDeleted = true;
156 }
157 if (!$allDeleted_local) {
158 $hasNonDeletedDescendant = true;
159 }
160 }
161 if (!$hasNonDeletedDescendant) {
162 $nodes = array($this);
163 $allDeleted = true;
164 }
165 return $nodes;
166 }
167
168 public function splitUntil(TagNode $parent, Node $split, $includeLeft) {
169 $splitOccured = false;
170 if ($parent !== $this) {
171 $part1 = new TagNode(null, $this->qName, $this->attributes);
172 $part2 = new TagNode(null, $this->qName, $this->attributes);
173 $part1->setParent($this->parent);
174 $part2->setParent($this->parent);
175
176 $onSplit = false;
177 $pastSplit = false;
178 foreach ($this->children as &$child)
179 {
180 if ($child === $split) {
181 $onSplit = true;
182 }
183 if(!$pastSplit || ($onSplit && $includeLeft)) {
184 $child->setParent($part1);
185 $part1->children[] = $child;
186 } else {
187 $child->setParent($part2);
188 $part2->children[] = $child;
189 }
190 if ($onSplit) {
191 $onSplit = false;
192 $pastSplit = true;
193 }
194 }
195 $myindexinparent = $this->parent->getIndexOf($this);
196 if (!empty($part1->children)) {
197 $this->parent->addChildAbsolute($part1, $myindexinparent);
198 }
199 if (!empty($part2->children)) {
200 $this->parent->addChildAbsolute($part2, $myindexinparent);
201 }
202 if (!empty($part1->children) && !empty($part2->children)) {
203 $splitOccured = true;
204 }
205
206 $this->parent->removeChild($myindexinparent);
207
208 if ($includeLeft) {
209 $this->parent->splitUntil($parent, $part1, $includeLeft);
210 } else {
211 $this->parent->splitUntil($parent, $part2, $includeLeft);
212 }
213 }
214 return $splitOccured;
215
216 }
217
218 private function removeChild($index) {
219 unset($this->children[$index]);
220 $this->children = array_values($this->children);
221 }
222
223 public static $blocks = array('html', 'body','p','blockquote', 'h1',
224 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li', 'table',
225 'tbody', 'tr', 'td', 'th', 'br');
226
227 public function copyTree() {
228 $newThis = new TagNode(null, $this->qName, $this->attributes);
229 $newThis->whiteBefore = $this->whiteBefore;
230 $newThis->whiteAfter = $this->whiteAfter;
231 foreach ($this->children as &$child) {
232 $newChild = $child->copyTree();
233 $newChild->setParent($newThis);
234 $newThis->children[] = $newChild;
235 }
236 return $newThis;
237 }
238
239 public function getMatchRatio(TagNode $other) {
240 $txtComp = new TextOnlyComparator($other);
241 return $txtComp->getMatchRatio(new TextOnlyComparator($this));
242 }
243
244 public function expandWhiteSpace() {
245 $shift = 0;
246 $spaceAdded = false;
247
248 $nbOriginalChildren = $this->getNbChildren();
249 for ($i = 0; $i < $nbOriginalChildren; ++$i) {
250 $child = $this->children[$i + $shift];
251
252 if ($child instanceof TagNode) {
253 if (!$child->isPre()) {
254 $child->expandWhiteSpace();
255 }
256 }
257 if (!$spaceAdded && $child->whiteBefore) {
258 $ws = new WhiteSpaceNode(null, ' ', $child->getLeftMostChild());
259 $ws->setParent($this);
260 $this->addChildAbsolute($ws,$i + ($shift++));
261 }
262 if ($child->whiteAfter) {
263 $ws = new WhiteSpaceNode(null, ' ', $child->getRightMostChild());
264 $ws->setParent($this);
265 $this->addChildAbsolute($ws,$i + 1 + ($shift++));
266 $spaceAdded = true;
267 } else {
268 $spaceAdded = false;
269 }
270
271 }
272 }
273
274 public function getLeftMostChild() {
275 if (empty($this->children)) {
276 return $this;
277 }
278 return $this->children[0]->getLeftMostChild();
279 }
280
281 public function getRightMostChild() {
282 if (empty($this->children)) {
283 return $this;
284 }
285 return $this->children[$this->getNbChildren() - 1]->getRightMostChild();
286 }
287
288 public function isPre() {
289 return 0 == strcasecmp($this->qName,'pre');
290 }
291
292 public static function toDiffLine(TagNode $node) {
293 return $node->openingTag;
294 }
295 }
296
297 /**
298 * Represents a piece of text in the HTML file.
299 */
300 class TextNode extends Node {
301
302 public $text;
303
304 public $modification;
305
306 function __construct($parent, $text) {
307 parent::__construct($parent);
308 $this->modification = new Modification(Modification::NONE);
309 $this->text = $text;
310 }
311
312 public function copyTree() {
313 $clone = clone $this;
314 $clone->setParent(null);
315 return $clone;
316 }
317
318 public function getLeftMostChild() {
319 return $this;
320 }
321
322 public function getRightMostChild() {
323 return $this;
324 }
325
326 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
327 if ($this->modification->type == Modification::REMOVED
328 && $this->modification->id == $id){
329 $somethingDeleted = true;
330 $allDeleted = true;
331 return array($this);
332 }
333 return array();
334 }
335
336 public function isSameText($other) {
337 if (is_null($other) || ! $other instanceof TextNode) {
338 return false;
339 }
340 return str_replace('\n', ' ',$this->text) === str_replace('\n', ' ',$other->text);
341 }
342
343 public static function toDiffLine(TextNode $node) {
344 return str_replace('\n', ' ',$node->text);
345 }
346 }
347
348 class WhiteSpaceNode extends TextNode {
349
350 function __construct($parent, $s, Node $like = null) {
351 parent::__construct($parent, $s);
352 if(!is_null($like) && $like instanceof TextNode) {
353 $newModification = clone $like->modification;
354 $newModification->firstOfID = false;
355 $this->modification = $newModification;
356 }
357 }
358 }
359
360 /**
361 * Represents the root of a HTML document.
362 */
363 class BodyNode extends TagNode {
364
365 function __construct() {
366 parent::__construct(null, 'body', array());
367 }
368
369 public function copyTree() {
370 $newThis = new BodyNode();
371 foreach ($this->children as &$child) {
372 $newChild = $child->copyTree();
373 $newChild->setParent($newThis);
374 $newThis->children[] = $newChild;
375 }
376 return $newThis;
377 }
378
379 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
380 $nodes = array();
381 foreach ($this->children as &$child) {
382 $childrenChildren = $child->getMinimalDeletedSet($id,
383 $allDeleted, $somethingDeleted);
384 $nodes = array_merge($nodes, $childrenChildren);
385 }
386 return $nodes;
387 }
388
389 }
390
391 /**
392 * Represents an image in HTML. Even though images do not contain any text they
393 * are independent visible objects on the page. They are logically a TextNode.
394 */
395 class ImageNode extends TextNode {
396
397 public $attributes;
398
399 function __construct(TagNode $parent, /*array*/ $attrs) {
400 if(!array_key_exists('src', $attrs)) {
401 //wfDebug('Image without a source:');
402 //foreach ($attrs as $key => &$value) {
403 //wfDebug("$key = $value");
404 //}
405 parent::__construct($parent, '<img></img>');
406 }else{
407 parent::__construct($parent, '<img>' . strtolower($attrs['src']) . '</img>');
408 }
409 $this->attributes = $attrs;
410 }
411
412 public function isSameText($other) {
413 if (is_null($other) || ! $other instanceof ImageNode) {
414 return false;
415 }
416 return $this->text === $other->text;
417 }
418
419 }
420
421 class DummyNode extends Node {
422
423 function __construct() {
424 // no op
425 }
426
427 }
428
429 /**
430 * When detecting the last common parent of two nodes, all results are stored as
431 * a LastCommonParentResult.
432 */
433 class LastCommonParentResult {
434
435 // Parent
436 public $parent;
437
438 // Splitting
439 public $splittingNeeded = false;
440
441 // Depth
442 public $lastCommonParentDepth = -1;
443
444 // Index
445 public $indexInLastCommonParent = -1;
446 }
447
448 class Modification{
449
450 const NONE = 1;
451 const REMOVED = 2;
452 const ADDED = 4;
453 const CHANGED = 8;
454
455 public $type;
456
457 public $id = -1;
458
459 public $firstOfID = false;
460
461 public $changes;
462
463 function __construct($type) {
464 $this->type = $type;
465 }
466
467 public static function typeToString($type) {
468 switch($type) {
469 case self::NONE: return 'none';
470 case self::REMOVED: return 'removed';
471 case self::ADDED: return 'added';
472 case self::CHANGED: return 'changed';
473 }
474 }
475 }
476
477 class DomTreeBuilder {
478
479 public $textNodes = array();
480
481 public $bodyNode;
482
483 private $currentParent;
484
485 private $newWord = '';
486
487 protected $bodyStarted = false;
488
489 protected $bodyEnded = false;
490
491 private $whiteSpaceBeforeThis = false;
492
493 private $lastSibling;
494
495 private $notInPre = true;
496
497 function __construct() {
498 $this->bodyNode = $this->currentParent = new BodyNode();
499 $this->lastSibling = new DummyNode();
500 }
501
502 /**
503 * Must be called manually
504 */
505 public function endDocument() {
506 $this->endWord();
507 //wfDebug(count($this->textNodes) . ' text nodes in document.');
508 }
509
510 public function startElement($parser, $name, /*array*/ $attributes) {
511 if (strcasecmp($name, 'body') != 0) {
512 //wfDebug("Starting $name node.");
513 $this->endWord();
514
515 $newNode = new TagNode($this->currentParent, $name, $attributes);
516 $this->currentParent->children[] = $newNode;
517 $this->currentParent = $newNode;
518 $this->lastSibling = new DummyNode();
519 if ($this->whiteSpaceBeforeThis && !in_array(strtolower($this->currentParent->qName),TagNode::$blocks)) {
520 $this->currentParent->whiteBefore = true;
521 }
522 $this->whiteSpaceBeforeThis = false;
523 if(strcasecmp($name, 'pre') == 0) {
524 $this->notInPre = false;
525 }
526 }
527 }
528
529 public function endElement($parser, $name) {
530 if(strcasecmp($name, 'body') != 0) {
531 //wfDebug("Ending $name node.");
532 if (0 == strcasecmp($name,'img')) {
533 // Insert a dummy leaf for the image
534 $img = new ImageNode($this->currentParent, $this->currentParent->attributes);
535 $this->currentParent->children[] = $img;
536 $img->whiteBefore = $this->whiteSpaceBeforeThis;
537 $this->lastSibling = $img;
538 $this->textNodes[] = $img;
539 }
540 $this->endWord();
541 if (!in_array(strtolower($this->currentParent->qName),TagNode::$blocks)) {
542 $this->lastSibling = $this->currentParent;
543 } else {
544 $this->lastSibling = new DummyNode();
545 }
546 $this->currentParent = $this->currentParent->parent;
547 $this->whiteSpaceBeforeThis = false;
548 if (!$this->notInPre && strcasecmp($name, 'pre') == 0) {
549 $this->notInPre = true;
550 }
551 } else {
552 $this->endDocument();
553 }
554 }
555
556 const regex = '/([\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1})/';
557 const whitespace = '/^[\s]{1}$/';
558 const delimiter = '/^[\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1}$/';
559
560 public function characters($parser, $data) {
561 $matches = preg_split(self::regex, $data, -1, PREG_SPLIT_DELIM_CAPTURE);
562
563 foreach($matches as &$word) {
564 if (preg_match(self::whitespace, $word) && $this->notInPre) {
565 $this->endWord();
566 $this->lastSibling->whiteAfter = true;
567 $this->whiteSpaceBeforeThis = true;
568 } else if (preg_match(self::delimiter, $word)) {
569 $this->endWord();
570 $textNode = new TextNode($this->currentParent, $word);
571 $this->currentParent->children[] = $textNode;
572 $textNode->whiteBefore = $this->whiteSpaceBeforeThis;
573 $this->whiteSpaceBeforeThis = false;
574 $this->lastSibling = $textNode;
575 $this->textNodes[] = $textNode;
576 } else {
577 $this->newWord .= $word;
578 }
579 }
580 }
581
582 private function endWord() {
583 if ($this->newWord !== '') {
584 $node = new TextNode($this->currentParent, $this->newWord);
585 $this->currentParent->children[] = $node;
586 $node->whiteBefore = $this->whiteSpaceBeforeThis;
587 $this->whiteSpaceBeforeThis = false;
588 $this->lastSibling = $node;
589 $this->textNodes[] = $node;
590 $this->newWord = "";
591 }
592 }
593
594 public function getDiffLines() {
595 return array_map(array('TextNode','toDiffLine'), $this->textNodes);
596 }
597 }
598
599 class TextNodeDiffer {
600
601 private $textNodes;
602 public $bodyNode;
603
604 private $oldTextNodes;
605 private $oldBodyNode;
606
607 private $newID = 0;
608
609 private $changedID = 0;
610
611 private $changedIDUsed = false;
612
613 // used to remove the whitespace between a red and green block
614 private $whiteAfterLastChangedPart = false;
615
616 private $deletedID = 0;
617
618 function __construct(DomTreeBuilder $tree, DomTreeBuilder $oldTree) {
619 $this->textNodes = $tree->textNodes;
620 $this->bodyNode = $tree->bodyNode;
621 $this->oldTextNodes = $oldTree->textNodes;
622 $this->oldBodyNode = $oldTree->bodyNode;
623 }
624
625 public function markAsNew($start, $end) {
626 if ($end <= $start) {
627 return;
628 }
629
630 if ($this->whiteAfterLastChangedPart) {
631 $this->textNodes[$start]->whiteBefore = false;
632 }
633
634 for ($i = $start; $i < $end; ++$i) {
635 $mod = new Modification(Modification::ADDED);
636 $mod->id = $this->newID;
637 $this->textNodes[$i]->modification = $mod;
638 }
639 if ($start < $end) {
640 $this->textNodes[$start]->modification->firstOfID = true;
641 }
642 ++$this->newID;
643 }
644
645 public function handlePossibleChangedPart($leftstart, $leftend, $rightstart, $rightend) {
646 $i = $rightstart;
647 $j = $leftstart;
648
649 if ($this->changedIDUsed) {
650 ++$this->changedID;
651 $this->changedIDUsed = false;
652 }
653
654 $changes;
655 while ($i < $rightend) {
656 $acthis = new AncestorComparator($this->textNodes[$i]->getParentTree());
657 $acother = new AncestorComparator($this->oldTextNodes[$j]->getParentTree());
658 $result = $acthis->getResult($acother);
659 unset($acthis, $acother);
660
661 if ($result->changed) {
662 $mod = new Modification(Modification::CHANGED);
663
664 if (!$this->changedIDUsed) {
665 $mod->firstOfID = true;
666 } else if (!is_null($result->changes) && $result->changes !== $this->changes) {
667 ++$this->changedID;
668 $mod->firstOfID = true;
669 }
670
671 $mod->changes = $result->changes;
672 $mod->id = $this->changedID;
673
674 $this->textNodes[$i]->modification = $mod;
675 $this->changes = $result->changes;
676 $this->changedIDUsed = true;
677 } else if ($this->changedIDUsed) {
678 ++$this->changedID;
679 $this->changedIDUsed = false;
680 }
681 ++$i;
682 ++$j;
683 }
684 }
685
686 public function markAsDeleted($start, $end, $before) {
687
688 if ($end <= $start) {
689 return;
690 }
691
692 if ($before > 0 && $this->textNodes[$before - 1]->whiteAfter) {
693 $this->whiteAfterLastChangedPart = true;
694 } else {
695 $this->whiteAfterLastChangedPart = false;
696 }
697
698 for ($i = $start; $i < $end; ++$i) {
699 $mod = new Modification(Modification::REMOVED);
700 $mod->id = $this->deletedID;
701
702 // oldTextNodes is used here because we're going to move its deleted
703 // elements to this tree!
704 $this->oldTextNodes[$i]->modification = $mod;
705 }
706 $this->oldTextNodes[$start]->modification->firstOfID = true;
707
708 $root = $this->oldTextNodes[$start]->getLastCommonParent($this->oldTextNodes[$end-1])->parent;
709
710 $junk1 = $junk2 = null;
711 $deletedNodes = $root->getMinimalDeletedSet($this->deletedID, $junk1, $junk2);
712
713 //wfDebug("Minimal set of deleted nodes of size " . count($deletedNodes));
714
715 // Set prevLeaf to the leaf after which the old HTML needs to be
716 // inserted
717 if ($before > 0) {
718 $prevLeaf = $this->textNodes[$before - 1];
719 }
720 // Set nextLeaf to the leaf before which the old HTML needs to be
721 // inserted
722 if ($before < count($this->textNodes)) {
723 $nextLeaf = $this->textNodes[$before];
724 }
725
726 while (count($deletedNodes) > 0) {
727 if (isset($prevLeaf)) {
728 $prevResult = $prevLeaf->getLastCommonParent($deletedNodes[0]);
729 } else {
730 $prevResult = new LastCommonParentResult();
731 $prevResult->parent = $this->bodyNode;
732 $prevResult->indexInLastCommonParent = 0;
733 }
734 if (isset($nextleaf)) {
735 $nextResult = $nextLeaf->getLastCommonParent($deletedNodes[count($deletedNodes) - 1]);
736 } else {
737 $nextResult = new LastCommonParentResult();
738 $nextResult->parent = $this->bodyNode;
739 $nextResult->indexInLastCommonParent = $this->bodyNode->getNbChildren();
740 }
741
742 if ($prevResult->lastCommonParentDepth == $nextResult->lastCommonParentDepth) {
743 // We need some metric to choose which way to add-...
744 if ($deletedNodes[0]->parent === $deletedNodes[count($deletedNodes) - 1]->parent
745 && $prevResult->parent === $nextResult->parent) {
746 // The difference is not in the parent
747 $prevResult->lastCommonParentDepth = $prevResult->lastCommonParentDepth + 1;
748 } else {
749 // The difference is in the parent, so compare them
750 // now THIS is tricky
751 $distancePrev = $deletedNodes[0]->parent->getMatchRatio($prevResult->parent);
752 $distanceNext = $deletedNodes[count($deletedNodes) - 1]->parent->getMatchRatio($nextResult->parent);
753
754 if ($distancePrev <= $distanceNext) {
755 $prevResult->lastCommonParentDepth = $prevResult->lastCommonParentDepth + 1;
756 } else {
757 $nextResult->lastCommonParentDepth = $nextResult->lastCommonParentDepth + 1;
758 }
759 }
760
761 }
762
763 if ($prevResult->lastCommonParentDepth > $nextResult->lastCommonParentDepth) {
764 // Inserting at the front
765 if ($prevResult->splittingNeeded) {
766 $prevLeaf->parent->splitUntil($prevResult->parent, $prevLeaf, true);
767 }
768 $prevLeaf = $deletedNodes[0]->copyTree();
769 unset($deletedNodes[0]);
770 $deletedNodes = array_values($deletedNodes);
771 $prevLeaf->setParent($prevResult->parent);
772 $prevResult->parent->addChildAbsolute($prevLeaf,$prevResult->indexInLastCommonParent + 1);
773 } else if ($prevResult->lastCommonParentDepth < $nextResult->lastCommonParentDepth) {
774 // Inserting at the back
775 if ($nextResult->splittingNeeded) {
776 $splitOccured = $nextLeaf->parent->splitUntil($nextResult->parent, $nextLeaf, false);
777 if ($splitOccured) {
778 // The place where to insert is shifted one place to the
779 // right
780 $nextResult->indexInLastCommonParent = $nextResult->indexInLastCommonParent + 1;
781 }
782 }
783 $nextLeaf = $deletedNodes[count(deletedNodes) - 1]->copyTree();
784 unset($deletedNodes[count(deletedNodes) - 1]);
785 $deletedNodes = array_values($deletedNodes);
786 $nextLeaf->setParent($nextResult->parent);
787 $nextResult->parent->addChildAbsolute($nextLeaf,$nextResult->indexInLastCommonParent);
788 } else {
789 throw new Exception("Uh?");
790 }
791 }
792 ++$this->deletedID;
793 }
794
795 public function expandWhiteSpace() {
796 $this->bodyNode->expandWhiteSpace();
797 }
798
799 public function lengthNew(){
800 return count($this->textNodes);
801 }
802
803 public function lengthOld(){
804 return count($this->oldTextNodes);
805 }
806 }
807
808 class HTMLDiffer {
809
810 private $output;
811
812 function __construct($output) {
813 $this->output = $output;
814 }
815
816 function htmlDiff($from, $to) {
817 wfProfileIn( __METHOD__ );
818 // Create an XML parser
819 $xml_parser = xml_parser_create('');
820
821 $domfrom = new DomTreeBuilder();
822
823 // Set the functions to handle opening and closing tags
824 xml_set_element_handler($xml_parser, array($domfrom, "startElement"), array($domfrom, "endElement"));
825
826 // Set the function to handle blocks of character data
827 xml_set_character_data_handler($xml_parser, array($domfrom, "characters"));
828
829 //wfDebug('Parsing '.strlen($from)." characters worth of HTML\n");
830 if (!xml_parse($xml_parser, '<?xml version="1.0" encoding="UTF-8"?>'.Sanitizer::hackDocType().'<body>', false)
831 || !xml_parse($xml_parser, $from, false)
832 || !xml_parse($xml_parser, '</body>', true)){
833 $error = xml_error_string(xml_get_error_code($xml_parser));
834 $line = xml_get_current_line_number($xml_parser);
835 wfDebug("XML error: $error at line $line\n");
836 }
837 xml_parser_free($xml_parser);
838 unset($from);
839
840 $xml_parser = xml_parser_create('');
841
842 $domto = new DomTreeBuilder();
843
844 // Set the functions to handle opening and closing tags
845 xml_set_element_handler($xml_parser, array($domto, "startElement"), array($domto, "endElement"));
846
847 // Set the function to handle blocks of character data
848 xml_set_character_data_handler($xml_parser, array($domto, "characters"));
849
850 //wfDebug('Parsing '.strlen($to)." characters worth of HTML\n");
851 if (!xml_parse($xml_parser, '<?xml version="1.0" encoding="UTF-8"?>'.Sanitizer::hackDocType().'<body>', false)
852 || !xml_parse($xml_parser, $to, false)
853 || !xml_parse($xml_parser, '</body>', true)){
854 $error = xml_error_string(xml_get_error_code($xml_parser));
855 $line = xml_get_current_line_number($xml_parser);
856 wfDebug("XML error: $error at line $line\n");
857 }
858 xml_parser_free($xml_parser);
859 unset($to);
860
861 $diffengine = new WikiDiff3();
862 $differences = $this->preProcess($diffengine->diff_range($domfrom->getDiffLines(), $domto->getDiffLines()));
863 unset($xml_parser, $diffengine);
864
865 $domdiffer = new TextNodeDiffer($domto, $domfrom);
866
867 $currentIndexLeft = 0;
868 $currentIndexRight = 0;
869 foreach ($differences as &$d) {
870 if ($d->leftstart > $currentIndexLeft) {
871 $domdiffer->handlePossibleChangedPart($currentIndexLeft, $d->leftstart,
872 $currentIndexRight, $d->rightstart);
873 }
874 if ($d->leftlength > 0) {
875 $domdiffer->markAsDeleted($d->leftstart, $d->leftend, $d->rightstart);
876 }
877 $domdiffer->markAsNew($d->rightstart, $d->rightend);
878
879 $currentIndexLeft = $d->leftend;
880 $currentIndexRight = $d->rightend;
881 }
882 $oldLength = $domdiffer->lengthOld();
883 if ($currentIndexLeft < $oldLength) {
884 $domdiffer->handlePossibleChangedPart($currentIndexLeft, $oldLength, $currentIndexRight, $domdiffer->lengthNew());
885 }
886 $domdiffer->expandWhiteSpace();
887 $output = new HTMLOutput('htmldiff', $this->output);
888 $output->parse($domdiffer->bodyNode);
889 wfProfileOut( __METHOD__ );
890 }
891
892 private function preProcess(/*array*/ $differences) {
893 $newRanges = array();
894
895 $nbDifferences = count($differences);
896 for ($i = 0; $i < $nbDifferences; ++$i) {
897 $leftStart = $differences[$i]->leftstart;
898 $leftEnd = $differences[$i]->leftend;
899 $rightStart = $differences[$i]->rightstart;
900 $rightEnd = $differences[$i]->rightend;
901
902 $leftLength = $leftEnd - $leftStart;
903 $rightLength = $rightEnd - $rightStart;
904
905 while ($i + 1 < $nbDifferences && self::score($leftLength,
906 $differences[$i + 1]->leftlength,
907 $rightLength,
908 $differences[$i + 1]->rightlength)
909 > ($differences[$i + 1]->leftstart - $leftEnd)) {
910 $leftEnd = $differences[$i + 1]->leftend;
911 $rightEnd = $differences[$i + 1]->rightend;
912 $leftLength = $leftEnd - $leftStart;
913 $rightLength = $rightEnd - $rightStart;
914 ++$i;
915 }
916 $newRanges[] = new RangeDifference($leftStart, $leftEnd, $rightStart, $rightEnd);
917 }
918 return $newRanges;
919 }
920
921 /**
922 * Heuristic to merge differences for readability.
923 */
924 public static function score($ll, $nll, $rl, $nrl) {
925 if (($ll == 0 && $nll == 0)
926 || ($rl == 0 && $nrl == 0)) {
927 return 0;
928 }
929 $numbers = array($ll, $nll, $rl, $nrl);
930 $d = 0;
931 foreach ($numbers as &$number) {
932 while ($number > 3) {
933 $d += 3;
934 $number -= 3;
935 $number *= 0.5;
936 }
937 $d += $number;
938
939 }
940 return $d / (1.5 * count($numbers));
941 }
942
943 }
944
945 class TextOnlyComparator {
946
947 public $leafs = array();
948
949 function _construct(TagNode $tree) {
950 $this->addRecursive($tree);
951 $this->leafs = array_map(array('TextNode','toDiffLine'), $this->leafs);
952 }
953
954 private function addRecursive(TagNode $tree) {
955 foreach ($tree->children as &$child) {
956 if ($child instanceof TagNode) {
957 $this->addRecursive($child);
958 } else if ($child instanceof TextNode) {
959 $this->leafs[] = $node;
960 }
961 }
962 }
963
964 public function getMatchRatio(TextOnlyComparator $other) {
965 $nbOthers = count($other->leafs);
966 $nbThis = count($this->leafs);
967 if($nbOthers == 0 || $nbThis == 0){
968 return -log(0);
969 }
970
971 $diffengine = new WikiDiff3(25000, 1.35);
972 $diffengine->diff($this->leafs, $other->leafs);
973
974 $lcsLength = $diffengine->getLcsLength();
975
976 $distanceThis = $nbThis-$lcsLength;
977
978 return (2.0 - $lcsLength/$nbOthers - $lcsLength/$nbThis) / 2.0;
979 }
980 }
981
982 class AncestorComparatorResult {
983
984 public $changed = false;
985
986 public $changes = "";
987 }
988
989 /**
990 * A comparator used when calculating the difference in ancestry of two Nodes.
991 */
992 class AncestorComparator {
993
994 public $ancestors;
995 public $ancestorsText;
996
997 function __construct(/*array*/ $ancestors) {
998 $this->ancestors = $ancestors;
999 $this->ancestorsText = array_map(array('TagNode','toDiffLine'), $ancestors);
1000 }
1001
1002 public $compareTxt = "";
1003
1004 public function getResult(AncestorComparator $other) {
1005 $result = new AncestorComparatorResult();
1006
1007 $diffengine = new WikiDiff3(10000, 1.35);
1008 $differences = $diffengine->diff_range($other->ancestorsText,$this->ancestorsText);
1009
1010 if (count($differences) == 0){
1011 return $result;
1012 }
1013 $changeTxt = new ChangeTextGenerator($this, $other);
1014
1015 $result->changed = true;
1016 $result->changes = $changeTxt->getChanged($differences)->toString();
1017
1018 return $result;
1019 }
1020 }
1021
1022 class ChangeTextGenerator {
1023
1024 private $ancestorComparator;
1025 private $other;
1026
1027 private $factory;
1028
1029 function __construct(AncestorComparator $ancestorComparator, AncestorComparator $other) {
1030 $this->ancestorComparator = $ancestorComparator;
1031 $this->other = $other;
1032 $this->factory = new TagToStringFactory();
1033 }
1034
1035 public function getChanged(/*array*/ $differences) {
1036 $txt = new ChangeText;
1037 $rootlistopened = false;
1038 if (count($differences) > 1) {
1039 $txt->addHtml('<ul class="changelist">');
1040 $rootlistopened = true;
1041 }
1042 $nbDifferences = count($differences);
1043 for ($j = 0; $j < $nbDifferences; ++$j) {
1044 $d = $differences[$j];
1045 $lvl1listopened = false;
1046 if ($rootlistopened) {
1047 $txt->addHtml('<li>');
1048 }
1049 if ($d->leftlength + $d->rightlength > 1) {
1050 $txt->addHtml('<ul class="changelist">');
1051 $lvl1listopened = true;
1052 }
1053 // left are the old ones
1054 for ($i = $d->leftstart; $i < $d->leftend; ++$i) {
1055 if ($lvl1listopened){
1056 $txt->addHtml('<li>');
1057 }
1058 // add a bullet for a old tag
1059 $this->addTagOld($txt, $this->other->ancestors[$i]);
1060 if ($lvl1listopened){
1061 $txt->addHtml('</li>');
1062 }
1063 }
1064 // right are the new ones
1065 for ($i = $d->rightstart; $i < $d->rightend; ++$i) {
1066 if ($lvl1listopened){
1067 $txt->addHtml('<li>');
1068 }
1069 // add a bullet for a new tag
1070 $this->addTagNew($txt, $this->ancestorComparator->ancestors[$i]);
1071
1072 if ($lvl1listopened){
1073 $txt->addHtml('</li>');
1074 }
1075 }
1076 if ($lvl1listopened) {
1077 $txt->addHtml('</ul>');
1078 }
1079 if ($rootlistopened) {
1080 $txt->addHtml('</li>');
1081 }
1082 }
1083 if ($rootlistopened) {
1084 $txt->addHtml('</ul>');
1085 }
1086 return $txt;
1087 }
1088
1089 private function addTagOld(ChangeText $txt, TagNode $ancestor) {
1090 $this->factory->create($ancestor)->getRemovedDescription($txt);
1091 }
1092
1093 private function addTagNew(ChangeText $txt, TagNode $ancestor) {
1094 $this->factory->create($ancestor)->getAddedDescription($txt);
1095 }
1096 }
1097
1098 class ChangeText {
1099
1100 private $txt = "";
1101
1102 public function addHtml($s) {
1103 $this->txt .= $s;
1104 }
1105
1106 public function toString() {
1107 return $this->txt;
1108 }
1109 }
1110
1111 class TagToStringFactory {
1112
1113 private static $containerTags = array('html', 'body', 'p', 'blockquote',
1114 'h1', 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li',
1115 'table', 'tbody', 'tr', 'td', 'th', 'br', 'hr', 'code', 'dl',
1116 'dt', 'dd', 'input', 'form', 'img', 'span', 'a');
1117
1118 private static $styleTags = array('i', 'b', 'strong', 'em', 'font',
1119 'big', 'del', 'tt', 'sub', 'sup', 'strike');
1120
1121 const MOVED = 1;
1122 const STYLE = 2;
1123 const UNKNOWN = 4;
1124
1125 public function create(TagNode $node) {
1126 $sem = $this->getChangeSemantic($node->qName);
1127 if (strcasecmp($node->qName,'a') == 0) {
1128 return new AnchorToString($node, $sem);
1129 }
1130 if (strcasecmp($node->qName,'img') == 0) {
1131 return new NoContentTagToString($node, $sem);
1132 }
1133 return new TagToString($node, $sem);
1134 }
1135
1136 protected function getChangeSemantic($qname) {
1137 if (in_array(strtolower($qname),self::$containerTags)) {
1138 return self::MOVED;
1139 }
1140 if (in_array(strtolower($qname),self::$styleTags)) {
1141 return self::STYLE;
1142 }
1143 return self::UNKNOWN;
1144 }
1145 }
1146
1147 class TagToString {
1148
1149 protected $node;
1150
1151 protected $sem;
1152
1153 function __construct(TagNode $node, $sem) {
1154 $this->node = $node;
1155 $this->sem = $sem;
1156 }
1157
1158 public function getRemovedDescription(ChangeText $txt) {
1159 $tagDescription = wfMsgExt('diff-' . $this->node->qName, 'parseinline' );
1160 if(!$tagDescription){
1161 $tagDescription = $this->node->qName;
1162 }
1163 if ($this->sem == TagToStringFactory::MOVED) {
1164 $txt->addHtml( wfMsgExt( 'diff-movedoutof', 'parseinline', $tagDescription ) );
1165 } else if ($this->sem == TagToStringFactory::STYLE) {
1166 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-styleremoved' , 'parseinline' ) );
1167 } else {
1168 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-removed' , 'parseinline' ) );
1169 }
1170 $this->addAttributes($txt, $this->node->attributes);
1171 $txt->addHtml('.');
1172 }
1173
1174 public function getAddedDescription(ChangeText $txt) {
1175 $tagDescription = wfMsgExt('diff-' . $this->node->qName, 'parseinline' );
1176 if(!$tagDescription){
1177 $tagDescription = $this->node->qName;
1178 }
1179 if ($this->sem == TagToStringFactory::MOVED) {
1180 $txt->addHtml( wfMsgExt( 'diff-movedto' , 'parseinline', $tagDescription) );
1181 } else if ($this->sem == TagToStringFactory::STYLE) {
1182 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-styleadded', 'parseinline' ) );
1183 } else {
1184 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-added', 'parseinline' ) );
1185 }
1186 $this->addAttributes($txt, $this->node->attributes);
1187 $txt->addHtml('.');
1188 }
1189
1190 protected function addAttributes(ChangeText $txt, array $attributes) {
1191 if (count($attributes) < 1) {
1192 return;
1193 }
1194 $firstOne = true;
1195 $nbAttributes_min_1 = count($attributes)-1;
1196 $keys = array_keys($attributes);
1197 for ($i=0;$i<$nbAttributes_min_1;$i++) {
1198 $key = $keys[$i];
1199 $attr = $attributes[$key];
1200 if($firstOne) {
1201 $firstOne = false;
1202 $txt->addHtml( wfMsgExt('diff-with', 'escapenoentities', $this->translateArgument($key), htmlspecialchars($attr) ) );
1203 continue;
1204 }
1205 $txt->addHtml( wfMsgExt( 'comma-separator', 'escapenoentities' ) .
1206 wfMsgExt( 'diff-with-additional', 'escapenoentities',
1207 $this->translateArgument( $key ), htmlspecialchars( $attr ) )
1208 );
1209 }
1210
1211 if ($nbAttributes_min_1 > 0) {
1212 $txt->addHtml( wfMsgExt( 'diff-with-final', 'escapenoentities',
1213 $this->translateArgument($keys[$nbAttributes_min_1]),
1214 htmlspecialchars($attributes[$keys[$nbAttributes_min_1]]) ) );
1215 }
1216 }
1217
1218 protected function translateArgument($name) {
1219 $translation = wfMsgExt('diff-' . $name, 'parseinline' );
1220 if ( wfEmptyMsg( 'diff-' . $name, $translation ) ) {
1221 $translation = $name;
1222 }
1223 return htmlspecialchars( $translation );
1224 }
1225 }
1226
1227 class NoContentTagToString extends TagToString {
1228
1229 function __construct(TagNode $node, $sem) {
1230 parent::__construct($node, $sem);
1231 }
1232
1233 public function getAddedDescription(ChangeText $txt) {
1234 $tagDescription = wfMsgExt('diff-' . $this->node->qName, 'parseinline' );
1235 if(!$tagDescription){
1236 $tagDescription = $this->node->qName;
1237 }
1238 $txt->addHtml( wfMsgExt('diff-changedto', 'parseinline' ) . ' ' . $tagDescription);
1239 $this->addAttributes($txt, $this->node->attributes);
1240 $txt->addHtml('.');
1241 }
1242
1243 public function getRemovedDescription(ChangeText $txt) {
1244 $txt->addHtml( wfMsgExt('diff-changedfrom', 'parseinline' ) . ' ' . $tagDescription);
1245 $this->addAttributes($txt, $this->node->attributes);
1246 $txt->addHtml('.');
1247 }
1248 }
1249
1250 class AnchorToString extends TagToString {
1251
1252 function __construct(TagNode $node, $sem) {
1253 parent::__construct($node, $sem);
1254 }
1255
1256 protected function addAttributes(ChangeText $txt, array $attributes) {
1257 if (array_key_exists('href', $attributes)) {
1258 $txt->addHtml(' ' . wfMsgExt( 'diff-withdestination', 'parseinline' ) . ' ' . htmlspecialchars($attributes['href']));
1259 unset($attributes['href']);
1260 }
1261 parent::addAttributes($txt, $attributes);
1262 }
1263 }
1264
1265 /**
1266 * Takes a branch root and creates an HTML file for it.
1267 */
1268 class HTMLOutput{
1269
1270 private $prefix;
1271 private $handler;
1272
1273 function __construct($prefix, $handler) {
1274 $this->prefix = $prefix;
1275 $this->handler = $handler;
1276 }
1277
1278 public function parse(TagNode $node) {
1279 $handler = &$this->handler;
1280
1281 if (strcasecmp($node->qName, 'img') != 0 && strcasecmp($node->qName, 'body') != 0) {
1282 $handler->startElement($node->qName, $node->attributes);
1283 }
1284
1285 $newStarted = false;
1286 $remStarted = false;
1287 $changeStarted = false;
1288 $changeTXT = '';
1289
1290 foreach ($node->children as &$child) {
1291 if ($child instanceof TagNode) {
1292 if ($newStarted) {
1293 $handler->endElement('span');
1294 $newStarted = false;
1295 } else if ($changeStarted) {
1296 $handler->endElement('span');
1297 $changeStarted = false;
1298 } else if ($remStarted) {
1299 $handler->endElement('span');
1300 $remStarted = false;
1301 }
1302 $this->parse($child);
1303 } else if ($child instanceof TextNode) {
1304 $mod = $child->modification;
1305
1306 if ($newStarted && ($mod->type != Modification::ADDED || $mod->firstOfID)) {
1307 $handler->endElement('span');
1308 $newStarted = false;
1309 } else if ($changeStarted && ($mod->type != Modification::CHANGED
1310 || $mod->changes != $changeTXT || $mod->firstOfID)) {
1311 $handler->endElement('span');
1312 $changeStarted = false;
1313 } else if ($remStarted && ($mod->type != Modification::REMOVED || $mod ->firstOfID)) {
1314 $handler->endElement('span');
1315 $remStarted = false;
1316 }
1317
1318 // no else because a removed part can just be closed and a new
1319 // part can start
1320 if (!$newStarted && $mod->type == Modification::ADDED) {
1321 $attrs = array('class' => 'diff-html-added');
1322 if ($mod->firstOfID) {
1323 $attrs['id'] = "added-{$this->prefix}-{$mod->id}";
1324 }
1325 $handler->startElement('span', $attrs);
1326 $newStarted = true;
1327 } else if (!$changeStarted && $mod->type == Modification::CHANGED) {
1328 $attrs = array('class' => 'diff-html-changed');
1329 if ($mod->firstOfID) {
1330 $attrs['id'] = "changed-{$this->prefix}-{$mod->id}";
1331 }
1332 $handler->startElement('span', $attrs);
1333
1334 //tooltip
1335 $handler->startElement('span', array('class' => 'tip'));
1336 $handler->html($mod->changes);
1337 $handler->endElement('span');
1338
1339 $changeStarted = true;
1340 $changeTXT = $mod->changes;
1341 } else if (!$remStarted && $mod->type == Modification::REMOVED) {
1342 $attrs = array('class'=>'diff-html-removed');
1343 if ($mod->firstOfID) {
1344 $attrs['id'] = "removed-{$this->prefix}-{$mod->id}";
1345 }
1346 $handler->startElement('span', $attrs);
1347 $remStarted = true;
1348 }
1349
1350 $chars = $child->text;
1351
1352 if ($child instanceof ImageNode) {
1353 $this->writeImage($child);
1354 } else {
1355 $handler->characters($chars);
1356 }
1357 }
1358 }
1359
1360 if ($newStarted) {
1361 $handler->endElement('span');
1362 $newStarted = false;
1363 } else if ($changeStarted) {
1364 $handler->endElement('span');
1365 $changeStarted = false;
1366 } else if ($remStarted) {
1367 $handler->endElement('span');
1368 $remStarted = false;
1369 }
1370
1371 if (strcasecmp($node->qName, 'img') != 0
1372 && strcasecmp($node->qName, 'body') != 0) {
1373 $handler->endElement($node->qName);
1374 }
1375 }
1376
1377 private function writeImage(ImageNode $imgNode) {
1378 $attrs = $imgNode->attributes;
1379 $this->handler->startElement('img', $attrs);
1380 $this->handler->endElement('img');
1381 }
1382 }
1383
1384 class EchoingContentHandler {
1385
1386 function startElement($qname, /*array*/ $arguments) {
1387 echo Xml::openElement($qname, $arguments);
1388 }
1389
1390 function endElement($qname){
1391 echo Xml::closeElement($qname);
1392 }
1393
1394 function characters($chars){
1395 echo htmlspecialchars($chars);
1396 }
1397
1398 function html($html){
1399 echo $html;
1400 }
1401
1402 }
1403
1404 class DelegatingContentHandler {
1405
1406 private $delegate;
1407
1408 function __construct($delegate) {
1409 $this->delegate = $delegate;
1410 }
1411
1412 function startElement($qname, /*array*/ $arguments) {
1413 $this->delegate->addHtml(Xml::openElement($qname, $arguments));
1414 }
1415
1416 function endElement($qname){
1417 $this->delegate->addHtml(Xml::closeElement($qname));
1418 }
1419
1420 function characters($chars){
1421 $this->delegate->addHtml(htmlspecialchars($chars));
1422 }
1423
1424 function html($html){
1425 $this->delegate->addHtml($html);
1426 }
1427 }