2 /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * or see http://www.gnu.org/
21 * Any element in the DOM tree of an HTML document.
27 protected $parentTree;
29 public $whiteBefore = false;
31 public $whiteAfter = false;
33 function __construct($parent) {
34 $this->parent
= $parent;
37 public function getParentTree() {
38 if (!isset($this->parentTree
)) {
39 if (!is_null($this->parent
)) {
40 $this->parentTree
= $this->parent
->getParentTree();
41 $this->parentTree
[] = $this->parent
;
43 $this->parentTree
= array();
46 return $this->parentTree
;
49 public function getLastCommonParent(Node
$other) {
50 $result = new LastCommonParentResult();
52 $myParents = $this->getParentTree();
53 $otherParents = $other->getParentTree();
57 $nbMyParents = count($myParents);
58 $nbOtherParents = count($otherParents);
59 while ($isSame && $i < $nbMyParents && $i < $nbOtherParents) {
60 if (!$myParents[$i]->openingTag
=== $otherParents[$i]->openingTag
) {
63 // After a while, the index i-1 must be the last common parent
68 $result->lastCommonParentDepth
= $i - 1;
69 $result->parent
= $myParents[$i - 1];
71 if (!$isSame ||
$nbMyParents > $nbOtherParents) {
72 // Not all tags matched, or all tags matched but
73 // there are tags left in this tree
74 $result->indexInLastCommonParent
= $myParents[$i - 1]->getIndexOf($myParents[$i]);
75 $result->splittingNeeded
= true;
76 } else if ($nbMyParents <= $nbOtherParents) {
77 $result->indexInLastCommonParent
= $myParents[$i - 1]->getIndexOf($this);
82 public function setParent($parent) {
83 $this->parent
= $parent;
84 unset($this->parentTree
);
87 public function inPre() {
88 $tree = $this->getParentTree();
89 foreach ($tree as &$ancestor) {
90 if ($ancestor->isPre()) {
99 * Node that can contain other nodes. Represents an HTML tag.
101 class TagNode
extends Node
{
103 public $children = array();
107 public $attributes = array();
111 function __construct($parent, $qName, /*array*/ $attributes) {
112 parent
::__construct($parent);
113 $this->qName
= strtolower($qName);
114 foreach($attributes as $key => &$value){
115 $this->attributes
[strtolower($key)] = $value;
117 return $this->openingTag
= Xml
::openElement($this->qName
, $this->attributes
);
120 public function addChildAbsolute(Node
$node, $index) {
121 array_splice($this->children
, $index, 0, array($node));
124 public function getIndexOf(Node
$child) {
125 // don't trust array_search with objects
126 foreach ($this->children
as $key => &$value){
127 if ($value === $child) {
134 public function getNbChildren() {
135 return count($this->children
);
138 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
142 $somethingDeleted = false;
143 $hasNonDeletedDescendant = false;
145 if (empty($this->children
)) {
149 foreach ($this->children
as &$child) {
150 $allDeleted_local = false;
151 $somethingDeleted_local = false;
152 $childrenChildren = $child->getMinimalDeletedSet($id, $allDeleted_local, $somethingDeleted_local);
153 if ($somethingDeleted_local) {
154 $nodes = array_merge($nodes, $childrenChildren);
155 $somethingDeleted = true;
157 if (!$allDeleted_local) {
158 $hasNonDeletedDescendant = true;
161 if (!$hasNonDeletedDescendant) {
162 $nodes = array($this);
168 public function splitUntil(TagNode
$parent, Node
$split, $includeLeft) {
169 $splitOccured = false;
170 if ($parent !== $this) {
171 $part1 = new TagNode(null, $this->qName
, $this->attributes
);
172 $part2 = new TagNode(null, $this->qName
, $this->attributes
);
173 $part1->setParent($this->parent
);
174 $part2->setParent($this->parent
);
178 foreach ($this->children
as &$child)
180 if ($child === $split) {
183 if(!$pastSplit ||
($onSplit && $includeLeft)) {
184 $child->setParent($part1);
185 $part1->children
[] = $child;
187 $child->setParent($part2);
188 $part2->children
[] = $child;
195 $myindexinparent = $this->parent
->getIndexOf($this);
196 if (!empty($part1->children
)) {
197 $this->parent
->addChildAbsolute($part1, $myindexinparent);
199 if (!empty($part2->children
)) {
200 $this->parent
->addChildAbsolute($part2, $myindexinparent);
202 if (!empty($part1->children
) && !empty($part2->children
)) {
203 $splitOccured = true;
206 $this->parent
->removeChild($myindexinparent);
209 $this->parent
->splitUntil($parent, $part1, $includeLeft);
211 $this->parent
->splitUntil($parent, $part2, $includeLeft);
214 return $splitOccured;
218 private function removeChild($index) {
219 unset($this->children
[$index]);
220 $this->children
= array_values($this->children
);
223 public static $blocks = array('html', 'body','p','blockquote', 'h1',
224 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li', 'table',
225 'tbody', 'tr', 'td', 'th', 'br');
227 public function copyTree() {
228 $newThis = new TagNode(null, $this->qName
, $this->attributes
);
229 $newThis->whiteBefore
= $this->whiteBefore
;
230 $newThis->whiteAfter
= $this->whiteAfter
;
231 foreach ($this->children
as &$child) {
232 $newChild = $child->copyTree();
233 $newChild->setParent($newThis);
234 $newThis->children
[] = $newChild;
239 public function getMatchRatio(TagNode
$other) {
240 $txtComp = new TextOnlyComparator($other);
241 return $txtComp->getMatchRatio(new TextOnlyComparator($this));
244 public function expandWhiteSpace() {
248 $nbOriginalChildren = $this->getNbChildren();
249 for ($i = 0; $i < $nbOriginalChildren; ++
$i) {
250 $child = $this->children
[$i +
$shift];
252 if ($child instanceof TagNode
) {
253 if (!$child->isPre()) {
254 $child->expandWhiteSpace();
257 if (!$spaceAdded && $child->whiteBefore
) {
258 $ws = new WhiteSpaceNode(null, ' ', $child->getLeftMostChild());
259 $ws->setParent($this);
260 $this->addChildAbsolute($ws,$i +
($shift++
));
262 if ($child->whiteAfter
) {
263 $ws = new WhiteSpaceNode(null, ' ', $child->getRightMostChild());
264 $ws->setParent($this);
265 $this->addChildAbsolute($ws,$i +
1 +
($shift++
));
274 public function getLeftMostChild() {
275 if (empty($this->children
)) {
278 return $this->children
[0]->getLeftMostChild();
281 public function getRightMostChild() {
282 if (empty($this->children
)) {
285 return $this->children
[$this->getNbChildren() - 1]->getRightMostChild();
288 public function isPre() {
289 return 0 == strcasecmp($this->qName
,'pre');
292 public static function toDiffLine(TagNode
$node) {
293 return $node->openingTag
;
298 * Represents a piece of text in the HTML file.
300 class TextNode
extends Node
{
304 public $modification;
306 function __construct($parent, $text) {
307 parent
::__construct($parent);
308 $this->modification
= new Modification(Modification
::NONE
);
312 public function copyTree() {
313 $clone = clone $this;
314 $clone->setParent(null);
318 public function getLeftMostChild() {
322 public function getRightMostChild() {
326 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
327 if ($this->modification
->type
== Modification
::REMOVED
328 && $this->modification
->id
== $id){
329 $somethingDeleted = true;
336 public function isSameText($other) {
337 if (is_null($other) ||
! $other instanceof TextNode
) {
340 return str_replace('\n', ' ',$this->text
) === str_replace('\n', ' ',$other->text
);
343 public static function toDiffLine(TextNode
$node) {
344 return str_replace('\n', ' ',$node->text
);
348 class WhiteSpaceNode
extends TextNode
{
350 function __construct($parent, $s, Node
$like = null) {
351 parent
::__construct($parent, $s);
352 if(!is_null($like) && $like instanceof TextNode
) {
353 $newModification = clone $like->modification
;
354 $newModification->firstOfID
= false;
355 $this->modification
= $newModification;
361 * Represents the root of a HTML document.
363 class BodyNode
extends TagNode
{
365 function __construct() {
366 parent
::__construct(null, 'body', array());
369 public function copyTree() {
370 $newThis = new BodyNode();
371 foreach ($this->children
as &$child) {
372 $newChild = $child->copyTree();
373 $newChild->setParent($newThis);
374 $newThis->children
[] = $newChild;
379 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
381 foreach ($this->children
as &$child) {
382 $childrenChildren = $child->getMinimalDeletedSet($id,
383 $allDeleted, $somethingDeleted);
384 $nodes = array_merge($nodes, $childrenChildren);
392 * Represents an image in HTML. Even though images do not contain any text they
393 * are independent visible objects on the page. They are logically a TextNode.
395 class ImageNode
extends TextNode
{
399 function __construct(TagNode
$parent, /*array*/ $attrs) {
400 if(!array_key_exists('src', $attrs)) {
401 //wfDebug('Image without a source:');
402 //foreach ($attrs as $key => &$value) {
403 //wfDebug("$key = $value");
405 parent
::__construct($parent, '<img></img>');
407 parent
::__construct($parent, '<img>' . strtolower($attrs['src']) . '</img>');
409 $this->attributes
= $attrs;
412 public function isSameText($other) {
413 if (is_null($other) ||
! $other instanceof ImageNode
) {
416 return $this->text
=== $other->text
;
421 class DummyNode
extends Node
{
423 function __construct() {
430 * When detecting the last common parent of two nodes, all results are stored as
431 * a LastCommonParentResult.
433 class LastCommonParentResult
{
439 public $splittingNeeded = false;
442 public $lastCommonParentDepth = -1;
445 public $indexInLastCommonParent = -1;
463 public $firstOfID = false;
467 function __construct($type) {
471 public static function typeToString($type) {
473 case self
::NONE
: return 'none';
474 case self
::REMOVED
: return 'removed';
475 case self
::ADDED
: return 'added';
476 case self
::CHANGED
: return 'changed';
481 class DomTreeBuilder
{
483 public $textNodes = array();
487 private $currentParent;
489 private $newWord = '';
491 protected $bodyStarted = false;
493 protected $bodyEnded = false;
495 private $whiteSpaceBeforeThis = false;
497 private $lastSibling;
499 private $notInPre = true;
501 function __construct() {
502 $this->bodyNode
= $this->currentParent
= new BodyNode();
503 $this->lastSibling
= new DummyNode();
507 * Must be called manually
509 public function endDocument() {
511 //wfDebug(count($this->textNodes) . ' text nodes in document.');
514 public function startElement($parser, $name, /*array*/ $attributes) {
515 if (strcasecmp($name, 'body') != 0) {
516 //wfDebug("Starting $name node.");
519 $newNode = new TagNode($this->currentParent
, $name, $attributes);
520 $this->currentParent
->children
[] = $newNode;
521 $this->currentParent
= $newNode;
522 $this->lastSibling
= new DummyNode();
523 if ($this->whiteSpaceBeforeThis
&& !in_array(strtolower($this->currentParent
->qName
),TagNode
::$blocks)) {
524 $this->currentParent
->whiteBefore
= true;
526 $this->whiteSpaceBeforeThis
= false;
527 if(strcasecmp($name, 'pre') == 0) {
528 $this->notInPre
= false;
533 public function endElement($parser, $name) {
534 if(strcasecmp($name, 'body') != 0) {
535 //wfDebug("Ending $name node.");
536 if (0 == strcasecmp($name,'img')) {
537 // Insert a dummy leaf for the image
538 $img = new ImageNode($this->currentParent
, $this->currentParent
->attributes
);
539 $this->currentParent
->children
[] = $img;
540 $img->whiteBefore
= $this->whiteSpaceBeforeThis
;
541 $this->lastSibling
= $img;
542 $this->textNodes
[] = $img;
545 if (!in_array(strtolower($this->currentParent
->qName
),TagNode
::$blocks)) {
546 $this->lastSibling
= $this->currentParent
;
548 $this->lastSibling
= new DummyNode();
550 $this->currentParent
= $this->currentParent
->parent
;
551 $this->whiteSpaceBeforeThis
= false;
552 if (!$this->notInPre
&& strcasecmp($name, 'pre') == 0) {
553 $this->notInPre
= true;
556 $this->endDocument();
560 const regex
= '/([\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1})/';
561 const whitespace
= '/^[\s]{1}$/';
562 const delimiter
= '/^[\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1}$/';
564 public function characters($parser, $data) {
565 $matches = preg_split(self
::regex
, $data, -1, PREG_SPLIT_DELIM_CAPTURE
);
567 foreach($matches as &$word) {
568 if (preg_match(self
::whitespace
, $word) && $this->notInPre
) {
570 $this->lastSibling
->whiteAfter
= true;
571 $this->whiteSpaceBeforeThis
= true;
572 } else if (preg_match(self
::delimiter
, $word)) {
574 $textNode = new TextNode($this->currentParent
, $word);
575 $this->currentParent
->children
[] = $textNode;
576 $textNode->whiteBefore
= $this->whiteSpaceBeforeThis
;
577 $this->whiteSpaceBeforeThis
= false;
578 $this->lastSibling
= $textNode;
579 $this->textNodes
[] = $textNode;
581 $this->newWord
.= $word;
586 private function endWord() {
587 if ($this->newWord
!== '') {
588 $node = new TextNode($this->currentParent
, $this->newWord
);
589 $this->currentParent
->children
[] = $node;
590 $node->whiteBefore
= $this->whiteSpaceBeforeThis
;
591 $this->whiteSpaceBeforeThis
= false;
592 $this->lastSibling
= $node;
593 $this->textNodes
[] = $node;
598 public function getDiffLines() {
599 return array_map(array('TextNode','toDiffLine'), $this->textNodes
);
603 class TextNodeDiffer
{
608 private $oldTextNodes;
609 private $oldBodyNode;
611 private $lastModified = array();
615 private $changedID = 0;
617 private $changedIDUsed = false;
619 // used to remove the whitespace between a red and green block
620 private $whiteAfterLastChangedPart = false;
622 private $deletedID = 0;
624 function __construct(DomTreeBuilder
$tree, DomTreeBuilder
$oldTree) {
625 $this->textNodes
= $tree->textNodes
;
626 $this->bodyNode
= $tree->bodyNode
;
627 $this->oldTextNodes
= $oldTree->textNodes
;
628 $this->oldBodyNode
= $oldTree->bodyNode
;
631 public function markAsNew($start, $end) {
632 if ($end <= $start) {
636 if ($this->whiteAfterLastChangedPart
) {
637 $this->textNodes
[$start]->whiteBefore
= false;
640 $nextLastModified = array();
642 for ($i = $start; $i < $end; ++
$i) {
643 $mod = new Modification(Modification
::ADDED
);
644 $mod->id
= $this->newID
;
645 if (count($this->lastModified
) > 0) {
646 $mod->prevMod
= $this->lastModified
[0];
647 if (is_null($this->lastModified
[0]->nextMod
)) {
648 foreach ($this->lastModified
as &$lastMod) {
649 $lastMod->nextMod
= $mod;
653 $nextLastModified[] = $mod;
654 $this->textNodes
[$i]->modification
= $mod;
657 $this->textNodes
[$start]->modification
->firstOfID
= true;
660 $this->lastModified
= $nextLastModified;
663 public function handlePossibleChangedPart($leftstart, $leftend, $rightstart, $rightend) {
667 if ($this->changedIDUsed
) {
669 $this->changedIDUsed
= false;
672 $nextLastModified = array();
675 while ($i < $rightend) {
676 $acthis = new AncestorComparator($this->textNodes
[$i]->getParentTree());
677 $acother = new AncestorComparator($this->oldTextNodes
[$j]->getParentTree());
678 $result = $acthis->getResult($acother);
679 unset($acthis, $acother);
681 $nbLastModified = count($this->lastModified
);
682 if ($result->changed
) {
683 $mod = new Modification(Modification
::CHANGED
);
685 if (!$this->changedIDUsed
) {
686 $mod->firstOfID
= true;
687 if (count($nextLastModified) > 0) {
688 $this->lastModified
= $nextLastModified;
689 $nextLastModified = array();
691 } else if (!is_null($result->changes
) && $result->changes
!== $this->changes
) {
693 $mod->firstOfID
= true;
694 if (count($nextLastModified) > 0) {
695 $this->lastModified
= $nextLastModified;
696 $nextLastModified = array();
700 if ($nbLastModified > 0) {
701 $mod->prevMod
= $this->lastModified
[0];
702 if (is_null($this->lastModified
[0]->nextMod
)) {
703 foreach ($this->lastModified
as &$lastMod) {
704 $lastMod->nextMod
= $mod;
708 $nextLastModified[] = $mod;
710 $mod->changes
= $result->changes
;
711 $mod->id
= $this->changedID
;
713 $this->textNodes
[$i]->modification
= $mod;
714 $this->changes
= $result->changes
;
715 $this->changedIDUsed
= true;
716 } else if ($this->changedIDUsed
) {
718 $this->changedIDUsed
= false;
723 if (count($nextLastModified) > 0) {
724 $this->lastModified
= $nextLastModified;
728 public function markAsDeleted($start, $end, $before) {
730 if ($end <= $start) {
734 if ($before > 0 && $this->textNodes
[$before - 1]->whiteAfter
) {
735 $this->whiteAfterLastChangedPart
= true;
737 $this->whiteAfterLastChangedPart
= false;
740 $nextLastModified = array();
742 for ($i = $start; $i < $end; ++
$i) {
743 $mod = new Modification(Modification
::REMOVED
);
744 $mod->id
= $this->deletedID
;
745 if (count($this->lastModified
) > 0) {
746 $mod->prevMod
= $this->lastModified
[0];
747 if (is_null($this->lastModified
[0]->nextMod
)) {
748 foreach ($this->lastModified
as &$lastMod) {
749 $lastMod->nextMod
= $mod;
753 $nextLastModified[] = $mod;
755 // oldTextNodes is used here because we're going to move its deleted
756 // elements to this tree!
757 $this->oldTextNodes
[$i]->modification
= $mod;
759 $this->oldTextNodes
[$start]->modification
->firstOfID
= true;
761 $root = $this->oldTextNodes
[$start]->getLastCommonParent($this->oldTextNodes
[$end-1])->parent
;
763 $junk1 = $junk2 = null;
764 $deletedNodes = $root->getMinimalDeletedSet($this->deletedID
, $junk1, $junk2);
766 //wfDebug("Minimal set of deleted nodes of size " . count($deletedNodes));
768 // Set prevLeaf to the leaf after which the old HTML needs to be
771 $prevLeaf = $this->textNodes
[$before - 1];
773 // Set nextLeaf to the leaf before which the old HTML needs to be
775 if ($before < count($this->textNodes
)) {
776 $nextLeaf = $this->textNodes
[$before];
779 while (count($deletedNodes) > 0) {
780 if (isset($prevLeaf)) {
781 $prevResult = $prevLeaf->getLastCommonParent($deletedNodes[0]);
783 $prevResult = new LastCommonParentResult();
784 $prevResult->parent
= $this->bodyNode
;
785 $prevResult->indexInLastCommonParent
= 0;
787 if (isset($nextleaf)) {
788 $nextResult = $nextLeaf->getLastCommonParent($deletedNodes[count($deletedNodes) - 1]);
790 $nextResult = new LastCommonParentResult();
791 $nextResult->parent
= $this->bodyNode
;
792 $nextResult->indexInLastCommonParent
= $this->bodyNode
->getNbChildren();
795 if ($prevResult->lastCommonParentDepth
== $nextResult->lastCommonParentDepth
) {
796 // We need some metric to choose which way to add-...
797 if ($deletedNodes[0]->parent
=== $deletedNodes[count($deletedNodes) - 1]->parent
798 && $prevResult->parent
=== $nextResult->parent
) {
799 // The difference is not in the parent
800 $prevResult->lastCommonParentDepth
= $prevResult->lastCommonParentDepth +
1;
802 // The difference is in the parent, so compare them
803 // now THIS is tricky
804 $distancePrev = $deletedNodes[0]->parent
->getMatchRatio($prevResult->parent
);
805 $distanceNext = $deletedNodes[count($deletedNodes) - 1]->parent
->getMatchRatio($nextResult->parent
);
807 if ($distancePrev <= $distanceNext) {
808 $prevResult->lastCommonParentDepth
= $prevResult->lastCommonParentDepth +
1;
810 $nextResult->lastCommonParentDepth
= $nextResult->lastCommonParentDepth +
1;
816 if ($prevResult->lastCommonParentDepth
> $nextResult->lastCommonParentDepth
) {
817 // Inserting at the front
818 if ($prevResult->splittingNeeded
) {
819 $prevLeaf->parent
->splitUntil($prevResult->parent
, $prevLeaf, true);
821 $prevLeaf = $deletedNodes[0]->copyTree();
822 unset($deletedNodes[0]);
823 $deletedNodes = array_values($deletedNodes);
824 $prevLeaf->setParent($prevResult->parent
);
825 $prevResult->parent
->addChildAbsolute($prevLeaf,$prevResult->indexInLastCommonParent +
1);
826 } else if ($prevResult->lastCommonParentDepth
< $nextResult->lastCommonParentDepth
) {
827 // Inserting at the back
828 if ($nextResult->splittingNeeded
) {
829 $splitOccured = $nextLeaf->parent
->splitUntil($nextResult->parent
, $nextLeaf, false);
831 // The place where to insert is shifted one place to the
833 $nextResult->indexInLastCommonParent
= $nextResult->indexInLastCommonParent +
1;
836 $nextLeaf = $deletedNodes[count(deletedNodes
) - 1]->copyTree();
837 unset($deletedNodes[count(deletedNodes
) - 1]);
838 $deletedNodes = array_values($deletedNodes);
839 $nextLeaf->setParent($nextResult->parent
);
840 $nextResult->parent
->addChildAbsolute($nextLeaf,$nextResult->indexInLastCommonParent
);
842 throw new Exception("Uh?");
845 $this->lastModified
= $nextLastModified;
849 public function expandWhiteSpace() {
850 $this->bodyNode
->expandWhiteSpace();
853 public function lengthNew(){
854 return count($this->textNodes
);
857 public function lengthOld(){
858 return count($this->oldTextNodes
);
866 function __construct($output) {
867 $this->output
= $output;
870 function htmlDiff($from, $to) {
871 wfProfileIn( __METHOD__
);
872 // Create an XML parser
873 $xml_parser = xml_parser_create('');
875 $domfrom = new DomTreeBuilder();
877 // Set the functions to handle opening and closing tags
878 xml_set_element_handler($xml_parser, array($domfrom, "startElement"), array($domfrom, "endElement"));
880 // Set the function to handle blocks of character data
881 xml_set_character_data_handler($xml_parser, array($domfrom, "characters"));
883 //wfDebug('Parsing '.strlen($from)." characters worth of HTML\n");
884 if (!xml_parse($xml_parser, '<?xml version="1.0" encoding="UTF-8"?>'.Sanitizer
::hackDocType().'<body>', false)
885 ||
!xml_parse($xml_parser, $from, false)
886 ||
!xml_parse($xml_parser, '</body>', true)){
887 $error = xml_error_string(xml_get_error_code($xml_parser));
888 $line = xml_get_current_line_number($xml_parser);
889 wfDebug("XML error: $error at line $line\n");
891 xml_parser_free($xml_parser);
894 $xml_parser = xml_parser_create('');
896 $domto = new DomTreeBuilder();
898 // Set the functions to handle opening and closing tags
899 xml_set_element_handler($xml_parser, array($domto, "startElement"), array($domto, "endElement"));
901 // Set the function to handle blocks of character data
902 xml_set_character_data_handler($xml_parser, array($domto, "characters"));
904 //wfDebug('Parsing '.strlen($to)." characters worth of HTML\n");
905 if (!xml_parse($xml_parser, '<?xml version="1.0" encoding="UTF-8"?>'.Sanitizer
::hackDocType().'<body>', false)
906 ||
!xml_parse($xml_parser, $to, false)
907 ||
!xml_parse($xml_parser, '</body>', true)){
908 $error = xml_error_string(xml_get_error_code($xml_parser));
909 $line = xml_get_current_line_number($xml_parser);
910 wfDebug("XML error: $error at line $line\n");
912 xml_parser_free($xml_parser);
915 $diffengine = new WikiDiff3();
916 $differences = $this->preProcess($diffengine->diff_range($domfrom->getDiffLines(), $domto->getDiffLines()));
917 unset($xml_parser, $diffengine);
919 $domdiffer = new TextNodeDiffer($domto, $domfrom);
921 $currentIndexLeft = 0;
922 $currentIndexRight = 0;
923 foreach ($differences as &$d) {
924 if ($d->leftstart
> $currentIndexLeft) {
925 $domdiffer->handlePossibleChangedPart($currentIndexLeft, $d->leftstart
,
926 $currentIndexRight, $d->rightstart
);
928 if ($d->leftlength
> 0) {
929 $domdiffer->markAsDeleted($d->leftstart
, $d->leftend
, $d->rightstart
);
931 $domdiffer->markAsNew($d->rightstart
, $d->rightend
);
933 $currentIndexLeft = $d->leftend
;
934 $currentIndexRight = $d->rightend
;
936 $oldLength = $domdiffer->lengthOld();
937 if ($currentIndexLeft < $oldLength) {
938 $domdiffer->handlePossibleChangedPart($currentIndexLeft, $oldLength, $currentIndexRight, $domdiffer->lengthNew());
940 $domdiffer->expandWhiteSpace();
941 $output = new HTMLOutput('htmldiff', $this->output
);
942 $output->parse($domdiffer->bodyNode
);
943 wfProfileOut( __METHOD__
);
946 private function preProcess(/*array*/ $differences) {
947 $newRanges = array();
949 $nbDifferences = count($differences);
950 for ($i = 0; $i < $nbDifferences; ++
$i) {
951 $leftStart = $differences[$i]->leftstart
;
952 $leftEnd = $differences[$i]->leftend
;
953 $rightStart = $differences[$i]->rightstart
;
954 $rightEnd = $differences[$i]->rightend
;
956 $leftLength = $leftEnd - $leftStart;
957 $rightLength = $rightEnd - $rightStart;
959 while ($i +
1 < $nbDifferences && self
::score($leftLength,
960 $differences[$i +
1]->leftlength
,
962 $differences[$i +
1]->rightlength
)
963 > ($differences[$i +
1]->leftstart
- $leftEnd)) {
964 $leftEnd = $differences[$i +
1]->leftend
;
965 $rightEnd = $differences[$i +
1]->rightend
;
966 $leftLength = $leftEnd - $leftStart;
967 $rightLength = $rightEnd - $rightStart;
970 $newRanges[] = new RangeDifference($leftStart, $leftEnd, $rightStart, $rightEnd);
976 * Heuristic to merge differences for readability.
978 public static function score($ll, $nll, $rl, $nrl) {
979 if (($ll == 0 && $nll == 0)
980 ||
($rl == 0 && $nrl == 0)) {
983 $numbers = array($ll, $nll, $rl, $nrl);
985 foreach ($numbers as &$number) {
986 while ($number > 3) {
994 return $d / (1.5 * count($numbers));
999 class TextOnlyComparator
{
1001 public $leafs = array();
1003 function _construct(TagNode
$tree) {
1004 $this->addRecursive($tree);
1005 $this->leafs
= array_map(array('TextNode','toDiffLine'), $this->leafs
);
1008 private function addRecursive(TagNode
$tree) {
1009 foreach ($tree->children
as &$child) {
1010 if ($child instanceof TagNode
) {
1011 $this->addRecursive($child);
1012 } else if ($child instanceof TextNode
) {
1013 $this->leafs
[] = $node;
1018 public function getMatchRatio(TextOnlyComparator
$other) {
1019 $nbOthers = count($other->leafs
);
1020 $nbThis = count($this->leafs
);
1021 if($nbOthers == 0 ||
$nbThis == 0){
1025 $diffengine = new WikiDiff3(25000, 1.35);
1026 $diffengine->diff($this->leafs
, $other->leafs
);
1028 $lcsLength = $diffengine->getLcsLength();
1030 $distanceThis = $nbThis-$lcsLength;
1032 return (2.0 - $lcsLength/$nbOthers - $lcsLength/$nbThis) / 2.0;
1036 class AncestorComparatorResult
{
1038 public $changed = false;
1040 public $changes = "";
1044 * A comparator used when calculating the difference in ancestry of two Nodes.
1046 class AncestorComparator
{
1049 public $ancestorsText;
1051 function __construct(/*array*/ $ancestors) {
1052 $this->ancestors
= $ancestors;
1053 $this->ancestorsText
= array_map(array('TagNode','toDiffLine'), $ancestors);
1056 public $compareTxt = "";
1058 public function getResult(AncestorComparator
$other) {
1059 $result = new AncestorComparatorResult();
1061 $diffengine = new WikiDiff3(10000, 1.35);
1062 $differences = $diffengine->diff_range($other->ancestorsText
,$this->ancestorsText
);
1064 if (count($differences) == 0){
1067 $changeTxt = new ChangeTextGenerator($this, $other);
1069 $result->changed
= true;
1070 $result->changes
= $changeTxt->getChanged($differences)->toString();
1076 class ChangeTextGenerator
{
1078 private $ancestorComparator;
1083 function __construct(AncestorComparator
$ancestorComparator, AncestorComparator
$other) {
1084 $this->ancestorComparator
= $ancestorComparator;
1085 $this->other
= $other;
1086 $this->factory
= new TagToStringFactory();
1089 public function getChanged(/*array*/ $differences) {
1090 $txt = new ChangeText
;
1091 $rootlistopened = false;
1092 if (count($differences) > 1) {
1093 $txt->addHtml('<ul class="changelist">');
1094 $rootlistopened = true;
1096 $nbDifferences = count($differences);
1097 for ($j = 0; $j < $nbDifferences; ++
$j) {
1098 $d = $differences[$j];
1099 $lvl1listopened = false;
1100 if ($rootlistopened) {
1101 $txt->addHtml('<li>');
1103 if ($d->leftlength +
$d->rightlength
> 1) {
1104 $txt->addHtml('<ul class="changelist">');
1105 $lvl1listopened = true;
1107 // left are the old ones
1108 for ($i = $d->leftstart
; $i < $d->leftend
; ++
$i) {
1109 if ($lvl1listopened){
1110 $txt->addHtml('<li>');
1112 // add a bullet for a old tag
1113 $this->addTagOld($txt, $this->other
->ancestors
[$i]);
1114 if ($lvl1listopened){
1115 $txt->addHtml('</li>');
1118 // right are the new ones
1119 for ($i = $d->rightstart
; $i < $d->rightend
; ++
$i) {
1120 if ($lvl1listopened){
1121 $txt->addHtml('<li>');
1123 // add a bullet for a new tag
1124 $this->addTagNew($txt, $this->ancestorComparator
->ancestors
[$i]);
1126 if ($lvl1listopened){
1127 $txt->addHtml('</li>');
1130 if ($lvl1listopened) {
1131 $txt->addHtml('</ul>');
1133 if ($rootlistopened) {
1134 $txt->addHtml('</li>');
1137 if ($rootlistopened) {
1138 $txt->addHtml('</ul>');
1143 private function addTagOld(ChangeText
$txt, TagNode
$ancestor) {
1144 $this->factory
->create($ancestor)->getRemovedDescription($txt);
1147 private function addTagNew(ChangeText
$txt, TagNode
$ancestor) {
1148 $this->factory
->create($ancestor)->getAddedDescription($txt);
1156 public function addHtml($s) {
1160 public function toString() {
1165 class TagToStringFactory
{
1167 private static $containerTags = array('html', 'body', 'p', 'blockquote',
1168 'h1', 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li',
1169 'table', 'tbody', 'tr', 'td', 'th', 'br', 'hr', 'code', 'dl',
1170 'dt', 'dd', 'input', 'form', 'img', 'span', 'a');
1172 private static $styleTags = array('i', 'b', 'strong', 'em', 'font',
1173 'big', 'del', 'tt', 'sub', 'sup', 'strike');
1179 public function create(TagNode
$node) {
1180 $sem = $this->getChangeSemantic($node->qName
);
1181 if (strcasecmp($node->qName
,'a') == 0) {
1182 return new AnchorToString($node, $sem);
1184 if (strcasecmp($node->qName
,'img') == 0) {
1185 return new NoContentTagToString($node, $sem);
1187 return new TagToString($node, $sem);
1190 protected function getChangeSemantic($qname) {
1191 if (in_array(strtolower($qname),self
::$containerTags)) {
1194 if (in_array(strtolower($qname),self
::$styleTags)) {
1197 return self
::UNKNOWN
;
1207 function __construct(TagNode
$node, $sem) {
1208 $this->node
= $node;
1212 public function getRemovedDescription(ChangeText
$txt) {
1213 $tagDescription = wfMsgExt('diff-' . $this->node
->qName
, 'parseinline' );
1214 if(!$tagDescription){
1215 $tagDescription = $this->node
->qName
;
1217 if ($this->sem
== TagToStringFactory
::MOVED
) {
1218 $txt->addHtml( wfMsgExt( 'diff-movedoutof', 'parseinline', $tagDescription ) );
1219 } else if ($this->sem
== TagToStringFactory
::STYLE
) {
1220 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-styleremoved' , 'parseinline' ) );
1222 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-removed' , 'parseinline' ) );
1224 $this->addAttributes($txt, $this->node
->attributes
);
1228 public function getAddedDescription(ChangeText
$txt) {
1229 $tagDescription = wfMsgExt('diff-' . $this->node
->qName
, 'parseinline' );
1230 if(!$tagDescription){
1231 $tagDescription = $this->node
->qName
;
1233 if ($this->sem
== TagToStringFactory
::MOVED
) {
1234 $txt->addHtml( wfMsgExt( 'diff-movedto' , 'parseinline', $tagDescription) );
1235 } else if ($this->sem
== TagToStringFactory
::STYLE
) {
1236 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-styleadded', 'parseinline' ) );
1238 $txt->addHtml($tagDescription . ' ' . wfMsgExt( 'diff-added', 'parseinline' ) );
1240 $this->addAttributes($txt, $this->node
->attributes
);
1244 protected function addAttributes(ChangeText
$txt, array $attributes) {
1245 if (count($attributes) < 1) {
1249 $nbAttributes_min_1 = count($attributes)-1;
1250 $keys = array_keys($attributes);
1251 for ($i=0;$i<$nbAttributes_min_1;$i++
) {
1253 $attr = $attributes[$key];
1256 $txt->addHtml( wfMsgExt('diff-with', 'escapenoentities', $this->translateArgument($key), htmlspecialchars($attr) ) );
1259 $txt->addHtml( wfMsgExt( 'comma-separator', 'escapenoentities' ) .
1260 wfMsgExt( 'diff-with-additional', 'escapenoentities',
1261 $this->translateArgument( $key ), htmlspecialchars( $attr ) )
1265 if ($nbAttributes_min_1 > 0) {
1266 $txt->addHtml( wfMsgExt( 'diff-with-final', 'escapenoentities',
1267 $this->translateArgument($keys[$nbAttributes_min_1]),
1268 htmlspecialchars($attributes[$keys[$nbAttributes_min_1]]) ) );
1272 protected function translateArgument($name) {
1273 $translation = wfMsgExt('diff-' . $name, 'parseinline' );
1274 if ( wfEmptyMsg( 'diff-' . $name, $translation ) ) {
1275 $translation = $name;
1277 return htmlspecialchars( $translation );
1281 class NoContentTagToString
extends TagToString
{
1283 function __construct(TagNode
$node, $sem) {
1284 parent
::__construct($node, $sem);
1287 public function getAddedDescription(ChangeText
$txt) {
1288 $tagDescription = wfMsgExt('diff-' . $this->node
->qName
, 'parseinline' );
1289 if(!$tagDescription){
1290 $tagDescription = $this->node
->qName
;
1292 $txt->addHtml( wfMsgExt('diff-changedto', 'parseinline' ) . ' ' . $tagDescription);
1293 $this->addAttributes($txt, $this->node
->attributes
);
1297 public function getRemovedDescription(ChangeText
$txt) {
1298 $txt->addHtml( wfMsgExt('diff-changedfrom', 'parseinline' ) . ' ' . $tagDescription);
1299 $this->addAttributes($txt, $this->node
->attributes
);
1304 class AnchorToString
extends TagToString
{
1306 function __construct(TagNode
$node, $sem) {
1307 parent
::__construct($node, $sem);
1310 protected function addAttributes(ChangeText
$txt, array $attributes) {
1311 if (array_key_exists('href', $attributes)) {
1312 $txt->addHtml(' ' . wfMsgExt( 'diff-withdestination', 'parseinline' ) . ' ' . htmlspecialchars($attributes['href']));
1313 unset($attributes['href']);
1315 parent
::addAttributes($txt, $attributes);
1320 * Takes a branch root and creates an HTML file for it.
1327 function __construct($prefix, $handler) {
1328 $this->prefix
= $prefix;
1329 $this->handler
= $handler;
1332 public function parse(TagNode
$node) {
1333 $handler = &$this->handler
;
1335 if (strcasecmp($node->qName
, 'img') != 0 && strcasecmp($node->qName
, 'body') != 0) {
1336 $handler->startElement($node->qName
, $node->attributes
);
1339 $newStarted = false;
1340 $remStarted = false;
1341 $changeStarted = false;
1344 foreach ($node->children
as &$child) {
1345 if ($child instanceof TagNode
) {
1347 $handler->endElement('span');
1348 $newStarted = false;
1349 } else if ($changeStarted) {
1350 $handler->endElement('span');
1351 $changeStarted = false;
1352 } else if ($remStarted) {
1353 $handler->endElement('span');
1354 $remStarted = false;
1356 $this->parse($child);
1357 } else if ($child instanceof TextNode
) {
1358 $mod = $child->modification
;
1360 if ($newStarted && ($mod->type
!= Modification
::ADDED ||
$mod->firstOfID
)) {
1361 $handler->endElement('span');
1362 $newStarted = false;
1363 } else if ($changeStarted && ($mod->type
!= Modification
::CHANGED
1364 ||
$mod->changes
!= $changeTXT ||
$mod->firstOfID
)) {
1365 $handler->endElement('span');
1366 $changeStarted = false;
1367 } else if ($remStarted && ($mod->type
!= Modification
::REMOVED ||
$mod ->firstOfID
)) {
1368 $handler->endElement('span');
1369 $remStarted = false;
1372 // no else because a removed part can just be closed and a new
1374 if (!$newStarted && $mod->type
== Modification
::ADDED
) {
1375 $attrs = array('class' => 'diff-html-added');
1376 if ($mod->firstOfID
) {
1377 $attrs['id'] = "added-{$this->prefix}-{$mod->id}";
1379 $this->addAttributes($mod, $attrs);
1380 $attrs['onclick'] = 'return tipA(constructToolTipA(this));';
1381 $handler->startElement('span', $attrs);
1383 } else if (!$changeStarted && $mod->type
== Modification
::CHANGED
) {
1384 $attrs = array('class' => 'diff-html-changed');
1385 if ($mod->firstOfID
) {
1386 $attrs['id'] = "changed-{$this->prefix}-{$mod->id}";
1388 $this->addAttributes($mod, $attrs);
1389 $attrs['onclick'] = 'return tipC(constructToolTipC(this));';
1390 $handler->startElement('span', $attrs);
1393 $handler->startElement('span', array('class' => 'tip'));
1394 $handler->html($mod->changes
);
1395 $handler->endElement('span');
1397 $changeStarted = true;
1398 $changeTXT = $mod->changes
;
1399 } else if (!$remStarted && $mod->type
== Modification
::REMOVED
) {
1400 $attrs = array('class'=>'diff-html-removed');
1401 if ($mod->firstOfID
) {
1402 $attrs['id'] = "removed-{$this->prefix}-{$mod->id}";
1404 $this->addAttributes($mod, $attrs);
1405 $attrs['onclick'] = 'return tipR(constructToolTipR(this));';
1406 $handler->startElement('span', $attrs);
1410 $chars = $child->text
;
1412 if ($child instanceof ImageNode
) {
1413 $this->writeImage($child);
1415 $handler->characters($chars);
1422 $handler->endElement('span');
1423 $newStarted = false;
1424 } else if ($changeStarted) {
1425 $handler->endElement('span');
1426 $changeStarted = false;
1427 } else if ($remStarted) {
1428 $handler->endElement('span');
1429 $remStarted = false;
1432 if (strcasecmp($node->qName
, 'img') != 0
1433 && strcasecmp($node->qName
, 'body') != 0) {
1434 $handler->endElement($node->qName
);
1438 private function writeImage(ImageNode
$imgNode) {
1439 $attrs = $imgNode->attributes
;
1440 if ($imgNode->modification
->type
== Modification
::REMOVED
) {
1441 $attrs['changeType']='diff-removed-image';
1442 } else if ($imgNode->modification
->type
== Modification
::ADDED
) {
1443 $attrs['changeType'] = 'diff-added-image';
1445 $attrs['onload'] = 'updateOverlays()';
1446 $attrs['onError'] = 'updateOverlays()';
1447 $attrs['onAbort'] = 'updateOverlays()';
1448 $this->handler
->startElement('img', $attrs);
1449 $this->handler
->endElement('img');
1452 private function addAttributes(Modification
$mod, /*array*/ &$attrs) {
1453 if (is_null($mod->prevMod
)) {
1454 $previous = 'first-' . $this->prefix
;
1456 $type = Modification
::typeToString($mod->prevMod
->type
);
1457 $previous = "$type-{$this->prefix}-{$mod->prevMod->id}";
1459 $attrs['previous'] = $previous;
1461 $type = Modification
::typeToString($mod->type
);
1462 $changeId = "$type-{$this->prefix}-{$mod->id}";
1463 $attrs['changeId'] = $changeId;
1465 if (is_null($mod->nextMod
)) {
1466 $next = "last-{$this->prefix}";
1468 $type = Modification
::typeToString($mod->nextMod
->type
);
1469 $next = "$type-{$this->prefix}-{$mod->nextMod->id}";
1471 $attrs['next'] = $next;
1475 class EchoingContentHandler
{
1477 function startElement($qname, /*array*/ $arguments) {
1478 echo Xml
::openElement($qname, $arguments);
1481 function endElement($qname){
1482 echo Xml
::closeElement($qname);
1485 function characters($chars){
1486 echo htmlspecialchars($chars);
1489 function html($html){
1495 class DelegatingContentHandler
{
1499 function __construct($delegate) {
1500 $this->delegate
= $delegate;
1503 function startElement($qname, /*array*/ $arguments) {
1504 $this->delegate
->addHtml(Xml
::openElement($qname, $arguments));
1507 function endElement($qname){
1508 $this->delegate
->addHtml(Xml
::closeElement($qname));
1511 function characters($chars){
1512 $this->delegate
->addHtml(htmlspecialchars($chars));
1515 function html($html){
1516 $this->delegate
->addHtml($html);