Merge "wfTempDir try harder to get a tmp dir on Windows"
[lhc/web/wiklou.git] / includes / diff / DairikiDiff.php
1 <?php
2 /**
3 * A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
4 *
5 * Copyright © 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
6 * You may copy this code freely under the conditions of the GPL.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup DifferenceEngine
25 * @defgroup DifferenceEngine DifferenceEngine
26 */
27
28 /**
29 * The base class for all other DiffOp classes.
30 *
31 * The classes that extend DiffOp are: DiffOpCopy, DiffOpDelete, DiffOpAdd and
32 * DiffOpChange. FakeDiffOp also extends DiffOp, but it is not located in this file.
33 *
34 * @private
35 * @ingroup DifferenceEngine
36 */
abstract class DiffOp {

	/**
	 * Name of the operation. Concrete subclasses in this file override the
	 * default with 'copy', 'delete', 'add' or 'change'.
	 *
	 * @var string
	 */
	public $type;

	/**
	 * Lines taken from the original ("from") sequence, or false when the
	 * operation has no original side (see DiffOpAdd).
	 *
	 * @var string[]|bool
	 */
	public $orig;

	/**
	 * Lines taken from the closing ("to") sequence, or false when the
	 * operation has no closing side (see DiffOpDelete).
	 *
	 * @var string[]|bool
	 */
	public $closing;

	/**
	 * @return string The operation name stored in $type.
	 */
	public function getType() {
		return $this->type;
	}

	/**
	 * @return string[]|bool The original-side lines, exactly as stored.
	 */
	public function getOrig() {
		return $this->orig;
	}

	/**
	 * Fetch the closing-side lines, or a single one of them.
	 *
	 * @param int|null $i Index of the wanted line; null asks for the whole
	 *   stored value.
	 * @return string[]|string|bool|null With $i === null: the stored $closing
	 *   value (an array, or false when there is no closing side). Otherwise
	 *   the line at index $i, or null when that index does not exist or the
	 *   operation has no closing side at all.
	 */
	public function getClosing( $i = null ) {
		if ( $i === null ) {
			return $this->closing;
		}
		// $closing is false rather than an array for operations without a
		// closing side; array_key_exists() must only ever see an array
		// (PHP 8 raises a TypeError otherwise).
		if ( is_array( $this->closing ) && array_key_exists( $i, $this->closing ) ) {
			return $this->closing[$i];
		}
		return null;
	}

	/**
	 * Build the inverse operation (the one that undoes this operation).
	 *
	 * @return DiffOp
	 */
	abstract public function reverse();

	/**
	 * @return int Number of original-side lines; 0 when $orig is empty or false.
	 */
	public function norig() {
		return $this->orig ? count( $this->orig ) : 0;
	}

	/**
	 * @return int Number of closing-side lines; 0 when $closing is empty or false.
	 */
	public function nclosing() {
		return $this->closing ? count( $this->closing ) : 0;
	}
}
98
99 /**
100 * Extends DiffOp. Used to mark strings that have been
101 * copied from one string array to the other.
102 *
103 * @private
104 * @ingroup DifferenceEngine
105 */
class DiffOpCopy extends DiffOp {
	/** @var string Operation name reported for copied lines. */
	public $type = 'copy';

	/**
	 * @param string[] $orig Lines on the original side.
	 * @param string[]|bool $closing Lines on the closing side. Any value that
	 *   is not an array (including the default) makes the closing side reuse
	 *   $orig.
	 */
	public function __construct( $orig, $closing = false ) {
		$this->orig = $orig;
		$this->closing = is_array( $closing ) ? $closing : $orig;
	}

	/**
	 * Swap the two sides.
	 *
	 * @return DiffOpCopy
	 */
	public function reverse() {
		return new self( $this->closing, $this->orig );
	}
}
124
125 /**
126 * Extends DiffOp. Used to mark strings that have been
127 * deleted from the first string array.
128 *
129 * @private
130 * @ingroup DifferenceEngine
131 */
class DiffOpDelete extends DiffOp {
	/** @var string Operation name reported for deleted lines. */
	public $type = 'delete';

	/**
	 * @param string[] $lines The deleted lines; stored as the original side.
	 */
	public function __construct( $lines ) {
		// A deletion has no closing side, which is signalled with false.
		$this->closing = false;
		$this->orig = $lines;
	}

	/**
	 * The inverse of deleting lines is adding those same lines.
	 *
	 * @return DiffOpAdd
	 */
	public function reverse() {
		$inverse = new DiffOpAdd( $this->orig );

		return $inverse;
	}
}
147
148 /**
149 * Extends DiffOp. Used to mark strings that have been
150 * added to the second string array (i.e. absent from the first).
151 *
152 * @private
153 * @ingroup DifferenceEngine
154 */
class DiffOpAdd extends DiffOp {
	/** @var string Operation name reported for added lines. */
	public $type = 'add';

	/**
	 * @param string[] $lines The added lines; stored as the closing side.
	 */
	public function __construct( $lines ) {
		// An addition has no original side, which is signalled with false.
		$this->orig = false;
		$this->closing = $lines;
	}

	/**
	 * The inverse of adding lines is deleting those same lines.
	 *
	 * @return DiffOpDelete
	 */
	public function reverse() {
		$inverse = new DiffOpDelete( $this->closing );

		return $inverse;
	}
}
170
171 /**
172 * Extends DiffOp. Used to mark strings that have been
173 * changed from the first string array (both added and subtracted).
174 *
175 * @private
176 * @ingroup DifferenceEngine
177 */
class DiffOpChange extends DiffOp {
	/** @var string Operation name reported for replaced lines. */
	public $type = 'change';

	/**
	 * @param string[] $orig Lines removed from the original side.
	 * @param string[] $closing Lines that take their place on the closing side.
	 */
	public function __construct( $orig, $closing ) {
		$this->closing = $closing;
		$this->orig = $orig;
	}

	/**
	 * Swap the two sides.
	 *
	 * @return DiffOpChange
	 */
	public function reverse() {
		return new self( $this->closing, $this->orig );
	}
}
193
194 /**
195 * Class used internally by Diff to actually compute the diffs.
196 *
197 * The algorithm used here is mostly lifted from the perl module
198 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
199 * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
200 *
201 * More ideas are taken from:
202 * http://www.ics.uci.edu/~eppstein/161/960229.html
203 *
204 * Some ideas (and a bit of code) are from analyze.c, from GNU
205 * diffutils-2.7, which can be found at:
206 * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
207 *
208 * Finally, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
209 * are my own.
210 *
211 * Line length limits for robustness added by Tim Starling, 2005-08-31
212 * Alternative implementation added by Guy Van den Broeck, 2008-07-30
213 *
214 * @author Geoffrey T. Dairiki, Tim Starling, Guy Van den Broeck
215 * @private
216 * @ingroup DifferenceEngine
217 */
class DiffEngine {
	// NOTE(review): not referenced by any method of this class; kept because
	// external code may read it.
	const MAX_XREF_LENGTH = 10000;

	/**
	 * Per-line "changed" flags for the from-sequence ($xchanged) and the
	 * to-sequence ($ychanged). Filled by diffLocal(), then adjusted in place
	 * by shiftBoundaries(). Flags are only ever tested for truthiness.
	 */
	protected $xchanged, $ychanged;

	// NOTE(review): the properties below are not used by any method of this
	// class; kept so that subclasses relying on them keep working.
	protected $xv = [], $yv = [];
	protected $xind = [], $yind = [];

	protected $seq = [], $in_seq = [];

	protected $lcs = 0;

	/**
	 * Compute the list of edit operations turning $from_lines into $to_lines.
	 *
	 * Both arrays are read by consecutive integer index starting at 0.
	 *
	 * @param string[] $from_lines
	 * @param string[] $to_lines
	 *
	 * @return DiffOp[]
	 */
	public function diff( $from_lines, $to_lines ) {
		// Diff and store locally
		$this->diffLocal( $from_lines, $to_lines );

		// Merge edits when possible
		$this->shiftBoundaries( $from_lines, $this->xchanged, $this->ychanged );
		$this->shiftBoundaries( $to_lines, $this->ychanged, $this->xchanged );

		// Compute the edit operations.
		$n_from = count( $from_lines );
		$n_to = count( $to_lines );

		$edits = [];
		$xi = $yi = 0;
		while ( $xi < $n_from || $yi < $n_to ) {
			// Once one side is exhausted, everything left on the other side
			// must be flagged as changed.
			assert( $yi < $n_to || $this->xchanged[$xi] );
			assert( $xi < $n_from || $this->ychanged[$yi] );

			// Skip matching "snake".
			$copy = [];
			while ( $xi < $n_from && $yi < $n_to
				&& !$this->xchanged[$xi] && !$this->ychanged[$yi]
			) {
				$copy[] = $from_lines[$xi++];
				++$yi;
			}
			if ( $copy ) {
				$edits[] = new DiffOpCopy( $copy );
			}

			// Find deletes & adds.
			$delete = [];
			while ( $xi < $n_from && $this->xchanged[$xi] ) {
				$delete[] = $from_lines[$xi++];
			}

			$add = [];
			while ( $yi < $n_to && $this->ychanged[$yi] ) {
				$add[] = $to_lines[$yi++];
			}

			if ( $delete && $add ) {
				$edits[] = new DiffOpChange( $delete, $add );
			} elseif ( $delete ) {
				$edits[] = new DiffOpDelete( $delete );
			} elseif ( $add ) {
				$edits[] = new DiffOpAdd( $add );
			}
		}

		return $edits;
	}

	/**
	 * Run WikiDiff3 on the two sequences and keep its per-line results:
	 * its $removed list becomes $xchanged, its $added list becomes $ychanged.
	 *
	 * @param string[] $from_lines
	 * @param string[] $to_lines
	 */
	private function diffLocal( $from_lines, $to_lines ) {
		$wikidiff3 = new WikiDiff3();
		$wikidiff3->diff( $from_lines, $to_lines );
		$this->xchanged = $wikidiff3->removed;
		$this->ychanged = $wikidiff3->added;
	}

	/**
	 * Adjust inserts/deletes of identical lines to join changes
	 * as much as possible.
	 *
	 * We do something when a run of changed lines include a
	 * line at one end and has an excluded, identical line at the other.
	 * We are free to choose which identical line is included.
	 * `compareseq' usually chooses the one at the beginning,
	 * but usually it is cleaner to consider the following identical line
	 * to be the "change".
	 *
	 * This follows analyze.c (GNU diffutils-2.7). Lines are compared with
	 * strict identity (===): PHP's loose == treats numeric strings such as
	 * '1' and '1.0' as equal, which would let a change slide onto a line
	 * with different text.
	 *
	 * @param string[] $lines The sequence being adjusted.
	 * @param array &$changed Per-line changed flags for $lines; modified in place.
	 * @param array $other_changed Per-line changed flags of the other sequence.
	 */
	private function shiftBoundaries( $lines, &$changed, $other_changed ) {
		$i = 0;
		$j = 0;

		assert( count( $lines ) == count( $changed ) );
		$len = count( $lines );
		$other_len = count( $other_changed );

		while ( 1 ) {
			/*
			 * Scan forwards to find beginning of another run of changes.
			 * Also keep track of the corresponding point in the other file.
			 *
			 * Throughout this code, $i and $j are adjusted together so that
			 * the first $i elements of $changed and the first $j elements
			 * of $other_changed both contain the same number of zeros
			 * (unchanged lines).
			 * Furthermore, $j is always kept so that $j == $other_len or
			 * $other_changed[$j] == false.
			 */
			while ( $j < $other_len && $other_changed[$j] ) {
				$j++;
			}

			while ( $i < $len && !$changed[$i] ) {
				assert( $j < $other_len && ! $other_changed[$j] );
				$i++;
				$j++;
				while ( $j < $other_len && $other_changed[$j] ) {
					$j++;
				}
			}

			if ( $i == $len ) {
				break;
			}

			$start = $i;

			// Find the end of this run of changes.
			while ( ++$i < $len && $changed[$i] ) {
				continue;
			}

			do {
				/*
				 * Record the length of this run of changes, so that
				 * we can later determine whether the run has grown.
				 */
				$runlength = $i - $start;

				/*
				 * Move the changed region back, so long as the
				 * previous unchanged line matches the last changed one.
				 * This merges with previous changed regions.
				 */
				while ( $start > 0 && $lines[$start - 1] === $lines[$i - 1] ) {
					$changed[--$start] = 1;
					$changed[--$i] = false;
					while ( $start > 0 && $changed[$start - 1] ) {
						$start--;
					}
					assert( $j > 0 );
					while ( $other_changed[--$j] ) {
						continue;
					}
					assert( $j >= 0 && !$other_changed[$j] );
				}

				/*
				 * Set CORRESPONDING to the end of the changed run, at the last
				 * point where it corresponds to a changed run in the other file.
				 * CORRESPONDING == LEN means no such point has been found.
				 */
				$corresponding = $j < $other_len ? $i : $len;

				/*
				 * Move the changed region forward, so long as the
				 * first changed line matches the following unchanged one.
				 * This merges with following changed regions.
				 * Do this second, so that if there are no merges,
				 * the changed region is moved forward as far as possible.
				 */
				while ( $i < $len && $lines[$start] === $lines[$i] ) {
					$changed[$start++] = false;
					$changed[$i++] = 1;
					while ( $i < $len && $changed[$i] ) {
						$i++;
					}

					assert( $j < $other_len && ! $other_changed[$j] );
					$j++;
					if ( $j < $other_len && $other_changed[$j] ) {
						$corresponding = $i;
						while ( $j < $other_len && $other_changed[$j] ) {
							$j++;
						}
					}
				}
			} while ( $runlength != $i - $start );

			/*
			 * If possible, move the fully-merged run of changes
			 * back to a corresponding run in the other file.
			 */
			while ( $corresponding < $i ) {
				$changed[--$start] = 1;
				$changed[--$i] = 0;
				assert( $j > 0 );
				while ( $other_changed[--$j] ) {
					continue;
				}
				assert( $j >= 0 && !$other_changed[$j] );
			}
		}
	}
}
431
432 /**
433 * Class representing a 'diff' between two sequences of strings.
434 * @todo document
435 * @private
436 * @ingroup DifferenceEngine
437 */
class Diff {

	/**
	 * The edit operations, in sequence order.
	 *
	 * @var DiffOp[]
	 */
	public $edits;

	/**
	 * Constructor.
	 * Computes diff between sequences of strings.
	 *
	 * @param string[] $from_lines An array of strings.
	 *   Typically these are lines from a file.
	 * @param string[] $to_lines An array of strings.
	 */
	public function __construct( $from_lines, $to_lines ) {
		$eng = new DiffEngine;
		$this->edits = $eng->diff( $from_lines, $to_lines );
	}

	/**
	 * @return DiffOp[]
	 */
	public function getEdits() {
		return $this->edits;
	}

	/**
	 * Compute reversed Diff.
	 *
	 * SYNOPSIS:
	 *
	 *    $diff = new Diff($lines1, $lines2);
	 *    $rev = $diff->reverse();
	 *
	 * The receiver is left untouched; the result is a separate object of the
	 * same class.
	 *
	 * @return Diff A Diff object representing the inverse of the
	 *   original diff.
	 */
	public function reverse() {
		// Plain assignment would only copy the object handle, so resetting
		// the edit list would wipe $this->edits before it is iterated and
		// the result would always be empty. Work on a clone instead.
		$rev = clone $this;
		$rev->edits = [];
		/** @var DiffOp $edit */
		foreach ( $this->edits as $edit ) {
			$rev->edits[] = $edit->reverse();
		}

		return $rev;
	}

	/**
	 * Check for empty diff.
	 *
	 * @return bool True if two sequences were identical, i.e. every edit
	 *   is a 'copy'.
	 */
	public function isEmpty() {
		foreach ( $this->edits as $edit ) {
			if ( $edit->type != 'copy' ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Compute the length of the Longest Common Subsequence (LCS).
	 *
	 * This is mostly for diagnostic purposes.
	 *
	 * @return int The length of the LCS: the total number of lines held
	 *   by 'copy' edits.
	 */
	public function lcs() {
		$lcs = 0;
		foreach ( $this->edits as $edit ) {
			if ( $edit->type == 'copy' ) {
				$lcs += count( $edit->orig );
			}
		}

		return $lcs;
	}

	/**
	 * Get the original set of lines.
	 *
	 * This reconstructs the $from_lines parameter passed to the
	 * constructor.
	 *
	 * @return string[] The original sequence of strings.
	 */
	public function orig() {
		$lines = [];

		foreach ( $this->edits as $edit ) {
			// Edits without an original side carry false here.
			if ( $edit->orig ) {
				// Append at the end of $lines.
				array_splice( $lines, count( $lines ), 0, $edit->orig );
			}
		}

		return $lines;
	}

	/**
	 * Get the closing set of lines.
	 *
	 * This reconstructs the $to_lines parameter passed to the
	 * constructor.
	 *
	 * @return string[] The sequence of strings.
	 */
	public function closing() {
		$lines = [];

		foreach ( $this->edits as $edit ) {
			// Edits without a closing side carry false here.
			if ( $edit->closing ) {
				// Append at the end of $lines.
				array_splice( $lines, count( $lines ), 0, $edit->closing );
			}
		}

		return $lines;
	}
}
560
561 /**
562 * @todo document, bad name.
563 * @private
564 * @ingroup DifferenceEngine
565 */
class MappedDiff extends Diff {
	/**
	 * Constructor.
	 *
	 * Diffs the *mapped* sequences, then rewrites every resulting edit so
	 * that it carries the matching lines of the real sequences instead.
	 *
	 * This can be used to compute things like
	 * case-insensitive diffs, or diffs which ignore
	 * changes in white-space.
	 *
	 * @param string[] $from_lines An array of strings.
	 *   Typically these are lines from a file.
	 * @param string[] $to_lines An array of strings.
	 * @param string[] $mapped_from_lines This array should
	 *   have the same number of elements as $from_lines.
	 *   The elements in $mapped_from_lines and
	 *   $mapped_to_lines are what is actually compared
	 *   when computing the diff.
	 * @param string[] $mapped_to_lines This array should
	 *   have the same number of elements as $to_lines.
	 */
	public function __construct( $from_lines, $to_lines,
		$mapped_from_lines, $mapped_to_lines
	) {
		assert( count( $from_lines ) == count( $mapped_from_lines ) );
		assert( count( $to_lines ) == count( $mapped_to_lines ) );

		parent::__construct( $mapped_from_lines, $mapped_to_lines );

		// Read positions inside the real (unmapped) sequences.
		$fromOffset = 0;
		$toOffset = 0;

		foreach ( $this->edits as $edit ) {
			// A side that is not an array (false) does not exist for this
			// edit and consumes no lines.
			if ( is_array( $edit->orig ) ) {
				$edit->orig = array_slice( $from_lines, $fromOffset, count( $edit->orig ) );
				$fromOffset += count( $edit->orig );
			}

			if ( is_array( $edit->closing ) ) {
				$edit->closing = array_slice( $to_lines, $toOffset, count( $edit->closing ) );
				$toOffset += count( $edit->closing );
			}
		}
	}
}
612
613 /**
614 * Additions by Axel Boldt follow, partly taken from diff.php, phpwiki-1.3.3
615 */
616
617 /**
618 * @todo document
619 * @private
620 * @ingroup DifferenceEngine
621 */
class HWLDFWordAccumulator {
	/** @var string Attribute text placed inside the opening <ins tag. */
	public $insClass = ' class="diffchange diffchange-inline"';
	/** @var string Attribute text placed inside the opening <del tag. */
	public $delClass = ' class="diffchange diffchange-inline"';

	/** @var string[] Finished output lines. */
	private $lines = [];
	/** @var string HTML of the line currently being assembled. */
	private $line = '';
	/** @var string Raw (unescaped) text collected for the current tag. */
	private $group = '';
	/** @var string Tag the current group belongs to: 'ins', 'del' or other. */
	private $tag = '';

	/**
	 * Append the collected group to the current line, HTML-escaped and
	 * wrapped in <ins>/<del> when the current tag asks for it, then start
	 * a new, empty group under $new_tag.
	 *
	 * @param string $new_tag
	 */
	private function flushGroup( $new_tag ) {
		if ( $this->group !== '' ) {
			$html = htmlspecialchars( $this->group );
			switch ( $this->tag ) {
				case 'ins':
					$html = "<ins{$this->insClass}>" . $html . '</ins>';
					break;
				case 'del':
					$html = "<del{$this->delClass}>" . $html . '</del>';
					break;
			}
			$this->line .= $html;
		}
		$this->tag = $new_tag;
		$this->group = '';
	}

	/**
	 * Finish the current line and store it in the output list.
	 *
	 * @param string $new_tag
	 */
	private function flushLine( $new_tag ) {
		$this->flushGroup( $new_tag );
		// make empty lines visible by inserting an NBSP
		$this->lines[] = ( $this->line != '' ) ? $this->line : '&#160;';
		$this->line = '';
	}

	/**
	 * Feed words into the accumulator.
	 *
	 * @param string[] $words
	 * @param string $tag 'ins' or 'del' to mark the words up; anything else
	 *   leaves them unmarked.
	 */
	public function addWords( $words, $tag = '' ) {
		if ( $tag != $this->tag ) {
			$this->flushGroup( $tag );
		}

		foreach ( $words as $piece ) {
			if ( $piece == '' ) {
				continue;
			}
			// new-line should only come as first char of word.
			if ( $piece[0] == "\n" ) {
				$this->flushLine( $tag );
				$piece = substr( $piece, 1 );
			}
			assert( !strstr( $piece, "\n" ) );
			$this->group .= $piece;
		}
	}

	/**
	 * Close the line in progress and return everything accumulated so far.
	 *
	 * @return string[]
	 */
	public function getLines() {
		$this->flushLine( '~done' );

		return $this->lines;
	}
}
696
697 /**
698 * @todo document
699 * @private
700 * @ingroup DifferenceEngine
701 */
class WordLevelDiff extends MappedDiff {
	/** Lines longer than this many bytes are treated as one single word. */
	const MAX_LINE_LENGTH = 10000;

	/**
	 * Split both texts into words and diff them word by word.
	 *
	 * @param string[] $orig_lines
	 * @param string[] $closing_lines
	 */
	public function __construct( $orig_lines, $closing_lines ) {
		$origParts = $this->split( $orig_lines );
		$closingParts = $this->split( $closing_lines );

		// Index 0 holds the full words, index 1 the stripped words that are
		// actually compared.
		parent::__construct(
			$origParts[0], $closingParts[0],
			$origParts[1], $closingParts[1]
		);
	}

	/**
	 * Break lines into words.
	 *
	 * @param string[] $lines
	 *
	 * @return array[] Two parallel lists: [ full words, stripped words ].
	 *   Every line after the first is preceded by a "\n" entry in both lists.
	 */
	private function split( $lines ) {
		$fullWords = [];
		$strippedWords = [];
		$isFirstLine = true;

		foreach ( $lines as $line ) {
			if ( !$isFirstLine ) {
				$fullWords[] = "\n";
				$strippedWords[] = "\n";
			}
			$isFirstLine = false;

			// If the line is too long, just pretend the entire line is one
			// big word. This prevents resource exhaustion problems.
			if ( strlen( $line ) > self::MAX_LINE_LENGTH ) {
				$fullWords[] = $line;
				$strippedWords[] = $line;
				continue;
			}

			$matches = [];
			$found = preg_match_all(
				'/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs',
				$line,
				$matches
			);
			if ( $found ) {
				// Whole match: the word plus an optional trailing blank.
				foreach ( $matches[0] as $fullWord ) {
					$fullWords[] = $fullWord;
				}
				// First capture group: the word alone.
				foreach ( $matches[1] as $bareWord ) {
					$strippedWords[] = $bareWord;
				}
			}
		}

		return [ $fullWords, $strippedWords ];
	}

	/**
	 * Render the original side, with removed words passed to the
	 * accumulator under the 'del' tag.
	 *
	 * @return string[]
	 */
	public function orig() {
		$accumulator = new HWLDFWordAccumulator;

		foreach ( $this->edits as $edit ) {
			if ( $edit->type == 'copy' ) {
				$accumulator->addWords( $edit->orig );
			} elseif ( $edit->orig ) {
				$accumulator->addWords( $edit->orig, 'del' );
			}
		}

		return $accumulator->getLines();
	}

	/**
	 * Render the closing side, with added words passed to the
	 * accumulator under the 'ins' tag.
	 *
	 * @return string[]
	 */
	public function closing() {
		$accumulator = new HWLDFWordAccumulator;

		foreach ( $this->edits as $edit ) {
			if ( $edit->type == 'copy' ) {
				$accumulator->addWords( $edit->closing );
			} elseif ( $edit->closing ) {
				$accumulator->addWords( $edit->closing, 'ins' );
			}
		}

		return $accumulator->getLines();
	}

}