Merge "Allow local interwiki links with an empty title part"
[lhc/web/wiklou.git] / includes / diff / DairikiDiff.php
1 <?php
2 /**
3 * A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
4 *
5 * Copyright © 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
6 * You may copy this code freely under the conditions of the GPL.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup DifferenceEngine
25 * @defgroup DifferenceEngine DifferenceEngine
26 */
27
28 /**
29 * @todo document
30 * @private
31 * @ingroup DifferenceEngine
32 */
33 abstract class DiffOp {
34
35 /**
36 * @var string
37 */
38 public $type;
39
40 /**
41 * @var string[]
42 */
43 public $orig;
44
45 /**
46 * @var string[]
47 */
48 public $closing;
49
50 /**
51 * @return string
52 */
53 public function getType() {
54 return $this->type;
55 }
56
57 /**
58 * @return string[]
59 */
60 public function getOrig() {
61 return $this->orig;
62 }
63
64 /**
65 * @param int $i
66 * @return string|null
67 */
68 public function getClosing( $i = null ) {
69 if ( $i === null ) {
70 return $this->closing;
71 }
72 if ( array_key_exists( $i, $this->closing ) ) {
73 return $this->closing[$i];
74 }
75 return null;
76 }
77
78 abstract public function reverse();
79
80 /**
81 * @return int
82 */
83 public function norig() {
84 return $this->orig ? count( $this->orig ) : 0;
85 }
86
87 /**
88 * @return int
89 */
90 public function nclosing() {
91 return $this->closing ? count( $this->closing ) : 0;
92 }
93 }
94
95 /**
96 * @todo document
97 * @private
98 * @ingroup DifferenceEngine
99 */
100 class DiffOpCopy extends DiffOp {
101 public $type = 'copy';
102
103 public function __construct( $orig, $closing = false ) {
104 if ( !is_array( $closing ) ) {
105 $closing = $orig;
106 }
107 $this->orig = $orig;
108 $this->closing = $closing;
109 }
110
111 /**
112 * @return DiffOpCopy
113 */
114 public function reverse() {
115 return new DiffOpCopy( $this->closing, $this->orig );
116 }
117 }
118
119 /**
120 * @todo document
121 * @private
122 * @ingroup DifferenceEngine
123 */
124 class DiffOpDelete extends DiffOp {
125 public $type = 'delete';
126
127 public function __construct( $lines ) {
128 $this->orig = $lines;
129 $this->closing = false;
130 }
131
132 /**
133 * @return DiffOpAdd
134 */
135 public function reverse() {
136 return new DiffOpAdd( $this->orig );
137 }
138 }
139
140 /**
141 * @todo document
142 * @private
143 * @ingroup DifferenceEngine
144 */
145 class DiffOpAdd extends DiffOp {
146 public $type = 'add';
147
148 public function __construct( $lines ) {
149 $this->closing = $lines;
150 $this->orig = false;
151 }
152
153 /**
154 * @return DiffOpDelete
155 */
156 public function reverse() {
157 return new DiffOpDelete( $this->closing );
158 }
159 }
160
161 /**
162 * @todo document
163 * @private
164 * @ingroup DifferenceEngine
165 */
166 class DiffOpChange extends DiffOp {
167 public $type = 'change';
168
169 public function __construct( $orig, $closing ) {
170 $this->orig = $orig;
171 $this->closing = $closing;
172 }
173
174 /**
175 * @return DiffOpChange
176 */
177 public function reverse() {
178 return new DiffOpChange( $this->closing, $this->orig );
179 }
180 }
181
182 /**
183 * Class used internally by Diff to actually compute the diffs.
184 *
185 * The algorithm used here is mostly lifted from the perl module
186 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
187 * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
188 *
189 * More ideas are taken from:
190 * http://www.ics.uci.edu/~eppstein/161/960229.html
191 *
192 * Some ideas are (and a bit of code) are from from analyze.c, from GNU
193 * diffutils-2.7, which can be found at:
194 * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
195 *
196 * closingly, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
197 * are my own.
198 *
199 * Line length limits for robustness added by Tim Starling, 2005-08-31
200 * Alternative implementation added by Guy Van den Broeck, 2008-07-30
201 *
202 * @author Geoffrey T. Dairiki, Tim Starling, Guy Van den Broeck
203 * @private
204 * @ingroup DifferenceEngine
205 */
206 class DiffEngine {
207 const MAX_XREF_LENGTH = 10000;
208
209 protected $xchanged, $ychanged;
210
211 protected $xv = array(), $yv = array();
212 protected $xind = array(), $yind = array();
213
214 protected $seq = array(), $in_seq = array();
215
216 protected $lcs = 0;
217
218 /**
219 * @param string[] $from_lines
220 * @param string[] $to_lines
221 *
222 * @return DiffOp[]
223 */
224 public function diff( $from_lines, $to_lines ) {
225 wfProfileIn( __METHOD__ );
226
227 // Diff and store locally
228 $this->diffLocal( $from_lines, $to_lines );
229
230 // Merge edits when possible
231 $this->shiftBoundaries( $from_lines, $this->xchanged, $this->ychanged );
232 $this->shiftBoundaries( $to_lines, $this->ychanged, $this->xchanged );
233
234 // Compute the edit operations.
235 $n_from = count( $from_lines );
236 $n_to = count( $to_lines );
237
238 $edits = array();
239 $xi = $yi = 0;
240 while ( $xi < $n_from || $yi < $n_to ) {
241 assert( '$yi < $n_to || $this->xchanged[$xi]' );
242 assert( '$xi < $n_from || $this->ychanged[$yi]' );
243
244 // Skip matching "snake".
245 $copy = array();
246 while ( $xi < $n_from && $yi < $n_to
247 && !$this->xchanged[$xi] && !$this->ychanged[$yi]
248 ) {
249 $copy[] = $from_lines[$xi++];
250 ++$yi;
251 }
252 if ( $copy ) {
253 $edits[] = new DiffOpCopy( $copy );
254 }
255
256 // Find deletes & adds.
257 $delete = array();
258 while ( $xi < $n_from && $this->xchanged[$xi] ) {
259 $delete[] = $from_lines[$xi++];
260 }
261
262 $add = array();
263 while ( $yi < $n_to && $this->ychanged[$yi] ) {
264 $add[] = $to_lines[$yi++];
265 }
266
267 if ( $delete && $add ) {
268 $edits[] = new DiffOpChange( $delete, $add );
269 } elseif ( $delete ) {
270 $edits[] = new DiffOpDelete( $delete );
271 } elseif ( $add ) {
272 $edits[] = new DiffOpAdd( $add );
273 }
274 }
275 wfProfileOut( __METHOD__ );
276
277 return $edits;
278 }
279
280 /**
281 * @param string[] $from_lines
282 * @param string[] $to_lines
283 */
284 private function diffLocal( $from_lines, $to_lines ) {
285 global $wgExternalDiffEngine;
286 wfProfileIn( __METHOD__ );
287
288 if ( $wgExternalDiffEngine == 'wikidiff3' ) {
289 // wikidiff3
290 $wikidiff3 = new WikiDiff3();
291 $wikidiff3->diff( $from_lines, $to_lines );
292 $this->xchanged = $wikidiff3->removed;
293 $this->ychanged = $wikidiff3->added;
294 unset( $wikidiff3 );
295 } else {
296 // old diff
297 $n_from = count( $from_lines );
298 $n_to = count( $to_lines );
299 $this->xchanged = $this->ychanged = array();
300 $this->xv = $this->yv = array();
301 $this->xind = $this->yind = array();
302 unset( $this->seq );
303 unset( $this->in_seq );
304 unset( $this->lcs );
305
306 // Skip leading common lines.
307 for ( $skip = 0; $skip < $n_from && $skip < $n_to; $skip++ ) {
308 if ( $from_lines[$skip] !== $to_lines[$skip] ) {
309 break;
310 }
311 $this->xchanged[$skip] = $this->ychanged[$skip] = false;
312 }
313 // Skip trailing common lines.
314 $xi = $n_from;
315 $yi = $n_to;
316 for ( $endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++ ) {
317 if ( $from_lines[$xi] !== $to_lines[$yi] ) {
318 break;
319 }
320 $this->xchanged[$xi] = $this->ychanged[$yi] = false;
321 }
322
323 // Ignore lines which do not exist in both files.
324 for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) {
325 $xhash[$this->lineHash( $from_lines[$xi] )] = 1;
326 }
327
328 for ( $yi = $skip; $yi < $n_to - $endskip; $yi++ ) {
329 $line = $to_lines[$yi];
330 if ( ( $this->ychanged[$yi] = empty( $xhash[$this->lineHash( $line )] ) ) ) {
331 continue;
332 }
333 $yhash[$this->lineHash( $line )] = 1;
334 $this->yv[] = $line;
335 $this->yind[] = $yi;
336 }
337 for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) {
338 $line = $from_lines[$xi];
339 if ( ( $this->xchanged[$xi] = empty( $yhash[$this->lineHash( $line )] ) ) ) {
340 continue;
341 }
342 $this->xv[] = $line;
343 $this->xind[] = $xi;
344 }
345
346 // Find the LCS.
347 $this->compareSeq( 0, count( $this->xv ), 0, count( $this->yv ) );
348 }
349 wfProfileOut( __METHOD__ );
350 }
351
352 /**
353 * Returns the whole line if it's small enough, or the MD5 hash otherwise
354 *
355 * @param string $line
356 *
357 * @return string
358 */
359 private function lineHash( $line ) {
360 if ( strlen( $line ) > self::MAX_XREF_LENGTH ) {
361 return md5( $line );
362 } else {
363 return $line;
364 }
365 }
366
367 /**
368 * Divide the Largest Common Subsequence (LCS) of the sequences
369 * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
370 * sized segments.
371 *
372 * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an
373 * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
374 * sub sequences. The first sub-sequence is contained in [X0, X1),
375 * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note
376 * that (X0, Y0) == (XOFF, YOFF) and
377 * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
378 *
379 * This function assumes that the first lines of the specified portions
380 * of the two files do not match, and likewise that the last lines do not
381 * match. The caller must trim matching lines from the beginning and end
382 * of the portions it is going to specify.
383 *
384 * @param int $xoff
385 * @param int $xlim
386 * @param int $yoff
387 * @param int $ylim
388 * @param int $nchunks
389 *
390 * @return array List of two elements, integer and array[].
391 */
392 private function diag( $xoff, $xlim, $yoff, $ylim, $nchunks ) {
393 $flip = false;
394
395 if ( $xlim - $xoff > $ylim - $yoff ) {
396 // Things seems faster (I'm not sure I understand why)
397 // when the shortest sequence in X.
398 $flip = true;
399 list( $xoff, $xlim, $yoff, $ylim ) = array( $yoff, $ylim, $xoff, $xlim );
400 }
401
402 if ( $flip ) {
403 for ( $i = $ylim - 1; $i >= $yoff; $i-- ) {
404 $ymatches[$this->xv[$i]][] = $i;
405 }
406 } else {
407 for ( $i = $ylim - 1; $i >= $yoff; $i-- ) {
408 $ymatches[$this->yv[$i]][] = $i;
409 }
410 }
411
412 $this->lcs = 0;
413 $this->seq[0] = $yoff - 1;
414 $this->in_seq = array();
415 $ymids[0] = array();
416
417 $numer = $xlim - $xoff + $nchunks - 1;
418 $x = $xoff;
419 for ( $chunk = 0; $chunk < $nchunks; $chunk++ ) {
420 if ( $chunk > 0 ) {
421 for ( $i = 0; $i <= $this->lcs; $i++ ) {
422 $ymids[$i][$chunk - 1] = $this->seq[$i];
423 }
424 }
425
426 $x1 = $xoff + (int)( ( $numer + ( $xlim - $xoff ) * $chunk ) / $nchunks );
427 // @codingStandardsIgnoreFile Ignore Squiz.WhiteSpace.SemicolonSpacing.Incorrect
428 for ( ; $x < $x1; $x++ ) {
429 // @codingStandardsIgnoreEnd
430 $line = $flip ? $this->yv[$x] : $this->xv[$x];
431 if ( empty( $ymatches[$line] ) ) {
432 continue;
433 }
434
435 $k = 0;
436 $matches = $ymatches[$line];
437 reset( $matches );
438 while ( list( , $y ) = each( $matches ) ) {
439 if ( empty( $this->in_seq[$y] ) ) {
440 $k = $this->lcsPos( $y );
441 assert( '$k > 0' );
442 $ymids[$k] = $ymids[$k - 1];
443 break;
444 }
445 }
446
447 while ( list( , $y ) = each( $matches ) ) {
448 if ( $y > $this->seq[$k - 1] ) {
449 assert( '$y < $this->seq[$k]' );
450 // Optimization: this is a common case:
451 // next match is just replacing previous match.
452 $this->in_seq[$this->seq[$k]] = false;
453 $this->seq[$k] = $y;
454 $this->in_seq[$y] = 1;
455 } elseif ( empty( $this->in_seq[$y] ) ) {
456 $k = $this->lcsPos( $y );
457 assert( '$k > 0' );
458 $ymids[$k] = $ymids[$k - 1];
459 }
460 }
461 }
462 }
463
464 $seps[] = $flip ? array( $yoff, $xoff ) : array( $xoff, $yoff );
465 $ymid = $ymids[$this->lcs];
466 for ( $n = 0; $n < $nchunks - 1; $n++ ) {
467 $x1 = $xoff + (int)( ( $numer + ( $xlim - $xoff ) * $n ) / $nchunks );
468 $y1 = $ymid[$n] + 1;
469 $seps[] = $flip ? array( $y1, $x1 ) : array( $x1, $y1 );
470 }
471 $seps[] = $flip ? array( $ylim, $xlim ) : array( $xlim, $ylim );
472
473 return array( $this->lcs, $seps );
474 }
475
476 /**
477 * @param int $ypos
478 *
479 * @return int
480 */
481 private function lcsPos( $ypos ) {
482 $end = $this->lcs;
483 if ( $end == 0 || $ypos > $this->seq[$end] ) {
484 $this->seq[++$this->lcs] = $ypos;
485 $this->in_seq[$ypos] = 1;
486
487 return $this->lcs;
488 }
489
490 $beg = 1;
491 while ( $beg < $end ) {
492 $mid = (int)( ( $beg + $end ) / 2 );
493 if ( $ypos > $this->seq[$mid] ) {
494 $beg = $mid + 1;
495 } else {
496 $end = $mid;
497 }
498 }
499
500 assert( '$ypos != $this->seq[$end]' );
501
502 $this->in_seq[$this->seq[$end]] = false;
503 $this->seq[$end] = $ypos;
504 $this->in_seq[$ypos] = 1;
505
506 return $end;
507 }
508
509 /**
510 * Find LCS of two sequences.
511 *
512 * The results are recorded in the vectors $this->{x,y}changed[], by
513 * storing a 1 in the element for each line that is an insertion
514 * or deletion (ie. is not in the LCS).
515 *
516 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
517 *
518 * Note that XLIM, YLIM are exclusive bounds.
519 * All line numbers are origin-0 and discarded lines are not counted.
520 *
521 * @param int $xoff
522 * @param int $xlim
523 * @param int $yoff
524 * @param int $ylim
525 */
526 private function compareSeq( $xoff, $xlim, $yoff, $ylim ) {
527 // Slide down the bottom initial diagonal.
528 while ( $xoff < $xlim && $yoff < $ylim && $this->xv[$xoff] == $this->yv[$yoff] ) {
529 ++$xoff;
530 ++$yoff;
531 }
532
533 // Slide up the top initial diagonal.
534 while ( $xlim > $xoff && $ylim > $yoff
535 && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]
536 ) {
537 --$xlim;
538 --$ylim;
539 }
540
541 if ( $xoff == $xlim || $yoff == $ylim ) {
542 $lcs = 0;
543 } else {
544 // This is ad hoc but seems to work well.
545 // $nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5);
546 // $nchunks = max(2,min(8,(int)$nchunks));
547 $nchunks = min( 7, $xlim - $xoff, $ylim - $yoff ) + 1;
548 list( $lcs, $seps ) = $this->diag( $xoff, $xlim, $yoff, $ylim, $nchunks );
549 }
550
551 if ( $lcs == 0 ) {
552 // X and Y sequences have no common subsequence:
553 // mark all changed.
554 while ( $yoff < $ylim ) {
555 $this->ychanged[$this->yind[$yoff++]] = 1;
556 }
557 while ( $xoff < $xlim ) {
558 $this->xchanged[$this->xind[$xoff++]] = 1;
559 }
560 } else {
561 // Use the partitions to split this problem into subproblems.
562 reset( $seps );
563 $pt1 = $seps[0];
564 while ( $pt2 = next( $seps ) ) {
565 $this->compareSeq( $pt1[0], $pt2[0], $pt1[1], $pt2[1] );
566 $pt1 = $pt2;
567 }
568 }
569 }
570
571 /**
572 * Adjust inserts/deletes of identical lines to join changes
573 * as much as possible.
574 *
575 * We do something when a run of changed lines include a
576 * line at one end and has an excluded, identical line at the other.
577 * We are free to choose which identical line is included.
578 * `compareseq' usually chooses the one at the beginning,
579 * but usually it is cleaner to consider the following identical line
580 * to be the "change".
581 *
582 * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
583 */
584 private function shiftBoundaries( $lines, &$changed, $other_changed ) {
585 wfProfileIn( __METHOD__ );
586 $i = 0;
587 $j = 0;
588
589 assert( 'count($lines) == count($changed)' );
590 $len = count( $lines );
591 $other_len = count( $other_changed );
592
593 while ( 1 ) {
594 /*
595 * Scan forwards to find beginning of another run of changes.
596 * Also keep track of the corresponding point in the other file.
597 *
598 * Throughout this code, $i and $j are adjusted together so that
599 * the first $i elements of $changed and the first $j elements
600 * of $other_changed both contain the same number of zeros
601 * (unchanged lines).
602 * Furthermore, $j is always kept so that $j == $other_len or
603 * $other_changed[$j] == false.
604 */
605 while ( $j < $other_len && $other_changed[$j] ) {
606 $j++;
607 }
608
609 while ( $i < $len && !$changed[$i] ) {
610 assert( '$j < $other_len && ! $other_changed[$j]' );
611 $i++;
612 $j++;
613 while ( $j < $other_len && $other_changed[$j] ) {
614 $j++;
615 }
616 }
617
618 if ( $i == $len ) {
619 break;
620 }
621
622 $start = $i;
623
624 // Find the end of this run of changes.
625 while ( ++$i < $len && $changed[$i] ) {
626 continue;
627 }
628
629 do {
630 /*
631 * Record the length of this run of changes, so that
632 * we can later determine whether the run has grown.
633 */
634 $runlength = $i - $start;
635
636 /*
637 * Move the changed region back, so long as the
638 * previous unchanged line matches the last changed one.
639 * This merges with previous changed regions.
640 */
641 while ( $start > 0 && $lines[$start - 1] == $lines[$i - 1] ) {
642 $changed[--$start] = 1;
643 $changed[--$i] = false;
644 while ( $start > 0 && $changed[$start - 1] ) {
645 $start--;
646 }
647 assert( '$j > 0' );
648 while ( $other_changed[--$j] ) {
649 continue;
650 }
651 assert( '$j >= 0 && !$other_changed[$j]' );
652 }
653
654 /*
655 * Set CORRESPONDING to the end of the changed run, at the last
656 * point where it corresponds to a changed run in the other file.
657 * CORRESPONDING == LEN means no such point has been found.
658 */
659 $corresponding = $j < $other_len ? $i : $len;
660
661 /*
662 * Move the changed region forward, so long as the
663 * first changed line matches the following unchanged one.
664 * This merges with following changed regions.
665 * Do this second, so that if there are no merges,
666 * the changed region is moved forward as far as possible.
667 */
668 while ( $i < $len && $lines[$start] == $lines[$i] ) {
669 $changed[$start++] = false;
670 $changed[$i++] = 1;
671 while ( $i < $len && $changed[$i] ) {
672 $i++;
673 }
674
675 assert( '$j < $other_len && ! $other_changed[$j]' );
676 $j++;
677 if ( $j < $other_len && $other_changed[$j] ) {
678 $corresponding = $i;
679 while ( $j < $other_len && $other_changed[$j] ) {
680 $j++;
681 }
682 }
683 }
684 } while ( $runlength != $i - $start );
685
686 /*
687 * If possible, move the fully-merged run of changes
688 * back to a corresponding run in the other file.
689 */
690 while ( $corresponding < $i ) {
691 $changed[--$start] = 1;
692 $changed[--$i] = 0;
693 assert( '$j > 0' );
694 while ( $other_changed[--$j] ) {
695 continue;
696 }
697 assert( '$j >= 0 && !$other_changed[$j]' );
698 }
699 }
700 wfProfileOut( __METHOD__ );
701 }
702 }
703
704 /**
705 * Class representing a 'diff' between two sequences of strings.
706 * @todo document
707 * @private
708 * @ingroup DifferenceEngine
709 */
710 class Diff {
711
712 /**
713 * @var DiffOp[]
714 */
715 public $edits;
716
717 /**
718 * Constructor.
719 * Computes diff between sequences of strings.
720 *
721 * @param string[] $from_lines An array of strings.
722 * Typically these are lines from a file.
723 * @param string[] $to_lines An array of strings.
724 */
725 public function __construct( $from_lines, $to_lines ) {
726 $eng = new DiffEngine;
727 $this->edits = $eng->diff( $from_lines, $to_lines );
728 }
729
730 /**
731 * @return DiffOp[]
732 */
733 public function getEdits() {
734 return $this->edits;
735 }
736
737 /**
738 * Compute reversed Diff.
739 *
740 * SYNOPSIS:
741 *
742 * $diff = new Diff($lines1, $lines2);
743 * $rev = $diff->reverse();
744 *
745 * @return Object A Diff object representing the inverse of the
746 * original diff.
747 */
748 public function reverse() {
749 $rev = $this;
750 $rev->edits = array();
751 /** @var DiffOp $edit */
752 foreach ( $this->edits as $edit ) {
753 $rev->edits[] = $edit->reverse();
754 }
755
756 return $rev;
757 }
758
759 /**
760 * Check for empty diff.
761 *
762 * @return bool True if two sequences were identical.
763 */
764 public function isEmpty() {
765 foreach ( $this->edits as $edit ) {
766 if ( $edit->type != 'copy' ) {
767 return false;
768 }
769 }
770
771 return true;
772 }
773
774 /**
775 * Compute the length of the Longest Common Subsequence (LCS).
776 *
777 * This is mostly for diagnostic purposed.
778 *
779 * @return int The length of the LCS.
780 */
781 public function lcs() {
782 $lcs = 0;
783 foreach ( $this->edits as $edit ) {
784 if ( $edit->type == 'copy' ) {
785 $lcs += count( $edit->orig );
786 }
787 }
788
789 return $lcs;
790 }
791
792 /**
793 * Get the original set of lines.
794 *
795 * This reconstructs the $from_lines parameter passed to the
796 * constructor.
797 *
798 * @return string[] The original sequence of strings.
799 */
800 public function orig() {
801 $lines = array();
802
803 foreach ( $this->edits as $edit ) {
804 if ( $edit->orig ) {
805 array_splice( $lines, count( $lines ), 0, $edit->orig );
806 }
807 }
808
809 return $lines;
810 }
811
812 /**
813 * Get the closing set of lines.
814 *
815 * This reconstructs the $to_lines parameter passed to the
816 * constructor.
817 *
818 * @return string[] The sequence of strings.
819 */
820 public function closing() {
821 $lines = array();
822
823 foreach ( $this->edits as $edit ) {
824 if ( $edit->closing ) {
825 array_splice( $lines, count( $lines ), 0, $edit->closing );
826 }
827 }
828
829 return $lines;
830 }
831 }
832
833 /**
834 * @todo document, bad name.
835 * @private
836 * @ingroup DifferenceEngine
837 */
838 class MappedDiff extends Diff {
839 /**
840 * Constructor.
841 *
842 * Computes diff between sequences of strings.
843 *
844 * This can be used to compute things like
845 * case-insensitve diffs, or diffs which ignore
846 * changes in white-space.
847 *
848 * @param string[] $from_lines An array of strings.
849 * Typically these are lines from a file.
850 * @param string[] $to_lines An array of strings.
851 * @param string[] $mapped_from_lines This array should
852 * have the same size number of elements as $from_lines.
853 * The elements in $mapped_from_lines and
854 * $mapped_to_lines are what is actually compared
855 * when computing the diff.
856 * @param string[] $mapped_to_lines This array should
857 * have the same number of elements as $to_lines.
858 */
859 public function __construct( $from_lines, $to_lines,
860 $mapped_from_lines, $mapped_to_lines ) {
861 wfProfileIn( __METHOD__ );
862
863 assert( 'count( $from_lines ) == count( $mapped_from_lines )' );
864 assert( 'count( $to_lines ) == count( $mapped_to_lines )' );
865
866 parent::__construct( $mapped_from_lines, $mapped_to_lines );
867
868 $xi = $yi = 0;
869 $editCount = count( $this->edits );
870 for ( $i = 0; $i < $editCount; $i++ ) {
871 $orig = &$this->edits[$i]->orig;
872 if ( is_array( $orig ) ) {
873 $orig = array_slice( $from_lines, $xi, count( $orig ) );
874 $xi += count( $orig );
875 }
876
877 $closing = &$this->edits[$i]->closing;
878 if ( is_array( $closing ) ) {
879 $closing = array_slice( $to_lines, $yi, count( $closing ) );
880 $yi += count( $closing );
881 }
882 }
883 wfProfileOut( __METHOD__ );
884 }
885 }
886
887 /**
888 * Additions by Axel Boldt follow, partly taken from diff.php, phpwiki-1.3.3
889 */
890
891 /**
892 * @todo document
893 * @private
894 * @ingroup DifferenceEngine
895 */
896 class HWLDFWordAccumulator {
897 public $insClass = ' class="diffchange diffchange-inline"';
898 public $delClass = ' class="diffchange diffchange-inline"';
899
900 private $lines = array();
901 private $line = '';
902 private $group = '';
903 private $tag = '';
904
905 /**
906 * @param string $new_tag
907 */
908 private function flushGroup( $new_tag ) {
909 if ( $this->group !== '' ) {
910 if ( $this->tag == 'ins' ) {
911 $this->line .= "<ins{$this->insClass}>" .
912 htmlspecialchars( $this->group ) . '</ins>';
913 } elseif ( $this->tag == 'del' ) {
914 $this->line .= "<del{$this->delClass}>" .
915 htmlspecialchars( $this->group ) . '</del>';
916 } else {
917 $this->line .= htmlspecialchars( $this->group );
918 }
919 }
920 $this->group = '';
921 $this->tag = $new_tag;
922 }
923
924 /**
925 * @param string $new_tag
926 */
927 private function flushLine( $new_tag ) {
928 $this->flushGroup( $new_tag );
929 if ( $this->line != '' ) {
930 array_push( $this->lines, $this->line );
931 } else {
932 # make empty lines visible by inserting an NBSP
933 array_push( $this->lines, '&#160;' );
934 }
935 $this->line = '';
936 }
937
938 /**
939 * @param string[] $words
940 * @param string $tag
941 */
942 public function addWords( $words, $tag = '' ) {
943 if ( $tag != $this->tag ) {
944 $this->flushGroup( $tag );
945 }
946
947 foreach ( $words as $word ) {
948 // new-line should only come as first char of word.
949 if ( $word == '' ) {
950 continue;
951 }
952 if ( $word[0] == "\n" ) {
953 $this->flushLine( $tag );
954 $word = substr( $word, 1 );
955 }
956 assert( '!strstr( $word, "\n" )' );
957 $this->group .= $word;
958 }
959 }
960
961 /**
962 * @return string[]
963 */
964 public function getLines() {
965 $this->flushLine( '~done' );
966
967 return $this->lines;
968 }
969 }
970
971 /**
972 * @todo document
973 * @private
974 * @ingroup DifferenceEngine
975 */
976 class WordLevelDiff extends MappedDiff {
977 const MAX_LINE_LENGTH = 10000;
978
979 /**
980 * @param string[] $orig_lines
981 * @param string[] $closing_lines
982 */
983 public function __construct( $orig_lines, $closing_lines ) {
984 wfProfileIn( __METHOD__ );
985
986 list( $orig_words, $orig_stripped ) = $this->split( $orig_lines );
987 list( $closing_words, $closing_stripped ) = $this->split( $closing_lines );
988
989 parent::__construct( $orig_words, $closing_words,
990 $orig_stripped, $closing_stripped );
991 wfProfileOut( __METHOD__ );
992 }
993
994 /**
995 * @param string[] $lines
996 *
997 * @return array[]
998 */
999 private function split( $lines ) {
1000 wfProfileIn( __METHOD__ );
1001
1002 $words = array();
1003 $stripped = array();
1004 $first = true;
1005 foreach ( $lines as $line ) {
1006 # If the line is too long, just pretend the entire line is one big word
1007 # This prevents resource exhaustion problems
1008 if ( $first ) {
1009 $first = false;
1010 } else {
1011 $words[] = "\n";
1012 $stripped[] = "\n";
1013 }
1014 if ( strlen( $line ) > self::MAX_LINE_LENGTH ) {
1015 $words[] = $line;
1016 $stripped[] = $line;
1017 } else {
1018 $m = array();
1019 if ( preg_match_all( '/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs',
1020 $line, $m )
1021 ) {
1022 foreach ( $m[0] as $word ) {
1023 $words[] = $word;
1024 }
1025 foreach ( $m[1] as $stripped_word ) {
1026 $stripped[] = $stripped_word;
1027 }
1028 }
1029 }
1030 }
1031 wfProfileOut( __METHOD__ );
1032
1033 return array( $words, $stripped );
1034 }
1035
1036 /**
1037 * @return string[]
1038 */
1039 public function orig() {
1040 wfProfileIn( __METHOD__ );
1041 $orig = new HWLDFWordAccumulator;
1042
1043 foreach ( $this->edits as $edit ) {
1044 if ( $edit->type == 'copy' ) {
1045 $orig->addWords( $edit->orig );
1046 } elseif ( $edit->orig ) {
1047 $orig->addWords( $edit->orig, 'del' );
1048 }
1049 }
1050 $lines = $orig->getLines();
1051 wfProfileOut( __METHOD__ );
1052
1053 return $lines;
1054 }
1055
1056 /**
1057 * @return string[]
1058 */
1059 public function closing() {
1060 wfProfileIn( __METHOD__ );
1061 $closing = new HWLDFWordAccumulator;
1062
1063 foreach ( $this->edits as $edit ) {
1064 if ( $edit->type == 'copy' ) {
1065 $closing->addWords( $edit->closing );
1066 } elseif ( $edit->closing ) {
1067 $closing->addWords( $edit->closing, 'ins' );
1068 }
1069 }
1070 $lines = $closing->getLines();
1071 wfProfileOut( __METHOD__ );
1072
1073 return $lines;
1074 }
1075
1076 }