72db5e132d54042c8e2fe566e5fe03b1efdf7e82
[lhc/web/wiklou.git] / includes / diff / DairikiDiff.php
1 <?php
2 /**
3 * A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
4 *
5 * Copyright © 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
6 * You may copy this code freely under the conditions of the GPL.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup DifferenceEngine
25 * @defgroup DifferenceEngine DifferenceEngine
26 */
27
/**
 * The base class for all other DiffOp classes.
 *
 * The classes that extend DiffOp are: DiffOpCopy, DiffOpDelete, DiffOpAdd and
 * DiffOpChange. FakeDiffOp also extends DiffOp, but it is not located in this file.
 *
 * Each operation carries the affected lines of the original sequence ($orig)
 * and of the closing sequence ($closing). A side that does not take part in
 * the operation is stored as false (see DiffOpAdd and DiffOpDelete).
 *
 * @private
 * @ingroup DifferenceEngine
 */
abstract class DiffOp {

	/**
	 * @var string Operation name; the subclasses in this file use
	 *  'copy', 'delete', 'add' and 'change'.
	 */
	public $type;

	/**
	 * @var string[]|false Lines on the original side, or false if there are none.
	 */
	public $orig;

	/**
	 * @var string[]|false Lines on the closing side, or false if there are none.
	 */
	public $closing;

	/**
	 * @return string
	 */
	public function getType() {
		return $this->type;
	}

	/**
	 * @return string[]|false
	 */
	public function getOrig() {
		return $this->orig;
	}

	/**
	 * Get the closing lines, or a single closing line.
	 *
	 * @param int|null $i Index of the wanted line; null (the default) returns
	 *  the complete $closing value.
	 * @return string[]|false|string|null The whole $closing value when $i is
	 *  null; otherwise the line stored at index $i, or null when there is no
	 *  such index (including when this operation has no closing lines at all).
	 */
	public function getClosing( $i = null ) {
		if ( $i === null ) {
			return $this->closing;
		}
		// DiffOpDelete stores false in $closing. array_key_exists() only
		// accepts arrays, so check the type before looking the index up.
		if ( is_array( $this->closing ) && array_key_exists( $i, $this->closing ) ) {
			return $this->closing[$i];
		}
		return null;
	}

	/**
	 * Build the operation that undoes this one.
	 */
	abstract public function reverse();

	/**
	 * @return int Number of lines on the original side (0 if there are none).
	 */
	public function norig() {
		return $this->orig ? count( $this->orig ) : 0;
	}

	/**
	 * @return int Number of lines on the closing side (0 if there are none).
	 */
	public function nclosing() {
		return $this->closing ? count( $this->closing ) : 0;
	}
}
98
/**
 * Extends DiffOp. Represents a run of lines that is carried over
 * from one string array to the other.
 *
 * @private
 * @ingroup DifferenceEngine
 */
class DiffOpCopy extends DiffOp {
	public $type = 'copy';

	/**
	 * @param string[] $orig Lines on the original side.
	 * @param string[]|bool $closing Lines on the closing side. Any value that
	 *  is not an array (such as the default) makes the closing side reuse $orig.
	 */
	public function __construct( $orig, $closing = false ) {
		$this->orig = $orig;
		$this->closing = is_array( $closing ) ? $closing : $orig;
	}

	/**
	 * Swap the two sides.
	 *
	 * @return DiffOpCopy
	 */
	public function reverse() {
		return new self( $this->closing, $this->orig );
	}
}
124
/**
 * Extends DiffOp. Represents a run of lines that exists only on the
 * original side, i.e. lines that have been deleted.
 *
 * @private
 * @ingroup DifferenceEngine
 */
class DiffOpDelete extends DiffOp {
	public $type = 'delete';

	/**
	 * @param string[] $lines The deleted lines.
	 */
	public function __construct( $lines ) {
		// A deletion has nothing on the closing side.
		$this->closing = false;
		$this->orig = $lines;
	}

	/**
	 * The inverse of deleting lines is adding those same lines.
	 *
	 * @return DiffOpAdd
	 */
	public function reverse() {
		$deleted = $this->orig;

		return new DiffOpAdd( $deleted );
	}
}
147
/**
 * Extends DiffOp. Represents a run of lines that exists only on the
 * closing side, i.e. lines that have been added.
 *
 * @private
 * @ingroup DifferenceEngine
 */
class DiffOpAdd extends DiffOp {
	public $type = 'add';

	/**
	 * @param string[] $lines The added lines.
	 */
	public function __construct( $lines ) {
		// An addition has nothing on the original side.
		$this->orig = false;
		$this->closing = $lines;
	}

	/**
	 * The inverse of adding lines is deleting those same lines.
	 *
	 * @return DiffOpDelete
	 */
	public function reverse() {
		$added = $this->closing;

		return new DiffOpDelete( $added );
	}
}
170
/**
 * Extends DiffOp. Represents a run of original lines that has been
 * replaced by a run of closing lines (a deletion and an addition in one).
 *
 * @private
 * @ingroup DifferenceEngine
 */
class DiffOpChange extends DiffOp {
	public $type = 'change';

	/**
	 * @param string[] $orig The lines that were replaced.
	 * @param string[] $closing The lines that replaced them.
	 */
	public function __construct( $orig, $closing ) {
		$this->closing = $closing;
		$this->orig = $orig;
	}

	/**
	 * Swap the two sides.
	 *
	 * @return DiffOpChange
	 */
	public function reverse() {
		return new self( $this->closing, $this->orig );
	}
}
193
/**
 * Class used internally by Diff to actually compute the diffs.
 *
 * The algorithm used here is mostly lifted from the perl module
 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
 * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
 *
 * More ideas are taken from:
 * http://www.ics.uci.edu/~eppstein/161/960229.html
 *
 * Some ideas (and a bit of code) are from analyze.c, from GNU
 * diffutils-2.7, which can be found at:
 * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
 *
 * Finally, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
 * are my own.
 *
 * Line length limits for robustness added by Tim Starling, 2005-08-31
 * Alternative implementation added by Guy Van den Broeck, 2008-07-30
 *
 * @author Geoffrey T. Dairiki, Tim Starling, Guy Van den Broeck
 * @private
 * @ingroup DifferenceEngine
 */
class DiffEngine {
	const MAX_XREF_LENGTH = 10000;

	/** Per-line "changed" flags for the from-lines; filled in by diffLocal(). */
	protected $xchanged;
	/** Per-line "changed" flags for the to-lines; filled in by diffLocal(). */
	protected $ychanged;

	// The members below are not referenced by any method in this class.
	protected $xv = [];
	protected $yv = [];
	protected $xind = [];
	protected $yind = [];

	protected $seq = [];
	protected $in_seq = [];

	protected $lcs = 0;

	/**
	 * Compute the list of edit operations that turns one sequence of
	 * lines into another.
	 *
	 * @param string[] $from_lines
	 * @param string[] $to_lines
	 *
	 * @return DiffOp[] Copy, change, delete and add operations, in the order
	 *  in which they apply to the input.
	 */
	public function diff( $from_lines, $to_lines ) {
		// Work out which lines of each side are changed.
		$this->diffLocal( $from_lines, $to_lines );

		// Slide runs of changes around so that they merge where possible.
		$this->shiftBoundaries( $from_lines, $this->xchanged, $this->ychanged );
		$this->shiftBoundaries( $to_lines, $this->ychanged, $this->xchanged );

		// Walk both sides in step and turn the flags into operations.
		$fromCount = count( $from_lines );
		$toCount = count( $to_lines );

		$ops = [];
		$x = 0;
		$y = 0;
		while ( $x < $fromCount || $y < $toCount ) {
			assert( $y < $toCount || $this->xchanged[$x] );
			assert( $x < $fromCount || $this->ychanged[$y] );

			// A stretch of lines that is unchanged on both sides.
			$same = [];
			while ( $x < $fromCount && $y < $toCount
				&& !( $this->xchanged[$x] || $this->ychanged[$y] )
			) {
				$same[] = $from_lines[$x];
				$x++;
				$y++;
			}
			if ( $same ) {
				$ops[] = new DiffOpCopy( $same );
			}

			// The changed from-lines that follow ...
			$removed = [];
			for ( ; $x < $fromCount && $this->xchanged[$x]; $x++ ) {
				$removed[] = $from_lines[$x];
			}

			// ... and the changed to-lines that follow.
			$inserted = [];
			for ( ; $y < $toCount && $this->ychanged[$y]; $y++ ) {
				$inserted[] = $to_lines[$y];
			}

			if ( $removed && $inserted ) {
				$ops[] = new DiffOpChange( $removed, $inserted );
			} elseif ( $removed ) {
				$ops[] = new DiffOpDelete( $removed );
			} elseif ( $inserted ) {
				$ops[] = new DiffOpAdd( $inserted );
			}
		}

		return $ops;
	}

	/**
	 * Run WikiDiff3 on the two sequences and keep its per-line results
	 * in $this->xchanged and $this->ychanged.
	 *
	 * @param string[] $from_lines
	 * @param string[] $to_lines
	 */
	private function diffLocal( $from_lines, $to_lines ) {
		$backend = new WikiDiff3();
		$backend->diff( $from_lines, $to_lines );
		$this->ychanged = $backend->added;
		$this->xchanged = $backend->removed;
	}

	/**
	 * Adjust inserts/deletes of identical lines to join changes
	 * as much as possible.
	 *
	 * When a run of changed lines has an identical, unchanged line next to
	 * it, either of the two identical lines may be counted as the changed
	 * one. This method uses that freedom to merge neighbouring runs and to
	 * line runs up with changed runs in the other file.
	 *
	 * This is taken from analyze.c (GNU diffutils-2.7); the order of the
	 * statements matters.
	 *
	 * @param string[] $lines Lines of the file being adjusted.
	 * @param array &$changed Per-line changed flags for $lines; updated in place.
	 * @param array $other_changed Per-line changed flags for the other file.
	 */
	private function shiftBoundaries( $lines, &$changed, $other_changed ) {
		assert( count( $lines ) == count( $changed ) );
		$total = count( $lines );
		$otherTotal = count( $other_changed );

		$i = 0;
		$j = 0;

		while ( true ) {
			/*
			 * Advance to the start of the next run of changes, keeping
			 * track of the matching position in the other file.
			 *
			 * $i and $j always move together, such that the first $i
			 * entries of $changed and the first $j entries of
			 * $other_changed hold the same number of unchanged lines.
			 * In addition, $j either equals $otherTotal or points at an
			 * unchanged entry of $other_changed.
			 */
			while ( $j < $otherTotal && $other_changed[$j] ) {
				$j++;
			}

			while ( $i < $total && !$changed[$i] ) {
				assert( $j < $otherTotal && !$other_changed[$j] );
				$i++;
				$j++;
				while ( $j < $otherTotal && $other_changed[$j] ) {
					$j++;
				}
			}

			if ( $i == $total ) {
				break;
			}

			$runStart = $i;

			// Step past the last line of this run.
			do {
				$i++;
			} while ( $i < $total && $changed[$i] );

			do {
				// Remember the current length to detect growth further down.
				$lengthBefore = $i - $runStart;

				/*
				 * Slide the run backwards for as long as the unchanged
				 * line in front of it equals its last changed line.
				 * This joins it with earlier runs.
				 */
				while ( $runStart > 0 && $lines[$runStart - 1] == $lines[$i - 1] ) {
					$runStart--;
					$changed[$runStart] = 1;
					$i--;
					$changed[$i] = false;
					while ( $runStart > 0 && $changed[$runStart - 1] ) {
						$runStart--;
					}
					assert( $j > 0 );
					do {
						$j--;
					} while ( $other_changed[$j] );
					assert( $j >= 0 && !$other_changed[$j] );
				}

				/*
				 * $anchor is the end of the run at the last position where
				 * it lines up with a changed run in the other file;
				 * $total means that no such position is known.
				 */
				if ( $j < $otherTotal ) {
					$anchor = $i;
				} else {
					$anchor = $total;
				}

				/*
				 * Slide the run forwards for as long as its first changed
				 * line equals the unchanged line behind it. This joins it
				 * with later runs. It happens after the backward pass so
				 * that a run which cannot merge ends up as far forward as
				 * possible.
				 */
				while ( $i < $total && $lines[$runStart] == $lines[$i] ) {
					$changed[$runStart] = false;
					$runStart++;
					$changed[$i] = 1;
					$i++;
					while ( $i < $total && $changed[$i] ) {
						$i++;
					}

					assert( $j < $otherTotal && !$other_changed[$j] );
					$j++;
					if ( $j < $otherTotal && $other_changed[$j] ) {
						$anchor = $i;
						while ( $j < $otherTotal && $other_changed[$j] ) {
							$j++;
						}
					}
				}
			} while ( $lengthBefore != $i - $runStart );

			/*
			 * If possible, pull the fully merged run back to the position
			 * recorded in $anchor.
			 */
			while ( $anchor < $i ) {
				$runStart--;
				$changed[$runStart] = 1;
				$i--;
				$changed[$i] = 0;
				assert( $j > 0 );
				do {
					$j--;
				} while ( $other_changed[$j] );
				assert( $j >= 0 && !$other_changed[$j] );
			}
		}
	}
}
431
/**
 * Class representing a 'diff' between two sequences of strings.
 *
 * The diff is held as a list of DiffOp objects in $edits.
 *
 * @todo document
 * @private
 * @ingroup DifferenceEngine
 */
class Diff {

	/**
	 * @var DiffOp[]
	 */
	public $edits;

	/**
	 * Constructor.
	 * Computes diff between sequences of strings.
	 *
	 * @param string[] $from_lines An array of strings.
	 *   Typically these are lines from a file.
	 * @param string[] $to_lines An array of strings.
	 */
	public function __construct( $from_lines, $to_lines ) {
		$eng = new DiffEngine;
		$this->edits = $eng->diff( $from_lines, $to_lines );
	}

	/**
	 * @return DiffOp[]
	 */
	public function getEdits() {
		return $this->edits;
	}

	/**
	 * Compute reversed Diff.
	 *
	 * SYNOPSIS:
	 *
	 *   $diff = new Diff($lines1, $lines2);
	 *   $rev = $diff->reverse();
	 *
	 * This object is left untouched; the result is a separate object.
	 *
	 * @return Diff A Diff object representing the inverse of the
	 *   original diff.
	 */
	public function reverse() {
		// Objects are handled by reference, so a plain "$rev = $this" would
		// make $rev the very same object, and emptying its edits would throw
		// our own edits away before the loop below ever sees them.
		$rev = clone $this;
		$rev->edits = [];
		/** @var DiffOp $edit */
		foreach ( $this->edits as $edit ) {
			$rev->edits[] = $edit->reverse();
		}

		return $rev;
	}

	/**
	 * Check for empty diff.
	 *
	 * @return bool True if two sequences were identical, i.e. if every
	 *   edit is of type 'copy'.
	 */
	public function isEmpty() {
		foreach ( $this->edits as $edit ) {
			if ( $edit->type != 'copy' ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Compute the length of the Longest Common Subsequence (LCS).
	 *
	 * This is mostly for diagnostic purposes.
	 *
	 * @return int The length of the LCS, counted as the number of lines
	 *   covered by 'copy' edits.
	 */
	public function lcs() {
		$lcs = 0;
		foreach ( $this->edits as $edit ) {
			if ( $edit->type == 'copy' ) {
				$lcs += count( $edit->orig );
			}
		}

		return $lcs;
	}

	/**
	 * Get the original set of lines.
	 *
	 * This reconstructs the $from_lines parameter passed to the
	 * constructor.
	 *
	 * @return string[] The original sequence of strings.
	 */
	public function orig() {
		$lines = [];

		foreach ( $this->edits as $edit ) {
			// Edits without an original side carry a falsy $orig.
			if ( $edit->orig ) {
				array_splice( $lines, count( $lines ), 0, $edit->orig );
			}
		}

		return $lines;
	}

	/**
	 * Get the closing set of lines.
	 *
	 * This reconstructs the $to_lines parameter passed to the
	 * constructor.
	 *
	 * @return string[] The sequence of strings.
	 */
	public function closing() {
		$lines = [];

		foreach ( $this->edits as $edit ) {
			// Edits without a closing side carry a falsy $closing.
			if ( $edit->closing ) {
				array_splice( $lines, count( $lines ), 0, $edit->closing );
			}
		}

		return $lines;
	}
}
560
/**
 * Old name of MediaWiki\Diff\WordAccumulator, kept as an empty subclass.
 *
 * @deprecated Alias for WordAccumulator, to be soon removed
 */
class HWLDFWordAccumulator extends \MediaWiki\Diff\WordAccumulator {
}