2 /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * or see http://www.gnu.org/
21 * This diff implementation is mainly lifted from the LCS algorithm of the Eclipse project which
22 * in turn is based on Myers' "An O(ND) difference algorithm and its variations"
23 * (http://citeseer.ist.psu.edu/myers86ond.html) with range compression (see Wu et al.'s
24 * "An O(NP) Sequence Comparison Algorithm").
26 * This implementation supports an upper bound on the excution time.
28 * Complexity: O((M + N)D) worst case time, O(M + N + D^2) expected time, O(M + N) space
30 * @author Guy Van den Broeck
31 * @ingroup DifferenceEngine
45 private $maxDifferences;
46 private $lcsLengthCorrectedForHeuristic = false;
52 public $heuristicUsed;
54 function __construct($tooLong = 2000000, $powLimit = 1.45){
55 $this->tooLong
= $tooLong;
56 $this->powLimit
= $powLimit;
59 public function diff(/*array*/ $from, /*array*/ $to){
60 //remember initial lengths
64 $this->heuristicUsed
= false;
67 $removed = $m > 0 ?
array_fill(0, $m, true) : array();
68 $added = $n > 0 ?
array_fill(0, $n, true) : array();
70 //reduce the complexity for the next step (intentionally done twice)
71 //remove common tokens at the start
73 while($i < $m && $i < $n && $from[$i] === $to[$i]) {
74 $removed[$i] = $added[$i] = false;
75 unset($from[$i], $to[$i]);
79 //remove common tokens at the end
81 while($i +
$j <= $m && $i +
$j <= $n && $from[$m - $j] === $to[$n - $j]) {
82 $removed[$m - $j] = $added[$n - $j] = false;
83 unset($from[$m - $j], $to[$n - $j]);
87 $this->from
= $newFromIndex = $this->to
= $newToIndex = array();
89 //remove tokens not in both sequences
91 foreach( $from as $key ) {
92 $shared[$key] = false;
95 foreach($to as $index => &$el) {
96 if(array_key_exists($el, $shared)) {
100 $newToIndex[] = $index;
103 foreach($from as $index => &$el) {
107 $newFromIndex[] = $index;
111 unset($shared, $from, $to);
113 $this->m
= count($this->from
);
114 $this->n
= count($this->to
);
116 $this->removed
= $this->m
> 0 ?
array_fill(0, $this->m
, true) : array();
117 $this->added
= $this->n
> 0 ?
array_fill(0, $this->n
, true) : array();
119 if ($this->m
== 0 ||
$this->n
== 0) {
122 $this->maxDifferences
= ceil(($this->m +
$this->n
) / 2.0);
123 if ($this->m
* $this->n
> $this->tooLong
) {
124 // limit complexity to D^POW_LIMIT for long sequences
125 $this->maxDifferences
= floor(pow($this->maxDifferences
, $this->powLimit
- 1.0));
126 wfDebug("Limiting max number of differences to $this->maxDifferences\n");
130 * The common prefixes and suffixes are always part of some LCS, include
131 * them now to reduce our search space
133 $max = min($this->m
, $this->n
);
134 for ($forwardBound = 0; $forwardBound < $max
135 && $this->from
[$forwardBound] === $this->to
[$forwardBound];
137 $this->removed
[$forwardBound] = $this->added
[$forwardBound] = false;
140 $backBoundL1 = $this->m
- 1;
141 $backBoundL2 = $this->n
- 1;
143 while ($backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
144 && $this->from
[$backBoundL1] === $this->to
[$backBoundL2]) {
145 $this->removed
[$backBoundL1--] = $this->added
[$backBoundL2--] = false;
148 $temp = array_fill(0, $this->m +
$this->n +
1, 0);
149 $V = array($temp, $temp);
150 $snake = array(0, 0, 0);
152 $this->length
= $forwardBound +
$this->m
- $backBoundL1 - 1
153 +
$this->lcs_rec($forwardBound, $backBoundL1,
154 $forwardBound, $backBoundL2, $V, $snake);
160 $this->length +
= $i +
$j - 1;
162 foreach($this->removed
as $key => &$removed_elem) {
164 $removed[$newFromIndex[$key]] = false;
167 foreach($this->added
as $key => &$added_elem) {
169 $added[$newToIndex[$key]] = false;
172 $this->removed
= $removed;
173 $this->added
= $added;
176 function diff_range($from_lines, $to_lines) {
177 // Diff and store locally
178 $this->diff($from_lines, $to_lines);
179 unset($from_lines, $to_lines);
183 while ($xi < $this->m ||
$yi < $this->n
) {
185 while ($xi < $this->m
&& $yi < $this->n
186 && !$this->removed
[$xi]
187 && !$this->added
[$yi]) {
191 // Find deletes & adds.
193 while ($xi < $this->m
&& $this->removed
[$xi]) {
198 while ($yi < $this->n
&& $this->added
[$yi]) {
202 if ($xi > $xstart ||
$yi > $ystart) {
203 $ranges[] = new RangeDifference($xstart, $xi,
210 private function lcs_rec($bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake) {
211 // check that both sequences are non-empty
212 if ($bottoml1 > $topl1 ||
$bottoml2 > $topl2) {
216 $d = $this->find_middle_snake($bottoml1, $topl1, $bottoml2,
219 // need to store these so we don't lose them when they're
220 // overwritten by the recursion
225 // the middle snake is part of the LCS, store it
226 for ($i = 0; $i < $len; ++
$i) {
227 $this->removed
[$startx +
$i] = $this->added
[$starty +
$i] = false;
232 +
$this->lcs_rec($bottoml1, $startx - 1, $bottoml2,
233 $starty - 1, $V, $snake)
234 +
$this->lcs_rec($startx +
$len, $topl1, $starty +
$len,
236 } else if ($d == 1) {
238 * In this case the sequences differ by exactly 1 line. We have
239 * already saved all the lines after the difference in the for loop
240 * above, now we need to save all the lines before the difference.
242 $max = min($startx - $bottoml1, $starty - $bottoml2);
243 for ($i = 0; $i < $max; ++
$i) {
244 $this->removed
[$bottoml1 +
$i] =
245 $this->added
[$bottoml2 +
$i] = false;
252 private function find_middle_snake($bottoml1, $topl1, $bottoml2,$topl2, &$V, &$snake) {
253 $from = &$this->from
;
257 $snake0 = &$snake[0];
258 $snake1 = &$snake[1];
259 $snake2 = &$snake[2];
260 $bottoml1_min_1 = $bottoml1-1;
261 $bottoml2_min_1 = $bottoml2-1;
262 $N = $topl1 - $bottoml1_min_1;
263 $M = $topl2 - $bottoml2_min_1;
265 $maxabsx = $N+
$bottoml1;
266 $maxabsy = $M+
$bottoml2;
267 $limit = min($this->maxDifferences
, ceil(($N +
$M ) / 2));
269 //value_to_add_forward: a 0 or 1 that we add to the start
270 // offset to make it odd/even
272 $value_to_add_forward = 1;
274 $value_to_add_forward = 0;
278 $value_to_add_backward = 1;
280 $value_to_add_backward = 0;
283 $start_forward = -$M;
285 $start_backward = -$N;
288 $limit_min_1 = $limit - 1;
289 $limit_plus_1 = $limit +
1;
291 $V0[$limit_plus_1] = 0;
292 $V1[$limit_min_1] = $N;
293 $limit = min($this->maxDifferences
, ceil(($N +
$M ) / 2));
295 if (($delta & 1) == 1) {
296 for ($d = 0; $d <= $limit; ++
$d) {
297 $start_diag = max($value_to_add_forward +
$start_forward, -$d);
298 $end_diag = min($end_forward, $d);
299 $value_to_add_forward = 1 - $value_to_add_forward;
301 // compute forward furthest reaching paths
302 for ($k = $start_diag; $k <= $end_diag; $k +
= 2) {
303 if ($k == -$d ||
($k < $d
304 && $V0[$limit_min_1 +
$k] < $V0[$limit_plus_1 +
$k])) {
305 $x = $V0[$limit_plus_1 +
$k];
307 $x = $V0[$limit_min_1 +
$k] +
1;
310 $absx = $snake0 = $x +
$bottoml1;
311 $absy = $snake1 = $x - $k +
$bottoml2;
313 while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) {
317 $x = $absx-$bottoml1;
319 $snake2 = $absx -$snake0;
320 $V0[$limit +
$k] = $x;
321 if ($k >= $delta - $d +
1 && $k <= $delta +
$d - 1
322 && $x >= $V1[$limit +
$k - $delta]) {
326 // check to see if we can cut down the diagonal range
327 if ($x >= $N && $end_forward > $k - 1) {
328 $end_forward = $k - 1;
329 } else if ($absy - $bottoml2 >= $M) {
330 $start_forward = $k +
1;
331 $value_to_add_forward = 0;
335 $start_diag = max($value_to_add_backward +
$start_backward, -$d);
336 $end_diag = min($end_backward, $d);
337 $value_to_add_backward = 1 - $value_to_add_backward;
339 // compute backward furthest reaching paths
340 for ($k = $start_diag; $k <= $end_diag; $k +
= 2) {
342 ||
($k != -$d && $V1[$limit_min_1 +
$k] < $V1[$limit_plus_1 +
$k])) {
343 $x = $V1[$limit_min_1 +
$k];
345 $x = $V1[$limit_plus_1 +
$k] - 1;
348 $y = $x - $k - $delta;
351 while ($x > 0 && $y > 0
352 && $from[$x +
$bottoml1_min_1] === $to[$y +
$bottoml2_min_1]) {
357 $V1[$limit +
$k] = $x;
359 // check to see if we can cut down our diagonal range
361 $start_backward = $k +
1;
362 $value_to_add_backward = 0;
363 } else if ($y <= 0 && $end_backward > $k - 1) {
364 $end_backward = $k - 1;
369 for ($d = 0; $d <= $limit; ++
$d) {
370 $start_diag = max($value_to_add_forward +
$start_forward, -$d);
371 $end_diag = min($end_forward, $d);
372 $value_to_add_forward = 1 - $value_to_add_forward;
374 // compute forward furthest reaching paths
375 for ($k = $start_diag; $k <= $end_diag; $k +
= 2) {
377 ||
($k < $d && $V0[$limit_min_1 +
$k] < $V0[$limit_plus_1 +
$k])) {
378 $x = $V0[$limit_plus_1 +
$k];
380 $x = $V0[$limit_min_1 +
$k] +
1;
383 $absx = $snake0 = $x +
$bottoml1;
384 $absy = $snake1 = $x - $k +
$bottoml2;
386 while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) {
390 $x = $absx-$bottoml1;
391 $snake2 = $absx -$snake0;
392 $V0[$limit +
$k] = $x;
394 // check to see if we can cut down the diagonal range
395 if ($x >= $N && $end_forward > $k - 1) {
396 $end_forward = $k - 1;
397 } else if ($absy-$bottoml2 >= $M) {
398 $start_forward = $k +
1;
399 $value_to_add_forward = 0;
403 $start_diag = max($value_to_add_backward +
$start_backward, -$d);
404 $end_diag = min($end_backward, $d);
405 $value_to_add_backward = 1 - $value_to_add_backward;
407 // compute backward furthest reaching paths
408 for ($k = $start_diag; $k <= $end_diag; $k +
= 2) {
410 ||
($k != -$d && $V1[$limit_min_1 +
$k] < $V1[$limit_plus_1 +
$k])) {
411 $x = $V1[$limit_min_1 +
$k];
413 $x = $V1[$limit_plus_1 +
$k] - 1;
416 $y = $x - $k - $delta;
419 while ($x > 0 && $y > 0
420 && $from[$x +
$bottoml1_min_1] === $to[$y +
$bottoml2_min_1]) {
425 $V1[$limit +
$k] = $x;
427 if ($k >= -$delta - $d && $k <= $d - $delta
428 && $x <= $V0[$limit +
$k +
$delta]) {
429 $snake0 = $bottoml1 +
$x;
430 $snake1 = $bottoml2 +
$y;
434 // check to see if we can cut down our diagonal range
436 $start_backward = $k +
1;
437 $value_to_add_backward = 0;
438 } else if ($y <= 0 && $end_backward > $k - 1) {
439 $end_backward = $k - 1;
445 * computing the true LCS is too expensive, instead find the diagonal
446 * with the most progress and pretend a midle snake of length 0 occurs
450 $most_progress = self
::findMostProgress($M, $N, $limit, $V);
452 $snake0 = $bottoml1 +
$most_progress[0];
453 $snake1 = $bottoml2 +
$most_progress[1];
455 wfDebug("Computing the LCS is too expensive. Using a heuristic.\n");
456 $this->heuristicUsed
= true;
458 * HACK: since we didn't really finish the LCS computation
459 * we don't really know the length of the SES. We don't do
460 * anything with the result anyway, unless it's <=1. We know
461 * for a fact SES > 1 so 5 is as good a number as any to
466 private static function findMostProgress($M, $N, $limit, $V) {
469 if (($M & 1) == ($limit & 1)) {
470 $forward_start_diag = max(-$M, -$limit);
472 $forward_start_diag = max(1 - $M, -$limit);
475 $forward_end_diag = min($N, $limit);
477 if (($N & 1) == ($limit & 1)) {
478 $backward_start_diag = max(-$N, -$limit);
480 $backward_start_diag = max(1 - $N, -$limit);
483 $backward_end_diag = -min($M, $limit);
485 $temp = array(0, 0, 0);
488 $max_progress = array_fill(0, ceil(max($forward_end_diag - $forward_start_diag,
489 $backward_end_diag - $backward_start_diag) / 2), $temp);
490 $num_progress = 0; // the 1st entry is current, it is initialized
493 // first search the forward diagonals
494 for ($k = $forward_start_diag; $k <= $forward_end_diag; $k +
= 2) {
495 $x = $V[0][$limit +
$k];
497 if ($x > $N ||
$y > $M) {
502 if ($progress > $max_progress[0][2]) {
504 $max_progress[0][0] = $x;
505 $max_progress[0][1] = $y;
506 $max_progress[0][2] = $progress;
507 } else if ($progress == $max_progress[0][2]) {
509 $max_progress[$num_progress][0] = $x;
510 $max_progress[$num_progress][1] = $y;
511 $max_progress[$num_progress][2] = $progress;
515 $max_progress_forward = true; // initially the maximum
516 // progress is in the forward
519 // now search the backward diagonals
520 for ($k = $backward_start_diag; $k <= $backward_end_diag; $k +
= 2) {
521 $x = $V[1][$limit +
$k];
522 $y = $x - $k - $delta;
523 if ($x < 0 ||
$y < 0) {
527 $progress = $N - $x +
$M - $y;
528 if ($progress > $max_progress[0][2]) {
530 $max_progress_forward = false;
531 $max_progress[0][0] = $x;
532 $max_progress[0][1] = $y;
533 $max_progress[0][2] = $progress;
534 } else if ($progress == $max_progress[0][2] && !$max_progress_forward) {
536 $max_progress[$num_progress][0] = $x;
537 $max_progress[$num_progress][1] = $y;
538 $max_progress[$num_progress][2] = $progress;
542 // return the middle diagonal with maximal progress.
543 return $max_progress[floor($num_progress / 2)];
546 public function getLcsLength(){
547 if($this->heuristicUsed
&& !$this->lcsLengthCorrectedForHeuristic
){
548 $this->lcsLengthCorrectedForHeuristic
= true;
549 $this->length
= $this->m
-array_sum($this->added
);
551 return $this->length
;
557 * Alternative representation of a set of changes, by the index
558 * ranges that are changed.
560 * @ingroup DifferenceEngine
562 class RangeDifference
{
572 function __construct($leftstart, $leftend, $rightstart, $rightend){
573 $this->leftstart
= $leftstart;
574 $this->leftend
= $leftend;
575 $this->leftlength
= $leftend - $leftstart;
576 $this->rightstart
= $rightstart;
577 $this->rightend
= $rightend;
578 $this->rightlength
= $rightend - $rightstart;