2 /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * or see http://www.gnu.org/
21 * Implementation of the Diff algorithm.
23 * DO NOT USE ON PRODUCTION SYSTEMS
25 * The algorithm is based on the O(NP) LCS algorithm as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
26 * and extended with my own ideas.
28 * @return array($from_removed, $to_added)
29 * TRUE if the token was removed or added.
31 * @author Guy Van den Broeck
33 function wikidiff3_diff( /*array*/ $from, /*array*/ $to, $boundRunningTime=FALSE, $max_NP_before_bound = 800000){
34 wfProfileIn( __METHOD__
);
39 $result_from = array_fill(0,$m,TRUE);
40 $result_to = array_fill(0,$n,TRUE);
42 //reduce the complexity for the next step (intentionally done twice)
43 //remove common tokens at the start
45 while($i<$m && $i<$n && $from[$i]===$to[$i]){
46 $result_from[$i] = $result_to[$i] = FALSE;
47 unset($from[$i],$to[$i]);
51 //remove common tokens at the end
53 while($i+
$j<=$m && $i+
$j<=$n && $from[$m-$j]===$to[$n-$j]){
54 $result_from[$m-$j] = $result_to[$n-$j] = FALSE;
55 unset($from[$m-$j],$to[$n-$j]);
59 $newFrom = $newFromIndex = $newTo = $newToIndex = array();
61 //remove tokens not in both sequences
62 $shared = array_fill_keys($from,FALSE);
63 foreach($to as $index => $el){
64 if(array_key_exists($el,$shared)){
68 $newToIndex[] = $index;
71 foreach($from as $index => $el){
75 $newFromIndex[] = $index;
79 unset($from, $to, $shared);
81 $m2 = sizeof($newFrom);
83 $offsetx = $offsety = 0;
84 $from_inLcs = new InLcs($m2);
85 $to_inLcs = new InLcs($n2);
87 wikidiff3_diffPart($newFrom, $newTo, $from_inLcs, $to_inLcs, $m2, $n2, $offsetx, $offsety, $m2, $boundRunningTime, $max_NP_before_bound);
89 foreach($from_inLcs->inLcs
as $key => $in){
91 $result_from[$newFromIndex[$key]] = FALSE;
94 foreach($to_inLcs->inLcs
as $key => $in){
96 $result_to[$newToIndex[$key]] = FALSE;
100 wfProfileOut( __METHOD__
);
101 return array($result_from, $result_to);
104 function wikidiff3_diffPart( /*array*/ $a, /*array*/ $b, InLcs
$a_inLcs, InLcs
$b_inLcs, $m, $n, $offsetx, $offsety, $bestKnownLcs, $boundRunningTime=FALSE, $max_NP_before_bound = 800000){
105 if($bestKnownLcs==0 ||
$m==0 ||
$n==0){
110 return wikidiff3_diffPart($b, $a, $b_inLcs, $a_inLcs, $n, $m, $offsety, $offsetx, $bestKnownLcs, $boundRunningTime, $max_NP_before_bound);
113 $a_inLcs_sym = &$a_inLcs->inLcs
;
114 $b_inLcs_sym = &$b_inLcs->inLcs
;
116 //remove common tokens at the start
117 while($m>0 && $n>0 && $a[0]===$b[0]){
118 $a_inLcs_sym[$offsetx] = $b_inLcs_sym[$offsety] = TRUE;
119 ++
$offsetx; ++
$offsety;
126 //remove common tokens at the end
127 while($m>0 && $n>0 && $a[$m-1]===$b[$n-1]){
128 $a_inLcs_sym[$offsetx+
$m-1] = $b_inLcs_sym[$offsety+
$n-1] = TRUE;
135 if($bestKnownLcs==0 ||
$m==0 ||
$n==0){
139 return wikidiff3_diffPart($b, $a, $b_inLcs, $a_inLcs, $n, $m, $offsety, $offsetx, $bestKnownLcs, $boundRunningTime, $max_NP_before_bound);
143 $delta_plus_1 = $delta+
1;
144 $delta_min_1 = $delta-1;
146 $fpForw = $snakeBeginForw = $snakeEndForw = array_fill( 0, $delta_plus_1, -1);
147 $lcsSizeForw = $snakekForw = $lcsSizeBack = $snakekBack = array_fill( 0, $delta_plus_1, 0);
148 $fpBack = $snakeBeginBack = $snakeEndBack = array_fill( 0, $delta_plus_1, $n);
149 $overlap = $delta>1 ?
array_fill( 1, $delta_min_1, FALSE) : array();
150 $bestLcsLength = $bestLcsLengthTop = $bestLcsLengthBottom = -1;
152 if($boundRunningTime){
153 $maxp_before_bound = max((-$delta+
sqrt($delta*$delta+
($max_NP_before_bound<<2)))>>1,2);
154 if($maxp_before_bound>=$m){
155 $boundRunningTime = false;
156 unset($maxp_before_bound);
162 $maxp=$m_min_1-$bestLcsLength;
166 if($boundRunningTime && $p>$maxp_before_bound){
167 // bound the running time by stopping early
168 if($bestLcsLength>=0){
169 // accept the best LCS thusfar
172 $bestLcsProgressForw = $bestkForw = 0;
173 foreach($lcsSizeForw as $k => $localLcsProgress){
174 if($localLcsProgress>$bestLcsProgressForw ||
($localLcsProgress==$bestLcsProgressForw && $bestkForw > $k)){
175 $bestLcsProgressForw = $localLcsProgress;
179 $bestLcsProgressBack = $bestkBack = 0;
180 foreach($lcsSizeBack as $k => $localLcsProgress){
181 if($localLcsProgress>$bestLcsProgressBack ||
($localLcsProgress==$bestLcsProgressBack && $bestkBack < $k)){
182 $bestLcsProgressBack = $localLcsProgress;
186 if($fpForw[$bestkForw]-$bestkForw>$fpBack[$bestkBack]-$bestkBack){
187 // This is hard, maybe try some more? Can this even happen? A solution will be found soon anyway.
190 $topSnakeStart = $snakeBeginForw[$bestkForw];
191 $topSnakeEnd = $snakeEndForw[$bestkForw];
192 $topSnakek = $snakekForw[$bestkForw];
193 $bestLcsLengthTop = $lcsSizeForw[$bestkBack] +
$topSnakeStart - $topSnakeEnd;
195 $bottomSnakeStart = $snakeEndBack[$bestkBack]+
1;
196 $bottomSnakeEnd = $snakeBeginBack[$bestkBack]+
1;
197 $bottomSnakek = $snakekBack[$bestkBack];
198 $bestLcsLengthBottom = $lcsSizeBack[$bestkBack] +
$bottomSnakeStart - $bottomSnakeEnd;
200 if($bottomSnakeStart-$topSnakeEnd>$n*0.6 && ($bottomSnakeStart-$bottomSnakek)-($topSnakeEnd-$topSnakek)>$m*0.6){
201 // cut the sequence from both sides (there isn't enough progress, 60% of the sequence is not covered)
202 // also diff the middle now
203 if($bottomSnakeEnd>($fpForw[$bestkForw]>>1)){
204 // do the middle diff between the snakes
205 wfDebug(" Limiting diff run time from both sides, middle between snakes\n");
206 $m_t = ($bottomSnakeStart-$bottomSnakek)-($topSnakeEnd-$topSnakek);
207 $n_t = $bottomSnakeStart-$topSnakeEnd;
208 $a_t = array_slice($a, $topSnakeEnd-$topSnakek, $m_t);
209 $b_t = array_slice($b, $topSnakeEnd, $n_t);
210 $offsetx_t = $offsetx+
($topSnakeEnd-$topSnakek);
211 $offsety_t = $offsety+
$topSnakeEnd;
213 // the snake is too early in the sequence, do the middle diff between endpoints of progress
214 wfDebug(" Limiting diff run time from both sides, middle between endpoints\n");
215 $m_t = ($fpBack[$bestkBack]+
1-$bestkBack)-($fpForw[$bestkForw]-$bestkForw);
216 $n_t = $fpBack[$bestkBack]+
1-$fpForw[$bestkForw];
217 $a_t = array_slice($a, $fpForw[$bestkForw]-$bestkForw, $m_t);
218 $b_t = array_slice($b, $fpForw[$bestkForw], $n_t);
219 $offsetx_t = $offsetx+
($fpForw[$bestkForw]-$bestkForw);
220 $offsety_t = $offsety+
$fpForw[$bestkForw];
222 wikidiff3_diffPart($a_t, $b_t, $a_inLcs, $b_inLcs, $m_t, $n_t, $offsetx_t, $offsety_t, $m_t, $boundRunningTime, $max_NP_before_bound);
223 }else if(min($n-$bottomSnakeStart, $m-($bottomSnakeStart-$bottomSnakek))>min($topSnakeEnd, $topSnakeEnd-$topSnakek)){
224 // the bottom snake is more interesting
225 wfDebug("Limiting diff run time from bottom side\n");
226 $topSnakeStart = $bottomSnakeStart;
227 $topSnakeEnd = $bottomSnakeEnd;
228 $topSnakek = $bottomSnakek;
229 $bestLcsLengthTop = $topSnakeEnd-$topSnakek;
230 $bottomSnakeStart = $bottomSnakeEnd;
232 // the top snake is more interesting
233 wfDebug(" Limiting diff run time from top side\n");
234 $bottomSnakeStart = $topSnakeEnd;
235 $bottomSnakeEnd = $topSnakeEnd;
236 $bottomSnakek = $topSnakek;
237 $bestLcsLengthBottom = $m-($bottomSnakeEnd-$bottomSnakek);
244 $overlap[-$p] = $overlap[$delta+
$p] = FALSE;
246 $min_p_min_1 = -$p-1;
247 $delta_plus_1_plus_p = $delta_plus_1+
$p;
250 $fpForw[$min_p_min_1] = $snakeEndForw[$min_p_min_1] = $snakekForw[$min_p_min_1] = $snakeBeginForw[$min_p_min_1] = $fpForw[$delta_plus_1_plus_p] =
251 $snakeBeginForw[$delta_plus_1_plus_p] = $snakeEndForw[$delta_plus_1_plus_p] = $snakekForw[$delta_plus_1_plus_p] = -1;
252 $lcsSizeForw[$min_p_min_1] = $lcsSizeForw[$delta_plus_1_plus_p] = 0;
259 $fpBelow = $fpForw[$k_min_1]+
1;
260 $fpAbove = $fpForw[$k_plus_1];
262 if($fpBelow>$fpAbove){
263 $oldy = $y = $fpBelow;
264 $lcsSizeForw[$k] = $lcsSizeForw[$k_min_1];
265 $snakeBeginForw[$k] = $snakeBeginForw[$k_min_1];
266 $snakeEndForw[$k] = $snakeEndForw[$k_min_1];
267 $snakekForw[$k] = $snakekForw[$k_min_1];
269 $oldy = $y = $fpAbove;
270 $lcsSizeForw[$k] = $lcsSizeForw[$k_plus_1];
271 $snakeBeginForw[$k] = $snakeBeginForw[$k_plus_1];
272 $snakeEndForw[$k] = $snakeEndForw[$k_plus_1];
273 $snakekForw[$k] = $snakekForw[$k_plus_1];
276 while($x < $m && $y < $n && $a[$x]===$b[$y]){
281 $lcsSizeForw[$k] +
= $y-$oldy;
282 $snakeBeginForw[$k] = $oldy;
283 $snakeEndForw[$k]= $y;
284 $snakekForw[$k] = $k;
286 if($y>=$fpBack[$k] && !$overlap[$k]){
289 $lcsLength = $lcsSizeForw[$k]+
$lcsSizeBack[$k];
290 if($y>$fpBack[$k]+
1){
291 $snakeoverlap = $y-$fpBack[$k]-1;
292 $lcsLength -= $snakeoverlap;
294 if($lcsLength>$bestLcsLength){
295 // a better sequence found!
296 $bestLcsLength = $lcsLength;
298 $topSnakeStart = $snakeBeginForw[$k];
299 $topSnakeEnd = $snakeEndForw[$k];
300 $topSnakek = $snakekForw[$k];
302 // aligned snakes bite (\ )
304 $bottomSnakeStart = max($snakeEndBack[$k]+
1, $topSnakeEnd+
max(0,$snakekBack[$k]-$snakekForw[$k]));
305 $bottomSnakeEnd = max($snakeBeginBack[$k]+
1, $bottomSnakeStart);
306 $bottomSnakek = $snakekBack[$k];
308 if($bottomSnakeEnd<$y){
309 $bottomSnakeStart = $y;
310 $bottomSnakeEnd = $y;
314 $bestLcsLengthTop = $lcsSizeForw[$k]-($snakeEndForw[$k]-$snakeBeginForw[$k]);
315 $bestLcsLengthBottom = $lcsSizeBack[$k]-($snakeBeginBack[$k]-$snakeEndBack[$k]);
316 if($bestKnownLcs==$lcsLength){
320 $maxp=$m_min_1-$bestLcsLength;
327 }else if($k==$delta_min_1){
335 $fpBack[$min_p_min_1] = $snakeBeginBack[$min_p_min_1] = $snakeEndBack[$min_p_min_1] = $snakekBack[$min_p_min_1] =
336 $fpBack[$delta_plus_1_plus_p] = $snakeBeginBack[$delta_plus_1_plus_p] = $snakeEndBack[$delta_plus_1_plus_p] = $snakekBack[$delta_plus_1_plus_p] = $n;
337 $lcsSizeBack[$min_p_min_1] = $lcsSizeBack[$delta_plus_1_plus_p] = 0;
344 $fpBelow = $fpBack[$k_min_1];
345 $fpAbove = $fpBack[$k_plus_1]-1;
347 if($fpBelow<$fpAbove){
348 $oldy = $y = $fpBelow;
349 $lcsSizeBack[$k] = $lcsSizeBack[$k_min_1];
350 $snakeBeginBack[$k] = $snakeBeginBack[$k_min_1];
351 $snakeEndBack[$k] = $snakeEndBack[$k_min_1];
352 $snakekBack[$k] = $snakekBack[$k_min_1];
354 $oldy = $y = $fpAbove;
355 $lcsSizeBack[$k] = $lcsSizeBack[$k_plus_1];
356 $snakeBeginBack[$k] = $snakeBeginBack[$k_plus_1];
357 $snakeEndBack[$k] = $snakeEndBack[$k_plus_1];
358 $snakekBack[$k] = $snakekBack[$k_plus_1];
361 while($x > -1 && $y > -1 && $a[$x]===$b[$y]){
366 $lcsSizeBack[$k] +
= $oldy-$y;
367 $snakeBeginBack[$k] = $oldy;
368 $snakeEndBack[$k] = $y;
369 $snakekBack[$k] = $k;
371 if($fpForw[$k]>=$y && !$overlap[$k]){
374 $lcsLength = $lcsSizeForw[$k]+
$lcsSizeBack[$k];
375 if($fpForw[$k]>$y+
1){
376 $snakeoverlap = $fpForw[$k]-$y-1;
377 $lcsLength -= $snakeoverlap;
379 if($lcsLength>$bestLcsLength){
380 // a better sequence found!
381 $bestLcsLength = $lcsLength;
383 $topSnakeStart = $snakeBeginForw[$k];
384 $topSnakeEnd = $snakeEndForw[$k];
385 $topSnakek = $snakekForw[$k];
387 // aligned snakes bite (\ )
389 $bottomSnakeStart = max($snakeEndBack[$k]+
1, $topSnakeEnd+
max(0,$snakekBack[$k]-$snakekForw[$k]));
390 $bottomSnakeEnd = max($snakeBeginBack[$k]+
1, $bottomSnakeStart);
391 $bottomSnakek = $snakekBack[$k];
393 if($bottomSnakeEnd<$fpForw[$k]){
394 $bottomSnakeStart = $fpForw[$k];
395 $bottomSnakeEnd = $fpForw[$k];
399 $bestLcsLengthTop = $lcsSizeForw[$k]-($snakeEndForw[$k]-$snakeBeginForw[$k]);
400 $bestLcsLengthBottom = $lcsSizeBack[$k]-($snakeBeginBack[$k]-$snakeEndBack[$k]);
401 if($bestKnownLcs==$lcsLength){
405 $maxp=$m_min_1-$bestLcsLength;
420 unset($fpForw, $fpBack, $lcsSizeForw, $lcsSizeBack);
421 unset($snakeBeginForw, $snakeBeginBack, $snakeEndForw, $snakeEndBack, $snakekForw, $snakekBack);
424 // Mark snakes as in LCS
425 $maxi = $offsetx+
$topSnakeEnd-$topSnakek;
426 for($i=$offsetx+
$topSnakeStart-$topSnakek;$i<$maxi;++
$i){
427 $a_inLcs_sym[$i] = TRUE;
429 $maxi = $offsety+
$topSnakeEnd;
430 for($i=$offsety+
$topSnakeStart;$i<$maxi;++
$i){
431 $b_inLcs_sym[$i] = TRUE;
433 $maxi = $offsetx+
$bottomSnakeEnd-$bottomSnakek;
434 for($i=$offsetx+
$bottomSnakeStart-$bottomSnakek;$i<$maxi;++
$i){
435 $a_inLcs_sym[$i] = TRUE;
437 $maxi = $offsety+
$bottomSnakeEnd;
438 for($i=$offsety+
$bottomSnakeStart;$i<$maxi;++
$i){
439 $b_inLcs_sym[$i] = TRUE;
442 $m_t = $topSnakeStart-$topSnakek;
443 $a_t = array_slice($a, 0, $m_t);
444 $b_t = array_slice($b, 0, $topSnakeStart);
446 $m_b = $m-($bottomSnakeEnd-$bottomSnakek);
447 $n_b = $n-$bottomSnakeEnd;
448 $a_b = array_slice($a, $bottomSnakeEnd-$bottomSnakek, $m_b);
449 $b_b = array_slice($b, $bottomSnakeEnd, $n_b);
451 wikidiff3_diffPart($a_t, $b_t, $a_inLcs, $b_inLcs, $m_t, $topSnakeStart, $offsetx, $offsety, $bestLcsLengthTop, $boundRunningTime, $max_NP_before_bound);
453 wikidiff3_diffPart($a_b, $b_b, $a_inLcs, $b_inLcs, $m_b, $n_b, $offsetx+
($bottomSnakeEnd-$bottomSnakek), $offsety+
$bottomSnakeEnd, $bestLcsLengthBottom, $boundRunningTime, $max_NP_before_bound);
460 function __construct($length){
461 $this->inLcs
= $length>0 ?
array_fill(0,$length,FALSE): array();
465 * Get the length of the Longest Common Subsequence (computed)
467 public function getLcsLength(){
468 return array_sum($this->inLcs
);