Filecache should check &useskin. This should be backported.
[lhc/web/wiklou.git] / includes / Diff.php
1 <?php
2 /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * or see http://www.gnu.org/
18 */
19
20 /**
21 * Implementation of the Diff algorithm.
22 *
23 * DO NOT USE ON PRODUCTION SYSTEMS
24 *
25 * The algorithm is based on the O(NP) LCS algorithm as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
26 * and extended with my own ideas.
27 *
28 * @return array($from_removed, $to_added)
29 * TRUE if the token was removed or added.
30 *
31 * @author Guy Van den Broeck
32 */
33 function wikidiff3_diff( /*array*/ $from, /*array*/ $to, $boundRunningTime=FALSE, $max_NP_before_bound = 800000){
34 wfProfileIn( __METHOD__ );
35
36 $m = sizeof($from);
37 $n = sizeof($to);
38
39 $result_from = array_fill(0,$m,TRUE);
40 $result_to = array_fill(0,$n,TRUE);
41
42 //reduce the complexity for the next step (intentionally done twice)
43 //remove common tokens at the start
44 $i=0;
45 while($i<$m && $i<$n && $from[$i]===$to[$i]){
46 $result_from[$i] = $result_to[$i] = FALSE;
47 unset($from[$i],$to[$i]);
48 ++$i;
49 }
50
51 //remove common tokens at the end
52 $j=1;
53 while($i+$j<=$m && $i+$j<=$n && $from[$m-$j]===$to[$n-$j]){
54 $result_from[$m-$j] = $result_to[$n-$j] = FALSE;
55 unset($from[$m-$j],$to[$n-$j]);
56 ++$j;
57 }
58
59 $newFrom = $newFromIndex = $newTo = $newToIndex = array();
60
61 //remove tokens not in both sequences
62 $shared = array_fill_keys($from,FALSE);
63 foreach($to as $index => $el){
64 if(array_key_exists($el,$shared)){
65 //keep it
66 $shared[$el] = TRUE;
67 $newTo[] = $el;
68 $newToIndex[] = $index;
69 }
70 }
71 foreach($from as $index => $el){
72 if($shared[$el]){
73 //keep it
74 $newFrom[] = $el;
75 $newFromIndex[] = $index;
76 }
77 }
78
79 unset($from, $to, $shared);
80
81 $m2 = sizeof($newFrom);
82 $n2 = sizeof($newTo);
83 $offsetx = $offsety = 0;
84 $from_inLcs = new InLcs($m2);
85 $to_inLcs = new InLcs($n2);
86
87 wikidiff3_diffPart($newFrom, $newTo, $from_inLcs, $to_inLcs, $m2, $n2, $offsetx, $offsety, $m2, $boundRunningTime, $max_NP_before_bound);
88
89 foreach($from_inLcs->inLcs as $key => $in){
90 if($in){
91 $result_from[$newFromIndex[$key]] = FALSE;
92 }
93 }
94 foreach($to_inLcs->inLcs as $key => $in){
95 if($in){
96 $result_to[$newToIndex[$key]] = FALSE;
97 }
98 }
99
100 wfProfileOut( __METHOD__ );
101 return array($result_from, $result_to);
102 }
103
104 function wikidiff3_diffPart( /*array*/ $a, /*array*/ $b, InLcs $a_inLcs, InLcs $b_inLcs, $m, $n, $offsetx, $offsety, $bestKnownLcs, $boundRunningTime=FALSE, $max_NP_before_bound = 800000){
105 if($bestKnownLcs==0 || $m==0 || $n==0){
106 return;
107 }
108
109 if($m>$n){
110 return wikidiff3_diffPart($b, $a, $b_inLcs, $a_inLcs, $n, $m, $offsety, $offsetx, $bestKnownLcs, $boundRunningTime, $max_NP_before_bound);
111 }
112
113 $a_inLcs_sym = &$a_inLcs->inLcs;
114 $b_inLcs_sym = &$b_inLcs->inLcs;
115
116 //remove common tokens at the start
117 while($m>0 && $n>0 && $a[0]===$b[0]){
118 $a_inLcs_sym[$offsetx] = $b_inLcs_sym[$offsety] = TRUE;
119 ++$offsetx; ++$offsety;
120 --$m; --$n;
121 --$bestKnownLcs;
122 array_shift($a);
123 array_shift($b);
124 }
125
126 //remove common tokens at the end
127 while($m>0 && $n>0 && $a[$m-1]===$b[$n-1]){
128 $a_inLcs_sym[$offsetx+$m-1] = $b_inLcs_sym[$offsety+$n-1] = TRUE;
129 --$m; --$n;
130 --$bestKnownLcs;
131 array_pop($a);
132 array_pop($b);
133 }
134
135 if($bestKnownLcs==0 || $m==0 || $n==0){
136 return;
137 }
138 if($m>$n){
139 return wikidiff3_diffPart($b, $a, $b_inLcs, $a_inLcs, $n, $m, $offsety, $offsetx, $bestKnownLcs, $boundRunningTime, $max_NP_before_bound);
140 }
141
142 $delta = $n-$m;
143 $delta_plus_1 = $delta+1;
144 $delta_min_1 = $delta-1;
145
146 $fpForw = $snakeBeginForw = $snakeEndForw = array_fill( 0, $delta_plus_1, -1);
147 $lcsSizeForw = $snakekForw = $lcsSizeBack = $snakekBack = array_fill( 0, $delta_plus_1, 0);
148 $fpBack = $snakeBeginBack = $snakeEndBack = array_fill( 0, $delta_plus_1, $n);
149 $overlap = $delta>1 ? array_fill( 1, $delta_min_1, FALSE) : array();
150 $bestLcsLength = $bestLcsLengthTop = $bestLcsLengthBottom = -1;
151
152 if($boundRunningTime){
153 $maxp_before_bound = max((-$delta+sqrt($delta*$delta+($max_NP_before_bound<<2)))>>1,2);
154 if($maxp_before_bound>=$m){
155 $boundRunningTime = false;
156 unset($maxp_before_bound);
157 }
158 }
159
160 $p=-1;
161 $m_min_1 = $m-1;
162 $maxp=$m_min_1-$bestLcsLength;
163
164 while($p<$maxp){
165
166 if($boundRunningTime && $p>$maxp_before_bound){
167 // bound the running time by stopping early
168 if($bestLcsLength>=0){
169 // accept the best LCS thusfar
170 break;
171 }else{
172 $bestLcsProgressForw = $bestkForw = 0;
173 foreach($lcsSizeForw as $k => $localLcsProgress){
174 if($localLcsProgress>$bestLcsProgressForw || ($localLcsProgress==$bestLcsProgressForw && $bestkForw > $k)){
175 $bestLcsProgressForw = $localLcsProgress;
176 $bestkForw = $k;
177 }
178 }
179 $bestLcsProgressBack = $bestkBack = 0;
180 foreach($lcsSizeBack as $k => $localLcsProgress){
181 if($localLcsProgress>$bestLcsProgressBack || ($localLcsProgress==$bestLcsProgressBack && $bestkBack < $k)){
182 $bestLcsProgressBack = $localLcsProgress;
183 $bestkBack = $k;
184 }
185 }
186 if($fpForw[$bestkForw]-$bestkForw>$fpBack[$bestkBack]-$bestkBack){
187 // This is hard, maybe try some more? Can this even happen? A solution will be found soon anyway.
188 }else{
189 // lets do this
190 $topSnakeStart = $snakeBeginForw[$bestkForw];
191 $topSnakeEnd = $snakeEndForw[$bestkForw];
192 $topSnakek = $snakekForw[$bestkForw];
193 $bestLcsLengthTop = $lcsSizeForw[$bestkBack] + $topSnakeStart - $topSnakeEnd;
194
195 $bottomSnakeStart = $snakeEndBack[$bestkBack]+1;
196 $bottomSnakeEnd = $snakeBeginBack[$bestkBack]+1;
197 $bottomSnakek = $snakekBack[$bestkBack];
198 $bestLcsLengthBottom = $lcsSizeBack[$bestkBack] + $bottomSnakeStart - $bottomSnakeEnd;
199
200 if($bottomSnakeStart-$topSnakeEnd>$n*0.6 && ($bottomSnakeStart-$bottomSnakek)-($topSnakeEnd-$topSnakek)>$m*0.6){
201 // cut the sequence from both sides (there isn't enough progress, 60% of the sequence is not covered)
202 // also diff the middle now
203 if($bottomSnakeEnd>($fpForw[$bestkForw]>>1)){
204 // do the middle diff between the snakes
205 wfDebug(" Limiting diff run time from both sides, middle between snakes\n");
206 $m_t = ($bottomSnakeStart-$bottomSnakek)-($topSnakeEnd-$topSnakek);
207 $n_t = $bottomSnakeStart-$topSnakeEnd;
208 $a_t = array_slice($a, $topSnakeEnd-$topSnakek, $m_t);
209 $b_t = array_slice($b, $topSnakeEnd, $n_t);
210 $offsetx_t = $offsetx+($topSnakeEnd-$topSnakek);
211 $offsety_t = $offsety+$topSnakeEnd;
212 }else{
213 // the snake is too early in the sequence, do the middle diff between endpoints of progress
214 wfDebug(" Limiting diff run time from both sides, middle between endpoints\n");
215 $m_t = ($fpBack[$bestkBack]+1-$bestkBack)-($fpForw[$bestkForw]-$bestkForw);
216 $n_t = $fpBack[$bestkBack]+1-$fpForw[$bestkForw];
217 $a_t = array_slice($a, $fpForw[$bestkForw]-$bestkForw, $m_t);
218 $b_t = array_slice($b, $fpForw[$bestkForw], $n_t);
219 $offsetx_t = $offsetx+($fpForw[$bestkForw]-$bestkForw);
220 $offsety_t = $offsety+$fpForw[$bestkForw];
221 }
222 wikidiff3_diffPart($a_t, $b_t, $a_inLcs, $b_inLcs, $m_t, $n_t, $offsetx_t, $offsety_t, $m_t, $boundRunningTime, $max_NP_before_bound);
223 }else if(min($n-$bottomSnakeStart, $m-($bottomSnakeStart-$bottomSnakek))>min($topSnakeEnd, $topSnakeEnd-$topSnakek)){
224 // the bottom snake is more interesting
225 wfDebug("Limiting diff run time from bottom side\n");
226 $topSnakeStart = $bottomSnakeStart;
227 $topSnakeEnd = $bottomSnakeEnd;
228 $topSnakek = $bottomSnakek;
229 $bestLcsLengthTop = $topSnakeEnd-$topSnakek;
230 $bottomSnakeStart = $bottomSnakeEnd;
231 }else{
232 // the top snake is more interesting
233 wfDebug(" Limiting diff run time from top side\n");
234 $bottomSnakeStart = $topSnakeEnd;
235 $bottomSnakeEnd = $topSnakeEnd;
236 $bottomSnakek = $topSnakek;
237 $bestLcsLengthBottom = $m-($bottomSnakeEnd-$bottomSnakek);
238 }
239 break;
240 }
241 }
242 }
243 ++$p;
244 $overlap[-$p] = $overlap[$delta+$p] = FALSE;
245
246 $min_p_min_1 = -$p-1;
247 $delta_plus_1_plus_p = $delta_plus_1+$p;
248
249 // forward
250 $fpForw[$min_p_min_1] = $snakeEndForw[$min_p_min_1] = $snakekForw[$min_p_min_1] = $snakeBeginForw[$min_p_min_1] = $fpForw[$delta_plus_1_plus_p] =
251 $snakeBeginForw[$delta_plus_1_plus_p] = $snakeEndForw[$delta_plus_1_plus_p] = $snakekForw[$delta_plus_1_plus_p] = -1;
252 $lcsSizeForw[$min_p_min_1] = $lcsSizeForw[$delta_plus_1_plus_p] = 0;
253
254 $k=-$p;
255 do {
256 $k_plus_1 = $k+1;
257 $k_min_1 = $k-1;
258
259 $fpBelow = $fpForw[$k_min_1]+1;
260 $fpAbove = $fpForw[$k_plus_1];
261 $y = &$fpForw[$k];
262 if($fpBelow>$fpAbove){
263 $oldy = $y = $fpBelow;
264 $lcsSizeForw[$k] = $lcsSizeForw[$k_min_1];
265 $snakeBeginForw[$k] = $snakeBeginForw[$k_min_1];
266 $snakeEndForw[$k] = $snakeEndForw[$k_min_1];
267 $snakekForw[$k] = $snakekForw[$k_min_1];
268 }else{
269 $oldy = $y = $fpAbove;
270 $lcsSizeForw[$k] = $lcsSizeForw[$k_plus_1];
271 $snakeBeginForw[$k] = $snakeBeginForw[$k_plus_1];
272 $snakeEndForw[$k] = $snakeEndForw[$k_plus_1];
273 $snakekForw[$k] = $snakekForw[$k_plus_1];
274 }
275 $x = $y-$k;
276 while($x < $m && $y < $n && $a[$x]===$b[$y]){
277 ++$x;
278 ++$y;
279 }
280 if($y-$oldy>0){
281 $lcsSizeForw[$k] += $y-$oldy;
282 $snakeBeginForw[$k] = $oldy;
283 $snakeEndForw[$k]= $y;
284 $snakekForw[$k] = $k;
285 }
286 if($y>=$fpBack[$k] && !$overlap[$k]){
287 // there is overlap
288 $overlap[$k]=TRUE;
289 $lcsLength = $lcsSizeForw[$k]+$lcsSizeBack[$k];
290 if($y>$fpBack[$k]+1){
291 $snakeoverlap = $y-$fpBack[$k]-1;
292 $lcsLength -= $snakeoverlap;
293 }
294 if($lcsLength>$bestLcsLength){
295 // a better sequence found!
296 $bestLcsLength = $lcsLength;
297
298 $topSnakeStart = $snakeBeginForw[$k];
299 $topSnakeEnd = $snakeEndForw[$k];
300 $topSnakek = $snakekForw[$k];
301
302 // aligned snakes bite (\ )
303 // ( \ \)
304 $bottomSnakeStart = max($snakeEndBack[$k]+1, $topSnakeEnd+max(0,$snakekBack[$k]-$snakekForw[$k]));
305 $bottomSnakeEnd = max($snakeBeginBack[$k]+1, $bottomSnakeStart);
306 $bottomSnakek = $snakekBack[$k];
307
308 if($bottomSnakeEnd<$y){
309 $bottomSnakeStart = $y;
310 $bottomSnakeEnd = $y;
311 $bottomSnakek = $k;
312 }
313
314 $bestLcsLengthTop = $lcsSizeForw[$k]-($snakeEndForw[$k]-$snakeBeginForw[$k]);
315 $bestLcsLengthBottom = $lcsSizeBack[$k]-($snakeBeginBack[$k]-$snakeEndBack[$k]);
316 if($bestKnownLcs==$lcsLength){
317 $fpForw[$k]=$y;
318 break 2;
319 }
320 $maxp=$m_min_1-$bestLcsLength;
321 }
322 }
323 if($k<$delta_min_1){
324 ++$k;
325 }else if($k>$delta){
326 --$k;
327 }else if($k==$delta_min_1){
328 $k = $delta+$p;
329 }else{
330 break;
331 }
332 } while(TRUE);
333
334 // backward
335 $fpBack[$min_p_min_1] = $snakeBeginBack[$min_p_min_1] = $snakeEndBack[$min_p_min_1] = $snakekBack[$min_p_min_1] =
336 $fpBack[$delta_plus_1_plus_p] = $snakeBeginBack[$delta_plus_1_plus_p] = $snakeEndBack[$delta_plus_1_plus_p] = $snakekBack[$delta_plus_1_plus_p] = $n;
337 $lcsSizeBack[$min_p_min_1] = $lcsSizeBack[$delta_plus_1_plus_p] = 0;
338
339 $k=$delta+$p;
340 do {
341 $k_plus_1 = $k+1;
342 $k_min_1 = $k-1;
343
344 $fpBelow = $fpBack[$k_min_1];
345 $fpAbove = $fpBack[$k_plus_1]-1;
346 $y = &$fpBack[$k];
347 if($fpBelow<$fpAbove){
348 $oldy = $y = $fpBelow;
349 $lcsSizeBack[$k] = $lcsSizeBack[$k_min_1];
350 $snakeBeginBack[$k] = $snakeBeginBack[$k_min_1];
351 $snakeEndBack[$k] = $snakeEndBack[$k_min_1];
352 $snakekBack[$k] = $snakekBack[$k_min_1];
353 }else{
354 $oldy = $y = $fpAbove;
355 $lcsSizeBack[$k] = $lcsSizeBack[$k_plus_1];
356 $snakeBeginBack[$k] = $snakeBeginBack[$k_plus_1];
357 $snakeEndBack[$k] = $snakeEndBack[$k_plus_1];
358 $snakekBack[$k] = $snakekBack[$k_plus_1];
359 }
360 $x = $y-$k;
361 while($x > -1 && $y > -1 && $a[$x]===$b[$y]){
362 --$x;
363 --$y;
364 }
365 if($oldy-$y>0){
366 $lcsSizeBack[$k] += $oldy-$y;
367 $snakeBeginBack[$k] = $oldy;
368 $snakeEndBack[$k] = $y;
369 $snakekBack[$k] = $k;
370 }
371 if($fpForw[$k]>=$y && !$overlap[$k]){
372 // there is overlap
373 $overlap[$k]=TRUE;
374 $lcsLength = $lcsSizeForw[$k]+$lcsSizeBack[$k];
375 if($fpForw[$k]>$y+1){
376 $snakeoverlap = $fpForw[$k]-$y-1;
377 $lcsLength -= $snakeoverlap;
378 }
379 if($lcsLength>$bestLcsLength){
380 // a better sequence found!
381 $bestLcsLength = $lcsLength;
382
383 $topSnakeStart = $snakeBeginForw[$k];
384 $topSnakeEnd = $snakeEndForw[$k];
385 $topSnakek = $snakekForw[$k];
386
387 // aligned snakes bite (\ )
388 // ( \ \)
389 $bottomSnakeStart = max($snakeEndBack[$k]+1, $topSnakeEnd+max(0,$snakekBack[$k]-$snakekForw[$k]));
390 $bottomSnakeEnd = max($snakeBeginBack[$k]+1, $bottomSnakeStart);
391 $bottomSnakek = $snakekBack[$k];
392
393 if($bottomSnakeEnd<$fpForw[$k]){
394 $bottomSnakeStart = $fpForw[$k];
395 $bottomSnakeEnd = $fpForw[$k];
396 $bottomSnakek = $k;
397 }
398
399 $bestLcsLengthTop = $lcsSizeForw[$k]-($snakeEndForw[$k]-$snakeBeginForw[$k]);
400 $bestLcsLengthBottom = $lcsSizeBack[$k]-($snakeBeginBack[$k]-$snakeEndBack[$k]);
401 if($bestKnownLcs==$lcsLength){
402 $fpBack[$k] = $y;
403 break 2;
404 }
405 $maxp=$m_min_1-$bestLcsLength;
406 }
407 }
408 if($k>1){
409 --$k;
410 }else if($k<0){
411 ++$k;
412 }else if($k==1){
413 $k = -$p;
414 }else{
415 break;
416 }
417 } while(TRUE);
418 }
419
420 unset($fpForw, $fpBack, $lcsSizeForw, $lcsSizeBack);
421 unset($snakeBeginForw, $snakeBeginBack, $snakeEndForw, $snakeEndBack, $snakekForw, $snakekBack);
422 unset($overlap);
423
424 // Mark snakes as in LCS
425 $maxi = $offsetx+$topSnakeEnd-$topSnakek;
426 for($i=$offsetx+$topSnakeStart-$topSnakek;$i<$maxi;++$i){
427 $a_inLcs_sym[$i] = TRUE;
428 }
429 $maxi = $offsety+$topSnakeEnd;
430 for($i=$offsety+$topSnakeStart;$i<$maxi;++$i){
431 $b_inLcs_sym[$i] = TRUE;
432 }
433 $maxi = $offsetx+$bottomSnakeEnd-$bottomSnakek;
434 for($i=$offsetx+$bottomSnakeStart-$bottomSnakek;$i<$maxi;++$i){
435 $a_inLcs_sym[$i] = TRUE;
436 }
437 $maxi = $offsety+$bottomSnakeEnd;
438 for($i=$offsety+$bottomSnakeStart;$i<$maxi;++$i){
439 $b_inLcs_sym[$i] = TRUE;
440 }
441
442 $m_t = $topSnakeStart-$topSnakek;
443 $a_t = array_slice($a, 0, $m_t);
444 $b_t = array_slice($b, 0, $topSnakeStart);
445
446 $m_b = $m-($bottomSnakeEnd-$bottomSnakek);
447 $n_b = $n-$bottomSnakeEnd;
448 $a_b = array_slice($a, $bottomSnakeEnd-$bottomSnakek, $m_b);
449 $b_b = array_slice($b, $bottomSnakeEnd, $n_b);
450
451 wikidiff3_diffPart($a_t, $b_t, $a_inLcs, $b_inLcs, $m_t, $topSnakeStart, $offsetx, $offsety, $bestLcsLengthTop, $boundRunningTime, $max_NP_before_bound);
452
453 wikidiff3_diffPart($a_b, $b_b, $a_inLcs, $b_inLcs, $m_b, $n_b, $offsetx+($bottomSnakeEnd-$bottomSnakek), $offsety+$bottomSnakeEnd, $bestLcsLengthBottom, $boundRunningTime, $max_NP_before_bound);
454 }
455
456 class InLcs {
457
458 public $inLcs;
459
460 function __construct($length){
461 $this->inLcs = $length>0 ? array_fill(0,$length,FALSE): array();
462 }
463
464 /**
465 * Get the length of the Longest Common Subsequence (computed)
466 */
467 public function getLcsLength(){
468 return array_sum($this->inLcs);
469 }
470
471 }