1 | /* |
2 | * Copyright © 2003 Davide Libenzi |
3 | * 2018 Benjamin Otte |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2.1 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Lesser General Public |
16 | * License along with this library. If not, see <http://www.gnu.org/licenses/>. |
17 | * |
18 | * Authors: Davide Libenzi <davidel@xmailserver.org> |
19 | * Benjamin Otte <otte@gnome.org> |
20 | */ |
21 | |
22 | #include "config.h" |
23 | |
24 | #include "gskdiffprivate.h" |
25 | |
26 | |
27 | #define XDL_MAX_COST_MIN 256 |
28 | #define XDL_HEUR_MIN_COST 256 |
29 | #define XDL_LINE_MAX G_MAXSSIZE |
30 | #define XDL_SNAKE_CNT 20 |
31 | #define XDL_K_HEUR 4 |
32 | #define MAXCOST 20 |
33 | |
34 | struct _GskDiffSettings { |
35 | GCompareDataFunc compare_func; |
36 | GskKeepFunc keep_func; |
37 | GskDeleteFunc delete_func; |
38 | GskInsertFunc insert_func; |
39 | |
40 | guint allow_abort : 1; |
41 | }; |
42 | |
43 | typedef struct _SplitResult { |
44 | long i1, i2; |
45 | int min_lo, min_hi; |
46 | } SplitResult; |
47 | |
48 | GskDiffSettings * |
49 | gsk_diff_settings_new (GCompareDataFunc compare_func, |
50 | GskKeepFunc keep_func, |
51 | GskDeleteFunc delete_func, |
52 | GskInsertFunc insert_func) |
53 | { |
54 | GskDiffSettings *settings; |
55 | |
56 | settings = g_slice_new0 (GskDiffSettings); |
57 | |
58 | settings->compare_func = compare_func; |
59 | settings->keep_func = keep_func; |
60 | settings->delete_func = delete_func; |
61 | settings->insert_func = insert_func; |
62 | |
63 | return settings; |
64 | } |
65 | |
66 | void |
67 | gsk_diff_settings_set_allow_abort (GskDiffSettings *settings, |
68 | gboolean allow_abort) |
69 | { |
70 | settings->allow_abort = allow_abort; |
71 | } |
72 | |
73 | void |
74 | gsk_diff_settings_free (GskDiffSettings *settings) |
75 | { |
76 | g_slice_free (GskDiffSettings, settings); |
77 | } |
78 | |
79 | /* |
80 | * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers. |
81 | * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both |
82 | * the forward diagonal starting from (off1, off2) and the backward diagonal |
83 | * starting from (lim1, lim2). If the K values on the same diagonal crosses |
84 | * returns the furthest point of reach. We might end up having to expensive |
85 | * cases using this algorithm is full, so a little bit of heuristic is needed |
86 | * to cut the search and to return a suboptimal point. |
87 | */ |
88 | static GskDiffResult |
89 | split (gconstpointer *elem1, |
90 | gssize off1, |
91 | gssize lim1, |
92 | gconstpointer *elem2, |
93 | gssize off2, |
94 | gssize lim2, |
95 | gssize *kvdf, |
96 | gssize *kvdb, |
97 | gboolean need_min, |
98 | const GskDiffSettings *settings, |
99 | gpointer data, |
100 | SplitResult *spl) |
101 | { |
102 | gssize dmin = off1 - lim2, dmax = lim1 - off2; |
103 | gssize fmid = off1 - off2, bmid = lim1 - lim2; |
104 | gboolean odd = (fmid - bmid) & 1; |
105 | gssize fmin = fmid, fmax = fmid; |
106 | gssize bmin = bmid, bmax = bmid; |
107 | gssize ec, d, i1, i2, prev1, best, dd, v, k; |
108 | |
109 | /* |
110 | * Set initial diagonal values for both forward and backward path. |
111 | */ |
112 | kvdf[fmid] = off1; |
113 | kvdb[bmid] = lim1; |
114 | |
115 | for (ec = 1;; ec++) |
116 | { |
117 | gboolean got_snake = FALSE; |
118 | |
119 | /* |
120 | * We need to extent the diagonal "domain" by one. If the next |
121 | * values exits the box boundaries we need to change it in the |
122 | * opposite direction because (max - min) must be a power of two. |
123 | * Also we initialize the external K value to -1 so that we can |
124 | * avoid extra conditions check inside the core loop. |
125 | */ |
126 | if (fmin > dmin) |
127 | kvdf[--fmin - 1] = -1; |
128 | else |
129 | ++fmin; |
130 | if (fmax < dmax) |
131 | kvdf[++fmax + 1] = -1; |
132 | else |
133 | --fmax; |
134 | |
135 | for (d = fmax; d >= fmin; d -= 2) |
136 | { |
137 | if (kvdf[d - 1] >= kvdf[d + 1]) |
138 | i1 = kvdf[d - 1] + 1; |
139 | else |
140 | i1 = kvdf[d + 1]; |
141 | prev1 = i1; |
142 | i2 = i1 - d; |
143 | for (; i1 < lim1 && i2 < lim2; i1++, i2++) |
144 | { |
145 | if (settings->compare_func (elem1[i1], elem2[i2], data) != 0) |
146 | break; |
147 | } |
148 | if (i1 - prev1 > XDL_SNAKE_CNT) |
149 | got_snake = TRUE; |
150 | kvdf[d] = i1; |
151 | if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) |
152 | { |
153 | spl->i1 = i1; |
154 | spl->i2 = i2; |
155 | spl->min_lo = spl->min_hi = 1; |
156 | return GSK_DIFF_OK; |
157 | } |
158 | } |
159 | |
160 | /* |
161 | * We need to extent the diagonal "domain" by one. If the next |
162 | * values exits the box boundaries we need to change it in the |
163 | * opposite direction because (max - min) must be a power of two. |
164 | * Also we initialize the external K value to -1 so that we can |
165 | * avoid extra conditions check inside the core loop. |
166 | */ |
167 | if (bmin > dmin) |
168 | kvdb[--bmin - 1] = XDL_LINE_MAX; |
169 | else |
170 | ++bmin; |
171 | if (bmax < dmax) |
172 | kvdb[++bmax + 1] = XDL_LINE_MAX; |
173 | else |
174 | --bmax; |
175 | |
176 | for (d = bmax; d >= bmin; d -= 2) |
177 | { |
178 | if (kvdb[d - 1] < kvdb[d + 1]) |
179 | i1 = kvdb[d - 1]; |
180 | else |
181 | i1 = kvdb[d + 1] - 1; |
182 | prev1 = i1; |
183 | i2 = i1 - d; |
184 | for (; i1 > off1 && i2 > off2; i1--, i2--) |
185 | { |
186 | if (settings->compare_func (elem1[i1 - 1], elem2[i2 - 1], data) != 0) |
187 | break; |
188 | } |
189 | if (prev1 - i1 > XDL_SNAKE_CNT) |
190 | got_snake = TRUE; |
191 | kvdb[d] = i1; |
192 | if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) |
193 | { |
194 | spl->i1 = i1; |
195 | spl->i2 = i2; |
196 | spl->min_lo = spl->min_hi = 1; |
197 | return GSK_DIFF_OK; |
198 | } |
199 | } |
200 | |
201 | if (need_min) |
202 | continue; |
203 | |
204 | /* |
205 | * If the edit cost is above the heuristic trigger and if |
206 | * we got a good snake, we sample current diagonals to see |
207 | * if some of them have reached an "interesting" path. Our |
208 | * measure is a function of the distance from the diagonal |
209 | * corner (i1 + i2) penalized with the distance from the |
210 | * mid diagonal itself. If this value is above the current |
211 | * edit cost times a magic factor (XDL_K_HEUR) we consider |
212 | * it interesting. |
213 | */ |
214 | if (got_snake && ec > XDL_HEUR_MIN_COST) |
215 | { |
216 | for (best = 0, d = fmax; d >= fmin; d -= 2) |
217 | { |
218 | dd = d > fmid ? d - fmid: fmid - d; |
219 | i1 = kvdf[d]; |
220 | i2 = i1 - d; |
221 | v = (i1 - off1) + (i2 - off2) - dd; |
222 | |
223 | if (v > XDL_K_HEUR * ec && v > best && |
224 | off1 + XDL_SNAKE_CNT <= i1 && i1 < lim1 && |
225 | off2 + XDL_SNAKE_CNT <= i2 && i2 < lim2) |
226 | { |
227 | for (k = 1; ; k++) |
228 | { |
229 | if (settings->compare_func (elem1[i1 - k], elem2[i2 - k], data) != 0) |
230 | break; |
231 | if (k == XDL_SNAKE_CNT) |
232 | { |
233 | best = v; |
234 | spl->i1 = i1; |
235 | spl->i2 = i2; |
236 | break; |
237 | } |
238 | } |
239 | } |
240 | } |
241 | if (best > 0) |
242 | { |
243 | spl->min_lo = 1; |
244 | spl->min_hi = 0; |
245 | return GSK_DIFF_OK; |
246 | } |
247 | |
248 | for (best = 0, d = bmax; d >= bmin; d -= 2) |
249 | { |
250 | dd = d > bmid ? d - bmid: bmid - d; |
251 | i1 = kvdb[d]; |
252 | i2 = i1 - d; |
253 | v = (lim1 - i1) + (lim2 - i2) - dd; |
254 | |
255 | if (v > XDL_K_HEUR * ec && v > best && |
256 | off1 < i1 && i1 <= lim1 - XDL_SNAKE_CNT && |
257 | off2 < i2 && i2 <= lim2 - XDL_SNAKE_CNT) |
258 | { |
259 | for (k = 0; ; k++) |
260 | { |
261 | if (settings->compare_func (elem1[i1 + k], elem2[i2 + k], data) != 0) |
262 | break; |
263 | |
264 | if (k == XDL_SNAKE_CNT - 1) |
265 | { |
266 | best = v; |
267 | spl->i1 = i1; |
268 | spl->i2 = i2; |
269 | break; |
270 | } |
271 | } |
272 | } |
273 | } |
274 | if (best > 0) |
275 | { |
276 | spl->min_lo = 0; |
277 | spl->min_hi = 1; |
278 | return GSK_DIFF_OK; |
279 | } |
280 | } |
281 | |
282 | /* |
283 | * Enough is enough. We spent too much time here and now we collect |
284 | * the furthest reaching path using the (i1 + i2) measure. |
285 | */ |
286 | if (ec >= MAXCOST) |
287 | { |
288 | gssize fbest, fbest1, bbest, bbest1; |
289 | |
290 | if (settings->allow_abort) |
291 | return GSK_DIFF_ABORTED; |
292 | |
293 | fbest = fbest1 = -1; |
294 | for (d = fmax; d >= fmin; d -= 2) |
295 | { |
296 | i1 = MIN (kvdf[d], lim1); |
297 | i2 = i1 - d; |
298 | if (lim2 < i2) |
299 | i1 = lim2 + d, i2 = lim2; |
300 | if (fbest < i1 + i2) |
301 | { |
302 | fbest = i1 + i2; |
303 | fbest1 = i1; |
304 | } |
305 | } |
306 | |
307 | bbest = bbest1 = XDL_LINE_MAX; |
308 | for (d = bmax; d >= bmin; d -= 2) |
309 | { |
310 | i1 = MAX (off1, kvdb[d]); |
311 | i2 = i1 - d; |
312 | if (i2 < off2) |
313 | i1 = off2 + d, i2 = off2; |
314 | if (i1 + i2 < bbest) |
315 | { |
316 | bbest = i1 + i2; |
317 | bbest1 = i1; |
318 | } |
319 | } |
320 | |
321 | if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) |
322 | { |
323 | spl->i1 = fbest1; |
324 | spl->i2 = fbest - fbest1; |
325 | spl->min_lo = 1; |
326 | spl->min_hi = 0; |
327 | } |
328 | else |
329 | { |
330 | spl->i1 = bbest1; |
331 | spl->i2 = bbest - bbest1; |
332 | spl->min_lo = 0; |
333 | spl->min_hi = 1; |
334 | } |
335 | |
336 | return GSK_DIFF_OK; |
337 | } |
338 | } |
339 | } |
340 | |
341 | /* |
342 | * Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling |
343 | * the box splitting function. Note that the real job (marking changed lines) |
344 | * is done in the two boundary reaching checks. |
345 | */ |
346 | static GskDiffResult |
347 | compare (gconstpointer *elem1, |
348 | gssize off1, |
349 | gssize lim1, |
350 | gconstpointer *elem2, |
351 | gssize off2, |
352 | gssize lim2, |
353 | gssize *kvdf, |
354 | gssize *kvdb, |
355 | gboolean need_min, |
356 | const GskDiffSettings *settings, |
357 | gpointer data) |
358 | { |
359 | GskDiffResult res; |
360 | |
361 | /* |
362 | * Shrink the box by walking through each diagonal snake (SW and NE). |
363 | */ |
364 | for (; off1 < lim1 && off2 < lim2; off1++, off2++) |
365 | { |
366 | if (settings->compare_func (elem1[off1], elem2[off2], data) != 0) |
367 | break; |
368 | |
369 | res = settings->keep_func (elem1[off1], elem2[off2], data); |
370 | if (res != GSK_DIFF_OK) |
371 | return res; |
372 | } |
373 | |
374 | for (; off1 < lim1 && off2 < lim2; lim1--, lim2--) |
375 | { |
376 | if (settings->compare_func (elem1[lim1 - 1], elem2[lim2 - 1], data) != 0) |
377 | break; |
378 | |
379 | res = settings->keep_func (elem1[lim1 - 1], elem2[lim2 - 1], data); |
380 | if (res != GSK_DIFF_OK) |
381 | return res; |
382 | } |
383 | |
384 | /* |
385 | * If one dimension is empty, then all records on the other one must |
386 | * be obviously changed. |
387 | */ |
388 | if (off1 == lim1) |
389 | { |
390 | for (; off2 < lim2; off2++) |
391 | { |
392 | res = settings->insert_func (elem2[off2], off2, data); |
393 | if (res != GSK_DIFF_OK) |
394 | return res; |
395 | } |
396 | } |
397 | else if (off2 == lim2) |
398 | { |
399 | for (; off1 < lim1; off1++) |
400 | { |
401 | res = settings->delete_func (elem1[off1], off1, data); |
402 | if (res != GSK_DIFF_OK) |
403 | return res; |
404 | } |
405 | } |
406 | else |
407 | { |
408 | SplitResult spl = { 0, }; |
409 | |
410 | /* |
411 | * Divide ... |
412 | */ |
413 | res = split (elem1, off1, lim1, |
414 | elem2, off2, lim2, |
415 | kvdf, kvdb, need_min, |
416 | settings, data, |
417 | spl: &spl); |
418 | if (res != GSK_DIFF_OK) |
419 | return res; |
420 | |
421 | /* |
422 | * ... et Impera. |
423 | */ |
424 | res = compare (elem1, off1, lim1: spl.i1, |
425 | elem2, off2, lim2: spl.i2, |
426 | kvdf, kvdb, need_min: spl.min_lo, |
427 | settings, data); |
428 | if (res != GSK_DIFF_OK) |
429 | return res; |
430 | res = compare (elem1, off1: spl.i1, lim1, |
431 | elem2, off2: spl.i2, lim2, |
432 | kvdf, kvdb, need_min: spl.min_hi, |
433 | settings, data); |
434 | if (res != GSK_DIFF_OK) |
435 | return res; |
436 | } |
437 | |
438 | return GSK_DIFF_OK; |
439 | } |
440 | |
441 | #if 0 |
442 | ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3; |
443 | if (!(kvd = (long *) xdl_malloc((2 * ndiags + 2) * sizeof(long)))) { |
444 | |
445 | xdl_free_env(xe); |
446 | return -1; |
447 | } |
448 | kvdf = kvd; |
449 | kvdb = kvdf + ndiags; |
450 | kvdf += xe->xdf2.nreff + 1; |
451 | kvdb += xe->xdf2.nreff + 1; |
452 | |
453 | xenv.mxcost = xdl_bogosqrt(ndiags); |
454 | if (xenv.mxcost < XDL_MAX_COST_MIN) |
455 | xenv.mxcost = XDL_MAX_COST_MIN; |
456 | xenv.snake_cnt = XDL_SNAKE_CNT; |
457 | xenv.heur_min = XDL_HEUR_MIN_COST; |
458 | |
459 | dd1.nrec = xe->xdf1.nreff; |
460 | dd1.ha = xe->xdf1.ha; |
461 | dd1.rchg = xe->xdf1.rchg; |
462 | dd1.rindex = xe->xdf1.rindex; |
463 | dd2.nrec = xe->xdf2.nreff; |
464 | dd2.ha = xe->xdf2.ha; |
465 | dd2.rchg = xe->xdf2.rchg; |
466 | dd2.rindex = xe->xdf2.rindex; |
467 | #endif |
468 | |
469 | GskDiffResult |
470 | gsk_diff (gconstpointer *elem1, |
471 | gsize n1, |
472 | gconstpointer *elem2, |
473 | gsize n2, |
474 | const GskDiffSettings *settings, |
475 | gpointer data) |
476 | { |
477 | gsize ndiags; |
478 | gssize *kvd, *kvdf, *kvdb; |
479 | GskDiffResult res; |
480 | |
481 | ndiags = n1 + n2 + 3; |
482 | |
483 | kvd = g_new (gssize, 2 * ndiags + 2); |
484 | kvdf = kvd; |
485 | kvdb = kvd + ndiags; |
486 | kvdf += n2 + 1; |
487 | kvdb += n2 + 1; |
488 | |
489 | res = compare (elem1, off1: 0, lim1: n1, |
490 | elem2, off2: 0, lim2: n2, |
491 | kvdf, kvdb, FALSE, |
492 | settings, data); |
493 | |
494 | g_free (mem: kvd); |
495 | |
496 | return res; |
497 | } |
498 | |
499 | |