1 | //! Myers' diff algorithm. |
2 | //! |
3 | //! * time: `O((N+M)D)` |
4 | //! * space `O(N+M)` |
5 | //! |
6 | //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf) |
7 | //! describing it. |
8 | //! |
9 | //! The implementation of this algorithm is based on the implementation by |
10 | //! Brandon Williams. |
11 | //! |
12 | //! # Heuristics |
13 | //! |
14 | //! At present this implementation of Myers' does not implement any more advanced |
15 | //! heuristics that would solve some pathological cases. For instance passing two |
16 | //! large and completely distinct sequences to the algorithm will make it spin |
17 | //! without making reasonable progress. Currently the only protection in the |
18 | //! library against this is to pass a deadline to the diffing algorithm. |
19 | //! |
20 | //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). |
21 | |
22 | use std::ops::{Index, IndexMut, Range}; |
23 | use std::time::Instant; |
24 | |
25 | use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range}; |
26 | use crate::algorithms::DiffHook; |
27 | |
28 | /// Myers' diff algorithm. |
29 | /// |
30 | /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. |
31 | pub fn diff<Old, New, D>( |
32 | d: &mut D, |
33 | old: &Old, |
34 | old_range: Range<usize>, |
35 | new: &New, |
36 | new_range: Range<usize>, |
37 | ) -> Result<(), D::Error> |
38 | where |
39 | Old: Index<usize> + ?Sized, |
40 | New: Index<usize> + ?Sized, |
41 | D: DiffHook, |
42 | New::Output: PartialEq<Old::Output>, |
43 | { |
44 | diff_deadline(d, old, old_range, new, new_range, deadline:None) |
45 | } |
46 | |
47 | /// Myers' diff algorithm with deadline. |
48 | /// |
49 | /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. |
50 | /// |
51 | /// This diff is done with an optional deadline that defines the maximal |
52 | /// execution time permitted before it bails and falls back to an approximation. |
53 | pub fn diff_deadline<Old, New, D>( |
54 | d: &mut D, |
55 | old: &Old, |
56 | old_range: Range<usize>, |
57 | new: &New, |
58 | new_range: Range<usize>, |
59 | deadline: Option<Instant>, |
60 | ) -> Result<(), D::Error> |
61 | where |
62 | Old: Index<usize> + ?Sized, |
63 | New: Index<usize> + ?Sized, |
64 | D: DiffHook, |
65 | New::Output: PartialEq<Old::Output>, |
66 | { |
67 | let max_d: usize = max_d(len1:old_range.len(), len2:new_range.len()); |
68 | let mut vb: V = V::new(max_d); |
69 | let mut vf: V = V::new(max_d); |
70 | conquer( |
71 | d, old, old_range, new, new_range, &mut vf, &mut vb, deadline, |
72 | )?; |
73 | d.finish() |
74 | } |
75 | |
76 | // A D-path is a path which starts at (0,0) that has exactly D non-diagonal |
77 | // edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge |
78 | // and then a possibly empty sequence of diagonal edges called a snake. |
79 | |
80 | /// `V` contains the endpoints of the furthest reaching `D-paths`. For each |
81 | /// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because |
82 | /// `y` can be computed from `x - k`. In other words, `V` is an array of integers |
83 | /// where `V[k]` contains the row index of the endpoint of the furthest reaching |
84 | /// path in diagonal `k`. |
85 | /// |
86 | /// We can't use a traditional Vec to represent `V` since we use `k` as an index |
87 | /// and it can take on negative values. So instead `V` is represented as a |
88 | /// light-weight wrapper around a Vec plus an `offset` which is the maximum value |
89 | /// `k` can take on in order to map negative `k`'s back to a value >= 0. |
90 | #[derive (Debug)] |
91 | struct V { |
92 | offset: isize, |
93 | v: Vec<usize>, // Look into initializing this to -1 and storing isize |
94 | } |
95 | |
96 | impl V { |
97 | fn new(max_d: usize) -> Self { |
98 | Self { |
99 | offset: max_d as isize, |
100 | v: vec![0; 2 * max_d], |
101 | } |
102 | } |
103 | |
104 | fn len(&self) -> usize { |
105 | self.v.len() |
106 | } |
107 | } |
108 | |
109 | impl Index<isize> for V { |
110 | type Output = usize; |
111 | |
112 | fn index(&self, index: isize) -> &Self::Output { |
113 | &self.v[(index + self.offset) as usize] |
114 | } |
115 | } |
116 | |
117 | impl IndexMut<isize> for V { |
118 | fn index_mut(&mut self, index: isize) -> &mut Self::Output { |
119 | &mut self.v[(index + self.offset) as usize] |
120 | } |
121 | } |
122 | |
123 | fn max_d(len1: usize, len2: usize) -> usize { |
124 | // XXX look into reducing the need to have the additional '+ 1' |
125 | (len1 + len2 + 1) / 2 + 1 |
126 | } |
127 | |
128 | #[inline (always)] |
129 | fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) { |
130 | (range.start..at, at..range.end) |
131 | } |
132 | |
133 | /// A `Snake` is a sequence of diagonal edges in the edit graph. Normally |
134 | /// a snake has a start end end point (and it is possible for a snake to have |
135 | /// a length of zero, meaning the start and end points are the same) however |
136 | /// we do not need the end point which is why it's not implemented here. |
137 | /// |
138 | /// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes |
139 | /// some of which may be empty. The divide step requires finding the ceil(D/2) + |
140 | /// 1 or middle snake of an optimal D-path. The idea for doing so is to |
141 | /// simultaneously run the basic algorithm in both the forward and reverse |
142 | /// directions until furthest reaching forward and reverse paths starting at |
143 | /// opposing corners 'overlap'. |
144 | fn find_middle_snake<Old, New>( |
145 | old: &Old, |
146 | old_range: Range<usize>, |
147 | new: &New, |
148 | new_range: Range<usize>, |
149 | vf: &mut V, |
150 | vb: &mut V, |
151 | deadline: Option<Instant>, |
152 | ) -> Option<(usize, usize)> |
153 | where |
154 | Old: Index<usize> + ?Sized, |
155 | New: Index<usize> + ?Sized, |
156 | New::Output: PartialEq<Old::Output>, |
157 | { |
158 | let n = old_range.len(); |
159 | let m = new_range.len(); |
160 | |
161 | // By Lemma 1 in the paper, the optimal edit script length is odd or even as |
162 | // `delta` is odd or even. |
163 | let delta = n as isize - m as isize; |
164 | let odd = delta & 1 == 1; |
165 | |
166 | // The initial point at (0, -1) |
167 | vf[1] = 0; |
168 | // The initial point at (N, M+1) |
169 | vb[1] = 0; |
170 | |
171 | // We only need to explore ceil(D/2) + 1 |
172 | let d_max = max_d(n, m); |
173 | assert!(vf.len() >= d_max); |
174 | assert!(vb.len() >= d_max); |
175 | |
176 | for d in 0..d_max as isize { |
177 | // are we running for too long? |
178 | if let Some(deadline) = deadline { |
179 | if Instant::now() > deadline { |
180 | break; |
181 | } |
182 | } |
183 | |
184 | // Forward path |
185 | for k in (-d..=d).rev().step_by(2) { |
186 | let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { |
187 | vf[k + 1] |
188 | } else { |
189 | vf[k - 1] + 1 |
190 | }; |
191 | let y = (x as isize - k) as usize; |
192 | |
193 | // The coordinate of the start of a snake |
194 | let (x0, y0) = (x, y); |
195 | // While these sequences are identical, keep moving through the |
196 | // graph with no cost |
197 | if x < old_range.len() && y < new_range.len() { |
198 | let advance = common_prefix_len( |
199 | old, |
200 | old_range.start + x..old_range.end, |
201 | new, |
202 | new_range.start + y..new_range.end, |
203 | ); |
204 | x += advance; |
205 | } |
206 | |
207 | // This is the new best x value |
208 | vf[k] = x; |
209 | |
210 | // Only check for connections from the forward search when N - M is |
211 | // odd and when there is a reciprocal k line coming from the other |
212 | // direction. |
213 | if odd && (k - delta).abs() <= (d - 1) { |
214 | // TODO optimize this so we don't have to compare against n |
215 | if vf[k] + vb[-(k - delta)] >= n { |
216 | // Return the snake |
217 | return Some((x0 + old_range.start, y0 + new_range.start)); |
218 | } |
219 | } |
220 | } |
221 | |
222 | // Backward path |
223 | for k in (-d..=d).rev().step_by(2) { |
224 | let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) { |
225 | vb[k + 1] |
226 | } else { |
227 | vb[k - 1] + 1 |
228 | }; |
229 | let mut y = (x as isize - k) as usize; |
230 | |
231 | // The coordinate of the start of a snake |
232 | if x < n && y < m { |
233 | let advance = common_suffix_len( |
234 | old, |
235 | old_range.start..old_range.start + n - x, |
236 | new, |
237 | new_range.start..new_range.start + m - y, |
238 | ); |
239 | x += advance; |
240 | y += advance; |
241 | } |
242 | |
243 | // This is the new best x value |
244 | vb[k] = x; |
245 | |
246 | if !odd && (k - delta).abs() <= d { |
247 | // TODO optimize this so we don't have to compare against n |
248 | if vb[k] + vf[-(k - delta)] >= n { |
249 | // Return the snake |
250 | return Some((n - x + old_range.start, m - y + new_range.start)); |
251 | } |
252 | } |
253 | } |
254 | |
255 | // TODO: Maybe there's an opportunity to optimize and bail early? |
256 | } |
257 | |
258 | // deadline reached |
259 | None |
260 | } |
261 | |
262 | #[allow (clippy::too_many_arguments)] |
263 | fn conquer<Old, New, D>( |
264 | d: &mut D, |
265 | old: &Old, |
266 | mut old_range: Range<usize>, |
267 | new: &New, |
268 | mut new_range: Range<usize>, |
269 | vf: &mut V, |
270 | vb: &mut V, |
271 | deadline: Option<Instant>, |
272 | ) -> Result<(), D::Error> |
273 | where |
274 | Old: Index<usize> + ?Sized, |
275 | New: Index<usize> + ?Sized, |
276 | D: DiffHook, |
277 | New::Output: PartialEq<Old::Output>, |
278 | { |
279 | // Check for common prefix |
280 | let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); |
281 | if common_prefix_len > 0 { |
282 | d.equal(old_range.start, new_range.start, common_prefix_len)?; |
283 | } |
284 | old_range.start += common_prefix_len; |
285 | new_range.start += common_prefix_len; |
286 | |
287 | // Check for common suffix |
288 | let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone()); |
289 | let common_suffix = ( |
290 | old_range.end - common_suffix_len, |
291 | new_range.end - common_suffix_len, |
292 | ); |
293 | old_range.end -= common_suffix_len; |
294 | new_range.end -= common_suffix_len; |
295 | |
296 | if is_empty_range(&old_range) && is_empty_range(&new_range) { |
297 | // Do nothing |
298 | } else if is_empty_range(&new_range) { |
299 | d.delete(old_range.start, old_range.len(), new_range.start)?; |
300 | } else if is_empty_range(&old_range) { |
301 | d.insert(old_range.start, new_range.start, new_range.len())?; |
302 | } else if let Some((x_start, y_start)) = find_middle_snake( |
303 | old, |
304 | old_range.clone(), |
305 | new, |
306 | new_range.clone(), |
307 | vf, |
308 | vb, |
309 | deadline, |
310 | ) { |
311 | let (old_a, old_b) = split_at(old_range, x_start); |
312 | let (new_a, new_b) = split_at(new_range, y_start); |
313 | conquer(d, old, old_a, new, new_a, vf, vb, deadline)?; |
314 | conquer(d, old, old_b, new, new_b, vf, vb, deadline)?; |
315 | } else { |
316 | d.delete( |
317 | old_range.start, |
318 | old_range.end - old_range.start, |
319 | new_range.start, |
320 | )?; |
321 | d.insert( |
322 | old_range.start, |
323 | new_range.start, |
324 | new_range.end - new_range.start, |
325 | )?; |
326 | } |
327 | |
328 | if common_suffix_len > 0 { |
329 | d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?; |
330 | } |
331 | |
332 | Ok(()) |
333 | } |
334 | |
335 | #[test ] |
336 | fn test_find_middle_snake() { |
337 | let a = &b"ABCABBA" [..]; |
338 | let b = &b"CBABAC" [..]; |
339 | let max_d = max_d(a.len(), b.len()); |
340 | let mut vf = V::new(max_d); |
341 | let mut vb = V::new(max_d); |
342 | let (x_start, y_start) = |
343 | find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap(); |
344 | assert_eq!(x_start, 4); |
345 | assert_eq!(y_start, 1); |
346 | } |
347 | |
348 | #[test ] |
349 | fn test_diff() { |
350 | let a: &[usize] = &[0, 1, 2, 3, 4]; |
351 | let b: &[usize] = &[0, 1, 2, 9, 4]; |
352 | |
353 | let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); |
354 | diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); |
355 | insta::assert_debug_snapshot!(d.into_inner().ops()); |
356 | } |
357 | |
358 | #[test ] |
359 | fn test_contiguous() { |
360 | let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5]; |
361 | let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; |
362 | |
363 | let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); |
364 | diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); |
365 | insta::assert_debug_snapshot!(d.into_inner().ops()); |
366 | } |
367 | |
368 | #[test ] |
369 | fn test_pat() { |
370 | let a: &[usize] = &[0, 1, 3, 4, 5]; |
371 | let b: &[usize] = &[0, 1, 4, 5, 8, 9]; |
372 | |
373 | let mut d = crate::algorithms::Capture::new(); |
374 | diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); |
375 | insta::assert_debug_snapshot!(d.ops()); |
376 | } |
377 | |
378 | #[test ] |
379 | fn test_deadline_reached() { |
380 | use std::ops::Index; |
381 | use std::time::Duration; |
382 | |
383 | let a = (0..100).collect::<Vec<_>>(); |
384 | let mut b = (0..100).collect::<Vec<_>>(); |
385 | b[10] = 99; |
386 | b[50] = 99; |
387 | b[25] = 99; |
388 | |
389 | struct SlowIndex<'a>(&'a [usize]); |
390 | |
391 | impl<'a> Index<usize> for SlowIndex<'a> { |
392 | type Output = usize; |
393 | |
394 | fn index(&self, index: usize) -> &Self::Output { |
395 | std::thread::sleep(Duration::from_millis(1)); |
396 | &self.0[index] |
397 | } |
398 | } |
399 | |
400 | let slow_a = SlowIndex(&a); |
401 | let slow_b = SlowIndex(&b); |
402 | |
403 | // don't give it enough time to do anything interesting |
404 | let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); |
405 | diff_deadline( |
406 | &mut d, |
407 | &slow_a, |
408 | 0..a.len(), |
409 | &slow_b, |
410 | 0..b.len(), |
411 | Some(Instant::now() + Duration::from_millis(50)), |
412 | ) |
413 | .unwrap(); |
414 | insta::assert_debug_snapshot!(d.into_inner().ops()); |
415 | } |
416 | |
417 | #[test ] |
418 | fn test_finish_called() { |
419 | struct HasRunFinish(bool); |
420 | |
421 | impl DiffHook for HasRunFinish { |
422 | type Error = (); |
423 | fn finish(&mut self) -> Result<(), Self::Error> { |
424 | self.0 = true; |
425 | Ok(()) |
426 | } |
427 | } |
428 | |
429 | let mut d = HasRunFinish(false); |
430 | let slice = &[1, 2]; |
431 | let slice2 = &[1, 2, 3]; |
432 | diff(&mut d, slice, 0..slice.len(), slice2, 0..slice2.len()).unwrap(); |
433 | assert!(d.0); |
434 | |
435 | let mut d = HasRunFinish(false); |
436 | let slice = &[1, 2]; |
437 | diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap(); |
438 | assert!(d.0); |
439 | |
440 | let mut d = HasRunFinish(false); |
441 | let slice: &[u8] = &[]; |
442 | diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap(); |
443 | assert!(d.0); |
444 | } |
445 | |