myers.rs source code [crates/similar/src/algorithms/myers.rs]

1	//! Myers' diff algorithm.
2	//!
3	//! time: `O((N+M)D)`*
4	//! space `O(N+M)`*
5	//!
6	//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
7	//! describing it.
8	//!
9	//! The implementation of this algorithm is based on the implementation by
10	//! Brandon Williams.
11	//!
12	//! # Heuristics
13	//!
14	//! At present this implementation of Myers' does not implement any more advanced
15	//! heuristics that would solve some pathological cases. For instance passing two
16	//! large and completely distinct sequences to the algorithm will make it spin
17	//! without making reasonable progress. Currently the only protection in the
18	//! library against this is to pass a deadline to the diffing algorithm.
19	//!
20	//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
21
22	use std::ops::{Index, IndexMut, Range};
23	use std::time::Instant;
24
25	use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range};
26	use crate::algorithms::DiffHook;
27
28	/// Myers' diff algorithm.
29	///
30	/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
31	pub fn diff<Old, New, D>(
32	d: &mut D,
33	old: &Old,
34	old_range: Range<usize>,
35	new: &New,
36	new_range: Range<usize>,
37	) -> Result<(), D::Error>
38	where
39	Old: Index<usize> + ?Sized,
40	New: Index<usize> + ?Sized,
41	D: DiffHook,
42	New::Output: PartialEq<Old::Output>,
43	{
44	diff_deadline(d, old, old_range, new, new_range, deadline:None)
45	}
46
47	/// Myers' diff algorithm with deadline.
48	///
49	/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
50	///
51	/// This diff is done with an optional deadline that defines the maximal
52	/// execution time permitted before it bails and falls back to an approximation.
53	pub fn diff_deadline<Old, New, D>(
54	d: &mut D,
55	old: &Old,
56	old_range: Range<usize>,
57	new: &New,
58	new_range: Range<usize>,
59	deadline: Option<Instant>,
60	) -> Result<(), D::Error>
61	where
62	Old: Index<usize> + ?Sized,
63	New: Index<usize> + ?Sized,
64	D: DiffHook,
65	New::Output: PartialEq<Old::Output>,
66	{
67	let max_d: usize = max_d(len1:old_range.len(), len2:new_range.len());
68	let mut vb: V = V::new(max_d);
69	let mut vf: V = V::new(max_d);
70	conquer(
71	d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
72	)?;
73	d.finish()
74	}
75
76	// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
77	// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
78	// and then a possibly empty sequence of diagonal edges called a snake.
79
80	/// `V` contains the endpoints of the furthest reaching `D-paths`. For each
81	/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
82	/// `y` can be computed from `x - k`. In other words, `V` is an array of integers
83	/// where `V[k]` contains the row index of the endpoint of the furthest reaching
84	/// path in diagonal `k`.
85	///
86	/// We can't use a traditional Vec to represent `V` since we use `k` as an index
87	/// and it can take on negative values. So instead `V` is represented as a
88	/// light-weight wrapper around a Vec plus an `offset` which is the maximum value
89	/// `k` can take on in order to map negative `k`'s back to a value >= 0.
90	#[derive(Debug)]
91	struct V {
92	offset: isize,
93	v: Vec<usize>, // Look into initializing this to -1 and storing isize
94	}
95
96	impl V {
97	fn new(max_d: usize) -> Self {
98	Self {
99	offset: max_d as isize,
100	v: vec![`0`; `2` * max_d],
101	}
102	}
103
104	fn len(&self) -> usize {
105	self.v.len()
106	}
107	}
108
109	impl Index<isize> for V {
110	type Output = usize;
111
112	fn index(&self, index: isize) -> &Self::Output {
113	&self.v[(index + self.offset) as usize]
114	}
115	}
116
117	impl IndexMut<isize> for V {
118	fn index_mut(&mut self, index: isize) -> &mut Self::Output {
119	&mut self.v[(index + self.offset) as usize]
120	}
121	}
122
123	fn max_d(len1: usize, len2: usize) -> usize {
124	// XXX look into reducing the need to have the additional '+ 1'
125	(len1 + len2 + `1`) / `2` + `1`
126	}
127
128	#[inline(always)]
129	fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
130	(range.start..at, at..range.end)
131	}
132
133	/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
134	/// a snake has a start end end point (and it is possible for a snake to have
135	/// a length of zero, meaning the start and end points are the same) however
136	/// we do not need the end point which is why it's not implemented here.
137	///
138	/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
139	/// some of which may be empty. The divide step requires finding the ceil(D/2) +
140	/// 1 or middle snake of an optimal D-path. The idea for doing so is to
141	/// simultaneously run the basic algorithm in both the forward and reverse
142	/// directions until furthest reaching forward and reverse paths starting at
143	/// opposing corners 'overlap'.
144	fn find_middle_snake<Old, New>(
145	old: &Old,
146	old_range: Range<usize>,
147	new: &New,
148	new_range: Range<usize>,
149	vf: &mut V,
150	vb: &mut V,
151	deadline: Option<Instant>,
152	) -> Option<(usize, usize)>
153	where
154	Old: Index<usize> + ?Sized,
155	New: Index<usize> + ?Sized,
156	New::Output: PartialEq<Old::Output>,
157	{
158	let n = old_range.len();
159	let m = new_range.len();
160
161	// By Lemma 1 in the paper, the optimal edit script length is odd or even as
162	// `delta` is odd or even.
163	let delta = n as isize - m as isize;
164	let odd = delta & `1` == `1`;
165
166	// The initial point at (0, -1)
167	vf[`1`] = `0`;
168	// The initial point at (N, M+1)
169	vb[`1`] = `0`;
170
171	// We only need to explore ceil(D/2) + 1
172	let d_max = max_d(n, m);
173	assert!(vf.len() >= d_max);
174	assert!(vb.len() >= d_max);
175
176	for d in `0`..d_max as isize {
177	// are we running for too long?
178	if let Some(deadline) = deadline {
179	if Instant::now() > deadline {
180	break;
181	}
182	}
183
184	// Forward path
185	for k in (-d..=d).rev().step_by(`2`) {
186	let mut x = if k == -d \|\| (k != d && vf[k - `1`] < vf[k + `1`]) {
187	vf[k + `1`]
188	} else {
189	vf[k - `1`] + `1`
190	};
191	let y = (x as isize - k) as usize;
192
193	// The coordinate of the start of a snake
194	let (x0, y0) = (x, y);
195	// While these sequences are identical, keep moving through the
196	// graph with no cost
197	if x < old_range.len() && y < new_range.len() {
198	let advance = common_prefix_len(
199	old,
200	old_range.start + x..old_range.end,
201	new,
202	new_range.start + y..new_range.end,
203	);
204	x += advance;
205	}
206
207	// This is the new best x value
208	vf[k] = x;
209
210	// Only check for connections from the forward search when N - M is
211	// odd and when there is a reciprocal k line coming from the other
212	// direction.
213	if odd && (k - delta).abs() <= (d - `1`) {
214	// TODO optimize this so we don't have to compare against n
215	if vf[k] + vb[-(k - delta)] >= n {
216	// Return the snake
217	return Some((x0 + old_range.start, y0 + new_range.start));
218	}
219	}
220	}
221
222	// Backward path
223	for k in (-d..=d).rev().step_by(`2`) {
224	let mut x = if k == -d \|\| (k != d && vb[k - `1`] < vb[k + `1`]) {
225	vb[k + `1`]
226	} else {
227	vb[k - `1`] + `1`
228	};
229	let mut y = (x as isize - k) as usize;
230
231	// The coordinate of the start of a snake
232	if x < n && y < m {
233	let advance = common_suffix_len(
234	old,
235	old_range.start..old_range.start + n - x,
236	new,
237	new_range.start..new_range.start + m - y,
238	);
239	x += advance;
240	y += advance;
241	}
242
243	// This is the new best x value
244	vb[k] = x;
245
246	if !odd && (k - delta).abs() <= d {
247	// TODO optimize this so we don't have to compare against n
248	if vb[k] + vf[-(k - delta)] >= n {
249	// Return the snake
250	return Some((n - x + old_range.start, m - y + new_range.start));
251	}
252	}
253	}
254
255	// TODO: Maybe there's an opportunity to optimize and bail early?
256	}
257
258	// deadline reached
259	None
260	}
261
262	#[allow(clippy::too_many_arguments)]
263	fn conquer<Old, New, D>(
264	d: &mut D,
265	old: &Old,
266	mut old_range: Range<usize>,
267	new: &New,
268	mut new_range: Range<usize>,
269	vf: &mut V,
270	vb: &mut V,
271	deadline: Option<Instant>,
272	) -> Result<(), D::Error>
273	where
274	Old: Index<usize> + ?Sized,
275	New: Index<usize> + ?Sized,
276	D: DiffHook,
277	New::Output: PartialEq<Old::Output>,
278	{
279	// Check for common prefix
280	let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
281	if common_prefix_len > `0` {
282	d.equal(old_range.start, new_range.start, common_prefix_len)?;
283	}
284	old_range.start += common_prefix_len;
285	new_range.start += common_prefix_len;
286
287	// Check for common suffix
288	let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
289	let common_suffix = (
290	old_range.end - common_suffix_len,
291	new_range.end - common_suffix_len,
292	);
293	old_range.end -= common_suffix_len;
294	new_range.end -= common_suffix_len;
295
296	if is_empty_range(&old_range) && is_empty_range(&new_range) {
297	// Do nothing
298	} else if is_empty_range(&new_range) {
299	d.delete(old_range.start, old_range.len(), new_range.start)?;
300	} else if is_empty_range(&old_range) {
301	d.insert(old_range.start, new_range.start, new_range.len())?;
302	} else if let Some((x_start, y_start)) = find_middle_snake(
303	old,
304	old_range.clone(),
305	new,
306	new_range.clone(),
307	vf,
308	vb,
309	deadline,
310	) {
311	let (old_a, old_b) = split_at(old_range, x_start);
312	let (new_a, new_b) = split_at(new_range, y_start);
313	conquer(d, old, old_a, new, new_a, vf, vb, deadline)?;
314	conquer(d, old, old_b, new, new_b, vf, vb, deadline)?;
315	} else {
316	d.delete(
317	old_range.start,
318	old_range.end - old_range.start,
319	new_range.start,
320	)?;
321	d.insert(
322	old_range.start,
323	new_range.start,
324	new_range.end - new_range.start,
325	)?;
326	}
327
328	if common_suffix_len > `0` {
329	d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?;
330	}
331
332	Ok(())
333	}
334
335	#[test]
336	fn test_find_middle_snake() {
337	let a = &b"ABCABBA"[..];
338	let b = &b"CBABAC"[..];
339	let max_d = max_d(a.len(), b.len());
340	let mut vf = V::new(max_d);
341	let mut vb = V::new(max_d);
342	let (x_start, y_start) =
343	find_middle_snake(a, `0`..a.len(), b, `0`..b.len(), &mut vf, &mut vb, None).unwrap();
344	assert_eq!(x_start, `4`);
345	assert_eq!(y_start, `1`);
346	}
347
348	#[test]
349	fn test_diff() {
350	let a: &[usize] = &[`0`, `1`, `2`, `3`, `4`];
351	let b: &[usize] = &[`0`, `1`, `2`, `9`, `4`];
352
353	let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
354	diff(&mut d, a, `0`..a.len(), b, `0`..b.len()).unwrap();
355	insta::assert_debug_snapshot!(d.into_inner().ops());
356	}
357
358	#[test]
359	fn test_contiguous() {
360	let a: &[usize] = &[`0`, `1`, `2`, `3`, `4`, `4`, `4`, `5`];
361	let b: &[usize] = &[`0`, `1`, `2`, `8`, `9`, `4`, `4`, `7`];
362
363	let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
364	diff(&mut d, a, `0`..a.len(), b, `0`..b.len()).unwrap();
365	insta::assert_debug_snapshot!(d.into_inner().ops());
366	}
367
368	#[test]
369	fn test_pat() {
370	let a: &[usize] = &[`0`, `1`, `3`, `4`, `5`];
371	let b: &[usize] = &[`0`, `1`, `4`, `5`, `8`, `9`];
372
373	let mut d = crate::algorithms::Capture::new();
374	diff(&mut d, a, `0`..a.len(), b, `0`..b.len()).unwrap();
375	insta::assert_debug_snapshot!(d.ops());
376	}
377
378	#[test]
379	fn test_deadline_reached() {
380	use std::ops::Index;
381	use std::time::Duration;
382
383	let a = (`0`..`100`).collect::<Vec<_>>();
384	let mut b = (`0`..`100`).collect::<Vec<_>>();
385	b[`10`] = `99`;
386	b[`50`] = `99`;
387	b[`25`] = `99`;
388
389	struct SlowIndex<'a>(&'a [usize]);
390
391	impl<'a> Index<usize> for SlowIndex<'a> {
392	type Output = usize;
393
394	fn index(&self, index: usize) -> &Self::Output {
395	std::thread::sleep(Duration::from_millis(`1`));
396	&self.0[index]
397	}
398	}
399
400	let slow_a = SlowIndex(&a);
401	let slow_b = SlowIndex(&b);
402
403	// don't give it enough time to do anything interesting
404	let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
405	diff_deadline(
406	&mut d,
407	&slow_a,
408	`0`..a.len(),
409	&slow_b,
410	`0`..b.len(),
411	Some(Instant::now() + Duration::from_millis(`50`)),
412	)
413	.unwrap();
414	insta::assert_debug_snapshot!(d.into_inner().ops());
415	}
416
417	#[test]
418	fn test_finish_called() {
419	struct HasRunFinish(bool);
420
421	impl DiffHook for HasRunFinish {
422	type Error = ();
423	fn finish(&mut self) -> Result<(), Self::Error> {
424	self.0 = `true`;
425	Ok(())
426	}
427	}
428
429	let mut d = HasRunFinish(`false`);
430	let slice = &[`1`, `2`];
431	let slice2 = &[`1`, `2`, `3`];
432	diff(&mut d, slice, `0`..slice.len(), slice2, `0`..slice2.len()).unwrap();
433	assert!(d.`0`);
434
435	let mut d = HasRunFinish(`false`);
436	let slice = &[`1`, `2`];
437	diff(&mut d, slice, `0`..slice.len(), slice, `0`..slice.len()).unwrap();
438	assert!(d.`0`);
439
440	let mut d = HasRunFinish(`false`);
441	let slice: &[u8] = &[];
442	diff(&mut d, slice, `0`..slice.len(), slice, `0`..slice.len()).unwrap();
443	assert!(d.`0`);
444	}
445