| 1 | //! Myers' diff algorithm. |
| 2 | //! |
| 3 | //! * time: `O((N+M)D)` |
| 4 | //! * space `O(N+M)` |
| 5 | //! |
| 6 | //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf) |
| 7 | //! describing it. |
| 8 | //! |
| 9 | //! The implementation of this algorithm is based on the implementation by |
| 10 | //! Brandon Williams. |
| 11 | //! |
| 12 | //! # Heuristics |
| 13 | //! |
| 14 | //! At present this implementation of Myers' does not implement any more advanced |
| 15 | //! heuristics that would solve some pathological cases. For instance passing two |
| 16 | //! large and completely distinct sequences to the algorithm will make it spin |
| 17 | //! without making reasonable progress. Currently the only protection in the |
| 18 | //! library against this is to pass a deadline to the diffing algorithm. |
| 19 | //! |
| 20 | //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). |
| 21 | |
| 22 | use std::ops::{Index, IndexMut, Range}; |
| 23 | use std::time::Instant; |
| 24 | |
| 25 | use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range}; |
| 26 | use crate::algorithms::DiffHook; |
| 27 | |
| 28 | /// Myers' diff algorithm. |
| 29 | /// |
| 30 | /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. |
| 31 | pub fn diff<Old, New, D>( |
| 32 | d: &mut D, |
| 33 | old: &Old, |
| 34 | old_range: Range<usize>, |
| 35 | new: &New, |
| 36 | new_range: Range<usize>, |
| 37 | ) -> Result<(), D::Error> |
| 38 | where |
| 39 | Old: Index<usize> + ?Sized, |
| 40 | New: Index<usize> + ?Sized, |
| 41 | D: DiffHook, |
| 42 | New::Output: PartialEq<Old::Output>, |
| 43 | { |
| 44 | diff_deadline(d, old, old_range, new, new_range, deadline:None) |
| 45 | } |
| 46 | |
| 47 | /// Myers' diff algorithm with deadline. |
| 48 | /// |
| 49 | /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. |
| 50 | /// |
| 51 | /// This diff is done with an optional deadline that defines the maximal |
| 52 | /// execution time permitted before it bails and falls back to an approximation. |
| 53 | pub fn diff_deadline<Old, New, D>( |
| 54 | d: &mut D, |
| 55 | old: &Old, |
| 56 | old_range: Range<usize>, |
| 57 | new: &New, |
| 58 | new_range: Range<usize>, |
| 59 | deadline: Option<Instant>, |
| 60 | ) -> Result<(), D::Error> |
| 61 | where |
| 62 | Old: Index<usize> + ?Sized, |
| 63 | New: Index<usize> + ?Sized, |
| 64 | D: DiffHook, |
| 65 | New::Output: PartialEq<Old::Output>, |
| 66 | { |
| 67 | let max_d: usize = max_d(len1:old_range.len(), len2:new_range.len()); |
| 68 | let mut vb: V = V::new(max_d); |
| 69 | let mut vf: V = V::new(max_d); |
| 70 | conquer( |
| 71 | d, old, old_range, new, new_range, &mut vf, &mut vb, deadline, |
| 72 | )?; |
| 73 | d.finish() |
| 74 | } |
| 75 | |
| 76 | // A D-path is a path which starts at (0,0) that has exactly D non-diagonal |
| 77 | // edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge |
| 78 | // and then a possibly empty sequence of diagonal edges called a snake. |
| 79 | |
| 80 | /// `V` contains the endpoints of the furthest reaching `D-paths`. For each |
| 81 | /// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because |
| 82 | /// `y` can be computed from `x - k`. In other words, `V` is an array of integers |
| 83 | /// where `V[k]` contains the row index of the endpoint of the furthest reaching |
| 84 | /// path in diagonal `k`. |
| 85 | /// |
| 86 | /// We can't use a traditional Vec to represent `V` since we use `k` as an index |
| 87 | /// and it can take on negative values. So instead `V` is represented as a |
| 88 | /// light-weight wrapper around a Vec plus an `offset` which is the maximum value |
| 89 | /// `k` can take on in order to map negative `k`'s back to a value >= 0. |
| 90 | #[derive (Debug)] |
| 91 | struct V { |
| 92 | offset: isize, |
| 93 | v: Vec<usize>, // Look into initializing this to -1 and storing isize |
| 94 | } |
| 95 | |
| 96 | impl V { |
| 97 | fn new(max_d: usize) -> Self { |
| 98 | Self { |
| 99 | offset: max_d as isize, |
| 100 | v: vec![0; 2 * max_d], |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | fn len(&self) -> usize { |
| 105 | self.v.len() |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | impl Index<isize> for V { |
| 110 | type Output = usize; |
| 111 | |
| 112 | fn index(&self, index: isize) -> &Self::Output { |
| 113 | &self.v[(index + self.offset) as usize] |
| 114 | } |
| 115 | } |
| 116 | |
| 117 | impl IndexMut<isize> for V { |
| 118 | fn index_mut(&mut self, index: isize) -> &mut Self::Output { |
| 119 | &mut self.v[(index + self.offset) as usize] |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | fn max_d(len1: usize, len2: usize) -> usize { |
| 124 | // XXX look into reducing the need to have the additional '+ 1' |
| 125 | (len1 + len2 + 1) / 2 + 1 |
| 126 | } |
| 127 | |
| 128 | #[inline (always)] |
| 129 | fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) { |
| 130 | (range.start..at, at..range.end) |
| 131 | } |
| 132 | |
| 133 | /// A `Snake` is a sequence of diagonal edges in the edit graph. Normally |
| 134 | /// a snake has a start end end point (and it is possible for a snake to have |
| 135 | /// a length of zero, meaning the start and end points are the same) however |
| 136 | /// we do not need the end point which is why it's not implemented here. |
| 137 | /// |
| 138 | /// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes |
| 139 | /// some of which may be empty. The divide step requires finding the ceil(D/2) + |
| 140 | /// 1 or middle snake of an optimal D-path. The idea for doing so is to |
| 141 | /// simultaneously run the basic algorithm in both the forward and reverse |
| 142 | /// directions until furthest reaching forward and reverse paths starting at |
| 143 | /// opposing corners 'overlap'. |
| 144 | fn find_middle_snake<Old, New>( |
| 145 | old: &Old, |
| 146 | old_range: Range<usize>, |
| 147 | new: &New, |
| 148 | new_range: Range<usize>, |
| 149 | vf: &mut V, |
| 150 | vb: &mut V, |
| 151 | deadline: Option<Instant>, |
| 152 | ) -> Option<(usize, usize)> |
| 153 | where |
| 154 | Old: Index<usize> + ?Sized, |
| 155 | New: Index<usize> + ?Sized, |
| 156 | New::Output: PartialEq<Old::Output>, |
| 157 | { |
| 158 | let n = old_range.len(); |
| 159 | let m = new_range.len(); |
| 160 | |
| 161 | // By Lemma 1 in the paper, the optimal edit script length is odd or even as |
| 162 | // `delta` is odd or even. |
| 163 | let delta = n as isize - m as isize; |
| 164 | let odd = delta & 1 == 1; |
| 165 | |
| 166 | // The initial point at (0, -1) |
| 167 | vf[1] = 0; |
| 168 | // The initial point at (N, M+1) |
| 169 | vb[1] = 0; |
| 170 | |
| 171 | // We only need to explore ceil(D/2) + 1 |
| 172 | let d_max = max_d(n, m); |
| 173 | assert!(vf.len() >= d_max); |
| 174 | assert!(vb.len() >= d_max); |
| 175 | |
| 176 | for d in 0..d_max as isize { |
| 177 | // are we running for too long? |
| 178 | if let Some(deadline) = deadline { |
| 179 | if Instant::now() > deadline { |
| 180 | break; |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | // Forward path |
| 185 | for k in (-d..=d).rev().step_by(2) { |
| 186 | let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { |
| 187 | vf[k + 1] |
| 188 | } else { |
| 189 | vf[k - 1] + 1 |
| 190 | }; |
| 191 | let y = (x as isize - k) as usize; |
| 192 | |
| 193 | // The coordinate of the start of a snake |
| 194 | let (x0, y0) = (x, y); |
| 195 | // While these sequences are identical, keep moving through the |
| 196 | // graph with no cost |
| 197 | if x < old_range.len() && y < new_range.len() { |
| 198 | let advance = common_prefix_len( |
| 199 | old, |
| 200 | old_range.start + x..old_range.end, |
| 201 | new, |
| 202 | new_range.start + y..new_range.end, |
| 203 | ); |
| 204 | x += advance; |
| 205 | } |
| 206 | |
| 207 | // This is the new best x value |
| 208 | vf[k] = x; |
| 209 | |
| 210 | // Only check for connections from the forward search when N - M is |
| 211 | // odd and when there is a reciprocal k line coming from the other |
| 212 | // direction. |
| 213 | if odd && (k - delta).abs() <= (d - 1) { |
| 214 | // TODO optimize this so we don't have to compare against n |
| 215 | if vf[k] + vb[-(k - delta)] >= n { |
| 216 | // Return the snake |
| 217 | return Some((x0 + old_range.start, y0 + new_range.start)); |
| 218 | } |
| 219 | } |
| 220 | } |
| 221 | |
| 222 | // Backward path |
| 223 | for k in (-d..=d).rev().step_by(2) { |
| 224 | let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) { |
| 225 | vb[k + 1] |
| 226 | } else { |
| 227 | vb[k - 1] + 1 |
| 228 | }; |
| 229 | let mut y = (x as isize - k) as usize; |
| 230 | |
| 231 | // The coordinate of the start of a snake |
| 232 | if x < n && y < m { |
| 233 | let advance = common_suffix_len( |
| 234 | old, |
| 235 | old_range.start..old_range.start + n - x, |
| 236 | new, |
| 237 | new_range.start..new_range.start + m - y, |
| 238 | ); |
| 239 | x += advance; |
| 240 | y += advance; |
| 241 | } |
| 242 | |
| 243 | // This is the new best x value |
| 244 | vb[k] = x; |
| 245 | |
| 246 | if !odd && (k - delta).abs() <= d { |
| 247 | // TODO optimize this so we don't have to compare against n |
| 248 | if vb[k] + vf[-(k - delta)] >= n { |
| 249 | // Return the snake |
| 250 | return Some((n - x + old_range.start, m - y + new_range.start)); |
| 251 | } |
| 252 | } |
| 253 | } |
| 254 | |
| 255 | // TODO: Maybe there's an opportunity to optimize and bail early? |
| 256 | } |
| 257 | |
| 258 | // deadline reached |
| 259 | None |
| 260 | } |
| 261 | |
| 262 | #[allow (clippy::too_many_arguments)] |
| 263 | fn conquer<Old, New, D>( |
| 264 | d: &mut D, |
| 265 | old: &Old, |
| 266 | mut old_range: Range<usize>, |
| 267 | new: &New, |
| 268 | mut new_range: Range<usize>, |
| 269 | vf: &mut V, |
| 270 | vb: &mut V, |
| 271 | deadline: Option<Instant>, |
| 272 | ) -> Result<(), D::Error> |
| 273 | where |
| 274 | Old: Index<usize> + ?Sized, |
| 275 | New: Index<usize> + ?Sized, |
| 276 | D: DiffHook, |
| 277 | New::Output: PartialEq<Old::Output>, |
| 278 | { |
| 279 | // Check for common prefix |
| 280 | let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); |
| 281 | if common_prefix_len > 0 { |
| 282 | d.equal(old_range.start, new_range.start, common_prefix_len)?; |
| 283 | } |
| 284 | old_range.start += common_prefix_len; |
| 285 | new_range.start += common_prefix_len; |
| 286 | |
| 287 | // Check for common suffix |
| 288 | let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone()); |
| 289 | let common_suffix = ( |
| 290 | old_range.end - common_suffix_len, |
| 291 | new_range.end - common_suffix_len, |
| 292 | ); |
| 293 | old_range.end -= common_suffix_len; |
| 294 | new_range.end -= common_suffix_len; |
| 295 | |
| 296 | if is_empty_range(&old_range) && is_empty_range(&new_range) { |
| 297 | // Do nothing |
| 298 | } else if is_empty_range(&new_range) { |
| 299 | d.delete(old_range.start, old_range.len(), new_range.start)?; |
| 300 | } else if is_empty_range(&old_range) { |
| 301 | d.insert(old_range.start, new_range.start, new_range.len())?; |
| 302 | } else if let Some((x_start, y_start)) = find_middle_snake( |
| 303 | old, |
| 304 | old_range.clone(), |
| 305 | new, |
| 306 | new_range.clone(), |
| 307 | vf, |
| 308 | vb, |
| 309 | deadline, |
| 310 | ) { |
| 311 | let (old_a, old_b) = split_at(old_range, x_start); |
| 312 | let (new_a, new_b) = split_at(new_range, y_start); |
| 313 | conquer(d, old, old_a, new, new_a, vf, vb, deadline)?; |
| 314 | conquer(d, old, old_b, new, new_b, vf, vb, deadline)?; |
| 315 | } else { |
| 316 | d.delete( |
| 317 | old_range.start, |
| 318 | old_range.end - old_range.start, |
| 319 | new_range.start, |
| 320 | )?; |
| 321 | d.insert( |
| 322 | old_range.start, |
| 323 | new_range.start, |
| 324 | new_range.end - new_range.start, |
| 325 | )?; |
| 326 | } |
| 327 | |
| 328 | if common_suffix_len > 0 { |
| 329 | d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?; |
| 330 | } |
| 331 | |
| 332 | Ok(()) |
| 333 | } |
| 334 | |
| 335 | #[test ] |
| 336 | fn test_find_middle_snake() { |
| 337 | let a = &b"ABCABBA" [..]; |
| 338 | let b = &b"CBABAC" [..]; |
| 339 | let max_d = max_d(a.len(), b.len()); |
| 340 | let mut vf = V::new(max_d); |
| 341 | let mut vb = V::new(max_d); |
| 342 | let (x_start, y_start) = |
| 343 | find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap(); |
| 344 | assert_eq!(x_start, 4); |
| 345 | assert_eq!(y_start, 1); |
| 346 | } |
| 347 | |
| 348 | #[test ] |
| 349 | fn test_diff() { |
| 350 | let a: &[usize] = &[0, 1, 2, 3, 4]; |
| 351 | let b: &[usize] = &[0, 1, 2, 9, 4]; |
| 352 | |
| 353 | let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); |
| 354 | diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); |
| 355 | insta::assert_debug_snapshot!(d.into_inner().ops()); |
| 356 | } |
| 357 | |
| 358 | #[test ] |
| 359 | fn test_contiguous() { |
| 360 | let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5]; |
| 361 | let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; |
| 362 | |
| 363 | let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); |
| 364 | diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); |
| 365 | insta::assert_debug_snapshot!(d.into_inner().ops()); |
| 366 | } |
| 367 | |
| 368 | #[test ] |
| 369 | fn test_pat() { |
| 370 | let a: &[usize] = &[0, 1, 3, 4, 5]; |
| 371 | let b: &[usize] = &[0, 1, 4, 5, 8, 9]; |
| 372 | |
| 373 | let mut d = crate::algorithms::Capture::new(); |
| 374 | diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); |
| 375 | insta::assert_debug_snapshot!(d.ops()); |
| 376 | } |
| 377 | |
| 378 | #[test ] |
| 379 | fn test_deadline_reached() { |
| 380 | use std::ops::Index; |
| 381 | use std::time::Duration; |
| 382 | |
| 383 | let a = (0..100).collect::<Vec<_>>(); |
| 384 | let mut b = (0..100).collect::<Vec<_>>(); |
| 385 | b[10] = 99; |
| 386 | b[50] = 99; |
| 387 | b[25] = 99; |
| 388 | |
| 389 | struct SlowIndex<'a>(&'a [usize]); |
| 390 | |
| 391 | impl<'a> Index<usize> for SlowIndex<'a> { |
| 392 | type Output = usize; |
| 393 | |
| 394 | fn index(&self, index: usize) -> &Self::Output { |
| 395 | std::thread::sleep(Duration::from_millis(1)); |
| 396 | &self.0[index] |
| 397 | } |
| 398 | } |
| 399 | |
| 400 | let slow_a = SlowIndex(&a); |
| 401 | let slow_b = SlowIndex(&b); |
| 402 | |
| 403 | // don't give it enough time to do anything interesting |
| 404 | let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); |
| 405 | diff_deadline( |
| 406 | &mut d, |
| 407 | &slow_a, |
| 408 | 0..a.len(), |
| 409 | &slow_b, |
| 410 | 0..b.len(), |
| 411 | Some(Instant::now() + Duration::from_millis(50)), |
| 412 | ) |
| 413 | .unwrap(); |
| 414 | insta::assert_debug_snapshot!(d.into_inner().ops()); |
| 415 | } |
| 416 | |
| 417 | #[test ] |
| 418 | fn test_finish_called() { |
| 419 | struct HasRunFinish(bool); |
| 420 | |
| 421 | impl DiffHook for HasRunFinish { |
| 422 | type Error = (); |
| 423 | fn finish(&mut self) -> Result<(), Self::Error> { |
| 424 | self.0 = true; |
| 425 | Ok(()) |
| 426 | } |
| 427 | } |
| 428 | |
| 429 | let mut d = HasRunFinish(false); |
| 430 | let slice = &[1, 2]; |
| 431 | let slice2 = &[1, 2, 3]; |
| 432 | diff(&mut d, slice, 0..slice.len(), slice2, 0..slice2.len()).unwrap(); |
| 433 | assert!(d.0); |
| 434 | |
| 435 | let mut d = HasRunFinish(false); |
| 436 | let slice = &[1, 2]; |
| 437 | diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap(); |
| 438 | assert!(d.0); |
| 439 | |
| 440 | let mut d = HasRunFinish(false); |
| 441 | let slice: &[u8] = &[]; |
| 442 | diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap(); |
| 443 | assert!(d.0); |
| 444 | } |
| 445 | |