| 1 | //! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar) |
| 2 | //! |
| 3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github |
| 4 | //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust |
| 5 | //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs |
| 6 | //! |
| 7 | //! <br> |
| 8 | //! |
| 9 | //! ## Diff library with semantic cleanup, based on Google's diff-match-patch |
| 10 | //! |
| 11 | //! This library is a port of the Diff component of [Diff Match Patch] to Rust. |
| 12 | //! The diff implementation is based on [Myers' diff algorithm] but includes |
| 13 | //! some [semantic cleanups] to increase human readability by factoring out |
| 14 | //! commonalities which are likely to be coincidental. |
| 15 | //! |
| 16 | //! Diff Match Patch was originally built in 2006 to power Google Docs. |
| 17 | //! |
| 18 | //! # Interface |
| 19 | //! |
| 20 | //! Here is the entire API of the Rust implementation. It operates on borrowed |
| 21 | //! strings and the return value of the diff algorithm is a vector of chunks |
| 22 | //! pointing into slices of those input strings. |
| 23 | //! |
| 24 | //! ``` |
| 25 | //! pub enum Chunk<'a> { |
| 26 | //! Equal(&'a str), |
| 27 | //! Delete(&'a str), |
| 28 | //! Insert(&'a str), |
| 29 | //! } |
| 30 | //! |
| 31 | //! # const IGNORE: &str = stringify! { |
| 32 | //! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>; |
| 33 | //! # }; |
| 34 | //! ``` |
| 35 | //! |
| 36 | //! [Diff Match Patch]: https://github.com/google/diff-match-patch |
| 37 | //! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf |
| 38 | //! [semantic cleanups]: https://neil.fraser.name/writing/diff/ |
| 39 | |
| 40 | #![doc (html_root_url = "https://docs.rs/dissimilar/1.0.10" )] |
| 41 | #![allow ( |
| 42 | clippy::blocks_in_conditions, |
| 43 | clippy::bool_to_int_with_if, |
| 44 | clippy::cast_possible_wrap, |
| 45 | clippy::cast_sign_loss, |
| 46 | clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127 |
| 47 | clippy::collapsible_else_if, |
| 48 | clippy::comparison_chain, |
| 49 | clippy::implied_bounds_in_impls, |
| 50 | clippy::items_after_test_module, // https://github.com/rust-lang/rust-clippy/issues/10713 |
| 51 | clippy::let_underscore_untyped, |
| 52 | clippy::match_same_arms, |
| 53 | clippy::module_name_repetitions, |
| 54 | clippy::must_use_candidate, |
| 55 | clippy::new_without_default, |
| 56 | clippy::octal_escapes, |
| 57 | clippy::shadow_unrelated, |
| 58 | clippy::similar_names, |
| 59 | clippy::too_many_lines, |
| 60 | clippy::unseparated_literal_suffix, |
| 61 | unused_parens, // false positive on Some(&(mut diff)) pattern |
| 62 | )] |
| 63 | |
| 64 | mod find; |
| 65 | mod range; |
| 66 | |
| 67 | #[cfg (test)] |
| 68 | mod tests; |
| 69 | |
| 70 | use crate::range::{slice, Range}; |
| 71 | use std::cmp; |
| 72 | use std::collections::VecDeque; |
| 73 | use std::fmt::{self, Debug, Display, Write}; |
| 74 | |
/// One contiguous piece of a diff result.
///
/// Each variant borrows a slice of one of the strings passed to [`diff`].
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum Chunk<'a> {
    /// Text common to both inputs (sliced from the first input).
    Equal(&'a str),
    /// Text that appears only in the first input.
    Delete(&'a str),
    /// Text that appears only in the second input.
    Insert(&'a str),
}
| 81 | |
| 82 | #[derive (Copy, Clone)] |
| 83 | enum Diff<'a, 'b> { |
| 84 | Equal(Range<'a>, Range<'b>), |
| 85 | Delete(Range<'a>), |
| 86 | Insert(Range<'b>), |
| 87 | } |
| 88 | |
| 89 | impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> { |
| 90 | fn text(&self) -> Range<'tmp> { |
| 91 | match *self { |
| 92 | Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range, |
| 93 | } |
| 94 | } |
| 95 | |
| 96 | fn grow_left(&mut self, increment: usize) { |
| 97 | self.for_each(|range| { |
| 98 | range.offset -= increment; |
| 99 | range.len += increment; |
| 100 | }); |
| 101 | } |
| 102 | |
| 103 | fn grow_right(&mut self, increment: usize) { |
| 104 | self.for_each(|range| range.len += increment); |
| 105 | } |
| 106 | |
| 107 | fn shift_left(&mut self, increment: usize) { |
| 108 | self.for_each(|range| range.offset -= increment); |
| 109 | } |
| 110 | |
| 111 | fn shift_right(&mut self, increment: usize) { |
| 112 | self.for_each(|range| range.offset += increment); |
| 113 | } |
| 114 | |
| 115 | fn for_each(&mut self, f: impl Fn(&mut Range)) { |
| 116 | match self { |
| 117 | Diff::Equal(range1, range2) => { |
| 118 | f(range1); |
| 119 | f(range2); |
| 120 | } |
| 121 | Diff::Delete(range) => f(range), |
| 122 | Diff::Insert(range) => f(range), |
| 123 | } |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> { |
| 128 | let chars1: Vec<char> = text1.chars().collect(); |
| 129 | let chars2: Vec<char> = text2.chars().collect(); |
| 130 | let range1 = Range::new(&chars1, ..); |
| 131 | let range2 = Range::new(&chars2, ..); |
| 132 | |
| 133 | let mut solution = main(range1, range2); |
| 134 | cleanup_char_boundary(&mut solution); |
| 135 | cleanup_semantic(&mut solution); |
| 136 | cleanup_merge(&mut solution); |
| 137 | |
| 138 | let mut chunks = Vec::new(); |
| 139 | let mut pos1 = 0; |
| 140 | let mut pos2 = 0; |
| 141 | for diff in solution.diffs { |
| 142 | chunks.push(match diff { |
| 143 | Diff::Equal(range, _) => { |
| 144 | let len = range.len_bytes(); |
| 145 | let chunk = Chunk::Equal(&text1[pos1..pos1 + len]); |
| 146 | pos1 += len; |
| 147 | pos2 += len; |
| 148 | chunk |
| 149 | } |
| 150 | Diff::Delete(range) => { |
| 151 | let len = range.len_bytes(); |
| 152 | let chunk = Chunk::Delete(&text1[pos1..pos1 + len]); |
| 153 | pos1 += len; |
| 154 | chunk |
| 155 | } |
| 156 | Diff::Insert(range) => { |
| 157 | let len = range.len_bytes(); |
| 158 | let chunk = Chunk::Insert(&text2[pos2..pos2 + len]); |
| 159 | pos2 += len; |
| 160 | chunk |
| 161 | } |
| 162 | }); |
| 163 | } |
| 164 | chunks |
| 165 | } |
| 166 | |
// A diff under construction: the two complete input ranges together with the
// list of edits computed for them.
struct Solution<'a, 'b> {
    // The whole first input, as originally handed to `main`.
    text1: Range<'a>,
    // The whole second input, as originally handed to `main`.
    text2: Range<'b>,
    // The edits, in order.
    diffs: Vec<Diff<'a, 'b>>,
}
| 172 | |
| 173 | fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> { |
| 174 | let whole1 = text1; |
| 175 | let whole2 = text2; |
| 176 | |
| 177 | // Trim off common prefix. |
| 178 | let common_prefix_len = common_prefix(text1, text2); |
| 179 | let common_prefix = Diff::Equal( |
| 180 | text1.substring(..common_prefix_len), |
| 181 | text2.substring(..common_prefix_len), |
| 182 | ); |
| 183 | text1 = text1.substring(common_prefix_len..); |
| 184 | text2 = text2.substring(common_prefix_len..); |
| 185 | |
| 186 | // Trim off common suffix. |
| 187 | let common_suffix_len = common_suffix(text1, text2); |
| 188 | let common_suffix = Diff::Equal( |
| 189 | text1.substring(text1.len - common_suffix_len..), |
| 190 | text2.substring(text2.len - common_suffix_len..), |
| 191 | ); |
| 192 | text1 = text1.substring(..text1.len - common_suffix_len); |
| 193 | text2 = text2.substring(..text2.len - common_suffix_len); |
| 194 | |
| 195 | // Compute the diff on the middle block. |
| 196 | let mut solution = Solution { |
| 197 | text1: whole1, |
| 198 | text2: whole2, |
| 199 | diffs: compute(text1, text2), |
| 200 | }; |
| 201 | |
| 202 | // Restore the prefix and suffix. |
| 203 | if common_prefix_len > 0 { |
| 204 | solution.diffs.insert(0, common_prefix); |
| 205 | } |
| 206 | if common_suffix_len > 0 { |
| 207 | solution.diffs.push(common_suffix); |
| 208 | } |
| 209 | |
| 210 | cleanup_merge(&mut solution); |
| 211 | |
| 212 | solution |
| 213 | } |
| 214 | |
| 215 | // Find the differences between two texts. Assumes that the texts do not have |
| 216 | // any common prefix or suffix. |
| 217 | fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
| 218 | match (text1.is_empty(), text2.is_empty()) { |
| 219 | (true, true) => return Vec::new(), |
| 220 | (true, false) => return vec![Diff::Insert(text2)], |
| 221 | (false, true) => return vec![Diff::Delete(text1)], |
| 222 | (false, false) => {} |
| 223 | } |
| 224 | |
| 225 | // Check for entire shorter text inside the longer text. |
| 226 | if text1.len > text2.len { |
| 227 | if let Some(i) = text1.find(text2) { |
| 228 | return vec![ |
| 229 | Diff::Delete(text1.substring(..i)), |
| 230 | Diff::Equal(text1.substring(i..i + text2.len), text2), |
| 231 | Diff::Delete(text1.substring(i + text2.len..)), |
| 232 | ]; |
| 233 | } |
| 234 | } else { |
| 235 | if let Some(i) = text2.find(text1) { |
| 236 | return vec![ |
| 237 | Diff::Insert(text2.substring(..i)), |
| 238 | Diff::Equal(text1, text2.substring(i..i + text1.len)), |
| 239 | Diff::Insert(text2.substring(i + text1.len..)), |
| 240 | ]; |
| 241 | } |
| 242 | } |
| 243 | |
| 244 | if text1.len == 1 || text2.len == 1 { |
| 245 | // Single character string. |
| 246 | // After the previous check, the character can't be an equality. |
| 247 | return vec![Diff::Delete(text1), Diff::Insert(text2)]; |
| 248 | } |
| 249 | |
| 250 | bisect(text1, text2) |
| 251 | } |
| 252 | |
// Find the 'middle snake' of a diff, split the problem in two and return the
// recursively constructed diff.
//
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
//
// A forward search from the start of both texts and a reverse search from the
// end of both texts are each advanced once per value of `d`. `v1` (forward)
// and `v2` (reverse) record, for each diagonal `k` at index `v_offset + k`,
// the x position reached on that diagonal; -1 marks a diagonal that has not
// been written yet. As soon as the two searches overlap, the meeting point is
// passed to `bisect_split`. If they never overlap, the result is a deletion of
// all of `text1` followed by an insertion of all of `text2`.
fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
    let max_d = (text1.len + text2.len + 1) / 2;
    // Diagonal k is stored at index `v_offset + k`.
    let v_offset = max_d;
    let v_len = 2 * max_d;
    let mut v1 = vec![-1isize; v_len];
    let mut v2 = vec![-1isize; v_len];
    v1[v_offset + 1] = 0;
    v2[v_offset + 1] = 0;
    let delta = text1.len as isize - text2.len as isize;
    // If the total number of characters is odd, then the front path will
    // collide with the reverse path.
    let front = delta % 2 != 0;
    // Offsets for start and end of k loop.
    // Prevents mapping of space beyond the grid.
    let mut k1start = 0;
    let mut k1end = 0;
    let mut k2start = 0;
    let mut k2end = 0;
    for d in 0..max_d as isize {
        // Walk the front path one step.
        let mut k1 = -d + k1start;
        while k1 <= d - k1end {
            let k1_offset = (v_offset as isize + k1) as usize;
            // Start from whichever neighbouring diagonal reaches further.
            let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
                v1[k1_offset + 1]
            } else {
                v1[k1_offset - 1] + 1
            } as usize;
            let mut y1 = (x1 as isize - k1) as usize;
            // Follow the run of matching characters along this diagonal.
            if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
                let advance = common_prefix(s1, s2);
                x1 += advance;
                y1 += advance;
            }
            v1[k1_offset] = x1 as isize;
            if x1 > text1.len {
                // Ran off the right of the graph.
                k1end += 2;
            } else if y1 > text2.len {
                // Ran off the bottom of the graph.
                k1start += 2;
            } else if front {
                let k2_offset = v_offset as isize + delta - k1;
                if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
                    // Mirror x2 onto top-left coordinate system.
                    let x2 = text1.len as isize - v2[k2_offset as usize];
                    if x1 as isize >= x2 {
                        // Overlap detected.
                        return bisect_split(text1, text2, x1, y1);
                    }
                }
            }
            k1 += 2;
        }

        // Walk the reverse path one step.
        let mut k2 = -d + k2start;
        while k2 <= d - k2end {
            let k2_offset = (v_offset as isize + k2) as usize;
            // Start from whichever neighbouring diagonal reaches further.
            let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
                v2[k2_offset + 1]
            } else {
                v2[k2_offset - 1] + 1
            } as usize;
            let mut y2 = (x2 as isize - k2) as usize;
            // Follow the run of matching characters, measured from the ends
            // of both texts.
            if x2 < text1.len && y2 < text2.len {
                let advance = common_suffix(
                    text1.substring(..text1.len - x2),
                    text2.substring(..text2.len - y2),
                );
                x2 += advance;
                y2 += advance;
            }
            v2[k2_offset] = x2 as isize;
            if x2 > text1.len {
                // Ran off the left of the graph.
                k2end += 2;
            } else if y2 > text2.len {
                // Ran off the top of the graph.
                k2start += 2;
            } else if !front {
                let k1_offset = v_offset as isize + delta - k2;
                if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
                    let x1 = v1[k1_offset as usize] as usize;
                    let y1 = v_offset + x1 - k1_offset as usize;
                    // Mirror x2 onto top-left coordinate system.
                    x2 = text1.len - x2;
                    if x1 >= x2 {
                        // Overlap detected.
                        return bisect_split(text1, text2, x1, y1);
                    }
                }
            }
            k2 += 2;
        }
    }
    // Number of diffs equals number of characters, no commonality at all.
    vec![Diff::Delete(text1), Diff::Insert(text2)]
}
| 356 | |
| 357 | // Given the location of the 'middle snake', split the diff in two parts and |
| 358 | // recurse. |
| 359 | fn bisect_split<'a, 'b>( |
| 360 | text1: Range<'a>, |
| 361 | text2: Range<'b>, |
| 362 | x: usize, |
| 363 | y: usize, |
| 364 | ) -> Vec<Diff<'a, 'b>> { |
| 365 | let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x); |
| 366 | let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y); |
| 367 | |
| 368 | // Compute both diffs serially. |
| 369 | let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs; |
| 370 | diffs.extend(iter:main(text1:text1b, text2:text2b).diffs); |
| 371 | |
| 372 | diffs |
| 373 | } |
| 374 | |
| 375 | // Determine the length of the common prefix of two strings. |
| 376 | fn common_prefix(text1: Range, text2: Range) -> usize { |
| 377 | for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() { |
| 378 | if b1 != b2 { |
| 379 | return i; |
| 380 | } |
| 381 | } |
| 382 | cmp::min(v1:text1.len, v2:text2.len) |
| 383 | } |
| 384 | |
| 385 | // Determine the length of the common suffix of two strings. |
| 386 | fn common_suffix(text1: Range, text2: Range) -> usize { |
| 387 | for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() { |
| 388 | if b1 != b2 { |
| 389 | return i; |
| 390 | } |
| 391 | } |
| 392 | cmp::min(v1:text1.len, v2:text2.len) |
| 393 | } |
| 394 | |
| 395 | // Determine if the suffix of one string is the prefix of another. |
| 396 | // |
| 397 | // Returns the number of characters common to the end of the first string and |
| 398 | // the start of the second string. |
| 399 | fn common_overlap(mut text1: Range, mut text2: Range) -> usize { |
| 400 | // Eliminate the null case. |
| 401 | if text1.is_empty() || text2.is_empty() { |
| 402 | return 0; |
| 403 | } |
| 404 | // Truncate the longer string. |
| 405 | if text1.len > text2.len { |
| 406 | text1 = text1.substring(text1.len - text2.len..); |
| 407 | } else if text1.len < text2.len { |
| 408 | text2 = text2.substring(..text1.len); |
| 409 | } |
| 410 | // Quick check for the worst case. |
| 411 | if slice(text1) == slice(text2) { |
| 412 | return text1.len; |
| 413 | } |
| 414 | |
| 415 | // Start by looking for a single character match |
| 416 | // and increase length until no match is found. |
| 417 | // Performance analysis: https://neil.fraser.name/news/2010/11/04/ |
| 418 | let mut best = 0; |
| 419 | let mut length = 1; |
| 420 | loop { |
| 421 | let pattern = text1.substring(text1.len - length..); |
| 422 | let found = match text2.find(pattern) { |
| 423 | Some(found) => found, |
| 424 | None => return best, |
| 425 | }; |
| 426 | length += found; |
| 427 | if found == 0 |
| 428 | || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length)) |
| 429 | { |
| 430 | best = length; |
| 431 | length += 1; |
| 432 | } |
| 433 | } |
| 434 | } |
| 435 | |
| 436 | fn cleanup_char_boundary(solution: &mut Solution) { |
| 437 | fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool { |
| 438 | // FIXME: use unicode-segmentation crate? |
| 439 | let _ = doc; |
| 440 | let _ = pos; |
| 441 | true |
| 442 | } |
| 443 | |
| 444 | fn boundary_down(doc: &[char], pos: usize) -> usize { |
| 445 | let mut adjust = 0; |
| 446 | while !is_segmentation_boundary(doc, pos - adjust) { |
| 447 | adjust += 1; |
| 448 | } |
| 449 | adjust |
| 450 | } |
| 451 | |
| 452 | fn boundary_up(doc: &[char], pos: usize) -> usize { |
| 453 | let mut adjust = 0; |
| 454 | while !is_segmentation_boundary(doc, pos + adjust) { |
| 455 | adjust += 1; |
| 456 | } |
| 457 | adjust |
| 458 | } |
| 459 | |
| 460 | fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) { |
| 461 | let prev_end = prev.offset + prev.len; |
| 462 | if prev_end > range.offset { |
| 463 | let delta = cmp::min(prev_end - range.offset, range.len); |
| 464 | range.offset += delta; |
| 465 | range.len -= delta; |
| 466 | } |
| 467 | } |
| 468 | |
| 469 | let mut read = 0; |
| 470 | let mut retain = 0; |
| 471 | let mut last_delete = Range::empty(); |
| 472 | let mut last_insert = Range::empty(); |
| 473 | while let Some(&(mut diff)) = solution.diffs.get(read) { |
| 474 | read += 1; |
| 475 | match &mut diff { |
| 476 | Diff::Equal(range1, range2) => { |
| 477 | let adjust = boundary_up(range1.doc, range1.offset); |
| 478 | // If the whole range is sub-character, skip it. |
| 479 | if range1.len <= adjust { |
| 480 | continue; |
| 481 | } |
| 482 | range1.offset += adjust; |
| 483 | range1.len -= adjust; |
| 484 | range2.offset += adjust; |
| 485 | range2.len -= adjust; |
| 486 | let adjust = boundary_down(range1.doc, range1.offset + range1.len); |
| 487 | range1.len -= adjust; |
| 488 | range2.len -= adjust; |
| 489 | last_delete = Range::empty(); |
| 490 | last_insert = Range::empty(); |
| 491 | } |
| 492 | Diff::Delete(range) => { |
| 493 | skip_overlap(&last_delete, range); |
| 494 | if range.len == 0 { |
| 495 | continue; |
| 496 | } |
| 497 | let adjust = boundary_down(range.doc, range.offset); |
| 498 | range.offset -= adjust; |
| 499 | range.len += adjust; |
| 500 | let adjust = boundary_up(range.doc, range.offset + range.len); |
| 501 | range.len += adjust; |
| 502 | last_delete = *range; |
| 503 | } |
| 504 | Diff::Insert(range) => { |
| 505 | skip_overlap(&last_insert, range); |
| 506 | if range.len == 0 { |
| 507 | continue; |
| 508 | } |
| 509 | let adjust = boundary_down(range.doc, range.offset); |
| 510 | range.offset -= adjust; |
| 511 | range.len += adjust; |
| 512 | let adjust = boundary_up(range.doc, range.offset + range.len); |
| 513 | range.len += adjust; |
| 514 | last_insert = *range; |
| 515 | } |
| 516 | } |
| 517 | solution.diffs[retain] = diff; |
| 518 | retain += 1; |
| 519 | } |
| 520 | |
| 521 | solution.diffs.truncate(retain); |
| 522 | } |
| 523 | |
// Reduce the number of edits by eliminating semantically trivial equalities.
//
// Phase one walks the diff list and turns an equality into a delete plus an
// insert whenever it is no longer than the edits on both sides of it, stepping
// back afterwards so that earlier equalities are re-evaluated. The list is then
// normalized (`cleanup_merge`, `cleanup_semantic_lossless`). Phase two looks at
// each adjacent delete/insert pair and extracts a shared overlap between them
// as a new equality.
fn cleanup_semantic(solution: &mut Solution) {
    let mut diffs = &mut solution.diffs;
    if diffs.is_empty() {
        return;
    }

    let mut changes = false;
    let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
    let mut last_equality = None; // Always equal to equalities.peek().text
    let mut pointer = 0;
    // Number of characters that changed prior to the equality.
    let mut len_insertions1 = 0;
    let mut len_deletions1 = 0;
    // Number of characters that changed after the equality.
    let mut len_insertions2 = 0;
    let mut len_deletions2 = 0;
    while let Some(&this_diff) = diffs.get(pointer) {
        match this_diff {
            Diff::Equal(text1, text2) => {
                // Remember this equality; the edits counted "after" the
                // previous equality become the edits "before" this one.
                equalities.push_back(pointer);
                len_insertions1 = len_insertions2;
                len_deletions1 = len_deletions2;
                len_insertions2 = 0;
                len_deletions2 = 0;
                last_equality = Some((text1, text2));
                pointer += 1;
                continue;
            }
            Diff::Delete(text) => len_deletions2 += text.len,
            Diff::Insert(text) => len_insertions2 += text.len,
        }
        // Eliminate an equality that is smaller or equal to the edits on both
        // sides of it.
        if last_equality.map_or(false, |(last_equality, _)| {
            last_equality.len <= cmp::max(len_insertions1, len_deletions1)
                && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
        }) {
            // Jump back to offending equality.
            pointer = equalities.pop_back().unwrap();

            // Replace equality with a delete.
            diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
            // Insert a corresponding insert.
            diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));

            len_insertions1 = 0; // Reset the counters.
            len_insertions2 = 0;
            len_deletions1 = 0;
            len_deletions2 = 0;
            last_equality = None;
            changes = true;

            // Throw away the previous equality (it needs to be reevaluated).
            equalities.pop_back();
            if let Some(back) = equalities.back() {
                // There is a safe equality we can fall back to.
                pointer = *back;
            } else {
                // There are no previous equalities, jump back to the start.
                pointer = 0;
                continue;
            }
        }
        pointer += 1;
    }

    // Normalize the diff.
    if changes {
        cleanup_merge(solution);
    }
    cleanup_semantic_lossless(solution);
    // Re-borrow the list: `solution` was handed to the cleanup passes above.
    diffs = &mut solution.diffs;

    // Find any overlaps between deletions and insertions.
    // e.g: <del>abcxxx</del><ins>xxxdef</ins>
    //   -> <del>abc</del>xxx<ins>def</ins>
    // e.g: <del>xxxabc</del><ins>defxxx</ins>
    //   -> <ins>def</ins>xxx<del>abc</del>
    // Only extract an overlap if it is as big as the edit ahead or behind it.
    let mut pointer = 1;
    while let Some(&this_diff) = diffs.get(pointer) {
        let prev_diff = diffs[pointer - 1];
        if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
            // overlap_len1: end of the deletion matches start of the insertion.
            // overlap_len2: end of the insertion matches start of the deletion.
            let overlap_len1 = common_overlap(deletion, insertion);
            let overlap_len2 = common_overlap(insertion, deletion);
            let overlap_min = cmp::min(deletion.len, insertion.len);
            if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
                // Overlap found. Insert an equality and trim the surrounding edits.
                diffs.insert(
                    pointer,
                    Diff::Equal(
                        deletion.substring(deletion.len - overlap_len1..deletion.len),
                        insertion.substring(..overlap_len1),
                    ),
                );
                diffs[pointer - 1] =
                    Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
                diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
            } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
                // Reverse overlap found.
                // Insert an equality and swap and trim the surrounding edits.
                diffs.insert(
                    pointer,
                    Diff::Equal(
                        deletion.substring(..overlap_len2),
                        insertion.substring(insertion.len - overlap_len2..insertion.len),
                    ),
                );
                diffs[pointer - 1] =
                    Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
                diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
            }
            // A delete/insert pair was examined: take one extra step here on
            // top of the unconditional step below.
            pointer += 1;
        }
        pointer += 1;
    }
}
| 642 | |
// Look for single edits surrounded on both sides by equalities which can be
// shifted sideways to align the edit to a word boundary.
//
// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
//
// For each such edit, every position it can slide to is scored with
// `cleanup_semantic_score`, and the edit and its two neighbouring equalities
// are rewritten to the best-scoring placement.
fn cleanup_semantic_lossless(solution: &mut Solution) {
    let diffs = &mut solution.diffs;
    let mut pointer = 1;
    // `pointer` is the candidate edit; its neighbours are at pointer - 1 and
    // pointer + 1.
    while let Some(&next_diff) = diffs.get(pointer + 1) {
        let prev_diff = diffs[pointer - 1];
        if let (
            Diff::Equal(mut prev_equal1, mut prev_equal2),
            Diff::Equal(mut next_equal1, mut next_equal2),
        ) = (prev_diff, next_diff)
        {
            // This is a single edit surrounded by equalities.
            let mut edit = diffs[pointer];

            // First, shift the edit as far left as possible.
            let common_offset = common_suffix(prev_equal1, edit.text());
            // Remembered to detect, at the end, whether anything moved.
            let original_prev_len = prev_equal1.len;
            prev_equal1.len -= common_offset;
            prev_equal2.len -= common_offset;
            edit.shift_left(common_offset);
            next_equal1.offset -= common_offset;
            next_equal1.len += common_offset;
            next_equal2.offset -= common_offset;
            next_equal2.len += common_offset;

            // Second, step character by character right, looking for the best fit.
            let mut best_prev_equal = (prev_equal1, prev_equal2);
            let mut best_edit = edit;
            let mut best_next_equal = (next_equal1, next_equal2);
            let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
                + cleanup_semantic_score(edit.text(), next_equal1);
            // The edit can slide right by one as long as its first character
            // equals the first character of the following equality.
            while !edit.text().is_empty()
                && !next_equal1.is_empty()
                && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
            {
                prev_equal1.len += 1;
                prev_equal2.len += 1;
                edit.shift_right(1);
                next_equal1.offset += 1;
                next_equal1.len -= 1;
                next_equal2.offset += 1;
                next_equal2.len -= 1;
                let score = cleanup_semantic_score(prev_equal1, edit.text())
                    + cleanup_semantic_score(edit.text(), next_equal1);
                // The >= encourages trailing rather than leading whitespace on edits.
                if score >= best_score {
                    best_score = score;
                    best_prev_equal = (prev_equal1, prev_equal2);
                    best_edit = edit;
                    best_next_equal = (next_equal1, next_equal2);
                }
            }

            if original_prev_len != best_prev_equal.0.len {
                // We have an improvement, save it back to the diff.
                // The following equality is handled first so that removing it
                // does not disturb the indices used below.
                if best_next_equal.0.is_empty() {
                    diffs.remove(pointer + 1);
                } else {
                    diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
                }
                diffs[pointer] = best_edit;
                if best_prev_equal.0.is_empty() {
                    // Removing the previous entry moves the edit one slot
                    // earlier, so step the pointer back with it.
                    diffs.remove(pointer - 1);
                    pointer -= 1;
                } else {
                    diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
                }
            }
        }
        pointer += 1;
    }
}
| 718 | |
| 719 | // Given two strings, compute a score representing whether the internal boundary |
| 720 | // falls on logical boundaries. |
| 721 | // |
| 722 | // Scores range from 6 (best) to 0 (worst). |
| 723 | fn cleanup_semantic_score(one: Range, two: Range) -> usize { |
| 724 | if one.is_empty() || two.is_empty() { |
| 725 | // Edges are the best. |
| 726 | return 6; |
| 727 | } |
| 728 | |
| 729 | // Each port of this function behaves slightly differently due to subtle |
| 730 | // differences in each language's definition of things like 'whitespace'. |
| 731 | // Since this function's purpose is largely cosmetic, the choice has been |
| 732 | // made to use each language's native features rather than force total |
| 733 | // conformity. |
| 734 | let char1 = one.chars().next_back().unwrap(); |
| 735 | let char2 = two.chars().next().unwrap(); |
| 736 | let non_alphanumeric1 = !char1.is_ascii_alphanumeric(); |
| 737 | let non_alphanumeric2 = !char2.is_ascii_alphanumeric(); |
| 738 | let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace(); |
| 739 | let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace(); |
| 740 | let line_break1 = whitespace1 && char1.is_control(); |
| 741 | let line_break2 = whitespace2 && char2.is_control(); |
| 742 | let blank_line1 = |
| 743 | line_break1 && (one.ends_with([' \n' , ' \n' ]) || one.ends_with([' \n' , ' \r' , ' \n' ])); |
| 744 | let blank_line2 = |
| 745 | line_break2 && (two.starts_with([' \n' , ' \n' ]) || two.starts_with([' \r' , ' \n' , ' \r' , ' \n' ])); |
| 746 | |
| 747 | if blank_line1 || blank_line2 { |
| 748 | // Five points for blank lines. |
| 749 | 5 |
| 750 | } else if line_break1 || line_break2 { |
| 751 | // Four points for line breaks. |
| 752 | 4 |
| 753 | } else if non_alphanumeric1 && !whitespace1 && whitespace2 { |
| 754 | // Three points for end of sentences. |
| 755 | 3 |
| 756 | } else if whitespace1 || whitespace2 { |
| 757 | // Two points for whitespace. |
| 758 | 2 |
| 759 | } else if non_alphanumeric1 || non_alphanumeric2 { |
| 760 | // One point for non-alphanumeric. |
| 761 | 1 |
| 762 | } else { |
| 763 | 0 |
| 764 | } |
| 765 | } |
| 766 | |
// Reorder and merge like edit sections. Merge equalities. Any edit section can
// move as long as it doesn't cross an equality.
//
// Each iteration of the outer loop runs two passes. The first pass collapses
// every run of deletions/insertions between equalities into at most one delete
// and one insert (factoring shared prefix/suffix out into the neighbouring
// equalities) and merges adjacent equalities. The second pass slides single
// edits over a neighbouring equality where that removes the equality. The
// loop repeats until the second pass makes no change.
fn cleanup_merge(solution: &mut Solution) {
    let diffs = &mut solution.diffs;
    while !diffs.is_empty() {
        diffs.push(Diff::Equal(
            solution.text1.substring(solution.text1.len..),
            solution.text2.substring(solution.text2.len..),
        )); // Add a dummy entry at the end.
        let mut pointer = 0;
        // Number of delete/insert records seen since the last equality, and
        // the combined range covered by each kind.
        let mut count_delete = 0;
        let mut count_insert = 0;
        let mut text_delete = Range::empty();
        let mut text_insert = Range::empty();
        while let Some(&this_diff) = diffs.get(pointer) {
            match this_diff {
                Diff::Insert(text) => {
                    count_insert += 1;
                    if text_insert.is_empty() {
                        text_insert = text;
                    } else {
                        text_insert.len += text.len;
                    }
                }
                Diff::Delete(text) => {
                    count_delete += 1;
                    if text_delete.is_empty() {
                        text_delete = text;
                    } else {
                        text_delete.len += text.len;
                    }
                }
                Diff::Equal(text, _) => {
                    // An equality closes the current run of edits.
                    let count_both = count_delete + count_insert;
                    if count_both > 1 {
                        let both_types = count_delete != 0 && count_insert != 0;
                        // Delete the offending records.
                        diffs.drain(pointer - count_both..pointer);
                        pointer -= count_both;
                        if both_types {
                            // Factor out any common prefix.
                            let common_length = common_prefix(text_insert, text_delete);
                            if common_length != 0 {
                                if pointer > 0 {
                                    // Hand the shared prefix to the equality
                                    // preceding the run.
                                    match &mut diffs[pointer - 1] {
                                        Diff::Equal(this_diff1, this_diff2) => {
                                            this_diff1.len += common_length;
                                            this_diff2.len += common_length;
                                        }
                                        _ => unreachable!(
                                            "previous diff should have been an equality"
                                        ),
                                    }
                                } else {
                                    // The run starts the list: create a new
                                    // leading equality for the shared prefix.
                                    diffs.insert(
                                        pointer,
                                        Diff::Equal(
                                            text_delete.substring(..common_length),
                                            text_insert.substring(..common_length),
                                        ),
                                    );
                                    pointer += 1;
                                }
                                text_insert = text_insert.substring(common_length..);
                                text_delete = text_delete.substring(common_length..);
                            }
                            // Factor out any common suffix.
                            let common_length = common_suffix(text_insert, text_delete);
                            if common_length != 0 {
                                // `diffs[pointer]` is the equality that closed
                                // the run; extend it leftwards over the suffix.
                                diffs[pointer].grow_left(common_length);
                                text_insert.len -= common_length;
                                text_delete.len -= common_length;
                            }
                        }
                        // Insert the merged records.
                        if !text_delete.is_empty() {
                            diffs.insert(pointer, Diff::Delete(text_delete));
                            pointer += 1;
                        }
                        if !text_insert.is_empty() {
                            diffs.insert(pointer, Diff::Insert(text_insert));
                            pointer += 1;
                        }
                    } else if pointer > 0 {
                        if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
                            diffs.get_mut(pointer - 1)
                        {
                            // Merge this equality with the previous one.
                            prev_equal1.len += text.len;
                            prev_equal2.len += text.len;
                            diffs.remove(pointer);
                            pointer -= 1;
                        }
                    }
                    // Start a fresh run after this equality.
                    count_insert = 0;
                    count_delete = 0;
                    text_delete = Range::empty();
                    text_insert = Range::empty();
                }
            }
            pointer += 1;
        }
        if diffs.last().unwrap().text().is_empty() {
            diffs.pop(); // Remove the dummy entry at the end.
        }

        // Second pass: look for single edits surrounded on both sides by equalities
        // which can be shifted sideways to eliminate an equality.
        // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
        let mut changes = false;
        let mut pointer = 1;
        // Intentionally ignore the first and last element (don't need checking).
        while let Some(&next_diff) = diffs.get(pointer + 1) {
            let prev_diff = diffs[pointer - 1];
            let this_diff = diffs[pointer];
            if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
                // This is a single edit surrounded by equalities.
                if this_diff.text().ends_with(prev_diff) {
                    // Shift the edit over the previous equality.
                    diffs[pointer].shift_left(prev_diff.len);
                    diffs[pointer + 1].grow_left(prev_diff.len);
                    diffs.remove(pointer - 1); // Delete prev_diff.
                    changes = true;
                } else if this_diff.text().starts_with(next_diff) {
                    // Shift the edit over the next equality.
                    diffs[pointer - 1].grow_right(next_diff.len);
                    diffs[pointer].shift_right(next_diff.len);
                    diffs.remove(pointer + 1); // Delete next_diff.
                    changes = true;
                }
            }
            pointer += 1;
        }
        // If shifts were made, the diff needs reordering and another shift sweep.
        if !changes {
            return;
        }
    }
}
| 906 | |
| 907 | impl Debug for Chunk<'_> { |
| 908 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
| 909 | let (name: &'static str, text: &str) = match *self { |
| 910 | Chunk::Equal(text: &str) => ("Equal" , text), |
| 911 | Chunk::Delete(text: &str) => ("Delete" , text), |
| 912 | Chunk::Insert(text: &str) => ("Insert" , text), |
| 913 | }; |
| 914 | write!(formatter, " {}( {:?})" , name, text) |
| 915 | } |
| 916 | } |
| 917 | |
| 918 | impl Debug for Diff<'_, '_> { |
| 919 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
| 920 | let (name: &'static str, range: Range<'_>) = match *self { |
| 921 | Diff::Equal(range: Range<'_>, _) => ("Equal" , range), |
| 922 | Diff::Delete(range: Range<'_>) => ("Delete" , range), |
| 923 | Diff::Insert(range: Range<'_>) => ("Insert" , range), |
| 924 | }; |
| 925 | formatter.write_str(data:name)?; |
| 926 | formatter.write_str(data:"( \"" )?; |
| 927 | for ch: char in range.chars() { |
| 928 | if ch == ' \'' { |
| 929 | // escape_debug turns this into "\'" which is unnecessary. |
| 930 | formatter.write_char(ch)?; |
| 931 | } else { |
| 932 | Display::fmt(&ch.escape_debug(), f:formatter)?; |
| 933 | } |
| 934 | } |
| 935 | formatter.write_str(data:" \")" )?; |
| 936 | Ok(()) |
| 937 | } |
| 938 | } |
| 939 | |