1 | //! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar) |
2 | //! |
3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github |
4 | //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust |
5 | //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs |
6 | //! |
7 | //! <br> |
8 | //! |
9 | //! ## Diff library with semantic cleanup, based on Google's diff-match-patch |
10 | //! |
11 | //! This library is a port of the Diff component of [Diff Match Patch] to Rust. |
12 | //! The diff implementation is based on [Myers' diff algorithm] but includes |
13 | //! some [semantic cleanups] to increase human readability by factoring out |
14 | //! commonalities which are likely to be coincidental. |
15 | //! |
16 | //! Diff Match Patch was originally built in 2006 to power Google Docs. |
17 | //! |
18 | //! # Interface |
19 | //! |
20 | //! Here is the entire API of the Rust implementation. It operates on borrowed |
21 | //! strings and the return value of the diff algorithm is a vector of chunks |
22 | //! pointing into slices of those input strings. |
23 | //! |
24 | //! ``` |
25 | //! pub enum Chunk<'a> { |
26 | //! Equal(&'a str), |
27 | //! Delete(&'a str), |
28 | //! Insert(&'a str), |
29 | //! } |
30 | //! |
31 | //! # const IGNORE: &str = stringify! { |
32 | //! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>; |
33 | //! # }; |
34 | //! ``` |
35 | //! |
36 | //! [Diff Match Patch]: https://github.com/google/diff-match-patch |
37 | //! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf |
38 | //! [semantic cleanups]: https://neil.fraser.name/writing/diff/ |
39 | |
40 | #![doc (html_root_url = "https://docs.rs/dissimilar/1.0.6" )] |
41 | #![allow ( |
42 | clippy::blocks_in_if_conditions, |
43 | clippy::bool_to_int_with_if, |
44 | clippy::cast_possible_wrap, |
45 | clippy::cast_sign_loss, |
46 | clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127 |
47 | clippy::collapsible_else_if, |
48 | clippy::comparison_chain, |
49 | clippy::match_same_arms, |
50 | clippy::module_name_repetitions, |
51 | clippy::must_use_candidate, |
52 | clippy::new_without_default, |
53 | clippy::octal_escapes, |
54 | clippy::shadow_unrelated, |
55 | clippy::similar_names, |
56 | clippy::too_many_lines, |
57 | clippy::unseparated_literal_suffix, |
58 | unused_parens, // false positive on Some(&(mut diff)) pattern |
59 | )] |
60 | |
61 | mod find; |
62 | mod range; |
63 | |
64 | #[cfg (test)] |
65 | mod tests; |
66 | |
67 | use crate::range::{slice, Range}; |
68 | use std::cmp; |
69 | use std::collections::VecDeque; |
70 | use std::fmt::{self, Debug, Display, Write}; |
71 | |
72 | #[derive (Copy, Clone, PartialEq, Eq)] |
73 | pub enum Chunk<'a> { |
74 | Equal(&'a str), |
75 | Delete(&'a str), |
76 | Insert(&'a str), |
77 | } |
78 | |
79 | #[derive (Copy, Clone)] |
80 | enum Diff<'a, 'b> { |
81 | Equal(Range<'a>, Range<'b>), |
82 | Delete(Range<'a>), |
83 | Insert(Range<'b>), |
84 | } |
85 | |
86 | impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> { |
87 | fn text(&self) -> Range<'tmp> { |
88 | match *self { |
89 | Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range, |
90 | } |
91 | } |
92 | |
93 | fn grow_left(&mut self, increment: usize) { |
94 | self.for_each(|range| { |
95 | range.offset -= increment; |
96 | range.len += increment; |
97 | }); |
98 | } |
99 | |
100 | fn grow_right(&mut self, increment: usize) { |
101 | self.for_each(|range| range.len += increment); |
102 | } |
103 | |
104 | fn shift_left(&mut self, increment: usize) { |
105 | self.for_each(|range| range.offset -= increment); |
106 | } |
107 | |
108 | fn shift_right(&mut self, increment: usize) { |
109 | self.for_each(|range| range.offset += increment); |
110 | } |
111 | |
112 | fn for_each(&mut self, f: impl Fn(&mut Range)) { |
113 | match self { |
114 | Diff::Equal(range1, range2) => { |
115 | f(range1); |
116 | f(range2); |
117 | } |
118 | Diff::Delete(range) => f(range), |
119 | Diff::Insert(range) => f(range), |
120 | } |
121 | } |
122 | } |
123 | |
124 | pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> { |
125 | let chars1: Vec<char> = text1.chars().collect(); |
126 | let chars2: Vec<char> = text2.chars().collect(); |
127 | let range1 = Range::new(&chars1, ..); |
128 | let range2 = Range::new(&chars2, ..); |
129 | |
130 | let mut solution = main(range1, range2); |
131 | cleanup_char_boundary(&mut solution); |
132 | cleanup_semantic(&mut solution); |
133 | cleanup_merge(&mut solution); |
134 | |
135 | let mut chunks = Vec::new(); |
136 | let mut pos1 = 0; |
137 | let mut pos2 = 0; |
138 | for diff in solution.diffs { |
139 | chunks.push(match diff { |
140 | Diff::Equal(range, _) => { |
141 | let len = range.len_bytes(); |
142 | let chunk = Chunk::Equal(&text1[pos1..pos1 + len]); |
143 | pos1 += len; |
144 | pos2 += len; |
145 | chunk |
146 | } |
147 | Diff::Delete(range) => { |
148 | let len = range.len_bytes(); |
149 | let chunk = Chunk::Delete(&text1[pos1..pos1 + len]); |
150 | pos1 += len; |
151 | chunk |
152 | } |
153 | Diff::Insert(range) => { |
154 | let len = range.len_bytes(); |
155 | let chunk = Chunk::Insert(&text2[pos2..pos2 + len]); |
156 | pos2 += len; |
157 | chunk |
158 | } |
159 | }); |
160 | } |
161 | chunks |
162 | } |
163 | |
164 | struct Solution<'a, 'b> { |
165 | text1: Range<'a>, |
166 | text2: Range<'b>, |
167 | diffs: Vec<Diff<'a, 'b>>, |
168 | } |
169 | |
170 | fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> { |
171 | let whole1 = text1; |
172 | let whole2 = text2; |
173 | |
174 | // Trim off common prefix. |
175 | let common_prefix_len = common_prefix(text1, text2); |
176 | let common_prefix = Diff::Equal( |
177 | text1.substring(..common_prefix_len), |
178 | text2.substring(..common_prefix_len), |
179 | ); |
180 | text1 = text1.substring(common_prefix_len..); |
181 | text2 = text2.substring(common_prefix_len..); |
182 | |
183 | // Trim off common suffix. |
184 | let common_suffix_len = common_suffix(text1, text2); |
185 | let common_suffix = Diff::Equal( |
186 | text1.substring(text1.len - common_suffix_len..), |
187 | text2.substring(text2.len - common_suffix_len..), |
188 | ); |
189 | text1 = text1.substring(..text1.len - common_suffix_len); |
190 | text2 = text2.substring(..text2.len - common_suffix_len); |
191 | |
192 | // Compute the diff on the middle block. |
193 | let mut solution = Solution { |
194 | text1: whole1, |
195 | text2: whole2, |
196 | diffs: compute(text1, text2), |
197 | }; |
198 | |
199 | // Restore the prefix and suffix. |
200 | if common_prefix_len > 0 { |
201 | solution.diffs.insert(0, common_prefix); |
202 | } |
203 | if common_suffix_len > 0 { |
204 | solution.diffs.push(common_suffix); |
205 | } |
206 | |
207 | cleanup_merge(&mut solution); |
208 | |
209 | solution |
210 | } |
211 | |
212 | // Find the differences between two texts. Assumes that the texts do not have |
213 | // any common prefix or suffix. |
214 | fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
215 | match (text1.is_empty(), text2.is_empty()) { |
216 | (true, true) => return Vec::new(), |
217 | (true, false) => return vec![Diff::Insert(text2)], |
218 | (false, true) => return vec![Diff::Delete(text1)], |
219 | (false, false) => {} |
220 | } |
221 | |
222 | // Check for entire shorter text inside the longer text. |
223 | if text1.len > text2.len { |
224 | if let Some(i) = text1.find(text2) { |
225 | return vec![ |
226 | Diff::Delete(text1.substring(..i)), |
227 | Diff::Equal(text1.substring(i..i + text2.len), text2), |
228 | Diff::Delete(text1.substring(i + text2.len..)), |
229 | ]; |
230 | } |
231 | } else { |
232 | if let Some(i) = text2.find(text1) { |
233 | return vec![ |
234 | Diff::Insert(text2.substring(..i)), |
235 | Diff::Equal(text1, text2.substring(i..i + text1.len)), |
236 | Diff::Insert(text2.substring(i + text1.len..)), |
237 | ]; |
238 | } |
239 | } |
240 | |
241 | if text1.len == 1 || text2.len == 1 { |
242 | // Single character string. |
243 | // After the previous check, the character can't be an equality. |
244 | return vec![Diff::Delete(text1), Diff::Insert(text2)]; |
245 | } |
246 | |
247 | bisect(text1, text2) |
248 | } |
249 | |
250 | // Find the 'middle snake' of a diff, split the problem in two and return the |
251 | // recursively constructed diff. |
252 | // |
253 | // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. |
254 | fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
255 | let max_d = (text1.len + text2.len + 1) / 2; |
256 | let v_offset = max_d; |
257 | let v_len = 2 * max_d; |
258 | let mut v1 = vec![-1isize; v_len]; |
259 | let mut v2 = vec![-1isize; v_len]; |
260 | v1[v_offset + 1] = 0; |
261 | v2[v_offset + 1] = 0; |
262 | let delta = text1.len as isize - text2.len as isize; |
263 | // If the total number of characters is odd, then the front path will |
264 | // collide with the reverse path. |
265 | let front = delta % 2 != 0; |
266 | // Offsets for start and end of k loop. |
267 | // Prevents mapping of space beyond the grid. |
268 | let mut k1start = 0; |
269 | let mut k1end = 0; |
270 | let mut k2start = 0; |
271 | let mut k2end = 0; |
272 | for d in 0..max_d as isize { |
273 | // Walk the front path one step. |
274 | let mut k1 = -d + k1start; |
275 | while k1 <= d - k1end { |
276 | let k1_offset = (v_offset as isize + k1) as usize; |
277 | let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) { |
278 | v1[k1_offset + 1] |
279 | } else { |
280 | v1[k1_offset - 1] + 1 |
281 | } as usize; |
282 | let mut y1 = (x1 as isize - k1) as usize; |
283 | if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) { |
284 | let advance = common_prefix(s1, s2); |
285 | x1 += advance; |
286 | y1 += advance; |
287 | } |
288 | v1[k1_offset] = x1 as isize; |
289 | if x1 > text1.len { |
290 | // Ran off the right of the graph. |
291 | k1end += 2; |
292 | } else if y1 > text2.len { |
293 | // Ran off the bottom of the graph. |
294 | k1start += 2; |
295 | } else if front { |
296 | let k2_offset = v_offset as isize + delta - k1; |
297 | if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 { |
298 | // Mirror x2 onto top-left coordinate system. |
299 | let x2 = text1.len as isize - v2[k2_offset as usize]; |
300 | if x1 as isize >= x2 { |
301 | // Overlap detected. |
302 | return bisect_split(text1, text2, x1, y1); |
303 | } |
304 | } |
305 | } |
306 | k1 += 2; |
307 | } |
308 | |
309 | // Walk the reverse path one step. |
310 | let mut k2 = -d + k2start; |
311 | while k2 <= d - k2end { |
312 | let k2_offset = (v_offset as isize + k2) as usize; |
313 | let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) { |
314 | v2[k2_offset + 1] |
315 | } else { |
316 | v2[k2_offset - 1] + 1 |
317 | } as usize; |
318 | let mut y2 = (x2 as isize - k2) as usize; |
319 | if x2 < text1.len && y2 < text2.len { |
320 | let advance = common_suffix( |
321 | text1.substring(..text1.len - x2), |
322 | text2.substring(..text2.len - y2), |
323 | ); |
324 | x2 += advance; |
325 | y2 += advance; |
326 | } |
327 | v2[k2_offset] = x2 as isize; |
328 | if x2 > text1.len { |
329 | // Ran off the left of the graph. |
330 | k2end += 2; |
331 | } else if y2 > text2.len { |
332 | // Ran off the top of the graph. |
333 | k2start += 2; |
334 | } else if !front { |
335 | let k1_offset = v_offset as isize + delta - k2; |
336 | if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 { |
337 | let x1 = v1[k1_offset as usize] as usize; |
338 | let y1 = v_offset + x1 - k1_offset as usize; |
339 | // Mirror x2 onto top-left coordinate system. |
340 | x2 = text1.len - x2; |
341 | if x1 >= x2 { |
342 | // Overlap detected. |
343 | return bisect_split(text1, text2, x1, y1); |
344 | } |
345 | } |
346 | } |
347 | k2 += 2; |
348 | } |
349 | } |
350 | // Number of diffs equals number of characters, no commonality at all. |
351 | vec![Diff::Delete(text1), Diff::Insert(text2)] |
352 | } |
353 | |
354 | // Given the location of the 'middle snake', split the diff in two parts and |
355 | // recurse. |
356 | fn bisect_split<'a, 'b>( |
357 | text1: Range<'a>, |
358 | text2: Range<'b>, |
359 | x: usize, |
360 | y: usize, |
361 | ) -> Vec<Diff<'a, 'b>> { |
362 | let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x); |
363 | let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y); |
364 | |
365 | // Compute both diffs serially. |
366 | let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs; |
367 | diffs.extend(iter:main(text1:text1b, text2:text2b).diffs); |
368 | |
369 | diffs |
370 | } |
371 | |
372 | // Determine the length of the common prefix of two strings. |
373 | fn common_prefix(text1: Range, text2: Range) -> usize { |
374 | for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() { |
375 | if b1 != b2 { |
376 | return i; |
377 | } |
378 | } |
379 | cmp::min(v1:text1.len, v2:text2.len) |
380 | } |
381 | |
382 | // Determine the length of the common suffix of two strings. |
383 | fn common_suffix(text1: Range, text2: Range) -> usize { |
384 | for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() { |
385 | if b1 != b2 { |
386 | return i; |
387 | } |
388 | } |
389 | cmp::min(v1:text1.len, v2:text2.len) |
390 | } |
391 | |
392 | // Determine if the suffix of one string is the prefix of another. |
393 | // |
394 | // Returns the number of characters common to the end of the first string and |
395 | // the start of the second string. |
396 | fn common_overlap(mut text1: Range, mut text2: Range) -> usize { |
397 | // Eliminate the null case. |
398 | if text1.is_empty() || text2.is_empty() { |
399 | return 0; |
400 | } |
401 | // Truncate the longer string. |
402 | if text1.len > text2.len { |
403 | text1 = text1.substring(text1.len - text2.len..); |
404 | } else if text1.len < text2.len { |
405 | text2 = text2.substring(..text1.len); |
406 | } |
407 | // Quick check for the worst case. |
408 | if slice(text1) == slice(text2) { |
409 | return text1.len; |
410 | } |
411 | |
412 | // Start by looking for a single character match |
413 | // and increase length until no match is found. |
414 | // Performance analysis: https://neil.fraser.name/news/2010/11/04/ |
415 | let mut best = 0; |
416 | let mut length = 1; |
417 | loop { |
418 | let pattern = text1.substring(text1.len - length..); |
419 | let found = match text2.find(pattern) { |
420 | Some(found) => found, |
421 | None => return best, |
422 | }; |
423 | length += found; |
424 | if found == 0 |
425 | || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length)) |
426 | { |
427 | best = length; |
428 | length += 1; |
429 | } |
430 | } |
431 | } |
432 | |
433 | fn cleanup_char_boundary(solution: &mut Solution) { |
434 | fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool { |
435 | // FIXME: use unicode-segmentation crate? |
436 | let _ = doc; |
437 | let _ = pos; |
438 | true |
439 | } |
440 | |
441 | fn boundary_down(doc: &[char], pos: usize) -> usize { |
442 | let mut adjust = 0; |
443 | while !is_segmentation_boundary(doc, pos - adjust) { |
444 | adjust += 1; |
445 | } |
446 | adjust |
447 | } |
448 | |
449 | fn boundary_up(doc: &[char], pos: usize) -> usize { |
450 | let mut adjust = 0; |
451 | while !is_segmentation_boundary(doc, pos + adjust) { |
452 | adjust += 1; |
453 | } |
454 | adjust |
455 | } |
456 | |
457 | fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) { |
458 | let prev_end = prev.offset + prev.len; |
459 | if prev_end > range.offset { |
460 | let delta = cmp::min(prev_end - range.offset, range.len); |
461 | range.offset += delta; |
462 | range.len -= delta; |
463 | } |
464 | } |
465 | |
466 | let mut read = 0; |
467 | let mut retain = 0; |
468 | let mut last_delete = Range::empty(); |
469 | let mut last_insert = Range::empty(); |
470 | while let Some(&(mut diff)) = solution.diffs.get(read) { |
471 | read += 1; |
472 | match &mut diff { |
473 | Diff::Equal(range1, range2) => { |
474 | let adjust = boundary_up(range1.doc, range1.offset); |
475 | // If the whole range is sub-character, skip it. |
476 | if range1.len <= adjust { |
477 | continue; |
478 | } |
479 | range1.offset += adjust; |
480 | range1.len -= adjust; |
481 | range2.offset += adjust; |
482 | range2.len -= adjust; |
483 | let adjust = boundary_down(range1.doc, range1.offset + range1.len); |
484 | range1.len -= adjust; |
485 | range2.len -= adjust; |
486 | last_delete = Range::empty(); |
487 | last_insert = Range::empty(); |
488 | } |
489 | Diff::Delete(range) => { |
490 | skip_overlap(&last_delete, range); |
491 | if range.len == 0 { |
492 | continue; |
493 | } |
494 | let adjust = boundary_down(range.doc, range.offset); |
495 | range.offset -= adjust; |
496 | range.len += adjust; |
497 | let adjust = boundary_up(range.doc, range.offset + range.len); |
498 | range.len += adjust; |
499 | last_delete = *range; |
500 | } |
501 | Diff::Insert(range) => { |
502 | skip_overlap(&last_insert, range); |
503 | if range.len == 0 { |
504 | continue; |
505 | } |
506 | let adjust = boundary_down(range.doc, range.offset); |
507 | range.offset -= adjust; |
508 | range.len += adjust; |
509 | let adjust = boundary_up(range.doc, range.offset + range.len); |
510 | range.len += adjust; |
511 | last_insert = *range; |
512 | } |
513 | } |
514 | solution.diffs[retain] = diff; |
515 | retain += 1; |
516 | } |
517 | |
518 | solution.diffs.truncate(retain); |
519 | } |
520 | |
521 | // Reduce the number of edits by eliminating semantically trivial equalities. |
522 | fn cleanup_semantic(solution: &mut Solution) { |
523 | let mut diffs = &mut solution.diffs; |
524 | if diffs.is_empty() { |
525 | return; |
526 | } |
527 | |
528 | let mut changes = false; |
529 | let mut equalities = VecDeque::new(); // Double-ended queue of equalities. |
530 | let mut last_equality = None; // Always equal to equalities.peek().text |
531 | let mut pointer = 0; |
532 | // Number of characters that changed prior to the equality. |
533 | let mut len_insertions1 = 0; |
534 | let mut len_deletions1 = 0; |
535 | // Number of characters that changed after the equality. |
536 | let mut len_insertions2 = 0; |
537 | let mut len_deletions2 = 0; |
538 | while let Some(&this_diff) = diffs.get(pointer) { |
539 | match this_diff { |
540 | Diff::Equal(text1, text2) => { |
541 | equalities.push_back(pointer); |
542 | len_insertions1 = len_insertions2; |
543 | len_deletions1 = len_deletions2; |
544 | len_insertions2 = 0; |
545 | len_deletions2 = 0; |
546 | last_equality = Some((text1, text2)); |
547 | pointer += 1; |
548 | continue; |
549 | } |
550 | Diff::Delete(text) => len_deletions2 += text.len, |
551 | Diff::Insert(text) => len_insertions2 += text.len, |
552 | } |
553 | // Eliminate an equality that is smaller or equal to the edits on both |
554 | // sides of it. |
555 | if last_equality.map_or(false, |(last_equality, _)| { |
556 | last_equality.len <= cmp::max(len_insertions1, len_deletions1) |
557 | && last_equality.len <= cmp::max(len_insertions2, len_deletions2) |
558 | }) { |
559 | // Jump back to offending equality. |
560 | pointer = equalities.pop_back().unwrap(); |
561 | |
562 | // Replace equality with a delete. |
563 | diffs[pointer] = Diff::Delete(last_equality.unwrap().0); |
564 | // Insert a corresponding insert. |
565 | diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1)); |
566 | |
567 | len_insertions1 = 0; // Reset the counters. |
568 | len_insertions2 = 0; |
569 | len_deletions1 = 0; |
570 | len_deletions2 = 0; |
571 | last_equality = None; |
572 | changes = true; |
573 | |
574 | // Throw away the previous equality (it needs to be reevaluated). |
575 | equalities.pop_back(); |
576 | if let Some(back) = equalities.back() { |
577 | // There is a safe equality we can fall back to. |
578 | pointer = *back; |
579 | } else { |
580 | // There are no previous equalities, jump back to the start. |
581 | pointer = 0; |
582 | continue; |
583 | } |
584 | } |
585 | pointer += 1; |
586 | } |
587 | |
588 | // Normalize the diff. |
589 | if changes { |
590 | cleanup_merge(solution); |
591 | } |
592 | cleanup_semantic_lossless(solution); |
593 | diffs = &mut solution.diffs; |
594 | |
595 | // Find any overlaps between deletions and insertions. |
596 | // e.g: <del>abcxxx</del><ins>xxxdef</ins> |
597 | // -> <del>abc</del>xxx<ins>def</ins> |
598 | // e.g: <del>xxxabc</del><ins>defxxx</ins> |
599 | // -> <ins>def</ins>xxx<del>abc</del> |
600 | // Only extract an overlap if it is as big as the edit ahead or behind it. |
601 | let mut pointer = 1; |
602 | while let Some(&this_diff) = diffs.get(pointer) { |
603 | let prev_diff = diffs[pointer - 1]; |
604 | if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) { |
605 | let overlap_len1 = common_overlap(deletion, insertion); |
606 | let overlap_len2 = common_overlap(insertion, deletion); |
607 | let overlap_min = cmp::min(deletion.len, insertion.len); |
608 | if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min { |
609 | // Overlap found. Insert an equality and trim the surrounding edits. |
610 | diffs.insert( |
611 | pointer, |
612 | Diff::Equal( |
613 | deletion.substring(deletion.len - overlap_len1..deletion.len), |
614 | insertion.substring(..overlap_len1), |
615 | ), |
616 | ); |
617 | diffs[pointer - 1] = |
618 | Diff::Delete(deletion.substring(..deletion.len - overlap_len1)); |
619 | diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..)); |
620 | } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min { |
621 | // Reverse overlap found. |
622 | // Insert an equality and swap and trim the surrounding edits. |
623 | diffs.insert( |
624 | pointer, |
625 | Diff::Equal( |
626 | deletion.substring(..overlap_len2), |
627 | insertion.substring(insertion.len - overlap_len2..insertion.len), |
628 | ), |
629 | ); |
630 | diffs[pointer - 1] = |
631 | Diff::Insert(insertion.substring(..insertion.len - overlap_len2)); |
632 | diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..)); |
633 | } |
634 | pointer += 1; |
635 | } |
636 | pointer += 1; |
637 | } |
638 | } |
639 | |
640 | // Look for single edits surrounded on both sides by equalities which can be |
641 | // shifted sideways to align the edit to a word boundary. |
642 | // |
643 | // e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. |
644 | fn cleanup_semantic_lossless(solution: &mut Solution) { |
645 | let diffs = &mut solution.diffs; |
646 | let mut pointer = 1; |
647 | while let Some(&next_diff) = diffs.get(pointer + 1) { |
648 | let prev_diff = diffs[pointer - 1]; |
649 | if let ( |
650 | Diff::Equal(mut prev_equal1, mut prev_equal2), |
651 | Diff::Equal(mut next_equal1, mut next_equal2), |
652 | ) = (prev_diff, next_diff) |
653 | { |
654 | // This is a single edit surrounded by equalities. |
655 | let mut edit = diffs[pointer]; |
656 | |
657 | // First, shift the edit as far left as possible. |
658 | let common_offset = common_suffix(prev_equal1, edit.text()); |
659 | let original_prev_len = prev_equal1.len; |
660 | prev_equal1.len -= common_offset; |
661 | prev_equal2.len -= common_offset; |
662 | edit.shift_left(common_offset); |
663 | next_equal1.offset -= common_offset; |
664 | next_equal1.len += common_offset; |
665 | next_equal2.offset -= common_offset; |
666 | next_equal2.len += common_offset; |
667 | |
668 | // Second, step character by character right, looking for the best fit. |
669 | let mut best_prev_equal = (prev_equal1, prev_equal2); |
670 | let mut best_edit = edit; |
671 | let mut best_next_equal = (next_equal1, next_equal2); |
672 | let mut best_score = cleanup_semantic_score(prev_equal1, edit.text()) |
673 | + cleanup_semantic_score(edit.text(), next_equal1); |
674 | while !edit.text().is_empty() |
675 | && !next_equal1.is_empty() |
676 | && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap() |
677 | { |
678 | prev_equal1.len += 1; |
679 | prev_equal2.len += 1; |
680 | edit.shift_right(1); |
681 | next_equal1.offset += 1; |
682 | next_equal1.len -= 1; |
683 | next_equal2.offset += 1; |
684 | next_equal2.len -= 1; |
685 | let score = cleanup_semantic_score(prev_equal1, edit.text()) |
686 | + cleanup_semantic_score(edit.text(), next_equal1); |
687 | // The >= encourages trailing rather than leading whitespace on edits. |
688 | if score >= best_score { |
689 | best_score = score; |
690 | best_prev_equal = (prev_equal1, prev_equal2); |
691 | best_edit = edit; |
692 | best_next_equal = (next_equal1, next_equal2); |
693 | } |
694 | } |
695 | |
696 | if original_prev_len != best_prev_equal.0.len { |
697 | // We have an improvement, save it back to the diff. |
698 | if best_next_equal.0.is_empty() { |
699 | diffs.remove(pointer + 1); |
700 | } else { |
701 | diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1); |
702 | } |
703 | diffs[pointer] = best_edit; |
704 | if best_prev_equal.0.is_empty() { |
705 | diffs.remove(pointer - 1); |
706 | pointer -= 1; |
707 | } else { |
708 | diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1); |
709 | } |
710 | } |
711 | } |
712 | pointer += 1; |
713 | } |
714 | } |
715 | |
716 | // Given two strings, compute a score representing whether the internal boundary |
717 | // falls on logical boundaries. |
718 | // |
719 | // Scores range from 6 (best) to 0 (worst). |
720 | fn cleanup_semantic_score(one: Range, two: Range) -> usize { |
721 | if one.is_empty() || two.is_empty() { |
722 | // Edges are the best. |
723 | return 6; |
724 | } |
725 | |
726 | // Each port of this function behaves slightly differently due to subtle |
727 | // differences in each language's definition of things like 'whitespace'. |
728 | // Since this function's purpose is largely cosmetic, the choice has been |
729 | // made to use each language's native features rather than force total |
730 | // conformity. |
731 | let char1 = one.chars().next_back().unwrap(); |
732 | let char2 = two.chars().next().unwrap(); |
733 | let non_alphanumeric1 = !char1.is_ascii_alphanumeric(); |
734 | let non_alphanumeric2 = !char2.is_ascii_alphanumeric(); |
735 | let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace(); |
736 | let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace(); |
737 | let line_break1 = whitespace1 && char1.is_control(); |
738 | let line_break2 = whitespace2 && char2.is_control(); |
739 | let blank_line1 = |
740 | line_break1 && (one.ends_with([' \n' , ' \n' ]) || one.ends_with([' \n' , ' \r' , ' \n' ])); |
741 | let blank_line2 = |
742 | line_break2 && (two.starts_with([' \n' , ' \n' ]) || two.starts_with([' \r' , ' \n' , ' \r' , ' \n' ])); |
743 | |
744 | if blank_line1 || blank_line2 { |
745 | // Five points for blank lines. |
746 | 5 |
747 | } else if line_break1 || line_break2 { |
748 | // Four points for line breaks. |
749 | 4 |
750 | } else if non_alphanumeric1 && !whitespace1 && whitespace2 { |
751 | // Three points for end of sentences. |
752 | 3 |
753 | } else if whitespace1 || whitespace2 { |
754 | // Two points for whitespace. |
755 | 2 |
756 | } else if non_alphanumeric1 || non_alphanumeric2 { |
757 | // One point for non-alphanumeric. |
758 | 1 |
759 | } else { |
760 | 0 |
761 | } |
762 | } |
763 | |
764 | // Reorder and merge like edit sections. Merge equalities. Any edit section can |
765 | // move as long as it doesn't cross an equality. |
766 | fn cleanup_merge(solution: &mut Solution) { |
767 | let diffs = &mut solution.diffs; |
768 | while !diffs.is_empty() { |
769 | diffs.push(Diff::Equal( |
770 | solution.text1.substring(solution.text1.len..), |
771 | solution.text2.substring(solution.text2.len..), |
772 | )); // Add a dummy entry at the end. |
773 | let mut pointer = 0; |
774 | let mut count_delete = 0; |
775 | let mut count_insert = 0; |
776 | let mut text_delete = Range::empty(); |
777 | let mut text_insert = Range::empty(); |
778 | while let Some(&this_diff) = diffs.get(pointer) { |
779 | match this_diff { |
780 | Diff::Insert(text) => { |
781 | count_insert += 1; |
782 | if text_insert.is_empty() { |
783 | text_insert = text; |
784 | } else { |
785 | text_insert.len += text.len; |
786 | } |
787 | } |
788 | Diff::Delete(text) => { |
789 | count_delete += 1; |
790 | if text_delete.is_empty() { |
791 | text_delete = text; |
792 | } else { |
793 | text_delete.len += text.len; |
794 | } |
795 | } |
796 | Diff::Equal(text, _) => { |
797 | let count_both = count_delete + count_insert; |
798 | if count_both > 1 { |
799 | let both_types = count_delete != 0 && count_insert != 0; |
800 | // Delete the offending records. |
801 | diffs.splice(pointer - count_both..pointer, None); |
802 | pointer -= count_both; |
803 | if both_types { |
804 | // Factor out any common prefix. |
805 | let common_length = common_prefix(text_insert, text_delete); |
806 | if common_length != 0 { |
807 | if pointer > 0 { |
808 | match &mut diffs[pointer - 1] { |
809 | Diff::Equal(this_diff1, this_diff2) => { |
810 | this_diff1.len += common_length; |
811 | this_diff2.len += common_length; |
812 | } |
813 | _ => unreachable!( |
814 | "previous diff should have been an equality" |
815 | ), |
816 | } |
817 | } else { |
818 | diffs.insert( |
819 | pointer, |
820 | Diff::Equal( |
821 | text_delete.substring(..common_length), |
822 | text_insert.substring(..common_length), |
823 | ), |
824 | ); |
825 | pointer += 1; |
826 | } |
827 | text_insert = text_insert.substring(common_length..); |
828 | text_delete = text_delete.substring(common_length..); |
829 | } |
830 | // Factor out any common suffix. |
831 | let common_length = common_suffix(text_insert, text_delete); |
832 | if common_length != 0 { |
833 | diffs[pointer].grow_left(common_length); |
834 | text_insert.len -= common_length; |
835 | text_delete.len -= common_length; |
836 | } |
837 | } |
838 | // Insert the merged records. |
839 | if !text_delete.is_empty() { |
840 | diffs.insert(pointer, Diff::Delete(text_delete)); |
841 | pointer += 1; |
842 | } |
843 | if !text_insert.is_empty() { |
844 | diffs.insert(pointer, Diff::Insert(text_insert)); |
845 | pointer += 1; |
846 | } |
847 | } else if pointer > 0 { |
848 | if let Some(Diff::Equal(prev_equal1, prev_equal2)) = |
849 | diffs.get_mut(pointer - 1) |
850 | { |
851 | // Merge this equality with the previous one. |
852 | prev_equal1.len += text.len; |
853 | prev_equal2.len += text.len; |
854 | diffs.remove(pointer); |
855 | pointer -= 1; |
856 | } |
857 | } |
858 | count_insert = 0; |
859 | count_delete = 0; |
860 | text_delete = Range::empty(); |
861 | text_insert = Range::empty(); |
862 | } |
863 | } |
864 | pointer += 1; |
865 | } |
866 | if diffs.last().unwrap().text().is_empty() { |
867 | diffs.pop(); // Remove the dummy entry at the end. |
868 | } |
869 | |
870 | // Second pass: look for single edits surrounded on both sides by equalities |
871 | // which can be shifted sideways to eliminate an equality. |
872 | // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC |
873 | let mut changes = false; |
874 | let mut pointer = 1; |
875 | // Intentionally ignore the first and last element (don't need checking). |
876 | while let Some(&next_diff) = diffs.get(pointer + 1) { |
877 | let prev_diff = diffs[pointer - 1]; |
878 | let this_diff = diffs[pointer]; |
879 | if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) { |
880 | // This is a single edit surrounded by equalities. |
881 | if this_diff.text().ends_with(prev_diff) { |
882 | // Shift the edit over the previous equality. |
883 | diffs[pointer].shift_left(prev_diff.len); |
884 | diffs[pointer + 1].grow_left(prev_diff.len); |
885 | diffs.remove(pointer - 1); // Delete prev_diff. |
886 | changes = true; |
887 | } else if this_diff.text().starts_with(next_diff) { |
888 | // Shift the edit over the next equality. |
889 | diffs[pointer - 1].grow_right(next_diff.len); |
890 | diffs[pointer].shift_right(next_diff.len); |
891 | diffs.remove(pointer + 1); // Delete next_diff. |
892 | changes = true; |
893 | } |
894 | } |
895 | pointer += 1; |
896 | } |
897 | // If shifts were made, the diff needs reordering and another shift sweep. |
898 | if !changes { |
899 | return; |
900 | } |
901 | } |
902 | } |
903 | |
904 | impl Debug for Chunk<'_> { |
905 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
906 | let (name: &str, text: &str) = match *self { |
907 | Chunk::Equal(text: &str) => ("Equal" , text), |
908 | Chunk::Delete(text: &str) => ("Delete" , text), |
909 | Chunk::Insert(text: &str) => ("Insert" , text), |
910 | }; |
911 | write!(formatter, " {}( {:?})" , name, text) |
912 | } |
913 | } |
914 | |
915 | impl Debug for Diff<'_, '_> { |
916 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
917 | let (name: &str, range: Range<'_>) = match *self { |
918 | Diff::Equal(range: Range<'_>, _) => ("Equal" , range), |
919 | Diff::Delete(range: Range<'_>) => ("Delete" , range), |
920 | Diff::Insert(range: Range<'_>) => ("Insert" , range), |
921 | }; |
922 | formatter.write_str(data:name)?; |
923 | formatter.write_str(data:"( \"" )?; |
924 | for ch: char in range.chars() { |
925 | if ch == ' \'' { |
926 | // escape_debug turns this into "\'" which is unnecessary. |
927 | formatter.write_char(ch)?; |
928 | } else { |
929 | Display::fmt(&ch.escape_debug(), f:formatter)?; |
930 | } |
931 | } |
932 | formatter.write_str(data:" \")" )?; |
933 | Ok(()) |
934 | } |
935 | } |
936 | |