1 | //! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar) |
2 | //! |
3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github |
4 | //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust |
5 | //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs |
6 | //! |
7 | //! <br> |
8 | //! |
9 | //! ## Diff library with semantic cleanup, based on Google's diff-match-patch |
10 | //! |
11 | //! This library is a port of the Diff component of [Diff Match Patch] to Rust. |
12 | //! The diff implementation is based on [Myers' diff algorithm] but includes |
13 | //! some [semantic cleanups] to increase human readability by factoring out |
14 | //! commonalities which are likely to be coincidental. |
15 | //! |
16 | //! Diff Match Patch was originally built in 2006 to power Google Docs. |
17 | //! |
18 | //! # Interface |
19 | //! |
20 | //! Here is the entire API of the Rust implementation. It operates on borrowed |
21 | //! strings and the return value of the diff algorithm is a vector of chunks |
22 | //! pointing into slices of those input strings. |
23 | //! |
24 | //! ``` |
25 | //! pub enum Chunk<'a> { |
26 | //! Equal(&'a str), |
27 | //! Delete(&'a str), |
28 | //! Insert(&'a str), |
29 | //! } |
30 | //! |
31 | //! # const IGNORE: &str = stringify! { |
32 | //! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>; |
33 | //! # }; |
34 | //! ``` |
35 | //! |
36 | //! [Diff Match Patch]: https://github.com/google/diff-match-patch |
37 | //! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf |
38 | //! [semantic cleanups]: https://neil.fraser.name/writing/diff/ |
39 | |
40 | #![doc (html_root_url = "https://docs.rs/dissimilar/1.0.7" )] |
41 | #![allow ( |
42 | clippy::blocks_in_if_conditions, |
43 | clippy::bool_to_int_with_if, |
44 | clippy::cast_possible_wrap, |
45 | clippy::cast_sign_loss, |
46 | clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127 |
47 | clippy::collapsible_else_if, |
48 | clippy::comparison_chain, |
49 | clippy::items_after_test_module, // https://github.com/rust-lang/rust-clippy/issues/10713 |
50 | clippy::let_underscore_untyped, |
51 | clippy::match_same_arms, |
52 | clippy::module_name_repetitions, |
53 | clippy::must_use_candidate, |
54 | clippy::new_without_default, |
55 | clippy::octal_escapes, |
56 | clippy::shadow_unrelated, |
57 | clippy::similar_names, |
58 | clippy::too_many_lines, |
59 | clippy::unseparated_literal_suffix, |
60 | unused_parens, // false positive on Some(&(mut diff)) pattern |
61 | )] |
62 | |
63 | mod find; |
64 | mod range; |
65 | |
66 | #[cfg (test)] |
67 | mod tests; |
68 | |
69 | use crate::range::{slice, Range}; |
70 | use std::cmp; |
71 | use std::collections::VecDeque; |
72 | use std::fmt::{self, Debug, Display, Write}; |
73 | |
74 | #[derive (Copy, Clone, PartialEq, Eq)] |
75 | pub enum Chunk<'a> { |
76 | Equal(&'a str), |
77 | Delete(&'a str), |
78 | Insert(&'a str), |
79 | } |
80 | |
81 | #[derive (Copy, Clone)] |
82 | enum Diff<'a, 'b> { |
83 | Equal(Range<'a>, Range<'b>), |
84 | Delete(Range<'a>), |
85 | Insert(Range<'b>), |
86 | } |
87 | |
88 | impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> { |
89 | fn text(&self) -> Range<'tmp> { |
90 | match *self { |
91 | Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range, |
92 | } |
93 | } |
94 | |
95 | fn grow_left(&mut self, increment: usize) { |
96 | self.for_each(|range| { |
97 | range.offset -= increment; |
98 | range.len += increment; |
99 | }); |
100 | } |
101 | |
102 | fn grow_right(&mut self, increment: usize) { |
103 | self.for_each(|range| range.len += increment); |
104 | } |
105 | |
106 | fn shift_left(&mut self, increment: usize) { |
107 | self.for_each(|range| range.offset -= increment); |
108 | } |
109 | |
110 | fn shift_right(&mut self, increment: usize) { |
111 | self.for_each(|range| range.offset += increment); |
112 | } |
113 | |
114 | fn for_each(&mut self, f: impl Fn(&mut Range)) { |
115 | match self { |
116 | Diff::Equal(range1, range2) => { |
117 | f(range1); |
118 | f(range2); |
119 | } |
120 | Diff::Delete(range) => f(range), |
121 | Diff::Insert(range) => f(range), |
122 | } |
123 | } |
124 | } |
125 | |
126 | pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> { |
127 | let chars1: Vec<char> = text1.chars().collect(); |
128 | let chars2: Vec<char> = text2.chars().collect(); |
129 | let range1 = Range::new(&chars1, ..); |
130 | let range2 = Range::new(&chars2, ..); |
131 | |
132 | let mut solution = main(range1, range2); |
133 | cleanup_char_boundary(&mut solution); |
134 | cleanup_semantic(&mut solution); |
135 | cleanup_merge(&mut solution); |
136 | |
137 | let mut chunks = Vec::new(); |
138 | let mut pos1 = 0; |
139 | let mut pos2 = 0; |
140 | for diff in solution.diffs { |
141 | chunks.push(match diff { |
142 | Diff::Equal(range, _) => { |
143 | let len = range.len_bytes(); |
144 | let chunk = Chunk::Equal(&text1[pos1..pos1 + len]); |
145 | pos1 += len; |
146 | pos2 += len; |
147 | chunk |
148 | } |
149 | Diff::Delete(range) => { |
150 | let len = range.len_bytes(); |
151 | let chunk = Chunk::Delete(&text1[pos1..pos1 + len]); |
152 | pos1 += len; |
153 | chunk |
154 | } |
155 | Diff::Insert(range) => { |
156 | let len = range.len_bytes(); |
157 | let chunk = Chunk::Insert(&text2[pos2..pos2 + len]); |
158 | pos2 += len; |
159 | chunk |
160 | } |
161 | }); |
162 | } |
163 | chunks |
164 | } |
165 | |
166 | struct Solution<'a, 'b> { |
167 | text1: Range<'a>, |
168 | text2: Range<'b>, |
169 | diffs: Vec<Diff<'a, 'b>>, |
170 | } |
171 | |
172 | fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> { |
173 | let whole1 = text1; |
174 | let whole2 = text2; |
175 | |
176 | // Trim off common prefix. |
177 | let common_prefix_len = common_prefix(text1, text2); |
178 | let common_prefix = Diff::Equal( |
179 | text1.substring(..common_prefix_len), |
180 | text2.substring(..common_prefix_len), |
181 | ); |
182 | text1 = text1.substring(common_prefix_len..); |
183 | text2 = text2.substring(common_prefix_len..); |
184 | |
185 | // Trim off common suffix. |
186 | let common_suffix_len = common_suffix(text1, text2); |
187 | let common_suffix = Diff::Equal( |
188 | text1.substring(text1.len - common_suffix_len..), |
189 | text2.substring(text2.len - common_suffix_len..), |
190 | ); |
191 | text1 = text1.substring(..text1.len - common_suffix_len); |
192 | text2 = text2.substring(..text2.len - common_suffix_len); |
193 | |
194 | // Compute the diff on the middle block. |
195 | let mut solution = Solution { |
196 | text1: whole1, |
197 | text2: whole2, |
198 | diffs: compute(text1, text2), |
199 | }; |
200 | |
201 | // Restore the prefix and suffix. |
202 | if common_prefix_len > 0 { |
203 | solution.diffs.insert(0, common_prefix); |
204 | } |
205 | if common_suffix_len > 0 { |
206 | solution.diffs.push(common_suffix); |
207 | } |
208 | |
209 | cleanup_merge(&mut solution); |
210 | |
211 | solution |
212 | } |
213 | |
214 | // Find the differences between two texts. Assumes that the texts do not have |
215 | // any common prefix or suffix. |
216 | fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
217 | match (text1.is_empty(), text2.is_empty()) { |
218 | (true, true) => return Vec::new(), |
219 | (true, false) => return vec![Diff::Insert(text2)], |
220 | (false, true) => return vec![Diff::Delete(text1)], |
221 | (false, false) => {} |
222 | } |
223 | |
224 | // Check for entire shorter text inside the longer text. |
225 | if text1.len > text2.len { |
226 | if let Some(i) = text1.find(text2) { |
227 | return vec![ |
228 | Diff::Delete(text1.substring(..i)), |
229 | Diff::Equal(text1.substring(i..i + text2.len), text2), |
230 | Diff::Delete(text1.substring(i + text2.len..)), |
231 | ]; |
232 | } |
233 | } else { |
234 | if let Some(i) = text2.find(text1) { |
235 | return vec![ |
236 | Diff::Insert(text2.substring(..i)), |
237 | Diff::Equal(text1, text2.substring(i..i + text1.len)), |
238 | Diff::Insert(text2.substring(i + text1.len..)), |
239 | ]; |
240 | } |
241 | } |
242 | |
243 | if text1.len == 1 || text2.len == 1 { |
244 | // Single character string. |
245 | // After the previous check, the character can't be an equality. |
246 | return vec![Diff::Delete(text1), Diff::Insert(text2)]; |
247 | } |
248 | |
249 | bisect(text1, text2) |
250 | } |
251 | |
252 | // Find the 'middle snake' of a diff, split the problem in two and return the |
253 | // recursively constructed diff. |
254 | // |
255 | // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. |
256 | fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
257 | let max_d = (text1.len + text2.len + 1) / 2; |
258 | let v_offset = max_d; |
259 | let v_len = 2 * max_d; |
260 | let mut v1 = vec![-1isize; v_len]; |
261 | let mut v2 = vec![-1isize; v_len]; |
262 | v1[v_offset + 1] = 0; |
263 | v2[v_offset + 1] = 0; |
264 | let delta = text1.len as isize - text2.len as isize; |
265 | // If the total number of characters is odd, then the front path will |
266 | // collide with the reverse path. |
267 | let front = delta % 2 != 0; |
268 | // Offsets for start and end of k loop. |
269 | // Prevents mapping of space beyond the grid. |
270 | let mut k1start = 0; |
271 | let mut k1end = 0; |
272 | let mut k2start = 0; |
273 | let mut k2end = 0; |
274 | for d in 0..max_d as isize { |
275 | // Walk the front path one step. |
276 | let mut k1 = -d + k1start; |
277 | while k1 <= d - k1end { |
278 | let k1_offset = (v_offset as isize + k1) as usize; |
279 | let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) { |
280 | v1[k1_offset + 1] |
281 | } else { |
282 | v1[k1_offset - 1] + 1 |
283 | } as usize; |
284 | let mut y1 = (x1 as isize - k1) as usize; |
285 | if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) { |
286 | let advance = common_prefix(s1, s2); |
287 | x1 += advance; |
288 | y1 += advance; |
289 | } |
290 | v1[k1_offset] = x1 as isize; |
291 | if x1 > text1.len { |
292 | // Ran off the right of the graph. |
293 | k1end += 2; |
294 | } else if y1 > text2.len { |
295 | // Ran off the bottom of the graph. |
296 | k1start += 2; |
297 | } else if front { |
298 | let k2_offset = v_offset as isize + delta - k1; |
299 | if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 { |
300 | // Mirror x2 onto top-left coordinate system. |
301 | let x2 = text1.len as isize - v2[k2_offset as usize]; |
302 | if x1 as isize >= x2 { |
303 | // Overlap detected. |
304 | return bisect_split(text1, text2, x1, y1); |
305 | } |
306 | } |
307 | } |
308 | k1 += 2; |
309 | } |
310 | |
311 | // Walk the reverse path one step. |
312 | let mut k2 = -d + k2start; |
313 | while k2 <= d - k2end { |
314 | let k2_offset = (v_offset as isize + k2) as usize; |
315 | let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) { |
316 | v2[k2_offset + 1] |
317 | } else { |
318 | v2[k2_offset - 1] + 1 |
319 | } as usize; |
320 | let mut y2 = (x2 as isize - k2) as usize; |
321 | if x2 < text1.len && y2 < text2.len { |
322 | let advance = common_suffix( |
323 | text1.substring(..text1.len - x2), |
324 | text2.substring(..text2.len - y2), |
325 | ); |
326 | x2 += advance; |
327 | y2 += advance; |
328 | } |
329 | v2[k2_offset] = x2 as isize; |
330 | if x2 > text1.len { |
331 | // Ran off the left of the graph. |
332 | k2end += 2; |
333 | } else if y2 > text2.len { |
334 | // Ran off the top of the graph. |
335 | k2start += 2; |
336 | } else if !front { |
337 | let k1_offset = v_offset as isize + delta - k2; |
338 | if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 { |
339 | let x1 = v1[k1_offset as usize] as usize; |
340 | let y1 = v_offset + x1 - k1_offset as usize; |
341 | // Mirror x2 onto top-left coordinate system. |
342 | x2 = text1.len - x2; |
343 | if x1 >= x2 { |
344 | // Overlap detected. |
345 | return bisect_split(text1, text2, x1, y1); |
346 | } |
347 | } |
348 | } |
349 | k2 += 2; |
350 | } |
351 | } |
352 | // Number of diffs equals number of characters, no commonality at all. |
353 | vec![Diff::Delete(text1), Diff::Insert(text2)] |
354 | } |
355 | |
356 | // Given the location of the 'middle snake', split the diff in two parts and |
357 | // recurse. |
358 | fn bisect_split<'a, 'b>( |
359 | text1: Range<'a>, |
360 | text2: Range<'b>, |
361 | x: usize, |
362 | y: usize, |
363 | ) -> Vec<Diff<'a, 'b>> { |
364 | let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x); |
365 | let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y); |
366 | |
367 | // Compute both diffs serially. |
368 | let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs; |
369 | diffs.extend(iter:main(text1:text1b, text2:text2b).diffs); |
370 | |
371 | diffs |
372 | } |
373 | |
374 | // Determine the length of the common prefix of two strings. |
375 | fn common_prefix(text1: Range, text2: Range) -> usize { |
376 | for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() { |
377 | if b1 != b2 { |
378 | return i; |
379 | } |
380 | } |
381 | cmp::min(v1:text1.len, v2:text2.len) |
382 | } |
383 | |
384 | // Determine the length of the common suffix of two strings. |
385 | fn common_suffix(text1: Range, text2: Range) -> usize { |
386 | for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() { |
387 | if b1 != b2 { |
388 | return i; |
389 | } |
390 | } |
391 | cmp::min(v1:text1.len, v2:text2.len) |
392 | } |
393 | |
394 | // Determine if the suffix of one string is the prefix of another. |
395 | // |
396 | // Returns the number of characters common to the end of the first string and |
397 | // the start of the second string. |
398 | fn common_overlap(mut text1: Range, mut text2: Range) -> usize { |
399 | // Eliminate the null case. |
400 | if text1.is_empty() || text2.is_empty() { |
401 | return 0; |
402 | } |
403 | // Truncate the longer string. |
404 | if text1.len > text2.len { |
405 | text1 = text1.substring(text1.len - text2.len..); |
406 | } else if text1.len < text2.len { |
407 | text2 = text2.substring(..text1.len); |
408 | } |
409 | // Quick check for the worst case. |
410 | if slice(text1) == slice(text2) { |
411 | return text1.len; |
412 | } |
413 | |
414 | // Start by looking for a single character match |
415 | // and increase length until no match is found. |
416 | // Performance analysis: https://neil.fraser.name/news/2010/11/04/ |
417 | let mut best = 0; |
418 | let mut length = 1; |
419 | loop { |
420 | let pattern = text1.substring(text1.len - length..); |
421 | let found = match text2.find(pattern) { |
422 | Some(found) => found, |
423 | None => return best, |
424 | }; |
425 | length += found; |
426 | if found == 0 |
427 | || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length)) |
428 | { |
429 | best = length; |
430 | length += 1; |
431 | } |
432 | } |
433 | } |
434 | |
435 | fn cleanup_char_boundary(solution: &mut Solution) { |
436 | fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool { |
437 | // FIXME: use unicode-segmentation crate? |
438 | let _ = doc; |
439 | let _ = pos; |
440 | true |
441 | } |
442 | |
443 | fn boundary_down(doc: &[char], pos: usize) -> usize { |
444 | let mut adjust = 0; |
445 | while !is_segmentation_boundary(doc, pos - adjust) { |
446 | adjust += 1; |
447 | } |
448 | adjust |
449 | } |
450 | |
451 | fn boundary_up(doc: &[char], pos: usize) -> usize { |
452 | let mut adjust = 0; |
453 | while !is_segmentation_boundary(doc, pos + adjust) { |
454 | adjust += 1; |
455 | } |
456 | adjust |
457 | } |
458 | |
459 | fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) { |
460 | let prev_end = prev.offset + prev.len; |
461 | if prev_end > range.offset { |
462 | let delta = cmp::min(prev_end - range.offset, range.len); |
463 | range.offset += delta; |
464 | range.len -= delta; |
465 | } |
466 | } |
467 | |
468 | let mut read = 0; |
469 | let mut retain = 0; |
470 | let mut last_delete = Range::empty(); |
471 | let mut last_insert = Range::empty(); |
472 | while let Some(&(mut diff)) = solution.diffs.get(read) { |
473 | read += 1; |
474 | match &mut diff { |
475 | Diff::Equal(range1, range2) => { |
476 | let adjust = boundary_up(range1.doc, range1.offset); |
477 | // If the whole range is sub-character, skip it. |
478 | if range1.len <= adjust { |
479 | continue; |
480 | } |
481 | range1.offset += adjust; |
482 | range1.len -= adjust; |
483 | range2.offset += adjust; |
484 | range2.len -= adjust; |
485 | let adjust = boundary_down(range1.doc, range1.offset + range1.len); |
486 | range1.len -= adjust; |
487 | range2.len -= adjust; |
488 | last_delete = Range::empty(); |
489 | last_insert = Range::empty(); |
490 | } |
491 | Diff::Delete(range) => { |
492 | skip_overlap(&last_delete, range); |
493 | if range.len == 0 { |
494 | continue; |
495 | } |
496 | let adjust = boundary_down(range.doc, range.offset); |
497 | range.offset -= adjust; |
498 | range.len += adjust; |
499 | let adjust = boundary_up(range.doc, range.offset + range.len); |
500 | range.len += adjust; |
501 | last_delete = *range; |
502 | } |
503 | Diff::Insert(range) => { |
504 | skip_overlap(&last_insert, range); |
505 | if range.len == 0 { |
506 | continue; |
507 | } |
508 | let adjust = boundary_down(range.doc, range.offset); |
509 | range.offset -= adjust; |
510 | range.len += adjust; |
511 | let adjust = boundary_up(range.doc, range.offset + range.len); |
512 | range.len += adjust; |
513 | last_insert = *range; |
514 | } |
515 | } |
516 | solution.diffs[retain] = diff; |
517 | retain += 1; |
518 | } |
519 | |
520 | solution.diffs.truncate(retain); |
521 | } |
522 | |
523 | // Reduce the number of edits by eliminating semantically trivial equalities. |
524 | fn cleanup_semantic(solution: &mut Solution) { |
525 | let mut diffs = &mut solution.diffs; |
526 | if diffs.is_empty() { |
527 | return; |
528 | } |
529 | |
530 | let mut changes = false; |
531 | let mut equalities = VecDeque::new(); // Double-ended queue of equalities. |
532 | let mut last_equality = None; // Always equal to equalities.peek().text |
533 | let mut pointer = 0; |
534 | // Number of characters that changed prior to the equality. |
535 | let mut len_insertions1 = 0; |
536 | let mut len_deletions1 = 0; |
537 | // Number of characters that changed after the equality. |
538 | let mut len_insertions2 = 0; |
539 | let mut len_deletions2 = 0; |
540 | while let Some(&this_diff) = diffs.get(pointer) { |
541 | match this_diff { |
542 | Diff::Equal(text1, text2) => { |
543 | equalities.push_back(pointer); |
544 | len_insertions1 = len_insertions2; |
545 | len_deletions1 = len_deletions2; |
546 | len_insertions2 = 0; |
547 | len_deletions2 = 0; |
548 | last_equality = Some((text1, text2)); |
549 | pointer += 1; |
550 | continue; |
551 | } |
552 | Diff::Delete(text) => len_deletions2 += text.len, |
553 | Diff::Insert(text) => len_insertions2 += text.len, |
554 | } |
555 | // Eliminate an equality that is smaller or equal to the edits on both |
556 | // sides of it. |
557 | if last_equality.map_or(false, |(last_equality, _)| { |
558 | last_equality.len <= cmp::max(len_insertions1, len_deletions1) |
559 | && last_equality.len <= cmp::max(len_insertions2, len_deletions2) |
560 | }) { |
561 | // Jump back to offending equality. |
562 | pointer = equalities.pop_back().unwrap(); |
563 | |
564 | // Replace equality with a delete. |
565 | diffs[pointer] = Diff::Delete(last_equality.unwrap().0); |
566 | // Insert a corresponding insert. |
567 | diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1)); |
568 | |
569 | len_insertions1 = 0; // Reset the counters. |
570 | len_insertions2 = 0; |
571 | len_deletions1 = 0; |
572 | len_deletions2 = 0; |
573 | last_equality = None; |
574 | changes = true; |
575 | |
576 | // Throw away the previous equality (it needs to be reevaluated). |
577 | equalities.pop_back(); |
578 | if let Some(back) = equalities.back() { |
579 | // There is a safe equality we can fall back to. |
580 | pointer = *back; |
581 | } else { |
582 | // There are no previous equalities, jump back to the start. |
583 | pointer = 0; |
584 | continue; |
585 | } |
586 | } |
587 | pointer += 1; |
588 | } |
589 | |
590 | // Normalize the diff. |
591 | if changes { |
592 | cleanup_merge(solution); |
593 | } |
594 | cleanup_semantic_lossless(solution); |
595 | diffs = &mut solution.diffs; |
596 | |
597 | // Find any overlaps between deletions and insertions. |
598 | // e.g: <del>abcxxx</del><ins>xxxdef</ins> |
599 | // -> <del>abc</del>xxx<ins>def</ins> |
600 | // e.g: <del>xxxabc</del><ins>defxxx</ins> |
601 | // -> <ins>def</ins>xxx<del>abc</del> |
602 | // Only extract an overlap if it is as big as the edit ahead or behind it. |
603 | let mut pointer = 1; |
604 | while let Some(&this_diff) = diffs.get(pointer) { |
605 | let prev_diff = diffs[pointer - 1]; |
606 | if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) { |
607 | let overlap_len1 = common_overlap(deletion, insertion); |
608 | let overlap_len2 = common_overlap(insertion, deletion); |
609 | let overlap_min = cmp::min(deletion.len, insertion.len); |
610 | if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min { |
611 | // Overlap found. Insert an equality and trim the surrounding edits. |
612 | diffs.insert( |
613 | pointer, |
614 | Diff::Equal( |
615 | deletion.substring(deletion.len - overlap_len1..deletion.len), |
616 | insertion.substring(..overlap_len1), |
617 | ), |
618 | ); |
619 | diffs[pointer - 1] = |
620 | Diff::Delete(deletion.substring(..deletion.len - overlap_len1)); |
621 | diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..)); |
622 | } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min { |
623 | // Reverse overlap found. |
624 | // Insert an equality and swap and trim the surrounding edits. |
625 | diffs.insert( |
626 | pointer, |
627 | Diff::Equal( |
628 | deletion.substring(..overlap_len2), |
629 | insertion.substring(insertion.len - overlap_len2..insertion.len), |
630 | ), |
631 | ); |
632 | diffs[pointer - 1] = |
633 | Diff::Insert(insertion.substring(..insertion.len - overlap_len2)); |
634 | diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..)); |
635 | } |
636 | pointer += 1; |
637 | } |
638 | pointer += 1; |
639 | } |
640 | } |
641 | |
642 | // Look for single edits surrounded on both sides by equalities which can be |
643 | // shifted sideways to align the edit to a word boundary. |
644 | // |
645 | // e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. |
646 | fn cleanup_semantic_lossless(solution: &mut Solution) { |
647 | let diffs = &mut solution.diffs; |
648 | let mut pointer = 1; |
649 | while let Some(&next_diff) = diffs.get(pointer + 1) { |
650 | let prev_diff = diffs[pointer - 1]; |
651 | if let ( |
652 | Diff::Equal(mut prev_equal1, mut prev_equal2), |
653 | Diff::Equal(mut next_equal1, mut next_equal2), |
654 | ) = (prev_diff, next_diff) |
655 | { |
656 | // This is a single edit surrounded by equalities. |
657 | let mut edit = diffs[pointer]; |
658 | |
659 | // First, shift the edit as far left as possible. |
660 | let common_offset = common_suffix(prev_equal1, edit.text()); |
661 | let original_prev_len = prev_equal1.len; |
662 | prev_equal1.len -= common_offset; |
663 | prev_equal2.len -= common_offset; |
664 | edit.shift_left(common_offset); |
665 | next_equal1.offset -= common_offset; |
666 | next_equal1.len += common_offset; |
667 | next_equal2.offset -= common_offset; |
668 | next_equal2.len += common_offset; |
669 | |
670 | // Second, step character by character right, looking for the best fit. |
671 | let mut best_prev_equal = (prev_equal1, prev_equal2); |
672 | let mut best_edit = edit; |
673 | let mut best_next_equal = (next_equal1, next_equal2); |
674 | let mut best_score = cleanup_semantic_score(prev_equal1, edit.text()) |
675 | + cleanup_semantic_score(edit.text(), next_equal1); |
676 | while !edit.text().is_empty() |
677 | && !next_equal1.is_empty() |
678 | && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap() |
679 | { |
680 | prev_equal1.len += 1; |
681 | prev_equal2.len += 1; |
682 | edit.shift_right(1); |
683 | next_equal1.offset += 1; |
684 | next_equal1.len -= 1; |
685 | next_equal2.offset += 1; |
686 | next_equal2.len -= 1; |
687 | let score = cleanup_semantic_score(prev_equal1, edit.text()) |
688 | + cleanup_semantic_score(edit.text(), next_equal1); |
689 | // The >= encourages trailing rather than leading whitespace on edits. |
690 | if score >= best_score { |
691 | best_score = score; |
692 | best_prev_equal = (prev_equal1, prev_equal2); |
693 | best_edit = edit; |
694 | best_next_equal = (next_equal1, next_equal2); |
695 | } |
696 | } |
697 | |
698 | if original_prev_len != best_prev_equal.0.len { |
699 | // We have an improvement, save it back to the diff. |
700 | if best_next_equal.0.is_empty() { |
701 | diffs.remove(pointer + 1); |
702 | } else { |
703 | diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1); |
704 | } |
705 | diffs[pointer] = best_edit; |
706 | if best_prev_equal.0.is_empty() { |
707 | diffs.remove(pointer - 1); |
708 | pointer -= 1; |
709 | } else { |
710 | diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1); |
711 | } |
712 | } |
713 | } |
714 | pointer += 1; |
715 | } |
716 | } |
717 | |
718 | // Given two strings, compute a score representing whether the internal boundary |
719 | // falls on logical boundaries. |
720 | // |
721 | // Scores range from 6 (best) to 0 (worst). |
722 | fn cleanup_semantic_score(one: Range, two: Range) -> usize { |
723 | if one.is_empty() || two.is_empty() { |
724 | // Edges are the best. |
725 | return 6; |
726 | } |
727 | |
728 | // Each port of this function behaves slightly differently due to subtle |
729 | // differences in each language's definition of things like 'whitespace'. |
730 | // Since this function's purpose is largely cosmetic, the choice has been |
731 | // made to use each language's native features rather than force total |
732 | // conformity. |
733 | let char1 = one.chars().next_back().unwrap(); |
734 | let char2 = two.chars().next().unwrap(); |
735 | let non_alphanumeric1 = !char1.is_ascii_alphanumeric(); |
736 | let non_alphanumeric2 = !char2.is_ascii_alphanumeric(); |
737 | let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace(); |
738 | let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace(); |
739 | let line_break1 = whitespace1 && char1.is_control(); |
740 | let line_break2 = whitespace2 && char2.is_control(); |
741 | let blank_line1 = |
742 | line_break1 && (one.ends_with([' \n' , ' \n' ]) || one.ends_with([' \n' , ' \r' , ' \n' ])); |
743 | let blank_line2 = |
744 | line_break2 && (two.starts_with([' \n' , ' \n' ]) || two.starts_with([' \r' , ' \n' , ' \r' , ' \n' ])); |
745 | |
746 | if blank_line1 || blank_line2 { |
747 | // Five points for blank lines. |
748 | 5 |
749 | } else if line_break1 || line_break2 { |
750 | // Four points for line breaks. |
751 | 4 |
752 | } else if non_alphanumeric1 && !whitespace1 && whitespace2 { |
753 | // Three points for end of sentences. |
754 | 3 |
755 | } else if whitespace1 || whitespace2 { |
756 | // Two points for whitespace. |
757 | 2 |
758 | } else if non_alphanumeric1 || non_alphanumeric2 { |
759 | // One point for non-alphanumeric. |
760 | 1 |
761 | } else { |
762 | 0 |
763 | } |
764 | } |
765 | |
766 | // Reorder and merge like edit sections. Merge equalities. Any edit section can |
767 | // move as long as it doesn't cross an equality. |
768 | fn cleanup_merge(solution: &mut Solution) { |
769 | let diffs = &mut solution.diffs; |
770 | while !diffs.is_empty() { |
771 | diffs.push(Diff::Equal( |
772 | solution.text1.substring(solution.text1.len..), |
773 | solution.text2.substring(solution.text2.len..), |
774 | )); // Add a dummy entry at the end. |
775 | let mut pointer = 0; |
776 | let mut count_delete = 0; |
777 | let mut count_insert = 0; |
778 | let mut text_delete = Range::empty(); |
779 | let mut text_insert = Range::empty(); |
780 | while let Some(&this_diff) = diffs.get(pointer) { |
781 | match this_diff { |
782 | Diff::Insert(text) => { |
783 | count_insert += 1; |
784 | if text_insert.is_empty() { |
785 | text_insert = text; |
786 | } else { |
787 | text_insert.len += text.len; |
788 | } |
789 | } |
790 | Diff::Delete(text) => { |
791 | count_delete += 1; |
792 | if text_delete.is_empty() { |
793 | text_delete = text; |
794 | } else { |
795 | text_delete.len += text.len; |
796 | } |
797 | } |
798 | Diff::Equal(text, _) => { |
799 | let count_both = count_delete + count_insert; |
800 | if count_both > 1 { |
801 | let both_types = count_delete != 0 && count_insert != 0; |
802 | // Delete the offending records. |
803 | diffs.splice(pointer - count_both..pointer, None); |
804 | pointer -= count_both; |
805 | if both_types { |
806 | // Factor out any common prefix. |
807 | let common_length = common_prefix(text_insert, text_delete); |
808 | if common_length != 0 { |
809 | if pointer > 0 { |
810 | match &mut diffs[pointer - 1] { |
811 | Diff::Equal(this_diff1, this_diff2) => { |
812 | this_diff1.len += common_length; |
813 | this_diff2.len += common_length; |
814 | } |
815 | _ => unreachable!( |
816 | "previous diff should have been an equality" |
817 | ), |
818 | } |
819 | } else { |
820 | diffs.insert( |
821 | pointer, |
822 | Diff::Equal( |
823 | text_delete.substring(..common_length), |
824 | text_insert.substring(..common_length), |
825 | ), |
826 | ); |
827 | pointer += 1; |
828 | } |
829 | text_insert = text_insert.substring(common_length..); |
830 | text_delete = text_delete.substring(common_length..); |
831 | } |
832 | // Factor out any common suffix. |
833 | let common_length = common_suffix(text_insert, text_delete); |
834 | if common_length != 0 { |
835 | diffs[pointer].grow_left(common_length); |
836 | text_insert.len -= common_length; |
837 | text_delete.len -= common_length; |
838 | } |
839 | } |
840 | // Insert the merged records. |
841 | if !text_delete.is_empty() { |
842 | diffs.insert(pointer, Diff::Delete(text_delete)); |
843 | pointer += 1; |
844 | } |
845 | if !text_insert.is_empty() { |
846 | diffs.insert(pointer, Diff::Insert(text_insert)); |
847 | pointer += 1; |
848 | } |
849 | } else if pointer > 0 { |
850 | if let Some(Diff::Equal(prev_equal1, prev_equal2)) = |
851 | diffs.get_mut(pointer - 1) |
852 | { |
853 | // Merge this equality with the previous one. |
854 | prev_equal1.len += text.len; |
855 | prev_equal2.len += text.len; |
856 | diffs.remove(pointer); |
857 | pointer -= 1; |
858 | } |
859 | } |
860 | count_insert = 0; |
861 | count_delete = 0; |
862 | text_delete = Range::empty(); |
863 | text_insert = Range::empty(); |
864 | } |
865 | } |
866 | pointer += 1; |
867 | } |
868 | if diffs.last().unwrap().text().is_empty() { |
869 | diffs.pop(); // Remove the dummy entry at the end. |
870 | } |
871 | |
872 | // Second pass: look for single edits surrounded on both sides by equalities |
873 | // which can be shifted sideways to eliminate an equality. |
874 | // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC |
875 | let mut changes = false; |
876 | let mut pointer = 1; |
877 | // Intentionally ignore the first and last element (don't need checking). |
878 | while let Some(&next_diff) = diffs.get(pointer + 1) { |
879 | let prev_diff = diffs[pointer - 1]; |
880 | let this_diff = diffs[pointer]; |
881 | if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) { |
882 | // This is a single edit surrounded by equalities. |
883 | if this_diff.text().ends_with(prev_diff) { |
884 | // Shift the edit over the previous equality. |
885 | diffs[pointer].shift_left(prev_diff.len); |
886 | diffs[pointer + 1].grow_left(prev_diff.len); |
887 | diffs.remove(pointer - 1); // Delete prev_diff. |
888 | changes = true; |
889 | } else if this_diff.text().starts_with(next_diff) { |
890 | // Shift the edit over the next equality. |
891 | diffs[pointer - 1].grow_right(next_diff.len); |
892 | diffs[pointer].shift_right(next_diff.len); |
893 | diffs.remove(pointer + 1); // Delete next_diff. |
894 | changes = true; |
895 | } |
896 | } |
897 | pointer += 1; |
898 | } |
899 | // If shifts were made, the diff needs reordering and another shift sweep. |
900 | if !changes { |
901 | return; |
902 | } |
903 | } |
904 | } |
905 | |
906 | impl Debug for Chunk<'_> { |
907 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
908 | let (name: &str, text: &str) = match *self { |
909 | Chunk::Equal(text: &str) => ("Equal" , text), |
910 | Chunk::Delete(text: &str) => ("Delete" , text), |
911 | Chunk::Insert(text: &str) => ("Insert" , text), |
912 | }; |
913 | write!(formatter, " {}( {:?})" , name, text) |
914 | } |
915 | } |
916 | |
917 | impl Debug for Diff<'_, '_> { |
918 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
919 | let (name: &str, range: Range<'_>) = match *self { |
920 | Diff::Equal(range: Range<'_>, _) => ("Equal" , range), |
921 | Diff::Delete(range: Range<'_>) => ("Delete" , range), |
922 | Diff::Insert(range: Range<'_>) => ("Insert" , range), |
923 | }; |
924 | formatter.write_str(data:name)?; |
925 | formatter.write_str(data:"( \"" )?; |
926 | for ch: char in range.chars() { |
927 | if ch == ' \'' { |
928 | // escape_debug turns this into "\'" which is unnecessary. |
929 | formatter.write_char(ch)?; |
930 | } else { |
931 | Display::fmt(&ch.escape_debug(), f:formatter)?; |
932 | } |
933 | } |
934 | formatter.write_str(data:" \")" )?; |
935 | Ok(()) |
936 | } |
937 | } |
938 | |