1 | //! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar) |
2 | //! |
3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github |
4 | //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust |
5 | //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs |
6 | //! |
7 | //! <br> |
8 | //! |
9 | //! ## Diff library with semantic cleanup, based on Google's diff-match-patch |
10 | //! |
11 | //! This library is a port of the Diff component of [Diff Match Patch] to Rust. |
12 | //! The diff implementation is based on [Myers' diff algorithm] but includes |
13 | //! some [semantic cleanups] to increase human readability by factoring out |
14 | //! commonalities which are likely to be coincidental. |
15 | //! |
16 | //! Diff Match Patch was originally built in 2006 to power Google Docs. |
17 | //! |
18 | //! # Interface |
19 | //! |
20 | //! Here is the entire API of the Rust implementation. It operates on borrowed |
21 | //! strings and the return value of the diff algorithm is a vector of chunks |
22 | //! pointing into slices of those input strings. |
23 | //! |
24 | //! ``` |
25 | //! pub enum Chunk<'a> { |
26 | //! Equal(&'a str), |
27 | //! Delete(&'a str), |
28 | //! Insert(&'a str), |
29 | //! } |
30 | //! |
31 | //! # const IGNORE: &str = stringify! { |
32 | //! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>; |
33 | //! # }; |
34 | //! ``` |
35 | //! |
36 | //! [Diff Match Patch]: https://github.com/google/diff-match-patch |
37 | //! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf |
38 | //! [semantic cleanups]: https://neil.fraser.name/writing/diff/ |
39 | |
40 | #![doc (html_root_url = "https://docs.rs/dissimilar/1.0.10" )] |
41 | #![allow ( |
42 | clippy::blocks_in_conditions, |
43 | clippy::bool_to_int_with_if, |
44 | clippy::cast_possible_wrap, |
45 | clippy::cast_sign_loss, |
46 | clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127 |
47 | clippy::collapsible_else_if, |
48 | clippy::comparison_chain, |
49 | clippy::implied_bounds_in_impls, |
50 | clippy::items_after_test_module, // https://github.com/rust-lang/rust-clippy/issues/10713 |
51 | clippy::let_underscore_untyped, |
52 | clippy::match_same_arms, |
53 | clippy::module_name_repetitions, |
54 | clippy::must_use_candidate, |
55 | clippy::new_without_default, |
56 | clippy::octal_escapes, |
57 | clippy::shadow_unrelated, |
58 | clippy::similar_names, |
59 | clippy::too_many_lines, |
60 | clippy::unseparated_literal_suffix, |
61 | unused_parens, // false positive on Some(&(mut diff)) pattern |
62 | )] |
63 | |
64 | mod find; |
65 | mod range; |
66 | |
67 | #[cfg (test)] |
68 | mod tests; |
69 | |
70 | use crate::range::{slice, Range}; |
71 | use std::cmp; |
72 | use std::collections::VecDeque; |
73 | use std::fmt::{self, Debug, Display, Write}; |
74 | |
75 | #[derive (Copy, Clone, PartialEq, Eq)] |
76 | pub enum Chunk<'a> { |
77 | Equal(&'a str), |
78 | Delete(&'a str), |
79 | Insert(&'a str), |
80 | } |
81 | |
82 | #[derive (Copy, Clone)] |
83 | enum Diff<'a, 'b> { |
84 | Equal(Range<'a>, Range<'b>), |
85 | Delete(Range<'a>), |
86 | Insert(Range<'b>), |
87 | } |
88 | |
89 | impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> { |
90 | fn text(&self) -> Range<'tmp> { |
91 | match *self { |
92 | Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range, |
93 | } |
94 | } |
95 | |
96 | fn grow_left(&mut self, increment: usize) { |
97 | self.for_each(|range| { |
98 | range.offset -= increment; |
99 | range.len += increment; |
100 | }); |
101 | } |
102 | |
103 | fn grow_right(&mut self, increment: usize) { |
104 | self.for_each(|range| range.len += increment); |
105 | } |
106 | |
107 | fn shift_left(&mut self, increment: usize) { |
108 | self.for_each(|range| range.offset -= increment); |
109 | } |
110 | |
111 | fn shift_right(&mut self, increment: usize) { |
112 | self.for_each(|range| range.offset += increment); |
113 | } |
114 | |
115 | fn for_each(&mut self, f: impl Fn(&mut Range)) { |
116 | match self { |
117 | Diff::Equal(range1, range2) => { |
118 | f(range1); |
119 | f(range2); |
120 | } |
121 | Diff::Delete(range) => f(range), |
122 | Diff::Insert(range) => f(range), |
123 | } |
124 | } |
125 | } |
126 | |
127 | pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> { |
128 | let chars1: Vec<char> = text1.chars().collect(); |
129 | let chars2: Vec<char> = text2.chars().collect(); |
130 | let range1 = Range::new(&chars1, ..); |
131 | let range2 = Range::new(&chars2, ..); |
132 | |
133 | let mut solution = main(range1, range2); |
134 | cleanup_char_boundary(&mut solution); |
135 | cleanup_semantic(&mut solution); |
136 | cleanup_merge(&mut solution); |
137 | |
138 | let mut chunks = Vec::new(); |
139 | let mut pos1 = 0; |
140 | let mut pos2 = 0; |
141 | for diff in solution.diffs { |
142 | chunks.push(match diff { |
143 | Diff::Equal(range, _) => { |
144 | let len = range.len_bytes(); |
145 | let chunk = Chunk::Equal(&text1[pos1..pos1 + len]); |
146 | pos1 += len; |
147 | pos2 += len; |
148 | chunk |
149 | } |
150 | Diff::Delete(range) => { |
151 | let len = range.len_bytes(); |
152 | let chunk = Chunk::Delete(&text1[pos1..pos1 + len]); |
153 | pos1 += len; |
154 | chunk |
155 | } |
156 | Diff::Insert(range) => { |
157 | let len = range.len_bytes(); |
158 | let chunk = Chunk::Insert(&text2[pos2..pos2 + len]); |
159 | pos2 += len; |
160 | chunk |
161 | } |
162 | }); |
163 | } |
164 | chunks |
165 | } |
166 | |
167 | struct Solution<'a, 'b> { |
168 | text1: Range<'a>, |
169 | text2: Range<'b>, |
170 | diffs: Vec<Diff<'a, 'b>>, |
171 | } |
172 | |
173 | fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> { |
174 | let whole1 = text1; |
175 | let whole2 = text2; |
176 | |
177 | // Trim off common prefix. |
178 | let common_prefix_len = common_prefix(text1, text2); |
179 | let common_prefix = Diff::Equal( |
180 | text1.substring(..common_prefix_len), |
181 | text2.substring(..common_prefix_len), |
182 | ); |
183 | text1 = text1.substring(common_prefix_len..); |
184 | text2 = text2.substring(common_prefix_len..); |
185 | |
186 | // Trim off common suffix. |
187 | let common_suffix_len = common_suffix(text1, text2); |
188 | let common_suffix = Diff::Equal( |
189 | text1.substring(text1.len - common_suffix_len..), |
190 | text2.substring(text2.len - common_suffix_len..), |
191 | ); |
192 | text1 = text1.substring(..text1.len - common_suffix_len); |
193 | text2 = text2.substring(..text2.len - common_suffix_len); |
194 | |
195 | // Compute the diff on the middle block. |
196 | let mut solution = Solution { |
197 | text1: whole1, |
198 | text2: whole2, |
199 | diffs: compute(text1, text2), |
200 | }; |
201 | |
202 | // Restore the prefix and suffix. |
203 | if common_prefix_len > 0 { |
204 | solution.diffs.insert(0, common_prefix); |
205 | } |
206 | if common_suffix_len > 0 { |
207 | solution.diffs.push(common_suffix); |
208 | } |
209 | |
210 | cleanup_merge(&mut solution); |
211 | |
212 | solution |
213 | } |
214 | |
215 | // Find the differences between two texts. Assumes that the texts do not have |
216 | // any common prefix or suffix. |
217 | fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
218 | match (text1.is_empty(), text2.is_empty()) { |
219 | (true, true) => return Vec::new(), |
220 | (true, false) => return vec![Diff::Insert(text2)], |
221 | (false, true) => return vec![Diff::Delete(text1)], |
222 | (false, false) => {} |
223 | } |
224 | |
225 | // Check for entire shorter text inside the longer text. |
226 | if text1.len > text2.len { |
227 | if let Some(i) = text1.find(text2) { |
228 | return vec![ |
229 | Diff::Delete(text1.substring(..i)), |
230 | Diff::Equal(text1.substring(i..i + text2.len), text2), |
231 | Diff::Delete(text1.substring(i + text2.len..)), |
232 | ]; |
233 | } |
234 | } else { |
235 | if let Some(i) = text2.find(text1) { |
236 | return vec![ |
237 | Diff::Insert(text2.substring(..i)), |
238 | Diff::Equal(text1, text2.substring(i..i + text1.len)), |
239 | Diff::Insert(text2.substring(i + text1.len..)), |
240 | ]; |
241 | } |
242 | } |
243 | |
244 | if text1.len == 1 || text2.len == 1 { |
245 | // Single character string. |
246 | // After the previous check, the character can't be an equality. |
247 | return vec![Diff::Delete(text1), Diff::Insert(text2)]; |
248 | } |
249 | |
250 | bisect(text1, text2) |
251 | } |
252 | |
253 | // Find the 'middle snake' of a diff, split the problem in two and return the |
254 | // recursively constructed diff. |
255 | // |
256 | // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. |
257 | fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> { |
258 | let max_d = (text1.len + text2.len + 1) / 2; |
259 | let v_offset = max_d; |
260 | let v_len = 2 * max_d; |
261 | let mut v1 = vec![-1isize; v_len]; |
262 | let mut v2 = vec![-1isize; v_len]; |
263 | v1[v_offset + 1] = 0; |
264 | v2[v_offset + 1] = 0; |
265 | let delta = text1.len as isize - text2.len as isize; |
266 | // If the total number of characters is odd, then the front path will |
267 | // collide with the reverse path. |
268 | let front = delta % 2 != 0; |
269 | // Offsets for start and end of k loop. |
270 | // Prevents mapping of space beyond the grid. |
271 | let mut k1start = 0; |
272 | let mut k1end = 0; |
273 | let mut k2start = 0; |
274 | let mut k2end = 0; |
275 | for d in 0..max_d as isize { |
276 | // Walk the front path one step. |
277 | let mut k1 = -d + k1start; |
278 | while k1 <= d - k1end { |
279 | let k1_offset = (v_offset as isize + k1) as usize; |
280 | let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) { |
281 | v1[k1_offset + 1] |
282 | } else { |
283 | v1[k1_offset - 1] + 1 |
284 | } as usize; |
285 | let mut y1 = (x1 as isize - k1) as usize; |
286 | if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) { |
287 | let advance = common_prefix(s1, s2); |
288 | x1 += advance; |
289 | y1 += advance; |
290 | } |
291 | v1[k1_offset] = x1 as isize; |
292 | if x1 > text1.len { |
293 | // Ran off the right of the graph. |
294 | k1end += 2; |
295 | } else if y1 > text2.len { |
296 | // Ran off the bottom of the graph. |
297 | k1start += 2; |
298 | } else if front { |
299 | let k2_offset = v_offset as isize + delta - k1; |
300 | if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 { |
301 | // Mirror x2 onto top-left coordinate system. |
302 | let x2 = text1.len as isize - v2[k2_offset as usize]; |
303 | if x1 as isize >= x2 { |
304 | // Overlap detected. |
305 | return bisect_split(text1, text2, x1, y1); |
306 | } |
307 | } |
308 | } |
309 | k1 += 2; |
310 | } |
311 | |
312 | // Walk the reverse path one step. |
313 | let mut k2 = -d + k2start; |
314 | while k2 <= d - k2end { |
315 | let k2_offset = (v_offset as isize + k2) as usize; |
316 | let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) { |
317 | v2[k2_offset + 1] |
318 | } else { |
319 | v2[k2_offset - 1] + 1 |
320 | } as usize; |
321 | let mut y2 = (x2 as isize - k2) as usize; |
322 | if x2 < text1.len && y2 < text2.len { |
323 | let advance = common_suffix( |
324 | text1.substring(..text1.len - x2), |
325 | text2.substring(..text2.len - y2), |
326 | ); |
327 | x2 += advance; |
328 | y2 += advance; |
329 | } |
330 | v2[k2_offset] = x2 as isize; |
331 | if x2 > text1.len { |
332 | // Ran off the left of the graph. |
333 | k2end += 2; |
334 | } else if y2 > text2.len { |
335 | // Ran off the top of the graph. |
336 | k2start += 2; |
337 | } else if !front { |
338 | let k1_offset = v_offset as isize + delta - k2; |
339 | if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 { |
340 | let x1 = v1[k1_offset as usize] as usize; |
341 | let y1 = v_offset + x1 - k1_offset as usize; |
342 | // Mirror x2 onto top-left coordinate system. |
343 | x2 = text1.len - x2; |
344 | if x1 >= x2 { |
345 | // Overlap detected. |
346 | return bisect_split(text1, text2, x1, y1); |
347 | } |
348 | } |
349 | } |
350 | k2 += 2; |
351 | } |
352 | } |
353 | // Number of diffs equals number of characters, no commonality at all. |
354 | vec![Diff::Delete(text1), Diff::Insert(text2)] |
355 | } |
356 | |
357 | // Given the location of the 'middle snake', split the diff in two parts and |
358 | // recurse. |
359 | fn bisect_split<'a, 'b>( |
360 | text1: Range<'a>, |
361 | text2: Range<'b>, |
362 | x: usize, |
363 | y: usize, |
364 | ) -> Vec<Diff<'a, 'b>> { |
365 | let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x); |
366 | let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y); |
367 | |
368 | // Compute both diffs serially. |
369 | let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs; |
370 | diffs.extend(iter:main(text1:text1b, text2:text2b).diffs); |
371 | |
372 | diffs |
373 | } |
374 | |
375 | // Determine the length of the common prefix of two strings. |
376 | fn common_prefix(text1: Range, text2: Range) -> usize { |
377 | for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() { |
378 | if b1 != b2 { |
379 | return i; |
380 | } |
381 | } |
382 | cmp::min(v1:text1.len, v2:text2.len) |
383 | } |
384 | |
385 | // Determine the length of the common suffix of two strings. |
386 | fn common_suffix(text1: Range, text2: Range) -> usize { |
387 | for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() { |
388 | if b1 != b2 { |
389 | return i; |
390 | } |
391 | } |
392 | cmp::min(v1:text1.len, v2:text2.len) |
393 | } |
394 | |
395 | // Determine if the suffix of one string is the prefix of another. |
396 | // |
397 | // Returns the number of characters common to the end of the first string and |
398 | // the start of the second string. |
399 | fn common_overlap(mut text1: Range, mut text2: Range) -> usize { |
400 | // Eliminate the null case. |
401 | if text1.is_empty() || text2.is_empty() { |
402 | return 0; |
403 | } |
404 | // Truncate the longer string. |
405 | if text1.len > text2.len { |
406 | text1 = text1.substring(text1.len - text2.len..); |
407 | } else if text1.len < text2.len { |
408 | text2 = text2.substring(..text1.len); |
409 | } |
410 | // Quick check for the worst case. |
411 | if slice(text1) == slice(text2) { |
412 | return text1.len; |
413 | } |
414 | |
415 | // Start by looking for a single character match |
416 | // and increase length until no match is found. |
417 | // Performance analysis: https://neil.fraser.name/news/2010/11/04/ |
418 | let mut best = 0; |
419 | let mut length = 1; |
420 | loop { |
421 | let pattern = text1.substring(text1.len - length..); |
422 | let found = match text2.find(pattern) { |
423 | Some(found) => found, |
424 | None => return best, |
425 | }; |
426 | length += found; |
427 | if found == 0 |
428 | || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length)) |
429 | { |
430 | best = length; |
431 | length += 1; |
432 | } |
433 | } |
434 | } |
435 | |
436 | fn cleanup_char_boundary(solution: &mut Solution) { |
437 | fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool { |
438 | // FIXME: use unicode-segmentation crate? |
439 | let _ = doc; |
440 | let _ = pos; |
441 | true |
442 | } |
443 | |
444 | fn boundary_down(doc: &[char], pos: usize) -> usize { |
445 | let mut adjust = 0; |
446 | while !is_segmentation_boundary(doc, pos - adjust) { |
447 | adjust += 1; |
448 | } |
449 | adjust |
450 | } |
451 | |
452 | fn boundary_up(doc: &[char], pos: usize) -> usize { |
453 | let mut adjust = 0; |
454 | while !is_segmentation_boundary(doc, pos + adjust) { |
455 | adjust += 1; |
456 | } |
457 | adjust |
458 | } |
459 | |
460 | fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) { |
461 | let prev_end = prev.offset + prev.len; |
462 | if prev_end > range.offset { |
463 | let delta = cmp::min(prev_end - range.offset, range.len); |
464 | range.offset += delta; |
465 | range.len -= delta; |
466 | } |
467 | } |
468 | |
469 | let mut read = 0; |
470 | let mut retain = 0; |
471 | let mut last_delete = Range::empty(); |
472 | let mut last_insert = Range::empty(); |
473 | while let Some(&(mut diff)) = solution.diffs.get(read) { |
474 | read += 1; |
475 | match &mut diff { |
476 | Diff::Equal(range1, range2) => { |
477 | let adjust = boundary_up(range1.doc, range1.offset); |
478 | // If the whole range is sub-character, skip it. |
479 | if range1.len <= adjust { |
480 | continue; |
481 | } |
482 | range1.offset += adjust; |
483 | range1.len -= adjust; |
484 | range2.offset += adjust; |
485 | range2.len -= adjust; |
486 | let adjust = boundary_down(range1.doc, range1.offset + range1.len); |
487 | range1.len -= adjust; |
488 | range2.len -= adjust; |
489 | last_delete = Range::empty(); |
490 | last_insert = Range::empty(); |
491 | } |
492 | Diff::Delete(range) => { |
493 | skip_overlap(&last_delete, range); |
494 | if range.len == 0 { |
495 | continue; |
496 | } |
497 | let adjust = boundary_down(range.doc, range.offset); |
498 | range.offset -= adjust; |
499 | range.len += adjust; |
500 | let adjust = boundary_up(range.doc, range.offset + range.len); |
501 | range.len += adjust; |
502 | last_delete = *range; |
503 | } |
504 | Diff::Insert(range) => { |
505 | skip_overlap(&last_insert, range); |
506 | if range.len == 0 { |
507 | continue; |
508 | } |
509 | let adjust = boundary_down(range.doc, range.offset); |
510 | range.offset -= adjust; |
511 | range.len += adjust; |
512 | let adjust = boundary_up(range.doc, range.offset + range.len); |
513 | range.len += adjust; |
514 | last_insert = *range; |
515 | } |
516 | } |
517 | solution.diffs[retain] = diff; |
518 | retain += 1; |
519 | } |
520 | |
521 | solution.diffs.truncate(retain); |
522 | } |
523 | |
524 | // Reduce the number of edits by eliminating semantically trivial equalities. |
525 | fn cleanup_semantic(solution: &mut Solution) { |
526 | let mut diffs = &mut solution.diffs; |
527 | if diffs.is_empty() { |
528 | return; |
529 | } |
530 | |
531 | let mut changes = false; |
532 | let mut equalities = VecDeque::new(); // Double-ended queue of equalities. |
533 | let mut last_equality = None; // Always equal to equalities.peek().text |
534 | let mut pointer = 0; |
535 | // Number of characters that changed prior to the equality. |
536 | let mut len_insertions1 = 0; |
537 | let mut len_deletions1 = 0; |
538 | // Number of characters that changed after the equality. |
539 | let mut len_insertions2 = 0; |
540 | let mut len_deletions2 = 0; |
541 | while let Some(&this_diff) = diffs.get(pointer) { |
542 | match this_diff { |
543 | Diff::Equal(text1, text2) => { |
544 | equalities.push_back(pointer); |
545 | len_insertions1 = len_insertions2; |
546 | len_deletions1 = len_deletions2; |
547 | len_insertions2 = 0; |
548 | len_deletions2 = 0; |
549 | last_equality = Some((text1, text2)); |
550 | pointer += 1; |
551 | continue; |
552 | } |
553 | Diff::Delete(text) => len_deletions2 += text.len, |
554 | Diff::Insert(text) => len_insertions2 += text.len, |
555 | } |
556 | // Eliminate an equality that is smaller or equal to the edits on both |
557 | // sides of it. |
558 | if last_equality.map_or(false, |(last_equality, _)| { |
559 | last_equality.len <= cmp::max(len_insertions1, len_deletions1) |
560 | && last_equality.len <= cmp::max(len_insertions2, len_deletions2) |
561 | }) { |
562 | // Jump back to offending equality. |
563 | pointer = equalities.pop_back().unwrap(); |
564 | |
565 | // Replace equality with a delete. |
566 | diffs[pointer] = Diff::Delete(last_equality.unwrap().0); |
567 | // Insert a corresponding insert. |
568 | diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1)); |
569 | |
570 | len_insertions1 = 0; // Reset the counters. |
571 | len_insertions2 = 0; |
572 | len_deletions1 = 0; |
573 | len_deletions2 = 0; |
574 | last_equality = None; |
575 | changes = true; |
576 | |
577 | // Throw away the previous equality (it needs to be reevaluated). |
578 | equalities.pop_back(); |
579 | if let Some(back) = equalities.back() { |
580 | // There is a safe equality we can fall back to. |
581 | pointer = *back; |
582 | } else { |
583 | // There are no previous equalities, jump back to the start. |
584 | pointer = 0; |
585 | continue; |
586 | } |
587 | } |
588 | pointer += 1; |
589 | } |
590 | |
591 | // Normalize the diff. |
592 | if changes { |
593 | cleanup_merge(solution); |
594 | } |
595 | cleanup_semantic_lossless(solution); |
596 | diffs = &mut solution.diffs; |
597 | |
598 | // Find any overlaps between deletions and insertions. |
599 | // e.g: <del>abcxxx</del><ins>xxxdef</ins> |
600 | // -> <del>abc</del>xxx<ins>def</ins> |
601 | // e.g: <del>xxxabc</del><ins>defxxx</ins> |
602 | // -> <ins>def</ins>xxx<del>abc</del> |
603 | // Only extract an overlap if it is as big as the edit ahead or behind it. |
604 | let mut pointer = 1; |
605 | while let Some(&this_diff) = diffs.get(pointer) { |
606 | let prev_diff = diffs[pointer - 1]; |
607 | if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) { |
608 | let overlap_len1 = common_overlap(deletion, insertion); |
609 | let overlap_len2 = common_overlap(insertion, deletion); |
610 | let overlap_min = cmp::min(deletion.len, insertion.len); |
611 | if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min { |
612 | // Overlap found. Insert an equality and trim the surrounding edits. |
613 | diffs.insert( |
614 | pointer, |
615 | Diff::Equal( |
616 | deletion.substring(deletion.len - overlap_len1..deletion.len), |
617 | insertion.substring(..overlap_len1), |
618 | ), |
619 | ); |
620 | diffs[pointer - 1] = |
621 | Diff::Delete(deletion.substring(..deletion.len - overlap_len1)); |
622 | diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..)); |
623 | } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min { |
624 | // Reverse overlap found. |
625 | // Insert an equality and swap and trim the surrounding edits. |
626 | diffs.insert( |
627 | pointer, |
628 | Diff::Equal( |
629 | deletion.substring(..overlap_len2), |
630 | insertion.substring(insertion.len - overlap_len2..insertion.len), |
631 | ), |
632 | ); |
633 | diffs[pointer - 1] = |
634 | Diff::Insert(insertion.substring(..insertion.len - overlap_len2)); |
635 | diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..)); |
636 | } |
637 | pointer += 1; |
638 | } |
639 | pointer += 1; |
640 | } |
641 | } |
642 | |
643 | // Look for single edits surrounded on both sides by equalities which can be |
644 | // shifted sideways to align the edit to a word boundary. |
645 | // |
646 | // e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. |
647 | fn cleanup_semantic_lossless(solution: &mut Solution) { |
648 | let diffs = &mut solution.diffs; |
649 | let mut pointer = 1; |
650 | while let Some(&next_diff) = diffs.get(pointer + 1) { |
651 | let prev_diff = diffs[pointer - 1]; |
652 | if let ( |
653 | Diff::Equal(mut prev_equal1, mut prev_equal2), |
654 | Diff::Equal(mut next_equal1, mut next_equal2), |
655 | ) = (prev_diff, next_diff) |
656 | { |
657 | // This is a single edit surrounded by equalities. |
658 | let mut edit = diffs[pointer]; |
659 | |
660 | // First, shift the edit as far left as possible. |
661 | let common_offset = common_suffix(prev_equal1, edit.text()); |
662 | let original_prev_len = prev_equal1.len; |
663 | prev_equal1.len -= common_offset; |
664 | prev_equal2.len -= common_offset; |
665 | edit.shift_left(common_offset); |
666 | next_equal1.offset -= common_offset; |
667 | next_equal1.len += common_offset; |
668 | next_equal2.offset -= common_offset; |
669 | next_equal2.len += common_offset; |
670 | |
671 | // Second, step character by character right, looking for the best fit. |
672 | let mut best_prev_equal = (prev_equal1, prev_equal2); |
673 | let mut best_edit = edit; |
674 | let mut best_next_equal = (next_equal1, next_equal2); |
675 | let mut best_score = cleanup_semantic_score(prev_equal1, edit.text()) |
676 | + cleanup_semantic_score(edit.text(), next_equal1); |
677 | while !edit.text().is_empty() |
678 | && !next_equal1.is_empty() |
679 | && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap() |
680 | { |
681 | prev_equal1.len += 1; |
682 | prev_equal2.len += 1; |
683 | edit.shift_right(1); |
684 | next_equal1.offset += 1; |
685 | next_equal1.len -= 1; |
686 | next_equal2.offset += 1; |
687 | next_equal2.len -= 1; |
688 | let score = cleanup_semantic_score(prev_equal1, edit.text()) |
689 | + cleanup_semantic_score(edit.text(), next_equal1); |
690 | // The >= encourages trailing rather than leading whitespace on edits. |
691 | if score >= best_score { |
692 | best_score = score; |
693 | best_prev_equal = (prev_equal1, prev_equal2); |
694 | best_edit = edit; |
695 | best_next_equal = (next_equal1, next_equal2); |
696 | } |
697 | } |
698 | |
699 | if original_prev_len != best_prev_equal.0.len { |
700 | // We have an improvement, save it back to the diff. |
701 | if best_next_equal.0.is_empty() { |
702 | diffs.remove(pointer + 1); |
703 | } else { |
704 | diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1); |
705 | } |
706 | diffs[pointer] = best_edit; |
707 | if best_prev_equal.0.is_empty() { |
708 | diffs.remove(pointer - 1); |
709 | pointer -= 1; |
710 | } else { |
711 | diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1); |
712 | } |
713 | } |
714 | } |
715 | pointer += 1; |
716 | } |
717 | } |
718 | |
719 | // Given two strings, compute a score representing whether the internal boundary |
720 | // falls on logical boundaries. |
721 | // |
722 | // Scores range from 6 (best) to 0 (worst). |
723 | fn cleanup_semantic_score(one: Range, two: Range) -> usize { |
724 | if one.is_empty() || two.is_empty() { |
725 | // Edges are the best. |
726 | return 6; |
727 | } |
728 | |
729 | // Each port of this function behaves slightly differently due to subtle |
730 | // differences in each language's definition of things like 'whitespace'. |
731 | // Since this function's purpose is largely cosmetic, the choice has been |
732 | // made to use each language's native features rather than force total |
733 | // conformity. |
734 | let char1 = one.chars().next_back().unwrap(); |
735 | let char2 = two.chars().next().unwrap(); |
736 | let non_alphanumeric1 = !char1.is_ascii_alphanumeric(); |
737 | let non_alphanumeric2 = !char2.is_ascii_alphanumeric(); |
738 | let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace(); |
739 | let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace(); |
740 | let line_break1 = whitespace1 && char1.is_control(); |
741 | let line_break2 = whitespace2 && char2.is_control(); |
742 | let blank_line1 = |
743 | line_break1 && (one.ends_with([' \n' , ' \n' ]) || one.ends_with([' \n' , ' \r' , ' \n' ])); |
744 | let blank_line2 = |
745 | line_break2 && (two.starts_with([' \n' , ' \n' ]) || two.starts_with([' \r' , ' \n' , ' \r' , ' \n' ])); |
746 | |
747 | if blank_line1 || blank_line2 { |
748 | // Five points for blank lines. |
749 | 5 |
750 | } else if line_break1 || line_break2 { |
751 | // Four points for line breaks. |
752 | 4 |
753 | } else if non_alphanumeric1 && !whitespace1 && whitespace2 { |
754 | // Three points for end of sentences. |
755 | 3 |
756 | } else if whitespace1 || whitespace2 { |
757 | // Two points for whitespace. |
758 | 2 |
759 | } else if non_alphanumeric1 || non_alphanumeric2 { |
760 | // One point for non-alphanumeric. |
761 | 1 |
762 | } else { |
763 | 0 |
764 | } |
765 | } |
766 | |
767 | // Reorder and merge like edit sections. Merge equalities. Any edit section can |
768 | // move as long as it doesn't cross an equality. |
769 | fn cleanup_merge(solution: &mut Solution) { |
770 | let diffs = &mut solution.diffs; |
771 | while !diffs.is_empty() { |
772 | diffs.push(Diff::Equal( |
773 | solution.text1.substring(solution.text1.len..), |
774 | solution.text2.substring(solution.text2.len..), |
775 | )); // Add a dummy entry at the end. |
776 | let mut pointer = 0; |
777 | let mut count_delete = 0; |
778 | let mut count_insert = 0; |
779 | let mut text_delete = Range::empty(); |
780 | let mut text_insert = Range::empty(); |
781 | while let Some(&this_diff) = diffs.get(pointer) { |
782 | match this_diff { |
783 | Diff::Insert(text) => { |
784 | count_insert += 1; |
785 | if text_insert.is_empty() { |
786 | text_insert = text; |
787 | } else { |
788 | text_insert.len += text.len; |
789 | } |
790 | } |
791 | Diff::Delete(text) => { |
792 | count_delete += 1; |
793 | if text_delete.is_empty() { |
794 | text_delete = text; |
795 | } else { |
796 | text_delete.len += text.len; |
797 | } |
798 | } |
799 | Diff::Equal(text, _) => { |
800 | let count_both = count_delete + count_insert; |
801 | if count_both > 1 { |
802 | let both_types = count_delete != 0 && count_insert != 0; |
803 | // Delete the offending records. |
804 | diffs.drain(pointer - count_both..pointer); |
805 | pointer -= count_both; |
806 | if both_types { |
807 | // Factor out any common prefix. |
808 | let common_length = common_prefix(text_insert, text_delete); |
809 | if common_length != 0 { |
810 | if pointer > 0 { |
811 | match &mut diffs[pointer - 1] { |
812 | Diff::Equal(this_diff1, this_diff2) => { |
813 | this_diff1.len += common_length; |
814 | this_diff2.len += common_length; |
815 | } |
816 | _ => unreachable!( |
817 | "previous diff should have been an equality" |
818 | ), |
819 | } |
820 | } else { |
821 | diffs.insert( |
822 | pointer, |
823 | Diff::Equal( |
824 | text_delete.substring(..common_length), |
825 | text_insert.substring(..common_length), |
826 | ), |
827 | ); |
828 | pointer += 1; |
829 | } |
830 | text_insert = text_insert.substring(common_length..); |
831 | text_delete = text_delete.substring(common_length..); |
832 | } |
833 | // Factor out any common suffix. |
834 | let common_length = common_suffix(text_insert, text_delete); |
835 | if common_length != 0 { |
836 | diffs[pointer].grow_left(common_length); |
837 | text_insert.len -= common_length; |
838 | text_delete.len -= common_length; |
839 | } |
840 | } |
841 | // Insert the merged records. |
842 | if !text_delete.is_empty() { |
843 | diffs.insert(pointer, Diff::Delete(text_delete)); |
844 | pointer += 1; |
845 | } |
846 | if !text_insert.is_empty() { |
847 | diffs.insert(pointer, Diff::Insert(text_insert)); |
848 | pointer += 1; |
849 | } |
850 | } else if pointer > 0 { |
851 | if let Some(Diff::Equal(prev_equal1, prev_equal2)) = |
852 | diffs.get_mut(pointer - 1) |
853 | { |
854 | // Merge this equality with the previous one. |
855 | prev_equal1.len += text.len; |
856 | prev_equal2.len += text.len; |
857 | diffs.remove(pointer); |
858 | pointer -= 1; |
859 | } |
860 | } |
861 | count_insert = 0; |
862 | count_delete = 0; |
863 | text_delete = Range::empty(); |
864 | text_insert = Range::empty(); |
865 | } |
866 | } |
867 | pointer += 1; |
868 | } |
869 | if diffs.last().unwrap().text().is_empty() { |
870 | diffs.pop(); // Remove the dummy entry at the end. |
871 | } |
872 | |
873 | // Second pass: look for single edits surrounded on both sides by equalities |
874 | // which can be shifted sideways to eliminate an equality. |
875 | // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC |
876 | let mut changes = false; |
877 | let mut pointer = 1; |
878 | // Intentionally ignore the first and last element (don't need checking). |
879 | while let Some(&next_diff) = diffs.get(pointer + 1) { |
880 | let prev_diff = diffs[pointer - 1]; |
881 | let this_diff = diffs[pointer]; |
882 | if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) { |
883 | // This is a single edit surrounded by equalities. |
884 | if this_diff.text().ends_with(prev_diff) { |
885 | // Shift the edit over the previous equality. |
886 | diffs[pointer].shift_left(prev_diff.len); |
887 | diffs[pointer + 1].grow_left(prev_diff.len); |
888 | diffs.remove(pointer - 1); // Delete prev_diff. |
889 | changes = true; |
890 | } else if this_diff.text().starts_with(next_diff) { |
891 | // Shift the edit over the next equality. |
892 | diffs[pointer - 1].grow_right(next_diff.len); |
893 | diffs[pointer].shift_right(next_diff.len); |
894 | diffs.remove(pointer + 1); // Delete next_diff. |
895 | changes = true; |
896 | } |
897 | } |
898 | pointer += 1; |
899 | } |
900 | // If shifts were made, the diff needs reordering and another shift sweep. |
901 | if !changes { |
902 | return; |
903 | } |
904 | } |
905 | } |
906 | |
907 | impl Debug for Chunk<'_> { |
908 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
909 | let (name: &'static str, text: &str) = match *self { |
910 | Chunk::Equal(text: &str) => ("Equal" , text), |
911 | Chunk::Delete(text: &str) => ("Delete" , text), |
912 | Chunk::Insert(text: &str) => ("Insert" , text), |
913 | }; |
914 | write!(formatter, " {}( {:?})" , name, text) |
915 | } |
916 | } |
917 | |
918 | impl Debug for Diff<'_, '_> { |
919 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
920 | let (name: &'static str, range: Range<'_>) = match *self { |
921 | Diff::Equal(range: Range<'_>, _) => ("Equal" , range), |
922 | Diff::Delete(range: Range<'_>) => ("Delete" , range), |
923 | Diff::Insert(range: Range<'_>) => ("Insert" , range), |
924 | }; |
925 | formatter.write_str(data:name)?; |
926 | formatter.write_str(data:"( \"" )?; |
927 | for ch: char in range.chars() { |
928 | if ch == ' \'' { |
929 | // escape_debug turns this into "\'" which is unnecessary. |
930 | formatter.write_char(ch)?; |
931 | } else { |
932 | Display::fmt(&ch.escape_debug(), f:formatter)?; |
933 | } |
934 | } |
935 | formatter.write_str(data:" \")" )?; |
936 | Ok(()) |
937 | } |
938 | } |
939 | |