1//! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
2//!
3//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
6//!
7//! <br>
8//!
9//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
10//!
11//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
12//! The diff implementation is based on [Myers' diff algorithm] but includes
13//! some [semantic cleanups] to increase human readability by factoring out
14//! commonalities which are likely to be coincidental.
15//!
16//! Diff Match Patch was originally built in 2006 to power Google Docs.
17//!
18//! # Interface
19//!
20//! Here is the entire API of the Rust implementation. It operates on borrowed
21//! strings and the return value of the diff algorithm is a vector of chunks
22//! pointing into slices of those input strings.
23//!
24//! ```
25//! pub enum Chunk<'a> {
26//! Equal(&'a str),
27//! Delete(&'a str),
28//! Insert(&'a str),
29//! }
30//!
31//! # const IGNORE: &str = stringify! {
32//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
33//! # };
34//! ```
35//!
36//! [Diff Match Patch]: https://github.com/google/diff-match-patch
37//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
38//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
39
40#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.9")]
41#![allow(
42 clippy::blocks_in_conditions,
43 clippy::bool_to_int_with_if,
44 clippy::cast_possible_wrap,
45 clippy::cast_sign_loss,
46 clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
47 clippy::collapsible_else_if,
48 clippy::comparison_chain,
49 clippy::implied_bounds_in_impls,
50 clippy::items_after_test_module, // https://github.com/rust-lang/rust-clippy/issues/10713
51 clippy::let_underscore_untyped,
52 clippy::match_same_arms,
53 clippy::module_name_repetitions,
54 clippy::must_use_candidate,
55 clippy::new_without_default,
56 clippy::octal_escapes,
57 clippy::shadow_unrelated,
58 clippy::similar_names,
59 clippy::too_many_lines,
60 clippy::unseparated_literal_suffix,
61 unused_parens, // false positive on Some(&(mut diff)) pattern
62)]
63
64mod find;
65mod range;
66
67#[cfg(test)]
68mod tests;
69
70use crate::range::{slice, Range};
71use std::cmp;
72use std::collections::VecDeque;
73use std::fmt::{self, Debug, Display, Write};
74
75#[derive(Copy, Clone, PartialEq, Eq)]
76pub enum Chunk<'a> {
77 Equal(&'a str),
78 Delete(&'a str),
79 Insert(&'a str),
80}
81
82#[derive(Copy, Clone)]
83enum Diff<'a, 'b> {
84 Equal(Range<'a>, Range<'b>),
85 Delete(Range<'a>),
86 Insert(Range<'b>),
87}
88
89impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
90 fn text(&self) -> Range<'tmp> {
91 match *self {
92 Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
93 }
94 }
95
96 fn grow_left(&mut self, increment: usize) {
97 self.for_each(|range| {
98 range.offset -= increment;
99 range.len += increment;
100 });
101 }
102
103 fn grow_right(&mut self, increment: usize) {
104 self.for_each(|range| range.len += increment);
105 }
106
107 fn shift_left(&mut self, increment: usize) {
108 self.for_each(|range| range.offset -= increment);
109 }
110
111 fn shift_right(&mut self, increment: usize) {
112 self.for_each(|range| range.offset += increment);
113 }
114
115 fn for_each(&mut self, f: impl Fn(&mut Range)) {
116 match self {
117 Diff::Equal(range1, range2) => {
118 f(range1);
119 f(range2);
120 }
121 Diff::Delete(range) => f(range),
122 Diff::Insert(range) => f(range),
123 }
124 }
125}
126
127pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
128 let chars1: Vec<char> = text1.chars().collect();
129 let chars2: Vec<char> = text2.chars().collect();
130 let range1 = Range::new(&chars1, ..);
131 let range2 = Range::new(&chars2, ..);
132
133 let mut solution = main(range1, range2);
134 cleanup_char_boundary(&mut solution);
135 cleanup_semantic(&mut solution);
136 cleanup_merge(&mut solution);
137
138 let mut chunks = Vec::new();
139 let mut pos1 = 0;
140 let mut pos2 = 0;
141 for diff in solution.diffs {
142 chunks.push(match diff {
143 Diff::Equal(range, _) => {
144 let len = range.len_bytes();
145 let chunk = Chunk::Equal(&text1[pos1..pos1 + len]);
146 pos1 += len;
147 pos2 += len;
148 chunk
149 }
150 Diff::Delete(range) => {
151 let len = range.len_bytes();
152 let chunk = Chunk::Delete(&text1[pos1..pos1 + len]);
153 pos1 += len;
154 chunk
155 }
156 Diff::Insert(range) => {
157 let len = range.len_bytes();
158 let chunk = Chunk::Insert(&text2[pos2..pos2 + len]);
159 pos2 += len;
160 chunk
161 }
162 });
163 }
164 chunks
165}
166
167struct Solution<'a, 'b> {
168 text1: Range<'a>,
169 text2: Range<'b>,
170 diffs: Vec<Diff<'a, 'b>>,
171}
172
173fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
174 let whole1 = text1;
175 let whole2 = text2;
176
177 // Trim off common prefix.
178 let common_prefix_len = common_prefix(text1, text2);
179 let common_prefix = Diff::Equal(
180 text1.substring(..common_prefix_len),
181 text2.substring(..common_prefix_len),
182 );
183 text1 = text1.substring(common_prefix_len..);
184 text2 = text2.substring(common_prefix_len..);
185
186 // Trim off common suffix.
187 let common_suffix_len = common_suffix(text1, text2);
188 let common_suffix = Diff::Equal(
189 text1.substring(text1.len - common_suffix_len..),
190 text2.substring(text2.len - common_suffix_len..),
191 );
192 text1 = text1.substring(..text1.len - common_suffix_len);
193 text2 = text2.substring(..text2.len - common_suffix_len);
194
195 // Compute the diff on the middle block.
196 let mut solution = Solution {
197 text1: whole1,
198 text2: whole2,
199 diffs: compute(text1, text2),
200 };
201
202 // Restore the prefix and suffix.
203 if common_prefix_len > 0 {
204 solution.diffs.insert(0, common_prefix);
205 }
206 if common_suffix_len > 0 {
207 solution.diffs.push(common_suffix);
208 }
209
210 cleanup_merge(&mut solution);
211
212 solution
213}
214
215// Find the differences between two texts. Assumes that the texts do not have
216// any common prefix or suffix.
217fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
218 match (text1.is_empty(), text2.is_empty()) {
219 (true, true) => return Vec::new(),
220 (true, false) => return vec![Diff::Insert(text2)],
221 (false, true) => return vec![Diff::Delete(text1)],
222 (false, false) => {}
223 }
224
225 // Check for entire shorter text inside the longer text.
226 if text1.len > text2.len {
227 if let Some(i) = text1.find(text2) {
228 return vec![
229 Diff::Delete(text1.substring(..i)),
230 Diff::Equal(text1.substring(i..i + text2.len), text2),
231 Diff::Delete(text1.substring(i + text2.len..)),
232 ];
233 }
234 } else {
235 if let Some(i) = text2.find(text1) {
236 return vec![
237 Diff::Insert(text2.substring(..i)),
238 Diff::Equal(text1, text2.substring(i..i + text1.len)),
239 Diff::Insert(text2.substring(i + text1.len..)),
240 ];
241 }
242 }
243
244 if text1.len == 1 || text2.len == 1 {
245 // Single character string.
246 // After the previous check, the character can't be an equality.
247 return vec![Diff::Delete(text1), Diff::Insert(text2)];
248 }
249
250 bisect(text1, text2)
251}
252
253// Find the 'middle snake' of a diff, split the problem in two and return the
254// recursively constructed diff.
255//
256// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
257fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
258 let max_d = (text1.len + text2.len + 1) / 2;
259 let v_offset = max_d;
260 let v_len = 2 * max_d;
261 let mut v1 = vec![-1isize; v_len];
262 let mut v2 = vec![-1isize; v_len];
263 v1[v_offset + 1] = 0;
264 v2[v_offset + 1] = 0;
265 let delta = text1.len as isize - text2.len as isize;
266 // If the total number of characters is odd, then the front path will
267 // collide with the reverse path.
268 let front = delta % 2 != 0;
269 // Offsets for start and end of k loop.
270 // Prevents mapping of space beyond the grid.
271 let mut k1start = 0;
272 let mut k1end = 0;
273 let mut k2start = 0;
274 let mut k2end = 0;
275 for d in 0..max_d as isize {
276 // Walk the front path one step.
277 let mut k1 = -d + k1start;
278 while k1 <= d - k1end {
279 let k1_offset = (v_offset as isize + k1) as usize;
280 let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
281 v1[k1_offset + 1]
282 } else {
283 v1[k1_offset - 1] + 1
284 } as usize;
285 let mut y1 = (x1 as isize - k1) as usize;
286 if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
287 let advance = common_prefix(s1, s2);
288 x1 += advance;
289 y1 += advance;
290 }
291 v1[k1_offset] = x1 as isize;
292 if x1 > text1.len {
293 // Ran off the right of the graph.
294 k1end += 2;
295 } else if y1 > text2.len {
296 // Ran off the bottom of the graph.
297 k1start += 2;
298 } else if front {
299 let k2_offset = v_offset as isize + delta - k1;
300 if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
301 // Mirror x2 onto top-left coordinate system.
302 let x2 = text1.len as isize - v2[k2_offset as usize];
303 if x1 as isize >= x2 {
304 // Overlap detected.
305 return bisect_split(text1, text2, x1, y1);
306 }
307 }
308 }
309 k1 += 2;
310 }
311
312 // Walk the reverse path one step.
313 let mut k2 = -d + k2start;
314 while k2 <= d - k2end {
315 let k2_offset = (v_offset as isize + k2) as usize;
316 let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
317 v2[k2_offset + 1]
318 } else {
319 v2[k2_offset - 1] + 1
320 } as usize;
321 let mut y2 = (x2 as isize - k2) as usize;
322 if x2 < text1.len && y2 < text2.len {
323 let advance = common_suffix(
324 text1.substring(..text1.len - x2),
325 text2.substring(..text2.len - y2),
326 );
327 x2 += advance;
328 y2 += advance;
329 }
330 v2[k2_offset] = x2 as isize;
331 if x2 > text1.len {
332 // Ran off the left of the graph.
333 k2end += 2;
334 } else if y2 > text2.len {
335 // Ran off the top of the graph.
336 k2start += 2;
337 } else if !front {
338 let k1_offset = v_offset as isize + delta - k2;
339 if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
340 let x1 = v1[k1_offset as usize] as usize;
341 let y1 = v_offset + x1 - k1_offset as usize;
342 // Mirror x2 onto top-left coordinate system.
343 x2 = text1.len - x2;
344 if x1 >= x2 {
345 // Overlap detected.
346 return bisect_split(text1, text2, x1, y1);
347 }
348 }
349 }
350 k2 += 2;
351 }
352 }
353 // Number of diffs equals number of characters, no commonality at all.
354 vec![Diff::Delete(text1), Diff::Insert(text2)]
355}
356
357// Given the location of the 'middle snake', split the diff in two parts and
358// recurse.
359fn bisect_split<'a, 'b>(
360 text1: Range<'a>,
361 text2: Range<'b>,
362 x: usize,
363 y: usize,
364) -> Vec<Diff<'a, 'b>> {
365 let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x);
366 let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y);
367
368 // Compute both diffs serially.
369 let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs;
370 diffs.extend(iter:main(text1:text1b, text2:text2b).diffs);
371
372 diffs
373}
374
375// Determine the length of the common prefix of two strings.
376fn common_prefix(text1: Range, text2: Range) -> usize {
377 for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() {
378 if b1 != b2 {
379 return i;
380 }
381 }
382 cmp::min(v1:text1.len, v2:text2.len)
383}
384
385// Determine the length of the common suffix of two strings.
386fn common_suffix(text1: Range, text2: Range) -> usize {
387 for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() {
388 if b1 != b2 {
389 return i;
390 }
391 }
392 cmp::min(v1:text1.len, v2:text2.len)
393}
394
395// Determine if the suffix of one string is the prefix of another.
396//
397// Returns the number of characters common to the end of the first string and
398// the start of the second string.
399fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
400 // Eliminate the null case.
401 if text1.is_empty() || text2.is_empty() {
402 return 0;
403 }
404 // Truncate the longer string.
405 if text1.len > text2.len {
406 text1 = text1.substring(text1.len - text2.len..);
407 } else if text1.len < text2.len {
408 text2 = text2.substring(..text1.len);
409 }
410 // Quick check for the worst case.
411 if slice(text1) == slice(text2) {
412 return text1.len;
413 }
414
415 // Start by looking for a single character match
416 // and increase length until no match is found.
417 // Performance analysis: https://neil.fraser.name/news/2010/11/04/
418 let mut best = 0;
419 let mut length = 1;
420 loop {
421 let pattern = text1.substring(text1.len - length..);
422 let found = match text2.find(pattern) {
423 Some(found) => found,
424 None => return best,
425 };
426 length += found;
427 if found == 0
428 || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length))
429 {
430 best = length;
431 length += 1;
432 }
433 }
434}
435
436fn cleanup_char_boundary(solution: &mut Solution) {
437 fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool {
438 // FIXME: use unicode-segmentation crate?
439 let _ = doc;
440 let _ = pos;
441 true
442 }
443
444 fn boundary_down(doc: &[char], pos: usize) -> usize {
445 let mut adjust = 0;
446 while !is_segmentation_boundary(doc, pos - adjust) {
447 adjust += 1;
448 }
449 adjust
450 }
451
452 fn boundary_up(doc: &[char], pos: usize) -> usize {
453 let mut adjust = 0;
454 while !is_segmentation_boundary(doc, pos + adjust) {
455 adjust += 1;
456 }
457 adjust
458 }
459
460 fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
461 let prev_end = prev.offset + prev.len;
462 if prev_end > range.offset {
463 let delta = cmp::min(prev_end - range.offset, range.len);
464 range.offset += delta;
465 range.len -= delta;
466 }
467 }
468
469 let mut read = 0;
470 let mut retain = 0;
471 let mut last_delete = Range::empty();
472 let mut last_insert = Range::empty();
473 while let Some(&(mut diff)) = solution.diffs.get(read) {
474 read += 1;
475 match &mut diff {
476 Diff::Equal(range1, range2) => {
477 let adjust = boundary_up(range1.doc, range1.offset);
478 // If the whole range is sub-character, skip it.
479 if range1.len <= adjust {
480 continue;
481 }
482 range1.offset += adjust;
483 range1.len -= adjust;
484 range2.offset += adjust;
485 range2.len -= adjust;
486 let adjust = boundary_down(range1.doc, range1.offset + range1.len);
487 range1.len -= adjust;
488 range2.len -= adjust;
489 last_delete = Range::empty();
490 last_insert = Range::empty();
491 }
492 Diff::Delete(range) => {
493 skip_overlap(&last_delete, range);
494 if range.len == 0 {
495 continue;
496 }
497 let adjust = boundary_down(range.doc, range.offset);
498 range.offset -= adjust;
499 range.len += adjust;
500 let adjust = boundary_up(range.doc, range.offset + range.len);
501 range.len += adjust;
502 last_delete = *range;
503 }
504 Diff::Insert(range) => {
505 skip_overlap(&last_insert, range);
506 if range.len == 0 {
507 continue;
508 }
509 let adjust = boundary_down(range.doc, range.offset);
510 range.offset -= adjust;
511 range.len += adjust;
512 let adjust = boundary_up(range.doc, range.offset + range.len);
513 range.len += adjust;
514 last_insert = *range;
515 }
516 }
517 solution.diffs[retain] = diff;
518 retain += 1;
519 }
520
521 solution.diffs.truncate(retain);
522}
523
524// Reduce the number of edits by eliminating semantically trivial equalities.
525fn cleanup_semantic(solution: &mut Solution) {
526 let mut diffs = &mut solution.diffs;
527 if diffs.is_empty() {
528 return;
529 }
530
531 let mut changes = false;
532 let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
533 let mut last_equality = None; // Always equal to equalities.peek().text
534 let mut pointer = 0;
535 // Number of characters that changed prior to the equality.
536 let mut len_insertions1 = 0;
537 let mut len_deletions1 = 0;
538 // Number of characters that changed after the equality.
539 let mut len_insertions2 = 0;
540 let mut len_deletions2 = 0;
541 while let Some(&this_diff) = diffs.get(pointer) {
542 match this_diff {
543 Diff::Equal(text1, text2) => {
544 equalities.push_back(pointer);
545 len_insertions1 = len_insertions2;
546 len_deletions1 = len_deletions2;
547 len_insertions2 = 0;
548 len_deletions2 = 0;
549 last_equality = Some((text1, text2));
550 pointer += 1;
551 continue;
552 }
553 Diff::Delete(text) => len_deletions2 += text.len,
554 Diff::Insert(text) => len_insertions2 += text.len,
555 }
556 // Eliminate an equality that is smaller or equal to the edits on both
557 // sides of it.
558 if last_equality.map_or(false, |(last_equality, _)| {
559 last_equality.len <= cmp::max(len_insertions1, len_deletions1)
560 && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
561 }) {
562 // Jump back to offending equality.
563 pointer = equalities.pop_back().unwrap();
564
565 // Replace equality with a delete.
566 diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
567 // Insert a corresponding insert.
568 diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
569
570 len_insertions1 = 0; // Reset the counters.
571 len_insertions2 = 0;
572 len_deletions1 = 0;
573 len_deletions2 = 0;
574 last_equality = None;
575 changes = true;
576
577 // Throw away the previous equality (it needs to be reevaluated).
578 equalities.pop_back();
579 if let Some(back) = equalities.back() {
580 // There is a safe equality we can fall back to.
581 pointer = *back;
582 } else {
583 // There are no previous equalities, jump back to the start.
584 pointer = 0;
585 continue;
586 }
587 }
588 pointer += 1;
589 }
590
591 // Normalize the diff.
592 if changes {
593 cleanup_merge(solution);
594 }
595 cleanup_semantic_lossless(solution);
596 diffs = &mut solution.diffs;
597
598 // Find any overlaps between deletions and insertions.
599 // e.g: <del>abcxxx</del><ins>xxxdef</ins>
600 // -> <del>abc</del>xxx<ins>def</ins>
601 // e.g: <del>xxxabc</del><ins>defxxx</ins>
602 // -> <ins>def</ins>xxx<del>abc</del>
603 // Only extract an overlap if it is as big as the edit ahead or behind it.
604 let mut pointer = 1;
605 while let Some(&this_diff) = diffs.get(pointer) {
606 let prev_diff = diffs[pointer - 1];
607 if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
608 let overlap_len1 = common_overlap(deletion, insertion);
609 let overlap_len2 = common_overlap(insertion, deletion);
610 let overlap_min = cmp::min(deletion.len, insertion.len);
611 if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
612 // Overlap found. Insert an equality and trim the surrounding edits.
613 diffs.insert(
614 pointer,
615 Diff::Equal(
616 deletion.substring(deletion.len - overlap_len1..deletion.len),
617 insertion.substring(..overlap_len1),
618 ),
619 );
620 diffs[pointer - 1] =
621 Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
622 diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
623 } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
624 // Reverse overlap found.
625 // Insert an equality and swap and trim the surrounding edits.
626 diffs.insert(
627 pointer,
628 Diff::Equal(
629 deletion.substring(..overlap_len2),
630 insertion.substring(insertion.len - overlap_len2..insertion.len),
631 ),
632 );
633 diffs[pointer - 1] =
634 Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
635 diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
636 }
637 pointer += 1;
638 }
639 pointer += 1;
640 }
641}
642
643// Look for single edits surrounded on both sides by equalities which can be
644// shifted sideways to align the edit to a word boundary.
645//
646// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
647fn cleanup_semantic_lossless(solution: &mut Solution) {
648 let diffs = &mut solution.diffs;
649 let mut pointer = 1;
650 while let Some(&next_diff) = diffs.get(pointer + 1) {
651 let prev_diff = diffs[pointer - 1];
652 if let (
653 Diff::Equal(mut prev_equal1, mut prev_equal2),
654 Diff::Equal(mut next_equal1, mut next_equal2),
655 ) = (prev_diff, next_diff)
656 {
657 // This is a single edit surrounded by equalities.
658 let mut edit = diffs[pointer];
659
660 // First, shift the edit as far left as possible.
661 let common_offset = common_suffix(prev_equal1, edit.text());
662 let original_prev_len = prev_equal1.len;
663 prev_equal1.len -= common_offset;
664 prev_equal2.len -= common_offset;
665 edit.shift_left(common_offset);
666 next_equal1.offset -= common_offset;
667 next_equal1.len += common_offset;
668 next_equal2.offset -= common_offset;
669 next_equal2.len += common_offset;
670
671 // Second, step character by character right, looking for the best fit.
672 let mut best_prev_equal = (prev_equal1, prev_equal2);
673 let mut best_edit = edit;
674 let mut best_next_equal = (next_equal1, next_equal2);
675 let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
676 + cleanup_semantic_score(edit.text(), next_equal1);
677 while !edit.text().is_empty()
678 && !next_equal1.is_empty()
679 && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
680 {
681 prev_equal1.len += 1;
682 prev_equal2.len += 1;
683 edit.shift_right(1);
684 next_equal1.offset += 1;
685 next_equal1.len -= 1;
686 next_equal2.offset += 1;
687 next_equal2.len -= 1;
688 let score = cleanup_semantic_score(prev_equal1, edit.text())
689 + cleanup_semantic_score(edit.text(), next_equal1);
690 // The >= encourages trailing rather than leading whitespace on edits.
691 if score >= best_score {
692 best_score = score;
693 best_prev_equal = (prev_equal1, prev_equal2);
694 best_edit = edit;
695 best_next_equal = (next_equal1, next_equal2);
696 }
697 }
698
699 if original_prev_len != best_prev_equal.0.len {
700 // We have an improvement, save it back to the diff.
701 if best_next_equal.0.is_empty() {
702 diffs.remove(pointer + 1);
703 } else {
704 diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
705 }
706 diffs[pointer] = best_edit;
707 if best_prev_equal.0.is_empty() {
708 diffs.remove(pointer - 1);
709 pointer -= 1;
710 } else {
711 diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
712 }
713 }
714 }
715 pointer += 1;
716 }
717}
718
719// Given two strings, compute a score representing whether the internal boundary
720// falls on logical boundaries.
721//
722// Scores range from 6 (best) to 0 (worst).
723fn cleanup_semantic_score(one: Range, two: Range) -> usize {
724 if one.is_empty() || two.is_empty() {
725 // Edges are the best.
726 return 6;
727 }
728
729 // Each port of this function behaves slightly differently due to subtle
730 // differences in each language's definition of things like 'whitespace'.
731 // Since this function's purpose is largely cosmetic, the choice has been
732 // made to use each language's native features rather than force total
733 // conformity.
734 let char1 = one.chars().next_back().unwrap();
735 let char2 = two.chars().next().unwrap();
736 let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
737 let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
738 let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
739 let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
740 let line_break1 = whitespace1 && char1.is_control();
741 let line_break2 = whitespace2 && char2.is_control();
742 let blank_line1 =
743 line_break1 && (one.ends_with(['\n', '\n']) || one.ends_with(['\n', '\r', '\n']));
744 let blank_line2 =
745 line_break2 && (two.starts_with(['\n', '\n']) || two.starts_with(['\r', '\n', '\r', '\n']));
746
747 if blank_line1 || blank_line2 {
748 // Five points for blank lines.
749 5
750 } else if line_break1 || line_break2 {
751 // Four points for line breaks.
752 4
753 } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
754 // Three points for end of sentences.
755 3
756 } else if whitespace1 || whitespace2 {
757 // Two points for whitespace.
758 2
759 } else if non_alphanumeric1 || non_alphanumeric2 {
760 // One point for non-alphanumeric.
761 1
762 } else {
763 0
764 }
765}
766
767// Reorder and merge like edit sections. Merge equalities. Any edit section can
768// move as long as it doesn't cross an equality.
769fn cleanup_merge(solution: &mut Solution) {
770 let diffs = &mut solution.diffs;
771 while !diffs.is_empty() {
772 diffs.push(Diff::Equal(
773 solution.text1.substring(solution.text1.len..),
774 solution.text2.substring(solution.text2.len..),
775 )); // Add a dummy entry at the end.
776 let mut pointer = 0;
777 let mut count_delete = 0;
778 let mut count_insert = 0;
779 let mut text_delete = Range::empty();
780 let mut text_insert = Range::empty();
781 while let Some(&this_diff) = diffs.get(pointer) {
782 match this_diff {
783 Diff::Insert(text) => {
784 count_insert += 1;
785 if text_insert.is_empty() {
786 text_insert = text;
787 } else {
788 text_insert.len += text.len;
789 }
790 }
791 Diff::Delete(text) => {
792 count_delete += 1;
793 if text_delete.is_empty() {
794 text_delete = text;
795 } else {
796 text_delete.len += text.len;
797 }
798 }
799 Diff::Equal(text, _) => {
800 let count_both = count_delete + count_insert;
801 if count_both > 1 {
802 let both_types = count_delete != 0 && count_insert != 0;
803 // Delete the offending records.
804 diffs.drain(pointer - count_both..pointer);
805 pointer -= count_both;
806 if both_types {
807 // Factor out any common prefix.
808 let common_length = common_prefix(text_insert, text_delete);
809 if common_length != 0 {
810 if pointer > 0 {
811 match &mut diffs[pointer - 1] {
812 Diff::Equal(this_diff1, this_diff2) => {
813 this_diff1.len += common_length;
814 this_diff2.len += common_length;
815 }
816 _ => unreachable!(
817 "previous diff should have been an equality"
818 ),
819 }
820 } else {
821 diffs.insert(
822 pointer,
823 Diff::Equal(
824 text_delete.substring(..common_length),
825 text_insert.substring(..common_length),
826 ),
827 );
828 pointer += 1;
829 }
830 text_insert = text_insert.substring(common_length..);
831 text_delete = text_delete.substring(common_length..);
832 }
833 // Factor out any common suffix.
834 let common_length = common_suffix(text_insert, text_delete);
835 if common_length != 0 {
836 diffs[pointer].grow_left(common_length);
837 text_insert.len -= common_length;
838 text_delete.len -= common_length;
839 }
840 }
841 // Insert the merged records.
842 if !text_delete.is_empty() {
843 diffs.insert(pointer, Diff::Delete(text_delete));
844 pointer += 1;
845 }
846 if !text_insert.is_empty() {
847 diffs.insert(pointer, Diff::Insert(text_insert));
848 pointer += 1;
849 }
850 } else if pointer > 0 {
851 if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
852 diffs.get_mut(pointer - 1)
853 {
854 // Merge this equality with the previous one.
855 prev_equal1.len += text.len;
856 prev_equal2.len += text.len;
857 diffs.remove(pointer);
858 pointer -= 1;
859 }
860 }
861 count_insert = 0;
862 count_delete = 0;
863 text_delete = Range::empty();
864 text_insert = Range::empty();
865 }
866 }
867 pointer += 1;
868 }
869 if diffs.last().unwrap().text().is_empty() {
870 diffs.pop(); // Remove the dummy entry at the end.
871 }
872
873 // Second pass: look for single edits surrounded on both sides by equalities
874 // which can be shifted sideways to eliminate an equality.
875 // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
876 let mut changes = false;
877 let mut pointer = 1;
878 // Intentionally ignore the first and last element (don't need checking).
879 while let Some(&next_diff) = diffs.get(pointer + 1) {
880 let prev_diff = diffs[pointer - 1];
881 let this_diff = diffs[pointer];
882 if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
883 // This is a single edit surrounded by equalities.
884 if this_diff.text().ends_with(prev_diff) {
885 // Shift the edit over the previous equality.
886 diffs[pointer].shift_left(prev_diff.len);
887 diffs[pointer + 1].grow_left(prev_diff.len);
888 diffs.remove(pointer - 1); // Delete prev_diff.
889 changes = true;
890 } else if this_diff.text().starts_with(next_diff) {
891 // Shift the edit over the next equality.
892 diffs[pointer - 1].grow_right(next_diff.len);
893 diffs[pointer].shift_right(next_diff.len);
894 diffs.remove(pointer + 1); // Delete next_diff.
895 changes = true;
896 }
897 }
898 pointer += 1;
899 }
900 // If shifts were made, the diff needs reordering and another shift sweep.
901 if !changes {
902 return;
903 }
904 }
905}
906
907impl Debug for Chunk<'_> {
908 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
909 let (name: &'static str, text: &str) = match *self {
910 Chunk::Equal(text: &str) => ("Equal", text),
911 Chunk::Delete(text: &str) => ("Delete", text),
912 Chunk::Insert(text: &str) => ("Insert", text),
913 };
914 write!(formatter, "{}({:?})", name, text)
915 }
916}
917
918impl Debug for Diff<'_, '_> {
919 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
920 let (name: &'static str, range: Range<'_>) = match *self {
921 Diff::Equal(range: Range<'_>, _) => ("Equal", range),
922 Diff::Delete(range: Range<'_>) => ("Delete", range),
923 Diff::Insert(range: Range<'_>) => ("Insert", range),
924 };
925 formatter.write_str(data:name)?;
926 formatter.write_str(data:"(\"")?;
927 for ch: char in range.chars() {
928 if ch == '\'' {
929 // escape_debug turns this into "\'" which is unnecessary.
930 formatter.write_char(ch)?;
931 } else {
932 Display::fmt(&ch.escape_debug(), f:formatter)?;
933 }
934 }
935 formatter.write_str(data:"\")")?;
936 Ok(())
937 }
938}
939