1//! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
2//!
3//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
6//!
7//! <br>
8//!
9//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
10//!
11//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
12//! The diff implementation is based on [Myers' diff algorithm] but includes
13//! some [semantic cleanups] to increase human readability by factoring out
14//! commonalities which are likely to be coincidental.
15//!
16//! Diff Match Patch was originally built in 2006 to power Google Docs.
17//!
18//! # Interface
19//!
20//! Here is the entire API of the Rust implementation. It operates on borrowed
21//! strings and the return value of the diff algorithm is a vector of chunks
22//! pointing into slices of those input strings.
23//!
24//! ```
25//! pub enum Chunk<'a> {
26//! Equal(&'a str),
27//! Delete(&'a str),
28//! Insert(&'a str),
29//! }
30//!
31//! # const IGNORE: &str = stringify! {
32//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
33//! # };
34//! ```
35//!
36//! [Diff Match Patch]: https://github.com/google/diff-match-patch
37//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
38//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
39
40#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.7")]
41#![allow(
42 clippy::blocks_in_if_conditions,
43 clippy::bool_to_int_with_if,
44 clippy::cast_possible_wrap,
45 clippy::cast_sign_loss,
46 clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
47 clippy::collapsible_else_if,
48 clippy::comparison_chain,
49 clippy::items_after_test_module, // https://github.com/rust-lang/rust-clippy/issues/10713
50 clippy::let_underscore_untyped,
51 clippy::match_same_arms,
52 clippy::module_name_repetitions,
53 clippy::must_use_candidate,
54 clippy::new_without_default,
55 clippy::octal_escapes,
56 clippy::shadow_unrelated,
57 clippy::similar_names,
58 clippy::too_many_lines,
59 clippy::unseparated_literal_suffix,
60 unused_parens, // false positive on Some(&(mut diff)) pattern
61)]
62
63mod find;
64mod range;
65
66#[cfg(test)]
67mod tests;
68
69use crate::range::{slice, Range};
70use std::cmp;
71use std::collections::VecDeque;
72use std::fmt::{self, Debug, Display, Write};
73
74#[derive(Copy, Clone, PartialEq, Eq)]
75pub enum Chunk<'a> {
76 Equal(&'a str),
77 Delete(&'a str),
78 Insert(&'a str),
79}
80
81#[derive(Copy, Clone)]
82enum Diff<'a, 'b> {
83 Equal(Range<'a>, Range<'b>),
84 Delete(Range<'a>),
85 Insert(Range<'b>),
86}
87
88impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
89 fn text(&self) -> Range<'tmp> {
90 match *self {
91 Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
92 }
93 }
94
95 fn grow_left(&mut self, increment: usize) {
96 self.for_each(|range| {
97 range.offset -= increment;
98 range.len += increment;
99 });
100 }
101
102 fn grow_right(&mut self, increment: usize) {
103 self.for_each(|range| range.len += increment);
104 }
105
106 fn shift_left(&mut self, increment: usize) {
107 self.for_each(|range| range.offset -= increment);
108 }
109
110 fn shift_right(&mut self, increment: usize) {
111 self.for_each(|range| range.offset += increment);
112 }
113
114 fn for_each(&mut self, f: impl Fn(&mut Range)) {
115 match self {
116 Diff::Equal(range1, range2) => {
117 f(range1);
118 f(range2);
119 }
120 Diff::Delete(range) => f(range),
121 Diff::Insert(range) => f(range),
122 }
123 }
124}
125
126pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
127 let chars1: Vec<char> = text1.chars().collect();
128 let chars2: Vec<char> = text2.chars().collect();
129 let range1 = Range::new(&chars1, ..);
130 let range2 = Range::new(&chars2, ..);
131
132 let mut solution = main(range1, range2);
133 cleanup_char_boundary(&mut solution);
134 cleanup_semantic(&mut solution);
135 cleanup_merge(&mut solution);
136
137 let mut chunks = Vec::new();
138 let mut pos1 = 0;
139 let mut pos2 = 0;
140 for diff in solution.diffs {
141 chunks.push(match diff {
142 Diff::Equal(range, _) => {
143 let len = range.len_bytes();
144 let chunk = Chunk::Equal(&text1[pos1..pos1 + len]);
145 pos1 += len;
146 pos2 += len;
147 chunk
148 }
149 Diff::Delete(range) => {
150 let len = range.len_bytes();
151 let chunk = Chunk::Delete(&text1[pos1..pos1 + len]);
152 pos1 += len;
153 chunk
154 }
155 Diff::Insert(range) => {
156 let len = range.len_bytes();
157 let chunk = Chunk::Insert(&text2[pos2..pos2 + len]);
158 pos2 += len;
159 chunk
160 }
161 });
162 }
163 chunks
164}
165
166struct Solution<'a, 'b> {
167 text1: Range<'a>,
168 text2: Range<'b>,
169 diffs: Vec<Diff<'a, 'b>>,
170}
171
172fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
173 let whole1 = text1;
174 let whole2 = text2;
175
176 // Trim off common prefix.
177 let common_prefix_len = common_prefix(text1, text2);
178 let common_prefix = Diff::Equal(
179 text1.substring(..common_prefix_len),
180 text2.substring(..common_prefix_len),
181 );
182 text1 = text1.substring(common_prefix_len..);
183 text2 = text2.substring(common_prefix_len..);
184
185 // Trim off common suffix.
186 let common_suffix_len = common_suffix(text1, text2);
187 let common_suffix = Diff::Equal(
188 text1.substring(text1.len - common_suffix_len..),
189 text2.substring(text2.len - common_suffix_len..),
190 );
191 text1 = text1.substring(..text1.len - common_suffix_len);
192 text2 = text2.substring(..text2.len - common_suffix_len);
193
194 // Compute the diff on the middle block.
195 let mut solution = Solution {
196 text1: whole1,
197 text2: whole2,
198 diffs: compute(text1, text2),
199 };
200
201 // Restore the prefix and suffix.
202 if common_prefix_len > 0 {
203 solution.diffs.insert(0, common_prefix);
204 }
205 if common_suffix_len > 0 {
206 solution.diffs.push(common_suffix);
207 }
208
209 cleanup_merge(&mut solution);
210
211 solution
212}
213
214// Find the differences between two texts. Assumes that the texts do not have
215// any common prefix or suffix.
216fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
217 match (text1.is_empty(), text2.is_empty()) {
218 (true, true) => return Vec::new(),
219 (true, false) => return vec![Diff::Insert(text2)],
220 (false, true) => return vec![Diff::Delete(text1)],
221 (false, false) => {}
222 }
223
224 // Check for entire shorter text inside the longer text.
225 if text1.len > text2.len {
226 if let Some(i) = text1.find(text2) {
227 return vec![
228 Diff::Delete(text1.substring(..i)),
229 Diff::Equal(text1.substring(i..i + text2.len), text2),
230 Diff::Delete(text1.substring(i + text2.len..)),
231 ];
232 }
233 } else {
234 if let Some(i) = text2.find(text1) {
235 return vec![
236 Diff::Insert(text2.substring(..i)),
237 Diff::Equal(text1, text2.substring(i..i + text1.len)),
238 Diff::Insert(text2.substring(i + text1.len..)),
239 ];
240 }
241 }
242
243 if text1.len == 1 || text2.len == 1 {
244 // Single character string.
245 // After the previous check, the character can't be an equality.
246 return vec![Diff::Delete(text1), Diff::Insert(text2)];
247 }
248
249 bisect(text1, text2)
250}
251
252// Find the 'middle snake' of a diff, split the problem in two and return the
253// recursively constructed diff.
254//
255// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
256fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
257 let max_d = (text1.len + text2.len + 1) / 2;
258 let v_offset = max_d;
259 let v_len = 2 * max_d;
260 let mut v1 = vec![-1isize; v_len];
261 let mut v2 = vec![-1isize; v_len];
262 v1[v_offset + 1] = 0;
263 v2[v_offset + 1] = 0;
264 let delta = text1.len as isize - text2.len as isize;
265 // If the total number of characters is odd, then the front path will
266 // collide with the reverse path.
267 let front = delta % 2 != 0;
268 // Offsets for start and end of k loop.
269 // Prevents mapping of space beyond the grid.
270 let mut k1start = 0;
271 let mut k1end = 0;
272 let mut k2start = 0;
273 let mut k2end = 0;
274 for d in 0..max_d as isize {
275 // Walk the front path one step.
276 let mut k1 = -d + k1start;
277 while k1 <= d - k1end {
278 let k1_offset = (v_offset as isize + k1) as usize;
279 let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
280 v1[k1_offset + 1]
281 } else {
282 v1[k1_offset - 1] + 1
283 } as usize;
284 let mut y1 = (x1 as isize - k1) as usize;
285 if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
286 let advance = common_prefix(s1, s2);
287 x1 += advance;
288 y1 += advance;
289 }
290 v1[k1_offset] = x1 as isize;
291 if x1 > text1.len {
292 // Ran off the right of the graph.
293 k1end += 2;
294 } else if y1 > text2.len {
295 // Ran off the bottom of the graph.
296 k1start += 2;
297 } else if front {
298 let k2_offset = v_offset as isize + delta - k1;
299 if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
300 // Mirror x2 onto top-left coordinate system.
301 let x2 = text1.len as isize - v2[k2_offset as usize];
302 if x1 as isize >= x2 {
303 // Overlap detected.
304 return bisect_split(text1, text2, x1, y1);
305 }
306 }
307 }
308 k1 += 2;
309 }
310
311 // Walk the reverse path one step.
312 let mut k2 = -d + k2start;
313 while k2 <= d - k2end {
314 let k2_offset = (v_offset as isize + k2) as usize;
315 let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
316 v2[k2_offset + 1]
317 } else {
318 v2[k2_offset - 1] + 1
319 } as usize;
320 let mut y2 = (x2 as isize - k2) as usize;
321 if x2 < text1.len && y2 < text2.len {
322 let advance = common_suffix(
323 text1.substring(..text1.len - x2),
324 text2.substring(..text2.len - y2),
325 );
326 x2 += advance;
327 y2 += advance;
328 }
329 v2[k2_offset] = x2 as isize;
330 if x2 > text1.len {
331 // Ran off the left of the graph.
332 k2end += 2;
333 } else if y2 > text2.len {
334 // Ran off the top of the graph.
335 k2start += 2;
336 } else if !front {
337 let k1_offset = v_offset as isize + delta - k2;
338 if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
339 let x1 = v1[k1_offset as usize] as usize;
340 let y1 = v_offset + x1 - k1_offset as usize;
341 // Mirror x2 onto top-left coordinate system.
342 x2 = text1.len - x2;
343 if x1 >= x2 {
344 // Overlap detected.
345 return bisect_split(text1, text2, x1, y1);
346 }
347 }
348 }
349 k2 += 2;
350 }
351 }
352 // Number of diffs equals number of characters, no commonality at all.
353 vec![Diff::Delete(text1), Diff::Insert(text2)]
354}
355
356// Given the location of the 'middle snake', split the diff in two parts and
357// recurse.
358fn bisect_split<'a, 'b>(
359 text1: Range<'a>,
360 text2: Range<'b>,
361 x: usize,
362 y: usize,
363) -> Vec<Diff<'a, 'b>> {
364 let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x);
365 let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y);
366
367 // Compute both diffs serially.
368 let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs;
369 diffs.extend(iter:main(text1:text1b, text2:text2b).diffs);
370
371 diffs
372}
373
374// Determine the length of the common prefix of two strings.
375fn common_prefix(text1: Range, text2: Range) -> usize {
376 for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() {
377 if b1 != b2 {
378 return i;
379 }
380 }
381 cmp::min(v1:text1.len, v2:text2.len)
382}
383
384// Determine the length of the common suffix of two strings.
385fn common_suffix(text1: Range, text2: Range) -> usize {
386 for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() {
387 if b1 != b2 {
388 return i;
389 }
390 }
391 cmp::min(v1:text1.len, v2:text2.len)
392}
393
394// Determine if the suffix of one string is the prefix of another.
395//
396// Returns the number of characters common to the end of the first string and
397// the start of the second string.
398fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
399 // Eliminate the null case.
400 if text1.is_empty() || text2.is_empty() {
401 return 0;
402 }
403 // Truncate the longer string.
404 if text1.len > text2.len {
405 text1 = text1.substring(text1.len - text2.len..);
406 } else if text1.len < text2.len {
407 text2 = text2.substring(..text1.len);
408 }
409 // Quick check for the worst case.
410 if slice(text1) == slice(text2) {
411 return text1.len;
412 }
413
414 // Start by looking for a single character match
415 // and increase length until no match is found.
416 // Performance analysis: https://neil.fraser.name/news/2010/11/04/
417 let mut best = 0;
418 let mut length = 1;
419 loop {
420 let pattern = text1.substring(text1.len - length..);
421 let found = match text2.find(pattern) {
422 Some(found) => found,
423 None => return best,
424 };
425 length += found;
426 if found == 0
427 || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length))
428 {
429 best = length;
430 length += 1;
431 }
432 }
433}
434
435fn cleanup_char_boundary(solution: &mut Solution) {
436 fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool {
437 // FIXME: use unicode-segmentation crate?
438 let _ = doc;
439 let _ = pos;
440 true
441 }
442
443 fn boundary_down(doc: &[char], pos: usize) -> usize {
444 let mut adjust = 0;
445 while !is_segmentation_boundary(doc, pos - adjust) {
446 adjust += 1;
447 }
448 adjust
449 }
450
451 fn boundary_up(doc: &[char], pos: usize) -> usize {
452 let mut adjust = 0;
453 while !is_segmentation_boundary(doc, pos + adjust) {
454 adjust += 1;
455 }
456 adjust
457 }
458
459 fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
460 let prev_end = prev.offset + prev.len;
461 if prev_end > range.offset {
462 let delta = cmp::min(prev_end - range.offset, range.len);
463 range.offset += delta;
464 range.len -= delta;
465 }
466 }
467
468 let mut read = 0;
469 let mut retain = 0;
470 let mut last_delete = Range::empty();
471 let mut last_insert = Range::empty();
472 while let Some(&(mut diff)) = solution.diffs.get(read) {
473 read += 1;
474 match &mut diff {
475 Diff::Equal(range1, range2) => {
476 let adjust = boundary_up(range1.doc, range1.offset);
477 // If the whole range is sub-character, skip it.
478 if range1.len <= adjust {
479 continue;
480 }
481 range1.offset += adjust;
482 range1.len -= adjust;
483 range2.offset += adjust;
484 range2.len -= adjust;
485 let adjust = boundary_down(range1.doc, range1.offset + range1.len);
486 range1.len -= adjust;
487 range2.len -= adjust;
488 last_delete = Range::empty();
489 last_insert = Range::empty();
490 }
491 Diff::Delete(range) => {
492 skip_overlap(&last_delete, range);
493 if range.len == 0 {
494 continue;
495 }
496 let adjust = boundary_down(range.doc, range.offset);
497 range.offset -= adjust;
498 range.len += adjust;
499 let adjust = boundary_up(range.doc, range.offset + range.len);
500 range.len += adjust;
501 last_delete = *range;
502 }
503 Diff::Insert(range) => {
504 skip_overlap(&last_insert, range);
505 if range.len == 0 {
506 continue;
507 }
508 let adjust = boundary_down(range.doc, range.offset);
509 range.offset -= adjust;
510 range.len += adjust;
511 let adjust = boundary_up(range.doc, range.offset + range.len);
512 range.len += adjust;
513 last_insert = *range;
514 }
515 }
516 solution.diffs[retain] = diff;
517 retain += 1;
518 }
519
520 solution.diffs.truncate(retain);
521}
522
523// Reduce the number of edits by eliminating semantically trivial equalities.
524fn cleanup_semantic(solution: &mut Solution) {
525 let mut diffs = &mut solution.diffs;
526 if diffs.is_empty() {
527 return;
528 }
529
530 let mut changes = false;
531 let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
532 let mut last_equality = None; // Always equal to equalities.peek().text
533 let mut pointer = 0;
534 // Number of characters that changed prior to the equality.
535 let mut len_insertions1 = 0;
536 let mut len_deletions1 = 0;
537 // Number of characters that changed after the equality.
538 let mut len_insertions2 = 0;
539 let mut len_deletions2 = 0;
540 while let Some(&this_diff) = diffs.get(pointer) {
541 match this_diff {
542 Diff::Equal(text1, text2) => {
543 equalities.push_back(pointer);
544 len_insertions1 = len_insertions2;
545 len_deletions1 = len_deletions2;
546 len_insertions2 = 0;
547 len_deletions2 = 0;
548 last_equality = Some((text1, text2));
549 pointer += 1;
550 continue;
551 }
552 Diff::Delete(text) => len_deletions2 += text.len,
553 Diff::Insert(text) => len_insertions2 += text.len,
554 }
555 // Eliminate an equality that is smaller or equal to the edits on both
556 // sides of it.
557 if last_equality.map_or(false, |(last_equality, _)| {
558 last_equality.len <= cmp::max(len_insertions1, len_deletions1)
559 && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
560 }) {
561 // Jump back to offending equality.
562 pointer = equalities.pop_back().unwrap();
563
564 // Replace equality with a delete.
565 diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
566 // Insert a corresponding insert.
567 diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
568
569 len_insertions1 = 0; // Reset the counters.
570 len_insertions2 = 0;
571 len_deletions1 = 0;
572 len_deletions2 = 0;
573 last_equality = None;
574 changes = true;
575
576 // Throw away the previous equality (it needs to be reevaluated).
577 equalities.pop_back();
578 if let Some(back) = equalities.back() {
579 // There is a safe equality we can fall back to.
580 pointer = *back;
581 } else {
582 // There are no previous equalities, jump back to the start.
583 pointer = 0;
584 continue;
585 }
586 }
587 pointer += 1;
588 }
589
590 // Normalize the diff.
591 if changes {
592 cleanup_merge(solution);
593 }
594 cleanup_semantic_lossless(solution);
595 diffs = &mut solution.diffs;
596
597 // Find any overlaps between deletions and insertions.
598 // e.g: <del>abcxxx</del><ins>xxxdef</ins>
599 // -> <del>abc</del>xxx<ins>def</ins>
600 // e.g: <del>xxxabc</del><ins>defxxx</ins>
601 // -> <ins>def</ins>xxx<del>abc</del>
602 // Only extract an overlap if it is as big as the edit ahead or behind it.
603 let mut pointer = 1;
604 while let Some(&this_diff) = diffs.get(pointer) {
605 let prev_diff = diffs[pointer - 1];
606 if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
607 let overlap_len1 = common_overlap(deletion, insertion);
608 let overlap_len2 = common_overlap(insertion, deletion);
609 let overlap_min = cmp::min(deletion.len, insertion.len);
610 if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
611 // Overlap found. Insert an equality and trim the surrounding edits.
612 diffs.insert(
613 pointer,
614 Diff::Equal(
615 deletion.substring(deletion.len - overlap_len1..deletion.len),
616 insertion.substring(..overlap_len1),
617 ),
618 );
619 diffs[pointer - 1] =
620 Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
621 diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
622 } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
623 // Reverse overlap found.
624 // Insert an equality and swap and trim the surrounding edits.
625 diffs.insert(
626 pointer,
627 Diff::Equal(
628 deletion.substring(..overlap_len2),
629 insertion.substring(insertion.len - overlap_len2..insertion.len),
630 ),
631 );
632 diffs[pointer - 1] =
633 Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
634 diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
635 }
636 pointer += 1;
637 }
638 pointer += 1;
639 }
640}
641
642// Look for single edits surrounded on both sides by equalities which can be
643// shifted sideways to align the edit to a word boundary.
644//
645// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
646fn cleanup_semantic_lossless(solution: &mut Solution) {
647 let diffs = &mut solution.diffs;
648 let mut pointer = 1;
649 while let Some(&next_diff) = diffs.get(pointer + 1) {
650 let prev_diff = diffs[pointer - 1];
651 if let (
652 Diff::Equal(mut prev_equal1, mut prev_equal2),
653 Diff::Equal(mut next_equal1, mut next_equal2),
654 ) = (prev_diff, next_diff)
655 {
656 // This is a single edit surrounded by equalities.
657 let mut edit = diffs[pointer];
658
659 // First, shift the edit as far left as possible.
660 let common_offset = common_suffix(prev_equal1, edit.text());
661 let original_prev_len = prev_equal1.len;
662 prev_equal1.len -= common_offset;
663 prev_equal2.len -= common_offset;
664 edit.shift_left(common_offset);
665 next_equal1.offset -= common_offset;
666 next_equal1.len += common_offset;
667 next_equal2.offset -= common_offset;
668 next_equal2.len += common_offset;
669
670 // Second, step character by character right, looking for the best fit.
671 let mut best_prev_equal = (prev_equal1, prev_equal2);
672 let mut best_edit = edit;
673 let mut best_next_equal = (next_equal1, next_equal2);
674 let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
675 + cleanup_semantic_score(edit.text(), next_equal1);
676 while !edit.text().is_empty()
677 && !next_equal1.is_empty()
678 && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
679 {
680 prev_equal1.len += 1;
681 prev_equal2.len += 1;
682 edit.shift_right(1);
683 next_equal1.offset += 1;
684 next_equal1.len -= 1;
685 next_equal2.offset += 1;
686 next_equal2.len -= 1;
687 let score = cleanup_semantic_score(prev_equal1, edit.text())
688 + cleanup_semantic_score(edit.text(), next_equal1);
689 // The >= encourages trailing rather than leading whitespace on edits.
690 if score >= best_score {
691 best_score = score;
692 best_prev_equal = (prev_equal1, prev_equal2);
693 best_edit = edit;
694 best_next_equal = (next_equal1, next_equal2);
695 }
696 }
697
698 if original_prev_len != best_prev_equal.0.len {
699 // We have an improvement, save it back to the diff.
700 if best_next_equal.0.is_empty() {
701 diffs.remove(pointer + 1);
702 } else {
703 diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
704 }
705 diffs[pointer] = best_edit;
706 if best_prev_equal.0.is_empty() {
707 diffs.remove(pointer - 1);
708 pointer -= 1;
709 } else {
710 diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
711 }
712 }
713 }
714 pointer += 1;
715 }
716}
717
718// Given two strings, compute a score representing whether the internal boundary
719// falls on logical boundaries.
720//
721// Scores range from 6 (best) to 0 (worst).
722fn cleanup_semantic_score(one: Range, two: Range) -> usize {
723 if one.is_empty() || two.is_empty() {
724 // Edges are the best.
725 return 6;
726 }
727
728 // Each port of this function behaves slightly differently due to subtle
729 // differences in each language's definition of things like 'whitespace'.
730 // Since this function's purpose is largely cosmetic, the choice has been
731 // made to use each language's native features rather than force total
732 // conformity.
733 let char1 = one.chars().next_back().unwrap();
734 let char2 = two.chars().next().unwrap();
735 let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
736 let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
737 let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
738 let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
739 let line_break1 = whitespace1 && char1.is_control();
740 let line_break2 = whitespace2 && char2.is_control();
741 let blank_line1 =
742 line_break1 && (one.ends_with(['\n', '\n']) || one.ends_with(['\n', '\r', '\n']));
743 let blank_line2 =
744 line_break2 && (two.starts_with(['\n', '\n']) || two.starts_with(['\r', '\n', '\r', '\n']));
745
746 if blank_line1 || blank_line2 {
747 // Five points for blank lines.
748 5
749 } else if line_break1 || line_break2 {
750 // Four points for line breaks.
751 4
752 } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
753 // Three points for end of sentences.
754 3
755 } else if whitespace1 || whitespace2 {
756 // Two points for whitespace.
757 2
758 } else if non_alphanumeric1 || non_alphanumeric2 {
759 // One point for non-alphanumeric.
760 1
761 } else {
762 0
763 }
764}
765
766// Reorder and merge like edit sections. Merge equalities. Any edit section can
767// move as long as it doesn't cross an equality.
768fn cleanup_merge(solution: &mut Solution) {
769 let diffs = &mut solution.diffs;
770 while !diffs.is_empty() {
771 diffs.push(Diff::Equal(
772 solution.text1.substring(solution.text1.len..),
773 solution.text2.substring(solution.text2.len..),
774 )); // Add a dummy entry at the end.
775 let mut pointer = 0;
776 let mut count_delete = 0;
777 let mut count_insert = 0;
778 let mut text_delete = Range::empty();
779 let mut text_insert = Range::empty();
780 while let Some(&this_diff) = diffs.get(pointer) {
781 match this_diff {
782 Diff::Insert(text) => {
783 count_insert += 1;
784 if text_insert.is_empty() {
785 text_insert = text;
786 } else {
787 text_insert.len += text.len;
788 }
789 }
790 Diff::Delete(text) => {
791 count_delete += 1;
792 if text_delete.is_empty() {
793 text_delete = text;
794 } else {
795 text_delete.len += text.len;
796 }
797 }
798 Diff::Equal(text, _) => {
799 let count_both = count_delete + count_insert;
800 if count_both > 1 {
801 let both_types = count_delete != 0 && count_insert != 0;
802 // Delete the offending records.
803 diffs.splice(pointer - count_both..pointer, None);
804 pointer -= count_both;
805 if both_types {
806 // Factor out any common prefix.
807 let common_length = common_prefix(text_insert, text_delete);
808 if common_length != 0 {
809 if pointer > 0 {
810 match &mut diffs[pointer - 1] {
811 Diff::Equal(this_diff1, this_diff2) => {
812 this_diff1.len += common_length;
813 this_diff2.len += common_length;
814 }
815 _ => unreachable!(
816 "previous diff should have been an equality"
817 ),
818 }
819 } else {
820 diffs.insert(
821 pointer,
822 Diff::Equal(
823 text_delete.substring(..common_length),
824 text_insert.substring(..common_length),
825 ),
826 );
827 pointer += 1;
828 }
829 text_insert = text_insert.substring(common_length..);
830 text_delete = text_delete.substring(common_length..);
831 }
832 // Factor out any common suffix.
833 let common_length = common_suffix(text_insert, text_delete);
834 if common_length != 0 {
835 diffs[pointer].grow_left(common_length);
836 text_insert.len -= common_length;
837 text_delete.len -= common_length;
838 }
839 }
840 // Insert the merged records.
841 if !text_delete.is_empty() {
842 diffs.insert(pointer, Diff::Delete(text_delete));
843 pointer += 1;
844 }
845 if !text_insert.is_empty() {
846 diffs.insert(pointer, Diff::Insert(text_insert));
847 pointer += 1;
848 }
849 } else if pointer > 0 {
850 if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
851 diffs.get_mut(pointer - 1)
852 {
853 // Merge this equality with the previous one.
854 prev_equal1.len += text.len;
855 prev_equal2.len += text.len;
856 diffs.remove(pointer);
857 pointer -= 1;
858 }
859 }
860 count_insert = 0;
861 count_delete = 0;
862 text_delete = Range::empty();
863 text_insert = Range::empty();
864 }
865 }
866 pointer += 1;
867 }
868 if diffs.last().unwrap().text().is_empty() {
869 diffs.pop(); // Remove the dummy entry at the end.
870 }
871
872 // Second pass: look for single edits surrounded on both sides by equalities
873 // which can be shifted sideways to eliminate an equality.
874 // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
875 let mut changes = false;
876 let mut pointer = 1;
877 // Intentionally ignore the first and last element (don't need checking).
878 while let Some(&next_diff) = diffs.get(pointer + 1) {
879 let prev_diff = diffs[pointer - 1];
880 let this_diff = diffs[pointer];
881 if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
882 // This is a single edit surrounded by equalities.
883 if this_diff.text().ends_with(prev_diff) {
884 // Shift the edit over the previous equality.
885 diffs[pointer].shift_left(prev_diff.len);
886 diffs[pointer + 1].grow_left(prev_diff.len);
887 diffs.remove(pointer - 1); // Delete prev_diff.
888 changes = true;
889 } else if this_diff.text().starts_with(next_diff) {
890 // Shift the edit over the next equality.
891 diffs[pointer - 1].grow_right(next_diff.len);
892 diffs[pointer].shift_right(next_diff.len);
893 diffs.remove(pointer + 1); // Delete next_diff.
894 changes = true;
895 }
896 }
897 pointer += 1;
898 }
899 // If shifts were made, the diff needs reordering and another shift sweep.
900 if !changes {
901 return;
902 }
903 }
904}
905
906impl Debug for Chunk<'_> {
907 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
908 let (name: &str, text: &str) = match *self {
909 Chunk::Equal(text: &str) => ("Equal", text),
910 Chunk::Delete(text: &str) => ("Delete", text),
911 Chunk::Insert(text: &str) => ("Insert", text),
912 };
913 write!(formatter, "{}({:?})", name, text)
914 }
915}
916
917impl Debug for Diff<'_, '_> {
918 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
919 let (name: &str, range: Range<'_>) = match *self {
920 Diff::Equal(range: Range<'_>, _) => ("Equal", range),
921 Diff::Delete(range: Range<'_>) => ("Delete", range),
922 Diff::Insert(range: Range<'_>) => ("Insert", range),
923 };
924 formatter.write_str(data:name)?;
925 formatter.write_str(data:"(\"")?;
926 for ch: char in range.chars() {
927 if ch == '\'' {
928 // escape_debug turns this into "\'" which is unnecessary.
929 formatter.write_char(ch)?;
930 } else {
931 Display::fmt(&ch.escape_debug(), f:formatter)?;
932 }
933 }
934 formatter.write_str(data:"\")")?;
935 Ok(())
936 }
937}
938