1//! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
2//!
3//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
6//!
7//! <br>
8//!
9//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
10//!
11//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
12//! The diff implementation is based on [Myers' diff algorithm] but includes
13//! some [semantic cleanups] to increase human readability by factoring out
14//! commonalities which are likely to be coincidental.
15//!
16//! Diff Match Patch was originally built in 2006 to power Google Docs.
17//!
18//! # Interface
19//!
20//! Here is the entire API of the Rust implementation. It operates on borrowed
21//! strings and the return value of the diff algorithm is a vector of chunks
22//! pointing into slices of those input strings.
23//!
24//! ```
25//! pub enum Chunk<'a> {
26//! Equal(&'a str),
27//! Delete(&'a str),
28//! Insert(&'a str),
29//! }
30//!
31//! # const IGNORE: &str = stringify! {
32//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
33//! # };
34//! ```
35//!
36//! [Diff Match Patch]: https://github.com/google/diff-match-patch
37//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
38//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
39
40#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.6")]
41#![allow(
42 clippy::blocks_in_if_conditions,
43 clippy::bool_to_int_with_if,
44 clippy::cast_possible_wrap,
45 clippy::cast_sign_loss,
46 clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
47 clippy::collapsible_else_if,
48 clippy::comparison_chain,
49 clippy::match_same_arms,
50 clippy::module_name_repetitions,
51 clippy::must_use_candidate,
52 clippy::new_without_default,
53 clippy::octal_escapes,
54 clippy::shadow_unrelated,
55 clippy::similar_names,
56 clippy::too_many_lines,
57 clippy::unseparated_literal_suffix,
58 unused_parens, // false positive on Some(&(mut diff)) pattern
59)]
60
61mod find;
62mod range;
63
64#[cfg(test)]
65mod tests;
66
67use crate::range::{slice, Range};
68use std::cmp;
69use std::collections::VecDeque;
70use std::fmt::{self, Debug, Display, Write};
71
72#[derive(Copy, Clone, PartialEq, Eq)]
73pub enum Chunk<'a> {
74 Equal(&'a str),
75 Delete(&'a str),
76 Insert(&'a str),
77}
78
79#[derive(Copy, Clone)]
80enum Diff<'a, 'b> {
81 Equal(Range<'a>, Range<'b>),
82 Delete(Range<'a>),
83 Insert(Range<'b>),
84}
85
86impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
87 fn text(&self) -> Range<'tmp> {
88 match *self {
89 Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
90 }
91 }
92
93 fn grow_left(&mut self, increment: usize) {
94 self.for_each(|range| {
95 range.offset -= increment;
96 range.len += increment;
97 });
98 }
99
100 fn grow_right(&mut self, increment: usize) {
101 self.for_each(|range| range.len += increment);
102 }
103
104 fn shift_left(&mut self, increment: usize) {
105 self.for_each(|range| range.offset -= increment);
106 }
107
108 fn shift_right(&mut self, increment: usize) {
109 self.for_each(|range| range.offset += increment);
110 }
111
112 fn for_each(&mut self, f: impl Fn(&mut Range)) {
113 match self {
114 Diff::Equal(range1, range2) => {
115 f(range1);
116 f(range2);
117 }
118 Diff::Delete(range) => f(range),
119 Diff::Insert(range) => f(range),
120 }
121 }
122}
123
124pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
125 let chars1: Vec<char> = text1.chars().collect();
126 let chars2: Vec<char> = text2.chars().collect();
127 let range1 = Range::new(&chars1, ..);
128 let range2 = Range::new(&chars2, ..);
129
130 let mut solution = main(range1, range2);
131 cleanup_char_boundary(&mut solution);
132 cleanup_semantic(&mut solution);
133 cleanup_merge(&mut solution);
134
135 let mut chunks = Vec::new();
136 let mut pos1 = 0;
137 let mut pos2 = 0;
138 for diff in solution.diffs {
139 chunks.push(match diff {
140 Diff::Equal(range, _) => {
141 let len = range.len_bytes();
142 let chunk = Chunk::Equal(&text1[pos1..pos1 + len]);
143 pos1 += len;
144 pos2 += len;
145 chunk
146 }
147 Diff::Delete(range) => {
148 let len = range.len_bytes();
149 let chunk = Chunk::Delete(&text1[pos1..pos1 + len]);
150 pos1 += len;
151 chunk
152 }
153 Diff::Insert(range) => {
154 let len = range.len_bytes();
155 let chunk = Chunk::Insert(&text2[pos2..pos2 + len]);
156 pos2 += len;
157 chunk
158 }
159 });
160 }
161 chunks
162}
163
164struct Solution<'a, 'b> {
165 text1: Range<'a>,
166 text2: Range<'b>,
167 diffs: Vec<Diff<'a, 'b>>,
168}
169
170fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
171 let whole1 = text1;
172 let whole2 = text2;
173
174 // Trim off common prefix.
175 let common_prefix_len = common_prefix(text1, text2);
176 let common_prefix = Diff::Equal(
177 text1.substring(..common_prefix_len),
178 text2.substring(..common_prefix_len),
179 );
180 text1 = text1.substring(common_prefix_len..);
181 text2 = text2.substring(common_prefix_len..);
182
183 // Trim off common suffix.
184 let common_suffix_len = common_suffix(text1, text2);
185 let common_suffix = Diff::Equal(
186 text1.substring(text1.len - common_suffix_len..),
187 text2.substring(text2.len - common_suffix_len..),
188 );
189 text1 = text1.substring(..text1.len - common_suffix_len);
190 text2 = text2.substring(..text2.len - common_suffix_len);
191
192 // Compute the diff on the middle block.
193 let mut solution = Solution {
194 text1: whole1,
195 text2: whole2,
196 diffs: compute(text1, text2),
197 };
198
199 // Restore the prefix and suffix.
200 if common_prefix_len > 0 {
201 solution.diffs.insert(0, common_prefix);
202 }
203 if common_suffix_len > 0 {
204 solution.diffs.push(common_suffix);
205 }
206
207 cleanup_merge(&mut solution);
208
209 solution
210}
211
212// Find the differences between two texts. Assumes that the texts do not have
213// any common prefix or suffix.
214fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
215 match (text1.is_empty(), text2.is_empty()) {
216 (true, true) => return Vec::new(),
217 (true, false) => return vec![Diff::Insert(text2)],
218 (false, true) => return vec![Diff::Delete(text1)],
219 (false, false) => {}
220 }
221
222 // Check for entire shorter text inside the longer text.
223 if text1.len > text2.len {
224 if let Some(i) = text1.find(text2) {
225 return vec![
226 Diff::Delete(text1.substring(..i)),
227 Diff::Equal(text1.substring(i..i + text2.len), text2),
228 Diff::Delete(text1.substring(i + text2.len..)),
229 ];
230 }
231 } else {
232 if let Some(i) = text2.find(text1) {
233 return vec![
234 Diff::Insert(text2.substring(..i)),
235 Diff::Equal(text1, text2.substring(i..i + text1.len)),
236 Diff::Insert(text2.substring(i + text1.len..)),
237 ];
238 }
239 }
240
241 if text1.len == 1 || text2.len == 1 {
242 // Single character string.
243 // After the previous check, the character can't be an equality.
244 return vec![Diff::Delete(text1), Diff::Insert(text2)];
245 }
246
247 bisect(text1, text2)
248}
249
250// Find the 'middle snake' of a diff, split the problem in two and return the
251// recursively constructed diff.
252//
253// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
254fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
255 let max_d = (text1.len + text2.len + 1) / 2;
256 let v_offset = max_d;
257 let v_len = 2 * max_d;
258 let mut v1 = vec![-1isize; v_len];
259 let mut v2 = vec![-1isize; v_len];
260 v1[v_offset + 1] = 0;
261 v2[v_offset + 1] = 0;
262 let delta = text1.len as isize - text2.len as isize;
263 // If the total number of characters is odd, then the front path will
264 // collide with the reverse path.
265 let front = delta % 2 != 0;
266 // Offsets for start and end of k loop.
267 // Prevents mapping of space beyond the grid.
268 let mut k1start = 0;
269 let mut k1end = 0;
270 let mut k2start = 0;
271 let mut k2end = 0;
272 for d in 0..max_d as isize {
273 // Walk the front path one step.
274 let mut k1 = -d + k1start;
275 while k1 <= d - k1end {
276 let k1_offset = (v_offset as isize + k1) as usize;
277 let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
278 v1[k1_offset + 1]
279 } else {
280 v1[k1_offset - 1] + 1
281 } as usize;
282 let mut y1 = (x1 as isize - k1) as usize;
283 if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
284 let advance = common_prefix(s1, s2);
285 x1 += advance;
286 y1 += advance;
287 }
288 v1[k1_offset] = x1 as isize;
289 if x1 > text1.len {
290 // Ran off the right of the graph.
291 k1end += 2;
292 } else if y1 > text2.len {
293 // Ran off the bottom of the graph.
294 k1start += 2;
295 } else if front {
296 let k2_offset = v_offset as isize + delta - k1;
297 if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
298 // Mirror x2 onto top-left coordinate system.
299 let x2 = text1.len as isize - v2[k2_offset as usize];
300 if x1 as isize >= x2 {
301 // Overlap detected.
302 return bisect_split(text1, text2, x1, y1);
303 }
304 }
305 }
306 k1 += 2;
307 }
308
309 // Walk the reverse path one step.
310 let mut k2 = -d + k2start;
311 while k2 <= d - k2end {
312 let k2_offset = (v_offset as isize + k2) as usize;
313 let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
314 v2[k2_offset + 1]
315 } else {
316 v2[k2_offset - 1] + 1
317 } as usize;
318 let mut y2 = (x2 as isize - k2) as usize;
319 if x2 < text1.len && y2 < text2.len {
320 let advance = common_suffix(
321 text1.substring(..text1.len - x2),
322 text2.substring(..text2.len - y2),
323 );
324 x2 += advance;
325 y2 += advance;
326 }
327 v2[k2_offset] = x2 as isize;
328 if x2 > text1.len {
329 // Ran off the left of the graph.
330 k2end += 2;
331 } else if y2 > text2.len {
332 // Ran off the top of the graph.
333 k2start += 2;
334 } else if !front {
335 let k1_offset = v_offset as isize + delta - k2;
336 if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
337 let x1 = v1[k1_offset as usize] as usize;
338 let y1 = v_offset + x1 - k1_offset as usize;
339 // Mirror x2 onto top-left coordinate system.
340 x2 = text1.len - x2;
341 if x1 >= x2 {
342 // Overlap detected.
343 return bisect_split(text1, text2, x1, y1);
344 }
345 }
346 }
347 k2 += 2;
348 }
349 }
350 // Number of diffs equals number of characters, no commonality at all.
351 vec![Diff::Delete(text1), Diff::Insert(text2)]
352}
353
354// Given the location of the 'middle snake', split the diff in two parts and
355// recurse.
356fn bisect_split<'a, 'b>(
357 text1: Range<'a>,
358 text2: Range<'b>,
359 x: usize,
360 y: usize,
361) -> Vec<Diff<'a, 'b>> {
362 let (text1a: Range<'_>, text1b: Range<'_>) = text1.split_at(mid:x);
363 let (text2a: Range<'_>, text2b: Range<'_>) = text2.split_at(mid:y);
364
365 // Compute both diffs serially.
366 let mut diffs: Vec> = main(text1:text1a, text2:text2a).diffs;
367 diffs.extend(iter:main(text1:text1b, text2:text2b).diffs);
368
369 diffs
370}
371
372// Determine the length of the common prefix of two strings.
373fn common_prefix(text1: Range, text2: Range) -> usize {
374 for (i: usize, (b1: char, b2: char)) in text1.chars().zip(text2.chars()).enumerate() {
375 if b1 != b2 {
376 return i;
377 }
378 }
379 cmp::min(v1:text1.len, v2:text2.len)
380}
381
382// Determine the length of the common suffix of two strings.
383fn common_suffix(text1: Range, text2: Range) -> usize {
384 for (i: usize, (b1: char, b2: char)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() {
385 if b1 != b2 {
386 return i;
387 }
388 }
389 cmp::min(v1:text1.len, v2:text2.len)
390}
391
392// Determine if the suffix of one string is the prefix of another.
393//
394// Returns the number of characters common to the end of the first string and
395// the start of the second string.
396fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
397 // Eliminate the null case.
398 if text1.is_empty() || text2.is_empty() {
399 return 0;
400 }
401 // Truncate the longer string.
402 if text1.len > text2.len {
403 text1 = text1.substring(text1.len - text2.len..);
404 } else if text1.len < text2.len {
405 text2 = text2.substring(..text1.len);
406 }
407 // Quick check for the worst case.
408 if slice(text1) == slice(text2) {
409 return text1.len;
410 }
411
412 // Start by looking for a single character match
413 // and increase length until no match is found.
414 // Performance analysis: https://neil.fraser.name/news/2010/11/04/
415 let mut best = 0;
416 let mut length = 1;
417 loop {
418 let pattern = text1.substring(text1.len - length..);
419 let found = match text2.find(pattern) {
420 Some(found) => found,
421 None => return best,
422 };
423 length += found;
424 if found == 0
425 || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length))
426 {
427 best = length;
428 length += 1;
429 }
430 }
431}
432
433fn cleanup_char_boundary(solution: &mut Solution) {
434 fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool {
435 // FIXME: use unicode-segmentation crate?
436 let _ = doc;
437 let _ = pos;
438 true
439 }
440
441 fn boundary_down(doc: &[char], pos: usize) -> usize {
442 let mut adjust = 0;
443 while !is_segmentation_boundary(doc, pos - adjust) {
444 adjust += 1;
445 }
446 adjust
447 }
448
449 fn boundary_up(doc: &[char], pos: usize) -> usize {
450 let mut adjust = 0;
451 while !is_segmentation_boundary(doc, pos + adjust) {
452 adjust += 1;
453 }
454 adjust
455 }
456
457 fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
458 let prev_end = prev.offset + prev.len;
459 if prev_end > range.offset {
460 let delta = cmp::min(prev_end - range.offset, range.len);
461 range.offset += delta;
462 range.len -= delta;
463 }
464 }
465
466 let mut read = 0;
467 let mut retain = 0;
468 let mut last_delete = Range::empty();
469 let mut last_insert = Range::empty();
470 while let Some(&(mut diff)) = solution.diffs.get(read) {
471 read += 1;
472 match &mut diff {
473 Diff::Equal(range1, range2) => {
474 let adjust = boundary_up(range1.doc, range1.offset);
475 // If the whole range is sub-character, skip it.
476 if range1.len <= adjust {
477 continue;
478 }
479 range1.offset += adjust;
480 range1.len -= adjust;
481 range2.offset += adjust;
482 range2.len -= adjust;
483 let adjust = boundary_down(range1.doc, range1.offset + range1.len);
484 range1.len -= adjust;
485 range2.len -= adjust;
486 last_delete = Range::empty();
487 last_insert = Range::empty();
488 }
489 Diff::Delete(range) => {
490 skip_overlap(&last_delete, range);
491 if range.len == 0 {
492 continue;
493 }
494 let adjust = boundary_down(range.doc, range.offset);
495 range.offset -= adjust;
496 range.len += adjust;
497 let adjust = boundary_up(range.doc, range.offset + range.len);
498 range.len += adjust;
499 last_delete = *range;
500 }
501 Diff::Insert(range) => {
502 skip_overlap(&last_insert, range);
503 if range.len == 0 {
504 continue;
505 }
506 let adjust = boundary_down(range.doc, range.offset);
507 range.offset -= adjust;
508 range.len += adjust;
509 let adjust = boundary_up(range.doc, range.offset + range.len);
510 range.len += adjust;
511 last_insert = *range;
512 }
513 }
514 solution.diffs[retain] = diff;
515 retain += 1;
516 }
517
518 solution.diffs.truncate(retain);
519}
520
521// Reduce the number of edits by eliminating semantically trivial equalities.
522fn cleanup_semantic(solution: &mut Solution) {
523 let mut diffs = &mut solution.diffs;
524 if diffs.is_empty() {
525 return;
526 }
527
528 let mut changes = false;
529 let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
530 let mut last_equality = None; // Always equal to equalities.peek().text
531 let mut pointer = 0;
532 // Number of characters that changed prior to the equality.
533 let mut len_insertions1 = 0;
534 let mut len_deletions1 = 0;
535 // Number of characters that changed after the equality.
536 let mut len_insertions2 = 0;
537 let mut len_deletions2 = 0;
538 while let Some(&this_diff) = diffs.get(pointer) {
539 match this_diff {
540 Diff::Equal(text1, text2) => {
541 equalities.push_back(pointer);
542 len_insertions1 = len_insertions2;
543 len_deletions1 = len_deletions2;
544 len_insertions2 = 0;
545 len_deletions2 = 0;
546 last_equality = Some((text1, text2));
547 pointer += 1;
548 continue;
549 }
550 Diff::Delete(text) => len_deletions2 += text.len,
551 Diff::Insert(text) => len_insertions2 += text.len,
552 }
553 // Eliminate an equality that is smaller or equal to the edits on both
554 // sides of it.
555 if last_equality.map_or(false, |(last_equality, _)| {
556 last_equality.len <= cmp::max(len_insertions1, len_deletions1)
557 && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
558 }) {
559 // Jump back to offending equality.
560 pointer = equalities.pop_back().unwrap();
561
562 // Replace equality with a delete.
563 diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
564 // Insert a corresponding insert.
565 diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
566
567 len_insertions1 = 0; // Reset the counters.
568 len_insertions2 = 0;
569 len_deletions1 = 0;
570 len_deletions2 = 0;
571 last_equality = None;
572 changes = true;
573
574 // Throw away the previous equality (it needs to be reevaluated).
575 equalities.pop_back();
576 if let Some(back) = equalities.back() {
577 // There is a safe equality we can fall back to.
578 pointer = *back;
579 } else {
580 // There are no previous equalities, jump back to the start.
581 pointer = 0;
582 continue;
583 }
584 }
585 pointer += 1;
586 }
587
588 // Normalize the diff.
589 if changes {
590 cleanup_merge(solution);
591 }
592 cleanup_semantic_lossless(solution);
593 diffs = &mut solution.diffs;
594
595 // Find any overlaps between deletions and insertions.
596 // e.g: <del>abcxxx</del><ins>xxxdef</ins>
597 // -> <del>abc</del>xxx<ins>def</ins>
598 // e.g: <del>xxxabc</del><ins>defxxx</ins>
599 // -> <ins>def</ins>xxx<del>abc</del>
600 // Only extract an overlap if it is as big as the edit ahead or behind it.
601 let mut pointer = 1;
602 while let Some(&this_diff) = diffs.get(pointer) {
603 let prev_diff = diffs[pointer - 1];
604 if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
605 let overlap_len1 = common_overlap(deletion, insertion);
606 let overlap_len2 = common_overlap(insertion, deletion);
607 let overlap_min = cmp::min(deletion.len, insertion.len);
608 if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
609 // Overlap found. Insert an equality and trim the surrounding edits.
610 diffs.insert(
611 pointer,
612 Diff::Equal(
613 deletion.substring(deletion.len - overlap_len1..deletion.len),
614 insertion.substring(..overlap_len1),
615 ),
616 );
617 diffs[pointer - 1] =
618 Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
619 diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
620 } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
621 // Reverse overlap found.
622 // Insert an equality and swap and trim the surrounding edits.
623 diffs.insert(
624 pointer,
625 Diff::Equal(
626 deletion.substring(..overlap_len2),
627 insertion.substring(insertion.len - overlap_len2..insertion.len),
628 ),
629 );
630 diffs[pointer - 1] =
631 Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
632 diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
633 }
634 pointer += 1;
635 }
636 pointer += 1;
637 }
638}
639
640// Look for single edits surrounded on both sides by equalities which can be
641// shifted sideways to align the edit to a word boundary.
642//
643// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
644fn cleanup_semantic_lossless(solution: &mut Solution) {
645 let diffs = &mut solution.diffs;
646 let mut pointer = 1;
647 while let Some(&next_diff) = diffs.get(pointer + 1) {
648 let prev_diff = diffs[pointer - 1];
649 if let (
650 Diff::Equal(mut prev_equal1, mut prev_equal2),
651 Diff::Equal(mut next_equal1, mut next_equal2),
652 ) = (prev_diff, next_diff)
653 {
654 // This is a single edit surrounded by equalities.
655 let mut edit = diffs[pointer];
656
657 // First, shift the edit as far left as possible.
658 let common_offset = common_suffix(prev_equal1, edit.text());
659 let original_prev_len = prev_equal1.len;
660 prev_equal1.len -= common_offset;
661 prev_equal2.len -= common_offset;
662 edit.shift_left(common_offset);
663 next_equal1.offset -= common_offset;
664 next_equal1.len += common_offset;
665 next_equal2.offset -= common_offset;
666 next_equal2.len += common_offset;
667
668 // Second, step character by character right, looking for the best fit.
669 let mut best_prev_equal = (prev_equal1, prev_equal2);
670 let mut best_edit = edit;
671 let mut best_next_equal = (next_equal1, next_equal2);
672 let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
673 + cleanup_semantic_score(edit.text(), next_equal1);
674 while !edit.text().is_empty()
675 && !next_equal1.is_empty()
676 && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
677 {
678 prev_equal1.len += 1;
679 prev_equal2.len += 1;
680 edit.shift_right(1);
681 next_equal1.offset += 1;
682 next_equal1.len -= 1;
683 next_equal2.offset += 1;
684 next_equal2.len -= 1;
685 let score = cleanup_semantic_score(prev_equal1, edit.text())
686 + cleanup_semantic_score(edit.text(), next_equal1);
687 // The >= encourages trailing rather than leading whitespace on edits.
688 if score >= best_score {
689 best_score = score;
690 best_prev_equal = (prev_equal1, prev_equal2);
691 best_edit = edit;
692 best_next_equal = (next_equal1, next_equal2);
693 }
694 }
695
696 if original_prev_len != best_prev_equal.0.len {
697 // We have an improvement, save it back to the diff.
698 if best_next_equal.0.is_empty() {
699 diffs.remove(pointer + 1);
700 } else {
701 diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
702 }
703 diffs[pointer] = best_edit;
704 if best_prev_equal.0.is_empty() {
705 diffs.remove(pointer - 1);
706 pointer -= 1;
707 } else {
708 diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
709 }
710 }
711 }
712 pointer += 1;
713 }
714}
715
716// Given two strings, compute a score representing whether the internal boundary
717// falls on logical boundaries.
718//
719// Scores range from 6 (best) to 0 (worst).
720fn cleanup_semantic_score(one: Range, two: Range) -> usize {
721 if one.is_empty() || two.is_empty() {
722 // Edges are the best.
723 return 6;
724 }
725
726 // Each port of this function behaves slightly differently due to subtle
727 // differences in each language's definition of things like 'whitespace'.
728 // Since this function's purpose is largely cosmetic, the choice has been
729 // made to use each language's native features rather than force total
730 // conformity.
731 let char1 = one.chars().next_back().unwrap();
732 let char2 = two.chars().next().unwrap();
733 let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
734 let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
735 let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
736 let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
737 let line_break1 = whitespace1 && char1.is_control();
738 let line_break2 = whitespace2 && char2.is_control();
739 let blank_line1 =
740 line_break1 && (one.ends_with(['\n', '\n']) || one.ends_with(['\n', '\r', '\n']));
741 let blank_line2 =
742 line_break2 && (two.starts_with(['\n', '\n']) || two.starts_with(['\r', '\n', '\r', '\n']));
743
744 if blank_line1 || blank_line2 {
745 // Five points for blank lines.
746 5
747 } else if line_break1 || line_break2 {
748 // Four points for line breaks.
749 4
750 } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
751 // Three points for end of sentences.
752 3
753 } else if whitespace1 || whitespace2 {
754 // Two points for whitespace.
755 2
756 } else if non_alphanumeric1 || non_alphanumeric2 {
757 // One point for non-alphanumeric.
758 1
759 } else {
760 0
761 }
762}
763
764// Reorder and merge like edit sections. Merge equalities. Any edit section can
765// move as long as it doesn't cross an equality.
766fn cleanup_merge(solution: &mut Solution) {
767 let diffs = &mut solution.diffs;
768 while !diffs.is_empty() {
769 diffs.push(Diff::Equal(
770 solution.text1.substring(solution.text1.len..),
771 solution.text2.substring(solution.text2.len..),
772 )); // Add a dummy entry at the end.
773 let mut pointer = 0;
774 let mut count_delete = 0;
775 let mut count_insert = 0;
776 let mut text_delete = Range::empty();
777 let mut text_insert = Range::empty();
778 while let Some(&this_diff) = diffs.get(pointer) {
779 match this_diff {
780 Diff::Insert(text) => {
781 count_insert += 1;
782 if text_insert.is_empty() {
783 text_insert = text;
784 } else {
785 text_insert.len += text.len;
786 }
787 }
788 Diff::Delete(text) => {
789 count_delete += 1;
790 if text_delete.is_empty() {
791 text_delete = text;
792 } else {
793 text_delete.len += text.len;
794 }
795 }
796 Diff::Equal(text, _) => {
797 let count_both = count_delete + count_insert;
798 if count_both > 1 {
799 let both_types = count_delete != 0 && count_insert != 0;
800 // Delete the offending records.
801 diffs.splice(pointer - count_both..pointer, None);
802 pointer -= count_both;
803 if both_types {
804 // Factor out any common prefix.
805 let common_length = common_prefix(text_insert, text_delete);
806 if common_length != 0 {
807 if pointer > 0 {
808 match &mut diffs[pointer - 1] {
809 Diff::Equal(this_diff1, this_diff2) => {
810 this_diff1.len += common_length;
811 this_diff2.len += common_length;
812 }
813 _ => unreachable!(
814 "previous diff should have been an equality"
815 ),
816 }
817 } else {
818 diffs.insert(
819 pointer,
820 Diff::Equal(
821 text_delete.substring(..common_length),
822 text_insert.substring(..common_length),
823 ),
824 );
825 pointer += 1;
826 }
827 text_insert = text_insert.substring(common_length..);
828 text_delete = text_delete.substring(common_length..);
829 }
830 // Factor out any common suffix.
831 let common_length = common_suffix(text_insert, text_delete);
832 if common_length != 0 {
833 diffs[pointer].grow_left(common_length);
834 text_insert.len -= common_length;
835 text_delete.len -= common_length;
836 }
837 }
838 // Insert the merged records.
839 if !text_delete.is_empty() {
840 diffs.insert(pointer, Diff::Delete(text_delete));
841 pointer += 1;
842 }
843 if !text_insert.is_empty() {
844 diffs.insert(pointer, Diff::Insert(text_insert));
845 pointer += 1;
846 }
847 } else if pointer > 0 {
848 if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
849 diffs.get_mut(pointer - 1)
850 {
851 // Merge this equality with the previous one.
852 prev_equal1.len += text.len;
853 prev_equal2.len += text.len;
854 diffs.remove(pointer);
855 pointer -= 1;
856 }
857 }
858 count_insert = 0;
859 count_delete = 0;
860 text_delete = Range::empty();
861 text_insert = Range::empty();
862 }
863 }
864 pointer += 1;
865 }
866 if diffs.last().unwrap().text().is_empty() {
867 diffs.pop(); // Remove the dummy entry at the end.
868 }
869
870 // Second pass: look for single edits surrounded on both sides by equalities
871 // which can be shifted sideways to eliminate an equality.
872 // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
873 let mut changes = false;
874 let mut pointer = 1;
875 // Intentionally ignore the first and last element (don't need checking).
876 while let Some(&next_diff) = diffs.get(pointer + 1) {
877 let prev_diff = diffs[pointer - 1];
878 let this_diff = diffs[pointer];
879 if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
880 // This is a single edit surrounded by equalities.
881 if this_diff.text().ends_with(prev_diff) {
882 // Shift the edit over the previous equality.
883 diffs[pointer].shift_left(prev_diff.len);
884 diffs[pointer + 1].grow_left(prev_diff.len);
885 diffs.remove(pointer - 1); // Delete prev_diff.
886 changes = true;
887 } else if this_diff.text().starts_with(next_diff) {
888 // Shift the edit over the next equality.
889 diffs[pointer - 1].grow_right(next_diff.len);
890 diffs[pointer].shift_right(next_diff.len);
891 diffs.remove(pointer + 1); // Delete next_diff.
892 changes = true;
893 }
894 }
895 pointer += 1;
896 }
897 // If shifts were made, the diff needs reordering and another shift sweep.
898 if !changes {
899 return;
900 }
901 }
902}
903
904impl Debug for Chunk<'_> {
905 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
906 let (name: &str, text: &str) = match *self {
907 Chunk::Equal(text: &str) => ("Equal", text),
908 Chunk::Delete(text: &str) => ("Delete", text),
909 Chunk::Insert(text: &str) => ("Insert", text),
910 };
911 write!(formatter, "{}({:?})", name, text)
912 }
913}
914
915impl Debug for Diff<'_, '_> {
916 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
917 let (name: &str, range: Range<'_>) = match *self {
918 Diff::Equal(range: Range<'_>, _) => ("Equal", range),
919 Diff::Delete(range: Range<'_>) => ("Delete", range),
920 Diff::Insert(range: Range<'_>) => ("Insert", range),
921 };
922 formatter.write_str(data:name)?;
923 formatter.write_str(data:"(\"")?;
924 for ch: char in range.chars() {
925 if ch == '\'' {
926 // escape_debug turns this into "\'" which is unnecessary.
927 formatter.write_char(ch)?;
928 } else {
929 Display::fmt(&ch.escape_debug(), f:formatter)?;
930 }
931 }
932 formatter.write_str(data:"\")")?;
933 Ok(())
934 }
935}
936