1 | use std::collections::hash_map::Entry; |
2 | use std::collections::HashMap; |
3 | use std::fmt::Debug; |
4 | use std::hash::{Hash, Hasher}; |
5 | use std::ops::{Add, Index, Range}; |
6 | |
7 | /// Utility function to check if a range is empty that works on older rust versions |
8 | #[inline (always)] |
9 | #[allow (clippy::neg_cmp_op_on_partial_ord)] |
10 | pub fn is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool { |
11 | !(range.start < range.end) |
12 | } |
13 | |
14 | /// Represents an item in the vector returned by [`unique`]. |
15 | /// |
16 | /// It compares like the underlying item does it was created from but |
17 | /// carries the index it was originally created from. |
18 | pub struct UniqueItem<'a, Idx: ?Sized> { |
19 | lookup: &'a Idx, |
20 | index: usize, |
21 | } |
22 | |
23 | impl<'a, Idx: ?Sized> UniqueItem<'a, Idx> |
24 | where |
25 | Idx: Index<usize>, |
26 | { |
27 | /// Returns the value. |
28 | #[inline (always)] |
29 | pub fn value(&self) -> &Idx::Output { |
30 | &self.lookup[self.index] |
31 | } |
32 | |
33 | /// Returns the original index. |
34 | #[inline (always)] |
35 | pub fn original_index(&self) -> usize { |
36 | self.index |
37 | } |
38 | } |
39 | |
40 | impl<'a, Idx: Index<usize> + 'a> Debug for UniqueItem<'a, Idx> |
41 | where |
42 | Idx::Output: Debug, |
43 | { |
44 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
45 | f&mut DebugStruct<'_, '_>.debug_struct("UniqueItem" ) |
46 | .field("value" , &self.value()) |
47 | .field(name:"original_index" , &self.original_index()) |
48 | .finish() |
49 | } |
50 | } |
51 | |
52 | impl<'a, 'b, A, B> PartialEq<UniqueItem<'a, A>> for UniqueItem<'b, B> |
53 | where |
54 | A: Index<usize> + 'b + ?Sized, |
55 | B: Index<usize> + 'b + ?Sized, |
56 | B::Output: PartialEq<A::Output>, |
57 | { |
58 | #[inline (always)] |
59 | fn eq(&self, other: &UniqueItem<'a, A>) -> bool { |
60 | self.value() == other.value() |
61 | } |
62 | } |
63 | |
64 | /// Returns only unique items in the sequence as vector. |
65 | /// |
66 | /// Each item is wrapped in a [`UniqueItem`] so that both the value and the |
67 | /// index can be extracted. |
68 | pub fn unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>> |
69 | where |
70 | Idx: Index<usize> + ?Sized, |
71 | Idx::Output: Hash + Eq, |
72 | { |
73 | let mut by_item: HashMap<&impl Hash + Eq, Option<…>> = HashMap::new(); |
74 | for index: usize in range { |
75 | match by_item.entry(&lookup[index]) { |
76 | Entry::Vacant(entry: VacantEntry<'_, &impl Hash + Eq, …>) => { |
77 | entry.insert(Some(index)); |
78 | } |
79 | Entry::Occupied(mut entry: OccupiedEntry<'_, &impl Hash + Eq, …>) => { |
80 | let entry: &mut Option = entry.get_mut(); |
81 | if entry.is_some() { |
82 | *entry = None |
83 | } |
84 | } |
85 | } |
86 | } |
87 | let mut rv: Vec> = by_itemimpl Iterator- >
|
88 | .into_iter() |
89 | .filter_map(|(_, x: Option)| x) |
90 | .map(|index: usize| UniqueItem { lookup, index }) |
91 | .collect::<Vec<_>>(); |
92 | rv.sort_by_key(|a: &UniqueItem<'_, Idx>| a.original_index()); |
93 | rv |
94 | } |
95 | |
96 | /// Given two lookups and ranges calculates the length of the common prefix. |
97 | pub fn common_prefix_len<Old, New>( |
98 | old: &Old, |
99 | old_range: Range<usize>, |
100 | new: &New, |
101 | new_range: Range<usize>, |
102 | ) -> usize |
103 | where |
104 | Old: Index<usize> + ?Sized, |
105 | New: Index<usize> + ?Sized, |
106 | New::Output: PartialEq<Old::Output>, |
107 | { |
108 | if is_empty_range(&old_range) || is_empty_range(&new_range) { |
109 | return 0; |
110 | } |
111 | new_rangeimpl Iterator |
112 | .zip(old_range) |
113 | .take_while( |
114 | #[inline (always)] |
115 | |x: &(usize, usize)| new[x.0] == old[x.1], |
116 | ) |
117 | .count() |
118 | } |
119 | |
120 | /// Given two lookups and ranges calculates the length of common suffix. |
121 | pub fn common_suffix_len<Old, New>( |
122 | old: &Old, |
123 | old_range: Range<usize>, |
124 | new: &New, |
125 | new_range: Range<usize>, |
126 | ) -> usize |
127 | where |
128 | Old: Index<usize> + ?Sized, |
129 | New: Index<usize> + ?Sized, |
130 | New::Output: PartialEq<Old::Output>, |
131 | { |
132 | if is_empty_range(&old_range) || is_empty_range(&new_range) { |
133 | return 0; |
134 | } |
135 | new_rangeimpl Iterator |
136 | .rev() |
137 | .zip(old_range.rev()) |
138 | .take_while( |
139 | #[inline (always)] |
140 | |x: &(usize, usize)| new[x.0] == old[x.1], |
141 | ) |
142 | .count() |
143 | } |
144 | |
145 | struct OffsetLookup<Int> { |
146 | offset: usize, |
147 | vec: Vec<Int>, |
148 | } |
149 | |
150 | impl<Int> Index<usize> for OffsetLookup<Int> { |
151 | type Output = Int; |
152 | |
153 | #[inline (always)] |
154 | fn index(&self, index: usize) -> &Self::Output { |
155 | &self.vec[index - self.offset] |
156 | } |
157 | } |
158 | |
159 | /// A utility struct to convert distinct items to unique integers. |
160 | /// |
161 | /// This can be helpful on larger inputs to speed up the comparisons |
162 | /// performed by doing a first pass where the data set gets reduced |
163 | /// to (small) integers. |
164 | /// |
165 | /// The idea is that instead of passing two sequences to a diffling algorithm |
166 | /// you first pass it via [`IdentifyDistinct`]: |
167 | /// |
168 | /// ```rust |
169 | /// use similar::capture_diff; |
170 | /// use similar::algorithms::{Algorithm, IdentifyDistinct}; |
171 | /// |
172 | /// let old = &["foo" , "bar" , "baz" ][..]; |
173 | /// let new = &["foo" , "blah" , "baz" ][..]; |
174 | /// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len()); |
175 | /// let ops = capture_diff( |
176 | /// Algorithm::Myers, |
177 | /// h.old_lookup(), |
178 | /// h.old_range(), |
179 | /// h.new_lookup(), |
180 | /// h.new_range(), |
181 | /// ); |
182 | /// ``` |
183 | /// |
184 | /// The indexes are the same as with the passed source ranges. |
185 | pub struct IdentifyDistinct<Int> { |
186 | old: OffsetLookup<Int>, |
187 | new: OffsetLookup<Int>, |
188 | } |
189 | |
190 | impl<Int> IdentifyDistinct<Int> |
191 | where |
192 | Int: Add<Output = Int> + From<u8> + Default + Copy, |
193 | { |
194 | /// Creates an int hasher for two sequences. |
195 | pub fn new<Old, New>( |
196 | old: &Old, |
197 | old_range: Range<usize>, |
198 | new: &New, |
199 | new_range: Range<usize>, |
200 | ) -> Self |
201 | where |
202 | Old: Index<usize> + ?Sized, |
203 | Old::Output: Eq + Hash, |
204 | New: Index<usize> + ?Sized, |
205 | New::Output: Eq + Hash + PartialEq<Old::Output>, |
206 | { |
207 | enum Key<'old, 'new, Old: ?Sized, New: ?Sized> { |
208 | Old(&'old Old), |
209 | New(&'new New), |
210 | } |
211 | |
212 | impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New> |
213 | where |
214 | Old: Hash + ?Sized, |
215 | New: Hash + ?Sized, |
216 | { |
217 | fn hash<H: Hasher>(&self, state: &mut H) { |
218 | match *self { |
219 | Key::Old(val) => val.hash(state), |
220 | Key::New(val) => val.hash(state), |
221 | } |
222 | } |
223 | } |
224 | |
225 | impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New> |
226 | where |
227 | Old: Eq + ?Sized, |
228 | New: Eq + PartialEq<Old> + ?Sized, |
229 | { |
230 | #[inline (always)] |
231 | fn eq(&self, other: &Self) -> bool { |
232 | match (self, other) { |
233 | (Key::Old(a), Key::Old(b)) => a == b, |
234 | (Key::New(a), Key::New(b)) => a == b, |
235 | (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a, |
236 | } |
237 | } |
238 | } |
239 | |
240 | impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New> |
241 | where |
242 | Old: Eq + ?Sized, |
243 | New: Eq + PartialEq<Old> + ?Sized, |
244 | { |
245 | } |
246 | |
247 | let mut map = HashMap::new(); |
248 | let mut old_seq = Vec::new(); |
249 | let mut new_seq = Vec::new(); |
250 | let mut next_id = Int::default(); |
251 | let step = Int::from(1); |
252 | let old_start = old_range.start; |
253 | let new_start = new_range.start; |
254 | |
255 | for idx in old_range { |
256 | let item = Key::Old(&old[idx]); |
257 | let id = match map.entry(item) { |
258 | Entry::Occupied(o) => *o.get(), |
259 | Entry::Vacant(v) => { |
260 | let id = next_id; |
261 | next_id = next_id + step; |
262 | *v.insert(id) |
263 | } |
264 | }; |
265 | old_seq.push(id); |
266 | } |
267 | |
268 | for idx in new_range { |
269 | let item = Key::New(&new[idx]); |
270 | let id = match map.entry(item) { |
271 | Entry::Occupied(o) => *o.get(), |
272 | Entry::Vacant(v) => { |
273 | let id = next_id; |
274 | next_id = next_id + step; |
275 | *v.insert(id) |
276 | } |
277 | }; |
278 | new_seq.push(id); |
279 | } |
280 | |
281 | IdentifyDistinct { |
282 | old: OffsetLookup { |
283 | offset: old_start, |
284 | vec: old_seq, |
285 | }, |
286 | new: OffsetLookup { |
287 | offset: new_start, |
288 | vec: new_seq, |
289 | }, |
290 | } |
291 | } |
292 | |
293 | /// Returns a lookup for the old side. |
294 | pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> { |
295 | &self.old |
296 | } |
297 | |
298 | /// Returns a lookup for the new side. |
299 | pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> { |
300 | &self.new |
301 | } |
302 | |
303 | /// Convenience method to get back the old range. |
304 | pub fn old_range(&self) -> Range<usize> { |
305 | self.old.offset..self.old.offset + self.old.vec.len() |
306 | } |
307 | |
308 | /// Convenience method to get back the new range. |
309 | pub fn new_range(&self) -> Range<usize> { |
310 | self.new.offset..self.new.offset + self.new.vec.len() |
311 | } |
312 | } |
313 | |
314 | #[test ] |
315 | fn test_unique() { |
316 | let u = unique(&vec!['a' , 'b' , 'c' , 'd' , 'd' , 'b' ], 0..6) |
317 | .into_iter() |
318 | .map(|x| (*x.value(), x.original_index())) |
319 | .collect::<Vec<_>>(); |
320 | assert_eq!(u, vec![('a' , 0), ('c' , 2)]); |
321 | } |
322 | |
323 | #[test ] |
324 | fn test_int_hasher() { |
325 | let ih = IdentifyDistinct::<u8>::new( |
326 | &["" , "foo" , "bar" , "baz" ][..], |
327 | 1..4, |
328 | &["" , "foo" , "blah" , "baz" ][..], |
329 | 1..4, |
330 | ); |
331 | assert_eq!(ih.old_lookup()[1], 0); |
332 | assert_eq!(ih.old_lookup()[2], 1); |
333 | assert_eq!(ih.old_lookup()[3], 2); |
334 | assert_eq!(ih.new_lookup()[1], 0); |
335 | assert_eq!(ih.new_lookup()[2], 3); |
336 | assert_eq!(ih.new_lookup()[3], 2); |
337 | assert_eq!(ih.old_range(), 1..4); |
338 | assert_eq!(ih.new_range(), 1..4); |
339 | } |
340 | |
341 | #[test ] |
342 | fn test_common_prefix_len() { |
343 | assert_eq!( |
344 | common_prefix_len("" .as_bytes(), 0..0, "" .as_bytes(), 0..0), |
345 | 0 |
346 | ); |
347 | assert_eq!( |
348 | common_prefix_len("foobarbaz" .as_bytes(), 0..9, "foobarblah" .as_bytes(), 0..10), |
349 | 7 |
350 | ); |
351 | assert_eq!( |
352 | common_prefix_len("foobarbaz" .as_bytes(), 0..9, "blablabla" .as_bytes(), 0..9), |
353 | 0 |
354 | ); |
355 | assert_eq!( |
356 | common_prefix_len("foobarbaz" .as_bytes(), 3..9, "foobarblah" .as_bytes(), 3..10), |
357 | 4 |
358 | ); |
359 | } |
360 | |
361 | #[test ] |
362 | fn test_common_suffix_len() { |
363 | assert_eq!( |
364 | common_suffix_len("" .as_bytes(), 0..0, "" .as_bytes(), 0..0), |
365 | 0 |
366 | ); |
367 | assert_eq!( |
368 | common_suffix_len("1234" .as_bytes(), 0..4, "X0001234" .as_bytes(), 0..8), |
369 | 4 |
370 | ); |
371 | assert_eq!( |
372 | common_suffix_len("1234" .as_bytes(), 0..4, "Xxxx" .as_bytes(), 0..4), |
373 | 0 |
374 | ); |
375 | assert_eq!( |
376 | common_suffix_len("1234" .as_bytes(), 2..4, "01234" .as_bytes(), 2..5), |
377 | 2 |
378 | ); |
379 | } |
380 | |