utils.rs source code [crates/similar/src/algorithms/utils.rs]

1	use std::collections::hash_map::Entry;
2	use std::collections::HashMap;
3	use std::fmt::Debug;
4	use std::hash::{Hash, Hasher};
5	use std::ops::{Add, Index, Range};
6
7	/// Utility function to check if a range is empty that works on older rust versions
8	#[inline(always)]
9	#[allow(clippy::neg_cmp_op_on_partial_ord)]
10	pub fn is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool {
11	!(range.start < range.end)
12	}
13
14	/// Represents an item in the vector returned by [`unique`].
15	///
16	/// It compares like the underlying item does it was created from but
17	/// carries the index it was originally created from.
18	pub struct UniqueItem<'a, Idx: ?Sized> {
19	lookup: &'a Idx,
20	index: usize,
21	}
22
23	impl<'a, Idx: ?Sized> UniqueItem<'a, Idx>
24	where
25	Idx: Index<usize>,
26	{
27	/// Returns the value.
28	#[inline(always)]
29	pub fn value(&self) -> &Idx::Output {
30	&self.lookup[self.index]
31	}
32
33	/// Returns the original index.
34	#[inline(always)]
35	pub fn original_index(&self) -> usize {
36	self.index
37	}
38	}
39
40	impl<'a, Idx: Index<usize> + 'a> Debug for UniqueItem<'a, Idx>
41	where
42	Idx::Output: Debug,
43	{
44	fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
45	f&mut DebugStruct<'_, '_>.debug_struct("UniqueItem")
46	.field("value", &self.value())
47	.field(name:"original_index", &self.original_index())
48	.finish()
49	}
50	}
51
52	impl<'a, 'b, A, B> PartialEq<UniqueItem<'a, A>> for UniqueItem<'b, B>
53	where
54	A: Index<usize> + 'b + ?Sized,
55	B: Index<usize> + 'b + ?Sized,
56	B::Output: PartialEq<A::Output>,
57	{
58	#[inline(always)]
59	fn eq(&self, other: &UniqueItem<'a, A>) -> bool {
60	self.value() == other.value()
61	}
62	}
63
64	/// Returns only unique items in the sequence as vector.
65	///
66	/// Each item is wrapped in a [`UniqueItem`] so that both the value and the
67	/// index can be extracted.
68	pub fn unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>>
69	where
70	Idx: Index<usize> + ?Sized,
71	Idx::Output: Hash + Eq,
72	{
73	let mut by_item: HashMap<&impl Hash + Eq, Option<…>> = HashMap::new();
74	for index: usize in range {
75	match by_item.entry(&lookup[index]) {
76	Entry::Vacant(entry: VacantEntry<'_, &impl Hash + Eq, …>) => {
77	entry.insert(Some(index));
78	}
79	Entry::Occupied(mut entry: OccupiedEntry<'_, &impl Hash + Eq, …>) => {
80	let entry: &mut Option = entry.get_mut();
81	if entry.is_some() {
82	*entry = None
83	}
84	}
85	}
86	}
87	let mut rv: Vec> = by_itemimpl Iterator>
88	.into_iter()
89	.filter_map(\|(_, x: Option)\| x)
90	.map(\|index: usize\| UniqueItem { lookup, index })
91	.collect::<Vec<_>>();
92	rv.sort_by_key(\|a: &UniqueItem<'_, Idx>\| a.original_index());
93	rv
94	}
95
96	/// Given two lookups and ranges calculates the length of the common prefix.
97	pub fn common_prefix_len<Old, New>(
98	old: &Old,
99	old_range: Range<usize>,
100	new: &New,
101	new_range: Range<usize>,
102	) -> usize
103	where
104	Old: Index<usize> + ?Sized,
105	New: Index<usize> + ?Sized,
106	New::Output: PartialEq<Old::Output>,
107	{
108	if is_empty_range(&old_range) \|\| is_empty_range(&new_range) {
109	return `0`;
110	}
111	new_rangeimpl Iterator
112	.zip(old_range)
113	.take_while(
114	#[inline(always)]
115	\|x: &(usize, usize)\| new[x.0] == old[x.1],
116	)
117	.count()
118	}
119
120	/// Given two lookups and ranges calculates the length of common suffix.
121	pub fn common_suffix_len<Old, New>(
122	old: &Old,
123	old_range: Range<usize>,
124	new: &New,
125	new_range: Range<usize>,
126	) -> usize
127	where
128	Old: Index<usize> + ?Sized,
129	New: Index<usize> + ?Sized,
130	New::Output: PartialEq<Old::Output>,
131	{
132	if is_empty_range(&old_range) \|\| is_empty_range(&new_range) {
133	return `0`;
134	}
135	new_rangeimpl Iterator
136	.rev()
137	.zip(old_range.rev())
138	.take_while(
139	#[inline(always)]
140	\|x: &(usize, usize)\| new[x.0] == old[x.1],
141	)
142	.count()
143	}
144
145	struct OffsetLookup<Int> {
146	offset: usize,
147	vec: Vec<Int>,
148	}
149
150	impl<Int> Index<usize> for OffsetLookup<Int> {
151	type Output = Int;
152
153	#[inline(always)]
154	fn index(&self, index: usize) -> &Self::Output {
155	&self.vec[index - self.offset]
156	}
157	}
158
159	/// A utility struct to convert distinct items to unique integers.
160	///
161	/// This can be helpful on larger inputs to speed up the comparisons
162	/// performed by doing a first pass where the data set gets reduced
163	/// to (small) integers.
164	///
165	/// The idea is that instead of passing two sequences to a diffling algorithm
166	/// you first pass it via [`IdentifyDistinct`]:
167	///
168	/// ```rust
169	/// use similar::capture_diff;
170	/// use similar::algorithms::{Algorithm, IdentifyDistinct};
171	///
172	/// let old = &["foo", "bar", "baz"][..];
173	/// let new = &["foo", "blah", "baz"][..];
174	/// let h = IdentifyDistinct::<u32>::new(old, `0`..old.len(), new, `0`..new.len());
175	/// let ops = capture_diff(
176	/// Algorithm::Myers,
177	/// h.old_lookup(),
178	/// h.old_range(),
179	/// h.new_lookup(),
180	/// h.new_range(),
181	/// );
182	/// ```
183	///
184	/// The indexes are the same as with the passed source ranges.
185	pub struct IdentifyDistinct<Int> {
186	old: OffsetLookup<Int>,
187	new: OffsetLookup<Int>,
188	}
189
190	impl<Int> IdentifyDistinct<Int>
191	where
192	Int: Add<Output = Int> + From<u8> + Default + Copy,
193	{
194	/// Creates an int hasher for two sequences.
195	pub fn new<Old, New>(
196	old: &Old,
197	old_range: Range<usize>,
198	new: &New,
199	new_range: Range<usize>,
200	) -> Self
201	where
202	Old: Index<usize> + ?Sized,
203	Old::Output: Eq + Hash,
204	New: Index<usize> + ?Sized,
205	New::Output: Eq + Hash + PartialEq<Old::Output>,
206	{
207	enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
208	Old(&'old Old),
209	New(&'new New),
210	}
211
212	impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New>
213	where
214	Old: Hash + ?Sized,
215	New: Hash + ?Sized,
216	{
217	fn hash<H: Hasher>(&self, state: &mut H) {
218	match *self {
219	Key::Old(val) => val.hash(state),
220	Key::New(val) => val.hash(state),
221	}
222	}
223	}
224
225	impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New>
226	where
227	Old: Eq + ?Sized,
228	New: Eq + PartialEq<Old> + ?Sized,
229	{
230	#[inline(always)]
231	fn eq(&self, other: &Self) -> bool {
232	match (self, other) {
233	(Key::Old(a), Key::Old(b)) => a == b,
234	(Key::New(a), Key::New(b)) => a == b,
235	(Key::Old(a), Key::New(b)) \| (Key::New(b), Key::Old(a)) => b == a,
236	}
237	}
238	}
239
240	impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New>
241	where
242	Old: Eq + ?Sized,
243	New: Eq + PartialEq<Old> + ?Sized,
244	{
245	}
246
247	let mut map = HashMap::new();
248	let mut old_seq = Vec::new();
249	let mut new_seq = Vec::new();
250	let mut next_id = Int::default();
251	let step = Int::from(`1`);
252	let old_start = old_range.start;
253	let new_start = new_range.start;
254
255	for idx in old_range {
256	let item = Key::Old(&old[idx]);
257	let id = match map.entry(item) {
258	Entry::Occupied(o) => *o.get(),
259	Entry::Vacant(v) => {
260	let id = next_id;
261	next_id = next_id + step;
262	*v.insert(id)
263	}
264	};
265	old_seq.push(id);
266	}
267
268	for idx in new_range {
269	let item = Key::New(&new[idx]);
270	let id = match map.entry(item) {
271	Entry::Occupied(o) => *o.get(),
272	Entry::Vacant(v) => {
273	let id = next_id;
274	next_id = next_id + step;
275	*v.insert(id)
276	}
277	};
278	new_seq.push(id);
279	}
280
281	IdentifyDistinct {
282	old: OffsetLookup {
283	offset: old_start,
284	vec: old_seq,
285	},
286	new: OffsetLookup {
287	offset: new_start,
288	vec: new_seq,
289	},
290	}
291	}
292
293	/// Returns a lookup for the old side.
294	pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
295	&self.old
296	}
297
298	/// Returns a lookup for the new side.
299	pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
300	&self.new
301	}
302
303	/// Convenience method to get back the old range.
304	pub fn old_range(&self) -> Range<usize> {
305	self.old.offset..self.old.offset + self.old.vec.len()
306	}
307
308	/// Convenience method to get back the new range.
309	pub fn new_range(&self) -> Range<usize> {
310	self.new.offset..self.new.offset + self.new.vec.len()
311	}
312	}
313
314	#[test]
315	fn test_unique() {
316	let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], `0`..`6`)
317	.into_iter()
318	.map(\|x\| (*x.value(), x.original_index()))
319	.collect::<Vec<_>>();
320	assert_eq!(u, vec![('a', `0`), ('c', `2`)]);
321	}
322
323	#[test]
324	fn test_int_hasher() {
325	let ih = IdentifyDistinct::<u8>::new(
326	&["", "foo", "bar", "baz"][..],
327	`1`..`4`,
328	&["", "foo", "blah", "baz"][..],
329	`1`..`4`,
330	);
331	assert_eq!(ih.old_lookup()[`1`], `0`);
332	assert_eq!(ih.old_lookup()[`2`], `1`);
333	assert_eq!(ih.old_lookup()[`3`], `2`);
334	assert_eq!(ih.new_lookup()[`1`], `0`);
335	assert_eq!(ih.new_lookup()[`2`], `3`);
336	assert_eq!(ih.new_lookup()[`3`], `2`);
337	assert_eq!(ih.old_range(), `1`..`4`);
338	assert_eq!(ih.new_range(), `1`..`4`);
339	}
340
341	#[test]
342	fn test_common_prefix_len() {
343	assert_eq!(
344	common_prefix_len("".as_bytes(), `0`..`0`, "".as_bytes(), `0`..`0`),
345	`0`
346	);
347	assert_eq!(
348	common_prefix_len("foobarbaz".as_bytes(), `0`..`9`, "foobarblah".as_bytes(), `0`..`10`),
349	`7`
350	);
351	assert_eq!(
352	common_prefix_len("foobarbaz".as_bytes(), `0`..`9`, "blablabla".as_bytes(), `0`..`9`),
353	`0`
354	);
355	assert_eq!(
356	common_prefix_len("foobarbaz".as_bytes(), `3`..`9`, "foobarblah".as_bytes(), `3`..`10`),
357	`4`
358	);
359	}
360
361	#[test]
362	fn test_common_suffix_len() {
363	assert_eq!(
364	common_suffix_len("".as_bytes(), `0`..`0`, "".as_bytes(), `0`..`0`),
365	`0`
366	);
367	assert_eq!(
368	common_suffix_len("1234".as_bytes(), `0`..`4`, "X0001234".as_bytes(), `0`..`8`),
369	`4`
370	);
371	assert_eq!(
372	common_suffix_len("1234".as_bytes(), `0`..`4`, "Xxxx".as_bytes(), `0`..`4`),
373	`0`
374	);
375	assert_eq!(
376	common_suffix_len("1234".as_bytes(), `2`..`4`, "01234".as_bytes(), `2`..`5`),
377	`2`
378	);
379	}
380