//! This library implements string similarity metrics.

#![forbid(unsafe_code)]
#![allow(
    // these casts are sometimes needed. They restrict the length of input iterators
    // but there isn't really any way around this except for always working with
    // 128 bit types
    clippy::cast_possible_wrap,
    clippy::cast_sign_loss,
    clippy::cast_precision_loss,
    // not practical
    clippy::needless_pass_by_value,
    clippy::similar_names,
    // noisy
    clippy::missing_errors_doc,
    clippy::missing_panics_doc,
    clippy::must_use_candidate,
    // todo https://github.com/rapidfuzz/strsim-rs/issues/59
    clippy::range_plus_one
)]

use std::char;
use std::cmp::{max, min};
use std::collections::HashMap;
use std::convert::TryFrom;
use std::error::Error;
use std::fmt::{self, Display, Formatter};
use std::hash::Hash;
use std::mem;
use std::str::Chars;

#[derive(Debug, PartialEq)]
pub enum StrSimError {
    DifferentLengthArgs,
}

impl Display for StrSimError {
    fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> {
        let text: &'static str = match self {
            StrSimError::DifferentLengthArgs => "Differing length arguments provided",
        };

        write!(fmt, "{}", text)
    }
}

impl Error for StrSimError {}

pub type HammingResult = Result<usize, StrSimError>;

/// Calculates the number of positions in the two sequences where the elements
/// differ. Returns an error if the sequences have different lengths.
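///
/// A usage sketch over non-string sequences, mirroring the unit tests in this
/// file:
///
/// ```
/// use strsim::generic_hamming;
///
/// assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
/// ```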
pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult
where
    Iter1: IntoIterator<Item = Elem1>,
    Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let (mut ita, mut itb) = (a.into_iter(), b.into_iter());
    let mut count: usize = 0;
    loop {
        match (ita.next(), itb.next()) {
            (Some(x), Some(y)) => {
                if x != y {
                    count += 1;
                }
            }
            (None, None) => return Ok(count),
            _ => return Err(StrSimError::DifferentLengthArgs),
        }
    }
}

/// Calculates the number of positions in the two strings where the characters
/// differ. Returns an error if the strings have different lengths.
///
/// ```
/// use strsim::{hamming, StrSimError::DifferentLengthArgs};
///
/// assert_eq!(Ok(3), hamming("hamming", "hammers"));
///
/// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham"));
/// ```
pub fn hamming(a: &str, b: &str) -> HammingResult {
    generic_hamming(a.chars(), b.chars())
}

/// Calculates the Jaro similarity between two sequences. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
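///
/// A usage sketch over non-string sequences, mirroring the unit tests in this
/// file:
///
/// ```
/// use strsim::generic_jaro;
///
/// assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
/// ```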
pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let a_len = a.into_iter().count();
    let b_len = b.into_iter().count();

    if a_len == 0 && b_len == 0 {
        return 1.0;
    } else if a_len == 0 || b_len == 0 {
        return 0.0;
    }

    let mut search_range = max(a_len, b_len) / 2;
    search_range = search_range.saturating_sub(1);

    // combine memory allocations to reduce runtime
    let mut flags_memory = vec![false; a_len + b_len];
    let (a_flags, b_flags) = flags_memory.split_at_mut(a_len);

    let mut matches = 0_usize;

    for (i, a_elem) in a.into_iter().enumerate() {
        // prevent integer wrapping
        let min_bound = if i > search_range {
            i - search_range
        } else {
            0
        };

        let max_bound = min(b_len, i + search_range + 1);

        for (j, b_elem) in b.into_iter().enumerate().take(max_bound) {
            if min_bound <= j && a_elem == b_elem && !b_flags[j] {
                a_flags[i] = true;
                b_flags[j] = true;
                matches += 1;
                break;
            }
        }
    }

    let mut transpositions = 0_usize;
    if matches != 0 {
        let mut b_iter = b_flags.iter().zip(b);
        for (a_flag, ch1) in a_flags.iter().zip(a) {
            if *a_flag {
                loop {
                    if let Some((b_flag, ch2)) = b_iter.next() {
                        if !*b_flag {
                            continue;
                        }

                        if ch1 != ch2 {
                            transpositions += 1;
                        }
                        break;
                    }
                }
            }
        }
    }
    transpositions /= 2;

    if matches == 0 {
        0.0
    } else {
        ((matches as f64 / a_len as f64)
            + (matches as f64 / b_len as f64)
            + ((matches - transpositions) as f64 / matches as f64))
            / 3.0
    }
}

struct StringWrapper<'a>(&'a str);

impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> {
    type Item = char;
    type IntoIter = Chars<'b>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.chars()
    }
}

/// Calculates the Jaro similarity between two strings. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
///
/// ```
/// use strsim::jaro;
///
/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
///         0.001);
/// ```
pub fn jaro(a: &str, b: &str) -> f64 {
    generic_jaro(&StringWrapper(a), &StringWrapper(b))
}

/// Like Jaro but gives a boost to sequences that have a common prefix.
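///
/// A minimal sketch: the sequences below share no elements, so the Jaro
/// similarity is 0.0 and no prefix boost is applied (values taken from the
/// `generic_jaro` unit test in this file):
///
/// ```
/// use strsim::generic_jaro_winkler;
///
/// assert_eq!(0.0, generic_jaro_winkler(&[1, 2], &[3, 4]));
/// ```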
pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let sim: f64 = generic_jaro(a, b);

    if sim > 0.7 {
        let prefix_length = a
            .into_iter()
            .take(4)
            .zip(b)
            .take_while(|(a_elem, b_elem)| a_elem == b_elem)
            .count();

        sim + 0.1 * prefix_length as f64 * (1.0 - sim)
    } else {
        sim
    }
}

/// Like Jaro but gives a boost to strings that have a common prefix.
///
/// ```
/// use strsim::jaro_winkler;
///
/// assert!((0.866 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
///         0.001);
/// ```
pub fn jaro_winkler(a: &str, b: &str) -> f64 {
    generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
}

/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one sequence into the other.
///
/// ```
/// use strsim::generic_levenshtein;
///
/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6]));
/// ```
pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let b_len: usize = b.into_iter().count();

    let mut cache: Vec<usize> = (1..b_len + 1).collect();

    let mut result: usize = b_len;

    for (i, a_elem) in a.into_iter().enumerate() {
        result = i + 1;
        let mut distance_b = i;

        for (j, b_elem) in b.into_iter().enumerate() {
            let cost: usize = usize::from(a_elem != b_elem);
            let distance_a: usize = distance_b + cost;
            distance_b = cache[j];
            result = min(result + 1, min(distance_a, distance_b + 1));
            cache[j] = result;
        }
    }

    result
}

/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one string into the other.
///
/// ```
/// use strsim::levenshtein;
///
/// assert_eq!(3, levenshtein("kitten", "sitting"));
/// ```
pub fn levenshtein(a: &str, b: &str) -> usize {
    generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
}

/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
/// 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_levenshtein;
///
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}

/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
    let b_len = b.chars().count();
    // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size
    // this leads to significantly worse code gen when swapping the vectors below
    let mut prev_two_distances: Vec<usize> = (0..b_len + 1).collect();
    let mut prev_distances: Vec<usize> = (0..b_len + 1).collect();
    let mut curr_distances: Vec<usize> = vec![0; b_len + 1];

    let mut prev_a_char = char::MAX;
    let mut prev_b_char = char::MAX;

    for (i, a_char) in a.chars().enumerate() {
        curr_distances[0] = i + 1;

        for (j, b_char) in b.chars().enumerate() {
            let cost = usize::from(a_char != b_char);
            curr_distances[j + 1] = min(
                curr_distances[j] + 1,
                min(prev_distances[j + 1] + 1, prev_distances[j] + cost),
            );
            if i > 0 && j > 0 && a_char != b_char && a_char == prev_b_char && b_char == prev_a_char
            {
                curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1);
            }

            prev_b_char = b_char;
        }

        mem::swap(&mut prev_two_distances, &mut prev_distances);
        mem::swap(&mut prev_distances, &mut curr_distances);
        prev_a_char = a_char;
    }

    // access prev_distances instead of curr_distances since we swapped
    // them above. In case a is empty this would still contain the correct value
    // from initializing the last element to b_len
    prev_distances[b_len]
}

/* Returns the final index for a value in a single vector that represents a fixed
2d grid */
fn flat_index(i: usize, j: usize, width: usize) -> usize {
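    // e.g. with width = 4: flat_index(2, 1, 4) == 1 * 4 + 2 == 6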
    j * width + i
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::generic_damerau_levenshtein;
///
/// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1]));
/// ```
pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize
where
    Elem: Eq + Hash + Clone,
{
    let a_len = a_elems.len();
    let b_len = b_elems.len();

    if a_len == 0 {
        return b_len;
    }
    if b_len == 0 {
        return a_len;
    }

    let width = a_len + 2;
    let mut distances = vec![0; (a_len + 2) * (b_len + 2)];
    let max_distance = a_len + b_len;
    distances[0] = max_distance;

    for i in 0..(a_len + 1) {
        distances[flat_index(i + 1, 0, width)] = max_distance;
        distances[flat_index(i + 1, 1, width)] = i;
    }

    for j in 0..(b_len + 1) {
        distances[flat_index(0, j + 1, width)] = max_distance;
        distances[flat_index(1, j + 1, width)] = j;
    }

    let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64);

    for i in 1..(a_len + 1) {
        let mut db = 0;

        for j in 1..(b_len + 1) {
            let k = match elems.get(&b_elems[j - 1]) {
                Some(&value) => value,
                None => 0,
            };

            let insertion_cost = distances[flat_index(i, j + 1, width)] + 1;
            let deletion_cost = distances[flat_index(i + 1, j, width)] + 1;
            let transposition_cost =
                distances[flat_index(k, db, width)] + (i - k - 1) + 1 + (j - db - 1);

            let mut substitution_cost = distances[flat_index(i, j, width)] + 1;
            if a_elems[i - 1] == b_elems[j - 1] {
                db = j;
                substitution_cost -= 1;
            }

            distances[flat_index(i + 1, j + 1, width)] = min(
                substitution_cost,
                min(insertion_cost, min(deletion_cost, transposition_cost)),
            );
        }

        elems.insert(a_elems[i - 1].clone(), i);
    }

    distances[flat_index(a_len + 1, b_len + 1, width)]
}

#[derive(Clone, Copy, PartialEq, Eq)]
struct RowId {
    val: isize,
}

impl Default for RowId {
    fn default() -> Self {
        Self { val: -1 }
    }
}

#[derive(Default, Clone)]
struct GrowingHashmapMapElemChar<ValueType> {
    key: u32,
    value: ValueType,
}

/// specialized hashmap to store user provided types
/// this implementation relies on a couple of base assumptions in order to simplify the implementation
/// - the hashmap does not have an upper limit of included items
/// - the default value for the `ValueType` can be used as a dummy value to indicate an empty cell
/// - elements can't be removed
/// - only allocates memory on first write access.
///   This improves performance for hashmaps that are never written to
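///
/// A usage sketch for this internal type (assuming a `Copy + Default + Eq`
/// value type such as `i64`):
///
/// ```ignore
/// let mut map = GrowingHashmapChar::<i64>::default();
/// *map.get_mut('a' as u32) = 1; // first write allocates the backing storage
/// assert_eq!(1, map.get('a' as u32));
/// ```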
struct GrowingHashmapChar<ValueType> {
    used: i32,
    fill: i32,
    mask: i32,
    map: Option<Vec<GrowingHashmapMapElemChar<ValueType>>>,
}

impl<ValueType> Default for GrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Eq,
{
    fn default() -> Self {
        Self {
            used: 0,
            fill: 0,
            mask: -1,
            map: None,
        }
    }
}

impl<ValueType> GrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Eq + Copy,
{
    fn get(&self, key: u32) -> ValueType {
        self.map
            .as_ref()
            .map_or_else(|| Default::default(), |map| map[self.lookup(key)].value)
    }

    fn get_mut(&mut self, key: u32) -> &mut ValueType {
        if self.map.is_none() {
            self.allocate();
        }

        let mut i = self.lookup(key);
        if self
            .map
            .as_ref()
            .expect("map should have been created above")[i]
            .value
            == Default::default()
        {
            self.fill += 1;
            // resize when 2/3 full
            if self.fill * 3 >= (self.mask + 1) * 2 {
                self.grow((self.used + 1) * 2);
                i = self.lookup(key);
            }

            self.used += 1;
        }

        let elem = &mut self
            .map
            .as_mut()
            .expect("map should have been created above")[i];
        elem.key = key;
        &mut elem.value
    }

    fn allocate(&mut self) {
        self.mask = 8 - 1;
        self.map = Some(vec![GrowingHashmapMapElemChar::default(); 8]);
    }

    /// lookup key inside the hashmap using a similar collision resolution
    /// strategy to `CPython` and `Ruby`
    fn lookup(&self, key: u32) -> usize {
        let hash = key;
        let mut i = hash as usize & self.mask as usize;

        let map = self
            .map
            .as_ref()
            .expect("callers have to ensure map is allocated");

        if map[i].value == Default::default() || map[i].key == key {
            return i;
        }

        let mut perturb = key;
        loop {
            i = (i * 5 + perturb as usize + 1) & self.mask as usize;

            if map[i].value == Default::default() || map[i].key == key {
                return i;
            }

            perturb >>= 5;
        }
    }

    fn grow(&mut self, min_used: i32) {
        let mut new_size = self.mask + 1;
        while new_size <= min_used {
            new_size <<= 1;
        }

        self.fill = self.used;
        self.mask = new_size - 1;

        let old_map = std::mem::replace(
            self.map
                .as_mut()
                .expect("callers have to ensure map is allocated"),
            vec![GrowingHashmapMapElemChar::<ValueType>::default(); new_size as usize],
        );

        for elem in old_map {
            if elem.value != Default::default() {
                let j = self.lookup(elem.key);
                let new_elem = &mut self.map.as_mut().expect("map created above")[j];
                new_elem.key = elem.key;
                new_elem.value = elem.value;
                self.used -= 1;
                if self.used == 0 {
                    break;
                }
            }
        }

        self.used = self.fill;
    }
}

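/// Hashmap for `char` keys: keys in the extended ASCII range are stored in a
/// flat array, while all other keys fall back to the `GrowingHashmapChar`
/// above.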
struct HybridGrowingHashmapChar<ValueType> {
    map: GrowingHashmapChar<ValueType>,
    extended_ascii: [ValueType; 256],
}

impl<ValueType> HybridGrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Copy + Eq,
{
    fn get(&self, key: char) -> ValueType {
        let value: u32 = key as u32;
        if value <= 255 {
            let val_u8: u8 = u8::try_from(value).expect("we check the bounds above");
            self.extended_ascii[usize::from(val_u8)]
        } else {
            self.map.get(value)
        }
    }

    fn get_mut(&mut self, key: char) -> &mut ValueType {
        let value: u32 = key as u32;
        if value <= 255 {
            let val_u8: u8 = u8::try_from(value).expect("we check the bounds above");
            &mut self.extended_ascii[usize::from(val_u8)]
        } else {
            self.map.get_mut(value)
        }
    }
}

impl<ValueType> Default for HybridGrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Copy + Eq,
{
    fn default() -> Self {
        HybridGrowingHashmapChar {
            map: GrowingHashmapChar::default(),
            extended_ascii: [Default::default(); 256],
        }
    }
}

fn damerau_levenshtein_impl<Iter1, Iter2>(s1: Iter1, len1: usize, s2: Iter2, len2: usize) -> usize
where
    Iter1: Iterator<Item = char> + Clone,
    Iter2: Iterator<Item = char> + Clone,
{
    // The implementation is based on the paper
    // `Linear space string correction algorithm using the Damerau-Levenshtein distance`
    // from Chunchun Zhao and Sartaj Sahni
    //
    // It has a runtime complexity of `O(N*M)` and a memory usage of `O(N+M)`.
    let max_val = max(len1, len2) as isize + 1;

    let mut last_row_id = HybridGrowingHashmapChar::<RowId>::default();

    let size = len2 + 2;
    let mut fr = vec![max_val; size];
    let mut r1 = vec![max_val; size];
    let mut r: Vec<isize> = (max_val..max_val + 1)
        .chain(0..(size - 1) as isize)
        .collect();

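    // r holds the DP row currently being computed, r1 the previous row, and
    // fr the values needed for the transposition case of the recurrence.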
    for (i, ch1) in s1.enumerate().map(|(i, ch1)| (i + 1, ch1)) {
        mem::swap(&mut r, &mut r1);
        let mut last_col_id: isize = -1;
        let mut last_i2l1 = r[1];
        r[1] = i as isize;
        let mut t = max_val;

        for (j, ch2) in s2.clone().enumerate().map(|(j, ch2)| (j + 1, ch2)) {
            let diag = r1[j] + isize::from(ch1 != ch2);
            let left = r[j] + 1;
            let up = r1[j + 1] + 1;
            let mut temp = min(diag, min(left, up));

            if ch1 == ch2 {
                last_col_id = j as isize; // last occurrence of s1_i
                fr[j + 1] = r1[j - 1]; // save H_k-1,j-2
                t = last_i2l1; // save H_i-2,l-1
            } else {
                let k = last_row_id.get(ch2).val;
                let l = last_col_id;

                if j as isize - l == 1 {
                    let transpose = fr[j + 1] + (i as isize - k);
                    temp = min(temp, transpose);
                } else if i as isize - k == 1 {
                    let transpose = t + (j as isize - l);
                    temp = min(temp, transpose);
                }
            }

            last_i2l1 = r[j + 1];
            r[j + 1] = temp;
        }
        last_row_id.get_mut(ch1).val = i as isize;
    }

    r[len2 + 1] as usize
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
    damerau_levenshtein_impl(a.chars(), a.chars().count(), b.chars(), b.chars().count())
}

/// Calculates a normalized score of the Damerau–Levenshtein algorithm between
/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_damerau_levenshtein;
///
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }

    let len1: usize = a.chars().count();
    let len2: usize = b.chars().count();
    let dist: usize = damerau_levenshtein_impl(a.chars(), len1, b.chars(), len2);
    1.0 - (dist as f64) / (max(len1, len2) as f64)
}

/// Returns an Iterator of char tuples.
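/// e.g. "abc" yields ('a', 'b') and ('b', 'c').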
fn bigrams(s: &str) -> impl Iterator<Item = (char, char)> + '_ {
    s.chars().zip(s.chars().skip(1))
}

/// Calculates a Sørensen-Dice similarity distance using bigrams.
/// See <https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient>.
///
/// ```
/// use strsim::sorensen_dice;
///
/// assert_eq!(1.0, sorensen_dice("", ""));
/// assert_eq!(0.0, sorensen_dice("", "a"));
/// assert_eq!(0.0, sorensen_dice("french", "quebec"));
/// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris"));
/// ```
pub fn sorensen_dice(a: &str, b: &str) -> f64 {
    // implementation guided by
    // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6

    let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect();
    let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect();

    if a == b {
        return 1.0;
    }

    if a.len() < 2 || b.len() < 2 {
        return 0.0;
    }

    let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new();

    for bigram in bigrams(&a) {
        *a_bigrams.entry(bigram).or_insert(0) += 1;
    }

    let mut intersection_size = 0_usize;

    for bigram in bigrams(&b) {
        a_bigrams.entry(bigram).and_modify(|bi| {
            if *bi > 0 {
                *bi -= 1;
                intersection_size += 1;
            }
        });
    }

    (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64
}

#[cfg(test)]
mod tests {
    use super::*;

    macro_rules! assert_delta {
        ($x:expr, $y:expr) => {
            assert_delta!($x, $y, 1e-5);
        };
        ($x:expr, $y:expr, $d:expr) => {
            if ($x - $y).abs() > $d {
                panic!(
                    "assertion failed: actual: `{}`, expected: `{}`: \
                     actual not within < {} of expected",
                    $x, $y, $d
                );
            }
        };
    }

    #[test]
    fn bigrams_iterator() {
        let mut bi = bigrams("abcde");

        assert_eq!(Some(('a', 'b')), bi.next());
        assert_eq!(Some(('b', 'c')), bi.next());
        assert_eq!(Some(('c', 'd')), bi.next());
        assert_eq!(Some(('d', 'e')), bi.next());
        assert_eq!(None, bi.next());
    }

    fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
        assert_eq!(Ok(dist), hamming(str1, str2));
    }

    #[test]
    fn hamming_empty() {
        assert_hamming_dist(0, "", "")
    }

    #[test]
    fn hamming_same() {
        assert_hamming_dist(0, "hamming", "hamming")
    }

    #[test]
    fn hamming_numbers() {
        assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
    }

    #[test]
    fn hamming_diff() {
        assert_hamming_dist(3, "hamming", "hammers")
    }

    #[test]
    fn hamming_diff_multibyte() {
        assert_hamming_dist(2, "hamming", "h香mmüng");
    }

    #[test]
    fn hamming_unequal_length() {
        assert_eq!(
            Err(StrSimError::DifferentLengthArgs),
            generic_hamming("ham".chars(), "hamming".chars())
        );
    }

    #[test]
    fn hamming_names() {
        assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
    }

    #[test]
    fn jaro_both_empty() {
        assert_eq!(1.0, jaro("", ""));
    }

    #[test]
    fn jaro_first_empty() {
        assert_eq!(0.0, jaro("", "jaro"));
    }

    #[test]
    fn jaro_second_empty() {
        assert_eq!(0.0, jaro("distance", ""));
    }

    #[test]
    fn jaro_same() {
        assert_eq!(1.0, jaro("jaro", "jaro"));
    }

    #[test]
    fn jaro_multibyte() {
        assert_delta!(0.818, jaro("testabctest", "testöঙ香test"), 0.001);
        assert_delta!(0.818, jaro("testöঙ香test", "testabctest"), 0.001);
    }

    #[test]
    fn jaro_diff_short() {
        assert_delta!(0.767, jaro("dixon", "dicksonx"), 0.001);
    }

    #[test]
    fn jaro_diff_one_character() {
        assert_eq!(0.0, jaro("a", "b"));
    }

    #[test]
    fn jaro_same_one_character() {
        assert_eq!(1.0, jaro("a", "a"));
    }

    #[test]
    fn generic_jaro_diff() {
        assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
    }

    #[test]
    fn jaro_diff_one_and_two() {
        assert_delta!(0.83, jaro("a", "ab"), 0.01);
    }

    #[test]
    fn jaro_diff_two_and_one() {
        assert_delta!(0.83, jaro("ab", "a"), 0.01);
    }

    #[test]
    fn jaro_diff_no_transposition() {
        assert_delta!(0.822, jaro("dwayne", "duane"), 0.001);
    }

    #[test]
    fn jaro_diff_with_transposition() {
        assert_delta!(0.944, jaro("martha", "marhta"), 0.001);
        assert_delta!(0.6, jaro("a jke", "jane a k"), 0.001);
    }

    #[test]
    fn jaro_names() {
        assert_delta!(
            0.392,
            jaro("Friedrich Nietzsche", "Jean-Paul Sartre"),
            0.001
        );
    }

    #[test]
    fn jaro_winkler_both_empty() {
        assert_eq!(1.0, jaro_winkler("", ""));
    }

    #[test]
    fn jaro_winkler_first_empty() {
        assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
    }

    #[test]
    fn jaro_winkler_second_empty() {
        assert_eq!(0.0, jaro_winkler("distance", ""));
    }

    #[test]
    fn jaro_winkler_same() {
        assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
    }

    #[test]
    fn jaro_winkler_multibyte() {
        assert_delta!(0.89, jaro_winkler("testabctest", "testöঙ香test"), 0.001);
        assert_delta!(0.89, jaro_winkler("testöঙ香test", "testabctest"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_short() {
        assert_delta!(0.813, jaro_winkler("dixon", "dicksonx"), 0.001);
        assert_delta!(0.813, jaro_winkler("dicksonx", "dixon"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_one_character() {
        assert_eq!(0.0, jaro_winkler("a", "b"));
    }

    #[test]
    fn jaro_winkler_same_one_character() {
        assert_eq!(1.0, jaro_winkler("a", "a"));
    }

    #[test]
    fn jaro_winkler_diff_no_transposition() {
        assert_delta!(0.84, jaro_winkler("dwayne", "duane"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_with_transposition() {
        assert_delta!(0.961, jaro_winkler("martha", "marhta"), 0.001);
        assert_delta!(0.6, jaro_winkler("a jke", "jane a k"), 0.001);
    }

    #[test]
    fn jaro_winkler_names() {
        assert_delta!(
            0.452,
            jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"),
            0.001
        );
    }

    #[test]
    fn jaro_winkler_long_prefix() {
        assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001);
    }

    #[test]
    fn jaro_winkler_more_names() {
        assert_delta!(0.868, jaro_winkler("Thorkel", "Thorgier"), 0.001);
    }

    #[test]
    fn jaro_winkler_length_of_one() {
        assert_delta!(0.738, jaro_winkler("Dinsdale", "D"), 0.001);
    }

    #[test]
    fn jaro_winkler_very_long_prefix() {
        assert_delta!(
            0.98519,
            jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery")
        );
    }

    #[test]
    fn levenshtein_empty() {
        assert_eq!(0, levenshtein("", ""));
    }

    #[test]
    fn levenshtein_same() {
        assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
    }

    #[test]
    fn levenshtein_diff_short() {
        assert_eq!(3, levenshtein("kitten", "sitting"));
    }

    #[test]
    fn levenshtein_diff_with_space() {
        assert_eq!(5, levenshtein("hello, world", "bye, world"));
    }

    #[test]
    fn levenshtein_diff_multibyte() {
        assert_eq!(3, levenshtein("öঙ香", "abc"));
        assert_eq!(3, levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(37, levenshtein(a, b));
    }

    #[test]
    fn levenshtein_first_empty() {
        assert_eq!(7, levenshtein("", "sitting"));
    }

    #[test]
    fn levenshtein_second_empty() {
        assert_eq!(6, levenshtein("kitten", ""));
    }

    #[test]
    fn normalized_levenshtein_diff_short() {
        assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting"));
    }

    #[test]
    fn normalized_levenshtein_for_empty_strings() {
        assert_delta!(1.0, normalized_levenshtein("", ""));
    }

    #[test]
    fn normalized_levenshtein_first_empty() {
        assert_delta!(0.0, normalized_levenshtein("", "second"));
    }

    #[test]
    fn normalized_levenshtein_second_empty() {
        assert_delta!(0.0, normalized_levenshtein("first", ""));
    }

    #[test]
    fn normalized_levenshtein_identical_strings() {
        assert_delta!(1.0, normalized_levenshtein("identical", "identical"));
    }

    #[test]
    fn osa_distance_empty() {
        assert_eq!(0, osa_distance("", ""));
    }

    #[test]
    fn osa_distance_same() {
        assert_eq!(0, osa_distance("damerau", "damerau"));
    }

    #[test]
    fn osa_distance_first_empty() {
        assert_eq!(7, osa_distance("", "damerau"));
    }

    #[test]
    fn osa_distance_second_empty() {
        assert_eq!(7, osa_distance("damerau", ""));
    }

    #[test]
    fn osa_distance_diff() {
        assert_eq!(3, osa_distance("ca", "abc"));
    }

    #[test]
    fn osa_distance_diff_short() {
        assert_eq!(3, osa_distance("damerau", "aderua"));
    }

    #[test]
    fn osa_distance_diff_reversed() {
        assert_eq!(3, osa_distance("aderua", "damerau"));
    }

    #[test]
    fn osa_distance_diff_multibyte() {
        assert_eq!(3, osa_distance("öঙ香", "abc"));
        assert_eq!(3, osa_distance("abc", "öঙ香"));
    }

    #[test]
    fn osa_distance_diff_unequal_length() {
        assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
    }

    #[test]
    fn osa_distance_diff_unequal_length_reversed() {
        assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
    }

    #[test]
    fn osa_distance_diff_comedians() {
        assert_eq!(5, osa_distance("Stewart", "Colbert"));
    }

    #[test]
    fn osa_distance_many_transpositions() {
        assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn osa_distance_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, osa_distance(a, b));
    }

    #[test]
    fn osa_distance_beginning_transposition() {
        assert_eq!(1, osa_distance("foobar", "ofobar"));
    }

    #[test]
    fn osa_distance_end_transposition() {
        assert_eq!(1, osa_distance("specter", "spectre"));
    }

    #[test]
    fn osa_distance_restricted_edit() {
        assert_eq!(4, osa_distance("a cat", "an abct"));
    }

    #[test]
    fn damerau_levenshtein_empty() {
        assert_eq!(0, damerau_levenshtein("", ""));
    }

    #[test]
    fn damerau_levenshtein_same() {
        assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_first_empty() {
        assert_eq!(7, damerau_levenshtein("", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_second_empty() {
        assert_eq!(7, damerau_levenshtein("damerau", ""));
    }

    #[test]
    fn damerau_levenshtein_diff() {
        assert_eq!(2, damerau_levenshtein("ca", "abc"));
    }

    #[test]
    fn damerau_levenshtein_diff_short() {
        assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    }

    #[test]
    fn damerau_levenshtein_diff_reversed() {
        assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_multibyte() {
        assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
        assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length() {
        assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length_reversed() {
        assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_comedians() {
        assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
    }

    #[test]
    fn damerau_levenshtein_many_transpositions() {
        assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn damerau_levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, damerau_levenshtein(a, b));
    }

    #[test]
    fn damerau_levenshtein_beginning_transposition() {
        assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
    }

    #[test]
    fn damerau_levenshtein_end_transposition() {
        assert_eq!(1, damerau_levenshtein("specter", "spectre"));
    }

    #[test]
    fn damerau_levenshtein_unrestricted_edit() {
        assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
    }

    #[test]
    fn normalized_damerau_levenshtein_diff_short() {
        assert_delta!(
            0.27272,
            normalized_damerau_levenshtein("levenshtein", "löwenbräu")
        );
    }

    #[test]
    fn normalized_damerau_levenshtein_for_empty_strings() {
        assert_delta!(1.0, normalized_damerau_levenshtein("", ""));
    }

    #[test]
    fn normalized_damerau_levenshtein_first_empty() {
        assert_delta!(0.0, normalized_damerau_levenshtein("", "flower"));
    }

    #[test]
    fn normalized_damerau_levenshtein_second_empty() {
        assert_delta!(0.0, normalized_damerau_levenshtein("tree", ""));
    }

    #[test]
    fn normalized_damerau_levenshtein_identical_strings() {
        assert_delta!(
            1.0,
            normalized_damerau_levenshtein("sunglasses", "sunglasses")
        );
    }

    #[test]
    fn sorensen_dice_all() {
        // test cases taken from
        // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11

        assert_delta!(1.0, sorensen_dice("a", "a"));
        assert_delta!(0.0, sorensen_dice("a", "b"));
        assert_delta!(1.0, sorensen_dice("", ""));
        assert_delta!(0.0, sorensen_dice("a", ""));
        assert_delta!(0.0, sorensen_dice("", "a"));
        assert_delta!(1.0, sorensen_dice("apple event", "apple event"));
        assert_delta!(0.90909, sorensen_dice("iphone", "iphone x"));
        assert_delta!(0.0, sorensen_dice("french", "quebec"));
        assert_delta!(1.0, sorensen_dice("france", "france"));
        assert_delta!(0.2, sorensen_dice("fRaNce", "france"));
        assert_delta!(0.8, sorensen_dice("healed", "sealed"));
        assert_delta!(
            0.78788,
            sorensen_dice("web applications", "applications of the web")
        );
        assert_delta!(
            0.92,
            sorensen_dice(
                "this will have a typo somewhere",
                "this will huve a typo somewhere"
            )
        );
        assert_delta!(
            0.60606,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: table in very good condition, olive green in colour."
            )
        );
        assert_delta!(
            0.25581,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: green Subaru Impreza, 210,000 miles"
            )
        );
        assert_delta!(
            0.14118,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "Wanted: mountain bike with at least 21 gears."
            )
        );
        assert_delta!(
            0.77419,
            sorensen_dice("this has one extra word", "this has one word")
        );
    }
}