1 | //! This library implements string similarity metrics. |
2 | |
3 | #![forbid (unsafe_code)] |
4 | |
5 | use std::char; |
6 | use std::cmp::{max, min}; |
7 | use std::collections::HashMap; |
8 | use std::error::Error; |
9 | use std::fmt::{self, Display, Formatter}; |
10 | use std::hash::Hash; |
11 | use std::str::Chars; |
12 | |
/// The error type for the fallible metrics in this crate.
#[derive(Debug, PartialEq)]
pub enum StrSimError {
    /// Returned when a metric requires both arguments to have the same
    /// length (e.g. Hamming) and they do not.
    DifferentLengthArgs,
}
17 | |
18 | impl Display for StrSimError { |
19 | fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { |
20 | let text: &str = match self { |
21 | StrSimError::DifferentLengthArgs => "Differing length arguments provided" , |
22 | }; |
23 | |
24 | write!(fmt, " {}" , text) |
25 | } |
26 | } |
27 | |
// Marker impl: `Display` + `Debug` above satisfy the `Error` contract.
impl Error for StrSimError {}

/// Result type returned by the Hamming metrics; the `Ok` value is the
/// number of differing positions.
pub type HammingResult = Result<usize, StrSimError>;
31 | |
32 | /// Calculates the number of positions in the two sequences where the elements |
33 | /// differ. Returns an error if the sequences have different lengths. |
34 | pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult |
35 | where Iter1: IntoIterator<Item=Elem1>, |
36 | Iter2: IntoIterator<Item=Elem2>, |
37 | Elem1: PartialEq<Elem2> { |
38 | let (mut ita: Iter1, mut itb: Iter2) = (a.into_iter(), b.into_iter()); |
39 | let mut count: usize = 0; |
40 | loop { |
41 | match (ita.next(), itb.next()){ |
42 | (Some(x), Some(y)) => if x != y { count += 1 }, |
43 | (None, None) => return Ok(count), |
44 | _ => return Err(StrSimError::DifferentLengthArgs), |
45 | } |
46 | } |
47 | } |
48 | |
49 | /// Calculates the number of positions in the two strings where the characters |
50 | /// differ. Returns an error if the strings have different lengths. |
51 | /// |
52 | /// ``` |
53 | /// use strsim::{hamming, StrSimError::DifferentLengthArgs}; |
54 | /// |
55 | /// assert_eq!(Ok(3), hamming("hamming" , "hammers" )); |
56 | /// |
57 | /// assert_eq!(Err(DifferentLengthArgs), hamming("hamming" , "ham" )); |
58 | /// ``` |
59 | pub fn hamming(a: &str, b: &str) -> HammingResult { |
60 | generic_hamming(a:a.chars(), b:b.chars()) |
61 | } |
62 | |
/// Calculates the Jaro similarity between two sequences. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
    where &'a Iter1: IntoIterator<Item=Elem1>,
          &'b Iter2: IntoIterator<Item=Elem2>,
          Elem1: PartialEq<Elem2> {
    let a_len = a.into_iter().count();
    let b_len = b.into_iter().count();

    // The check for lengths of one here is to prevent integer overflow when
    // calculating the search range.
    if a_len == 0 && b_len == 0 {
        return 1.0;
    } else if a_len == 0 || b_len == 0 {
        return 0.0;
    } else if a_len == 1 && b_len == 1 {
        return if a.into_iter().eq(b.into_iter()) { 1.0 } else { 0.0 };
    }

    // Matches may only be taken from a window of this size around each index.
    let search_range = (max(a_len, b_len) / 2) - 1;

    // Tracks which elements of `b` have already been matched against `a`.
    let mut b_consumed = vec![false; b_len];
    let mut matches = 0.0;

    let mut transpositions = 0.0;
    let mut b_match_index = 0;

    for (i, a_elem) in a.into_iter().enumerate() {
        // Lower bound of the window; the guard prevents usize wrapping, and
        // when `i > search_range` the difference is already non-negative.
        let min_bound = if i > search_range { i - search_range } else { 0 };

        let max_bound = min(b_len - 1, i + search_range);

        if min_bound > max_bound {
            continue;
        }

        for (j, b_elem) in b.into_iter().enumerate() {
            if min_bound <= j && j <= max_bound && a_elem == b_elem &&
               !b_consumed[j] {
                b_consumed[j] = true;
                matches += 1.0;

                // Matches arriving out of order count as transpositions.
                if j < b_match_index {
                    transpositions += 1.0;
                }
                b_match_index = j;

                break;
            }
        }
    }

    if matches == 0.0 {
        0.0
    } else {
        (1.0 / 3.0) * ((matches / a_len as f64) +
                       (matches / b_len as f64) +
                       ((matches - transpositions) / matches))
    }
}
132 | |
/// Newtype that lets a borrowed `&str` act as an `IntoIterator` of `char`s,
/// so the generic metrics above can be reused for plain strings.
struct StringWrapper<'a>(&'a str);

impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> {
    type Item = char;
    type IntoIter = Chars<'b>;

    // Iterate over the wrapped string's code points.
    fn into_iter(self) -> Self::IntoIter {
        self.0.chars()
    }
}
143 | |
144 | /// Calculates the Jaro similarity between two strings. The returned value |
145 | /// is between 0.0 and 1.0 (higher value means more similar). |
146 | /// |
147 | /// ``` |
148 | /// use strsim::jaro; |
149 | /// |
150 | /// assert!((0.392 - jaro("Friedrich Nietzsche" , "Jean-Paul Sartre" )).abs() < |
151 | /// 0.001); |
152 | /// ``` |
153 | pub fn jaro(a: &str, b: &str) -> f64 { |
154 | generic_jaro(&StringWrapper(a), &StringWrapper(b)) |
155 | } |
156 | |
157 | /// Like Jaro but gives a boost to sequences that have a common prefix. |
158 | pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 |
159 | where &'a Iter1: IntoIterator<Item=Elem1>, |
160 | &'b Iter2: IntoIterator<Item=Elem2>, |
161 | Elem1: PartialEq<Elem2> { |
162 | let jaro_distance: f64 = generic_jaro(a, b); |
163 | |
164 | // Don't limit the length of the common prefix |
165 | let prefix_length: usize = aTakeWhile, …>.into_iter() |
166 | .zip(b.into_iter()) |
167 | .take_while(|&(ref a_elem: &{unknown}, ref b_elem: &{unknown})| a_elem == b_elem) |
168 | .count(); |
169 | |
170 | let jaro_winkler_distance: f64 = |
171 | jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance)); |
172 | |
173 | if jaro_winkler_distance <= 1.0 { |
174 | jaro_winkler_distance |
175 | } else { |
176 | 1.0 |
177 | } |
178 | } |
179 | |
180 | /// Like Jaro but gives a boost to strings that have a common prefix. |
181 | /// |
182 | /// ``` |
183 | /// use strsim::jaro_winkler; |
184 | /// |
185 | /// assert!((0.911 - jaro_winkler("cheeseburger" , "cheese fries" )).abs() < |
186 | /// 0.001); |
187 | /// ``` |
188 | pub fn jaro_winkler(a: &str, b: &str) -> f64 { |
189 | generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b)) |
190 | } |
191 | |
/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one sequence into the other.
///
/// ```
/// use strsim::generic_levenshtein;
///
/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6]));
/// ```
pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
    where &'a Iter1: IntoIterator<Item=Elem1>,
          &'b Iter2: IntoIterator<Item=Elem2>,
          Elem1: PartialEq<Elem2> {
    let b_len = b.into_iter().count();

    // Empty `a`: the distance is simply the length of `b`.
    if a.into_iter().next().is_none() { return b_len; }

    // Single rolling row of the DP table, seeded with 1..=b_len.
    let mut cache: Vec<usize> = (1..b_len + 1).collect();

    let mut result = 0;

    for (i, a_elem) in a.into_iter().enumerate() {
        result = i + 1;
        let mut distance_b = i;

        for (j, b_elem) in b.into_iter().enumerate() {
            let cost = if a_elem == b_elem { 0 } else { 1 };
            let distance_a = distance_b + cost;
            distance_b = cache[j];
            // Minimum of deletion, substitution, and insertion. The previous
            // version used invalid named-argument syntax (`min(v1:…, v2:…)`).
            result = min(result + 1, min(distance_a, distance_b + 1));
            cache[j] = result;
        }
    }

    result
}
227 | |
228 | /// Calculates the minimum number of insertions, deletions, and substitutions |
229 | /// required to change one string into the other. |
230 | /// |
231 | /// ``` |
232 | /// use strsim::levenshtein; |
233 | /// |
234 | /// assert_eq!(3, levenshtein("kitten" , "sitting" )); |
235 | /// ``` |
236 | pub fn levenshtein(a: &str, b: &str) -> usize { |
237 | generic_levenshtein(&StringWrapper(a), &StringWrapper(b)) |
238 | } |
239 | |
240 | /// Calculates a normalized score of the Levenshtein algorithm between 0.0 and |
241 | /// 1.0 (inclusive), where 1.0 means the strings are the same. |
242 | /// |
243 | /// ``` |
244 | /// use strsim::normalized_levenshtein; |
245 | /// |
246 | /// assert!((normalized_levenshtein("kitten" , "sitting" ) - 0.57142).abs() < 0.00001); |
247 | /// assert!((normalized_levenshtein("" , "" ) - 1.0).abs() < 0.00001); |
248 | /// assert!(normalized_levenshtein("" , "second" ).abs() < 0.00001); |
249 | /// assert!(normalized_levenshtein("first" , "" ).abs() < 0.00001); |
250 | /// assert!((normalized_levenshtein("string" , "string" ) - 1.0).abs() < 0.00001); |
251 | /// ``` |
252 | pub fn normalized_levenshtein(a: &str, b: &str) -> f64 { |
253 | if a.is_empty() && b.is_empty() { |
254 | return 1.0; |
255 | } |
256 | 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) |
257 | } |
258 | |
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
    let a_len = a.chars().count();
    let b_len = b.chars().count();
    // Trivial cases: identical strings and empty inputs.
    if a == b { return 0; }
    else if a_len == 0 { return b_len; }
    else if b_len == 0 { return a_len; }

    // Rolling rows of the DP table. `prev_two_distances` (the row before the
    // previous one) is needed to cost adjacent transpositions.
    let mut prev_two_distances: Vec<usize> = (0..b_len + 1).collect();
    let mut prev_distances: Vec<usize> = (0..b_len + 1).collect();
    let mut curr_distances: Vec<usize> = vec![0; b_len + 1];

    // Sentinels that cannot match before the first real iteration.
    let mut prev_a_char = char::MAX;
    let mut prev_b_char = char::MAX;

    for (i, a_char) in a.chars().enumerate() {
        curr_distances[0] = i + 1;

        for (j, b_char) in b.chars().enumerate() {
            let cost = if a_char == b_char { 0 } else { 1 };
            // Minimum of insertion, deletion, and substitution.
            curr_distances[j + 1] = min(curr_distances[j] + 1,
                                        min(prev_distances[j + 1] + 1,
                                            prev_distances[j] + cost));
            // Adjacent transposition of the previous pair of characters.
            if i > 0 && j > 0 && a_char != b_char &&
               a_char == prev_b_char && b_char == prev_a_char {
                curr_distances[j + 1] = min(curr_distances[j + 1],
                                            prev_two_distances[j - 1] + 1);
            }

            prev_b_char = b_char;
        }

        // Rotate the rows, reusing each vector's allocation.
        prev_two_distances.clone_from(&prev_distances);
        prev_distances.clone_from(&curr_distances);
        prev_a_char = a_char;
    }

    curr_distances[b_len]
}
312 | |
/// Maps a `(column, row)` coordinate onto a single vector that stores a
/// fixed-size 2d grid in row-major order.
fn flat_index(i: usize, j: usize, width: usize) -> usize {
    width * j + i
}
318 | |
319 | /// Like optimal string alignment, but substrings can be edited an unlimited |
320 | /// number of times, and the triangle inequality holds. |
321 | /// |
322 | /// ``` |
323 | /// use strsim::generic_damerau_levenshtein; |
324 | /// |
325 | /// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1])); |
326 | /// ``` |
327 | pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize |
328 | where Elem: Eq + Hash + Clone { |
329 | let a_len = a_elems.len(); |
330 | let b_len = b_elems.len(); |
331 | |
332 | if a_len == 0 { return b_len; } |
333 | if b_len == 0 { return a_len; } |
334 | |
335 | let width = a_len + 2; |
336 | let mut distances = vec![0; (a_len + 2) * (b_len + 2)]; |
337 | let max_distance = a_len + b_len; |
338 | distances[0] = max_distance; |
339 | |
340 | for i in 0..(a_len + 1) { |
341 | distances[flat_index(i + 1, 0, width)] = max_distance; |
342 | distances[flat_index(i + 1, 1, width)] = i; |
343 | } |
344 | |
345 | for j in 0..(b_len + 1) { |
346 | distances[flat_index(0, j + 1, width)] = max_distance; |
347 | distances[flat_index(1, j + 1, width)] = j; |
348 | } |
349 | |
350 | let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64); |
351 | |
352 | for i in 1..(a_len + 1) { |
353 | let mut db = 0; |
354 | |
355 | for j in 1..(b_len + 1) { |
356 | let k = match elems.get(&b_elems[j - 1]) { |
357 | Some(&value) => value, |
358 | None => 0 |
359 | }; |
360 | |
361 | let insertion_cost = distances[flat_index(i, j + 1, width)] + 1; |
362 | let deletion_cost = distances[flat_index(i + 1, j, width)] + 1; |
363 | let transposition_cost = distances[flat_index(k, db, width)] + |
364 | (i - k - 1) + 1 + (j - db - 1); |
365 | |
366 | let mut substitution_cost = distances[flat_index(i, j, width)] + 1; |
367 | if a_elems[i - 1] == b_elems[j - 1] { |
368 | db = j; |
369 | substitution_cost -= 1; |
370 | } |
371 | |
372 | distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost, |
373 | min(insertion_cost, min(deletion_cost, transposition_cost))); |
374 | } |
375 | |
376 | elems.insert(a_elems[i - 1].clone(), i); |
377 | } |
378 | |
379 | distances[flat_index(a_len + 1, b_len + 1, width)] |
380 | } |
381 | |
382 | /// Like optimal string alignment, but substrings can be edited an unlimited |
383 | /// number of times, and the triangle inequality holds. |
384 | /// |
385 | /// ``` |
386 | /// use strsim::damerau_levenshtein; |
387 | /// |
388 | /// assert_eq!(2, damerau_levenshtein("ab" , "bca" )); |
389 | /// ``` |
390 | pub fn damerau_levenshtein(a: &str, b: &str) -> usize { |
391 | let (x: Vec, y: Vec): (Vec<_>, Vec<_>) = (a.chars().collect(), b.chars().collect()); |
392 | generic_damerau_levenshtein(a_elems:x.as_slice(), b_elems:y.as_slice()) |
393 | } |
394 | |
395 | /// Calculates a normalized score of the Damerau–Levenshtein algorithm between |
396 | /// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same. |
397 | /// |
398 | /// ``` |
399 | /// use strsim::normalized_damerau_levenshtein; |
400 | /// |
401 | /// assert!((normalized_damerau_levenshtein("levenshtein" , "löwenbräu" ) - 0.27272).abs() < 0.00001); |
402 | /// assert!((normalized_damerau_levenshtein("" , "" ) - 1.0).abs() < 0.00001); |
403 | /// assert!(normalized_damerau_levenshtein("" , "flower" ).abs() < 0.00001); |
404 | /// assert!(normalized_damerau_levenshtein("tree" , "" ).abs() < 0.00001); |
405 | /// assert!((normalized_damerau_levenshtein("sunglasses" , "sunglasses" ) - 1.0).abs() < 0.00001); |
406 | /// ``` |
407 | pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 { |
408 | if a.is_empty() && b.is_empty() { |
409 | return 1.0; |
410 | } |
411 | 1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) |
412 | } |
413 | |
/// Returns an Iterator over the adjacent character pairs of `s`.
fn bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_ {
    // Pair every character with its successor; strings shorter than two
    // characters therefore yield nothing.
    let firsts = s.chars();
    let seconds = s.chars().skip(1);
    firsts.zip(seconds)
}
418 | |
419 | |
420 | /// Calculates a Sørensen-Dice similarity distance using bigrams. |
421 | /// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. |
422 | /// |
423 | /// ``` |
424 | /// use strsim::sorensen_dice; |
425 | /// |
426 | /// assert_eq!(1.0, sorensen_dice("" , "" )); |
427 | /// assert_eq!(0.0, sorensen_dice("" , "a" )); |
428 | /// assert_eq!(0.0, sorensen_dice("french" , "quebec" )); |
429 | /// assert_eq!(1.0, sorensen_dice("ferris" , "ferris" )); |
430 | /// assert_eq!(1.0, sorensen_dice("ferris" , "ferris" )); |
431 | /// assert_eq!(0.8888888888888888, sorensen_dice("feris" , "ferris" )); |
432 | /// ``` |
433 | pub fn sorensen_dice(a: &str, b: &str) -> f64 { |
434 | // implementation guided by |
435 | // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6 |
436 | |
437 | let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect(); |
438 | let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect(); |
439 | |
440 | if a.len() == 0 && b.len() == 0 { |
441 | return 1.0; |
442 | } |
443 | |
444 | if a.len() == 0 || b.len() == 0 { |
445 | return 0.0; |
446 | } |
447 | |
448 | if a == b { |
449 | return 1.0; |
450 | } |
451 | |
452 | if a.len() == 1 && b.len() == 1 { |
453 | return 0.0; |
454 | } |
455 | |
456 | if a.len() < 2 || b.len() < 2 { |
457 | return 0.0; |
458 | } |
459 | |
460 | let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new(); |
461 | |
462 | for bigram in bigrams(&a) { |
463 | *a_bigrams.entry(bigram).or_insert(0) += 1; |
464 | } |
465 | |
466 | let mut intersection_size = 0; |
467 | |
468 | for bigram in bigrams(&b) { |
469 | a_bigrams.entry(bigram).and_modify(|bi| { |
470 | if *bi > 0 { |
471 | *bi -= 1; |
472 | intersection_size += 1; |
473 | } |
474 | }); |
475 | } |
476 | |
477 | (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 |
478 | } |
479 | |
480 | |
#[cfg(test)]
mod tests {
    use super::*;

    // --- bigrams helper ---

    #[test]
    fn bigrams_iterator() {
        let mut bi = bigrams("abcde");

        assert_eq!(Some(('a', 'b')), bi.next());
        assert_eq!(Some(('b', 'c')), bi.next());
        assert_eq!(Some(('c', 'd')), bi.next());
        assert_eq!(Some(('d', 'e')), bi.next());
        assert_eq!(None, bi.next());
    }

    // --- Hamming ---

    // Helper: asserts `hamming` succeeds with the expected distance.
    fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
        assert_eq!(Ok(dist), hamming(str1, str2));
    }

    #[test]
    fn hamming_empty() {
        assert_hamming_dist(0, "", "")
    }

    #[test]
    fn hamming_same() {
        assert_hamming_dist(0, "hamming", "hamming")
    }

    #[test]
    fn hamming_numbers() {
        assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
    }

    #[test]
    fn hamming_diff() {
        assert_hamming_dist(3, "hamming", "hammers")
    }

    // Multibyte characters must count as single positions, not bytes.
    #[test]
    fn hamming_diff_multibyte() {
        assert_hamming_dist(2, "hamming", "h香mmüng");
    }

    #[test]
    fn hamming_unequal_length() {
        assert_eq!(
            Err(StrSimError::DifferentLengthArgs),
            generic_hamming("ham".chars(), "hamming".chars())
        );
    }

    #[test]
    fn hamming_names() {
        assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
    }

    // --- Jaro ---

    #[test]
    fn jaro_both_empty() {
        assert_eq!(1.0, jaro("", ""));
    }

    #[test]
    fn jaro_first_empty() {
        assert_eq!(0.0, jaro("", "jaro"));
    }

    #[test]
    fn jaro_second_empty() {
        assert_eq!(0.0, jaro("distance", ""));
    }

    #[test]
    fn jaro_same() {
        assert_eq!(1.0, jaro("jaro", "jaro"));
    }

    #[test]
    fn jaro_multibyte() {
        assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001);
        assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001);
    }

    #[test]
    fn jaro_diff_short() {
        assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_one_character() {
        assert_eq!(0.0, jaro("a", "b"));
    }

    #[test]
    fn jaro_same_one_character() {
        assert_eq!(1.0, jaro("a", "a"));
    }

    #[test]
    fn generic_jaro_diff() {
        assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
    }

    #[test]
    fn jaro_diff_one_and_two() {
        assert!((0.83 - jaro("a", "ab")).abs() < 0.01);
    }

    #[test]
    fn jaro_diff_two_and_one() {
        assert!((0.83 - jaro("ab", "a")).abs() < 0.01);
    }

    #[test]
    fn jaro_diff_no_transposition() {
        assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_with_transposition() {
        assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001);
    }

    #[test]
    fn jaro_names() {
        assert!((0.392 - jaro("Friedrich Nietzsche",
                              "Jean-Paul Sartre")).abs() < 0.001);
    }

    // --- Jaro-Winkler ---

    #[test]
    fn jaro_winkler_both_empty() {
        assert_eq!(1.0, jaro_winkler("", ""));
    }

    #[test]
    fn jaro_winkler_first_empty() {
        assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
    }

    #[test]
    fn jaro_winkler_second_empty() {
        assert_eq!(0.0, jaro_winkler("distance", ""));
    }

    #[test]
    fn jaro_winkler_same() {
        assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
    }

    #[test]
    fn jaro_winkler_multibyte() {
        assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
                0.001);
        assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
                0.001);
    }

    #[test]
    fn jaro_winkler_diff_short() {
        assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001);
        assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_diff_one_character() {
        assert_eq!(0.0, jaro_winkler("a", "b"));
    }

    #[test]
    fn jaro_winkler_same_one_character() {
        assert_eq!(1.0, jaro_winkler("a", "a"));
    }

    #[test]
    fn jaro_winkler_diff_no_transposition() {
        assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_diff_with_transposition() {
        assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_names() {
        assert!((0.562 - jaro_winkler("Friedrich Nietzsche",
                                      "Fran-Paul Sartre")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_long_prefix() {
        assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
                0.001);
    }

    #[test]
    fn jaro_winkler_more_names() {
        assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_length_of_one() {
        assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001);
    }

    // This crate's Winkler variant does not cap the prefix bonus, so a long
    // shared prefix can drive the score all the way to 1.0.
    #[test]
    fn jaro_winkler_very_long_prefix() {
        assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
                                    "thequickbrownfoxjumpedovery")).abs() <
                0.001);
    }

    // --- Levenshtein ---

    #[test]
    fn levenshtein_empty() {
        assert_eq!(0, levenshtein("", ""));
    }

    #[test]
    fn levenshtein_same() {
        assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
    }

    #[test]
    fn levenshtein_diff_short() {
        assert_eq!(3, levenshtein("kitten", "sitting"));
    }

    #[test]
    fn levenshtein_diff_with_space() {
        assert_eq!(5, levenshtein("hello, world", "bye, world"));
    }

    #[test]
    fn levenshtein_diff_multibyte() {
        assert_eq!(3, levenshtein("öঙ香", "abc"));
        assert_eq!(3, levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(37, levenshtein(a, b));
    }

    #[test]
    fn levenshtein_first_empty() {
        assert_eq!(7, levenshtein("", "sitting"));
    }

    #[test]
    fn levenshtein_second_empty() {
        assert_eq!(6, levenshtein("kitten", ""));
    }

    // --- Normalized Levenshtein ---

    #[test]
    fn normalized_levenshtein_diff_short() {
        assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_for_empty_strings() {
        assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_first_empty() {
        assert!(normalized_levenshtein("", "second").abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_second_empty() {
        assert!(normalized_levenshtein("first", "").abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_identical_strings() {
        assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
    }

    // --- Optimal string alignment ---

    #[test]
    fn osa_distance_empty() {
        assert_eq!(0, osa_distance("", ""));
    }

    #[test]
    fn osa_distance_same() {
        assert_eq!(0, osa_distance("damerau", "damerau"));
    }

    #[test]
    fn osa_distance_first_empty() {
        assert_eq!(7, osa_distance("", "damerau"));
    }

    #[test]
    fn osa_distance_second_empty() {
        assert_eq!(7, osa_distance("damerau", ""));
    }

    #[test]
    fn osa_distance_diff() {
        assert_eq!(3, osa_distance("ca", "abc"));
    }

    #[test]
    fn osa_distance_diff_short() {
        assert_eq!(3, osa_distance("damerau", "aderua"));
    }

    #[test]
    fn osa_distance_diff_reversed() {
        assert_eq!(3, osa_distance("aderua", "damerau"));
    }

    #[test]
    fn osa_distance_diff_multibyte() {
        assert_eq!(3, osa_distance("öঙ香", "abc"));
        assert_eq!(3, osa_distance("abc", "öঙ香"));
    }

    #[test]
    fn osa_distance_diff_unequal_length() {
        assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
    }

    #[test]
    fn osa_distance_diff_unequal_length_reversed() {
        assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
    }

    #[test]
    fn osa_distance_diff_comedians() {
        assert_eq!(5, osa_distance("Stewart", "Colbert"));
    }

    #[test]
    fn osa_distance_many_transpositions() {
        assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn osa_distance_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, osa_distance(a, b));
    }

    #[test]
    fn osa_distance_beginning_transposition() {
        assert_eq!(1, osa_distance("foobar", "ofobar"));
    }

    #[test]
    fn osa_distance_end_transposition() {
        assert_eq!(1, osa_distance("specter", "spectre"));
    }

    // OSA may edit each substring only once, hence 4 here versus the
    // unrestricted Damerau-Levenshtein's 3 for the same pair below.
    #[test]
    fn osa_distance_restricted_edit() {
        assert_eq!(4, osa_distance("a cat", "an abct"));
    }

    // --- Damerau-Levenshtein ---

    #[test]
    fn damerau_levenshtein_empty() {
        assert_eq!(0, damerau_levenshtein("", ""));
    }

    #[test]
    fn damerau_levenshtein_same() {
        assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_first_empty() {
        assert_eq!(7, damerau_levenshtein("", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_second_empty() {
        assert_eq!(7, damerau_levenshtein("damerau", ""));
    }

    #[test]
    fn damerau_levenshtein_diff() {
        assert_eq!(2, damerau_levenshtein("ca", "abc"));
    }

    #[test]
    fn damerau_levenshtein_diff_short() {
        assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    }

    #[test]
    fn damerau_levenshtein_diff_reversed() {
        assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_multibyte() {
        assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
        assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length() {
        assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length_reversed() {
        assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_comedians() {
        assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
    }

    #[test]
    fn damerau_levenshtein_many_transpositions() {
        assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn damerau_levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, damerau_levenshtein(a, b));
    }

    #[test]
    fn damerau_levenshtein_beginning_transposition() {
        assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
    }

    #[test]
    fn damerau_levenshtein_end_transposition() {
        assert_eq!(1, damerau_levenshtein("specter", "spectre"));
    }

    // Unlike OSA, unrestricted edits allow 3 here (see the OSA test above).
    #[test]
    fn damerau_levenshtein_unrestricted_edit() {
        assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
    }

    // --- Normalized Damerau-Levenshtein ---

    #[test]
    fn normalized_damerau_levenshtein_diff_short() {
        assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_for_empty_strings() {
        assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_first_empty() {
        assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_second_empty() {
        assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_identical_strings() {
        assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
    }

    // --- Sørensen-Dice ---

    #[test]
    fn sorensen_dice_all() {
        // test cases taken from
        // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11

        assert_eq!(1.0, sorensen_dice("a", "a"));
        assert_eq!(0.0, sorensen_dice("a", "b"));
        assert_eq!(1.0, sorensen_dice("", ""));
        assert_eq!(0.0, sorensen_dice("a", ""));
        assert_eq!(0.0, sorensen_dice("", "a"));
        assert_eq!(1.0, sorensen_dice("apple event", "apple    event"));
        assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x"));
        assert_eq!(0.0, sorensen_dice("french", "quebec"));
        assert_eq!(1.0, sorensen_dice("france", "france"));
        assert_eq!(0.2, sorensen_dice("fRaNce", "france"));
        assert_eq!(0.8, sorensen_dice("healed", "sealed"));
        assert_eq!(
            0.7878787878787878,
            sorensen_dice("web applications", "applications of the web")
        );
        assert_eq!(
            0.92,
            sorensen_dice(
                "this will have a typo somewhere",
                "this will huve a typo somewhere"
            )
        );
        assert_eq!(
            0.6060606060606061,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: table in very good  condition, olive green in colour."
            )
        );
        assert_eq!(
            0.2558139534883721,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: green Subaru Impreza, 210,000 miles"
            )
        );
        assert_eq!(
            0.1411764705882353,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "Wanted: mountain bike with at least 21 gears."
            )
        );
        assert_eq!(
            0.7741935483870968,
            sorensen_dice("this has one extra word", "this has one word")
        );
    }
}
1006 | |