//! This library implements string similarity metrics.

#![forbid(unsafe_code)]
#![allow(
    // these casts are sometimes needed. They restrict the length of input iterators
    // but there isn't really any way around this except for always working with
    // 128 bit types
    clippy::cast_possible_wrap,
    clippy::cast_sign_loss,
    clippy::cast_precision_loss,
    // not practical
    clippy::needless_pass_by_value,
    clippy::similar_names,
    // noisy
    clippy::missing_errors_doc,
    clippy::missing_panics_doc,
    clippy::must_use_candidate,
    // todo https://github.com/rapidfuzz/strsim-rs/issues/59
    clippy::range_plus_one
)]

use std::char;
use std::cmp::{max, min};
use std::collections::HashMap;
use std::convert::TryFrom;
use std::error::Error;
use std::fmt::{self, Display, Formatter};
use std::hash::Hash;
use std::mem;
use std::str::Chars;

#[derive(Debug, PartialEq)]
pub enum StrSimError {
    DifferentLengthArgs,
}

impl Display for StrSimError {
    fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> {
        let text: &'static str = match self {
            StrSimError::DifferentLengthArgs => "Differing length arguments provided",
        };

        write!(fmt, "{}", text)
    }
}

impl Error for StrSimError {}

pub type HammingResult = Result<usize, StrSimError>;

/// Calculates the number of positions in the two sequences where the elements
/// differ. Returns an error if the sequences have different lengths.
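///
/// A minimal sketch of a call (integer slices here, though any pair of
/// `IntoIterator`s with comparable elements should work):
///
/// ```
/// use strsim::generic_hamming;
///
/// assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
/// ```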
pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult
where
    Iter1: IntoIterator<Item = Elem1>,
    Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let (mut ita, mut itb) = (a.into_iter(), b.into_iter());
    let mut count: usize = 0;
    loop {
        match (ita.next(), itb.next()) {
            (Some(x), Some(y)) => {
                if x != y {
                    count += 1;
                }
            }
            (None, None) => return Ok(count),
            _ => return Err(StrSimError::DifferentLengthArgs),
        }
    }
}

/// Calculates the number of positions in the two strings where the characters
/// differ. Returns an error if the strings have different lengths.
///
/// ```
/// use strsim::{hamming, StrSimError::DifferentLengthArgs};
///
/// assert_eq!(Ok(3), hamming("hamming", "hammers"));
///
/// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham"));
/// ```
pub fn hamming(a: &str, b: &str) -> HammingResult {
    generic_hamming(a.chars(), b.chars())
}

/// Calculates the Jaro similarity between two sequences. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
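///
/// A small sketch with integer slices (for strings, see [`jaro`]); identical
/// sequences score exactly 1.0 and fully distinct ones 0.0:
///
/// ```
/// use strsim::generic_jaro;
///
/// assert_eq!(1.0, generic_jaro(&[1, 2, 3], &[1, 2, 3]));
/// assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
/// ```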
pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let a_len = a.into_iter().count();
    let b_len = b.into_iter().count();

    if a_len == 0 && b_len == 0 {
        return 1.0;
    } else if a_len == 0 || b_len == 0 {
        return 0.0;
    }

    let mut search_range = max(a_len, b_len) / 2;
    search_range = search_range.saturating_sub(1);

    // combine memory allocations to reduce runtime
    let mut flags_memory = vec![false; a_len + b_len];
    let (a_flags, b_flags) = flags_memory.split_at_mut(a_len);

    let mut matches = 0_usize;

    for (i, a_elem) in a.into_iter().enumerate() {
        // prevent integer wrapping
        let min_bound = if i > search_range {
            i - search_range
        } else {
            0
        };

        let max_bound = min(b_len, i + search_range + 1);

        for (j, b_elem) in b.into_iter().enumerate().take(max_bound) {
            if min_bound <= j && a_elem == b_elem && !b_flags[j] {
                a_flags[i] = true;
                b_flags[j] = true;
                matches += 1;
                break;
            }
        }
    }

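    // Count half-transpositions: walk the matched elements of both sequences
    // in order and count the positions where they disagree; the total is
    // halved below to obtain the transposition count.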
    let mut transpositions = 0_usize;
    if matches != 0 {
        let mut b_iter = b_flags.iter().zip(b);
        for (a_flag, ch1) in a_flags.iter().zip(a) {
            if *a_flag {
                loop {
                    if let Some((b_flag, ch2)) = b_iter.next() {
                        if !*b_flag {
                            continue;
                        }

                        if ch1 != ch2 {
                            transpositions += 1;
                        }
                        break;
                    }
                }
            }
        }
    }
    transpositions /= 2;

    if matches == 0 {
        0.0
    } else {
        ((matches as f64 / a_len as f64)
            + (matches as f64 / b_len as f64)
            + ((matches - transpositions) as f64 / matches as f64))
            / 3.0
    }
}

struct StringWrapper<'a>(&'a str);

impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> {
    type Item = char;
    type IntoIter = Chars<'b>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.chars()
    }
}

/// Calculates the Jaro similarity between two strings. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
///
/// ```
/// use strsim::jaro;
///
/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
///         0.001);
/// ```
pub fn jaro(a: &str, b: &str) -> f64 {
    generic_jaro(&StringWrapper(a), &StringWrapper(b))
}

/// Like Jaro but gives a boost to sequences that have a common prefix.
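///
/// A brief sketch with integer slices (for strings, see [`jaro_winkler`]);
/// identical sequences should score exactly 1.0:
///
/// ```
/// use strsim::generic_jaro_winkler;
///
/// assert_eq!(1.0, generic_jaro_winkler(&[1, 2, 3], &[1, 2, 3]));
/// ```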
pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let sim: f64 = generic_jaro(a, b);

    if sim > 0.7 {
        let prefix_length: usize = a
            .into_iter()
            .take(4)
            .zip(b)
            .take_while(|(a_elem, b_elem)| a_elem == b_elem)
            .count();

        sim + 0.1 * prefix_length as f64 * (1.0 - sim)
    } else {
        sim
    }
}

/// Like Jaro but gives a boost to strings that have a common prefix.
///
/// ```
/// use strsim::jaro_winkler;
///
/// assert!((0.866 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
///         0.001);
/// ```
pub fn jaro_winkler(a: &str, b: &str) -> f64 {
    generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
}

/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one sequence into the other.
///
/// ```
/// use strsim::generic_levenshtein;
///
/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6]));
/// ```
pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let b_len: usize = b.into_iter().count();

    let mut cache: Vec<usize> = (1..b_len + 1).collect();

    let mut result: usize = b_len;

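    // Single-row dynamic programming: `cache[j]` holds the previous row's
    // value until column j is updated, while `distance_b` carries the
    // diagonal (previous row, previous column) value.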
    for (i, a_elem) in a.into_iter().enumerate() {
        result = i + 1;
        let mut distance_b = i;

        for (j, b_elem) in b.into_iter().enumerate() {
            let cost: usize = usize::from(a_elem != b_elem);
            let distance_a: usize = distance_b + cost;
            distance_b = cache[j];
            result = min(result + 1, min(distance_a, distance_b + 1));
            cache[j] = result;
        }
    }

    result
}

/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one string into the other.
///
/// ```
/// use strsim::levenshtein;
///
/// assert_eq!(3, levenshtein("kitten", "sitting"));
/// ```
pub fn levenshtein(a: &str, b: &str) -> usize {
    generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
}

/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
/// 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_levenshtein;
///
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}

/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
    let b_len = b.chars().count();
    // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size
    // this leads to significantly worse code gen when swapping the vectors below
    let mut prev_two_distances: Vec<usize> = (0..b_len + 1).collect();
    let mut prev_distances: Vec<usize> = (0..b_len + 1).collect();
    let mut curr_distances: Vec<usize> = vec![0; b_len + 1];

    let mut prev_a_char = char::MAX;
    let mut prev_b_char = char::MAX;

    for (i, a_char) in a.chars().enumerate() {
        curr_distances[0] = i + 1;

        for (j, b_char) in b.chars().enumerate() {
            let cost = usize::from(a_char != b_char);
            curr_distances[j + 1] = min(
                curr_distances[j] + 1,
                min(prev_distances[j + 1] + 1, prev_distances[j] + cost),
            );
            if i > 0 && j > 0 && a_char != b_char && a_char == prev_b_char && b_char == prev_a_char
            {
                curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1);
            }

            prev_b_char = b_char;
        }

        mem::swap(&mut prev_two_distances, &mut prev_distances);
        mem::swap(&mut prev_distances, &mut curr_distances);
        prev_a_char = a_char;
    }

    // access prev_distances instead of curr_distances since we swapped
    // them above. In case a is empty this would still contain the correct value
    // from initializing the last element to b_len
    prev_distances[b_len]
}

/* Returns the final index for a value in a single vector that represents a fixed
2d grid */
fn flat_index(i: usize, j: usize, width: usize) -> usize {
    j * width + i
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::generic_damerau_levenshtein;
///
/// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1]));
/// ```
pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize
where
    Elem: Eq + Hash + Clone,
{
    let a_len = a_elems.len();
    let b_len = b_elems.len();

    if a_len == 0 {
        return b_len;
    }
    if b_len == 0 {
        return a_len;
    }

    let width = a_len + 2;
    let mut distances = vec![0; (a_len + 2) * (b_len + 2)];
    let max_distance = a_len + b_len;
    distances[0] = max_distance;

    for i in 0..(a_len + 1) {
        distances[flat_index(i + 1, 0, width)] = max_distance;
        distances[flat_index(i + 1, 1, width)] = i;
    }

    for j in 0..(b_len + 1) {
        distances[flat_index(0, j + 1, width)] = max_distance;
        distances[flat_index(1, j + 1, width)] = j;
    }

    let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64);

    for i in 1..(a_len + 1) {
        let mut db = 0;

        for j in 1..(b_len + 1) {
            let k = match elems.get(&b_elems[j - 1]) {
                Some(&value) => value,
                None => 0,
            };

            let insertion_cost = distances[flat_index(i, j + 1, width)] + 1;
            let deletion_cost = distances[flat_index(i + 1, j, width)] + 1;
            let transposition_cost =
                distances[flat_index(k, db, width)] + (i - k - 1) + 1 + (j - db - 1);

            let mut substitution_cost = distances[flat_index(i, j, width)] + 1;
            if a_elems[i - 1] == b_elems[j - 1] {
                db = j;
                substitution_cost -= 1;
            }

            distances[flat_index(i + 1, j + 1, width)] = min(
                substitution_cost,
                min(insertion_cost, min(deletion_cost, transposition_cost)),
            );
        }

        elems.insert(a_elems[i - 1].clone(), i);
    }

    distances[flat_index(a_len + 1, b_len + 1, width)]
}

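/// Row index of a character's most recent occurrence in the first sequence;
/// `-1` marks "not seen yet" (see `damerau_levenshtein_impl` below).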
#[derive(Clone, Copy, PartialEq, Eq)]
struct RowId {
    val: isize,
}

impl Default for RowId {
    fn default() -> Self {
        Self { val: -1 }
    }
}

#[derive(Default, Clone)]
struct GrowingHashmapMapElemChar<ValueType> {
    key: u32,
    value: ValueType,
}

/// specialized hashmap to store user provided types
/// this implementation relies on a couple of base assumptions in order to simplify the implementation
/// - the hashmap does not have an upper limit of included items
/// - the default value for the `ValueType` can be used as a dummy value to indicate an empty cell
/// - elements can't be removed
/// - only allocates memory on first write access.
///   This improves performance for hashmaps that are never written to
struct GrowingHashmapChar<ValueType> {
    used: i32,
    fill: i32,
    mask: i32,
    map: Option<Vec<GrowingHashmapMapElemChar<ValueType>>>,
}

impl<ValueType> Default for GrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Eq,
{
    fn default() -> Self {
        Self {
            used: 0,
            fill: 0,
            mask: -1,
            map: None,
        }
    }
}

impl<ValueType> GrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Eq + Copy,
{
    fn get(&self, key: u32) -> ValueType {
        self.map
            .as_ref()
            .map_or_else(|| Default::default(), |map| map[self.lookup(key)].value)
    }

    fn get_mut(&mut self, key: u32) -> &mut ValueType {
        if self.map.is_none() {
            self.allocate();
        }

        let mut i = self.lookup(key);
        if self
            .map
            .as_ref()
            .expect("map should have been created above")[i]
            .value
            == Default::default()
        {
            self.fill += 1;
            // resize when 2/3 full
            if self.fill * 3 >= (self.mask + 1) * 2 {
                self.grow((self.used + 1) * 2);
                i = self.lookup(key);
            }

            self.used += 1;
        }

        let elem = &mut self
            .map
            .as_mut()
            .expect("map should have been created above")[i];
        elem.key = key;
        &mut elem.value
    }

    fn allocate(&mut self) {
        self.mask = 8 - 1;
        self.map = Some(vec![GrowingHashmapMapElemChar::default(); 8]);
    }

    /// lookup key inside the hashmap using a similar collision resolution
    /// strategy to `CPython` and `Ruby`
    fn lookup(&self, key: u32) -> usize {
        let hash = key;
        let mut i = hash as usize & self.mask as usize;

        let map = self
            .map
            .as_ref()
            .expect("callers have to ensure map is allocated");

        if map[i].value == Default::default() || map[i].key == key {
            return i;
        }

        let mut perturb = key;
        loop {
            i = (i * 5 + perturb as usize + 1) & self.mask as usize;

            if map[i].value == Default::default() || map[i].key == key {
                return i;
            }

            perturb >>= 5;
        }
    }

    fn grow(&mut self, min_used: i32) {
        let mut new_size = self.mask + 1;
        while new_size <= min_used {
            new_size <<= 1;
        }

        self.fill = self.used;
        self.mask = new_size - 1;

        let old_map = std::mem::replace(
            self.map
                .as_mut()
                .expect("callers have to ensure map is allocated"),
            vec![GrowingHashmapMapElemChar::<ValueType>::default(); new_size as usize],
        );

        for elem in old_map {
            if elem.value != Default::default() {
                let j = self.lookup(elem.key);
                let new_elem = &mut self.map.as_mut().expect("map created above")[j];
                new_elem.key = elem.key;
                new_elem.value = elem.value;
                self.used -= 1;
                if self.used == 0 {
                    break;
                }
            }
        }

        self.used = self.fill;
    }
}

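/// Hashmap specialized for `char` keys: keys in the extended-ASCII range are
/// stored in a flat array, while larger code points fall back to the growing
/// hashmap above.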
struct HybridGrowingHashmapChar<ValueType> {
    map: GrowingHashmapChar<ValueType>,
    extended_ascii: [ValueType; 256],
}

impl<ValueType> HybridGrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Copy + Eq,
{
    fn get(&self, key: char) -> ValueType {
        let value: u32 = key as u32;
        if value <= 255 {
            let val_u8: u8 = u8::try_from(value).expect("we check the bounds above");
            self.extended_ascii[usize::from(val_u8)]
        } else {
            self.map.get(value)
        }
    }

    fn get_mut(&mut self, key: char) -> &mut ValueType {
        let value: u32 = key as u32;
        if value <= 255 {
            let val_u8: u8 = u8::try_from(value).expect("we check the bounds above");
            &mut self.extended_ascii[usize::from(val_u8)]
        } else {
            self.map.get_mut(value)
        }
    }
}

impl<ValueType> Default for HybridGrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Copy + Eq,
{
    fn default() -> Self {
        HybridGrowingHashmapChar {
            map: GrowingHashmapChar::default(),
            extended_ascii: [Default::default(); 256],
        }
    }
}

fn damerau_levenshtein_impl<Iter1, Iter2>(s1: Iter1, len1: usize, s2: Iter2, len2: usize) -> usize
where
    Iter1: Iterator<Item = char> + Clone,
    Iter2: Iterator<Item = char> + Clone,
{
    // The implementation is based on the paper
    // `Linear space string correction algorithm using the Damerau-Levenshtein distance`
    // from Chunchun Zhao and Sartaj Sahni
    //
    // It has a runtime complexity of `O(N*M)` and a memory usage of `O(N+M)`.
    let max_val = max(len1, len2) as isize + 1;

    let mut last_row_id = HybridGrowingHashmapChar::<RowId>::default();

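    // Rolling rows of the DP matrix: `r` is the row being computed, `r1` the
    // previous row, and `fr` caches values needed for transposition costs
    // (the names appear to mirror the arrays in the paper cited above).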
    let size = len2 + 2;
    let mut fr = vec![max_val; size];
    let mut r1 = vec![max_val; size];
    let mut r: Vec<isize> = (max_val..max_val + 1)
        .chain(0..(size - 1) as isize)
        .collect();

    for (i, ch1) in s1.enumerate().map(|(i, ch1)| (i + 1, ch1)) {
        mem::swap(&mut r, &mut r1);
        let mut last_col_id: isize = -1;
        let mut last_i2l1 = r[1];
        r[1] = i as isize;
        let mut t = max_val;

        for (j, ch2) in s2.clone().enumerate().map(|(j, ch2)| (j + 1, ch2)) {
            let diag = r1[j] + isize::from(ch1 != ch2);
            let left = r[j] + 1;
            let up = r1[j + 1] + 1;
            let mut temp = min(diag, min(left, up));

            if ch1 == ch2 {
                last_col_id = j as isize; // last occurrence of s1_i
                fr[j + 1] = r1[j - 1]; // save H_k-1,j-2
                t = last_i2l1; // save H_i-2,l-1
            } else {
                let k = last_row_id.get(ch2).val;
                let l = last_col_id;

                if j as isize - l == 1 {
                    let transpose = fr[j + 1] + (i as isize - k);
                    temp = min(temp, transpose);
                } else if i as isize - k == 1 {
                    let transpose = t + (j as isize - l);
                    temp = min(temp, transpose);
                }
            }

            last_i2l1 = r[j + 1];
            r[j + 1] = temp;
        }
        last_row_id.get_mut(ch1).val = i as isize;
    }

    r[len2 + 1] as usize
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
    damerau_levenshtein_impl(a.chars(), a.chars().count(), b.chars(), b.chars().count())
}

/// Calculates a normalized score of the Damerau–Levenshtein algorithm between
/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_damerau_levenshtein;
///
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }

    let len1: usize = a.chars().count();
    let len2: usize = b.chars().count();
    let dist: usize = damerau_levenshtein_impl(a.chars(), len1, b.chars(), len2);
    1.0 - (dist as f64) / (max(len1, len2) as f64)
}

/// Returns an Iterator of char tuples.
fn bigrams(s: &str) -> impl Iterator<Item = (char, char)> + '_ {
    s.chars().zip(s.chars().skip(1))
}

/// Calculates a Sørensen-Dice similarity distance using bigrams.
/// See <https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient>.
///
/// ```
/// use strsim::sorensen_dice;
///
/// assert_eq!(1.0, sorensen_dice("", ""));
/// assert_eq!(0.0, sorensen_dice("", "a"));
/// assert_eq!(0.0, sorensen_dice("french", "quebec"));
/// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris"));
/// ```
pub fn sorensen_dice(a: &str, b: &str) -> f64 {
    // implementation guided by
    // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6

    let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect();
    let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect();

    if a == b {
        return 1.0;
    }

    if a.len() < 2 || b.len() < 2 {
        return 0.0;
    }

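    // Count the multiset intersection of bigrams: each bigram of `b` can
    // consume at most one matching bigram of `a`.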
    let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new();

    for bigram in bigrams(&a) {
        *a_bigrams.entry(bigram).or_insert(0) += 1;
    }

    let mut intersection_size = 0_usize;

    for bigram in bigrams(&b) {
        a_bigrams.entry(bigram).and_modify(|bi| {
            if *bi > 0 {
                *bi -= 1;
                intersection_size += 1;
            }
        });
    }

    (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64
}

#[cfg(test)]
mod tests {
    use super::*;

    macro_rules! assert_delta {
        ($x:expr, $y:expr) => {
            assert_delta!($x, $y, 1e-5);
        };
        ($x:expr, $y:expr, $d:expr) => {
            if ($x - $y).abs() > $d {
                panic!(
                    "assertion failed: actual: `{}`, expected: `{}`: \
                     actual not within < {} of expected",
                    $x, $y, $d
                );
            }
        };
    }

    #[test]
    fn bigrams_iterator() {
        let mut bi = bigrams("abcde");

        assert_eq!(Some(('a', 'b')), bi.next());
        assert_eq!(Some(('b', 'c')), bi.next());
        assert_eq!(Some(('c', 'd')), bi.next());
        assert_eq!(Some(('d', 'e')), bi.next());
        assert_eq!(None, bi.next());
    }

    fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
        assert_eq!(Ok(dist), hamming(str1, str2));
    }

    #[test]
    fn hamming_empty() {
        assert_hamming_dist(0, "", "")
    }

    #[test]
    fn hamming_same() {
        assert_hamming_dist(0, "hamming", "hamming")
    }

    #[test]
    fn hamming_numbers() {
        assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
    }

    #[test]
    fn hamming_diff() {
        assert_hamming_dist(3, "hamming", "hammers")
    }

    #[test]
    fn hamming_diff_multibyte() {
        assert_hamming_dist(2, "hamming", "h香mmüng");
    }

    #[test]
    fn hamming_unequal_length() {
        assert_eq!(
            Err(StrSimError::DifferentLengthArgs),
            generic_hamming("ham".chars(), "hamming".chars())
        );
    }

    #[test]
    fn hamming_names() {
        assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
    }

    #[test]
    fn jaro_both_empty() {
        assert_eq!(1.0, jaro("", ""));
    }

    #[test]
    fn jaro_first_empty() {
        assert_eq!(0.0, jaro("", "jaro"));
    }

    #[test]
    fn jaro_second_empty() {
        assert_eq!(0.0, jaro("distance", ""));
    }

    #[test]
    fn jaro_same() {
        assert_eq!(1.0, jaro("jaro", "jaro"));
    }

    #[test]
    fn jaro_multibyte() {
        assert_delta!(0.818, jaro("testabctest", "testöঙ香test"), 0.001);
        assert_delta!(0.818, jaro("testöঙ香test", "testabctest"), 0.001);
    }

    #[test]
    fn jaro_diff_short() {
        assert_delta!(0.767, jaro("dixon", "dicksonx"), 0.001);
    }

    #[test]
    fn jaro_diff_one_character() {
        assert_eq!(0.0, jaro("a", "b"));
    }

    #[test]
    fn jaro_same_one_character() {
        assert_eq!(1.0, jaro("a", "a"));
    }

    #[test]
    fn generic_jaro_diff() {
        assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
    }

    #[test]
    fn jaro_diff_one_and_two() {
        assert_delta!(0.83, jaro("a", "ab"), 0.01);
    }

    #[test]
    fn jaro_diff_two_and_one() {
        assert_delta!(0.83, jaro("ab", "a"), 0.01);
    }

    #[test]
    fn jaro_diff_no_transposition() {
        assert_delta!(0.822, jaro("dwayne", "duane"), 0.001);
    }

    #[test]
    fn jaro_diff_with_transposition() {
        assert_delta!(0.944, jaro("martha", "marhta"), 0.001);
        assert_delta!(0.6, jaro("a jke", "jane a k"), 0.001);
    }

    #[test]
    fn jaro_names() {
        assert_delta!(
            0.392,
            jaro("Friedrich Nietzsche", "Jean-Paul Sartre"),
            0.001
        );
    }

    #[test]
    fn jaro_winkler_both_empty() {
        assert_eq!(1.0, jaro_winkler("", ""));
    }

    #[test]
    fn jaro_winkler_first_empty() {
        assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
    }

    #[test]
    fn jaro_winkler_second_empty() {
        assert_eq!(0.0, jaro_winkler("distance", ""));
    }

    #[test]
    fn jaro_winkler_same() {
        assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
    }

    #[test]
    fn jaro_winkler_multibyte() {
        assert_delta!(0.89, jaro_winkler("testabctest", "testöঙ香test"), 0.001);
        assert_delta!(0.89, jaro_winkler("testöঙ香test", "testabctest"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_short() {
        assert_delta!(0.813, jaro_winkler("dixon", "dicksonx"), 0.001);
        assert_delta!(0.813, jaro_winkler("dicksonx", "dixon"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_one_character() {
        assert_eq!(0.0, jaro_winkler("a", "b"));
    }

    #[test]
    fn jaro_winkler_same_one_character() {
        assert_eq!(1.0, jaro_winkler("a", "a"));
    }

    #[test]
    fn jaro_winkler_diff_no_transposition() {
        assert_delta!(0.84, jaro_winkler("dwayne", "duane"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_with_transposition() {
        assert_delta!(0.961, jaro_winkler("martha", "marhta"), 0.001);
        assert_delta!(0.6, jaro_winkler("a jke", "jane a k"), 0.001);
    }

    #[test]
    fn jaro_winkler_names() {
        assert_delta!(
            0.452,
            jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"),
            0.001
        );
    }

    #[test]
    fn jaro_winkler_long_prefix() {
        assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001);
    }

    #[test]
    fn jaro_winkler_more_names() {
        assert_delta!(0.868, jaro_winkler("Thorkel", "Thorgier"), 0.001);
    }

    #[test]
    fn jaro_winkler_length_of_one() {
        assert_delta!(0.738, jaro_winkler("Dinsdale", "D"), 0.001);
    }

    #[test]
    fn jaro_winkler_very_long_prefix() {
        assert_delta!(
            0.98519,
            jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery")
        );
    }

    #[test]
    fn levenshtein_empty() {
        assert_eq!(0, levenshtein("", ""));
    }

    #[test]
    fn levenshtein_same() {
        assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
    }

    #[test]
    fn levenshtein_diff_short() {
        assert_eq!(3, levenshtein("kitten", "sitting"));
    }

    #[test]
    fn levenshtein_diff_with_space() {
        assert_eq!(5, levenshtein("hello, world", "bye, world"));
    }

    #[test]
    fn levenshtein_diff_multibyte() {
        assert_eq!(3, levenshtein("öঙ香", "abc"));
        assert_eq!(3, levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(37, levenshtein(a, b));
    }

    #[test]
    fn levenshtein_first_empty() {
        assert_eq!(7, levenshtein("", "sitting"));
    }

    #[test]
    fn levenshtein_second_empty() {
        assert_eq!(6, levenshtein("kitten", ""));
    }

    #[test]
    fn normalized_levenshtein_diff_short() {
        assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting"));
    }

    #[test]
    fn normalized_levenshtein_for_empty_strings() {
        assert_delta!(1.0, normalized_levenshtein("", ""));
    }

    #[test]
    fn normalized_levenshtein_first_empty() {
        assert_delta!(0.0, normalized_levenshtein("", "second"));
    }

    #[test]
    fn normalized_levenshtein_second_empty() {
        assert_delta!(0.0, normalized_levenshtein("first", ""));
    }

    #[test]
    fn normalized_levenshtein_identical_strings() {
        assert_delta!(1.0, normalized_levenshtein("identical", "identical"));
    }

    #[test]
    fn osa_distance_empty() {
        assert_eq!(0, osa_distance("", ""));
    }

    #[test]
    fn osa_distance_same() {
        assert_eq!(0, osa_distance("damerau", "damerau"));
    }

    #[test]
    fn osa_distance_first_empty() {
        assert_eq!(7, osa_distance("", "damerau"));
    }

    #[test]
    fn osa_distance_second_empty() {
        assert_eq!(7, osa_distance("damerau", ""));
    }

    #[test]
    fn osa_distance_diff() {
        assert_eq!(3, osa_distance("ca", "abc"));
    }

    #[test]
    fn osa_distance_diff_short() {
        assert_eq!(3, osa_distance("damerau", "aderua"));
    }

    #[test]
    fn osa_distance_diff_reversed() {
        assert_eq!(3, osa_distance("aderua", "damerau"));
    }

    #[test]
    fn osa_distance_diff_multibyte() {
        assert_eq!(3, osa_distance("öঙ香", "abc"));
        assert_eq!(3, osa_distance("abc", "öঙ香"));
    }

    #[test]
    fn osa_distance_diff_unequal_length() {
        assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
    }

    #[test]
    fn osa_distance_diff_unequal_length_reversed() {
        assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
    }

    #[test]
    fn osa_distance_diff_comedians() {
        assert_eq!(5, osa_distance("Stewart", "Colbert"));
    }

    #[test]
    fn osa_distance_many_transpositions() {
        assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn osa_distance_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, osa_distance(a, b));
    }

    #[test]
    fn osa_distance_beginning_transposition() {
        assert_eq!(1, osa_distance("foobar", "ofobar"));
    }

    #[test]
    fn osa_distance_end_transposition() {
        assert_eq!(1, osa_distance("specter", "spectre"));
    }

    #[test]
    fn osa_distance_restricted_edit() {
        assert_eq!(4, osa_distance("a cat", "an abct"));
    }

    #[test]
    fn damerau_levenshtein_empty() {
        assert_eq!(0, damerau_levenshtein("", ""));
    }

    #[test]
    fn damerau_levenshtein_same() {
        assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_first_empty() {
        assert_eq!(7, damerau_levenshtein("", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_second_empty() {
        assert_eq!(7, damerau_levenshtein("damerau", ""));
    }

    #[test]
    fn damerau_levenshtein_diff() {
        assert_eq!(2, damerau_levenshtein("ca", "abc"));
    }

    #[test]
    fn damerau_levenshtein_diff_short() {
        assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    }

    #[test]
    fn damerau_levenshtein_diff_reversed() {
        assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_multibyte() {
        assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
        assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length() {
        assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length_reversed() {
        assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_comedians() {
        assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
    }

    #[test]
    fn damerau_levenshtein_many_transpositions() {
        assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn damerau_levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, damerau_levenshtein(a, b));
    }

    #[test]
    fn damerau_levenshtein_beginning_transposition() {
        assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
    }

    #[test]
    fn damerau_levenshtein_end_transposition() {
        assert_eq!(1, damerau_levenshtein("specter", "spectre"));
    }

    #[test]
    fn damerau_levenshtein_unrestricted_edit() {
        assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
    }

    #[test]
    fn normalized_damerau_levenshtein_diff_short() {
        assert_delta!(
            0.27272,
            normalized_damerau_levenshtein("levenshtein", "löwenbräu")
        );
    }

    #[test]
    fn normalized_damerau_levenshtein_for_empty_strings() {
        assert_delta!(1.0, normalized_damerau_levenshtein("", ""));
    }

    #[test]
    fn normalized_damerau_levenshtein_first_empty() {
        assert_delta!(0.0, normalized_damerau_levenshtein("", "flower"));
    }

    #[test]
    fn normalized_damerau_levenshtein_second_empty() {
        assert_delta!(0.0, normalized_damerau_levenshtein("tree", ""));
    }

    #[test]
    fn normalized_damerau_levenshtein_identical_strings() {
        assert_delta!(
            1.0,
            normalized_damerau_levenshtein("sunglasses", "sunglasses")
        );
    }

    #[test]
    fn sorensen_dice_all() {
        // test cases taken from
        // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11

        assert_delta!(1.0, sorensen_dice("a", "a"));
        assert_delta!(0.0, sorensen_dice("a", "b"));
        assert_delta!(1.0, sorensen_dice("", ""));
        assert_delta!(0.0, sorensen_dice("a", ""));
        assert_delta!(0.0, sorensen_dice("", "a"));
        assert_delta!(1.0, sorensen_dice("apple event", "apple event"));
        assert_delta!(0.90909, sorensen_dice("iphone", "iphone x"));
        assert_delta!(0.0, sorensen_dice("french", "quebec"));
        assert_delta!(1.0, sorensen_dice("france", "france"));
        assert_delta!(0.2, sorensen_dice("fRaNce", "france"));
        assert_delta!(0.8, sorensen_dice("healed", "sealed"));
        assert_delta!(
            0.78788,
            sorensen_dice("web applications", "applications of the web")
        );
        assert_delta!(
            0.92,
            sorensen_dice(
                "this will have a typo somewhere",
                "this will huve a typo somewhere"
            )
        );
        assert_delta!(
            0.60606,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: table in very good condition, olive green in colour."
            )
        );
        assert_delta!(
            0.25581,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: green Subaru Impreza, 210,000 miles"
            )
        );
        assert_delta!(
            0.14118,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "Wanted: mountain bike with at least 21 gears."
            )
        );
        assert_delta!(
            0.77419,
            sorensen_dice("this has one extra word", "this has one word")
        );
    }
}
| 1308 | |