diff --git a/CHANGELOG.md b/CHANGELOG.md index b8f7908..f38f562 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,9 +17,12 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org). - reduce runtime in our own benchmark by more than `70%` - reduce binary size by more than `25%` +- only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 + ### Fixed - Fix transposition counting in Jaro and Jaro-Winkler. +- Limit common prefix in Jaro-Winkler to 4 characters ## [0.10.0] - (2020-01-31) diff --git a/README.md b/README.md index 3e19575..f3dcd96 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ - [Levenshtein] - distance & normalized - [Optimal string alignment] - [Damerau-Levenshtein] - distance & normalized - - [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length + - [Jaro and Jaro-Winkler] - [Sørensen-Dice] The normalized versions return values between `0.0` and `1.0`, where `1.0` means diff --git a/src/lib.rs b/src/lib.rs index 6f9bec7..8118277 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -194,22 +194,19 @@ where &'b Iter2: IntoIterator, Elem1: PartialEq, { - let jaro_distance = generic_jaro(a, b); + let sim = generic_jaro(a, b); - // Don't limit the length of the common prefix - let prefix_length = a - .into_iter() - .zip(b) - .take_while(|(a_elem, b_elem)| a_elem == b_elem) - .count(); + if sim > 0.7 { + let prefix_length = a + .into_iter() + .take(4) + .zip(b) + .take_while(|(a_elem, b_elem)| a_elem == b_elem) + .count(); - let jaro_winkler_distance = - jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance)); - - if jaro_winkler_distance <= 1.0 { - jaro_winkler_distance + sim + 0.1 * prefix_length as f64 * (1.0 - sim) } else { - 1.0 + sim } } @@ -218,7 +215,7 @@ where /// ``` /// use strsim::jaro_winkler; /// -/// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < +/// assert!((0.866 - jaro_winkler("cheeseburger", "cheese fries")).abs() < /// 0.001); /// ``` pub fn jaro_winkler(a: &str, b: &str) -> f64 { @@ -960,7 +957,7 @@ mod tests { #[test] fn jaro_winkler_names() { assert_delta!( - 0.562, + 0.452, jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"), 0.001 ); @@ -968,7 +965,7 @@ mod tests { #[test] fn jaro_winkler_long_prefix() { - assert_delta!(0.911, jaro_winkler("cheeseburger", "cheese fries"), 0.001); + assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001); } #[test] @@ -984,7 +981,7 @@ mod tests { #[test] fn jaro_winkler_very_long_prefix() { assert_delta!( - 1.0, + 0.98519, jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery") ); } diff --git a/tests/lib.rs b/tests/lib.rs index c170be0..991fc6f 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -67,5 +67,5 @@ fn jaro_works() { #[test] fn jaro_winkler_works() { - assert_delta!(0.911, jaro_winkler("cheeseburger", "cheese fries"), 0.001); + assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001); }