Update fuzzy_match.R

Further edits to fuzzy_match: edit distances are now calculated only against names in the reference list whose first word begins with the same letter (case-insensitive) as the first word of the input text and, when the input text has more than one word, whose second word also begins with the same letter as the second word of the input text. This greatly sped up running the test dataset.
traitecoevo · Mar 13, 2024 · 05cb57a · 05cb57a
1 parent 9e943c8
commit 05cb57a
Showing 1 changed file with 8 additions and 0 deletions.
diff --git a/R/fuzzy_match.R b/R/fuzzy_match.R
@@ -49,6 +49,14 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
     txt_word3_start <- stringr::str_extract(word(txt,3), "[:alpha:]|[:digit:]")
   }
 
+  ## subset accepted list to taxa that begin with the same first letter to reduce the number of fuzzy matches that are made in the next step.
+    accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())]
+
+  ## to further speed up matches, further reduce list by subsetting accepted list based on the first letter of the second word.
+  if(words_in_text > 1) {
+    accepted_list <- accepted_list[(stringr::str_extract(stringr::word(accepted_list,2), "[:alpha:]") %>% stringr::str_to_lower() == txt_word2_start %>% stringr::str_to_lower())]
+  }
+
   ## identify the number of characters that must change for the text string to match each of the possible accepted names
   distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,]