traitecoevo · ehwenk · Mar 11, 2024 · Mar 13, 2024 · Mar 13, 2024 · Mar 27, 2024
diff --git a/R/align_taxa.R b/R/align_taxa.R
@@ -18,7 +18,7 @@
 #' @param fuzzy_rel_dist The proportion of characters allowed to be different for a fuzzy match. 
 #' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` and `fuzzy_rel_dist`
 #' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default.
-#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default.
+#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned on as a default.
 #' @param identifier A dataset, location or other identifier, which defaults to NA.
 #'
 #' @return A tibble with columns that include original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code. 

diff --git a/R/create_species_state_origin_matrix.R b/R/create_species_state_origin_matrix.R
@@ -50,7 +50,7 @@ identify_places <- function(sep_state_data) {
 #' @noRd
 create_species_df <- function(apc_places, apc_species) {
   species_df <- dplyr::tibble(species = apc_species$canonical_name)
-  for (i in 1:length(apc_places)) {
+  for (i in seq_along(apc_places)) {
     species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal")
   }
   names(species_df) <- c("species", apc_places)
@@ -76,7 +76,7 @@ state_parse_and_add_column <- function(species_df, state, apc_species) {
 
 #' @noRd
 parse_states <- function(species_df, apc_places, apc_species) {
-  for (i in 1:length(apc_places)) {
+  for (i in seq_along(apc_places)) {
     species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species)
   }
   return(species_df)

diff --git a/R/create_taxonomic_update_lookup.R b/R/create_taxonomic_update_lookup.R
@@ -13,7 +13,7 @@
 #' @param full logical for whether the full lookup table is returned or just key columns
 #' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.  If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.
 #' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default. 
-#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default.
+#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned on as a default.
 #' @param identifier A dataset, location or other identifier, which defaults to NA.
 #' @param output file path to save the intermediate output to
 #' @return A lookup table containing the accepted and suggested names for each original name input, and additional taxonomic information such as taxon rank, taxonomic status, taxon IDs and genera. 

diff --git a/R/fuzzy_match.R b/R/fuzzy_match.R
@@ -49,6 +49,10 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
     txt_word3_start <- stringr::str_extract(word(txt,3), "[:alpha:]|[:digit:]")
   }
 
+  ## subset accepted list to taxa that begin with the same first letter to reduce the number of fuzzy matches that are made in the next step.
+  ## has also wanted to do this for the second word, but then need to separate different lists of reference names - smaller time saving and not worth it.
+  #  accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())]
+
   ## identify the number of characters that must change for the text string to match each of the possible accepted names
   distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,]
 
@@ -57,66 +61,83 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
   min_dist_per_c <-  min(distance_c) / stringr::str_length(txt)
 
   i <- which(distance_c==min_dist_abs_c)
+  keep <- FALSE
 
   if(
     ## Within allowable number of characters (absolute)
     min_dist_abs_c <= max_distance_abs &
     ## Within allowable number of characters (relative)
-    min_dist_per_c <= max_distance_rel &
+    min_dist_per_c <= max_distance_rel #&
     ## Is a unique solution
-    length(i)<= n_allowed
-  ) {
-    ## identify number of words in the matched string
-    words_in_match <- 1 + stringr::str_count(accepted_list[i]," ")
-
-    ## identify the first letter of the first word in the matched string
-    match_word1_start <- stringr::str_extract(accepted_list[i], "[:alpha:]")
+    #length(i) <= n_allowed
+    ) {
 
-    ## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
-    if(words_in_text > 1 & epithet_letters == 2) {
-      if(nchar(word(accepted_list[i],2)) == 1) {
-        match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
+    for (j in seq_len(i)) {
+
+      if (keep == TRUE) {
+
+        break()
+
       } else {
-        match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:][:alpha:]|[:digit:]")
-      }
-    }
+        ## identify number of words in the matched string
+        words_in_match <- 1 + stringr::str_count(accepted_list[i][j]," ")
+
+        ## identify the first letter of the first word in the matched string
+        match_word1_start <- stringr::str_extract(accepted_list[i][j], "[:alpha:]")
+
+        ## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
+        if(words_in_text > 1 & epithet_letters == 2) {
+          if(nchar(word(accepted_list[i][j],2)) == 1) {
+            match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
+          } else {
+            match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:][:alpha:]|[:digit:]")
+          }
+        }
+
+        if(words_in_text > 1 & epithet_letters == 1) {
+            match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
+        }
 
-    if(words_in_text > 1 & epithet_letters == 1) {
-        match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
-    }
-
-    ## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
-    if(words_in_text > 2) {
-      match_word3_start <- stringr::str_extract(word(accepted_list[i],3), "[:alpha:]|[:digit:]")
-    }
+        ## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
+        if(words_in_text > 2) {
+          match_word3_start <- stringr::str_extract(word(accepted_list[i][j],3), "[:alpha:]|[:digit:]")
+        }
+
+        ## keep match if the first letters of the first three words (or fewer if applicable) in the string to match 
+        ## are identical to the first letters of the first three words in the matched string
 
-    keep = FALSE
+        if(words_in_text == 1) {
+          if (txt_word1_start == match_word1_start) {
+            keep <- TRUE }
+
+        } else if(words_in_text == 2) {
+          if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
+            keep <- TRUE }
+
+        } else if(words_in_text > 2) {
+          if (words_in_match > 2) {
+            if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
+              keep <- TRUE }
+          } else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
+            keep <- TRUE }
+        }
 
-    ## keep match if the first letters of the first three words (or fewer if applicable) in the string to match 
-    ## are identical to the first letters of the first three words in the matched string
+
+        if(keep == TRUE) {
+
+          return(accepted_list[i][j])
+
+        }
+
+        return(NA)
+      }
 
-    if(words_in_text == 1) {
-      if (txt_word1_start == match_word1_start) {
-        keep = TRUE }
-
-    } else if(words_in_text == 2) {
-      if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
-        keep = TRUE }
-
-    } else if(words_in_text > 2) {
-      if (words_in_match > 2) {
-        if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
-          keep = TRUE }
-      } else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
-        keep = TRUE }
+      return(NA)
     }
 
-    if(keep == TRUE) {
-
-      return(accepted_list[i])
-
-    }
     return(NA)
   }
+
   return(NA)
 }
+
diff --git a/R/match_taxa.R b/R/match_taxa.R
@@ -740,7 +740,7 @@ match_taxa <- function(
 
   # match_05a: fuzzy match to APC-accepted canonical name
   # Fuzzy match of taxon name to an APC-accepted canonical name, once filler words and punctuation are removed.
-  for (i in 1:nrow(taxa$tocheck)) {    
+  for (i in seq_len(nrow(taxa$tocheck))) {    
     taxa$tocheck$fuzzy_match_cleaned_APC[i] <-
       fuzzy_match(
         txt = taxa$tocheck$stripped_name[i],
@@ -781,7 +781,7 @@ match_taxa <- function(
 
   # match_05b: fuzzy match to APC-known canonical name
   # Fuzzy match of taxon name to an APC-known canonical name, once filler words and punctuation are removed.
-  for (i in 1:nrow(taxa$tocheck)) {    
+  for (i in seq_len(nrow(taxa$tocheck))) {    
     taxa$tocheck$fuzzy_match_cleaned_APC_synonym[i] <-
       fuzzy_match(
         txt = taxa$tocheck$stripped_name[i],
@@ -863,7 +863,7 @@ match_taxa <- function(
   i <-
     (
       stringr::str_detect(taxa$tocheck$cleaned_name, "[Aa]ff[\\.\\s]") |
-        stringr::str_detect(taxa$tocheck$cleaned_name, " affinis ")  |
+        stringr::str_detect(taxa$tocheck$cleaned_name, " affinis[\\s|$]")  |
         stringr::str_detect(taxa$tocheck$cleaned_name, " cf[\\.\\s]")
     ) &
     taxa$tocheck$genus %in% resources$genera_all2$genus
@@ -1048,7 +1048,7 @@ match_taxa <- function(
   # For imprecise fuzzy matches, the taxon name can differ from the `APC-accepted` names by 5 characters & up to 25% of the string length.
   # These matches require individual review and are turned off as a default.
   if (imprecise_fuzzy_matches == TRUE) {
-    for (i in 1:nrow(taxa$tocheck)) {
+    for (i in seq_len(nrow(taxa$tocheck))) {
       taxa$tocheck$fuzzy_match_cleaned_APC_imprecise[i] <-
         fuzzy_match(
           txt = taxa$tocheck$stripped_name[i],
@@ -1094,7 +1094,7 @@ match_taxa <- function(
   # For imprecise fuzzy matches, the taxon name can differ from the `APC -known` names by 5 characters & up to 25% of the string length.
   # These matches require individual review and are turned off as a default.
   if (imprecise_fuzzy_matches == TRUE) {
-    for (i in 1:nrow(taxa$tocheck)) {
+    for (i in seq_len(nrow(taxa$tocheck))) {
       taxa$tocheck$fuzzy_match_cleaned_APC_synonym_imprecise[i] <-
         fuzzy_match(
           txt = taxa$tocheck$stripped_name[i],
@@ -1381,7 +1381,7 @@ match_taxa <- function(
   # sometimes the submitted taxon name is a valid trinomial + notes and 
   # such names will only be aligned by matches considering only the first three words of the stripped name.
   # This match also does a good job aligning and correcting syntax of phrase names
-  for (i in 1:nrow(taxa$tocheck)) {
+  for (i in seq_len(nrow(taxa$tocheck))) {
     if (!is.na(taxa$tocheck$trinomial[i])) {
       taxa$tocheck$fuzzy_match_trinomial[i] <-
         fuzzy_match(
@@ -1428,7 +1428,7 @@ match_taxa <- function(
   # sometimes the submitted taxon name is a valid trinomial + notes and 
   # such names will only be aligned by matches considering only the first three words of the stripped name.
   # This match also does a good job aligning and correcting syntax of phrase names
-  for (i in 1:nrow(taxa$tocheck)) {
+  for (i in seq_len(nrow(taxa$tocheck))) {
     if (!is.na(taxa$tocheck$trinomial[i])) {
       taxa$tocheck$fuzzy_match_trinomial_synonym[i] <-
         fuzzy_match(
@@ -1547,7 +1547,7 @@ match_taxa <- function(
   # or a valid binomial + invalid infraspecific epithet.
   # Such names will only be aligned by matches considering only the first two words of the stripped name.
   # This match also does a good job aligning and correcting syntax of phrase names.
-  for (i in 1:nrow(taxa$tocheck)) {
+  for (i in seq_len(nrow(taxa$tocheck))) {
     if (!is.na(taxa$tocheck$binomial[i]) &
         is.na(taxa$tocheck$fuzzy_match_binomial[i])) {
       taxa$tocheck$fuzzy_match_binomial[i] <-
@@ -1597,7 +1597,7 @@ match_taxa <- function(
   # or a valid binomial + invalid infraspecific epithet.
   # Such names will only be aligned by matches considering only the first two words of the stripped name.
   # This match also does a good job aligning and correcting syntax of phrase names.
-  for (i in 1:nrow(taxa$tocheck)) {
+  for (i in seq_len(nrow(taxa$tocheck))) {
     if (!is.na(taxa$tocheck$binomial[i]) &
         is.na(taxa$tocheck$fuzzy_match_binomial_APC_synonym[i])) {
       taxa$tocheck$fuzzy_match_binomial_APC_synonym[i] <-
@@ -1648,7 +1648,7 @@ match_taxa <- function(
   # to avoid incorrectly aligning an APC accepted/known taxa to an APNI name.
   # This is especially true to accurately align phrase names.
   if (APNI_matches == TRUE) {
-    for (i in 1:nrow(taxa$tocheck)) {
+    for (i in seq_len(nrow(taxa$tocheck))) {
       taxa$tocheck$fuzzy_match_cleaned_APNI[i] <-
         fuzzy_match(
           txt = taxa$tocheck$stripped_name[i],
@@ -1695,7 +1695,7 @@ match_taxa <- function(
   # These matches require individual review and are turned off as a default.
   if (APNI_matches == TRUE & imprecise_fuzzy_matches == TRUE) {
 
-    for (i in 1:nrow(taxa$tocheck)) {
+    for (i in seq_len(nrow(taxa$tocheck))) {
       taxa$tocheck$fuzzy_match_cleaned_APNI_imprecise[i] <-
         fuzzy_match(
           txt = taxa$tocheck$cleaned_name[i],

diff --git a/R/standardise_names.R b/R/standardise_names.R
@@ -63,7 +63,7 @@ standardise_names <- function(taxon_names) {
     f("\\saffin(\\s|$)",    " aff. ") %>%
     f("\\saff(\\s|$)",      " aff. ") %>%
     f("\\saffn(\\s|$|\\.)", " aff. ") %>%
-    f("\\saffinis(\\s|$)",  " aff. ") %>%
+    f("\\saffinis(\\s)",  " aff. ") %>%
 
     ## f. not forma or form or form. or f
     f("\\sforma(\\s|$)",       " f. ") %>%

diff --git a/tests/testthat/benchmarks/test_matches_alignments_updates.csv b/tests/testthat/benchmarks/test_matches_alignments_updates.csv
@@ -32,7 +32,7 @@ Driandra abc--def,match_03c,match_03c,Dryandra sp. [Driandra abc--def; test_all_
 Xyystidium abc--def,match_03d,match_03d,Xystidium sp. [Xyystidium abc--def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
 Zygiaa abc--def,match_03d,match_03d,Zygia sp. [Zygiaa abc--def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
 Abcde fgh -- ijk,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA
-Ryandra abc--def,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA
+Ryandra abc--def,match_03e,match_03b,Randia sp. [Ryandra abc--def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
 Abildgaardia odontocarpa / Abildgaardia oxystachya,match_04a,match_04a,Abildgaardia sp. [Abildgaardia odontocarpa / Abildgaardia oxystachya; test_all_matches_TRUE],APC,genus,Abildgaardia,FALSE,https://id.biodiversity.org.au/node/apni/2905759,https://id.biodiversity.org.au/name/apni/55984,Abildgaardia Vahl
 Acanthocarpus fimbriatus / Acanthocarpus mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / Acanthocarpus mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm.
 Acanthocarpus fimbriatus / mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm.
@@ -52,7 +52,7 @@ Drrandra abc / def,match_04c,match_04c,Dryandra sp. [Drrandra abc / def; test_al
 Xyystidium abc/def,match_04d,match_04d,Xystidium sp. [Xyystidium abc/def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
 Zygiaa abc / def,match_04d,match_04d,Zygia sp. [Zygiaa abc / def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
 Abcde fgh / ijk,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA
-Ryandra abc / def,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA
+Ryandra abc / def,match_04e,match_04b,Randia sp. [Ryandra abc / def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
 Cycas candida K.D.Hill,match_05a,match_01a,Cycas candida,APC,species,Cycas candida,TRUE,https://id.biodiversity.org.au/node/apni/2893335,https://id.biodiversity.org.au/name/apni/188177,Cycas candida K.D.Hill
 Eremophila papillata Chinnock,match_05a,match_01a,Eremophila papillata,APC,species,Eremophila papillata,TRUE,https://id.biodiversity.org.au/node/apni/2910890,https://id.biodiversity.org.au/name/apni/207453,Eremophila papillata Chinnock
 Acalypha indica var. australis F.M.Bailey,match_05b,match_01b,Acalypha indica var. australis,APC,variety,Acalypha lanceolata,TRUE,https://id.biodiversity.org.au/instance/apni/889946,https://id.biodiversity.org.au/name/apni/72588,Acalypha indica var. australis F.M.Bailey
@@ -155,7 +155,7 @@ Drrandra x def,match_11c,match_08c,Dryandra x [Drrandra x def; test_all_matches_
 Xyystidium x def,match_11d,match_08d,Xystidium x [Xyystidium x def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
 Zygiaa abc x Zygia def,match_11d,match_08d,Zygia x [Zygiaa abc x Zygia def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
 Abcde fgh x ijk,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA
-Ryandra abc x def,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA
+Ryandra abc x def,match_11e,match_08b,Randia x [Ryandra abc x def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
 Baeckea sp. murchison river,match_12a,match_09a,Baeckea sp. Murchison River (M.E.Trudgen 12009),APC,species,Baeckea sp. Murchison River (M.E.Trudgen 12009),TRUE,https://id.biodiversity.org.au/node/apni/2888052,https://id.biodiversity.org.au/name/apni/191267,Baeckea sp. Murchison River (M.E.Trudgen 12009) WA Herbarium
 Eremophila oppositifolia rubra (needle leaves),match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock
 Eremophila oppositifolia rubra early collection,match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock

diff --git a/tests/testthat/test-alignment_results.R b/tests/testthat/test-alignment_results.R
@@ -9,20 +9,6 @@ test_that("consistency with previous runs", {
 
   taxa <-
     c(
-      "Banksia integrifolia",
-      "Acacia longifolia",
-      "Commersonia rosea",
-      "Thelymitra pauciflora",
-      "Justicia procumbens",
-      "Hibbertia stricta",
-      "Rostellularia adscendens",
-      "Hibbertia sericea",
-      "Hibbertia sp.",
-      "Athrotaxis laxiflolia",
-      "Genoplesium insigne",
-      "Polypogon viridis",
-      "Acacia aneura",
-      "Acacia paraneura",
       "Galactia striata"
     )