diff --git a/R/align_taxa.R b/R/align_taxa.R index 90c2fcb0..0bc3a610 100644 --- a/R/align_taxa.R +++ b/R/align_taxa.R @@ -18,7 +18,7 @@ #' @param fuzzy_rel_dist The proportion of characters allowed to be different for a fuzzy match. #' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` and `fuzzy_rel_dist` #' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default. -#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default. +#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned on as a default. #' @param identifier A dataset, location or other identifier, which defaults to NA. #' #' @return A tibble with columns that include original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code. diff --git a/R/create_species_state_origin_matrix.R b/R/create_species_state_origin_matrix.R index 59e6f564..bb32ea83 100644 --- a/R/create_species_state_origin_matrix.R +++ b/R/create_species_state_origin_matrix.R @@ -50,7 +50,7 @@ identify_places <- function(sep_state_data) { #' @noRd create_species_df <- function(apc_places, apc_species) { species_df <- dplyr::tibble(species = apc_species$canonical_name) - for (i in 1:length(apc_places)) { + for (i in seq_along(apc_places)) { species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal") } names(species_df) <- c("species", apc_places) @@ -76,7 +76,7 @@ state_parse_and_add_column <- function(species_df, state, apc_species) { #' @noRd parse_states <- function(species_df, apc_places, apc_species) { - for (i in 1:length(apc_places)) { + for (i in seq_along(apc_places)) { species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species) } return(species_df) diff --git a/R/create_taxonomic_update_lookup.R 
b/R/create_taxonomic_update_lookup.R index 8fa2577b..1f457385 100644 --- a/R/create_taxonomic_update_lookup.R +++ b/R/create_taxonomic_update_lookup.R @@ -13,7 +13,7 @@ #' @param full logical for whether the full lookup table is returned or just key columns #' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in. -#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default. +#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned on as a default. #' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default. #' @param identifier A dataset, location or other identifier, which defaults to NA. #' @param output file path to save the intermediate output to #' @return A lookup table containing the accepted and suggested names for each original name input, and additional taxonomic information such as taxon rank, taxonomic status, taxon IDs and genera. diff --git a/R/fuzzy_match.R b/R/fuzzy_match.R index 6d9259b5..74201414 100644 --- a/R/fuzzy_match.R +++ b/R/fuzzy_match.R @@ -51,7 +51,7 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, ## subset accepted list to taxa that begin with the same first letter to reduce the number of fuzzy matches that are made in the next step. ## has also wanted to do this for the second word, but then need to separate different lists of reference names - smaller time saving and not worth it. 
- accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())] + accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())] ## identify the number of characters that must change for the text string to match each of the possible accepted names distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,] @@ -61,7 +61,7 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, min_dist_per_c <- min(distance_c) / stringr::str_length(txt) i <- which(distance_c==min_dist_abs_c) - keep = FALSE + keep <- FALSE if( ## Within allowable number of characters (absolute) @@ -72,7 +72,7 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, #length(i) <= n_allowed ) { - for (j in 1:length(i)) { + for (j in seq_along(i)) { if (keep == TRUE) { @@ -108,18 +108,18 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, if(words_in_text == 1) { if (txt_word1_start == match_word1_start) { - keep = TRUE } + keep <- TRUE } } else if(words_in_text == 2) { if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) { - keep = TRUE } + keep <- TRUE } } else if(words_in_text > 2) { if (words_in_match > 2) { if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) { - keep = TRUE } + keep <- TRUE } } else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) { - keep = TRUE } + keep <- TRUE } } diff --git a/R/match_taxa.R b/R/match_taxa.R index 64f8c1c7..bc574df2 100644 --- a/R/match_taxa.R +++ b/R/match_taxa.R @@ -740,7 +740,7 @@ match_taxa <- function( # match_05a: fuzzy match to APC-accepted canonical name # Fuzzy match of taxon name to an APC-accepted canonical name, once filler words and punctuation are 
removed. - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -781,7 +781,7 @@ match_taxa <- function( # match_05b: fuzzy match to APC-known canonical name # Fuzzy match of taxon name to an APC-known canonical name, once filler words and punctuation are removed. - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC_synonym[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -863,7 +863,7 @@ match_taxa <- function( i <- ( stringr::str_detect(taxa$tocheck$cleaned_name, "[Aa]ff[\\.\\s]") | - stringr::str_detect(taxa$tocheck$cleaned_name, " affinis ") | + stringr::str_detect(taxa$tocheck$cleaned_name, " affinis(\\s|$)") | stringr::str_detect(taxa$tocheck$cleaned_name, " cf[\\.\\s]") ) & taxa$tocheck$genus %in% resources$genera_all2$genus @@ -1048,7 +1048,7 @@ match_taxa <- function( # For imprecise fuzzy matches, the taxon name can differ from the `APC-accepted` names by 5 characters & up to 25% of the string length. # These matches require individual review and are turned off as a default. if (imprecise_fuzzy_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC_imprecise[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -1094,7 +1094,7 @@ match_taxa <- function( # For imprecise fuzzy matches, the taxon name can differ from the `APC -known` names by 5 characters & up to 25% of the string length. # These matches require individual review and are turned off as a default. 
if (imprecise_fuzzy_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC_synonym_imprecise[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -1381,7 +1381,7 @@ match_taxa <- function( # sometimes the submitted taxon name is a valid trinomial + notes and # such names will only be aligned by matches considering only the first three words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$trinomial[i])) { taxa$tocheck$fuzzy_match_trinomial[i] <- fuzzy_match( @@ -1428,7 +1428,7 @@ match_taxa <- function( # sometimes the submitted taxon name is a valid trinomial + notes and # such names will only be aligned by matches considering only the first three words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$trinomial[i])) { taxa$tocheck$fuzzy_match_trinomial_synonym[i] <- fuzzy_match( @@ -1547,7 +1547,7 @@ match_taxa <- function( # or a valid binomial + invalid infraspecific epithet. # Such names will only be aligned by matches considering only the first two words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names. - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$binomial[i]) & is.na(taxa$tocheck$fuzzy_match_binomial[i])) { taxa$tocheck$fuzzy_match_binomial[i] <- @@ -1597,7 +1597,7 @@ match_taxa <- function( # or a valid binomial + invalid infraspecific epithet. # Such names will only be aligned by matches considering only the first two words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names. 
- for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$binomial[i]) & is.na(taxa$tocheck$fuzzy_match_binomial_APC_synonym[i])) { taxa$tocheck$fuzzy_match_binomial_APC_synonym[i] <- @@ -1648,7 +1648,7 @@ match_taxa <- function( # to avoid incorrectly aligning an APC accepted/known taxa to an APNI name. # This is especially true to accurately align phrase names. if (APNI_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APNI[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -1695,7 +1695,7 @@ match_taxa <- function( # These matches require individual review and are turned off as a default. if (APNI_matches == TRUE & imprecise_fuzzy_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APNI_imprecise[i] <- fuzzy_match( txt = taxa$tocheck$cleaned_name[i], diff --git a/R/standardise_names.R b/R/standardise_names.R index 94fd25ec..eaf39455 100644 --- a/R/standardise_names.R +++ b/R/standardise_names.R @@ -63,7 +63,7 @@ standardise_names <- function(taxon_names) { f("\\saffin(\\s|$)", " aff. ") %>% f("\\saff(\\s|$)", " aff. ") %>% f("\\saffn(\\s|$|\\.)", " aff. ") %>% - f("\\saffinis(\\s|$)", " aff. ") %>% + f("\\saffinis(\\s|$)", " aff. ") %>% ## f. not forma or form or form. or f f("\\sforma(\\s|$)", " f. ") %>% diff --git a/tests/testthat/benchmarks/test_matches_alignments_updates.csv b/tests/testthat/benchmarks/test_matches_alignments_updates.csv index bf158aaa..9071bf61 100644 --- a/tests/testthat/benchmarks/test_matches_alignments_updates.csv +++ b/tests/testthat/benchmarks/test_matches_alignments_updates.csv @@ -32,7 +32,7 @@ Driandra abc--def,match_03c,match_03c,Dryandra sp. [Driandra abc--def; test_all_ Xyystidium abc--def,match_03d,match_03d,Xystidium sp. 
[Xyystidium abc--def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. Zygiaa abc--def,match_03d,match_03d,Zygia sp. [Zygiaa abc--def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne Abcde fgh -- ijk,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA -Ryandra abc--def,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA +Ryandra abc--def,match_03e,match_03b,Randia sp. [Ryandra abc--def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA Abildgaardia odontocarpa / Abildgaardia oxystachya,match_04a,match_04a,Abildgaardia sp. [Abildgaardia odontocarpa / Abildgaardia oxystachya; test_all_matches_TRUE],APC,genus,Abildgaardia,FALSE,https://id.biodiversity.org.au/node/apni/2905759,https://id.biodiversity.org.au/name/apni/55984,Abildgaardia Vahl Acanthocarpus fimbriatus / Acanthocarpus mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / Acanthocarpus mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm. Acanthocarpus fimbriatus / mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm. @@ -52,7 +52,7 @@ Drrandra abc / def,match_04c,match_04c,Dryandra sp. [Drrandra abc / def; test_al Xyystidium abc/def,match_04d,match_04d,Xystidium sp. [Xyystidium abc/def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. Zygiaa abc / def,match_04d,match_04d,Zygia sp. 
[Zygiaa abc / def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne Abcde fgh / ijk,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA -Ryandra abc / def,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA +Ryandra abc / def,match_04e,match_04b,Randia sp. [Ryandra abc / def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA Cycas candida K.D.Hill,match_05a,match_01a,Cycas candida,APC,species,Cycas candida,TRUE,https://id.biodiversity.org.au/node/apni/2893335,https://id.biodiversity.org.au/name/apni/188177,Cycas candida K.D.Hill Eremophila papillata Chinnock,match_05a,match_01a,Eremophila papillata,APC,species,Eremophila papillata,TRUE,https://id.biodiversity.org.au/node/apni/2910890,https://id.biodiversity.org.au/name/apni/207453,Eremophila papillata Chinnock Acalypha indica var. australis F.M.Bailey,match_05b,match_01b,Acalypha indica var. australis,APC,variety,Acalypha lanceolata,TRUE,https://id.biodiversity.org.au/instance/apni/889946,https://id.biodiversity.org.au/name/apni/72588,Acalypha indica var. australis F.M.Bailey @@ -155,7 +155,7 @@ Drrandra x def,match_11c,match_08c,Dryandra x [Drrandra x def; test_all_matches_ Xyystidium x def,match_11d,match_08d,Xystidium x [Xyystidium x def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. Zygiaa abc x Zygia def,match_11d,match_08d,Zygia x [Zygiaa abc x Zygia def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne Abcde fgh x ijk,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA -Ryandra abc x def,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA +Ryandra abc x def,match_11e,match_08b,Randia x [Ryandra abc x def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA Baeckea sp. murchison river,match_12a,match_09a,Baeckea sp. Murchison River (M.E.Trudgen 12009),APC,species,Baeckea sp. 
Murchison River (M.E.Trudgen 12009),TRUE,https://id.biodiversity.org.au/node/apni/2888052,https://id.biodiversity.org.au/name/apni/191267,Baeckea sp. Murchison River (M.E.Trudgen 12009) WA Herbarium Eremophila oppositifolia rubra (needle leaves),match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock Eremophila oppositifolia rubra early collection,match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock diff --git a/tests/testthat/test-alignment_results.R b/tests/testthat/test-alignment_results.R index 6bd71749..ae71d74f 100644 --- a/tests/testthat/test-alignment_results.R +++ b/tests/testthat/test-alignment_results.R @@ -9,20 +9,6 @@ test_that("consistency with previous runs", { taxa <- c( - "Banksia integrifolia", - "Acacia longifolia", - "Commersonia rosea", - "Thelymitra pauciflora", - "Justicia procumbens", - "Hibbertia stricta", - "Rostellularia adscendens", - "Hibbertia sericea", - "Hibbertia sp.", - "Athrotaxis laxiflolia", - "Genoplesium insigne", - "Polypogon viridis", - "Acacia aneura", - "Acacia paraneura", "Galactia striata" )