Skip to content

Commit

Permalink
changes following reviews
Browse files Browse the repository at this point in the history
  • Loading branch information
ehwenk committed Apr 10, 2024
1 parent 519f46f commit 576696c
Show file tree
Hide file tree
Showing 8 changed files with 26 additions and 40 deletions.
2 changes: 1 addition & 1 deletion R/align_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#' @param fuzzy_rel_dist The proportion of characters allowed to be different for a fuzzy match.
#' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` and `fuzzy_rel_dist`
#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default.
#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default.
#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned on as a default.
#' @param identifier A dataset, location or other identifier, which defaults to NA.
#'
#' @return A tibble with columns that include original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code.
Expand Down
4 changes: 2 additions & 2 deletions R/create_species_state_origin_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ identify_places <- function(sep_state_data) {
#' @noRd
create_species_df <- function(apc_places, apc_species) {
species_df <- dplyr::tibble(species = apc_species$canonical_name)
for (i in 1:length(apc_places)) {
for (i in seq_along(apc_places)) {
species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal")
}
names(species_df) <- c("species", apc_places)
Expand All @@ -76,7 +76,7 @@ state_parse_and_add_column <- function(species_df, state, apc_species) {

#' @noRd
parse_states <- function(species_df, apc_places, apc_species) {
for (i in 1:length(apc_places)) {
for (i in seq_along(apc_places)) {
species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species)
}
return(species_df)
Expand Down
2 changes: 1 addition & 1 deletion R/create_taxonomic_update_lookup.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' @param full logical for whether the full lookup table is returned or just key columns
#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.
#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default.
#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default.
#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned on as a default.
#' @param identifier A dataset, location or other identifier, which defaults to NA.
#' @param output file path to save the intermediate output to
#' @return A lookup table containing the accepted and suggested names for each original name input, and additional taxonomic information such as taxon rank, taxonomic status, taxon IDs and genera.
Expand Down
14 changes: 7 additions & 7 deletions R/fuzzy_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,

## subset accepted list to taxa that begin with the same first letter to reduce the number of fuzzy matches that are made in the next step.
## has also wanted to do this for the second word, but then need to separate different lists of reference names - smaller time saving and not worth it.
accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())]
# accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())]

## identify the number of characters that must change for the text string to match each of the possible accepted names
distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,]
Expand All @@ -61,7 +61,7 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
min_dist_per_c <- min(distance_c) / stringr::str_length(txt)

i <- which(distance_c==min_dist_abs_c)
keep = FALSE
keep <- FALSE

if(
## Within allowable number of characters (absolute)
Expand All @@ -72,7 +72,7 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
#length(i) <= n_allowed
) {

for (j in 1:length(i)) {
for (j in seq_length(i)) {

if (keep == TRUE) {

Expand Down Expand Up @@ -108,18 +108,18 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,

if(words_in_text == 1) {
if (txt_word1_start == match_word1_start) {
keep = TRUE }
keep <- TRUE }

} else if(words_in_text == 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }
keep <- TRUE }

} else if(words_in_text > 2) {
if (words_in_match > 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
keep = TRUE }
keep <- TRUE }
} else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }
keep <- TRUE }
}


Expand Down
22 changes: 11 additions & 11 deletions R/match_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ match_taxa <- function(

# match_05a: fuzzy match to APC-accepted canonical name
# Fuzzy match of taxon name to an APC-accepted canonical name, once filler words and punctuation are removed.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -781,7 +781,7 @@ match_taxa <- function(

# match_05b: fuzzy match to APC-known canonical name
# Fuzzy match of taxon name to an APC-known canonical name, once filler words and punctuation are removed.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC_synonym[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -863,7 +863,7 @@ match_taxa <- function(
i <-
(
stringr::str_detect(taxa$tocheck$cleaned_name, "[Aa]ff[\\.\\s]") |
stringr::str_detect(taxa$tocheck$cleaned_name, " affinis ") |
stringr::str_detect(taxa$tocheck$cleaned_name, " affinis[\\s|$]") |
stringr::str_detect(taxa$tocheck$cleaned_name, " cf[\\.\\s]")
) &
taxa$tocheck$genus %in% resources$genera_all2$genus
Expand Down Expand Up @@ -1048,7 +1048,7 @@ match_taxa <- function(
# For imprecise fuzzy matches, the taxon name can differ from the `APC-accepted` names by 5 characters & up to 25% of the string length.
# These matches require individual review and are turned off as a default.
if (imprecise_fuzzy_matches == TRUE) {
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC_imprecise[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -1094,7 +1094,7 @@ match_taxa <- function(
# For imprecise fuzzy matches, the taxon name can differ from the `APC -known` names by 5 characters & up to 25% of the string length.
# These matches require individual review and are turned off as a default.
if (imprecise_fuzzy_matches == TRUE) {
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC_synonym_imprecise[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -1381,7 +1381,7 @@ match_taxa <- function(
# sometimes the submitted taxon name is a valid trinomial + notes and
# such names will only be aligned by matches considering only the first three words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$trinomial[i])) {
taxa$tocheck$fuzzy_match_trinomial[i] <-
fuzzy_match(
Expand Down Expand Up @@ -1428,7 +1428,7 @@ match_taxa <- function(
# sometimes the submitted taxon name is a valid trinomial + notes and
# such names will only be aligned by matches considering only the first three words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$trinomial[i])) {
taxa$tocheck$fuzzy_match_trinomial_synonym[i] <-
fuzzy_match(
Expand Down Expand Up @@ -1547,7 +1547,7 @@ match_taxa <- function(
# or a valid binomial + invalid infraspecific epithet.
# Such names will only be aligned by matches considering only the first two words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$binomial[i]) &
is.na(taxa$tocheck$fuzzy_match_binomial[i])) {
taxa$tocheck$fuzzy_match_binomial[i] <-
Expand Down Expand Up @@ -1597,7 +1597,7 @@ match_taxa <- function(
# or a valid binomial + invalid infraspecific epithet.
# Such names will only be aligned by matches considering only the first two words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$binomial[i]) &
is.na(taxa$tocheck$fuzzy_match_binomial_APC_synonym[i])) {
taxa$tocheck$fuzzy_match_binomial_APC_synonym[i] <-
Expand Down Expand Up @@ -1648,7 +1648,7 @@ match_taxa <- function(
# to avoid incorrectly aligning an APC accepted/known taxa to an APNI name.
# This is especially true to accurately align phrase names.
if (APNI_matches == TRUE) {
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APNI[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -1695,7 +1695,7 @@ match_taxa <- function(
# These matches require individual review and are turned off as a default.
if (APNI_matches == TRUE & imprecise_fuzzy_matches == TRUE) {

for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_along(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APNI_imprecise[i] <-
fuzzy_match(
txt = taxa$tocheck$cleaned_name[i],
Expand Down
2 changes: 1 addition & 1 deletion R/standardise_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ standardise_names <- function(taxon_names) {
f("\\saffin(\\s|$)", " aff. ") %>%
f("\\saff(\\s|$)", " aff. ") %>%
f("\\saffn(\\s|$|\\.)", " aff. ") %>%
f("\\saffinis(\\s|$)", " aff. ") %>%
f("\\saffinis(\\s)", " aff. ") %>%

## f. not forma or form or form. or f
f("\\sforma(\\s|$)", " f. ") %>%
Expand Down
6 changes: 3 additions & 3 deletions tests/testthat/benchmarks/test_matches_alignments_updates.csv
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Driandra abc--def,match_03c,match_03c,Dryandra sp. [Driandra abc--def; test_all_
Xyystidium abc--def,match_03d,match_03d,Xystidium sp. [Xyystidium abc--def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
Zygiaa abc--def,match_03d,match_03d,Zygia sp. [Zygiaa abc--def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
Abcde fgh -- ijk,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc--def,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc--def,match_03e,match_03b,Randia sp. [Ryandra abc--def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
Abildgaardia odontocarpa / Abildgaardia oxystachya,match_04a,match_04a,Abildgaardia sp. [Abildgaardia odontocarpa / Abildgaardia oxystachya; test_all_matches_TRUE],APC,genus,Abildgaardia,FALSE,https://id.biodiversity.org.au/node/apni/2905759,https://id.biodiversity.org.au/name/apni/55984,Abildgaardia Vahl
Acanthocarpus fimbriatus / Acanthocarpus mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / Acanthocarpus mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm.
Acanthocarpus fimbriatus / mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm.
Expand All @@ -52,7 +52,7 @@ Drrandra abc / def,match_04c,match_04c,Dryandra sp. [Drrandra abc / def; test_al
Xyystidium abc/def,match_04d,match_04d,Xystidium sp. [Xyystidium abc/def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
Zygiaa abc / def,match_04d,match_04d,Zygia sp. [Zygiaa abc / def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
Abcde fgh / ijk,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc / def,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc / def,match_04e,match_04b,Randia sp. [Ryandra abc / def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
Cycas candida K.D.Hill,match_05a,match_01a,Cycas candida,APC,species,Cycas candida,TRUE,https://id.biodiversity.org.au/node/apni/2893335,https://id.biodiversity.org.au/name/apni/188177,Cycas candida K.D.Hill
Eremophila papillata Chinnock,match_05a,match_01a,Eremophila papillata,APC,species,Eremophila papillata,TRUE,https://id.biodiversity.org.au/node/apni/2910890,https://id.biodiversity.org.au/name/apni/207453,Eremophila papillata Chinnock
Acalypha indica var. australis F.M.Bailey,match_05b,match_01b,Acalypha indica var. australis,APC,variety,Acalypha lanceolata,TRUE,https://id.biodiversity.org.au/instance/apni/889946,https://id.biodiversity.org.au/name/apni/72588,Acalypha indica var. australis F.M.Bailey
Expand Down Expand Up @@ -155,7 +155,7 @@ Drrandra x def,match_11c,match_08c,Dryandra x [Drrandra x def; test_all_matches_
Xyystidium x def,match_11d,match_08d,Xystidium x [Xyystidium x def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
Zygiaa abc x Zygia def,match_11d,match_08d,Zygia x [Zygiaa abc x Zygia def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
Abcde fgh x ijk,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc x def,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc x def,match_11e,match_08b,Randia x [Ryandra abc x def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
Baeckea sp. murchison river,match_12a,match_09a,Baeckea sp. Murchison River (M.E.Trudgen 12009),APC,species,Baeckea sp. Murchison River (M.E.Trudgen 12009),TRUE,https://id.biodiversity.org.au/node/apni/2888052,https://id.biodiversity.org.au/name/apni/191267,Baeckea sp. Murchison River (M.E.Trudgen 12009) WA Herbarium
Eremophila oppositifolia rubra (needle leaves),match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock
Eremophila oppositifolia rubra early collection,match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock
Expand Down
14 changes: 0 additions & 14 deletions tests/testthat/test-alignment_results.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,6 @@ test_that("consistency with previous runs", {

taxa <-
c(
"Banksia integrifolia",
"Acacia longifolia",
"Commersonia rosea",
"Thelymitra pauciflora",
"Justicia procumbens",
"Hibbertia stricta",
"Rostellularia adscendens",
"Hibbertia sericea",
"Hibbertia sp.",
"Athrotaxis laxiflolia",
"Genoplesium insigne",
"Polypogon viridis",
"Acacia aneura",
"Acacia paraneura",
"Galactia striata"
)

Expand Down

0 comments on commit 576696c

Please sign in to comment.