Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update fuzzy_match.R #181

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion R/align_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#' @param fuzzy_rel_dist The proportion of characters allowed to be different for a fuzzy match.
#' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` and `fuzzy_rel_dist`
#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default.
#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default.
#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned on as a default.
#' @param identifier A dataset, location or other identifier, which defaults to NA.
#'
#' @return A tibble with columns that include original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code.
Expand Down
4 changes: 2 additions & 2 deletions R/create_species_state_origin_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ identify_places <- function(sep_state_data) {
#' @noRd
create_species_df <- function(apc_places, apc_species) {
species_df <- dplyr::tibble(species = apc_species$canonical_name)
for (i in 1:length(apc_places)) {
for (i in seq_along(apc_places)) {
species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal")
}
names(species_df) <- c("species", apc_places)
Expand All @@ -76,7 +76,7 @@ state_parse_and_add_column <- function(species_df, state, apc_species) {

#' @noRd
parse_states <- function(species_df, apc_places, apc_species) {
for (i in 1:length(apc_places)) {
for (i in seq_along(apc_places)) {
species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species)
}
return(species_df)
Expand Down
2 changes: 1 addition & 1 deletion R/create_taxonomic_update_lookup.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' @param full logical for whether the full lookup table is returned or just key columns
#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.
#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default.
#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default.
#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned on as a default.
#' @param identifier A dataset, location or other identifier, which defaults to NA.
#' @param output file path to save the intermediate output to
#' @return A lookup table containing the accepted and suggested names for each original name input, and additional taxonomic information such as taxon rank, taxonomic status, taxon IDs and genera.
Expand Down
111 changes: 66 additions & 45 deletions R/fuzzy_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
txt_word3_start <- stringr::str_extract(word(txt,3), "[:alpha:]|[:digit:]")
}

## subset accepted list to taxa that begin with the same first letter to reduce the number of fuzzy matches that are made in the next step.
## has also wanted to do this for the second word, but then need to separate different lists of reference names - smaller time saving and not worth it.
# accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% stringr::str_to_lower() == txt_word1_start %>% stringr::str_to_lower())]

## identify the number of characters that must change for the text string to match each of the possible accepted names
distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,]

Expand All @@ -57,66 +61,83 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel,
min_dist_per_c <- min(distance_c) / stringr::str_length(txt)

i <- which(distance_c==min_dist_abs_c)
keep <- FALSE

if(
## Within allowable number of characters (absolute)
min_dist_abs_c <= max_distance_abs &
## Within allowable number of characters (relative)
min_dist_per_c <= max_distance_rel &
min_dist_per_c <= max_distance_rel #&
## Is a unique solution
length(i)<= n_allowed
) {
## identify number of words in the matched string
words_in_match <- 1 + stringr::str_count(accepted_list[i]," ")

## identify the first letter of the first word in the matched string
match_word1_start <- stringr::str_extract(accepted_list[i], "[:alpha:]")
#length(i) <= n_allowed
) {

## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
if(words_in_text > 1 & epithet_letters == 2) {
if(nchar(word(accepted_list[i],2)) == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
for (j in seq_len(i)) {

if (keep == TRUE) {

break()

} else {
match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:][:alpha:]|[:digit:]")
}
}
## identify number of words in the matched string
words_in_match <- 1 + stringr::str_count(accepted_list[i][j]," ")

## identify the first letter of the first word in the matched string
match_word1_start <- stringr::str_extract(accepted_list[i][j], "[:alpha:]")

## identify the first letter of the second word in the matched string (if the matched string includes 2+ words)
if(words_in_text > 1 & epithet_letters == 2) {
if(nchar(word(accepted_list[i][j],2)) == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
} else {
match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:][:alpha:]|[:digit:]")
}
}

if(words_in_text > 1 & epithet_letters == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i][j],2), "[:alpha:]|[:digit:]")
}

if(words_in_text > 1 & epithet_letters == 1) {
match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]")
}

## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
if(words_in_text > 2) {
match_word3_start <- stringr::str_extract(word(accepted_list[i],3), "[:alpha:]|[:digit:]")
}
## identify the first letter of the third word in the matched string (if the matched string includes 3+ words)
if(words_in_text > 2) {
match_word3_start <- stringr::str_extract(word(accepted_list[i][j],3), "[:alpha:]|[:digit:]")
}

## keep match if the first letters of the first three words (or fewer if applicable) in the string to match
## are identical to the first letters of the first three words in the matched string

keep = FALSE
if(words_in_text == 1) {
if (txt_word1_start == match_word1_start) {
keep <- TRUE }

} else if(words_in_text == 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep <- TRUE }

} else if(words_in_text > 2) {
if (words_in_match > 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
keep <- TRUE }
} else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep <- TRUE }
}

## keep match if the first letters of the first three words (or fewer if applicable) in the string to match
## are identical to the first letters of the first three words in the matched string

if(keep == TRUE) {

return(accepted_list[i][j])

}

return(NA)
}

if(words_in_text == 1) {
if (txt_word1_start == match_word1_start) {
keep = TRUE }

} else if(words_in_text == 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }

} else if(words_in_text > 2) {
if (words_in_match > 2) {
if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) {
keep = TRUE }
} else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) {
keep = TRUE }
return(NA)
}

if(keep == TRUE) {

return(accepted_list[i])

}
return(NA)
}

return(NA)
}

22 changes: 11 additions & 11 deletions R/match_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ match_taxa <- function(

# match_05a: fuzzy match to APC-accepted canonical name
# Fuzzy match of taxon name to an APC-accepted canonical name, once filler words and punctuation are removed.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -781,7 +781,7 @@ match_taxa <- function(

# match_05b: fuzzy match to APC-known canonical name
# Fuzzy match of taxon name to an APC-known canonical name, once filler words and punctuation are removed.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC_synonym[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -863,7 +863,7 @@ match_taxa <- function(
i <-
(
stringr::str_detect(taxa$tocheck$cleaned_name, "[Aa]ff[\\.\\s]") |
stringr::str_detect(taxa$tocheck$cleaned_name, " affinis ") |
stringr::str_detect(taxa$tocheck$cleaned_name, " affinis[\\s|$]") |
stringr::str_detect(taxa$tocheck$cleaned_name, " cf[\\.\\s]")
) &
taxa$tocheck$genus %in% resources$genera_all2$genus
Expand Down Expand Up @@ -1048,7 +1048,7 @@ match_taxa <- function(
# For imprecise fuzzy matches, the taxon name can differ from the `APC-accepted` names by 5 characters & up to 25% of the string length.
# These matches require individual review and are turned off as a default.
if (imprecise_fuzzy_matches == TRUE) {
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC_imprecise[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -1094,7 +1094,7 @@ match_taxa <- function(
# For imprecise fuzzy matches, the taxon name can differ from the `APC -known` names by 5 characters & up to 25% of the string length.
# These matches require individual review and are turned off as a default.
if (imprecise_fuzzy_matches == TRUE) {
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APC_synonym_imprecise[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -1381,7 +1381,7 @@ match_taxa <- function(
# sometimes the submitted taxon name is a valid trinomial + notes and
# such names will only be aligned by matches considering only the first three words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$trinomial[i])) {
taxa$tocheck$fuzzy_match_trinomial[i] <-
fuzzy_match(
Expand Down Expand Up @@ -1428,7 +1428,7 @@ match_taxa <- function(
# sometimes the submitted taxon name is a valid trinomial + notes and
# such names will only be aligned by matches considering only the first three words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$trinomial[i])) {
taxa$tocheck$fuzzy_match_trinomial_synonym[i] <-
fuzzy_match(
Expand Down Expand Up @@ -1547,7 +1547,7 @@ match_taxa <- function(
# or a valid binomial + invalid infraspecific epithet.
# Such names will only be aligned by matches considering only the first two words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$binomial[i]) &
is.na(taxa$tocheck$fuzzy_match_binomial[i])) {
taxa$tocheck$fuzzy_match_binomial[i] <-
Expand Down Expand Up @@ -1597,7 +1597,7 @@ match_taxa <- function(
# or a valid binomial + invalid infraspecific epithet.
# Such names will only be aligned by matches considering only the first two words of the stripped name.
# This match also does a good job aligning and correcting syntax of phrase names.
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
if (!is.na(taxa$tocheck$binomial[i]) &
is.na(taxa$tocheck$fuzzy_match_binomial_APC_synonym[i])) {
taxa$tocheck$fuzzy_match_binomial_APC_synonym[i] <-
Expand Down Expand Up @@ -1648,7 +1648,7 @@ match_taxa <- function(
# to avoid incorrectly aligning an APC accepted/known taxa to an APNI name.
# This is especially true to accurately align phrase names.
if (APNI_matches == TRUE) {
for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APNI[i] <-
fuzzy_match(
txt = taxa$tocheck$stripped_name[i],
Expand Down Expand Up @@ -1695,7 +1695,7 @@ match_taxa <- function(
# These matches require individual review and are turned off as a default.
if (APNI_matches == TRUE & imprecise_fuzzy_matches == TRUE) {

for (i in 1:nrow(taxa$tocheck)) {
for (i in seq_len(nrow(taxa$tocheck))) {
taxa$tocheck$fuzzy_match_cleaned_APNI_imprecise[i] <-
fuzzy_match(
txt = taxa$tocheck$cleaned_name[i],
Expand Down
2 changes: 1 addition & 1 deletion R/standardise_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ standardise_names <- function(taxon_names) {
f("\\saffin(\\s|$)", " aff. ") %>%
f("\\saff(\\s|$)", " aff. ") %>%
f("\\saffn(\\s|$|\\.)", " aff. ") %>%
f("\\saffinis(\\s|$)", " aff. ") %>%
f("\\saffinis(\\s)", " aff. ") %>%

## f. not forma or form or form. or f
f("\\sforma(\\s|$)", " f. ") %>%
Expand Down
6 changes: 3 additions & 3 deletions tests/testthat/benchmarks/test_matches_alignments_updates.csv
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Driandra abc--def,match_03c,match_03c,Dryandra sp. [Driandra abc--def; test_all_
Xyystidium abc--def,match_03d,match_03d,Xystidium sp. [Xyystidium abc--def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
Zygiaa abc--def,match_03d,match_03d,Zygia sp. [Zygiaa abc--def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
Abcde fgh -- ijk,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc--def,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc--def,match_03e,match_03b,Randia sp. [Ryandra abc--def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
Abildgaardia odontocarpa / Abildgaardia oxystachya,match_04a,match_04a,Abildgaardia sp. [Abildgaardia odontocarpa / Abildgaardia oxystachya; test_all_matches_TRUE],APC,genus,Abildgaardia,FALSE,https://id.biodiversity.org.au/node/apni/2905759,https://id.biodiversity.org.au/name/apni/55984,Abildgaardia Vahl
Acanthocarpus fimbriatus / Acanthocarpus mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / Acanthocarpus mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm.
Acanthocarpus fimbriatus / mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm.
Expand All @@ -52,7 +52,7 @@ Drrandra abc / def,match_04c,match_04c,Dryandra sp. [Drrandra abc / def; test_al
Xyystidium abc/def,match_04d,match_04d,Xystidium sp. [Xyystidium abc/def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
Zygiaa abc / def,match_04d,match_04d,Zygia sp. [Zygiaa abc / def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
Abcde fgh / ijk,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc / def,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc / def,match_04e,match_04b,Randia sp. [Ryandra abc / def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
Cycas candida K.D.Hill,match_05a,match_01a,Cycas candida,APC,species,Cycas candida,TRUE,https://id.biodiversity.org.au/node/apni/2893335,https://id.biodiversity.org.au/name/apni/188177,Cycas candida K.D.Hill
Eremophila papillata Chinnock,match_05a,match_01a,Eremophila papillata,APC,species,Eremophila papillata,TRUE,https://id.biodiversity.org.au/node/apni/2910890,https://id.biodiversity.org.au/name/apni/207453,Eremophila papillata Chinnock
Acalypha indica var. australis F.M.Bailey,match_05b,match_01b,Acalypha indica var. australis,APC,variety,Acalypha lanceolata,TRUE,https://id.biodiversity.org.au/instance/apni/889946,https://id.biodiversity.org.au/name/apni/72588,Acalypha indica var. australis F.M.Bailey
Expand Down Expand Up @@ -155,7 +155,7 @@ Drrandra x def,match_11c,match_08c,Dryandra x [Drrandra x def; test_all_matches_
Xyystidium x def,match_11d,match_08d,Xystidium x [Xyystidium x def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin.
Zygiaa abc x Zygia def,match_11d,match_08d,Zygia x [Zygiaa abc x Zygia def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne
Abcde fgh x ijk,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc x def,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA
Ryandra abc x def,match_11e,match_08b,Randia x [Ryandra abc x def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA
Baeckea sp. murchison river,match_12a,match_09a,Baeckea sp. Murchison River (M.E.Trudgen 12009),APC,species,Baeckea sp. Murchison River (M.E.Trudgen 12009),TRUE,https://id.biodiversity.org.au/node/apni/2888052,https://id.biodiversity.org.au/name/apni/191267,Baeckea sp. Murchison River (M.E.Trudgen 12009) WA Herbarium
Eremophila oppositifolia rubra (needle leaves),match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock
Eremophila oppositifolia rubra early collection,match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock
Expand Down
14 changes: 0 additions & 14 deletions tests/testthat/test-alignment_results.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,6 @@ test_that("consistency with previous runs", {

taxa <-
c(
"Banksia integrifolia",
"Acacia longifolia",
"Commersonia rosea",
"Thelymitra pauciflora",
"Justicia procumbens",
"Hibbertia stricta",
"Rostellularia adscendens",
"Hibbertia sericea",
"Hibbertia sp.",
"Athrotaxis laxiflolia",
"Genoplesium insigne",
"Polypogon viridis",
"Acacia aneura",
"Acacia paraneura",
"Galactia striata"
)

Expand Down
Loading