Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update build_update_taxon_list.R #789

Merged
merged 2 commits into from
Nov 23, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 125 additions & 92 deletions R/build_update_taxon_list.R
Original file line number Diff line number Diff line change
@@ -1,63 +1,80 @@
build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {

resources <- APCalign::load_taxonomic_resources()
ranks_in_database <- unique(austraits$taxa$taxon_rank)

higher_ranks <- c("phylum", "class", "order", "family", "genus")
highest_ranks <- c("phylum", "class", "order")
higher_ranks_taxon_list <- higher_ranks[higher_ranks %in% unique(austraits$taxa$taxon_rank)]
highest_rank_column <- higher_ranks_taxon_list[1]
highest_ranks_taxon_list <- highest_ranks[highest_ranks %in% unique(austraits$taxa$taxon_rank)]

# Create reduced APC list, just including a few columns and taxon ranks relevant to AusTraits
APC_tmp <- resources$APC %>%
select(canonical_name, taxon_rank) %>%
mutate(taxonomic_dataset_APC = "APC") %>%
filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
rename(aligned_name = canonical_name, taxon_rank_APC = taxon_rank)
APC_tmp <- resources$APC %>%
dplyr::select(dplyr::all_of(c("canonical_name", "taxon_rank"))) %>%
dplyr::mutate(taxonomic_dataset_APC = "APC") %>%
dplyr::filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
dplyr::rename(dplyr::all_of(c("aligned_name" = "canonical_name", "taxon_rank_APC" = "taxon_rank")))

# Create reduced APNI list, just including a few columns and taxon ranks relevant to AusTraits
APNI_tmp <- resources$APNI %>%
select(canonical_name, taxon_rank) %>%
mutate(taxonomic_dataset_APNI = "APNI") %>%
filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
rename(aligned_name = canonical_name, taxon_rank_APNI = taxon_rank)
APNI_tmp <- resources$APNI %>%
dplyr::select(dplyr::all_of(c("canonical_name", "taxon_rank"))) %>%
dplyr::mutate(taxonomic_dataset_APNI = "APNI") %>%
dplyr::filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
dplyr::rename(dplyr::all_of(c("aligned_name" = "canonical_name", "taxon_rank_APNI" = "taxon_rank")))

# List of taxa that are explicitly excluded in metadata - don't want these in the taxon_list
# These should be excluded from `taxonomic_updates` table during processing, but good to check
excluded_in_metadata <- austraits$excluded_data %>% filter(error == "Observation excluded in metadata") %>% distinct(original_name)

excluded_in_metadata <- austraits$excluded_data %>%
dplyr::filter(error == "Observation excluded in metadata") %>%
dplyr::distinct(original_name)

# Start with taxonomic_updates table, which is all original names, aligned names, by dataset
all_taxa <-
all_taxa <-
austraits$taxonomic_updates %>%
dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxonomic_resolution"))) %>%
# In case the same `original_name`, `aligned_name` combination occurs twice, but only once with `taxonomic_resolution` attached, arrange names, taxon_ranks
# In case the same `original_name`, `aligned_name` combination occurs twice, but only once with
# `taxonomic_resolution` attached, arrange names, taxon_ranks
dplyr::arrange(aligned_name, taxonomic_resolution) %>%
# Keep unique names
dplyr::distinct(original_name, aligned_name, .keep_all = TRUE) %>%
# Need to merge in taxon_ranks for taxa that have not had a taxonomic alignment added in the metadata file
dplyr::left_join(by = "aligned_name",
austraits$taxa %>%
dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>%
dplyr::rename(aligned_name = taxon_name, taxon_rank_taxa = taxon_rank) %>%
arrange(aligned_name, taxon_rank_taxa) %>%
distinct(aligned_name, .keep_all = TRUE)
dplyr::left_join(
by = "aligned_name",
austraits$taxa %>%
dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>%
dplyr::rename(dplyr::all_of(c("aligned_name" = "taxon_name", "taxon_rank_taxa" = "taxon_rank"))) %>%
dplyr::arrange(aligned_name, taxon_rank_taxa) %>%
dplyr::distinct(aligned_name, .keep_all = TRUE)
) %>%
# For taxa with: 1) taxon_rank = genus, family, or 2) notes in []
# Need to mutate a separate column with `name_to_match_to` for joining in other columns
dplyr::mutate(
name_to_match_to = aligned_name,
taxonomic_resolution = ifelse(is.na(taxonomic_resolution), taxon_rank_taxa, taxonomic_resolution),
name_to_match_to =
ifelse(taxonomic_resolution %in% c("family", "genus"),
ifelse(stringr::word(name_to_match_to, 1) == "x", stringr::word(name_to_match_to, start = 1, end = 2), stringr::word(name_to_match_to, 1)),
name_to_match_to),
name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"), stringr::word(name_to_match_to, start = 1, end = 2), name_to_match_to),
name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"), stringr::word(name_to_match_to, start = 1, end = 3), name_to_match_to),
name_to_match_to =
ifelse(
taxonomic_resolution %in% c("family", "genus"),
ifelse(
stringr::word(name_to_match_to, 1) == "x",
stringr::word(name_to_match_to, start = 1, end = 2),
stringr::word(name_to_match_to, 1)),
name_to_match_to),
name_to_match_to = ifelse(
stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"),
stringr::word(name_to_match_to, start = 1, end = 2),
name_to_match_to),
name_to_match_to = ifelse(
stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"),
stringr::word(name_to_match_to, start = 1, end = 3),
name_to_match_to),
) %>%
# Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet present for most names where original_name = aligned_name
dplyr::left_join(APC_tmp %>% rename(name_to_match_to = aligned_name) %>% distinct(name_to_match_to, .keep_all = TRUE)) %>%
dplyr::left_join(APNI_tmp %>% rename(name_to_match_to = aligned_name) %>% distinct(name_to_match_to, .keep_all = TRUE)) %>%
# Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet present for
# most names where original_name = aligned_name
dplyr::left_join(APC_tmp %>% dplyr::rename("name_to_match_to" = "aligned_name") %>%
dplyr::distinct(name_to_match_to, .keep_all = TRUE)) %>%
dplyr::left_join(APNI_tmp %>% dplyr::rename("name_to_match_to" = "aligned_name") %>%
dplyr::distinct(name_to_match_to, .keep_all = TRUE)) %>%
dplyr::mutate(
taxon_rank = taxon_rank_APC,
taxon_rank = ifelse(is.na(taxon_rank), taxon_rank_APNI, taxon_rank),
Expand All @@ -67,37 +84,41 @@ build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
) %>%
# Remove taxa that are excluded in metadata
dplyr::filter(!(original_name %in% excluded_in_metadata$original_name & is.na(taxon_rank)))

# Filter out taxa that need to be run through `APCalign::align_taxa()` first
taxa_for_taxon_list <- all_taxa %>%
filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>%
select(original_name, aligned_name, taxon_rank, taxonomic_dataset) %>%
mutate(aligned_reason = NA)
dplyr::filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>%
dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxon_rank", "taxonomic_dataset"))) %>%
dplyr::mutate(aligned_reason = NA)

# Use function `APCalign::update_taxonomy` to update names and add identifier columns
updated <- APCalign::update_taxonomy(taxa_for_taxon_list, resources = resources)
taxon_list_new <- updated %>%

taxon_list_new <- updated %>%
# Remove columns from APCalign's output that aren't needed
dplyr::select(-dplyr::any_of(c("accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned", "update_reason", "aligned_reason", "scientific_name_authorship"))) %>%
dplyr::select(-dplyr::any_of(
c("accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned",
"update_reason", "aligned_reason", "scientific_name_authorship"))) %>%
# Rename columns to match AusTraits conventions
dplyr::rename(dplyr::all_of(c(
"taxon_name" = "suggested_name",
"aligned_name" = "aligned_name",
"taxonomic_dataset" = "taxonomic_dataset",
"taxon_id" = "taxon_ID",
"scientific_name_id" = "scientific_name_ID",
"taxon_id_genus" = "taxon_ID_genus"
"taxon_name" = "suggested_name",
"aligned_name" = "aligned_name",
"taxonomic_dataset" = "taxonomic_dataset",
"taxon_id" = "taxon_ID",
"scientific_name_id" = "scientific_name_ID",
"taxon_id_genus" = "taxon_ID_genus"
))) %>%
# In AusTraits we also want to document identifiers for `aligned_names`, not just for the final `taxon_name`
# We do this by rejoining columns from APC, but now to the aligned_names, not the taxon_names
dplyr::left_join(by = c("aligned_name", "taxon_name"),
resources$APC %>%
resources$APC %>%
dplyr::mutate(
accepted_name = resources$`APC list (accepted)`$canonical_name[match(accepted_name_usage_ID, resources$`APC list (accepted)`$taxon_ID)],
taxon_name = accepted_name
) %>%
dplyr::select(dplyr::all_of(c("scientific_name_ID", "taxonomic_status", "taxon_ID", "accepted_name", "taxon_name", "canonical_name"))) %>%
) %>%
dplyr::select(dplyr::all_of(c(
"scientific_name_ID", "taxonomic_status", "taxon_ID",
"accepted_name", "taxon_name", "canonical_name"))) %>%
dplyr::rename(dplyr::all_of(c(
"aligned_name_taxonomic_status" = "taxonomic_status",
"aligned_name_taxon_id" = "taxon_ID",
Expand All @@ -106,15 +127,16 @@ build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
))) %>%
dplyr::distinct(aligned_name, taxon_name, .keep_all = TRUE)
) %>%
# For taxon names that are aligned at the genus- or family-level, we need to replace the taxon & scientific name identifiers
# with those for the relevant genus or family.
# For taxon names that are aligned at the genus- or family-level, we need to replace the taxon & scientific
# name identifiers with those for the relevant genus or family
dplyr::mutate(
# Genus filled in for all names that have a taxonomic of genus or more detailed
genus = ifelse(
!.data$taxon_rank %in% c("family", "order", "class", "phylum", "kingdom"),
ifelse(stringr::word(.data$taxon_name, 1) == "x",
stringr::word(.data$taxon_name, start = 1, end = 2),
stringr::word(.data$taxon_name, 1)),
ifelse(
stringr::word(.data$taxon_name, 1) == "x",
stringr::word(.data$taxon_name, start = 1, end = 2),
stringr::word(.data$taxon_name, 1)),
NA),
taxon_rank = ifelse(
taxonomic_status == "unknown" & aligned_name %in% austraits$taxonomic_updates$aligned_name,
Expand All @@ -129,79 +151,90 @@ build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
resources$`APC list (accepted)`$taxon_rank[match(taxon_id, resources$`APC list (accepted)`$taxon_ID)],
taxon_rank
)),
# For taxon names that are valid names (per herbarium standards) or repeatedly reported invasives, but not in APC/APNI, map on families, genus_ids - APCalign doesn't do this
# For taxon names that are valid names (per herbarium standards) or repeatedly reported invasives, but
# not in APC/APNI, map on families, genus_ids - APCalign doesn't do this
taxon_id_genus = ifelse(
taxonomic_status == "unknown" & aligned_name %in% austraits$taxonomic_updates$aligned_name,
resources$genera_all$taxon_ID[match(genus, resources$genera_all$genus)],
taxon_id_genus
resources$genera_all$taxon_ID[match(genus, resources$genera_all$genus)], taxon_id_genus
),
family = ifelse(
taxonomic_status == "unknown" & aligned_name %in% austraits$taxonomic_updates$aligned_name,
resources$APC$family[match(genus, resources$APC$genus)],
family
resources$APC$family[match(genus, resources$APC$genus)], family
),
taxon_id_family = resources$family_accepted$taxon_ID[match(family, resources$family_accepted$canonical_name)],
taxon_id = ifelse(taxon_rank %in% c("genus", "family"), NA, taxon_id),
scientific_name_id = ifelse(taxonomic_dataset == "APC", resources$`APC list (accepted)`$scientific_name_ID[match(scientific_name, resources$`APC list (accepted)`$scientific_name)], scientific_name_id),
scientific_name_id = ifelse(
taxonomic_dataset == "APC",
resources$`APC list (accepted)`$scientific_name_ID[match(scientific_name, resources$`APC list (accepted)`$scientific_name)],
scientific_name_id),
scientific_name_id = ifelse(taxon_rank %in% c("genus", "family"), NA, scientific_name_id)
) %>%
# The function `APCalign::update_taxonomy` includes alternative possible names as part of the taxon name.
# We want this information in a separate column.
dplyr::mutate(
taxon_name = stringr::str_split(taxon_name, "\\[alternative possible names\\:")) %>%
dplyr::mutate(taxon_name = stringr::str_split(taxon_name, "\\[alternative possible names\\:")) %>%
tidyr::unnest_wider(taxon_name, names_sep = "_") %>%
dplyr::rename(taxon_name = taxon_name_1, taxon_name_alternatives = taxon_name_2) %>%
dplyr::mutate(
taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "")
) %>%
dplyr::rename(dplyr::all_of(c("taxon_name" = "taxon_name_1", "taxon_name_alternatives" = "taxon_name_2"))) %>%
dplyr::mutate(taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "")) %>%
# Add in data for genus, binomial and trinomial, as appropriate.
dplyr::mutate(
trinomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
trinomial = ifelse(
.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
# Field binomial is filled in if taxonomic resolution is an infraspecific name or a binomial
# All taxon names that have "extra" information (beyond the actual name) have been formatted
# to have that information in square brackets '[]', so these can be used as a delimitor to
# extract the actual name
binomial = ifelse(.data$taxon_rank %in% c("species"),
stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
binomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial),
binomial = ifelse(
.data$taxon_rank %in% c("species"),
stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
binomial = ifelse(
.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial),
binomial = stringr::str_trim(.data$binomial)
) %>%
# Add in `establishment_means`, indicating if a taxon is native, naturalised or both
# This code is based on the exact syntax for taxon_distribution in APC;
# This code is based on the exact syntax for taxon_distribution in APC;
# the word `native` is used only if a taxon is both native and naturalised in a state
dplyr::mutate(
count_naturalised = stringr::str_count(.data$taxon_distribution, "naturalised"),
count_n_and_n = stringr::str_count(.data$taxon_distribution, "native and naturalised"),
count_states = stringr::str_count(.data$taxon_distribution, ",") + 1,
establishment_means = ifelse(.data$count_naturalised > 0 & .data$count_n_and_n == 0, "naturalised", NA),
establishment_means = ifelse(.data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised), "native and naturalised", .data$establishment_means),
establishment_means = ifelse(.data$count_naturalised == 0 & .data$count_n_and_n == 0, "native", .data$establishment_means),
establishment_means = ifelse(
.data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised),
"native and naturalised", .data$establishment_means),
establishment_means = ifelse(
.data$count_naturalised == 0 & .data$count_n_and_n == 0,
"native", .data$establishment_means),
establishment_means = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$establishment_means),
taxon_distribution = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$taxon_distribution)
) %>%
dplyr::select(dplyr::all_of(c("aligned_name", "taxon_name", "taxon_rank", "taxonomic_status", "taxonomic_dataset", "taxon_name_alternatives",
"genus", "family", "binomial", "trinomial", "taxon_distribution", "establishment_means", "scientific_name",
"taxon_id", "taxon_id_genus", "taxon_id_family", "scientific_name_id", "aligned_name_taxon_id", "aligned_name_taxonomic_status")))
dplyr::select(dplyr::all_of(
c("aligned_name", "taxon_name", "taxon_rank", "taxonomic_status", "taxonomic_dataset",
"taxon_name_alternatives", "genus", "family", "binomial", "trinomial", "taxon_distribution",
"establishment_means", "scientific_name", "taxon_id", "taxon_id_genus", "taxon_id_family",
"scientific_name_id", "aligned_name_taxon_id", "aligned_name_taxonomic_status")))

# New taxon list

# New taxon list

if (replace == TRUE) {
taxon_list_replace <- taxon_list_new %>%
arrange(taxon_name, aligned_name) %>%
distinct(taxon_name, aligned_name, .keep_all = TRUE)


taxon_list_replace <- taxon_list_new %>%
dplyr::arrange(taxon_name, aligned_name) %>%
dplyr::distinct(taxon_name, aligned_name, .keep_all = TRUE)

} else {

taxon_list_replace <- taxon_list %>%
# First bind rows for cleaned names not yet in AusTraits taxon_list.csv file
bind_rows(taxon_list_new %>% filter(!aligned_name %in% taxon_list$aligned_name)) %>%
dplyr::bind_rows(taxon_list_new %>% dplyr::filter(!aligned_name %in% taxon_list$aligned_name)) %>%
# Arrange by names - hopefully this will be best solution for keeping GitHub commits more transparent
arrange(taxon_name, aligned_name) %>%
distinct(taxon_name, aligned_name, .keep_all = TRUE)
dplyr::arrange(taxon_name, aligned_name) %>%
dplyr::distinct(taxon_name, aligned_name, .keep_all = TRUE)

}
taxon_list_replace %>%

taxon_list_replace %>%
readr::write_csv("config/taxon_list.csv", na = "")

}
Loading