-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Taxonomic updates & new functions for updating taxa, building taxon_l…
…ist.csv (#779) New function to rebuild the taxon_list: - uses APCalign::update_taxonomy() for the bulk of its functionality - defaults to binding new rows to bottom; but option to overwrite list to update from new NSL files New function to align taxon names and add taxonomic_updates to metadata files (build_align_taxon_names.R) Lots of taxonomic updates to bring all datasets up to date again Also reworking Nano_2011 taxon names * Nano_2011 had a mix of scientific names with and without authorship, but not always in standard syntax * there would have been 1000's of taxonomic updates added, so first manipulated the taxon names using stringr matches, such that all but ~300 ended up as exact matches to APC/APNI canonical names * merged these names into main data spreadsheet, together with alignments, reasons, etc for the remaining names * just the ~300 taxa requiring actual alignments added to metadata file Standardising which taxa are excluded from observations Checking to make sure that is a taxon is flagged as `non-native, non-naturalised` (or some other reason for exclusion) it is excluded from all datasets.
- Loading branch information
Showing
85 changed files
with
9,077 additions
and
17,754 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
build_align_taxon_names <- function(austraits, dataset) { | ||
|
||
resources <- APCalign::load_taxonomic_resources() | ||
|
||
names_to_align <- austraits$taxonomic_updates %>% | ||
dplyr::filter(stringr::str_detect(dataset_id, dataset)) %>% | ||
dplyr::filter(!aligned_name %in% resources$APC$canonical_name & !aligned_name %in% resources$APNI$canonical_name) %>% | ||
dplyr::filter(is.na(taxonomic_resolution)) %>% | ||
dplyr::distinct(original_name) | ||
|
||
names_aligned <- | ||
APCalign::align_taxa(original_name = names_to_align$original_name) %>% | ||
dplyr::select(all_of(c("original_name", "aligned_name", "aligned_reason", "taxon_rank"))) %>% | ||
dplyr::rename(all_of(c( | ||
"find" = "original_name", | ||
"replace" = "aligned_name", | ||
"reason" = "aligned_reason", | ||
"taxonomic_resolution" = "taxon_rank" | ||
))) | ||
|
||
traits.build::metadata_add_taxonomic_changes_list(dataset, names_aligned) | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) { | ||
|
||
resources <- APCalign::load_taxonomic_resources() | ||
ranks_in_database <- unique(austraits$taxa$taxon_rank) | ||
|
||
higher_ranks <- c("phylum", "class", "order", "family", "genus") | ||
highest_ranks <- c("phylum", "class", "order") | ||
higher_ranks_taxon_list <- higher_ranks[higher_ranks %in% unique(austraits$taxa$taxon_rank)] | ||
highest_rank_column <- higher_ranks_taxon_list[1] | ||
highest_ranks_taxon_list <- highest_ranks[highest_ranks %in% unique(austraits$taxa$taxon_rank)] | ||
|
||
# Create reduced APC list, just including a few columns and taxon ranks relevant to AusTraits | ||
APC_tmp <- resources$APC %>% | ||
select(canonical_name, taxon_rank) %>% | ||
mutate(taxonomic_dataset_APC = "APC") %>% | ||
filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>% | ||
rename(aligned_name = canonical_name, taxon_rank_APC = taxon_rank) | ||
|
||
# Create reduced APNI list, just including a few columns and taxon ranks relevant to AusTraits | ||
APNI_tmp <- resources$APNI %>% | ||
select(canonical_name, taxon_rank) %>% | ||
mutate(taxonomic_dataset_APNI = "APNI") %>% | ||
filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>% | ||
rename(aligned_name = canonical_name, taxon_rank_APNI = taxon_rank) | ||
|
||
# List of taxa that are explicitly excluded in metadata - don't want these in the taxon_list | ||
# These should be excluded from `taxonomic_updates` table during processing, but good to check | ||
excluded_in_metadata <- austraits$excluded_data %>% filter(error == "Observation excluded in metadata") %>% distinct(original_name) | ||
|
||
# Start with taxonomic_updates table, which is all original names, aligned names, by dataset | ||
all_taxa <- | ||
austraits$taxonomic_updates %>% | ||
dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxonomic_resolution"))) %>% | ||
# In case the same `original_name`, `aligned_name` combination occurs twice, but only once with `taxonomic_resolution` attached, arrange names, taxon_ranks | ||
dplyr::arrange(aligned_name, taxonomic_resolution) %>% | ||
# Keep unique names | ||
dplyr::distinct(original_name, aligned_name, .keep_all = TRUE) %>% | ||
# Need to merge in taxon_ranks for taxa that have not had a taxonomic alignment added in the metadata file | ||
dplyr::left_join(by = "aligned_name", | ||
austraits$taxa %>% | ||
dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>% | ||
dplyr::rename(aligned_name = taxon_name, taxon_rank_taxa = taxon_rank) %>% | ||
arrange(aligned_name, taxon_rank_taxa) %>% | ||
distinct(aligned_name, .keep_all = TRUE) | ||
) %>% | ||
# For taxa with: 1) taxon_rank = genus, family, or 2) notes in [] | ||
# Need to mutate a separate column with `name_to_match_to` for joining in other columns | ||
dplyr::mutate( | ||
name_to_match_to = aligned_name, | ||
taxonomic_resolution = ifelse(is.na(taxonomic_resolution), taxon_rank_taxa, taxonomic_resolution), | ||
name_to_match_to = | ||
ifelse(taxonomic_resolution %in% c("family", "genus"), | ||
ifelse(stringr::word(name_to_match_to, 1) == "x", stringr::word(name_to_match_to, start = 1, end = 2), stringr::word(name_to_match_to, 1)), | ||
name_to_match_to), | ||
name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"), stringr::word(name_to_match_to, start = 1, end = 2), name_to_match_to), | ||
name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"), stringr::word(name_to_match_to, start = 1, end = 3), name_to_match_to), | ||
) %>% | ||
# Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet present for most names where original_name = aligned_name | ||
dplyr::left_join(APC_tmp %>% rename(name_to_match_to = aligned_name) %>% distinct(name_to_match_to, .keep_all = TRUE)) %>% | ||
dplyr::left_join(APNI_tmp %>% rename(name_to_match_to = aligned_name) %>% distinct(name_to_match_to, .keep_all = TRUE)) %>% | ||
dplyr::mutate( | ||
taxon_rank = taxon_rank_APC, | ||
taxon_rank = ifelse(is.na(taxon_rank), taxon_rank_APNI, taxon_rank), | ||
taxon_rank = ifelse(is.na(taxon_rank), taxonomic_resolution, taxon_rank), | ||
taxonomic_dataset = taxonomic_dataset_APC, | ||
taxonomic_dataset = ifelse(is.na(taxonomic_dataset), taxonomic_dataset_APNI, taxonomic_dataset) | ||
) %>% | ||
# Remove taxa that are excluded in metadata | ||
dplyr::filter(!(original_name %in% excluded_in_metadata$original_name & is.na(taxon_rank))) | ||
|
||
# Filter out taxa that need to be run through `APCalign::align_taxa()` first | ||
taxa_for_taxon_list <- all_taxa %>% | ||
filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>% | ||
select(original_name, aligned_name, taxon_rank, taxonomic_dataset) %>% | ||
mutate(aligned_reason = NA) | ||
|
||
# Use function `APCalign::update_taxonomy` to update names and add identifier columns | ||
updated <- APCalign::update_taxonomy(taxa_for_taxon_list, resources = resources) | ||
|
||
taxon_list_new <- updated %>% | ||
# Remove columns from APCalign's output that aren't needed | ||
dplyr::select(-dplyr::any_of(c("accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned", "update_reason", "aligned_reason", "scientific_name_authorship"))) %>% | ||
# Rename columns to match AusTraits conventions | ||
dplyr::rename(dplyr::all_of(c( | ||
"taxon_name" = "suggested_name", | ||
"aligned_name" = "aligned_name", | ||
"taxonomic_dataset" = "taxonomic_dataset", | ||
"taxon_id" = "taxon_ID", | ||
"scientific_name_id" = "scientific_name_ID" | ||
))) %>% | ||
# In AusTraits we also want to document identifiers for `aligned_names`, not just for the final `taxon_name` | ||
# We do this by rejoining columns from APC, but now to the aligned_names, not the taxon_names | ||
# XX These are currently all prefixed with "cleaned" | ||
dplyr::left_join(by = c("aligned_name", "taxon_name"), | ||
resources$APC %>% | ||
dplyr::mutate( | ||
accepted_name = resources$`APC list (accepted)`$canonical_name[match(accepted_name_usage_ID, resources$`APC list (accepted)`$taxon_ID)], | ||
taxon_name = accepted_name | ||
) %>% | ||
dplyr::select(dplyr::all_of(c("scientific_name_ID", "taxonomic_status", "taxon_ID", "accepted_name", "taxon_name", "canonical_name"))) %>% | ||
dplyr::rename(dplyr::all_of(c( | ||
"aligned_name_taxonomic_status" = "taxonomic_status", | ||
"aligned_name_taxon_id" = "taxon_ID", | ||
"aligned_name" = "canonical_name", | ||
"cleaned_scientific_name_id" = "scientific_name_ID" | ||
))) %>% | ||
dplyr::distinct(aligned_name, taxon_name, .keep_all = TRUE) | ||
) %>% | ||
# For taxon names that are aligned at the genus- or family-level, we need to replace the taxon & scientific name identifiers | ||
# with those for the relevant genus or family. | ||
dplyr::mutate( | ||
taxon_rank = ifelse( | ||
taxon_rank %in% c("genus", "family"), | ||
taxon_rank, | ||
ifelse( | ||
(taxonomic_dataset == "APC" & taxonomic_status == "accepted"), | ||
resources$`APC list (accepted)`$taxon_rank[match(taxon_id, resources$`APC list (accepted)`$taxon_ID)], | ||
taxon_rank | ||
)), | ||
taxon_id_family = resources$family_accepted$taxon_ID[match(updated$family, resources$family_accepted$canonical_name)], | ||
taxon_id = ifelse(taxon_rank %in% c("genus", "family"), NA, taxon_id), | ||
scientific_name_id = ifelse(taxonomic_dataset == "APC", resources$`APC list (accepted)`$scientific_name_ID[match(scientific_name, resources$`APC list (accepted)`$scientific_name)], scientific_name_id), | ||
scientific_name_id = ifelse(taxon_rank %in% c("genus", "family"), NA, scientific_name_id) | ||
) %>% | ||
# The function `APCalign::update_taxonomy` includes alternative possible names as part of the taxon name. | ||
# We want this information in a separate column. | ||
dplyr::mutate( | ||
taxon_name = stringr::str_split(taxon_name, "\\[alternative possible names\\:")) %>% | ||
tidyr::unnest_wider(taxon_name, names_sep = "_") %>% | ||
dplyr::rename(taxon_name = taxon_name_1, taxon_name_alternatives = taxon_name_2) %>% | ||
dplyr::mutate( | ||
taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "") | ||
) %>% | ||
# Add in data for genus, binomial and trinomial, as appropriate. | ||
dplyr::mutate( | ||
trinomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"), | ||
stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA), | ||
# Field binomial is filled in if taxonomic resolution is an infraspecific name or a binomial | ||
# All taxon names that have "extra" information (beyond the actual name) have been formatted | ||
# to have that information in square brackets '[]', so these can be used as a delimitor to | ||
# extract the actual name | ||
binomial = ifelse(.data$taxon_rank %in% c("species"), | ||
stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA), | ||
binomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"), | ||
stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial), | ||
binomial = stringr::str_trim(.data$binomial), | ||
# Genus filled in for all names that have a taxonomic of genus or more detailed | ||
genus = ifelse( | ||
!.data$taxon_rank %in% c("family", "order", "class", "phylum", "kingdom"), | ||
ifelse(stringr::word(.data$taxon_name, 1) == "x", | ||
stringr::word(.data$taxon_name, start = 1, end = 2), | ||
stringr::word(.data$taxon_name, 1)), | ||
NA) | ||
) %>% | ||
# Add in `establishment_means`, indicating if a taxon is native, naturalised or both | ||
# This code is based on the exact syntax for taxon_distribution in APC; | ||
# the word `native` is used only if a taxon is both native and naturalised in a state | ||
dplyr::mutate( | ||
count_naturalised = stringr::str_count(.data$taxon_distribution, "naturalised"), | ||
count_n_and_n = stringr::str_count(.data$taxon_distribution, "native and naturalised"), | ||
count_states = stringr::str_count(.data$taxon_distribution, ",") + 1, | ||
establishment_means = ifelse(.data$count_naturalised > 0 & .data$count_n_and_n == 0, "naturalised", NA), | ||
establishment_means = ifelse(.data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised), "native and naturalised", .data$establishment_means), | ||
establishment_means = ifelse(.data$count_naturalised == 0 & .data$count_n_and_n == 0, "native", .data$establishment_means), | ||
establishment_means = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$establishment_means), | ||
taxon_distribution = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$taxon_distribution) | ||
) %>% | ||
dplyr::select(-dplyr::all_of(c("count_naturalised", "count_n_and_n", "count_states", "accepted_name"))) | ||
|
||
# New taxon list | ||
|
||
if (replace == TRUE) { | ||
taxon_list_replace <- taxon_list_new %>% | ||
arrange(taxon_name, aligned_name) %>% | ||
distinct(taxon_name, aligned_name, .keep_all = TRUE) | ||
|
||
} else { | ||
|
||
taxon_list_replace <- taxon_list %>% | ||
# First bind rows for cleaned names not yet in AusTraits taxon_list.csv file | ||
bind_rows(taxon_list_new %>% filter(!aligned_name %in% taxon_list$aligned_name)) %>% | ||
# Arrange by names - hopefully this will be best solution for keeping GitHub commits more transparent | ||
arrange(taxon_name, aligned_name) %>% | ||
distinct(taxon_name, aligned_name, .keep_all = TRUE) | ||
} | ||
|
||
taxon_list_replace %>% | ||
write_csv("config/taxon_list.csv") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.