Skip to content

Commit

Permalink
Taxonomic updates & new functions for updating taxa, building taxon_l…
Browse files Browse the repository at this point in the history
…ist.csv (#779)

New function to rebuild the taxon_list:
- uses APCalign::update_taxonomy() for the bulk of its functionality
- defaults to binding new rows to bottom; but option to overwrite list to update from new NSL files

New function to align taxon names and add taxonomic_updates to metadata files (build_align_taxon_names.R)

Lots of taxonomic updates to bring all datasets up to date again

Also reworking Nano_2011 taxon names

* Nano_2011 had a mix of scientific names with and without authorship, but not always in standard syntax
* there would have been 1000's of taxonomic updates added, so first manipulated the taxon names using stringr matches, such that all but ~300 ended up as exact matches to APC/APNI canonical names
* merged these names into main data spreadsheet, together with alignments, reasons, etc for the remaining names
* just the ~300 taxa requiring actual alignments added to metadata file

Standardising which taxa are excluded from observations

Checking to make sure that if a taxon is flagged as `non-native, non-naturalised` (or excluded for some other reason), it is excluded from all datasets.
  • Loading branch information
ehwenk authored Nov 3, 2023
1 parent c8ef093 commit 735ad35
Show file tree
Hide file tree
Showing 85 changed files with 9,077 additions and 17,754 deletions.
23 changes: 23 additions & 0 deletions R/build_align_taxon_names.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#' Align unresolved taxon names for a dataset and add them to its metadata
#'
#' Collects the original names in `austraits$taxonomic_updates` for `dataset`
#' whose `aligned_name` is not a canonical name in either APC or APNI and that
#' have no `taxonomic_resolution` recorded, runs them through
#' `APCalign::align_taxa()`, and passes the resulting find/replace table to
#' `traits.build::metadata_add_taxonomic_changes_list()`.
#'
#' @param austraits A compiled database list containing a `taxonomic_updates`
#'   table with columns `dataset_id`, `original_name`, `aligned_name` and
#'   `taxonomic_resolution`.
#' @param dataset Dataset identifier. It is matched against `dataset_id` with
#'   `stringr::str_detect()`, i.e. it is interpreted as a regular expression.
#' @return The value of `traits.build::metadata_add_taxonomic_changes_list()`,
#'   or `NULL` (invisibly) when no names need aligning.
build_align_taxon_names <- function(austraits, dataset) {

  resources <- APCalign::load_taxonomic_resources()

  names_to_align <- austraits$taxonomic_updates %>%
    # NOTE(review): this is a pattern match, so an id that is a substring of
    # another id (or of a multi-id `dataset_id` string) also matches - confirm
    # this is intended before tightening it to an exact comparison.
    dplyr::filter(stringr::str_detect(dataset_id, dataset)) %>%
    # Keep only names whose current alignment is unknown to both APC and APNI
    dplyr::filter(
      !aligned_name %in% resources$APC$canonical_name &
        !aligned_name %in% resources$APNI$canonical_name
    ) %>%
    # Names with a taxonomic resolution already have an alignment in metadata
    dplyr::filter(is.na(taxonomic_resolution)) %>%
    dplyr::distinct(original_name)

  # Nothing to align: avoid calling the aligner and touching the metadata file
  if (nrow(names_to_align) == 0) {
    message("No taxon names require alignment for dataset: ", dataset)
    return(invisible(NULL))
  }

  names_aligned <-
    APCalign::align_taxa(
      original_name = names_to_align$original_name,
      # Reuse the resources loaded above so the filter and the alignment use
      # the same snapshot and the resources are not loaded a second time
      resources = resources
    ) %>%
    dplyr::select(dplyr::all_of(
      c("original_name", "aligned_name", "aligned_reason", "taxon_rank")
    )) %>%
    # Rename to the column names used for taxonomic updates in metadata
    dplyr::rename(dplyr::all_of(c(
      "find" = "original_name",
      "replace" = "aligned_name",
      "reason" = "aligned_reason",
      "taxonomic_resolution" = "taxon_rank"
    )))

  traits.build::metadata_add_taxonomic_changes_list(dataset, names_aligned)

}
189 changes: 189 additions & 0 deletions R/build_update_taxon_list.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#' Rebuild or extend the taxon list and write it to `config/taxon_list.csv`
#'
#' Starts from `austraits$taxonomic_updates`, attaches taxon ranks and the
#' taxonomic dataset (APC or APNI) to each aligned name, runs the names through
#' `APCalign::update_taxonomy()`, and reshapes the output (identifiers for
#' aligned names, alternative names, genus/binomial/trinomial,
#' `establishment_means`). The result is written to `config/taxon_list.csv`.
#'
#' @param austraits A compiled database list containing the tables
#'   `taxonomic_updates`, `taxa` and `excluded_data`.
#' @param taxon_list The existing taxon list (data frame). Only used when
#'   `replace = FALSE`.
#' @param replace A single `TRUE`/`FALSE`. If `TRUE`, the written list consists
#'   only of the newly generated rows. If `FALSE` (default), newly generated
#'   rows whose `aligned_name` is not yet in `taxon_list` are appended to
#'   `taxon_list`.
#' @return The data frame that was written, as returned by
#'   `readr::write_csv()`.
build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {

  stopifnot(is.logical(replace), length(replace) == 1, !is.na(replace))

  resources <- APCalign::load_taxonomic_resources()
  apc_accepted <- resources$`APC list (accepted)`

  higher_ranks <- c("phylum", "class", "order", "family", "genus")
  # Taxon ranks retained from the APC/APNI lists
  ranks_to_keep <- c(
    "family", "genus", "species", "form", "subspecies", "variety", "series"
  )

  # Create reduced APC list, just including a few columns and taxon ranks relevant to AusTraits
  APC_tmp <- resources$APC %>%
    dplyr::select(canonical_name, taxon_rank) %>%
    dplyr::mutate(taxonomic_dataset_APC = "APC") %>%
    dplyr::filter(taxon_rank %in% ranks_to_keep) %>%
    dplyr::rename(name_to_match_to = canonical_name, taxon_rank_APC = taxon_rank) %>%
    dplyr::distinct(name_to_match_to, .keep_all = TRUE)

  # Create reduced APNI list, just including a few columns and taxon ranks relevant to AusTraits
  APNI_tmp <- resources$APNI %>%
    dplyr::select(canonical_name, taxon_rank) %>%
    dplyr::mutate(taxonomic_dataset_APNI = "APNI") %>%
    dplyr::filter(taxon_rank %in% ranks_to_keep) %>%
    dplyr::rename(name_to_match_to = canonical_name, taxon_rank_APNI = taxon_rank) %>%
    dplyr::distinct(name_to_match_to, .keep_all = TRUE)

  # List of taxa that are explicitly excluded in metadata - don't want these in the taxon_list
  # These should be excluded from `taxonomic_updates` table during processing, but good to check
  excluded_in_metadata <- austraits$excluded_data %>%
    dplyr::filter(error == "Observation excluded in metadata") %>%
    dplyr::distinct(original_name)

  # Taxon ranks already recorded in the taxa table, one row per name
  ranks_from_taxa <- austraits$taxa %>%
    dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>%
    dplyr::rename(aligned_name = taxon_name, taxon_rank_taxa = taxon_rank) %>%
    dplyr::arrange(aligned_name, taxon_rank_taxa) %>%
    dplyr::distinct(aligned_name, .keep_all = TRUE)

  # Start with taxonomic_updates table, which is all original names, aligned names, by dataset
  all_taxa <-
    austraits$taxonomic_updates %>%
    dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxonomic_resolution"))) %>%
    # In case the same `original_name`, `aligned_name` combination occurs twice, but only once
    # with `taxonomic_resolution` attached, arrange names, taxon_ranks
    dplyr::arrange(aligned_name, taxonomic_resolution) %>%
    # Keep unique names
    dplyr::distinct(original_name, aligned_name, .keep_all = TRUE) %>%
    # Need to merge in taxon_ranks for taxa that have not had a taxonomic alignment added in the metadata file
    dplyr::left_join(ranks_from_taxa, by = "aligned_name") %>%
    # For taxa with: 1) taxon_rank = genus, family, or 2) notes in []
    # Need to mutate a separate column with `name_to_match_to` for joining in other columns
    dplyr::mutate(
      name_to_match_to = aligned_name,
      taxonomic_resolution = ifelse(is.na(taxonomic_resolution), taxon_rank_taxa, taxonomic_resolution),
      # Genus/family-level names: keep the first word, or the first two words when the name starts with "x"
      name_to_match_to =
        ifelse(taxonomic_resolution %in% c("family", "genus"),
          ifelse(
            stringr::word(name_to_match_to, 1) == "x",
            stringr::word(name_to_match_to, start = 1, end = 2),
            stringr::word(name_to_match_to, 1)
          ),
          name_to_match_to),
      # Names with notes in []: keep the first two (species) or three (infraspecific) words
      name_to_match_to = ifelse(
        stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"),
        stringr::word(name_to_match_to, start = 1, end = 2),
        name_to_match_to
      ),
      name_to_match_to = ifelse(
        stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"),
        stringr::word(name_to_match_to, start = 1, end = 3),
        name_to_match_to
      )
    ) %>%
    # Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet
    # present for most names where original_name = aligned_name
    dplyr::left_join(APC_tmp, by = "name_to_match_to") %>%
    dplyr::left_join(APNI_tmp, by = "name_to_match_to") %>%
    # Rank preference: APC, then APNI, then the resolution recorded in AusTraits
    dplyr::mutate(
      taxon_rank = taxon_rank_APC,
      taxon_rank = ifelse(is.na(taxon_rank), taxon_rank_APNI, taxon_rank),
      taxon_rank = ifelse(is.na(taxon_rank), taxonomic_resolution, taxon_rank),
      taxonomic_dataset = taxonomic_dataset_APC,
      taxonomic_dataset = ifelse(is.na(taxonomic_dataset), taxonomic_dataset_APNI, taxonomic_dataset)
    ) %>%
    # Remove taxa that are excluded in metadata (only those for which no taxon_rank was found)
    dplyr::filter(!(original_name %in% excluded_in_metadata$original_name & is.na(taxon_rank)))

  # Filter out taxa that need to be run through `APCalign::align_taxa()` first
  taxa_for_taxon_list <- all_taxa %>%
    dplyr::filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>%
    dplyr::select(original_name, aligned_name, taxon_rank, taxonomic_dataset) %>%
    dplyr::mutate(aligned_reason = NA)

  # Use function `APCalign::update_taxonomy` to update names and add identifier columns
  updated <- APCalign::update_taxonomy(taxa_for_taxon_list, resources = resources)

  # Identifiers and status for `aligned_name`, keyed by aligned name and accepted (taxon) name
  aligned_name_identifiers <- resources$APC %>%
    dplyr::mutate(
      accepted_name = apc_accepted$canonical_name[match(accepted_name_usage_ID, apc_accepted$taxon_ID)],
      taxon_name = accepted_name
    ) %>%
    dplyr::select(dplyr::all_of(c(
      "scientific_name_ID", "taxonomic_status", "taxon_ID",
      "accepted_name", "taxon_name", "canonical_name"
    ))) %>%
    dplyr::rename(dplyr::all_of(c(
      "aligned_name_taxonomic_status" = "taxonomic_status",
      "aligned_name_taxon_id" = "taxon_ID",
      "aligned_name" = "canonical_name",
      "cleaned_scientific_name_id" = "scientific_name_ID"
    ))) %>%
    dplyr::distinct(aligned_name, taxon_name, .keep_all = TRUE)

  taxon_list_new <- updated %>%
    # Remove columns from APCalign's output that aren't needed
    dplyr::select(-dplyr::any_of(c(
      "accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned",
      "update_reason", "aligned_reason", "scientific_name_authorship"
    ))) %>%
    # Rename columns to match AusTraits conventions
    dplyr::rename(dplyr::all_of(c(
      "taxon_name" = "suggested_name",
      "aligned_name" = "aligned_name",
      "taxonomic_dataset" = "taxonomic_dataset",
      "taxon_id" = "taxon_ID",
      "scientific_name_id" = "scientific_name_ID"
    ))) %>%
    # In AusTraits we also want to document identifiers for `aligned_names`, not just for the final `taxon_name`
    # We do this by rejoining columns from APC, but now to the aligned_names, not the taxon_names
    # XX These are currently all prefixed with "cleaned"
    dplyr::left_join(aligned_name_identifiers, by = c("aligned_name", "taxon_name")) %>%
    # For taxon names that are aligned at the genus- or family-level, the species-level taxon &
    # scientific name identifiers do not apply: they are set to NA here, and the identifier of
    # the accepted family is stored in `taxon_id_family`.
    dplyr::mutate(
      taxon_rank = ifelse(
        taxon_rank %in% c("genus", "family"),
        taxon_rank,
        ifelse(
          (taxonomic_dataset == "APC" & taxonomic_status == "accepted"),
          apc_accepted$taxon_rank[match(taxon_id, apc_accepted$taxon_ID)],
          taxon_rank
        )),
      # Use the `family` column of the current rows rather than reaching back to `updated`,
      # so the lookup stays row-aligned whatever happened earlier in the pipeline
      taxon_id_family = resources$family_accepted$taxon_ID[match(.data$family, resources$family_accepted$canonical_name)],
      taxon_id = ifelse(taxon_rank %in% c("genus", "family"), NA, taxon_id),
      scientific_name_id = ifelse(
        taxonomic_dataset == "APC",
        apc_accepted$scientific_name_ID[match(scientific_name, apc_accepted$scientific_name)],
        scientific_name_id
      ),
      scientific_name_id = ifelse(taxon_rank %in% c("genus", "family"), NA, scientific_name_id)
    ) %>%
    # The function `APCalign::update_taxonomy` includes alternative possible names as part of the taxon name.
    # We want this information in a separate column.
    # `fill = "right"` creates `taxon_name_alternatives` (as NA) even when no name in the table
    # carries alternatives; `extra = "merge"` guarantees exactly two output columns.
    tidyr::separate(
      taxon_name,
      into = c("taxon_name", "taxon_name_alternatives"),
      sep = "\\[alternative possible names\\:",
      extra = "merge",
      fill = "right"
    ) %>%
    dplyr::mutate(
      taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "")
    ) %>%
    # Add in data for genus, binomial and trinomial, as appropriate.
    dplyr::mutate(
      trinomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
        stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
      # Field binomial is filled in if taxonomic resolution is an infraspecific name or a binomial
      # All taxon names that have "extra" information (beyond the actual name) have been formatted
      # to have that information in square brackets '[]', so these can be used as a delimiter to
      # extract the actual name
      binomial = ifelse(.data$taxon_rank %in% c("species"),
        stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
      binomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
        stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial),
      binomial = stringr::str_trim(.data$binomial),
      # Genus filled in for all names that have a taxonomic rank of genus or more detailed
      genus = ifelse(
        !.data$taxon_rank %in% c("family", "order", "class", "phylum", "kingdom"),
        ifelse(stringr::word(.data$taxon_name, 1) == "x",
          stringr::word(.data$taxon_name, start = 1, end = 2),
          stringr::word(.data$taxon_name, 1)),
        NA)
    ) %>%
    # Add in `establishment_means`, indicating if a taxon is native, naturalised or both
    # This code is based on the exact syntax for taxon_distribution in APC;
    # the word `native` is used only if a taxon is both native and naturalised in a state
    dplyr::mutate(
      count_naturalised = stringr::str_count(.data$taxon_distribution, "naturalised"),
      count_n_and_n = stringr::str_count(.data$taxon_distribution, "native and naturalised"),
      count_states = stringr::str_count(.data$taxon_distribution, ",") + 1,
      establishment_means = ifelse(.data$count_naturalised > 0 & .data$count_n_and_n == 0, "naturalised", NA),
      establishment_means = ifelse(
        .data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised),
        "native and naturalised",
        .data$establishment_means
      ),
      establishment_means = ifelse(.data$count_naturalised == 0 & .data$count_n_and_n == 0, "native", .data$establishment_means),
      # Distribution information is not kept for names at genus rank or higher
      establishment_means = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$establishment_means),
      taxon_distribution = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$taxon_distribution)
    ) %>%
    dplyr::select(-dplyr::all_of(c("count_naturalised", "count_n_and_n", "count_states", "accepted_name")))

  # New taxon list
  if (replace) {
    taxon_list_replace <- taxon_list_new
  } else {
    taxon_list_replace <- taxon_list %>%
      # Bind rows for aligned names not yet in AusTraits taxon_list.csv file
      dplyr::bind_rows(
        taxon_list_new %>% dplyr::filter(!aligned_name %in% taxon_list$aligned_name)
      )
  }

  taxon_list_replace <- taxon_list_replace %>%
    # Arrange by names - hopefully this will be best solution for keeping GitHub commits more transparent
    dplyr::arrange(taxon_name, aligned_name) %>%
    dplyr::distinct(taxon_name, aligned_name, .keep_all = TRUE)

  readr::write_csv(taxon_list_replace, "config/taxon_list.csv")
}
53 changes: 23 additions & 30 deletions data/ABRS_2022/metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ contributors:
dataset_curators: Elizabeth Wenk
dataset:
data_is_long_format: no
custom_R_code: '
custom_R_code: '
data %>%
filter(str_detect(taxon_name, " ")) %>%
mutate(
Expand Down Expand Up @@ -242,11 +242,6 @@ taxonomic_updates:
replace: Eucalyptus carnei
reason: match_07_fuzzy. Fuzzy alignment with accepted canonical name in APC (2022-11-22)
taxonomic_resolution: species
- find: Eucalyptus communalis
replace: Eucalyptus communalis
reason: match_06. Automatic alignment with synonymous term among known canonical
names APC (2022-11-22)
taxonomic_resolution: species
- find: Eucalyptus erythrandra
replace: Eucalyptus erythronema
reason: match_07_fuzzy. Fuzzy alignment with accepted canonical name in APC (2022-11-22)
Expand Down Expand Up @@ -312,6 +307,11 @@ taxonomic_updates:
reason: match_14. Automatic alignment with species-level canonical name in APC accepted
when notes are ignored (2022-11-22)
taxonomic_resolution: species
- find: Eucalyptus yumbarrana subsp. striata
replace: Eucalyptus yumbarrana subsp. x striata
reason: Exact match of the first three words of the taxon name to an APC-known canonical
name (2023-11-02)
taxonomic_resolution: subspecies
- find: Goodenia vilmorinae
replace: Goodenia vilmoriniae
reason: match_07_fuzzy. Fuzzy alignment with accepted canonical name in APC (2022-11-22)
Expand Down Expand Up @@ -375,6 +375,11 @@ taxonomic_updates:
reason: match_14. Automatic alignment with species-level canonical name in APC accepted
when notes are ignored (2022-11-22)
taxonomic_resolution: species
- find: Opuntia polyacantha var. erinacea (Engelm. & J.M.Bigelow) B.D.Parfitt
replace: Opuntia polyacantha
reason: Exact match of the first two words of the taxon name to an APC-accepted
canonical name (2023-11-02)
taxonomic_resolution: species
- find: Osteocarpum scleropterum
replace: Osteocarpum x scleropterum
reason: match_14. Automatic alignment with species-level canonical name in APC accepted
Expand Down Expand Up @@ -453,11 +458,6 @@ taxonomic_updates:
reason: match_20. Rewording name to be recognised as genus rank, with genus accepted
by APC (2022-11-22)
taxonomic_resolution: genus
- find: Solanum viride
replace: Solanum viride
reason: match_06. Automatic alignment with synonymous term among known canonical
names APC (2022-11-22)
taxonomic_resolution: species
- find: Sphaerocionium
replace: Sphaerocionium sp. [Sphaerocionium; FoA_2022]
reason: match_20. Rewording name to be recognised as genus rank, with genus in APNI
Expand Down Expand Up @@ -520,41 +520,34 @@ taxonomic_updates:
reason: match_14. Automatic alignment with species-level canonical name in APC accepted
when notes are ignored (2022-11-22)
taxonomic_resolution: species
- find: X Cynochloris
replace: x Cynochloris
reason: Manual matched to genus for taxon that can't be matched to species (Elizabeth
Wenk, 2022-11-22)
taxonomic_resolution: genus
- find: X Cynochloris
- find: x Cynochloris
replace: x Cynochloris
reason: match_14. Automatic alignment with species-level canonical name in APC accepted
when notes are ignored (2022-11-23)
taxonomic_resolution: genus
- find: X Cynochloris macivorii
- find: x Cynochloris macivorii
replace: x Cynochloris macivorii
reason: match_06. Automatic alignment with synonymous term among accepted canonical
names in APC (2022-11-22)
taxonomic_resolution: species
- find: X Cynochloris macivorii
replace: x Cynochloris macivorii
reason: match_06. Automatic alignment with synonymous term among accepted canonical
names in APC (2022-11-23)
taxonomic_resolution: species
- find: X Cynochloris reynoldensis
- find: x Cynochloris reynoldensis
replace: x Cynochloris reynoldensis
reason: match_06. Automatic alignment with synonymous term among accepted canonical
names in APC (2022-11-22)
taxonomic_resolution: species
- find: X Cynochloris reynoldensis
replace: x Cynochloris reynoldensis
reason: match_06. Automatic alignment with synonymous term among accepted canonical
names in APC (2022-11-23)
taxonomic_resolution: species
- find: x Glossadenia x tutelata
replace: x Glossadenia tutelata
replace: Glossodia x tutelata
reason: match_06. Automatic alignment with synonymous term among known canonical
names APC (2022-11-21)
taxonomic_resolution: species
- find: xCyanthera glossodioides
replace: x Cyanthera glossodioides
reason: Manual alignment with canonical species name in APC (E. Wenk, 2023-11-02)
taxonomic_resolution: species
- find: xGlossadenia tutelata
replace: Glossodia x tutelata
reason: Manual alignment with canonical species name in APC (E. Wenk, 2023-11-02)
taxonomic_resolution: species
exclude_observations:
- variable: taxon_name
find: Acacioides Group, Adenotricha Group, Aethiopicum Group, Agrifolia Group, Aspera
Expand Down
Loading

0 comments on commit 735ad35

Please sign in to comment.