Taxonomic updates & new functions for updating taxa, building taxon_l…

…ist.csv (#779) New function to rebuild the taxon_list: - uses APCalign::update_taxonomy() for the bulk of its functionality - defaults to binding new rows to bottom; but option to overwrite list to update from new NSL files New function to align taxon names and add taxonomic_updates to metadata files (build_align_taxon_names.R) Lots of taxonomic updates to bring all datasets up to date again Also reworking Nano_2011 taxon names * Nano_2011 had a mix of scientific names with and without authorship, but not always in standard syntax * there would have been 1000's of taxonomic updates added, so first manipulated the taxon names using stringr matches, such that all but ~300 ended up as exact matches to APC/APNI canonical names * merged these names into main data spreadsheet, together with alignments, reasons, etc for the remaining names * just the ~300 taxa requiring actual alignments added to metadata file Standardising which taxa are excluded from observations Checking to make sure that if a taxon is flagged as `non-native, non-naturalised` (or some other reason for exclusion) it is excluded from all datasets.
traitecoevo · Nov 3, 2023 · 735ad35 · 735ad35
1 parent c8ef093
commit 735ad35
Show file tree

Hide file tree

Showing 85 changed files with 9,077 additions and 17,754 deletions.
diff --git a/R/build_align_taxon_names.R b/R/build_align_taxon_names.R
@@ -0,0 +1,23 @@
+#' Align not-yet-matched taxon names for one dataset and add them to its metadata
+#'
+#' Collects the distinct `original_name`s in `austraits$taxonomic_updates` that
+#' belong to `dataset`, whose `aligned_name` is in neither the APC nor the APNI
+#' canonical-name list and which have no `taxonomic_resolution`. These names are
+#' run through `APCalign::align_taxa()` and the resulting find/replace/reason/
+#' taxonomic_resolution table is handed to
+#' `traits.build::metadata_add_taxonomic_changes_list()`.
+#'
+#' @param austraits Compiled database; its `taxonomic_updates` table is read
+#'   (columns `dataset_id`, `original_name`, `aligned_name`,
+#'   `taxonomic_resolution`).
+#' @param dataset Identifier of a single dataset. It is compared for exact
+#'   equality with `dataset_id`.
+#' @return The value of `traits.build::metadata_add_taxonomic_changes_list()`,
+#'   or `NULL` (invisibly) when there are no names left to align.
+build_align_taxon_names <- function(austraits, dataset) {
+
+  resources <- APCalign::load_taxonomic_resources()
+
+  names_to_align <- austraits$taxonomic_updates %>%
+    # Exact comparison: a pattern match would also pick up other datasets
+    # whose id merely contains `dataset`
+    dplyr::filter(dataset_id == dataset) %>%
+    # Keep only names that are not already a canonical name in APC or APNI
+    dplyr::filter(
+      !aligned_name %in% resources$APC$canonical_name,
+      !aligned_name %in% resources$APNI$canonical_name
+    ) %>%
+    # Names with a taxonomic resolution already have an alignment recorded
+    dplyr::filter(is.na(taxonomic_resolution)) %>%
+    dplyr::distinct(original_name)
+
+  # Nothing to do: avoid calling the aligner and touching the metadata file
+  if (nrow(names_to_align) == 0) {
+    message("No taxon names requiring alignment for dataset ", dataset)
+    return(invisible(NULL))
+  }
+
+  names_aligned <-
+    APCalign::align_taxa(original_name = names_to_align$original_name) %>%
+    dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "aligned_reason", "taxon_rank"))) %>%
+    # Column names expected for the `taxonomic_updates` section of the metadata
+    dplyr::rename(dplyr::all_of(c(
+      "find" = "original_name",
+      "replace" = "aligned_name",
+      "reason" = "aligned_reason",
+      "taxonomic_resolution" = "taxon_rank"
+    )))
+
+  traits.build::metadata_add_taxonomic_changes_list(dataset, names_aligned)
+
+}
diff --git a/R/build_update_taxon_list.R b/R/build_update_taxon_list.R
@@ -0,0 +1,189 @@
+#' Rebuild the taxon list from the taxonomic updates of a compiled database
+#'
+#' Starting from `austraits$taxonomic_updates`, determines a taxon rank and a
+#' taxonomic dataset (APC or APNI) for every aligned name, passes the names
+#' through `APCalign::update_taxonomy()`, and then adds identifiers for the
+#' aligned names, a separate column of alternative names, the `trinomial`,
+#' `binomial` and `genus` columns and `establishment_means`. The result is
+#' written to `output_path`.
+#'
+#' @param austraits Compiled database; the tables `taxonomic_updates`, `taxa`
+#'   and `excluded_data` are read.
+#' @param taxon_list Existing taxon list (needs at least the columns
+#'   `taxon_name` and `aligned_name`). Only used when `replace` is not `TRUE`.
+#' @param replace If `TRUE`, the output contains only the newly built rows.
+#'   Otherwise (default) newly built rows whose `aligned_name` is not yet in
+#'   `taxon_list` are appended to `taxon_list`.
+#' @param output_path File the taxon list is written to.
+#' @return The value of `readr::write_csv()`; the function is called for its
+#'   side effect of writing `output_path`.
+build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE,
+                                    output_path = "config/taxon_list.csv") {
+
+  resources <- APCalign::load_taxonomic_resources()
+
+  # Ranks for which establishment means and distribution are blanked
+  higher_ranks <- c("phylum", "class", "order", "family", "genus")
+  # Taxon ranks retained from APC and APNI
+  ranks_to_keep <- c("family", "genus", "species", "form", "subspecies", "variety", "series")
+
+  # Create reduced APC list, just including a few columns and taxon ranks relevant to AusTraits
+  APC_tmp <- resources$APC %>%
+    dplyr::select(dplyr::all_of(c("canonical_name", "taxon_rank"))) %>%
+    dplyr::mutate(taxonomic_dataset_APC = "APC") %>%
+    dplyr::filter(taxon_rank %in% ranks_to_keep) %>%
+    dplyr::rename(name_to_match_to = canonical_name, taxon_rank_APC = taxon_rank) %>%
+    # One row per name, so the join below cannot duplicate rows
+    dplyr::distinct(name_to_match_to, .keep_all = TRUE)
+
+  # Create reduced APNI list, just including a few columns and taxon ranks relevant to AusTraits
+  APNI_tmp <- resources$APNI %>%
+    dplyr::select(dplyr::all_of(c("canonical_name", "taxon_rank"))) %>%
+    dplyr::mutate(taxonomic_dataset_APNI = "APNI") %>%
+    dplyr::filter(taxon_rank %in% ranks_to_keep) %>%
+    dplyr::rename(name_to_match_to = canonical_name, taxon_rank_APNI = taxon_rank) %>%
+    dplyr::distinct(name_to_match_to, .keep_all = TRUE)
+
+  # List of taxa that are explicitly excluded in metadata - don't want these in the taxon_list
+  # These should be excluded from `taxonomic_updates` table during processing, but good to check
+  excluded_in_metadata <- austraits$excluded_data %>%
+    dplyr::filter(error == "Observation excluded in metadata") %>%
+    dplyr::distinct(original_name)
+
+  # Taxon ranks already recorded in the taxa table, one row per name
+  taxa_ranks <- austraits$taxa %>%
+    dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>%
+    dplyr::rename(aligned_name = taxon_name, taxon_rank_taxa = taxon_rank) %>%
+    dplyr::arrange(aligned_name, taxon_rank_taxa) %>%
+    dplyr::distinct(aligned_name, .keep_all = TRUE)
+
+  # Start with taxonomic_updates table, which is all original names, aligned names, by dataset
+  all_taxa <-
+    austraits$taxonomic_updates %>%
+    dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxonomic_resolution"))) %>%
+    # In case the same `original_name`, `aligned_name` combination occurs twice, but only once
+    # with `taxonomic_resolution` attached, arrange names, taxon_ranks (NA sorts last)
+    dplyr::arrange(aligned_name, taxonomic_resolution) %>%
+    # Keep unique names
+    dplyr::distinct(original_name, aligned_name, .keep_all = TRUE) %>%
+    # Need to merge in taxon_ranks for taxa that have not had a taxonomic alignment added in the metadata file
+    dplyr::left_join(taxa_ranks, by = "aligned_name") %>%
+    # For taxa with: 1) taxon_rank = genus, family, or 2) notes in []
+    # Need to mutate a separate column with `name_to_match_to` for joining in other columns
+    dplyr::mutate(
+      name_to_match_to = aligned_name,
+      taxonomic_resolution = ifelse(is.na(taxonomic_resolution), taxon_rank_taxa, taxonomic_resolution),
+      # Genus/family level: match on the first word (first two words for names starting with "x")
+      name_to_match_to =
+        ifelse(taxonomic_resolution %in% c("family", "genus"),
+               ifelse(stringr::word(name_to_match_to, 1) == "x",
+                      stringr::word(name_to_match_to, start = 1, end = 2),
+                      stringr::word(name_to_match_to, 1)),
+               name_to_match_to),
+      # Names with notes in []: match on the first two (species) or three (infraspecific) words
+      name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"), stringr::word(name_to_match_to, start = 1, end = 2), name_to_match_to),
+      name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"), stringr::word(name_to_match_to, start = 1, end = 3), name_to_match_to)
+    ) %>%
+    # Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet present for most names where original_name = aligned_name
+    dplyr::left_join(APC_tmp, by = "name_to_match_to") %>%
+    dplyr::left_join(APNI_tmp, by = "name_to_match_to") %>%
+    dplyr::mutate(
+      # Precedence for the rank: APC, then APNI, then the resolution recorded in AusTraits
+      taxon_rank = taxon_rank_APC,
+      taxon_rank = ifelse(is.na(taxon_rank), taxon_rank_APNI, taxon_rank),
+      taxon_rank = ifelse(is.na(taxon_rank), taxonomic_resolution, taxon_rank),
+      taxonomic_dataset = taxonomic_dataset_APC,
+      taxonomic_dataset = ifelse(is.na(taxonomic_dataset), taxonomic_dataset_APNI, taxonomic_dataset)
+    ) %>%
+    # Remove taxa that are excluded in metadata
+    dplyr::filter(!(original_name %in% excluded_in_metadata$original_name & is.na(taxon_rank)))
+
+  # Filter out taxa that need to be run through `APCalign::align_taxa()` first
+  taxa_for_taxon_list <- all_taxa %>%
+    dplyr::filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>%
+    dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxon_rank", "taxonomic_dataset"))) %>%
+    dplyr::mutate(aligned_reason = NA)
+
+  # Use function `APCalign::update_taxonomy` to update names and add identifier columns
+  updated <- APCalign::update_taxonomy(taxa_for_taxon_list, resources = resources)
+
+  # Identifiers for the aligned names, keyed by (aligned_name, taxon_name)
+  aligned_name_identifiers <- resources$APC %>%
+    dplyr::mutate(
+      accepted_name = resources$`APC list (accepted)`$canonical_name[match(accepted_name_usage_ID, resources$`APC list (accepted)`$taxon_ID)],
+      taxon_name = accepted_name
+    ) %>%
+    dplyr::select(dplyr::all_of(c("scientific_name_ID", "taxonomic_status", "taxon_ID", "accepted_name", "taxon_name", "canonical_name"))) %>%
+    dplyr::rename(dplyr::all_of(c(
+      "aligned_name_taxonomic_status" = "taxonomic_status",
+      "aligned_name_taxon_id" = "taxon_ID",
+      "aligned_name" = "canonical_name",
+      "cleaned_scientific_name_id" = "scientific_name_ID"
+    ))) %>%
+    dplyr::distinct(aligned_name, taxon_name, .keep_all = TRUE)
+
+  taxon_list_new <- updated %>%
+    # Remove columns from APCalign's output that aren't needed
+    dplyr::select(-dplyr::any_of(c("accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned", "update_reason", "aligned_reason", "scientific_name_authorship"))) %>%
+    # Rename columns to match AusTraits conventions
+    # (the identity entries make `all_of()` fail early if those columns are missing)
+    dplyr::rename(dplyr::all_of(c(
+      "taxon_name" = "suggested_name",
+      "aligned_name" = "aligned_name",
+      "taxonomic_dataset" = "taxonomic_dataset",
+      "taxon_id" = "taxon_ID",
+      "scientific_name_id" = "scientific_name_ID"
+    ))) %>%
+    # In AusTraits we also want to document identifiers for `aligned_names`, not just for the final `taxon_name`
+    # We do this by rejoining columns from APC, but now to the aligned_names, not the taxon_names
+    # XX These are currently all prefixed with "cleaned"
+    dplyr::left_join(aligned_name_identifiers, by = c("aligned_name", "taxon_name")) %>%
+    # For taxon names that are aligned at the genus- or family-level, we need to replace the taxon & scientific name identifiers
+    # with those for the relevant genus or family.
+    dplyr::mutate(
+      taxon_rank = ifelse(
+        taxon_rank %in% c("genus", "family"),
+        taxon_rank,
+        ifelse(
+          # `%in%` rather than `==`: a missing dataset or status must fall through
+          # to the existing rank instead of turning the rank into NA
+          taxonomic_dataset %in% "APC" & taxonomic_status %in% "accepted",
+          resources$`APC list (accepted)`$taxon_rank[match(taxon_id, resources$`APC list (accepted)`$taxon_ID)],
+          taxon_rank
+        )),
+      # Use the `family` column of the current rows rather than `updated$family`,
+      # so the lookup does not rely on row order being untouched by earlier steps
+      taxon_id_family = resources$family_accepted$taxon_ID[match(.data$family, resources$family_accepted$canonical_name)],
+      taxon_id = ifelse(taxon_rank %in% c("genus", "family"), NA, taxon_id),
+      scientific_name_id = ifelse(taxonomic_dataset %in% "APC", resources$`APC list (accepted)`$scientific_name_ID[match(scientific_name, resources$`APC list (accepted)`$scientific_name)], scientific_name_id),
+      scientific_name_id = ifelse(taxon_rank %in% c("genus", "family"), NA, scientific_name_id)
+    ) %>%
+    # The function `APCalign::update_taxonomy` includes alternative possible names as part of the taxon name.
+    # We want this information in a separate column.
+    # `fill = "right"` gives NA alternatives for names without the suffix, and the
+    # `taxon_name_alternatives` column exists even when no name has alternatives.
+    tidyr::separate(
+      taxon_name,
+      into = c("taxon_name", "taxon_name_alternatives"),
+      sep = "\\[alternative possible names\\:",
+      extra = "merge",
+      fill = "right"
+    ) %>%
+    dplyr::mutate(
+      taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "")
+    ) %>%
+    # Add in data for genus, binomial and trinomial, as appropriate.
+    dplyr::mutate(
+      trinomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
+                         stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
+      # Field binomial is filled in if taxonomic resolution is an infraspecific name or a binomial
+      # All taxon names that have "extra" information (beyond the actual name) have been formatted
+      # to have that information in square brackets '[]', so these can be used as a delimiter to
+      # extract the actual name
+      binomial = ifelse(.data$taxon_rank %in% c("species"),
+                        stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
+      binomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
+                        stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial),
+      binomial = stringr::str_trim(.data$binomial),
+      # Genus filled in for all names that have a taxonomic rank of genus or more detailed
+      genus = ifelse(
+        !.data$taxon_rank %in% c("family", "order", "class", "phylum", "kingdom"),
+        ifelse(stringr::word(.data$taxon_name, 1) == "x",
+               stringr::word(.data$taxon_name, start = 1, end = 2),
+               stringr::word(.data$taxon_name, 1)),
+        NA)
+    ) %>%
+    # Add in `establishment_means`, indicating if a taxon is native, naturalised or both
+    # This code is based on the exact syntax for taxon_distribution in APC;
+    # the word `native` is used only if a taxon is both native and naturalised in a state
+    dplyr::mutate(
+      count_naturalised = stringr::str_count(.data$taxon_distribution, "naturalised"),
+      count_n_and_n = stringr::str_count(.data$taxon_distribution, "native and naturalised"),
+      count_states = stringr::str_count(.data$taxon_distribution, ",") + 1,
+      establishment_means = ifelse(.data$count_naturalised > 0 & .data$count_n_and_n == 0, "naturalised", NA),
+      establishment_means = ifelse(.data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised), "native and naturalised", .data$establishment_means),
+      establishment_means = ifelse(.data$count_naturalised == 0 & .data$count_n_and_n == 0, "native", .data$establishment_means),
+      establishment_means = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$establishment_means),
+      taxon_distribution = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$taxon_distribution)
+    ) %>%
+    dplyr::select(-dplyr::all_of(c("count_naturalised", "count_n_and_n", "count_states", "accepted_name")))
+
+  # New taxon list
+  if (isTRUE(replace)) {
+    taxon_list_replace <- taxon_list_new
+  } else {
+    # Bind rows for cleaned names not yet in AusTraits taxon_list.csv file
+    taxon_list_replace <- taxon_list %>%
+      dplyr::bind_rows(
+        taxon_list_new %>% dplyr::filter(!aligned_name %in% taxon_list$aligned_name)
+      )
+  }
+
+  taxon_list_replace %>%
+    # Arrange by names - hopefully this will be best solution for keeping GitHub commits more transparent
+    dplyr::arrange(taxon_name, aligned_name) %>%
+    dplyr::distinct(taxon_name, aligned_name, .keep_all = TRUE) %>%
+    readr::write_csv(output_path)
+}
diff --git a/data/ABRS_2022/metadata.yml b/data/ABRS_2022/metadata.yml
@@ -20,7 +20,7 @@ contributors:
   dataset_curators: Elizabeth Wenk
 dataset:
   data_is_long_format: no
-  custom_R_code:          '
+  custom_R_code:            '
     data %>%
       filter(str_detect(taxon_name, " ")) %>%
       mutate(
@@ -242,11 +242,6 @@ taxonomic_updates:
   replace: Eucalyptus carnei
   reason: match_07_fuzzy. Fuzzy alignment with accepted canonical name in APC (2022-11-22)
   taxonomic_resolution: species
-- find: Eucalyptus communalis
-  replace: Eucalyptus communalis
-  reason: match_06. Automatic alignment with synonymous term among known canonical
-    names APC (2022-11-22)
-  taxonomic_resolution: species
 - find: Eucalyptus erythrandra
   replace: Eucalyptus erythronema
   reason: match_07_fuzzy. Fuzzy alignment with accepted canonical name in APC (2022-11-22)
@@ -312,6 +307,11 @@ taxonomic_updates:
   reason: match_14. Automatic alignment with species-level canonical name in APC accepted
     when notes are ignored (2022-11-22)
   taxonomic_resolution: species
+- find: Eucalyptus yumbarrana subsp. striata
+  replace: Eucalyptus yumbarrana subsp. x striata
+  reason: Exact match of the first three words of the taxon name to an APC-known canonical
+    name (2023-11-02)
+  taxonomic_resolution: subspecies
 - find: Goodenia vilmorinae
   replace: Goodenia vilmoriniae
   reason: match_07_fuzzy. Fuzzy alignment with accepted canonical name in APC (2022-11-22)
@@ -375,6 +375,11 @@ taxonomic_updates:
   reason: match_14. Automatic alignment with species-level canonical name in APC accepted
     when notes are ignored (2022-11-22)
   taxonomic_resolution: species
+- find: Opuntia polyacantha var. erinacea (Engelm. & J.M.Bigelow) B.D.Parfitt
+  replace: Opuntia polyacantha
+  reason: Exact match of the first two words of the taxon name to an APC-accepted
+    canonical name (2023-11-02)
+  taxonomic_resolution: species
 - find: Osteocarpum scleropterum
   replace: Osteocarpum x scleropterum
   reason: match_14. Automatic alignment with species-level canonical name in APC accepted
@@ -453,11 +458,6 @@ taxonomic_updates:
   reason: match_20. Rewording name to be recognised as genus rank, with genus accepted
     by APC (2022-11-22)
   taxonomic_resolution: genus
-- find: Solanum viride
-  replace: Solanum viride
-  reason: match_06. Automatic alignment with synonymous term among known canonical
-    names APC (2022-11-22)
-  taxonomic_resolution: species
 - find: Sphaerocionium
   replace: Sphaerocionium sp. [Sphaerocionium; FoA_2022]
   reason: match_20. Rewording name to be recognised as genus rank, with genus in APNI
@@ -520,41 +520,34 @@ taxonomic_updates:
   reason: match_14. Automatic alignment with species-level canonical name in APC accepted
     when notes are ignored (2022-11-22)
   taxonomic_resolution: species
-- find: X Cynochloris
-  replace: x Cynochloris
-  reason: Manual matched to genus for taxon that can't be matched to species (Elizabeth
-    Wenk, 2022-11-22)
-  taxonomic_resolution: genus
-- find: X Cynochloris
+- find: x Cynochloris
   replace: x Cynochloris
   reason: match_14. Automatic alignment with species-level canonical name in APC accepted
     when notes are ignored (2022-11-23)
   taxonomic_resolution: genus
-- find: X Cynochloris macivorii
+- find: x Cynochloris macivorii
   replace: x Cynochloris macivorii
   reason: match_06. Automatic alignment with synonymous term among accepted canonical
     names in APC (2022-11-22)
   taxonomic_resolution: species
-- find: X Cynochloris macivorii
-  replace: x Cynochloris macivorii
-  reason: match_06. Automatic alignment with synonymous term among accepted canonical
-    names in APC (2022-11-23)
-  taxonomic_resolution: species
-- find: X Cynochloris reynoldensis
+- find: x Cynochloris reynoldensis
   replace: x Cynochloris reynoldensis
   reason: match_06. Automatic alignment with synonymous term among accepted canonical
     names in APC (2022-11-22)
   taxonomic_resolution: species
-- find: X Cynochloris reynoldensis
-  replace: x Cynochloris reynoldensis
-  reason: match_06. Automatic alignment with synonymous term among accepted canonical
-    names in APC (2022-11-23)
-  taxonomic_resolution: species
 - find: x Glossadenia x tutelata
-  replace: x Glossadenia tutelata
+  replace: Glossodia x tutelata
   reason: match_06. Automatic alignment with synonymous term among known canonical
     names APC (2022-11-21)
   taxonomic_resolution: species
+- find: xCyanthera glossodioides
+  replace: x Cyanthera glossodioides
+  reason: Manual alignment with canonical species name in APC (E. Wenk, 2023-11-02)
+  taxonomic_resolution: species
+- find: xGlossadenia tutelata
+  replace: Glossodia x tutelata
+  reason: Manual alignment with canonical species name in APC (E. Wenk, 2023-11-02)
+  taxonomic_resolution: species
 exclude_observations:
 - variable: taxon_name
   find: Acacioides Group, Adenotricha Group, Aethiopicum Group, Agrifolia Group, Aspera