diff --git a/R/pivot.R b/R/pivot.R index cd742750..cfd1c87d 100644 --- a/R/pivot.R +++ b/R/pivot.R @@ -9,18 +9,18 @@ check_pivot_wider <- function(dataset) { duplicates <- dataset$traits %>% - select( + dplyr::select( dplyr::all_of(c("dataset_id", "trait_name", "value", "observation_id", "value_type", "repeat_measurements_id", "method_id", "method_context_id")) ) %>% tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>% tidyr::pivot_longer(cols = 7:ncol(.)) %>% dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>% - select( + dplyr::select( dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates", "observation_id", "value_type")), everything() ) %>% - filter(.data$number_of_duplicates > 1) %>% + dplyr::filter(.data$number_of_duplicates > 1) %>% nrow() if (duplicates == 0) { @@ -61,14 +61,14 @@ db_traits_pivot_wider <- function(traits) { # A check for if there are more than 1 value_type for a given taxon_name, observation_id and method check_value_type <- traits %>% - select(dplyr::all_of(c( + dplyr::select(dplyr::all_of(c( "trait_name", "value", "dataset_id", "observation_id", "method_id", "method_context_id", "repeat_measurements_id", "value_type"))) %>% dplyr::group_by( .data$dataset_id, .data$observation_id, .data$method_id, .data$method_context_id, .data$repeat_measurements_id) %>% dplyr::summarise(n_value_type = length(unique(.data$value_type))) %>% - arrange(.data$observation_id) %>% + dplyr::arrange(.data$observation_id) %>% dplyr::filter(.data$n_value_type > 1) if (nrow(check_value_type) > 1) { diff --git a/R/process.R b/R/process.R index 4e9fb832..aecb374a 100644 --- a/R/process.R +++ b/R/process.R @@ -126,7 +126,7 @@ dataset_process <- function(filename_data_raw, ) traits <- traits %>% - mutate( + dplyr::mutate( location_id = ifelse(.data$entity_type == "species", NA_character_, .data$location_id) ) } @@ -147,7 +147,7 @@ dataset_process <- function(filename_data_raw, by = "location_id", locations %>% tidyr::pivot_wider(names_from = "location_property", values_from = "value") %>% - mutate(col_tmp = .data[[v]]) %>% + dplyr::mutate(col_tmp = .data[[v]]) %>% dplyr::select(dplyr::any_of(c("location_id", "col_tmp"))) %>% stats::na.omit() ) @@ -185,13 +185,11 @@ dataset_process <- function(filename_data_raw, process_format_contributors(dataset_id, schema) # Record sources - sources <- metadata$source %>% - lapply(util_list_to_bib) %>% purrr::reduce(c) + sources <- metadata$source %>% lapply(util_list_to_bib) %>% purrr::reduce(c) # Record methods methods <- process_format_methods(metadata, dataset_id, sources, contributors) - # Retrieve taxonomic details for known species taxonomic_updates <- traits %>% @@ -561,7 +559,7 @@ process_create_observation_id <- function(data, metadata) { if (!is.null(traits_table[["repeat_measurements_id"]])) { to_add_id <- traits_table %>% - filter(.data$repeat_measurements_id == TRUE) %>% + dplyr::filter(.data$repeat_measurements_id == TRUE) %>% dplyr::pull(.data$trait_name) i <- !is.na(data$value) & data$trait_name %in% to_add_id & @@ -803,12 +801,12 @@ process_create_context_ids <- function(data, contexts) { contexts_finished <- contexts %>% - filter(!is.na(.data$value)) %>% + dplyr::filter(!is.na(.data$value)) %>% dplyr::left_join( id_link %>% dplyr::bind_rows(), by = c("context_property", "category", "value") ) %>% - distinct(dplyr::across(-dplyr::any_of("find"))) + dplyr::distinct(dplyr::across(-dplyr::any_of("find"))) list( contexts = contexts_finished %>% util_df_convert_character(), @@ -985,7 +983,7 @@ util_check_disallowed_chars <- function(object) { process_flag_unsupported_characters <- function(data) { data <- data %>% - mutate( + dplyr::mutate( error = ifelse(is.na(.data$error) & util_check_disallowed_chars(.data$value), "Value contains unsupported characters", .data$error) ) @@ -1866,8 +1864,8 @@ build_combine <- function(..., d = list(...)) { metadata[["contributors"]] <- contributors %>% dplyr::select(-dplyr::any_of(c("dataset_id", "additional_role"))) %>% - distinct() %>% - arrange(.data$last_name, .data$given_name) %>% + dplyr::distinct() %>% + dplyr::arrange(.data$last_name, .data$given_name) %>% util_df_to_list() ret <- list(traits = combine("traits", d), @@ -1954,7 +1952,7 @@ dataset_update_taxonomy <- function(austraits_raw, taxa) { stringr::word(.data$taxon_name, 1), .data$name_to_match_to) ) %>% # Remove `taxon_rank`, as it is about to be merged back in, but matches will now be possible to more rows - select(-dplyr::any_of(c("taxon_rank", "taxonomic_resolution"))) %>% + dplyr::select(-dplyr::any_of(c("taxon_rank", "taxonomic_resolution"))) %>% util_df_convert_character() %>% # Merge in all data from taxa dplyr::left_join(by = c("taxon_name"), @@ -2050,8 +2048,8 @@ check_pivot_duplicates <- function( # Check for duplicates database_object$traits %>% - filter(.data$dataset_id %in% dataset_ids) %>% - select( + dplyr::filter(.data$dataset_id %in% dataset_ids) %>% + dplyr::select( # `taxon_name` and `original_name` are not needed for pivoting but are included for informative purposes dplyr::all_of( c("dataset_id", "trait_name", "value", "taxon_name", "original_name", "observation_id", @@ -2060,10 +2058,10 @@ check_pivot_duplicates <- function( tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>% tidyr::pivot_longer(cols = 9:ncol(.)) %>% dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>% - select( + dplyr::select( dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates", "taxon_name", "original_name", "observation_id", "value_type")), everything() ) %>% - filter(.data$number_of_duplicates > 1) + dplyr::filter(.data$number_of_duplicates > 1) } diff --git a/R/setup.R b/R/setup.R index 0786365b..df350424 100644 --- a/R/setup.R +++ b/R/setup.R @@ -377,7 +377,7 @@ metadata_add_locations <- function(dataset_id, location_data, user_responses = N # Save and notify location_data <- location_data %>% dplyr::select(dplyr::all_of(c(location_name, keep))) %>% - distinct() + dplyr::distinct() # If user didn't select any variables to keep, so add defaults if (is.na(keep[1])) { @@ -900,15 +900,15 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax } else { message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find)) data <- data %>% - filter(.data$find != to_add$find) %>% - dplyr::bind_rows(to_add) %>% - filter(!.data$find == replace) %>% - arrange(.data$find) + dplyr::filter(.data$find != to_add$find) %>% + dplyr::bind_rows(to_add) %>% + dplyr::filter(!.data$find == replace) %>% + dplyr::arrange(.data$find) } } else { data <- dplyr::bind_rows(data, to_add) %>% - filter(!.data$find == replace) %>% - arrange(.data$find) + dplyr::filter(!.data$find == replace) %>% + dplyr::arrange(.data$find) } } @@ -967,7 +967,7 @@ metadata_add_taxonomic_changes_list <- function(dataset_id, taxonomic_updates) { )) } # Write new taxonomic updates to metadata - metadata$taxonomic_updates <- existing_updates %>% dplyr::arrange(.data$find) %>% filter(!.data$find == .data$replace) + metadata$taxonomic_updates <- existing_updates %>% dplyr::arrange(.data$find) %>% dplyr::filter(!.data$find == .data$replace) } else { # Read in dataframe of taxonomic changes, split into single-row lists, and add to metadata file diff --git a/R/testdata.R b/R/testdata.R index a9170c56..fcd90058 100644 --- a/R/testdata.R +++ b/R/testdata.R @@ -590,6 +590,27 @@ dataset_test_worker <- process_format_contexts(dataset_id, data) ) + # Check that there are no duplicate `var_in` or `context_property` fields + context_properties <- sapply(metadata[["contexts"]], "[[", "context_property") + context_vars_in <- sapply(metadata[["contexts"]], "[[", "var_in") + + expect_equal( + context_properties |> duplicated() |> sum(), + 0, + info = sprintf( + "%s\tcontexts - duplicate `context_property` values detected: '%s'", + red(f), + paste(context_properties[duplicated(context_properties)], collapse = "', '")) + ) + expect_equal( + context_vars_in |> duplicated() |> sum(), + 0, + info = sprintf( + "%s\tcontexts - duplicate `var_in` values detected: '%s'", + red(f), + paste(context_vars_in[duplicated(context_vars_in)], collapse = "', '")) + ) + # Check context details load if (nrow(contexts) > 0) { @@ -599,29 +620,30 @@ dataset_test_worker <- info = paste0(red(f), "\tcontexts"), label = "field names" ) + # Check that unique context `value`'s only have one unique description expect_equal( contexts %>% dplyr::group_by(.data$context_property, .data$value) %>% dplyr::summarise(n = dplyr::n_distinct(.data$description)) %>% - filter(.data$n > 1) %>% nrow(), + dplyr::filter(.data$n > 1) %>% nrow(), 0, info = sprintf( "%s\tcontexts - `value`'s should only have one unique description each: '%s'", red(f), paste( contexts %>% dplyr::group_by(.data$context_property, .data$value) %>% dplyr::summarise(n = dplyr::n_distinct(.data$description)) %>% - filter(.data$n > 1) %>% dplyr::pull(.data$value) %>% unique(), + dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$value) %>% unique(), collapse = "', '") ) ) # Check that there are no duplicate `find` fields expect_equal( - contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% + contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(), 0, info = sprintf( "%s\tcontexts - duplicate `find` values detected: '%s'", red(f), paste( - contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% + contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(), collapse = "', '") ) @@ -682,6 +704,7 @@ dataset_test_worker <- } ## Traits + expect_list_elements_contains_names( metadata[["traits"]], c("var_in", "unit_in", "trait_name", "value_type", "basis_of_value"), @@ -707,8 +730,31 @@ dataset_test_worker <- label = "`trait_name`'s" ) + # Check units are found in `unit_conversions.csv` + units <- read_csv("config/unit_conversions.csv") + expect_is_in( + traits$unit_in, units$unit_from, + info = paste0(red(f), "\ttraits"), + label = "`unit_in`'s" + ) + # Check no duplicate `var_in`'s + expect_equal( + traits %>% dplyr::group_by(.data$var_in) %>% dplyr::summarise(n = dplyr::n()) %>% + dplyr::filter(.data$n > 1) %>% nrow(), + 0, + info = sprintf( + "%s\ttraits - duplicate `var_in` values detected: '%s'", + red(f), + paste( + traits %>% dplyr::group_by(.data$var_in) %>% dplyr::summarise(n = dplyr::n()) %>% + dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$var_in) %>% unique(), + collapse = "', '") + ) + ) + + # Now that traits loaded, check details of contexts match if (nrow(contexts > 0)) { @@ -723,7 +769,7 @@ dataset_test_worker <- for (j in unique(contexts[["var_in"]])) { - contextsub <- contexts %>% filter(var_in == j) + contextsub <- contexts %>% dplyr::filter(var_in == j) # Context values align either with a column of data or a column of traits table if (is.null(data[[j]])) { @@ -852,13 +898,14 @@ dataset_test_worker <- # First check no duplicate combinations of `find` expect_equal( - x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(), + x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% + dplyr::filter(.data$n > 1) %>% nrow(), 0, info = sprintf( "%s\tsubstitutions - duplicate `find` values detected: '%s'", red(f), paste( - x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% - dplyr::pull(.data$find) %>% unique(), + x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% + dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(), collapse = "', '") ) ) @@ -898,12 +945,12 @@ dataset_test_worker <- # Check no duplicate `find` values expect_equal( - x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(), + x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(), 0, info = sprintf( "%s\ttaxonomic_updates - duplicate `find` values detected: '%s'", red(f), paste( - x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% + x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(), collapse = "', '") ) @@ -1005,13 +1052,13 @@ dataset_test_worker <- # Check no duplicate `find` values expect_equal( x %>% dplyr::group_by(.data$variable, .data$find) %>% - dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(), + dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(), 0, info = sprintf( "%s\texclude_observations - duplicate `find` values detected: '%s'", red(f), paste( x %>% dplyr::group_by(.data$variable, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% - filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(), + dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(), collapse = "', '") ) ) @@ -1030,7 +1077,7 @@ dataset_test_worker <- expect_is_in( find_values, # Extract values from the data for that variable - parsed_data %>% filter(.data$trait_name == variable) %>% dplyr::pull(.data$value) %>% unique(), + parsed_data %>% dplyr::filter(.data$trait_name == variable) %>% dplyr::pull(.data$value) %>% unique(), info = paste0(red(f), "\texclude_observations"), label = sprintf("variable '%s'", variable) ) @@ -1061,9 +1108,8 @@ dataset_test_worker <- } else { # For wide datasets, expect variables in traits are headers in the data - values <- names(data) expect_is_in( - traits[["var_in"]], values, + traits[["var_in"]], names(data), info = paste0(red(files[2]), "\ttraits"), label = "`var_in`" ) @@ -1081,12 +1127,12 @@ dataset_test_worker <- - ## Check traits are not only NAs + ## Check that not all trait names are NAs expect_false( - nrow(metadata[["traits"]] %>% util_list_to_df2() %>% dplyr::filter(!is.na(.data$trait_name))) == 0, + nrow(traits %>% dplyr::filter(!is.na(.data$trait_name))) == 0, info = paste0(red(f), "\ttraits - only contain NA `trait_name`'s")) - if (nrow(metadata[["traits"]] %>% util_list_to_df2() %>% dplyr::filter(!is.na(.data$trait_name))) > 0) { + if (nrow(traits %>% dplyr::filter(!is.na(.data$trait_name))) > 0) { # Test build dataset expect_no_error( diff --git a/R/utils.R b/R/utils.R index 819b000a..50157204 100644 --- a/R/utils.R +++ b/R/utils.R @@ -268,12 +268,13 @@ write_metadata <- function(data, path, style_code = FALSE) { if (!is.na(data$dataset$custom_R_code)) { code <- data$dataset$custom_R_code + code <- stringr::str_trim(code, side = "left") if (style_code) code <- code %>% suppressWarnings(styler::style_text(transformers = .data$tidyverse_style(strict = TRUE))) txt <- gsub("custom_R_code: .na", code %>% paste(collapse = "\n") %>% - paste0("custom_R_code:", .), txt, fixed = TRUE) + paste0("custom_R_code: ", .), txt, fixed = TRUE) } if (!stringr::str_sub(txt, nchar(txt)) == "\n") diff --git a/inst/support/report_dataset.Rmd b/inst/support/report_dataset.Rmd index c9bb0888..f42db845 100644 --- a/inst/support/report_dataset.Rmd +++ b/inst/support/report_dataset.Rmd @@ -255,9 +255,9 @@ new_question("(section `dataset`) Can you provide more detailed information for ```{r, results='asis', echo=FALSE} missing <- metadata$dataset %>% - util_list_to_df1() %>% - filter(value == "unknown") %>% - pull(key) + util_list_to_df1() %>% + filter(value == "unknown") %>% + pull(key) for (v in missing) { sprintf("(section `dataset`) Can you provide missing details for the variable `%s`?\n", v) %>% diff --git a/tests/testthat/examples/Test_2023_3/metadata.yml b/tests/testthat/examples/Test_2023_3/metadata.yml index 6211a1ff..2ffa033a 100644 --- a/tests/testthat/examples/Test_2023_3/metadata.yml +++ b/tests/testthat/examples/Test_2023_3/metadata.yml @@ -91,7 +91,7 @@ dataset: arrange(category) %>% distinct() %>% mutate(entity_measured = ifelse(is.na(entity_measured), "unknown", entity_measured)) - ' + ' collection_date: unknown/2022 taxon_name: taxon_name trait_name: trait_name