Skip to content

Commit

Permalink
Add dataset tests (#143)
Browse files Browse the repository at this point in the history
Fixes #141
- Check units are found in the unit conversions file (this is addressed by the new `dataset_check` functions but thought it might be useful here instead)
- @ehwenk: "Adding this as a test, effectively means 'Missing unit conversion' will never actually be a reason for being in excluded data because tests won't allow that. But let's leave it for now and if it becomes a problem we can comment it out"
- Also added checks for duplicate trait `var_in` and duplicate context `var_in` and `context_property`
- Fix `write_metadata` adding spaces to `custom_R_code`
  • Loading branch information
yangsophieee authored Nov 30, 2023
1 parent 9d33a1c commit c903660
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 52 deletions.
10 changes: 5 additions & 5 deletions R/pivot.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@
check_pivot_wider <- function(dataset) {

duplicates <- dataset$traits %>%
select(
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "value", "observation_id", "value_type",
"repeat_measurements_id", "method_id", "method_context_id"))
) %>%
tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
tidyr::pivot_longer(cols = 7:ncol(.)) %>%
dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
select(
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates", "observation_id",
"value_type")), everything()
) %>%
filter(.data$number_of_duplicates > 1) %>%
dplyr::filter(.data$number_of_duplicates > 1) %>%
nrow()

if (duplicates == 0) {
Expand Down Expand Up @@ -61,14 +61,14 @@ db_traits_pivot_wider <- function(traits) {

# A check for if there are more than 1 value_type for a given taxon_name, observation_id and method
check_value_type <- traits %>%
select(dplyr::all_of(c(
dplyr::select(dplyr::all_of(c(
"trait_name", "value", "dataset_id", "observation_id", "method_id", "method_context_id",
"repeat_measurements_id", "value_type"))) %>%
dplyr::group_by(
.data$dataset_id, .data$observation_id, .data$method_id,
.data$method_context_id, .data$repeat_measurements_id) %>%
dplyr::summarise(n_value_type = length(unique(.data$value_type))) %>%
arrange(.data$observation_id) %>%
dplyr::arrange(.data$observation_id) %>%
dplyr::filter(.data$n_value_type > 1)

if (nrow(check_value_type) > 1) {
Expand Down
30 changes: 14 additions & 16 deletions R/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ dataset_process <- function(filename_data_raw,
)
traits <-
traits %>%
mutate(
dplyr::mutate(
location_id = ifelse(.data$entity_type == "species", NA_character_, .data$location_id)
)
}
Expand All @@ -147,7 +147,7 @@ dataset_process <- function(filename_data_raw,
by = "location_id",
locations %>%
tidyr::pivot_wider(names_from = "location_property", values_from = "value") %>%
mutate(col_tmp = .data[[v]]) %>%
dplyr::mutate(col_tmp = .data[[v]]) %>%
dplyr::select(dplyr::any_of(c("location_id", "col_tmp"))) %>%
stats::na.omit()
)
Expand Down Expand Up @@ -185,13 +185,11 @@ dataset_process <- function(filename_data_raw,
process_format_contributors(dataset_id, schema)

# Record sources
sources <- metadata$source %>%
lapply(util_list_to_bib) %>% purrr::reduce(c)
sources <- metadata$source %>% lapply(util_list_to_bib) %>% purrr::reduce(c)

# Record methods
methods <- process_format_methods(metadata, dataset_id, sources, contributors)


# Retrieve taxonomic details for known species
taxonomic_updates <-
traits %>%
Expand Down Expand Up @@ -561,7 +559,7 @@ process_create_observation_id <- function(data, metadata) {
if (!is.null(traits_table[["repeat_measurements_id"]])) {

to_add_id <- traits_table %>%
filter(.data$repeat_measurements_id == TRUE) %>%
dplyr::filter(.data$repeat_measurements_id == TRUE) %>%
dplyr::pull(.data$trait_name)

i <- !is.na(data$value) & data$trait_name %in% to_add_id &
Expand Down Expand Up @@ -803,12 +801,12 @@ process_create_context_ids <- function(data, contexts) {

contexts_finished <-
contexts %>%
filter(!is.na(.data$value)) %>%
dplyr::filter(!is.na(.data$value)) %>%
dplyr::left_join(
id_link %>% dplyr::bind_rows(),
by = c("context_property", "category", "value")
) %>%
distinct(dplyr::across(-dplyr::any_of("find")))
dplyr::distinct(dplyr::across(-dplyr::any_of("find")))

list(
contexts = contexts_finished %>% util_df_convert_character(),
Expand Down Expand Up @@ -985,7 +983,7 @@ util_check_disallowed_chars <- function(object) {
process_flag_unsupported_characters <- function(data) {

data <- data %>%
mutate(
dplyr::mutate(
error = ifelse(is.na(.data$error) & util_check_disallowed_chars(.data$value),
"Value contains unsupported characters", .data$error)
)
Expand Down Expand Up @@ -1866,8 +1864,8 @@ build_combine <- function(..., d = list(...)) {
metadata[["contributors"]] <-
contributors %>%
dplyr::select(-dplyr::any_of(c("dataset_id", "additional_role"))) %>%
distinct() %>%
arrange(.data$last_name, .data$given_name) %>%
dplyr::distinct() %>%
dplyr::arrange(.data$last_name, .data$given_name) %>%
util_df_to_list()

ret <- list(traits = combine("traits", d),
Expand Down Expand Up @@ -1954,7 +1952,7 @@ dataset_update_taxonomy <- function(austraits_raw, taxa) {
stringr::word(.data$taxon_name, 1), .data$name_to_match_to)
) %>%
# Remove `taxon_rank`, as it is about to be merged back in, but matches will now be possible to more rows
select(-dplyr::any_of(c("taxon_rank", "taxonomic_resolution"))) %>%
dplyr::select(-dplyr::any_of(c("taxon_rank", "taxonomic_resolution"))) %>%
util_df_convert_character() %>%
# Merge in all data from taxa
dplyr::left_join(by = c("taxon_name"),
Expand Down Expand Up @@ -2050,8 +2048,8 @@ check_pivot_duplicates <- function(

# Check for duplicates
database_object$traits %>%
filter(.data$dataset_id %in% dataset_ids) %>%
select(
dplyr::filter(.data$dataset_id %in% dataset_ids) %>%
dplyr::select(
# `taxon_name` and `original_name` are not needed for pivoting but are included for informative purposes
dplyr::all_of(
c("dataset_id", "trait_name", "value", "taxon_name", "original_name", "observation_id",
Expand All @@ -2060,10 +2058,10 @@ check_pivot_duplicates <- function(
tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
tidyr::pivot_longer(cols = 9:ncol(.)) %>%
dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
select(
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates",
"taxon_name", "original_name", "observation_id", "value_type")), everything()
) %>%
filter(.data$number_of_duplicates > 1)
dplyr::filter(.data$number_of_duplicates > 1)

}
16 changes: 8 additions & 8 deletions R/setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ metadata_add_locations <- function(dataset_id, location_data, user_responses = N
# Save and notify
location_data <- location_data %>%
dplyr::select(dplyr::all_of(c(location_name, keep))) %>%
distinct()
dplyr::distinct()

# If user didn't select any variables to keep, so add defaults
if (is.na(keep[1])) {
Expand Down Expand Up @@ -900,15 +900,15 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax
} else {
message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find))
data <- data %>%
filter(.data$find != to_add$find) %>%
dplyr::bind_rows(to_add) %>%
filter(!.data$find == replace) %>%
arrange(.data$find)
dplyr::filter(.data$find != to_add$find) %>%
dplyr::bind_rows(to_add) %>%
dplyr::filter(!.data$find == replace) %>%
dplyr::arrange(.data$find)
}
} else {
data <- dplyr::bind_rows(data, to_add) %>%
filter(!.data$find == replace) %>%
arrange(.data$find)
dplyr::filter(!.data$find == replace) %>%
dplyr::arrange(.data$find)
}
}

Expand Down Expand Up @@ -967,7 +967,7 @@ metadata_add_taxonomic_changes_list <- function(dataset_id, taxonomic_updates) {
))
}
# Write new taxonomic updates to metadata
metadata$taxonomic_updates <- existing_updates %>% dplyr::arrange(.data$find) %>% filter(!.data$find == .data$replace)
metadata$taxonomic_updates <- existing_updates %>% dplyr::arrange(.data$find) %>% dplyr::filter(!.data$find == .data$replace)
} else {

# Read in dataframe of taxonomic changes, split into single-row lists, and add to metadata file
Expand Down
82 changes: 64 additions & 18 deletions R/testdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,27 @@ dataset_test_worker <-
process_format_contexts(dataset_id, data)
)

# Check that there are no duplicate `var_in` or `context_property` fields
context_properties <- sapply(metadata[["contexts"]], "[[", "context_property")
context_vars_in <- sapply(metadata[["contexts"]], "[[", "var_in")

expect_equal(
context_properties |> duplicated() |> sum(),
0,
info = sprintf(
"%s\tcontexts - duplicate `context_property` values detected: '%s'",
red(f),
paste(context_properties[duplicated(context_properties)], collapse = "', '"))
)
expect_equal(
context_vars_in |> duplicated() |> sum(),
0,
info = sprintf(
"%s\tcontexts - duplicate `var_in` values detected: '%s'",
red(f),
paste(context_vars_in[duplicated(context_vars_in)], collapse = "', '"))
)

# Check context details load
if (nrow(contexts) > 0) {

Expand All @@ -599,29 +620,30 @@ dataset_test_worker <-
info = paste0(red(f), "\tcontexts"), label = "field names"
)


# Check that unique context `value`'s only have one unique description
expect_equal(
contexts %>% dplyr::group_by(.data$context_property, .data$value) %>% dplyr::summarise(n = dplyr::n_distinct(.data$description)) %>%
filter(.data$n > 1) %>% nrow(),
dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\tcontexts - `value`'s should only have one unique description each: '%s'",
red(f),
paste(
contexts %>% dplyr::group_by(.data$context_property, .data$value) %>% dplyr::summarise(n = dplyr::n_distinct(.data$description)) %>%
filter(.data$n > 1) %>% dplyr::pull(.data$value) %>% unique(),
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$value) %>% unique(),
collapse = "', '")
)
)

# Check that there are no duplicate `find` fields
expect_equal(
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>%
nrow(),
0, info = sprintf(
"%s\tcontexts - duplicate `find` values detected: '%s'",
red(f),
paste(
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>%
dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
Expand Down Expand Up @@ -682,6 +704,7 @@ dataset_test_worker <-
}

## Traits

expect_list_elements_contains_names(
metadata[["traits"]],
c("var_in", "unit_in", "trait_name", "value_type", "basis_of_value"),
Expand All @@ -707,8 +730,31 @@ dataset_test_worker <-
label = "`trait_name`'s"
)

# Check units are found in `unit_conversions.csv`
units <- read_csv("config/unit_conversions.csv")
expect_is_in(
traits$unit_in, units$unit_from,
info = paste0(red(f), "\ttraits"),
label = "`unit_in`'s"
)

# Check no duplicate `var_in`'s

expect_equal(
traits %>% dplyr::group_by(.data$var_in) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% nrow(),
0,
info = sprintf(
"%s\ttraits - duplicate `var_in` values detected: '%s'",
red(f),
paste(
traits %>% dplyr::group_by(.data$var_in) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$var_in) %>% unique(),
collapse = "', '")
)
)


# Now that traits loaded, check details of contexts match
if (nrow(contexts > 0)) {

Expand All @@ -723,7 +769,7 @@ dataset_test_worker <-

for (j in unique(contexts[["var_in"]])) {

contextsub <- contexts %>% filter(var_in == j)
contextsub <- contexts %>% dplyr::filter(var_in == j)

# Context values align either with a column of data or a column of traits table
if (is.null(data[[j]])) {
Expand Down Expand Up @@ -852,13 +898,14 @@ dataset_test_worker <-

# First check no duplicate combinations of `find`
expect_equal(
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(),
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\tsubstitutions - duplicate `find` values detected: '%s'",
red(f),
paste(
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
dplyr::pull(.data$find) %>% unique(),
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
)
Expand Down Expand Up @@ -898,12 +945,12 @@ dataset_test_worker <-

# Check no duplicate `find` values
expect_equal(
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(),
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\ttaxonomic_updates - duplicate `find` values detected: '%s'",
red(f),
paste(
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>%
dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
Expand Down Expand Up @@ -1005,13 +1052,13 @@ dataset_test_worker <-
# Check no duplicate `find` values
expect_equal(
x %>% dplyr::group_by(.data$variable, .data$find) %>%
dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(),
dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\texclude_observations - duplicate `find` values detected: '%s'",
red(f),
paste(
x %>% dplyr::group_by(.data$variable, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>%
filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(),
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
)
Expand All @@ -1030,7 +1077,7 @@ dataset_test_worker <-
expect_is_in(
find_values,
# Extract values from the data for that variable
parsed_data %>% filter(.data$trait_name == variable) %>% dplyr::pull(.data$value) %>% unique(),
parsed_data %>% dplyr::filter(.data$trait_name == variable) %>% dplyr::pull(.data$value) %>% unique(),
info = paste0(red(f), "\texclude_observations"), label = sprintf("variable '%s'", variable)
)

Expand Down Expand Up @@ -1061,9 +1108,8 @@ dataset_test_worker <-
} else {

# For wide datasets, expect variables in traits are headers in the data
values <- names(data)
expect_is_in(
traits[["var_in"]], values,
traits[["var_in"]], names(data),
info = paste0(red(files[2]), "\ttraits"), label = "`var_in`"
)

Expand All @@ -1081,12 +1127,12 @@ dataset_test_worker <-



## Check traits are not only NAs
## Check that not all trait names are NAs
expect_false(
nrow(metadata[["traits"]] %>% util_list_to_df2() %>% dplyr::filter(!is.na(.data$trait_name))) == 0,
nrow(traits %>% dplyr::filter(!is.na(.data$trait_name))) == 0,
info = paste0(red(f), "\ttraits - only contain NA `trait_name`'s"))

if (nrow(metadata[["traits"]] %>% util_list_to_df2() %>% dplyr::filter(!is.na(.data$trait_name))) > 0) {
if (nrow(traits %>% dplyr::filter(!is.na(.data$trait_name))) > 0) {

# Test build dataset
expect_no_error(
Expand Down
Loading

0 comments on commit c903660

Please sign in to comment.