Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dataset tests #143

Merged
merged 12 commits into from
Nov 30, 2023
10 changes: 5 additions & 5 deletions R/pivot.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@
check_pivot_wider <- function(dataset) {

duplicates <- dataset$traits %>%
select(
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "value", "observation_id", "value_type",
"repeat_measurements_id", "method_id", "method_context_id"))
) %>%
tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
tidyr::pivot_longer(cols = 7:ncol(.)) %>%
dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
select(
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates", "observation_id",
"value_type")), everything()
) %>%
filter(.data$number_of_duplicates > 1) %>%
dplyr::filter(.data$number_of_duplicates > 1) %>%
nrow()

if (duplicates == 0) {
Expand Down Expand Up @@ -61,14 +61,14 @@ db_traits_pivot_wider <- function(traits) {

# A check for if there are more than 1 value_type for a given taxon_name, observation_id and method
check_value_type <- traits %>%
select(dplyr::all_of(c(
dplyr::select(dplyr::all_of(c(
"trait_name", "value", "dataset_id", "observation_id", "method_id", "method_context_id",
"repeat_measurements_id", "value_type"))) %>%
dplyr::group_by(
.data$dataset_id, .data$observation_id, .data$method_id,
.data$method_context_id, .data$repeat_measurements_id) %>%
dplyr::summarise(n_value_type = length(unique(.data$value_type))) %>%
arrange(.data$observation_id) %>%
dplyr::arrange(.data$observation_id) %>%
dplyr::filter(.data$n_value_type > 1)

if (nrow(check_value_type) > 1) {
Expand Down
30 changes: 14 additions & 16 deletions R/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ dataset_process <- function(filename_data_raw,
)
traits <-
traits %>%
mutate(
dplyr::mutate(
location_id = ifelse(.data$entity_type == "species", NA_character_, .data$location_id)
)
}
Expand All @@ -147,7 +147,7 @@ dataset_process <- function(filename_data_raw,
by = "location_id",
locations %>%
tidyr::pivot_wider(names_from = "location_property", values_from = "value") %>%
mutate(col_tmp = .data[[v]]) %>%
dplyr::mutate(col_tmp = .data[[v]]) %>%
dplyr::select(dplyr::any_of(c("location_id", "col_tmp"))) %>%
stats::na.omit()
)
Expand Down Expand Up @@ -185,13 +185,11 @@ dataset_process <- function(filename_data_raw,
process_format_contributors(dataset_id, schema)

# Record sources
sources <- metadata$source %>%
lapply(util_list_to_bib) %>% purrr::reduce(c)
sources <- metadata$source %>% lapply(util_list_to_bib) %>% purrr::reduce(c)

# Record methods
methods <- process_format_methods(metadata, dataset_id, sources, contributors)


# Retrieve taxonomic details for known species
taxonomic_updates <-
traits %>%
Expand Down Expand Up @@ -561,7 +559,7 @@ process_create_observation_id <- function(data, metadata) {
if (!is.null(traits_table[["repeat_measurements_id"]])) {

to_add_id <- traits_table %>%
filter(.data$repeat_measurements_id == TRUE) %>%
dplyr::filter(.data$repeat_measurements_id == TRUE) %>%
dplyr::pull(.data$trait_name)

i <- !is.na(data$value) & data$trait_name %in% to_add_id &
Expand Down Expand Up @@ -803,12 +801,12 @@ process_create_context_ids <- function(data, contexts) {

contexts_finished <-
contexts %>%
filter(!is.na(.data$value)) %>%
dplyr::filter(!is.na(.data$value)) %>%
dplyr::left_join(
id_link %>% dplyr::bind_rows(),
by = c("context_property", "category", "value")
) %>%
distinct(dplyr::across(-dplyr::any_of("find")))
dplyr::distinct(dplyr::across(-dplyr::any_of("find")))

list(
contexts = contexts_finished %>% util_df_convert_character(),
Expand Down Expand Up @@ -985,7 +983,7 @@ util_check_disallowed_chars <- function(object) {
process_flag_unsupported_characters <- function(data) {

data <- data %>%
mutate(
dplyr::mutate(
error = ifelse(is.na(.data$error) & util_check_disallowed_chars(.data$value),
"Value contains unsupported characters", .data$error)
)
Expand Down Expand Up @@ -1866,8 +1864,8 @@ build_combine <- function(..., d = list(...)) {
metadata[["contributors"]] <-
contributors %>%
dplyr::select(-dplyr::any_of(c("dataset_id", "additional_role"))) %>%
distinct() %>%
arrange(.data$last_name, .data$given_name) %>%
dplyr::distinct() %>%
dplyr::arrange(.data$last_name, .data$given_name) %>%
util_df_to_list()

ret <- list(traits = combine("traits", d),
Expand Down Expand Up @@ -1954,7 +1952,7 @@ dataset_update_taxonomy <- function(austraits_raw, taxa) {
stringr::word(.data$taxon_name, 1), .data$name_to_match_to)
) %>%
# Remove `taxon_rank`, as it is about to be merged back in, but matches will now be possible to more rows
select(-dplyr::any_of(c("taxon_rank", "taxonomic_resolution"))) %>%
dplyr::select(-dplyr::any_of(c("taxon_rank", "taxonomic_resolution"))) %>%
util_df_convert_character() %>%
# Merge in all data from taxa
dplyr::left_join(by = c("taxon_name"),
Expand Down Expand Up @@ -2050,8 +2048,8 @@ check_pivot_duplicates <- function(

# Check for duplicates
database_object$traits %>%
filter(.data$dataset_id %in% dataset_ids) %>%
select(
dplyr::filter(.data$dataset_id %in% dataset_ids) %>%
dplyr::select(
# `taxon_name` and `original_name` are not needed for pivoting but are included for informative purposes
dplyr::all_of(
c("dataset_id", "trait_name", "value", "taxon_name", "original_name", "observation_id",
Expand All @@ -2060,10 +2058,10 @@ check_pivot_duplicates <- function(
tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
tidyr::pivot_longer(cols = 9:ncol(.)) %>%
dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
select(
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates",
"taxon_name", "original_name", "observation_id", "value_type")), everything()
) %>%
filter(.data$number_of_duplicates > 1)
dplyr::filter(.data$number_of_duplicates > 1)

}
16 changes: 8 additions & 8 deletions R/setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ metadata_add_locations <- function(dataset_id, location_data, user_responses = N
# Save and notify
location_data <- location_data %>%
dplyr::select(dplyr::all_of(c(location_name, keep))) %>%
distinct()
dplyr::distinct()

# If user didn't select any variables to keep, so add defaults
if (is.na(keep[1])) {
Expand Down Expand Up @@ -900,15 +900,15 @@ metadata_add_taxonomic_change <- function(dataset_id, find, replace, reason, tax
} else {
message(sprintf(red("Existing substitution will be overwritten for ") %+% green("'%s'"), find))
data <- data %>%
filter(.data$find != to_add$find) %>%
dplyr::bind_rows(to_add) %>%
filter(!.data$find == replace) %>%
arrange(.data$find)
dplyr::filter(.data$find != to_add$find) %>%
dplyr::bind_rows(to_add) %>%
dplyr::filter(!.data$find == replace) %>%
dplyr::arrange(.data$find)
}
} else {
data <- dplyr::bind_rows(data, to_add) %>%
filter(!.data$find == replace) %>%
arrange(.data$find)
dplyr::filter(!.data$find == replace) %>%
dplyr::arrange(.data$find)
}
}

Expand Down Expand Up @@ -967,7 +967,7 @@ metadata_add_taxonomic_changes_list <- function(dataset_id, taxonomic_updates) {
))
}
# Write new taxonomic updates to metadata
metadata$taxonomic_updates <- existing_updates %>% dplyr::arrange(.data$find) %>% filter(!.data$find == .data$replace)
metadata$taxonomic_updates <- existing_updates %>% dplyr::arrange(.data$find) %>% dplyr::filter(!.data$find == .data$replace)
} else {

# Read in dataframe of taxonomic changes, split into single-row lists, and add to metadata file
Expand Down
82 changes: 64 additions & 18 deletions R/testdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,27 @@ dataset_test_worker <-
process_format_contexts(dataset_id, data)
)

# Check that there are no duplicate `var_in` or `context_property` fields
context_properties <- sapply(metadata[["contexts"]], "[[", "context_property")
context_vars_in <- sapply(metadata[["contexts"]], "[[", "var_in")

expect_equal(
context_properties |> duplicated() |> sum(),
dfalster marked this conversation as resolved.
Show resolved Hide resolved
0,
info = sprintf(
"%s\tcontexts - duplicate `context_property` values detected: '%s'",
red(f),
paste(context_properties[duplicated(context_properties)], collapse = "', '"))
)
expect_equal(
context_vars_in |> duplicated() |> sum(),
0,
info = sprintf(
"%s\tcontexts - duplicate `var_in` values detected: '%s'",
red(f),
paste(context_vars_in[duplicated(context_vars_in)], collapse = "', '"))
)

# Check context details load
if (nrow(contexts) > 0) {

Expand All @@ -599,29 +620,30 @@ dataset_test_worker <-
info = paste0(red(f), "\tcontexts"), label = "field names"
)


# Check that unique context `value`'s only have one unique description
expect_equal(
contexts %>% dplyr::group_by(.data$context_property, .data$value) %>% dplyr::summarise(n = dplyr::n_distinct(.data$description)) %>%
filter(.data$n > 1) %>% nrow(),
dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\tcontexts - `value`'s should only have one unique description each: '%s'",
red(f),
paste(
contexts %>% dplyr::group_by(.data$context_property, .data$value) %>% dplyr::summarise(n = dplyr::n_distinct(.data$description)) %>%
filter(.data$n > 1) %>% dplyr::pull(.data$value) %>% unique(),
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$value) %>% unique(),
collapse = "', '")
)
)

# Check that there are no duplicate `find` fields
expect_equal(
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>%
nrow(),
0, info = sprintf(
"%s\tcontexts - duplicate `find` values detected: '%s'",
red(f),
paste(
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
contexts %>% dplyr::group_by(.data$context_property, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>%
dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
Expand Down Expand Up @@ -682,6 +704,7 @@ dataset_test_worker <-
}

## Traits

expect_list_elements_contains_names(
metadata[["traits"]],
c("var_in", "unit_in", "trait_name", "value_type", "basis_of_value"),
Expand All @@ -707,8 +730,31 @@ dataset_test_worker <-
label = "`trait_name`'s"
)

# Check units are found in `unit_conversions.csv`
units <- read_csv("config/unit_conversions.csv")
expect_is_in(
traits$unit_in, units$unit_from,
info = paste0(red(f), "\ttraits"),
label = "`unit_in`'s"
)

# Check no duplicate `var_in`'s

expect_equal(
traits %>% dplyr::group_by(.data$var_in) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% nrow(),
0,
info = sprintf(
"%s\ttraits - duplicate `var_in` values detected: '%s'",
red(f),
paste(
traits %>% dplyr::group_by(.data$var_in) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$var_in) %>% unique(),
collapse = "', '")
)
)


# Now that traits loaded, check details of contexts match
if (nrow(contexts > 0)) {

Expand All @@ -723,7 +769,7 @@ dataset_test_worker <-

for (j in unique(contexts[["var_in"]])) {

contextsub <- contexts %>% filter(var_in == j)
contextsub <- contexts %>% dplyr::filter(var_in == j)

# Context values align either with a column of data or a column of traits table
if (is.null(data[[j]])) {
Expand Down Expand Up @@ -852,13 +898,14 @@ dataset_test_worker <-

# First check no duplicate combinations of `find`
expect_equal(
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(),
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\tsubstitutions - duplicate `find` values detected: '%s'",
red(f),
paste(
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
dplyr::pull(.data$find) %>% unique(),
x[[trait]] %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>%
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
)
Expand Down Expand Up @@ -898,12 +945,12 @@ dataset_test_worker <-

# Check no duplicate `find` values
expect_equal(
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(),
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\ttaxonomic_updates - duplicate `find` values detected: '%s'",
red(f),
paste(
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>%
x %>% dplyr::group_by(.data$find) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>%
dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
Expand Down Expand Up @@ -1005,13 +1052,13 @@ dataset_test_worker <-
# Check no duplicate `find` values
expect_equal(
x %>% dplyr::group_by(.data$variable, .data$find) %>%
dplyr::summarise(n = dplyr::n()) %>% filter(.data$n > 1) %>% nrow(),
dplyr::summarise(n = dplyr::n()) %>% dplyr::filter(.data$n > 1) %>% nrow(),
0, info = sprintf(
"%s\texclude_observations - duplicate `find` values detected: '%s'",
red(f),
paste(
x %>% dplyr::group_by(.data$variable, .data$find) %>% dplyr::summarise(n = dplyr::n()) %>%
filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(),
dplyr::filter(.data$n > 1) %>% dplyr::pull(.data$find) %>% unique(),
collapse = "', '")
)
)
Expand All @@ -1030,7 +1077,7 @@ dataset_test_worker <-
expect_is_in(
find_values,
# Extract values from the data for that variable
parsed_data %>% filter(.data$trait_name == variable) %>% dplyr::pull(.data$value) %>% unique(),
parsed_data %>% dplyr::filter(.data$trait_name == variable) %>% dplyr::pull(.data$value) %>% unique(),
info = paste0(red(f), "\texclude_observations"), label = sprintf("variable '%s'", variable)
)

Expand Down Expand Up @@ -1061,9 +1108,8 @@ dataset_test_worker <-
} else {

# For wide datasets, expect variables in traits are headers in the data
values <- names(data)
expect_is_in(
traits[["var_in"]], values,
traits[["var_in"]], names(data),
info = paste0(red(files[2]), "\ttraits"), label = "`var_in`"
)

Expand All @@ -1081,12 +1127,12 @@ dataset_test_worker <-



## Check traits are not only NAs
## Check that not all trait names are NAs
expect_false(
nrow(metadata[["traits"]] %>% util_list_to_df2() %>% dplyr::filter(!is.na(.data$trait_name))) == 0,
nrow(traits %>% dplyr::filter(!is.na(.data$trait_name))) == 0,
info = paste0(red(f), "\ttraits - only contain NA `trait_name`'s"))

if (nrow(metadata[["traits"]] %>% util_list_to_df2() %>% dplyr::filter(!is.na(.data$trait_name))) > 0) {
if (nrow(traits %>% dplyr::filter(!is.na(.data$trait_name))) > 0) {

# Test build dataset
expect_no_error(
Expand Down
Loading
Loading