Skip to content

Commit

Permalink
Merge pull request #107 from hubverse-org/98/subset-grid-by-out-type
Browse files Browse the repository at this point in the history
98/ Subset expanded grid of valid values by output type
  • Loading branch information
annakrystalli authored Aug 16, 2024
2 parents 172091c + bc18984 commit ca8e22b
Show file tree
Hide file tree
Showing 58 changed files with 1,758 additions and 188 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: hubValidations
Title: Testing framework for hubverse hub validations
Version: 0.4.0
Version: 0.5.0
Authors@R: c(
person(
given = "Anna",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export(is_exec_warn)
export(is_failure)
export(is_info)
export(is_success)
export(match_tbl_to_model_task)
export(new_hub_validations)
export(not_pass)
export(opt_check_metadata_team_max_model_n)
Expand Down
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# hubValidations 0.5.0

This release introduces **significant improvements in the performance of submission validation** via the following changes:

* Add ability to sub-set expanded valid value grids by output type through `output_type` argument in `expand_model_out_grid()` (#98).
* Add ability to ignore the values of derived task IDs in expanded valid value grids through argument `derived_task_ids` in `expand_model_out_grid()`.
* Use sub-setting and batching of model output data validation by output type in appropriate lower level checks and add ability to ignore derived task IDs in `validate_model_data()`, `validate_submission()` and `validate_pr()`.

Both of these changes **allow for the creation of smaller, more focused expanded valid value grids, significantly reducing pressure on memory** when working with large, complex hub configs and making submission validation much more efficient.

Additional useful functionality:

* Add ability to subset by output type and ignore derived task IDs to `submission_tmpl()`. Ignoring derived task ids can be particularly useful to avoid creating templates with invalid derived task ID value combinations.
* Add new exported function `match_tbl_to_model_task()` that matches the rows in a `tbl` of model output data to a model task of a given round (as defined in `tasks.json`).

# hubValidations 0.4.0

- Add new `check_tbl_spl_compound_taskid_set()` check function to `validate_model_data()` that ensures that sample compound task id sets for each modeling task match or are coarser than the expected set defined in `tasks.json` config.
Expand Down
12 changes: 7 additions & 5 deletions R/check_tbl_spl_compound_taskid_set.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' Column types must **all be character**.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_colnames return
#' @inheritParams expand_model_out_grid
#' @details If the check fails, the output of the check includes an `errors` element,
#' a list of items, one for each modeling task failing validation.
#' The structure depends on the reason the check failed.
Expand All @@ -31,24 +32,25 @@
#' See [hubverse documentation on samples](https://hubverse.io/en/latest/user-guide/sample-output-type.html)
#' for more details.
#' @export
check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path) {
check_tbl_spl_compound_taskid_set <- function(tbl, round_id, file_path, hub_path,
derived_task_ids = NULL) {
config_tasks <- hubUtils::read_config(hub_path, "tasks")

if (isFALSE(has_spls_tbl(tbl)) || isFALSE(hubUtils::is_v3_config(config_tasks))) {
return(skip_v3_spl_check(file_path))
}

compound_taskid_set <- get_tbl_compound_taskid_set(tbl, config_tasks, round_id,
compact = FALSE, error = FALSE
compound_taskid_set <- get_tbl_compound_taskid_set(
tbl, config_tasks, round_id,
compact = FALSE, error = FALSE,
derived_task_ids = NULL
)

check <- purrr::map_lgl(
compound_taskid_set,
~ is.null(attr(.x, "errors"))
) |> all()



capture_check_cnd(
check = check,
file_path = file_path,
Expand Down
8 changes: 6 additions & 2 deletions R/check_tbl_spl_compound_tid.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#' @param tbl a tibble/data.frame of the contents of the file being validated. Column types must **all be character**.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_colnames return
#' @inheritParams expand_model_out_grid
#' @param compound_taskid_set a list of `compound_taskid_set`s (characters vector of compound task IDs),
#' one for each modeling task. Used to override the compound task ID set in the config file,
#' for example, when validating coarser samples.
Expand All @@ -17,7 +18,8 @@
#' for more details.
#' @export
check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path,
compound_taskid_set = NULL) {
compound_taskid_set = NULL,
derived_task_ids = NULL) {
if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) {
cli::cli_abort("Valid {.var compound_taskid_set} must be provided.")
}
Expand All @@ -33,7 +35,9 @@ check_tbl_spl_compound_tid <- function(tbl, round_id, file_path, hub_path,
return(skip_v3_spl_check(file_path))
}

hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set)
hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set,
derived_task_ids = derived_task_ids
)
n_tbl <- hash_tbl[hash_tbl$n_compound_idx > 1L, ]

check <- nrow(n_tbl) == 0L
Expand Down
7 changes: 5 additions & 2 deletions R/check_tbl_spl_n.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
#' for more details.
#' @export
check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path,
compound_taskid_set = NULL) {
compound_taskid_set = NULL,
derived_task_ids = NULL) {
if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) {
cli::cli_abort("Valid {.var compound_taskid_set} must be provided.")
}
Expand All @@ -32,7 +33,9 @@ check_tbl_spl_n <- function(tbl, round_id, file_path, hub_path,
return(skip_v3_spl_check(file_path))
}

hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set)
hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set,
derived_task_ids = derived_task_ids
)
n_ranges <- get_round_spl_n_ranges(config_tasks, round_id)

n_tbl <- dplyr::group_by(hash_tbl, .data$compound_idx) %>%
Expand Down
6 changes: 4 additions & 2 deletions R/check_tbl_spl_non_compound_tid.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
#' for more details.
#' @export
check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path,
compound_taskid_set = NULL) {
compound_taskid_set = NULL,
derived_task_ids = NULL) {
if (!is.null(compound_taskid_set) && isTRUE(is.na(compound_taskid_set))) {
cli::cli_abort("Valid {.var compound_taskid_set} must be provided.")
}
Expand All @@ -33,7 +34,8 @@ check_tbl_spl_non_compound_tid <- function(tbl, round_id, file_path, hub_path,
return(skip_v3_spl_check(file_path))
}

hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set)
hash_tbl <- spl_hash_tbl(tbl, round_id, config_tasks, compound_taskid_set,
derived_task_ids = derived_task_ids)

n_tbl <- dplyr::summarise(
hash_tbl,
Expand Down
79 changes: 34 additions & 45 deletions R/check_tbl_value_col.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,54 +5,34 @@
#' type of the appropriate model task.
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @inheritParams expand_model_out_grid
#' @export
check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) {
check_tbl_value_col <- function(tbl, round_id, file_path, hub_path,
derived_task_ids = NULL) {
config_tasks <- hubUtils::read_config(hub_path, "tasks")

tbl[, names(tbl) != "value"] <- hubData::coerce_to_character(
tbl[, names(tbl) != "value"]
)
if (!is.null(derived_task_ids)) {
tbl[, derived_task_ids] <- NA_character_
}

full <- expand_model_out_grid(
config_tasks,
round_id = round_id,
required_vals_only = FALSE,
all_character = TRUE,
as_arrow_table = FALSE,
bind_model_tasks = FALSE
)

join_cols <- names(tbl)[names(tbl) != "value"] # nolint: object_usage_linter
tbl <- purrr::map(
full,
~ dplyr::inner_join(.x, tbl, by = join_cols)
)

round_config <- get_file_round_config(file_path, hub_path)
output_type_config <- round_config[["model_tasks"]] %>%
purrr::map(~ .x[["output_type"]])


details <- purrr::map2(
tbl, output_type_config,
check_modeling_task_value_col
) %>%
details <- split(tbl, f = tbl$output_type) %>%
purrr::imap(
\(.x, .y) {
check_value_col_by_output_type(
tbl = .x, output_type = .y,
config_tasks = config_tasks,
round_id = round_id,
derived_task_ids = derived_task_ids
)
}
) %>%
unlist(use.names = TRUE)

check <- is.null(details)

## Example code for attempting bullets of details. Needs more experimentation
## but parking for now.
# if (!check) {
# details_bullets_div <- function(details) {
# cli::cli_div()
# cli::format_bullets_raw(
# stats::setNames(details, rep("*", length(details)))
# )
# }
# details <- details_bullets_div(details)
# }

capture_check_cnd(
check = check,
file_path = file_path,
Expand All @@ -63,19 +43,28 @@ check_tbl_value_col <- function(tbl, round_id, file_path, hub_path) {
)
}


check_modeling_task_value_col <- function(tbl, output_type_config) {
purrr::imap(
split(tbl, tbl[["output_type"]]),
~ compare_values_to_config(
tbl = .x, output_type = .y,
output_type_config
)
check_value_col_by_output_type <- function(tbl, output_type,
config_tasks, round_id,
derived_task_ids = NULL) {
purrr::map2(
.x = match_tbl_to_model_task(tbl, config_tasks,
round_id, output_type,
derived_task_ids = derived_task_ids
),
.y = get_round_output_types(config_tasks, round_id),
\(.x, .y) {
compare_values_to_config(
tbl = .x, output_type_config = .y, output_type = output_type
)
}
) %>%
unlist(use.names = TRUE)
}

compare_values_to_config <- function(tbl, output_type, output_type_config) {
if (any(is.null(tbl), is.null(output_type_config))) {
return(NULL)
}
details <- NULL
values <- tbl$value
config <- output_type_config[[output_type]][["value"]]
Expand Down
Loading

0 comments on commit ca8e22b

Please sign in to comment.