Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for ascending CDF values with numeric output_type_id cast as character #105

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
48 changes: 44 additions & 4 deletions R/check_tbl_value_col_ascending.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,20 @@
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @export
check_tbl_value_col_ascending <- function(tbl, file_path) {
check_tbl_value_col_ascending <- function(tbl, file_path, hub_path, round_id) {

config_tasks <- hubUtils::read_config(hub_path, "tasks")

# Coerce accepted vals to character for easier comparison of
# values. Tried to use arrow tbls for comparisons as more efficient when
# working with larger files but currently arrow does not match NAs as dplyr
# does, returning false positives for mean & median rows which contain NA in
# output type ID column.
accepted_vals <- expand_model_out_grid(
config_tasks = config_tasks,
round_id = round_id,
all_character = TRUE
)
if (all(!c("cdf", "quantile") %in% tbl[["output_type"]])) {
return(
capture_check_info(
Expand All @@ -22,8 +35,10 @@ check_tbl_value_col_ascending <- function(tbl, file_path) {
)
}

output_type_tbl <- split(tbl, tbl[["output_type"]])[c("cdf", "quantile")] %>%
purrr::compact()
# FIX for <https://github.com/hubverse-org/hubValidations/issues/78>
# sort the table by config by merging from config ----------------
tbl_sorted <- order_output_type_ids(tbl, accepted_vals, c("cdf", "quantile"))
output_type_tbl <- split_cdf_quantile(tbl_sorted)

error_tbl <- purrr::map(
output_type_tbl,
Expand Down Expand Up @@ -57,8 +72,8 @@ check_values_ascending <- function(tbl) {
group_cols <- names(tbl)[!names(tbl) %in% hubUtils::std_colnames]
tbl[["value"]] <- as.numeric(tbl[["value"]])

# group by all of the target columns
check_tbl <- dplyr::group_by(tbl, dplyr::across(dplyr::all_of(group_cols))) %>%
dplyr::arrange(.data$output_type_id, .by_group = TRUE) %>%
dplyr::summarise(non_asc = any(diff(.data[["value"]]) < 0))

if (!any(check_tbl$non_asc)) {
Expand All @@ -72,3 +87,28 @@ check_values_ascending <- function(tbl) {
dplyr::ungroup() %>%
dplyr::mutate(.env$output_type)
}

# Split a model-output table by output type, keeping only the "cdf" and
# "quantile" pieces (the two types the ascending-value check applies to).
# Returns a named list with one data frame per type present in `tbl`;
# types absent from `tbl` are dropped rather than returned as NULL.
split_cdf_quantile <- function(tbl) {
  by_type <- split(tbl, tbl[["output_type"]])
  # Indexing by name yields NULL entries for types not present in `tbl`;
  # filter those out so callers only see the types that actually occur.
  wanted <- by_type[c("cdf", "quantile")]
  Filter(Negate(is.null), wanted)
}

# Order a model-output table's rows by the config's output_type_id order.
#
# Builds a lookup of valid (output_type, output_type_id) pairs from the
# config-generated table `config`, restricted to the given `types`, then
# inner-joins `tbl` into that lookup. Because dplyr::inner_join() preserves
# the row order of its left-hand side, the result comes back sorted in the
# config's declared output_type_id order, regardless of how `tbl` was sorted.
#
# `tbl`'s output_type_id column is coerced to character first so numeric IDs
# match the all-character config values.
#
# NOTE: this assumes that the cdf and quantile values in the `tbl` are complete
# (rows with no counterpart in the config are silently dropped by the join).
order_output_type_ids <- function(tbl, config, types = c("cdf", "quantile")) {
  # step 1: lookup table of (output_type, output_type_id) in config order
  keep <- config[["output_type"]] %in% types
  lookup <- config[keep, c("output_type", "output_type_id"), drop = FALSE]
  lookup <- unique(lookup)
  # step 2: join tbl into the lookup; result inherits the lookup's ordering
  tbl[["output_type_id"]] <- as.character(tbl[["output_type_id"]])
  dplyr::inner_join(lookup, tbl, by = c("output_type", "output_type_id"))
}
4 changes: 3 additions & 1 deletion R/validate_model_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,9 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
checks$value_col_non_desc <- try_check(
check_tbl_value_col_ascending(
tbl,
file_path = file_path
file_path = file_path,
hub_path = hub_path,
round_id = round_id
), file_path
)

Expand Down
14 changes: 13 additions & 1 deletion man/check_tbl_value_col_ascending.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions tests/testthat/_snaps/check_tbl_value_col_ascending.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# check_tbl_value_col_ascending works

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_success>
Message:
Expand All @@ -10,7 +10,7 @@
---

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_success>
Message:
Expand All @@ -19,7 +19,7 @@
# check_tbl_value_col_ascending works when output type IDs not ordered

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_success>
Message:
Expand All @@ -28,7 +28,7 @@
# check_tbl_value_col_ascending errors correctly

Code
str(check_tbl_value_col_ascending(tbl, file_path))
str(check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id))
Output
List of 7
$ message : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n value/o"| __truncated__
Expand All @@ -48,7 +48,8 @@
---

Code
str(check_tbl_value_col_ascending(tbl_error, file_path))
str(check_tbl_value_col_ascending(tbl_error, file_path, hub_path, file_meta$
round_id))
Output
List of 7
$ message : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n value/o"| __truncated__
Expand All @@ -68,7 +69,8 @@
---

Code
str(check_tbl_value_col_ascending(rbind(tbl, tbl_error), file_path))
str(check_tbl_value_col_ascending(rbind(tbl, tbl_error), file_path, hub_path,
file_meta$round_id))
Output
List of 7
$ message : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n value/o"| __truncated__
Expand All @@ -88,7 +90,7 @@
# check_tbl_value_col_ascending skips correctly

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_info>
Message:
Expand Down
76 changes: 68 additions & 8 deletions tests/testthat/test-check_tbl_value_col_ascending.R
Original file line number Diff line number Diff line change
@@ -1,60 +1,69 @@
test_that("check_tbl_value_col_ascending works", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)

hub_path <- system.file("testhubs/flusight", package = "hubUtils")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
file_meta <- parse_file_name(file_path)

tbl <- hubValidations::read_model_out_file(file_path, hub_path)

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)
})

test_that("check_tbl_value_col_ascending works when output type IDs not ordered", {
hub_path <- test_path("testdata/hub-unordered/")
tbl <- arrow::read_csv_arrow(
test_path("testdata/files/2024-01-10-ISI-NotOrdered.csv")
fs::path(hub_path, "model-output/2024-01-10-ISI-NotOrdered.csv")
) %>%
hubData::coerce_to_character()
file_path <- "ISI-NotOrdered/2024-01-10-ISI-NotOrdered.csv"
file_meta <- parse_file_name(file_path)
expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)
})

test_that("check_tbl_value_col_ascending errors correctly", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)

tbl$value[c(1, 10)] <- 150

expect_snapshot(
str(check_tbl_value_col_ascending(tbl, file_path))
str(check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id))
)

hub_path <- system.file("testhubs/flusight", package = "hubUtils")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl_error <- tbl
tbl_error$target <- "wk ahead inc covid hosp"
tbl_error$value[1] <- 800

expect_snapshot(
str(
check_tbl_value_col_ascending(tbl_error, file_path)
check_tbl_value_col_ascending(tbl_error, file_path, hub_path, file_meta$round_id)
)
)
expect_snapshot(
str(
check_tbl_value_col_ascending(
rbind(tbl, tbl_error),
file_path
file_path,
hub_path,
file_meta$round_id
)
)
)
Expand All @@ -63,10 +72,61 @@ test_that("check_tbl_value_col_ascending errors correctly", {
test_that("check_tbl_value_col_ascending skips correctly", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- tbl[tbl$output_type == "mean", ]

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)
})


test_that("(#78) check_tbl_value_col_ascending will sort even if the data doesn't naturally sort", {
  # In this situation, I am duplicating the simple testhub and modifying it in
  # one way:
  #
  # I am replacing the `quantile` model task with `cdf` and adding an
  # increasing offset so that we can get unsortable numbers.
  #
  # `x + 1:23` recycles over the 23 quantile levels, producing numeric IDs
  # whose character sort order differs from their numeric order (e.g. "10.1"
  # sorts before "2.5" as a string); suppressWarnings() silences the
  # partial-recycling warning when length(x) is not a multiple of 23.
  make_unsortable <- function(x) suppressWarnings(x + 1:23)

  # Duplicating the simple test hub ---------------------------------------
  hub_path <- withr::local_tempdir()
  fs::dir_copy(system.file("testhubs/simple", package = "hubValidations"),
    hub_path,
    overwrite = TRUE
  )

  # Creating the CDF output -----------------------------------------------
  # FIX: the new output type must be named "cdf", not "cfg" -- the ascending
  # check only applies to the "cdf" and "quantile" output types, so a "cfg"
  # type would be ignored and the test would pass vacuously.
  cfg <- attr(hubData::connect_hub(hub_path), "config_tasks")
  outputs <- cfg$rounds[[1]]$model_tasks[[1]]$output_type
  outputs$cdf <- outputs$quantile
  outputs$quantile <- NULL
  otid <- outputs$cdf$output_type_id$required
  outputs$cdf$output_type_id$required <- make_unsortable(otid)
  cfg$rounds[[1]]$model_tasks[[1]]$output_type <- outputs
  jsonlite::toJSON(cfg) %>%
    jsonlite::prettify() %>%
    writeLines(fs::path(hub_path, "hub-config", "tasks.json"))

  # Updating the data to match the config --------------------------------
  file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
  file_meta <- parse_file_name(file_path)
  tbl <- hubValidations::read_model_out_file(file_path, hub_path)
  # FIX: relabel the quantile rows as cdf so the model output matches the
  # modified config; otherwise the config/data join inside the check drops
  # every row and the sorting logic is never exercised.
  tbl$output_type[tbl$output_type == "quantile"] <- "cdf"
  tbl$output_type_id <- make_unsortable(tbl$output_type_id)

  # validating when it is sorted -----------------------------------------
  res <- check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
  expect_s3_class(res, "check_success")
  expect_null(res$error_tbl)

  # validating when it is unsorted ---------------------------------------
  # A random row permutation must still validate: the check re-orders rows
  # from the config before testing that values are non-decreasing.
  res_unordered <- check_tbl_value_col_ascending(
    tbl[sample(nrow(tbl)), ],
    file_path,
    hub_path,
    file_meta$round_id
  )
  expect_s3_class(res_unordered, "check_success")
  expect_null(res_unordered$error_tbl)
})
Loading
Loading