Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for ascending CDF values with numeric output_type_id cast as character #105

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
48 changes: 44 additions & 4 deletions R/check_tbl_value_col_ascending.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,20 @@
#' @inherit check_tbl_colnames params
#' @inherit check_tbl_col_types return
#' @export
check_tbl_value_col_ascending <- function(tbl, file_path) {
check_tbl_value_col_ascending <- function(tbl, file_path, hub_path, round_id) {

config_tasks <- hubUtils::read_config(hub_path, "tasks")

# Coerce accepted vals to character for easier comparison of
# values. Tried to use arrow tbls for comparisons as more efficient when
# working with larger files but currently arrow does not match NAs as dplyr
# does, returning false positives for mean & median rows which contain NA in
# output type ID column.
accepted_vals <- expand_model_out_grid(
config_tasks = config_tasks,
round_id = round_id,
all_character = TRUE
)
if (all(!c("cdf", "quantile") %in% tbl[["output_type"]])) {
return(
capture_check_info(
Expand All @@ -22,8 +35,10 @@ check_tbl_value_col_ascending <- function(tbl, file_path) {
)
}

output_type_tbl <- split(tbl, tbl[["output_type"]])[c("cdf", "quantile")] %>%
purrr::compact()
# FIX for <https://github.com/hubverse-org/hubValidations/issues/78>
# sort the table by config by merging from config ----------------
tbl_sorted <- order_output_type_ids(tbl, accepted_vals, c("cdf", "quantile"))
output_type_tbl <- split_cdf_quantile(tbl_sorted)

error_tbl <- purrr::map(
output_type_tbl,
Expand Down Expand Up @@ -57,8 +72,8 @@ check_values_ascending <- function(tbl) {
group_cols <- names(tbl)[!names(tbl) %in% hubUtils::std_colnames]
tbl[["value"]] <- as.numeric(tbl[["value"]])

# group by all of the target columns
check_tbl <- dplyr::group_by(tbl, dplyr::across(dplyr::all_of(group_cols))) %>%
dplyr::arrange(.data$output_type_id, .by_group = TRUE) %>%
dplyr::summarise(non_asc = any(diff(.data[["value"]]) < 0))

if (!any(check_tbl$non_asc)) {
Expand All @@ -72,3 +87,28 @@ check_values_ascending <- function(tbl) {
dplyr::ungroup() %>%
dplyr::mutate(.env$output_type)
}

# Split a model-output table by output type, keeping only the "cdf" and
# "quantile" pieces (the two types the ascending-value check applies to).
# Returns a named list with one data frame per type present in `tbl`;
# types absent from `tbl` are dropped rather than returned as NULL.
split_cdf_quantile <- function(tbl) {
  by_type <- split(tbl, tbl[["output_type"]])
  # Indexing by name yields NULL entries for types not present in `tbl`;
  # filter those out so callers only see the types that actually occur.
  wanted <- by_type[c("cdf", "quantile")]
  Filter(Negate(is.null), wanted)
}

# Order a model-output table's rows by the config's output_type_id order.
#
# Builds a lookup of valid (output_type, output_type_id) pairs from the
# config-generated table `config`, restricted to the given `types`, then
# inner-joins `tbl` into that lookup. Because dplyr::inner_join() preserves
# the row order of its left-hand side, the result comes back sorted in the
# config's declared output_type_id order, regardless of how `tbl` was sorted.
#
# `tbl`'s output_type_id column is coerced to character first so numeric IDs
# match the all-character config values.
#
# NOTE: this assumes that the cdf and quantile values in the `tbl` are complete
# (rows with no counterpart in the config are silently dropped by the join).
order_output_type_ids <- function(tbl, config, types = c("cdf", "quantile")) {
  # step 1: lookup table of (output_type, output_type_id) in config order
  keep <- config[["output_type"]] %in% types
  lookup <- config[keep, c("output_type", "output_type_id"), drop = FALSE]
  lookup <- unique(lookup)
  # step 2: join tbl into the lookup; result inherits the lookup's ordering
  tbl[["output_type_id"]] <- as.character(tbl[["output_type_id"]])
  dplyr::inner_join(lookup, tbl, by = c("output_type", "output_type_id"))
}
4 changes: 3 additions & 1 deletion R/validate_model_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,9 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
checks$value_col_non_desc <- try_check(
check_tbl_value_col_ascending(
tbl,
file_path = file_path
file_path = file_path,
hub_path = hub_path,
round_id = round_id
), file_path
)

Expand Down
14 changes: 13 additions & 1 deletion man/check_tbl_value_col_ascending.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions tests/testthat/_snaps/check_tbl_value_col_ascending.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# check_tbl_value_col_ascending works

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_success>
Message:
Expand All @@ -10,7 +10,7 @@
---

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_success>
Message:
Expand All @@ -19,7 +19,7 @@
# check_tbl_value_col_ascending works when output type IDs not ordered

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_success>
Message:
Expand All @@ -28,7 +28,7 @@
# check_tbl_value_col_ascending errors correctly

Code
str(check_tbl_value_col_ascending(tbl, file_path))
str(check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id))
Output
List of 7
$ message : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n value/o"| __truncated__
Expand All @@ -48,7 +48,8 @@
---

Code
str(check_tbl_value_col_ascending(tbl_error, file_path))
str(check_tbl_value_col_ascending(tbl_error, file_path, hub_path, file_meta$
round_id))
Output
List of 7
$ message : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n value/o"| __truncated__
Expand All @@ -68,7 +69,8 @@
---

Code
str(check_tbl_value_col_ascending(rbind(tbl, tbl_error), file_path))
str(check_tbl_value_col_ascending(rbind(tbl, tbl_error), file_path, hub_path,
file_meta$round_id))
Output
List of 7
$ message : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n value/o"| __truncated__
Expand All @@ -88,7 +90,7 @@
# check_tbl_value_col_ascending skips correctly

Code
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
Output
<message/check_info>
Message:
Expand Down
76 changes: 68 additions & 8 deletions tests/testthat/test-check_tbl_value_col_ascending.R
Original file line number Diff line number Diff line change
@@ -1,60 +1,69 @@
test_that("check_tbl_value_col_ascending works", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)

hub_path <- system.file("testhubs/flusight", package = "hubUtils")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
file_meta <- parse_file_name(file_path)

tbl <- hubValidations::read_model_out_file(file_path, hub_path)

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)
})

test_that("check_tbl_value_col_ascending works when output type IDs not ordered", {
hub_path <- test_path("testdata/hub-unordered/")
tbl <- arrow::read_csv_arrow(
test_path("testdata/files/2024-01-10-ISI-NotOrdered.csv")
fs::path(hub_path, "model-output/2024-01-10-ISI-NotOrdered.csv")
) %>%
hubData::coerce_to_character()
file_path <- "ISI-NotOrdered/2024-01-10-ISI-NotOrdered.csv"
file_meta <- parse_file_name(file_path)
expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)
})

test_that("check_tbl_value_col_ascending errors correctly", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)

tbl$value[c(1, 10)] <- 150

expect_snapshot(
str(check_tbl_value_col_ascending(tbl, file_path))
str(check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id))
)

hub_path <- system.file("testhubs/flusight", package = "hubUtils")
file_path <- "hub-ensemble/2023-05-08-hub-ensemble.parquet"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl_error <- tbl
tbl_error$target <- "wk ahead inc covid hosp"
tbl_error$value[1] <- 800

expect_snapshot(
str(
check_tbl_value_col_ascending(tbl_error, file_path)
check_tbl_value_col_ascending(tbl_error, file_path, hub_path, file_meta$round_id)
)
)
expect_snapshot(
str(
check_tbl_value_col_ascending(
rbind(tbl, tbl_error),
file_path
file_path,
hub_path,
file_meta$round_id
)
)
)
Expand All @@ -63,10 +72,61 @@ test_that("check_tbl_value_col_ascending errors correctly", {
test_that("check_tbl_value_col_ascending skips correctly", {
hub_path <- system.file("testhubs/simple", package = "hubValidations")
file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
file_meta <- parse_file_name(file_path)
tbl <- hubValidations::read_model_out_file(file_path, hub_path)
tbl <- tbl[tbl$output_type == "mean", ]

expect_snapshot(
check_tbl_value_col_ascending(tbl, file_path)
check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
)
})


test_that("(#78) check_tbl_value_col_ascending will sort even if the data doesn't naturally sort", {
  # In this situation, I am duplicating the simple testhub and modifying it in
  # one way:
  #
  # I am replacing the `quantile` model task with `cdf` and adding an
  # increasing offset so that we can get unsortable numbers.
  #
  # `x + 1:23` recycles over the 23 quantile levels, producing numeric IDs
  # whose character sort order differs from their numeric order (e.g. "10.1"
  # sorts before "2.5" as a string); suppressWarnings() silences the
  # partial-recycling warning when length(x) is not a multiple of 23.
  make_unsortable <- function(x) suppressWarnings(x + 1:23)

  # Duplicating the simple test hub ---------------------------------------
  hub_path <- withr::local_tempdir()
  fs::dir_copy(system.file("testhubs/simple", package = "hubValidations"),
    hub_path,
    overwrite = TRUE
  )

  # Creating the CDF output -----------------------------------------------
  # FIX: the new output type must be named "cdf", not "cfg" -- the ascending
  # check only applies to the "cdf" and "quantile" output types, so a "cfg"
  # type would be ignored and the test would pass vacuously.
  cfg <- attr(hubData::connect_hub(hub_path), "config_tasks")
  outputs <- cfg$rounds[[1]]$model_tasks[[1]]$output_type
  outputs$cdf <- outputs$quantile
  outputs$quantile <- NULL
  otid <- outputs$cdf$output_type_id$required
  outputs$cdf$output_type_id$required <- make_unsortable(otid)
  cfg$rounds[[1]]$model_tasks[[1]]$output_type <- outputs
  jsonlite::toJSON(cfg) %>%
    jsonlite::prettify() %>%
    writeLines(fs::path(hub_path, "hub-config", "tasks.json"))

  # Updating the data to match the config --------------------------------
  file_path <- "team1-goodmodel/2022-10-08-team1-goodmodel.csv"
  file_meta <- parse_file_name(file_path)
  tbl <- hubValidations::read_model_out_file(file_path, hub_path)
  # FIX: relabel the quantile rows as cdf so the model output matches the
  # modified config; otherwise the config/data join inside the check drops
  # every row and the sorting logic is never exercised.
  tbl$output_type[tbl$output_type == "quantile"] <- "cdf"
  tbl$output_type_id <- make_unsortable(tbl$output_type_id)

  # validating when it is sorted -----------------------------------------
  res <- check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
  expect_s3_class(res, "check_success")
  expect_null(res$error_tbl)

  # validating when it is unsorted ---------------------------------------
  # A random row permutation must still validate: the check re-orders rows
  # from the config before testing that values are non-decreasing.
  res_unordered <- check_tbl_value_col_ascending(
    tbl[sample(nrow(tbl)), ],
    file_path,
    hub_path,
    file_meta$round_id
  )
  expect_s3_class(res_unordered, "check_success")
  expect_null(res_unordered$error_tbl)
})
Loading
Loading