From 20d24667277847219426f453a830500f528a45dd Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:44:30 +0000 Subject: [PATCH 01/11] expand keep options --- R/build.R | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/R/build.R b/R/build.R index 2a15a7e..877c822 100644 --- a/R/build.R +++ b/R/build.R @@ -42,6 +42,8 @@ #' build_from_derived(spec, ds_list, predecessor_only = FALSE) build_from_derived <- function(metacore, ds_list, dataset_name = NULL, predecessor_only = TRUE, keep = FALSE) { + browser() + keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "UNUSED")) metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% mutate(derivation = trimws(derivation)) @@ -60,6 +62,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, vars_to_pull_through <- derirvations %>% filter(str_detect(derivation, "^\\w*\\.[a-zA-Z0-9]*$")) + # To lower so it is flexible about how people name their ds list vars_w_ds <- vars_to_pull_through %>% mutate(ds = str_extract(derivation, "^\\w*(?=\\.)") %>% @@ -144,14 +147,33 @@ get_variables <- function(x, ds_list, keep) { ds_name <- unique(x$ds) data <- ds_list[[ds_name]] rename_vec <- set_names(x$col_name, x$variable) - if (keep) { + if (keep == "TRUE") { + # Don't drop predecessor columns out <- data %>% select(x$col_name) %>% mutate(across(all_of(rename_vec))) - } else { + } else if (keep == "FALSE") { + # Drop predecessor columns out <- data %>% select(x$col_name) %>% rename(all_of(rename_vec)) + } else if (keep == "ALL") { + # Keep all cols from original datasets + out <- data %>% + mutate(across(all_of(rename_vec))) + } else if (keep == "PREREQUISITE") { + # Keep all columns required for future derivations + vars_to_pull_through <- derirvations %>% + filter(str_detect(derivation, "^\\w*\\.[a-zA-Z0-9]*$")) + } + out +} + +prepare_join <- function(x) { + if (length(x) < 2){ + out <- x + } else { + } out } From 00d9b4405c614deedd0a5c63bdca76542eaea670 Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:28:08 +0000 Subject: [PATCH 02/11] add prepare_join function --- R/build.R | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/R/build.R b/R/build.R index 877c822..e96d156 100644 --- a/R/build.R +++ b/R/build.R @@ -43,7 +43,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, predecessor_only = TRUE, keep = FALSE) { browser() - keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "UNUSED")) + keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% mutate(derivation = trimws(derivation)) @@ -127,11 +127,10 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, group_by(ds) %>% group_split() %>% map(get_variables, ds_list, keep) %>% + prepare_join(join_by) %>% reduce(full_join, by = join_by) } - - #' Internal functions to get variables from a dataset list #' #' This function is used with `build_from_derived` to build a dataset of columns @@ -169,11 +168,19 @@ get_variables <- function(x, ds_list, keep) { out } -prepare_join <- function(x) { - if (length(x) < 2){ - out <- x - } else { +prepare_join <- function(x, keys) { + browser() + out <- list(x[[1]]) + if (length(x) > 1){ + for (i in 2:length(x)){ + # Drop non-key cols present in each previous dataset in order + for (j in 1:(i-1)){ + out[[i]] <- x[[i]] %>% + select(-any_of(keep(names(x[[j]]), + function(col) !(col %in% keys)))) + } + } } out } From 47a3e6f97df8ff77f0652100ab732e061c7e841d Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:28:08 +0000 Subject: [PATCH 03/11] add prepare_join function --- R/build.R | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/R/build.R b/R/build.R index 877c822..e96d156 100644 --- a/R/build.R +++ b/R/build.R @@ -43,7 +43,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, predecessor_only = TRUE, keep = FALSE) { browser() - keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "UNUSED")) + keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% mutate(derivation = trimws(derivation)) @@ -127,11 +127,10 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, group_by(ds) %>% group_split() %>% map(get_variables, ds_list, keep) %>% + prepare_join(join_by) %>% reduce(full_join, by = join_by) } - - #' Internal functions to get variables from a dataset list #' #' This function is used with `build_from_derived` to build a dataset of columns @@ -169,11 +168,19 @@ get_variables <- function(x, ds_list, keep) { out } -prepare_join <- function(x) { - if (length(x) < 2){ - out <- x - } else { +prepare_join <- function(x, keys) { + browser() + out <- list(x[[1]]) + if (length(x) > 1){ + for (i in 2:length(x)){ + # Drop non-key cols present in each previous dataset in order + for (j in 1:(i-1)){ + out[[i]] <- x[[i]] %>% + select(-any_of(keep(names(x[[j]]), + function(col) !(col %in% keys)))) + } + } } out } From be2093e2150d72c8fddf010ee774b7a177778b2a Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:31:02 +0000 Subject: [PATCH 04/11] calculate prerequisite cols --- R/build.R | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/R/build.R b/R/build.R index 6266b6d..e852862 100644 --- a/R/build.R +++ b/R/build.R @@ -42,7 +42,13 @@ #' build_from_derived(spec, ds_list, predecessor_only = FALSE) build_from_derived <- function(metacore, ds_list, dataset_name = NULL, predecessor_only = TRUE, keep = FALSE) { + # Deprecate KEEP = TRUE keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) + if (keep == "TRUE"){ + warning("Setting 'keep' = TRUE has been superseded, and will be + unavailable in future releases. Please consider setting 'keep' + equal to 'ALL' or 'PREREQUISITE'.") + } metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% mutate(derivation = trimws(derivation)) @@ -117,7 +123,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, variable = joining_vals_to_add) %>% unnest(variable) %>% mutate(col_name = variable) - + browser() vars_w_ds %>% mutate(col_name = str_extract(derivation, "(?<=\\.).*")) %>% inner_join(metacore$value_spec, ., by = "derivation_id") %>% @@ -125,7 +131,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, bind_rows(additional_vals) %>% group_by(ds) %>% group_split() %>% - map(get_variables, ds_list, keep) %>% + map(get_variables, ds_list, keep, derirvations) %>% prepare_join(join_by) %>% reduce(full_join, by = join_by) } @@ -141,7 +147,8 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, #' #' @return datasets #' @noRd -get_variables <- function(x, ds_list, keep) { +get_variables <- function(x, ds_list, keep, derivations) { + browser() ds_name <- unique(x$ds) data <- ds_list[[ds_name]] rename_vec <- set_names(x$col_name, x$variable) @@ -161,12 +168,38 @@ get_variables <- function(x, ds_list, keep) { mutate(across(all_of(rename_vec))) } else if (keep == "PREREQUISITE") { # Keep all columns required for future derivations - vars_to_pull_through <- derirvations %>% - filter(str_detect(derivation, "^\\w*\\.[a-zA-Z0-9]*$")) + prereq_vector <- derivations$derivation %>% + str_match_all("([A-Z]+)\\.([A-Z0-9a-z]+)") + + prereq_matrix <- do.call(rbind,prereq_vector) %>% + unique() + + prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3] + + out <- data %>% + mutate(across(all_of(rename_vec))) %>% + select(c(prereq_cols,x$col_name)) } out } +select_required <- function(x, derivations, ds_name) { + + + +} + +#' Internal function to remove duplicated non-key variables prior to join +#' +#' This function is used with `build_from_derived` to drop columns that would +#' cause a conflict on joining datasets, prioritising keeping columns in +#' datasets earlier on in ds_list +#' +#' @param x List of datasets with all columns added +#' @param keys List of key values to join on +#' +#' @return datasets +#' @noRd prepare_join <- function(x, keys) { out <- list(x[[1]]) @@ -174,6 +207,7 @@ prepare_join <- function(x, keys) { for (i in 2:length(x)){ # Drop non-key cols present in each previous dataset in order for (j in 1:(i-1)){ + # WARNING out[[i]] <- x[[i]] %>% select(-any_of(keep(names(x[[j]]), function(col) !(col %in% keys)))) @@ -183,7 +217,6 @@ prepare_join <- function(x, keys) { out } - #' Drop Unspecified Variables #' #' This function drops all unspecified variables. It will throw and error if the From 9292ec0c48b1df121aff573a8a60821d939c798b Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Tue, 5 Nov 2024 00:30:50 +0000 Subject: [PATCH 05/11] update joining logic --- R/build.R | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/R/build.R b/R/build.R index e852862..5c26ff7 100644 --- a/R/build.R +++ b/R/build.R @@ -123,7 +123,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, variable = joining_vals_to_add) %>% unnest(variable) %>% mutate(col_name = variable) - browser() + vars_w_ds %>% mutate(col_name = str_extract(derivation, "(?<=\\.).*")) %>% inner_join(metacore$value_spec, ., by = "derivation_id") %>% @@ -148,7 +148,6 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, #' @return datasets #' @noRd get_variables <- function(x, ds_list, keep, derivations) { - browser() ds_name <- unique(x$ds) data <- ds_list[[ds_name]] rename_vec <- set_names(x$col_name, x$variable) @@ -168,25 +167,22 @@ get_variables <- function(x, ds_list, keep, derivations) { mutate(across(all_of(rename_vec))) } else if (keep == "PREREQUISITE") { # Keep all columns required for future derivations + # Find all "XX.XXXXX" prereq_vector <- derivations$derivation %>% str_match_all("([A-Z]+)\\.([A-Z0-9a-z]+)") + # Bind into matrix + remove dups prereq_matrix <- do.call(rbind,prereq_vector) %>% unique() + # Subset to those present in current dataset prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3] out <- data %>% mutate(across(all_of(rename_vec))) %>% - select(c(prereq_cols,x$col_name)) + select(c(x$col_name, prereq_cols)) } out -} - -select_required <- function(x, derivations, ds_name) { - - - } #' Internal function to remove duplicated non-key variables prior to join @@ -207,10 +203,13 @@ prepare_join <- function(x, keys) { for (i in 2:length(x)){ # Drop non-key cols present in each previous dataset in order for (j in 1:(i-1)){ - # WARNING + non_key <- keep(names(x[[j]]), function(col) !(col %in% keys)) out[[i]] <- x[[i]] %>% - select(-any_of(keep(names(x[[j]]), - function(col) !(col %in% keys)))) + select(-any_of(non_key)) + + if(length(intersect(non_key,colnames(x[[i]]))) > 0){ + print(paste0()) + } } } } From 3d90692fde82af7c02f766ee08cd23c6fdf792cc Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:03:00 +0000 Subject: [PATCH 06/11] add tests --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/build.R | 31 ++++++++++++++----------- tests/testthat/test-build.R | 45 +++++++++++++++++++++++++++++++------ 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 16af8c6..bf4f556 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,7 @@ Description: Uses the metadata information stored in 'metacore' objects to check License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: dplyr, metacore (>= 0.0.4), diff --git a/NAMESPACE b/NAMESPACE index 7a14495..80f1143 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,7 @@ importFrom(stringr,str_c) importFrom(stringr,str_count) importFrom(stringr,str_detect) importFrom(stringr,str_extract) +importFrom(stringr,str_match_all) importFrom(stringr,str_remove) importFrom(stringr,str_remove_all) importFrom(stringr,str_split) diff --git a/R/build.R b/R/build.R index 5c26ff7..be35ecb 100644 --- a/R/build.R +++ b/R/build.R @@ -25,7 +25,7 @@ #' @return dataset #' @export #' @importFrom stringr str_to_lower str_detect str_extract str_to_upper -#' str_split +#' str_split str_match_all #' @importFrom dplyr filter pull mutate group_by group_split inner_join select #' full_join bind_rows #' @importFrom tidyr unnest @@ -45,9 +45,8 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, # Deprecate KEEP = TRUE keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) if (keep == "TRUE"){ - warning("Setting 'keep' = TRUE has been superseded, and will be - unavailable in future releases. Please consider setting 'keep' - equal to 'ALL' or 'PREREQUISITE'.") + warning("Setting 'keep' = TRUE has been superseded, and will be unavailable in future releases. + Please consider setting 'keep' equal to 'ALL' or 'PREREQUISITE'.") } metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% @@ -132,7 +131,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, group_by(ds) %>% group_split() %>% map(get_variables, ds_list, keep, derirvations) %>% - prepare_join(join_by) %>% + prepare_join(join_by, names(ds_list)) %>% reduce(full_join, by = join_by) } @@ -179,8 +178,9 @@ get_variables <- function(x, ds_list, keep, derivations) { prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3] out <- data %>% - mutate(across(all_of(rename_vec))) %>% - select(c(x$col_name, prereq_cols)) + select(c(x$col_name, prereq_cols)) %>% + mutate(across(all_of(rename_vec))) + } out } @@ -196,21 +196,26 @@ get_variables <- function(x, ds_list, keep, derivations) { #' #' @return datasets #' @noRd -prepare_join <- function(x, keys) { +prepare_join <- function(x, keys, ds_names) { out <- list(x[[1]]) if (length(x) > 1){ for (i in 2:length(x)){ # Drop non-key cols present in each previous dataset in order + drop_cols <- c() + for (j in 1:(i-1)){ - non_key <- keep(names(x[[j]]), function(col) !(col %in% keys)) - out[[i]] <- x[[i]] %>% - select(-any_of(non_key)) + conflicting_cols <- keep(names(x[[j]]), function(col) !(col %in% keys)) %>% + intersect(colnames(x[[i]])) + drop_cols <- c(drop_cols, conflicting_cols) - if(length(intersect(non_key,colnames(x[[i]]))) > 0){ - print(paste0()) + if(length(conflicting_cols) > 0){ + message(paste0("Dropping column(s) from ", ds_names[[i]]," due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) } } + + out[[i]] <- x[[i]] %>% + select(-any_of(drop_cols)) } } out diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R index 44aebf8..54ba753 100644 --- a/tests/testthat/test-build.R +++ b/tests/testthat/test-build.R @@ -46,13 +46,16 @@ test_that("build_from_derived", { unique() %>% c(., "TRT01P") %>% sort() - build_from_derived(spec, ds_list, - predecessor_only = FALSE, - keep = TRUE - ) %>% - names() %>% - sort() %>% - expect_equal(man_vars) + + expect_warning( + build_from_derived(spec, ds_list, + predecessor_only = FALSE, + keep = TRUE + ) %>% + names() %>% + sort() %>% + expect_equal(man_vars) + ) # Pulling through from more than one dataset spec2 <- metacore %>% select_dataset("ADAE") @@ -108,6 +111,34 @@ test_that("build_from_derived", { keep = FALSE )) + # Pulling through all columns from original dataset + adae_full <- build_from_derived(spec2, + ds_list = list("AE" = safetyData::sdtm_ae, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "ALL" + ) + + full_adsl_part <- safetyData::adam_adsl %>% + mutate(TRTA = TRT01A, TRTAN = TRT01AN) + + adae_all_man <- full_join(full_adsl_part, safetyData::sdtm_ae, by = c("STUDYID", "USUBJID"), multiple = "all") + + expect_equal(adae_full,adae_all_man) + + # Pulling through columns required for future derivations + adae_prereq <- build_from_derived(spec2, + ds_list = list("AE" = safetyData::sdtm_ae, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "PREREQUISITE" + ) + + adae_prereq_man <- adae_all_man %>% + select(c(names(adae_auto), TRT01A, TRT01AN, AEENDTC, AESTDTC)) %>% + select(all_of(names(adae_prereq)), everything()) + + expect_equal(adae_prereq, adae_prereq_man) }) From d64488082eda3948ccb71d7dcd4b16d6e101b366 Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:47:45 +0000 Subject: [PATCH 07/11] fixed prereq selection --- R/build.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/R/build.R b/R/build.R index be35ecb..cfc3f12 100644 --- a/R/build.R +++ b/R/build.R @@ -159,7 +159,8 @@ get_variables <- function(x, ds_list, keep, derivations) { # Drop predecessor columns out <- data %>% select(x$col_name) %>% - rename(all_of(rename_vec)) + mutate(across(all_of(rename_vec))) %>% + select(x$variable) } else if (keep == "ALL") { # Keep all cols from original datasets out <- data %>% @@ -167,8 +168,11 @@ get_variables <- function(x, ds_list, keep, derivations) { } else if (keep == "PREREQUISITE") { # Keep all columns required for future derivations # Find all "XX.XXXXX" - prereq_vector <- derivations$derivation %>% - str_match_all("([A-Z]+)\\.([A-Z0-9a-z]+)") + future_derivations <- derivations %>% + select(derivation) %>% + filter(!str_detect(derivation,"^[A-Z]+\\.[A-Z0-9a-z]+$")) + + prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z]+)\\.([A-Z0-9a-z]+)") # Bind into matrix + remove dups prereq_matrix <- do.call(rbind,prereq_vector) %>% @@ -179,8 +183,8 @@ get_variables <- function(x, ds_list, keep, derivations) { out <- data %>% select(c(x$col_name, prereq_cols)) %>% - mutate(across(all_of(rename_vec))) - + mutate(across(all_of(rename_vec))) %>% + select(c(x$variable, prereq_cols)) } out } From 51a39a322af16ed3d1bf173f728353d908bb1491 Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:47:53 +0000 Subject: [PATCH 08/11] added tests --- tests/testthat/test-build.R | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R index 54ba753..eda8078 100644 --- a/tests/testthat/test-build.R +++ b/tests/testthat/test-build.R @@ -28,7 +28,7 @@ test_that("build_from_derived", { pull(derivation) %>% str_remove("^DM\\.") %>% unique() %>% - ifelse(. == "ARM", "TRT01P", .) %>% + c("TRT01P") %>% sort() build_from_derived(spec, ds_list, predecessor_only = FALSE, @@ -127,18 +127,34 @@ test_that("build_from_derived", { expect_equal(adae_full,adae_all_man) # Pulling through columns required for future derivations - adae_prereq <- build_from_derived(spec2, - ds_list = list("AE" = safetyData::sdtm_ae, + spec3 <- metacore %>% select_dataset("ADVS") + + advs_prereq <- build_from_derived(spec3, + ds_list = list("VS" = safetyData::sdtm_vs, "ADSL" = safetyData::adam_adsl), predecessor_only = FALSE, keep = "PREREQUISITE" ) - adae_prereq_man <- adae_all_man %>% - select(c(names(adae_auto), TRT01A, TRT01AN, AEENDTC, AESTDTC)) %>% - select(all_of(names(adae_prereq)), everything()) + advs_auto <- build_from_derived(spec3, + ds_list = list("VS" = safetyData::sdtm_vs, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "PREREQUISITE" + ) + + + advs_all <- build_from_derived(spec3, + ds_list = list("VS" = safetyData::sdtm_vs, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "ALL" + ) + + advs_prereq_man <- advs_all %>% + select(c(names(advs_auto), VSDTC, VSSTRESN)) - expect_equal(adae_prereq, adae_prereq_man) + expect_equal(advs_prereq, advs_prereq_man) }) From 592533efad12fda6a6ef66531409fb37a24a496f Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Wed, 27 Nov 2024 17:13:45 +0000 Subject: [PATCH 09/11] added documentation --- R/build.R | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/R/build.R b/R/build.R index cfc3f12..de9b4c9 100644 --- a/R/build.R +++ b/R/build.R @@ -16,11 +16,17 @@ #' @param predecessor_only By default `TRUE`, so only variables with the origin #' of 'Predecessor' will be used. If `FALSE` any derivation matching the #' dataset.variable will be used. -#' @param keep Boolean to determine if the original columns should be kept. By -#' default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting -#' dataset will have all the ADaM columns as well as any SDTM column that were -#' renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting -#' dataset) +#' @param keep String to determine which columns from the original datasets +#' should be kept +#' - "FALSE" (default): only columns that are also present in the ADaM +#' specification are kept in the output. +#' - "ALL": all original columns are carried through to the +#' ADaM, including those that have been renamed. +#' - "PREREQUISITE": columns are kept if they are required for future +#' derivations in the specification. For example, if +#' a derivation references VSSTDTC despite this not +#' being present in the ADaM specification, the column +#' will be kept. #' #' @return dataset #' @export @@ -40,6 +46,22 @@ #' spec <- metacore %>% select_dataset("ADSL") #' ds_list <- list(DM = read_xpt(metatools_example("dm.xpt"))) #' build_from_derived(spec, ds_list, predecessor_only = FALSE) +#' +#' # Building an ADaM (ADVS) from multiple input datasets, keeping columns +#' # needed for future transformations +#' library(metacore) +#' library(haven) +#' library(magrittr) +#' library(safetyData) +#' load(metacore_example("pilot_ADaM.rda")) +#' spec <- metacore %>% select_dataset("ADVS") +#' ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl) +#' build_from_derived(spec, +#' ds_list, +#' predecessor_only = FALSE, +#' keep = "PREREQUISITE" +#' ) + build_from_derived <- function(metacore, ds_list, dataset_name = NULL, predecessor_only = TRUE, keep = FALSE) { # Deprecate KEEP = TRUE @@ -182,9 +204,9 @@ get_variables <- function(x, ds_list, keep, derivations) { prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3] out <- data %>% - select(c(x$col_name, prereq_cols)) %>% + select(c(x$col_name, all_of(prereq_cols))) %>% mutate(across(all_of(rename_vec))) %>% - select(c(x$variable, prereq_cols)) + select(c(x$variable, all_of(prereq_cols))) } out } From 39b26e048d20354e092bba2dd8cc63ca6eaef9bb Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:36:40 +0000 Subject: [PATCH 10/11] replace warnings with cli alerts + documentation --- DESCRIPTION | 3 ++- NAMESPACE | 2 ++ R/build.R | 42 ++++++++++++++++++++++++++---------- man/build_from_derived.Rd | 43 ++++++++++++++++++++++++++++++++----- tests/testthat/test-build.R | 7 ++++-- 5 files changed, 78 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf4f556..80d0f37 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,8 @@ Imports: stringr, tidyr, tibble, - magrittr + magrittr, + cli Suggests: testthat (>= 3.0.0), haven, diff --git a/NAMESPACE b/NAMESPACE index 80f1143..86e1e05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,8 @@ export(order_cols) export(remove_labels) export(set_variable_labels) export(sort_by_key) +importFrom(cli,cli_alert_info) +importFrom(cli,cli_alert_warning) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,add_count) diff --git a/R/build.R b/R/build.R index de9b4c9..a5838ee 100644 --- a/R/build.R +++ b/R/build.R @@ -22,11 +22,24 @@ #' specification are kept in the output. #' - "ALL": all original columns are carried through to the #' ADaM, including those that have been renamed. -#' - "PREREQUISITE": columns are kept if they are required for future -#' derivations in the specification. For example, if -#' a derivation references VSSTDTC despite this not -#' being present in the ADaM specification, the column -#' will be kept. +#' e.g. if DM.ARM is a predecessor to DM.TRT01P, +#' both ARM and TRT01P will be present as columns +#' in the ADaM output. +#' - "PREREQUISITE": columns are retained if they are required for future +#' derivations in the specification. Additional +#' prerequisite columns are identified as columns +#' that appear in the 'derivation' column of the +#' metacore object, but not as direct predecessors. +#' Predecessors are defined as columns where the +#' derivation is a 1:1 copy of a column in a source +#' dataset. +#' +#' e.g. derivation = "VS.VSTESTCD" is a predecessor, +#' while derivation = "Value of VS.VSSTRESN where +#' VS.VSTESTCD == 'Heart Rate'" contains both +#' VS.VSTESTCD and VS.VSSTRESN as prerequisites, and +#' these columns will be kept through to the ADaM. +#' #' #' @return dataset #' @export @@ -37,6 +50,7 @@ #' @importFrom tidyr unnest #' @importFrom purrr map reduce #' @importFrom tibble tibble +#' @importFrom cli cli_alert_warning cli_alert_info #' #' @examples #' library(metacore) @@ -67,8 +81,9 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, # Deprecate KEEP = TRUE keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) if (keep == "TRUE"){ - warning("Setting 'keep' = TRUE has been superseded, and will be unavailable in future releases. - Please consider setting 'keep' equal to 'ALL' or 'PREREQUISITE'.") + cli_alert_warning(paste0("Setting 'keep' = TRUE has been superseded", + ", and will be unavailable in future releases. Please consider setting ", + "'keep' equal to 'ALL' or 'PREREQUISITE'.")) } metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% @@ -192,9 +207,9 @@ get_variables <- function(x, ds_list, keep, derivations) { # Find all "XX.XXXXX" future_derivations <- derivations %>% select(derivation) %>% - filter(!str_detect(derivation,"^[A-Z]+\\.[A-Z0-9a-z]+$")) + filter(!str_detect(derivation,"^[A-Z0-9a-z]+\\.[A-Z0-9a-z]+$")) - prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z]+)\\.([A-Z0-9a-z]+)") + prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z0-9a-z]+)\\.([A-Z0-9a-z]+)") # Bind into matrix + remove dups prereq_matrix <- do.call(rbind,prereq_vector) %>% @@ -215,7 +230,11 @@ get_variables <- function(x, ds_list, keep, derivations) { #' #' This function is used with `build_from_derived` to drop columns that would #' cause a conflict on joining datasets, prioritising keeping columns in -#' datasets earlier on in ds_list +#' datasets earlier on in ds_list. +#' +#' e.g. if ds_list = ("AE", "ADSL") and there is a conflicting column +#' "STUDYID", the column will be dropped from ADSL (index 2) rather than AE +#' (index 1). #' #' @param x List of datasets with all columns added #' @param keys List of key values to join on @@ -236,7 +255,8 @@ prepare_join <- function(x, keys, ds_names) { drop_cols <- c(drop_cols, conflicting_cols) if(length(conflicting_cols) > 0){ - message(paste0("Dropping column(s) from ", ds_names[[i]]," due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) + cli_alert_info(paste0("Dropping column(s) from ", ds_names[[i]], + " due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) } } diff --git a/man/build_from_derived.Rd b/man/build_from_derived.Rd index 4e5c597..b060c0b 100644 --- a/man/build_from_derived.Rd +++ b/man/build_from_derived.Rd @@ -27,11 +27,29 @@ been subsetted.} of 'Predecessor' will be used. If `FALSE` any derivation matching the dataset.variable will be used.} -\item{keep}{Boolean to determine if the original columns should be kept. By -default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting -dataset will have all the ADaM columns as well as any SDTM column that were -renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting -dataset)} +\item{keep}{String to determine which columns from the original datasets + should be kept + - "FALSE" (default): only columns that are also present in the ADaM + specification are kept in the output. + - "ALL": all original columns are carried through to the + ADaM, including those that have been renamed. + e.g. if DM.ARM is a predecessor to DM.TRT01P, + both ARM and TRT01P will be present as columns + in the ADaM output. + - "PREREQUISITE": columns are retained if they are required for future + derivations in the specification. Additional + prerequisite columns are identified as columns + that appear in the 'derivation' column of the + metacore object, but not as direct predecessors. + Predecessors are defined as columns where the + derivation is a 1:1 copy of a column in a source + dataset. + + e.g. derivation = "VS.VSTESTCD" is a predecessor, + while derivation = "Value of VS.VSSTRESN where + VS.VSTESTCD == 'Heart Rate'" contains both + VS.VSTESTCD and VS.VSSTRESN as prerequisites, and + these columns will be kept through to the ADaM.} } \value{ dataset @@ -52,4 +70,19 @@ load(metacore_example("pilot_ADaM.rda")) spec <- metacore \%>\% select_dataset("ADSL") ds_list <- list(DM = read_xpt(metatools_example("dm.xpt"))) build_from_derived(spec, ds_list, predecessor_only = FALSE) + +# Building an ADaM (ADVS) from multiple input datasets, keeping columns +# needed for future transformations +library(metacore) +library(haven) +library(magrittr) +library(safetyData) +load(metacore_example("pilot_ADaM.rda")) +spec <- metacore \%>\% select_dataset("ADVS") +ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl) +build_from_derived(spec, + ds_list, + predecessor_only = FALSE, + keep = "PREREQUISITE" +) } diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R index eda8078..e819460 100644 --- a/tests/testthat/test-build.R +++ b/tests/testthat/test-build.R @@ -47,14 +47,17 @@ test_that("build_from_derived", { c(., "TRT01P") %>% sort() - expect_warning( + expect_message( build_from_derived(spec, ds_list, predecessor_only = FALSE, keep = TRUE ) %>% names() %>% sort() %>% - expect_equal(man_vars) + expect_equal(man_vars), + label = paste0("! Setting 'keep' = TRUE has been superseded, and will be", + " unavailable in future releases. Please consider setting", + " 'keep' equal to 'ALL' or 'PREREQUISITE'.") ) # Pulling through from more than one dataset From 806d915c13c19606f31f6afb47a6bd1122792dc5 Mon Sep 17 00:00:00 2001 From: Matt Bearham <79635317+MattBearham@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:36:40 +0000 Subject: [PATCH 11/11] replace warnings with cli alerts + documentation --- DESCRIPTION | 3 ++- NAMESPACE | 2 ++ R/build.R | 41 +++++++++++++++++++++++++---------- man/build_from_derived.Rd | 43 ++++++++++++++++++++++++++++++++----- tests/testthat/test-build.R | 7 ++++-- 5 files changed, 77 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf4f556..80d0f37 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,8 @@ Imports: stringr, tidyr, tibble, - magrittr + magrittr, + cli Suggests: testthat (>= 3.0.0), haven, diff --git a/NAMESPACE b/NAMESPACE index 80f1143..86e1e05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,8 @@ export(order_cols) export(remove_labels) export(set_variable_labels) export(sort_by_key) +importFrom(cli,cli_alert_info) +importFrom(cli,cli_alert_warning) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,add_count) diff --git a/R/build.R b/R/build.R index de9b4c9..b870b54 100644 --- a/R/build.R +++ b/R/build.R @@ -22,11 +22,23 @@ #' specification are kept in the output. #' - "ALL": all original columns are carried through to the #' ADaM, including those that have been renamed. -#' - "PREREQUISITE": columns are kept if they are required for future -#' derivations in the specification. For example, if -#' a derivation references VSSTDTC despite this not -#' being present in the ADaM specification, the column -#' will be kept. +#' e.g. if DM.ARM is a predecessor to DM.TRT01P, +#' both ARM and TRT01P will be present as columns +#' in the ADaM output. +#' - "PREREQUISITE": columns are retained if they are required for future +#' derivations in the specification. Additional +#' prerequisite columns are identified as columns +#' that appear in the 'derivation' column of the +#' metacore object in the format "DATASET.VARIABLE", +#' but not as direct predecessors. Predecessors are +#' defined as columns where the derivation is a 1:1 +#' copy of a column in a source dataset. +#' e.g. derivation = "VS.VSTESTCD" is a predecessor, +#' while derivation = "Value of VS.VSSTRESN where +#' VS.VSTESTCD == 'Heart Rate'" contains both +#' VS.VSTESTCD and VS.VSSTRESN as prerequisites, and +#' these columns will be kept through to the ADaM. +#' #' #' @return dataset #' @export @@ -37,6 +49,7 @@ #' @importFrom tidyr unnest #' @importFrom purrr map reduce #' @importFrom tibble tibble +#' @importFrom cli cli_alert_warning cli_alert_info #' #' @examples #' library(metacore) @@ -67,8 +80,9 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, # Deprecate KEEP = TRUE keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) if (keep == "TRUE"){ - warning("Setting 'keep' = TRUE has been superseded, and will be unavailable in future releases. - Please consider setting 'keep' equal to 'ALL' or 'PREREQUISITE'.") + cli_alert_warning(paste0("Setting 'keep' = TRUE has been superseded", + ", and will be unavailable in future releases. Please consider setting ", + "'keep' equal to 'ALL' or 'PREREQUISITE'.")) } metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% @@ -192,9 +206,9 @@ get_variables <- function(x, ds_list, keep, derivations) { # Find all "XX.XXXXX" future_derivations <- derivations %>% select(derivation) %>% - filter(!str_detect(derivation,"^[A-Z]+\\.[A-Z0-9a-z]+$")) + filter(!str_detect(derivation,"^[A-Z0-9a-z]+\\.[A-Z0-9a-z]+$")) - prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z]+)\\.([A-Z0-9a-z]+)") + prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z0-9a-z]+)\\.([A-Z0-9a-z]+)") # Bind into matrix + remove dups prereq_matrix <- do.call(rbind,prereq_vector) %>% @@ -215,7 +229,11 @@ get_variables <- function(x, ds_list, keep, derivations) { #' #' This function is used with `build_from_derived` to drop columns that would #' cause a conflict on joining datasets, prioritising keeping columns in -#' datasets earlier on in ds_list +#' datasets earlier on in ds_list. +#' +#' e.g. if ds_list = ("AE", "ADSL") and there is a conflicting column +#' "STUDYID", the column will be dropped from ADSL (index 2) rather than AE +#' (index 1). #' #' @param x List of datasets with all columns added #' @param keys List of key values to join on @@ -236,7 +254,8 @@ prepare_join <- function(x, keys, ds_names) { drop_cols <- c(drop_cols, conflicting_cols) if(length(conflicting_cols) > 0){ - message(paste0("Dropping column(s) from ", ds_names[[i]]," due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) + cli_alert_info(paste0("Dropping column(s) from ", ds_names[[i]], + " due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) } } diff --git a/man/build_from_derived.Rd b/man/build_from_derived.Rd index 4e5c597..b060c0b 100644 --- a/man/build_from_derived.Rd +++ b/man/build_from_derived.Rd @@ -27,11 +27,29 @@ been subsetted.} of 'Predecessor' will be used. If `FALSE` any derivation matching the dataset.variable will be used.} -\item{keep}{Boolean to determine if the original columns should be kept. By -default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting -dataset will have all the ADaM columns as well as any SDTM column that were -renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting -dataset)} +\item{keep}{String to determine which columns from the original datasets + should be kept + - "FALSE" (default): only columns that are also present in the ADaM + specification are kept in the output. + - "ALL": all original columns are carried through to the + ADaM, including those that have been renamed. + e.g. if DM.ARM is a predecessor to DM.TRT01P, + both ARM and TRT01P will be present as columns + in the ADaM output. + - "PREREQUISITE": columns are retained if they are required for future + derivations in the specification. Additional + prerequisite columns are identified as columns + that appear in the 'derivation' column of the + metacore object, but not as direct predecessors. + Predecessors are defined as columns where the + derivation is a 1:1 copy of a column in a source + dataset. + + e.g. derivation = "VS.VSTESTCD" is a predecessor, + while derivation = "Value of VS.VSSTRESN where + VS.VSTESTCD == 'Heart Rate'" contains both + VS.VSTESTCD and VS.VSSTRESN as prerequisites, and + these columns will be kept through to the ADaM.} } \value{ dataset @@ -52,4 +70,19 @@ load(metacore_example("pilot_ADaM.rda")) spec <- metacore \%>\% select_dataset("ADSL") ds_list <- list(DM = read_xpt(metatools_example("dm.xpt"))) build_from_derived(spec, ds_list, predecessor_only = FALSE) + +# Building an ADaM (ADVS) from multiple input datasets, keeping columns +# needed for future transformations +library(metacore) +library(haven) +library(magrittr) +library(safetyData) +load(metacore_example("pilot_ADaM.rda")) +spec <- metacore \%>\% select_dataset("ADVS") +ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl) +build_from_derived(spec, + ds_list, + predecessor_only = FALSE, + keep = "PREREQUISITE" +) } diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R index eda8078..e819460 100644 --- a/tests/testthat/test-build.R +++ b/tests/testthat/test-build.R @@ -47,14 +47,17 @@ test_that("build_from_derived", { c(., "TRT01P") %>% sort() - expect_warning( + expect_message( build_from_derived(spec, ds_list, predecessor_only = FALSE, keep = TRUE ) %>% names() %>% sort() %>% - expect_equal(man_vars) + expect_equal(man_vars), + label = paste0("! Setting 'keep' = TRUE has been superseded, and will be", + " unavailable in future releases. Please consider setting", + " 'keep' equal to 'ALL' or 'PREREQUISITE'.") ) # Pulling through from more than one dataset