diff --git a/DESCRIPTION b/DESCRIPTION index 16af8c6..80d0f37 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,7 @@ Description: Uses the metadata information stored in 'metacore' objects to check License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: dplyr, metacore (>= 0.0.4), @@ -38,7 +38,8 @@ Imports: stringr, tidyr, tibble, - magrittr + magrittr, + cli Suggests: testthat (>= 3.0.0), haven, diff --git a/NAMESPACE b/NAMESPACE index 7a14495..86e1e05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,8 @@ export(order_cols) export(remove_labels) export(set_variable_labels) export(sort_by_key) +importFrom(cli,cli_alert_info) +importFrom(cli,cli_alert_warning) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,add_count) @@ -75,6 +77,7 @@ importFrom(stringr,str_c) importFrom(stringr,str_count) importFrom(stringr,str_detect) importFrom(stringr,str_extract) +importFrom(stringr,str_match_all) importFrom(stringr,str_remove) importFrom(stringr,str_remove_all) importFrom(stringr,str_split) diff --git a/R/build.R b/R/build.R index 2a15a7e..b870b54 100644 --- a/R/build.R +++ b/R/build.R @@ -16,21 +16,40 @@ #' @param predecessor_only By default `TRUE`, so only variables with the origin #' of 'Predecessor' will be used. If `FALSE` any derivation matching the #' dataset.variable will be used. -#' @param keep Boolean to determine if the original columns should be kept. By -#' default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting -#' dataset will have all the ADaM columns as well as any SDTM column that were -#' renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting -#' dataset) +#' @param keep String to determine which columns from the original datasets +#' should be kept +#' - "FALSE" (default): only columns that are also present in the ADaM +#' specification are kept in the output. +#' - "ALL": all original columns are carried through to the +#' ADaM, including those that have been renamed. +#' e.g. if DM.ARM is a predecessor to DM.TRT01P, +#' both ARM and TRT01P will be present as columns +#' in the ADaM output. +#' - "PREREQUISITE": columns are retained if they are required for future +#' derivations in the specification. Additional +#' prerequisite columns are identified as columns +#' that appear in the 'derivation' column of the +#' metacore object in the format "DATASET.VARIABLE", +#' but not as direct predecessors. Predecessors are +#' defined as columns where the derivation is a 1:1 +#' copy of a column in a source dataset. +#' e.g. derivation = "VS.VSTESTCD" is a predecessor, +#' while derivation = "Value of VS.VSSTRESN where +#' VS.VSTESTCD == 'Heart Rate'" contains both +#' VS.VSTESTCD and VS.VSSTRESN as prerequisites, and +#' these columns will be kept through to the ADaM. +#' #' #' @return dataset #' @export #' @importFrom stringr str_to_lower str_detect str_extract str_to_upper -#' str_split +#' str_split str_match_all #' @importFrom dplyr filter pull mutate group_by group_split inner_join select #' full_join bind_rows #' @importFrom tidyr unnest #' @importFrom purrr map reduce #' @importFrom tibble tibble +#' @importFrom cli cli_alert_warning cli_alert_info #' #' @examples #' library(metacore) @@ -40,8 +59,31 @@ #' spec <- metacore %>% select_dataset("ADSL") #' ds_list <- list(DM = read_xpt(metatools_example("dm.xpt"))) #' build_from_derived(spec, ds_list, predecessor_only = FALSE) +#' +#' # Building an ADaM (ADVS) from multiple input datasets, keeping columns +#' # needed for future transformations +#' library(metacore) +#' library(haven) +#' library(magrittr) +#' library(safetyData) +#' load(metacore_example("pilot_ADaM.rda")) +#' spec <- metacore %>% select_dataset("ADVS") +#' ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl) +#' build_from_derived(spec, +#' ds_list, +#' predecessor_only = FALSE, +#' keep = "PREREQUISITE" +#' ) + build_from_derived <- function(metacore, ds_list, dataset_name = NULL, predecessor_only = TRUE, keep = FALSE) { + # Deprecate KEEP = TRUE + keep <- match.arg(as.character(keep), c("TRUE", "FALSE", "ALL", "PREREQUISITE")) + if (keep == "TRUE"){ + cli_alert_warning(paste0("Setting 'keep' = TRUE has been superseded", + ", and will be unavailable in future releases. Please consider setting ", + "'keep' equal to 'ALL' or 'PREREQUISITE'.")) + } metacore <- make_lone_dataset(metacore, dataset_name) derirvations <- metacore$derivations %>% mutate(derivation = trimws(derivation)) @@ -60,6 +102,7 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, vars_to_pull_through <- derirvations %>% filter(str_detect(derivation, "^\\w*\\.[a-zA-Z0-9]*$")) + # To lower so it is flexible about how people name their ds list vars_w_ds <- vars_to_pull_through %>% mutate(ds = str_extract(derivation, "^\\w*(?=\\.)") %>% @@ -123,12 +166,11 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, bind_rows(additional_vals) %>% group_by(ds) %>% group_split() %>% - map(get_variables, ds_list, keep) %>% + map(get_variables, ds_list, keep, derirvations) %>% + prepare_join(join_by, names(ds_list)) %>% reduce(full_join, by = join_by) } - - #' Internal functions to get variables from a dataset list #' #' This function is used with `build_from_derived` to build a dataset of columns @@ -140,22 +182,89 @@ build_from_derived <- function(metacore, ds_list, dataset_name = NULL, #' #' @return datasets #' @noRd -get_variables <- function(x, ds_list, keep) { +get_variables <- function(x, ds_list, keep, derivations) { ds_name <- unique(x$ds) data <- ds_list[[ds_name]] rename_vec <- set_names(x$col_name, x$variable) - if (keep) { + if (keep == "TRUE") { + # Don't drop predecessor columns out <- data %>% select(x$col_name) %>% mutate(across(all_of(rename_vec))) - } else { + } else if (keep == "FALSE") { + # Drop predecessor columns out <- data %>% select(x$col_name) %>% - rename(all_of(rename_vec)) + mutate(across(all_of(rename_vec))) %>% + select(x$variable) + } else if (keep == "ALL") { + # Keep all cols from original datasets + out <- data %>% + mutate(across(all_of(rename_vec))) + } else if (keep == "PREREQUISITE") { + # Keep all columns required for future derivations + # Find all "XX.XXXXX" + future_derivations <- derivations %>% + select(derivation) %>% + filter(!str_detect(derivation,"^[A-Z0-9a-z]+\\.[A-Z0-9a-z]+$")) + + prereq_vector <- str_match_all(future_derivations$derivation, "([A-Z0-9a-z]+)\\.([A-Z0-9a-z]+)") + + # Bind into matrix + remove dups + prereq_matrix <- do.call(rbind,prereq_vector) %>% + unique() + + # Subset to those present in current dataset + prereq_cols <- subset(prereq_matrix, tolower(prereq_matrix[,2]) == tolower(ds_name))[,3] + + out <- data %>% + select(c(x$col_name, all_of(prereq_cols))) %>% + mutate(across(all_of(rename_vec))) %>% + select(c(x$variable, all_of(prereq_cols))) } out } +#' Internal function to remove duplicated non-key variables prior to join +#' +#' This function is used with `build_from_derived` to drop columns that would +#' cause a conflict on joining datasets, prioritising keeping columns in +#' datasets earlier on in ds_list. +#' +#' e.g. if ds_list = ("AE", "ADSL") and there is a conflicting column +#' "STUDYID", the column will be dropped from ADSL (index 2) rather than AE +#' (index 1). +#' +#' @param x List of datasets with all columns added +#' @param keys List of key values to join on +#' +#' @return datasets +#' @noRd +prepare_join <- function(x, keys, ds_names) { + out <- list(x[[1]]) + + if (length(x) > 1){ + for (i in 2:length(x)){ + # Drop non-key cols present in each previous dataset in order + drop_cols <- c() + + for (j in 1:(i-1)){ + conflicting_cols <- keep(names(x[[j]]), function(col) !(col %in% keys)) %>% + intersect(colnames(x[[i]])) + drop_cols <- c(drop_cols, conflicting_cols) + + if(length(conflicting_cols) > 0){ + cli_alert_info(paste0("Dropping column(s) from ", ds_names[[i]], + " due to conflict with ",ds_names[[j]],": ", conflicting_cols,".")) + } + } + + out[[i]] <- x[[i]] %>% + select(-any_of(drop_cols)) + } + } + out +} #' Drop Unspecified Variables #' diff --git a/man/build_from_derived.Rd b/man/build_from_derived.Rd index 4e5c597..b060c0b 100644 --- a/man/build_from_derived.Rd +++ b/man/build_from_derived.Rd @@ -27,11 +27,29 @@ been subsetted.} of 'Predecessor' will be used. If `FALSE` any derivation matching the dataset.variable will be used.} -\item{keep}{Boolean to determine if the original columns should be kept. By -default `FALSE`, so only the ADaM columns are kept. If `TRUE` the resulting -dataset will have all the ADaM columns as well as any SDTM column that were -renamed in the ADaM (i.e `ARM` and `TRT01P` will be in the resulting -dataset)} +\item{keep}{String to determine which columns from the original datasets + should be kept + - "FALSE" (default): only columns that are also present in the ADaM + specification are kept in the output. + - "ALL": all original columns are carried through to the + ADaM, including those that have been renamed. + e.g. if DM.ARM is a predecessor to DM.TRT01P, + both ARM and TRT01P will be present as columns + in the ADaM output. + - "PREREQUISITE": columns are retained if they are required for future + derivations in the specification. Additional + prerequisite columns are identified as columns + that appear in the 'derivation' column of the + metacore object, but not as direct predecessors. + Predecessors are defined as columns where the + derivation is a 1:1 copy of a column in a source + dataset. + + e.g. derivation = "VS.VSTESTCD" is a predecessor, + while derivation = "Value of VS.VSSTRESN where + VS.VSTESTCD == 'Heart Rate'" contains both + VS.VSTESTCD and VS.VSSTRESN as prerequisites, and + these columns will be kept through to the ADaM.} } \value{ dataset @@ -52,4 +70,19 @@ load(metacore_example("pilot_ADaM.rda")) spec <- metacore \%>\% select_dataset("ADSL") ds_list <- list(DM = read_xpt(metatools_example("dm.xpt"))) build_from_derived(spec, ds_list, predecessor_only = FALSE) + +# Building an ADaM (ADVS) from multiple input datasets, keeping columns +# needed for future transformations +library(metacore) +library(haven) +library(magrittr) +library(safetyData) +load(metacore_example("pilot_ADaM.rda")) +spec <- metacore \%>\% select_dataset("ADVS") +ds_list <- list("VS" = safetyData::sdtm_vs,"ADSL" = safetyData::adam_adsl) +build_from_derived(spec, + ds_list, + predecessor_only = FALSE, + keep = "PREREQUISITE" +) } diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R index 44aebf8..e819460 100644 --- a/tests/testthat/test-build.R +++ b/tests/testthat/test-build.R @@ -28,7 +28,7 @@ test_that("build_from_derived", { pull(derivation) %>% str_remove("^DM\\.") %>% unique() %>% - ifelse(. == "ARM", "TRT01P", .) %>% + c("TRT01P") %>% sort() build_from_derived(spec, ds_list, predecessor_only = FALSE, @@ -46,13 +46,19 @@ test_that("build_from_derived", { unique() %>% c(., "TRT01P") %>% sort() - build_from_derived(spec, ds_list, - predecessor_only = FALSE, - keep = TRUE - ) %>% - names() %>% - sort() %>% - expect_equal(man_vars) + + expect_message( + build_from_derived(spec, ds_list, + predecessor_only = FALSE, + keep = TRUE + ) %>% + names() %>% + sort() %>% + expect_equal(man_vars), + label = paste0("! Setting 'keep' = TRUE has been superseded, and will be", + " unavailable in future releases. Please consider setting", + " 'keep' equal to 'ALL' or 'PREREQUISITE'.") + ) # Pulling through from more than one dataset spec2 <- metacore %>% select_dataset("ADAE") @@ -108,6 +114,50 @@ test_that("build_from_derived", { keep = FALSE )) + # Pulling through all columns from original dataset + adae_full <- build_from_derived(spec2, + ds_list = list("AE" = safetyData::sdtm_ae, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "ALL" + ) + + full_adsl_part <- safetyData::adam_adsl %>% + mutate(TRTA = TRT01A, TRTAN = TRT01AN) + + adae_all_man <- full_join(full_adsl_part, safetyData::sdtm_ae, by = c("STUDYID", "USUBJID"), multiple = "all") + + expect_equal(adae_full,adae_all_man) + + # Pulling through columns required for future derivations + spec3 <- metacore %>% select_dataset("ADVS") + + advs_prereq <- build_from_derived(spec3, + ds_list = list("VS" = safetyData::sdtm_vs, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "PREREQUISITE" + ) + + advs_auto <- build_from_derived(spec3, + ds_list = list("VS" = safetyData::sdtm_vs, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "PREREQUISITE" + ) + + + advs_all <- build_from_derived(spec3, + ds_list = list("VS" = safetyData::sdtm_vs, + "ADSL" = safetyData::adam_adsl), + predecessor_only = FALSE, + keep = "ALL" + ) + + advs_prereq_man <- advs_all %>% + select(c(names(advs_auto), VSDTC, VSSTRESN)) + + expect_equal(advs_prereq, advs_prereq_man) })