From d89e1650e14c269ff20efa8ec29a5cb7bcb8d3cf Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Wed, 18 Sep 2024 20:54:16 -0400 Subject: [PATCH 01/77] first attempt of improve speed --- R/is_id.R | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/R/is_id.R b/R/is_id.R index a1f77086..9e42f5d3 100644 --- a/R/is_id.R +++ b/R/is_id.R @@ -35,40 +35,32 @@ if (getRversion() >= '2.15.1') #' is_id(y1, by = "id") is_id <- function(dt, by, - verbose = getOption("joyn.verbose"), + verbose = getOption("joyn.verbose", default = FALSE), return_report = FALSE) { - # make sure it is data.table - if (!(is.data.table(dt))) { + # Ensure dt is a data.table + if (!is.data.table(dt)) { dt <- as.data.table(dt) - } else { - dt <- data.table::copy(dt) } - # count - m <- dt[, .(copies =.N), by = mget(by)] - is_id <- m[, mean(copies)] == 1 + # Check for duplicates + is_id <- !(anyDuplicated(dt, by = by) > 0) if (verbose) { - - cli::cli_h3("Duplicates in terms of {.code {by}}") - - d <- freq_table(m, "copies") - print(d[]) - - cli::cli_rule(right = "End of {.field is_id()} report") - + if (is_id) { + cli::cli_alert_success("No duplicates found by {.code {by}}") + } else { + cli::cli_alert_warning("Duplicates found by: {.code {by}}") + } } - if (isFALSE(return_report)) { - - return(is_id) - + if (return_report) { + # Return the duplicated rows if requested + dup_rows <- dt[duplicated(dt, by = by) | + duplicated(dt, by = by, fromLast = TRUE)] + # freq_table(x = dt, byvar = by)[] + return(dup_rows) } else { - - return(m) - + return(is_id) } - } - From 333bd5bae4d9c28bfb13175c1e5edc38de3cb7a5 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Wed, 18 Sep 2024 21:15:03 -0400 Subject: [PATCH 02/77] first attempt of improving is_id --- R/freq_table.R | 8 ++++++-- R/is_id.R | 14 ++++++++++---- man/freq_table.Rd | 2 +- man/is_id.Rd | 7 ++++++- man/merge.Rd | 2 +- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/R/freq_table.R b/R/freq_table.R index 5bf93150..51aa79a9 100644 --- a/R/freq_table.R +++ b/R/freq_table.R @@ -26,7 +26,8 @@ if (getRversion() >= '2.15.1') freq_table <- function(x, byvar, digits = 1, - na.rm = FALSE) { + na.rm = FALSE, + freq_var_name = "n") { x_name <- as.character(substitute(x)) if (!is.data.frame(x)) { @@ -48,7 +49,10 @@ freq_table <- function(x, # filter zeros fsubset(n > 0) - setrename(ft, joyn = byvar, .nse = FALSE) + setrename(ft, + joyn = byvar, + n = freq_var_name, + .nse = FALSE) } diff --git a/R/is_id.R b/R/is_id.R index 9e42f5d3..8bbd8ccc 100644 --- a/R/is_id.R +++ b/R/is_id.R @@ -56,10 +56,16 @@ is_id <- function(dt, if (return_report) { # Return the duplicated rows if requested - dup_rows <- dt[duplicated(dt, by = by) | - duplicated(dt, by = by, fromLast = TRUE)] - # freq_table(x = dt, byvar = by)[] - return(dup_rows) + cli::cli_h3("Duplicates in terms of {.code {by}}") + + d <- freq_table(x = dt, + byvar = by, + freq_var_name = "copies") + d |> + fsubset(copies > 1) |> + print() + cli::cli_rule(right = "End of {.field is_id()} report") + return(invisible(d)) } else { return(is_id) } diff --git a/man/freq_table.Rd b/man/freq_table.Rd index 6be47f0c..83e68fbd 100644 --- a/man/freq_table.Rd +++ b/man/freq_table.Rd @@ -4,7 +4,7 @@ \alias{freq_table} \title{Tabulate simple frequencies} \usage{ -freq_table(x, byvar, digits = 1, na.rm = FALSE) +freq_table(x, byvar, digits = 1, na.rm = FALSE, freq_var_name = "n") } \arguments{ \item{x}{data frame} diff --git a/man/is_id.Rd b/man/is_id.Rd index fce181e9..2e5e2051 100644 --- a/man/is_id.Rd +++ b/man/is_id.Rd @@ -4,7 +4,12 @@ \alias{is_id} \title{Check if dt is uniquely identified by \code{by} variable} \usage{ -is_id(dt, by, verbose = getOption("joyn.verbose"), return_report = FALSE) +is_id( + dt, + by, + verbose = getOption("joyn.verbose", default = FALSE), + return_report = FALSE +) } \arguments{ \item{dt}{either right of left table} diff --git a/man/merge.Rd b/man/merge.Rd index 7f748784..5107d64d 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -45,7 +45,7 @@ data from both \code{x} and \code{y} are included in the output.} \item{sort}{logical. If \code{TRUE} (default), the rows of the merged \code{data.table} are sorted by setting the key to the \code{by / by.x} columns. If \code{FALSE}, unlike base R's \code{merge} for which row order is unspecified, the -row order in \code{x} is retained (including retaining the position of missings when +row order in \code{x} is retained (including retaining the position of missing entries when \code{all.x=TRUE}), followed by \code{y} rows that don't match \code{x} (when \code{all.y=TRUE}) retaining the order those appear in \code{y}.} From 0f0b7d5ba0ffde0e5ece1ef883f53bc5505092ad Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Wed, 18 Sep 2024 22:39:15 -0400 Subject: [PATCH 03/77] build super big data to test efficiency in possible_ids --- R/possible_ids.R | 215 +++++++++++------------------ tests/testthat/test-possible_ids.R | 75 ++++++++++ 2 files changed, 155 insertions(+), 135 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 86cea94a..a97a8116 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -24,175 +24,120 @@ possible_ids <- function(dt, exclude = NULL, include = NULL, - verbose = getOption("possible_ids.verbose")) { - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Check inputs --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + exclude_types = NULL, + include_types = NULL, + verbose = getOption("possible_ids.verbose", default = FALSE), + comb_size = 5, + get_all = FALSE) { + # Ensure dt is a data.table if (!is.data.frame(dt)) { stop("data must be a data frame") } - - if (is.data.table(dt)) { - dt <- as.data.frame(dt) - } - - - if (is.null(exclude) && !is.null(include)) { - if (verbose) { - cli::cli_alert_warning("Since {.code exclude} is NULL, {.code include} - directive does not make sense. Ignored.", - wrap = TRUE) - } - warning("inconsistent use of `include`") + if (!is.data.table(dt)) { + dt <- as.data.table(dt) } - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ## exclude variables from check ------ + # Get all variable names + vars <- names(dt) |> copy() - vars <- names(dt) + # Compute the primary class of each variable + vars_class <- vapply(dt, function(x) class(x)[1], character(1)) + names(vars_class) <- vars # Ensure names are preserved - ### Exclude variable according to their type --------- + # Apply 'include' filter if (!is.null(include)) { + vars <- intersect(vars, include) + } - # Find position of variable to include - ii <- which(names(dt) %in% include) - - } else { - - ii <- NULL - + # Apply 'include_types' filter + if (!is.null(include_types)) { + vars <- vars[vars_class[vars] %in% include_types] } - ### Exclude variable by name --------- + # Apply 'exclude' filter if (!is.null(exclude)) { - - if (any(grepl("^_", exclude))) { - - type_ex <- exclude[grepl("^_", exclude)] - vars_ex <- exclude[!grepl("^_", exclude)] - - type_ex <- match.arg(type_ex, c("_character", "_numeric")) - - # find variable that meet criteria and exclude them, making sure to include - # the variables of the user. - ex <- gsub("^_", "", type_ex) - FUN <- paste0("is.", ex) - - n_cols <- unlist(lapply(dt, FUN)) - n_cols[ii] <- FALSE - - # Exclude variables by name - - if (length(vars_ex) > 0) { - ex <-which(names(dt) %in% vars_ex) - n_cols[ex] <- TRUE - } - - vars <- names(dt)[!n_cols] - - } else { - vars <- vars[!(vars %in% exclude)] - - if (identical(vars, names(dt))) { - if (verbose) { - cli::cli_alert_warning("Variable {.field {exclude}} is not available in data frame. - Nothing is excluded.", wrap = TRUE) - } - - warning("inconsistenty use of `exclude`") - - } - - } + vars <- setdiff(vars, exclude) } - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ## check all names are unieuq -------- - dup_var <- duplicated(vars) - - if (any(dup_var)) { - - dvars <- vars[dup_var] - - msg <- "column names must be unique" - hint <- "try changing the names using {.fun make.names}" - problem <- "{.var {dvars}} {?is/are} duplicated" - cli::cli_abort(c( - msg, - i = hint, - x = problem - )) - + # Apply 'exclude_types' filter + if (!is.null(exclude_types)) { + vars <- vars[!(vars_class[vars] %in% exclude_types)] } + # Remove duplicate column names... just in case + vars <- unique(vars) - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Find duplicates --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - duplicates <- is_id(dt, by = vars, verbose = FALSE) - if (duplicates) { - if (verbose) { - cli::cli_alert_success("There are no duplicates in data frame") - } - } else { + if (length(vars) == 0) { if (verbose) { - cli::cli_alert_warning("Data has duplicates. returning NULL") + cli::cli_alert_danger("No variables available after applying include/exclude filters.") } - is_id(dt, by = vars, verbose = TRUE) - return(NULL) + return(NULL) # should this be an error? } - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Find ids --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nvars <- length(vars) + # Check if any variables uniquely identify the data individually + unique_vars <- vapply(vars, + \(var) !anyDuplicated(dt[[var]]), + logical(1)) - found <- FALSE - i = 0 - while(i < nvars && found == FALSE) { - i = i + 1 - cm <- utils::combn(vars, m = i) + # Collect variables that are unique identifiers + unique_ids <- vars[unique_vars] - lcm <- dim(cm)[2] # number of combinations of size j + # Initialize list to store possible IDs + possible_ids_list <- list() - selected_vars <- vector(length = lcm) - for (j in 1:lcm) { - tvars <- cm[, j] # testing vars - selected_vars[j] <- is_id(dt, by = tvars, verbose = FALSE) + # Add individual unique variables + if (length(unique_ids) > 0) { + possible_ids_list <- c(possible_ids_list, as.list(unique_ids)) + if (verbose) { + cli::cli_alert_info("Found unique identifiers: {.code {unique_ids}}") } + } - sv <- which(selected_vars) - - if (length(sv) > 0) { - - if (length(sv) == 1 && i > 1) { - - lv <- list(V1 = cm[, sv]) - - } else if (i == 1) { + # Remove unique identifiers from vars to reduce combinations + vars <- setdiff(vars, unique_ids) - ee <- as.data.frame(t(cm[, sv])) - lv <- lapply(ee, unique) + # If data is uniquely identified by existing variables, return the unique IDs + if (length(possible_ids_list) > 0 && fnrow(dt) == fnrow(unique(dt[, ..unique_ids]))) { + return(possible_ids_list) + } - } else { + # Start with combinations of size 2 up to the number of remaining vars + for (i in 2:min(length(vars), comb_size)) { # Limit combination size to 5 for efficiency + combos <- combn(vars, i, simplify = FALSE) - ee <- as.data.frame(cm[, sv]) - lv <- lapply(ee, unique) + if (verbose) { + msg <- sprintf("combinations of %s variables", i) + cli::cli_progress_bar(msg, total = length(combos)) + } + for (combo in combos) { + # Check if the combination uniquely identifies the data + if (is_id(dt, by = combo, verbose = FALSE)) { + # This is inefficient... it is copying every time... + # I need to think better on how to do it. + possible_ids_list <- c(possible_ids_list, list(combo)) + if (!get_all) { + return(possible_ids_list) + } + # Remove variables in the current combo from vars to avoid redundant checks + vars <- setdiff(vars, combo) + break # Break since we found a minimal unique key of size i } - - found <- TRUE - + if (verbose) cli::cli_progress_update() + } + # Break if all variables are used + if (length(vars) == 0) { + break } } - if (verbose) { - cli::cli_alert("we found {length(lv)} possible id{?s}") + if (length(possible_ids_list) == 0) { + if (verbose) { + cli::cli_alert_warning("No unique identifier found.") + } + return(NULL) + } else { + return(possible_ids_list) } - - return(lv) - } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index ccf68847..7ad695e7 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -118,3 +118,78 @@ test_that("duplicated names", { }) + +library(lubridate) # For date functions + +# Set seed for reproducibility +set.seed(123) + +# Number of rows and variables +n_rows <- 1e5 # 100,000 rows +n_vars <- 50 # Total variables + +# Initialize an empty data.table +dt_large <- data.table(n = 1:n_rows) + +# Function to generate random data +generate_random_data <- function(n, type) { + switch(type, + "numeric_int" = sample(1:1e6, n, replace = TRUE), + "numeric_double" = rnorm(n), + "character" = replicate(n, paste0(sample(letters, 5, replace = TRUE), collapse = "")), + "factor" = factor(sample(letters[1:10], n, replace = TRUE)), + "logical" = sample(c(TRUE, FALSE), n, replace = TRUE), + "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE), + "datetime" = as.POSIXct("2000-01-01") + sample(0:(3650*24*60*60), n, replace = TRUE) + ) +} + +# Variable types and counts +var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime") +vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # sum to 50 + +# Generate variables and add to the data.table +var_count <- 0 +for (i in seq_along(var_types)) { + type <- var_types[i] + n_vars_type <- vars_per_type[i] + for (j in 1:n_vars_type) { + var_count <- var_count + 1 + var_name <- paste0(type, "_", j) + dt_large[, (var_name) := generate_random_data(n_rows, type)] + } +} + +# Introduce duplicates intentionally +# Duplicate the first 100 rows +dt_large <- rowbind(dt_large, dt_large[1:100, ]) + +# Shuffle the data +dt_large <- dt_large[sample(.N)] +dt_large[, id := .I] + + +possible_ids( + dt = dt_large, + verbose = TRUE +) + +# Remove the 'id' column to simulate data without a clear unique identifier +dt_large[, id := NULL] + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_types = c("logical", "date", "datetime"), # Exclude some types for efficiency + verbose = TRUE +) +possible_ids_list + + +# Display the structure of the data.table +str(dt_large) + + + + + + From baa6bd6ecefcf23a4eaf26f808d9fe5b7fc95954 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 08:21:24 -0400 Subject: [PATCH 04/77] break down by function init --- R/possible_ids.R | 63 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index a97a8116..603fc473 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -30,6 +30,7 @@ possible_ids <- function(dt, comb_size = 5, get_all = FALSE) { + # defenses --------- # Ensure dt is a data.table if (!is.data.frame(dt)) { stop("data must be a data frame") @@ -41,29 +42,22 @@ possible_ids <- function(dt, # Get all variable names vars <- names(dt) |> copy() - # Compute the primary class of each variable - vars_class <- vapply(dt, function(x) class(x)[1], character(1)) - names(vars_class) <- vars # Ensure names are preserved + # Exclude and include ------- + + ## classes ---------- + vars <- filter_by_class(dt = dt, + vars = vars, + include_types = include_types, + exclude_types = exclude_types) + + ## var names -------- + vars <- filter_by_name(vars, include, exclude) + + - # Apply 'include' filter - if (!is.null(include)) { - vars <- intersect(vars, include) - } - # Apply 'include_types' filter - if (!is.null(include_types)) { - vars <- vars[vars_class[vars] %in% include_types] - } - # Apply 'exclude' filter - if (!is.null(exclude)) { - vars <- setdiff(vars, exclude) - } - # Apply 'exclude_types' filter - if (!is.null(exclude_types)) { - vars <- vars[!(vars_class[vars] %in% exclude_types)] - } # Remove duplicate column names... just in case vars <- unique(vars) @@ -141,3 +135,34 @@ possible_ids <- function(dt, return(possible_ids_list) } } + + +filter_by_class <- function(dt, vars, include_types, exclude_types) { + # Compute the primary class of each variable + vars_class <- vapply(dt, function(x) class(x)[1], character(1)) + names(vars_class) <- vars # Ensure names are preserved + + # Apply 'include_types' filter + if (!is.null(include_types)) { + vars <- vars[vars_class[vars] %in% include_types] + } + + # Apply 'exclude_types' filter + if (!is.null(exclude_types)) { + vars <- vars[!(vars_class[vars] %in% exclude_types)] + } + vars +} + +filter_by_name <- function(vars, include, exclude) { + # Apply 'include' filter + if (!is.null(include)) { + vars <- intersect(vars, include) + } + + # Apply 'exclude' filter + if (!is.null(exclude)) { + vars <- setdiff(vars, exclude) + } + vars +} From 1bd7aee57eae8e122e98248885ec8d840ffbfa21 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 10:30:38 -0400 Subject: [PATCH 05/77] update --- R/possible_ids.R | 92 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 24 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 603fc473..346cd71d 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -26,8 +26,10 @@ possible_ids <- function(dt, include = NULL, exclude_types = NULL, include_types = NULL, - verbose = getOption("possible_ids.verbose", default = FALSE), - comb_size = 5, + verbose = getOption("possible_ids.verbose", + default = FALSE), + max_combination_size = 5, + max_processing_time = 60, # in seconds get_all = FALSE) { # defenses --------- @@ -54,54 +56,70 @@ possible_ids <- function(dt, vars <- filter_by_name(vars, include, exclude) - - - - - # Remove duplicate column names... just in case vars <- unique(vars) if (length(vars) == 0) { if (verbose) { - cli::cli_alert_danger("No variables available after applying include/exclude filters.") + cli::cli_alert_danger("No variables available after applying + include/exclude filters.") } return(NULL) # should this be an error? } - # Check if any variables uniquely identify the data individually - unique_vars <- vapply(vars, - \(var) !anyDuplicated(dt[[var]]), - logical(1)) + # Unique values --------- - # Collect variables that are unique identifiers - unique_ids <- vars[unique_vars] + # Sort variables by number of unique values (ascending order) + unique_counts <- vapply(dt[, ..vars], fnunique, numeric(1)) + vars <- vars[order(unique_counts)] + unique_counts <- unique_counts[order(unique_counts)] + n_row <- fnrow(dt) + unique_ids <- vars[unique_counts == n_row] # Initialize list to store possible IDs possible_ids_list <- list() - # Add individual unique variables if (length(unique_ids) > 0) { possible_ids_list <- c(possible_ids_list, as.list(unique_ids)) if (verbose) { cli::cli_alert_info("Found unique identifiers: {.code {unique_ids}}") } + if (!get_all) return(possible_ids_list) } # Remove unique identifiers from vars to reduce combinations vars <- setdiff(vars, unique_ids) - - # If data is uniquely identified by existing variables, return the unique IDs - if (length(possible_ids_list) > 0 && fnrow(dt) == fnrow(unique(dt[, ..unique_ids]))) { + if (length(vars) == 0) { + # All variables are unique identifiers return(possible_ids_list) } + unique_counts <- unique_counts[vars] + + # combinations ----------- + + # Start testing combinations + start_time <- Sys.time() + max_size <- min(length(vars), max_combination_size) - # Start with combinations of size 2 up to the number of remaining vars - for (i in 2:min(length(vars), comb_size)) { # Limit combination size to 5 for efficiency - combos <- combn(vars, i, simplify = FALSE) + for (comb_size in 2:max_size) { + + combos <- combn(vars, comb_size, simplify = FALSE) + + # Prune combinations where the product of unique counts is less + # than n_rows + combos_to_keep <- vapply(combos, \(combo) { + prod(unique_counts[combo]) >= n_rows + }, logical(1)) + + combos <- combos[combo_to_keep] + + # Estimate processing time and prune combinations + est_times <- vapply(combos, function(combo) { + estimate_combination_time(n_rows, unique_counts[combo]) + }, numeric(1)) if (verbose) { - msg <- sprintf("combinations of %s variables", i) + msg <- sprintf("combinations of %s variables", comb_size) cli::cli_progress_bar(msg, total = length(combos)) } @@ -114,14 +132,29 @@ possible_ids <- function(dt, if (!get_all) { return(possible_ids_list) } - # Remove variables in the current combo from vars to avoid redundant checks + # Remove variables in the current combo from vars to + # avoid redundant checks vars <- setdiff(vars, combo) + unique_counts <- unique_counts[vars] break # Break since we found a minimal unique key of size i } + # Check processing time + elapsed_time <- as.numeric(difftime(Sys.time(), + start_time, + units = "secs")) + if (elapsed_time > max_processing_time) { + if (verbose) { + mxt_msg <- "Maximum processing time exceeded. + modify {.arg max_processing_time} argument to increse time. + Stopping search." + cli::cli_alert_warning(mxt_msg) + } + break + } if (verbose) cli::cli_progress_update() } # Break if all variables are used - if (length(vars) == 0) { + if (length(vars) == 0 || elapsed_time > max_processing_time) { break } } @@ -166,3 +199,14 @@ filter_by_name <- function(vars, include, exclude) { } vars } + + + +# Function to estimate processing time based on unique counts +estimate_combination_time <- function(n_rows, unique_counts) { + # Simple estimation function + # Time is proportional to (product of unique counts) / n_rows + # Adjust the constant factor based on empirical observations + est_time <- (prod(unique_counts) / n_rows) * 0.0001 + return(est_time) +} From 209f272cd16259852e0b2540d11618efa7bc37cc Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 12:51:01 -0400 Subject: [PATCH 06/77] update format --- R/possible_ids.R | 94 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 346cd71d..93fa548b 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -28,8 +28,10 @@ possible_ids <- function(dt, include_types = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), + min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, # in seconds + max_numb_possible_ids = 100, get_all = FALSE) { # defenses --------- @@ -59,6 +61,10 @@ possible_ids <- function(dt, # Remove duplicate column names... just in case vars <- unique(vars) + if (verbose) { + cli::cli_alert_info("Variables to test: {.field {vars}}") + } + if (length(vars) == 0) { if (verbose) { cli::cli_alert_danger("No variables available after applying @@ -74,34 +80,43 @@ possible_ids <- function(dt, vars <- vars[order(unique_counts)] unique_counts <- unique_counts[order(unique_counts)] n_row <- fnrow(dt) - unique_ids <- vars[unique_counts == n_row] + init_index <- 0 + + # Initialize list to store possible IDs - possible_ids_list <- list() - # Add individual unique variables - if (length(unique_ids) > 0) { - possible_ids_list <- c(possible_ids_list, as.list(unique_ids)) - if (verbose) { - cli::cli_alert_info("Found unique identifiers: {.code {unique_ids}}") - } - if (!get_all) return(possible_ids_list) - } + possible_ids_list <- vector("list", max_numb_possible_ids) + + if (min_combination_size == 1) { + unique_ids <- vars[unique_counts == n_row] + # Add individual unique variables + init_index <- length(unique_ids) + if (init_index > 0) { + possible_ids_list[1:init_index] <- as.list(unique_ids) + if (verbose) { + cli::cli_alert_info("Found unique identifiers: {.code {unique_ids}}") + } + if (!get_all) return(remove_null(possible_ids_list)) - # Remove unique identifiers from vars to reduce combinations - vars <- setdiff(vars, unique_ids) - if (length(vars) == 0) { - # All variables are unique identifiers - return(possible_ids_list) + # Remove unique identifiers from vars to reduce combinations + vars <- setdiff(vars, unique_ids) + if (length(vars) == 0) { + # All variables are unique identifiers + return(remove_null(possible_ids_list)) + } + unique_counts <- unique_counts[vars] + } } - unique_counts <- unique_counts[vars] # combinations ----------- # Start testing combinations start_time <- Sys.time() max_size <- min(length(vars), max_combination_size) + min_size <- max(min_combination_size, 2) - for (comb_size in 2:max_size) { + j <- init_index + 1 + for (comb_size in min_size:max_size) { combos <- combn(vars, comb_size, simplify = FALSE) @@ -111,7 +126,7 @@ possible_ids <- function(dt, prod(unique_counts[combo]) >= n_rows }, logical(1)) - combos <- combos[combo_to_keep] + combos <- combos[combos_to_keep] # Estimate processing time and prune combinations est_times <- vapply(combos, function(combo) { @@ -119,8 +134,10 @@ possible_ids <- function(dt, }, numeric(1)) if (verbose) { - msg <- sprintf("combinations of %s variables", comb_size) - cli::cli_progress_bar(msg, total = length(combos)) + cli::cli_progress_bar( + format = "combs of {cli::pb_extra$comb_size} vars: {cli::pb_bar} {cli::pb_percent} | ETA: {cli::pb_eta} | {cli::pb_current}/{cli::pb_total}", + extra = list(comb_size = comb_size), + total = length(combos)) } for (combo in combos) { @@ -128,15 +145,26 @@ possible_ids <- function(dt, if (is_id(dt, by = combo, verbose = FALSE)) { # This is inefficient... it is copying every time... # I need to think better on how to do it. - possible_ids_list <- c(possible_ids_list, list(combo)) - if (!get_all) { + possible_ids_list[j] <- combo + j <- init_index + 1 + if (j > max_numb_possible_ids) { + if (verbose) { + cli::cli_alert_warning( + "Max number of possible IDs ({max_numb_possible_ids}) reached. + You may modify it in argument {.arg max_numb_possible_ids}") + } return(possible_ids_list) } + if (!get_all) { + return(remove_null(possible_ids_list)) + } # Remove variables in the current combo from vars to # avoid redundant checks vars <- setdiff(vars, combo) unique_counts <- unique_counts[vars] - break # Break since we found a minimal unique key of size i + + # Break since we found a minimal unique key of size i + if (!get_all) break } # Check processing time elapsed_time <- as.numeric(difftime(Sys.time(), @@ -159,13 +187,13 @@ possible_ids <- function(dt, } } - if (length(possible_ids_list) == 0) { + if (length(remove_null(possible_ids_list)) == 0) { if (verbose) { cli::cli_alert_warning("No unique identifier found.") } return(NULL) } else { - return(possible_ids_list) + return(remove_null(possible_ids_list)) } } @@ -188,16 +216,14 @@ filter_by_class <- function(dt, vars, include_types, exclude_types) { } filter_by_name <- function(vars, include, exclude) { - # Apply 'include' filter - if (!is.null(include)) { - vars <- intersect(vars, include) - } - # Apply 'exclude' filter if (!is.null(exclude)) { vars <- setdiff(vars, exclude) } - vars + + # Apply 'include' filter + c(vars, include) + } @@ -210,3 +236,9 @@ estimate_combination_time <- function(n_rows, unique_counts) { est_time <- (prod(unique_counts) / n_rows) * 0.0001 return(est_time) } + + +remove_null <- \(x) { + y <- vapply(x, \(.) !is.null(.), logical(1)) + x[y] +} From fb9dff9b9b0bc5410abad5e2fa9b9ba3249b0c6c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 14:21:04 -0400 Subject: [PATCH 07/77] add some testing --- R/possible_ids.R | 34 +++++++----- man/possible_ids.Rd | 9 +++- tests/testthat/test-possible_ids.R | 84 ++++++++++++++++++++---------- 3 files changed, 87 insertions(+), 40 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 93fa548b..9e18f73e 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -24,8 +24,8 @@ possible_ids <- function(dt, exclude = NULL, include = NULL, - exclude_types = NULL, - include_types = NULL, + exclude_classes = NULL, + include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), min_combination_size = 1, @@ -51,8 +51,8 @@ possible_ids <- function(dt, ## classes ---------- vars <- filter_by_class(dt = dt, vars = vars, - include_types = include_types, - exclude_types = exclude_types) + include_classes = include_classes, + exclude_classes = exclude_classes) ## var names -------- vars <- filter_by_name(vars, include, exclude) @@ -112,8 +112,18 @@ possible_ids <- function(dt, # Start testing combinations start_time <- Sys.time() - max_size <- min(length(vars), max_combination_size) min_size <- max(min_combination_size, 2) + max_size <- min(length(vars), max_combination_size) + + # where there is only one variable or not enough vars to combine + if (min_size >= max_size) { + if (verbose) { + cli::cli_alert_warning( + "Can't make combinations of {.field {vars}} if the min number of + combinations is {min_size} and the max is {max_size}") + } + return(remove_null(possible_ids_list)) + } j <- init_index + 1 for (comb_size in min_size:max_size) { @@ -198,19 +208,19 @@ possible_ids <- function(dt, } -filter_by_class <- function(dt, vars, include_types, exclude_types) { +filter_by_class <- function(dt, vars, include_classes, exclude_classes) { # Compute the primary class of each variable vars_class <- vapply(dt, function(x) class(x)[1], character(1)) names(vars_class) <- vars # Ensure names are preserved - # Apply 'include_types' filter - if (!is.null(include_types)) { - vars <- vars[vars_class[vars] %in% include_types] + # Apply 'include_classes' filter + if (!is.null(include_classes)) { + vars <- vars[vars_class[vars] %in% include_classes] } - # Apply 'exclude_types' filter - if (!is.null(exclude_types)) { - vars <- vars[!(vars_class[vars] %in% exclude_types)] + # Apply 'exclude_classes' filter + if (!is.null(exclude_classes)) { + vars <- vars[!(vars_class[vars] %in% exclude_classes)] } vars } diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 3d284b31..72490eb2 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -8,7 +8,14 @@ possible_ids( dt, exclude = NULL, include = NULL, - verbose = getOption("possible_ids.verbose") + exclude_classes = NULL, + include_classes = NULL, + verbose = getOption("possible_ids.verbose", default = FALSE), + min_combination_size = 1, + max_combination_size = 5, + max_processing_time = 60, + max_numb_possible_ids = 100, + get_all = FALSE ) } \arguments{ diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 7ad695e7..c062b0ae 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -51,7 +51,7 @@ test_that("error if not dataframe", { }) -test_that("inconsistent user of `include`", { +test_that("inconsistent use of `include`", { expect_warning(possible_ids(x1, include = "x")) @@ -61,18 +61,18 @@ test_that("inconsistent user of `include`", { test_that("exclude and include", { dd <- possible_ids(x3, - exclude = "_numeric", + exclude_classes = c("numeric", "integer"), include = "foo") - expect_equal(c("V1", "V2"), names(dd)) + expect_equal(unlist(dd), c("id", "foo")) }) -test_that("get NULL when duplicates", { +test_that("get length 0", { - expect_null(possible_ids(x1, - exclude = "_numeric", - include = "t")) + expect_length(possible_ids(x1, + exclude_classes = c("numeric", "integer"), + include = "t"), 0) }) @@ -117,19 +117,22 @@ test_that("duplicated names", { }) - - -library(lubridate) # For date functions +# Big data -------------------- # Set seed for reproducibility set.seed(123) # Number of rows and variables -n_rows <- 1e5 # 100,000 rows +n_rows <- 1e4 # 10,000 rows n_vars <- 50 # Total variables # Initialize an empty data.table -dt_large <- data.table(n = 1:n_rows) +dt_large <- data.table(id = 1:n_rows) + +# Manually create three variables that uniquely identify the data +dt_large[, unique_id1 := rep(1:10, each = 1000)] # 1000 unique values repeated 100 times +dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)] # Random character variable +dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)] # Random integer # Function to generate random data generate_random_data <- function(n, type) { @@ -146,7 +149,7 @@ generate_random_data <- function(n, type) { # Variable types and counts var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime") -vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # sum to 50 +vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # Total should sum to 50 # Generate variables and add to the data.table var_count <- 0 @@ -160,15 +163,44 @@ for (i in seq_along(var_types)) { } } -# Introduce duplicates intentionally -# Duplicate the first 100 rows -dt_large <- rowbind(dt_large, dt_large[1:100, ]) +# Introduce duplicates in some columns that are NOT the unique identifiers +# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns +# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)]) -# Shuffle the data +# Shuffle the data to avoid ordered data dt_large <- dt_large[sample(.N)] -dt_large[, id := .I] + +# dt_large[, id := .I] +dt <- copy(dt_large) + +possible_ids( + dt = dt_large, + exclude_types = c("numeric"), + verbose = TRUE +) + +possible_ids( + dt = dt_large, + exclude_types = c("numeric"), + exclude = "id", + verbose = TRUE +) + +uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) +pids <- possible_ids( + dt = dt_large, + exclude_types = c("logical", "date", "datetime", "numeric"), + exclude = "id", + include = uniq_vars, + verbose = TRUE, + min_combination_size = 3, + # max_combination_size = 3, + max_processing_time = 240, + get_all = TRUE +) + possible_ids( dt = dt_large, verbose = TRUE @@ -184,12 +216,10 @@ possible_ids_list <- possible_ids( ) possible_ids_list - -# Display the structure of the data.table -str(dt_large) - - - - - - +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_types = c("logical", "date", "datetime", "numeric"), # Exclude some types for efficiency + max_processing_time = 120, + verbose = TRUE +) +possible_ids_list From f222a780cb7cebfd2237117a3ce4c0855e525ac8 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 15:00:12 -0400 Subject: [PATCH 08/77] passed all tests --- R/possible_ids.R | 53 +++++++++++++++++++----------- tests/testthat/test-possible_ids.R | 12 ++++--- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 9e18f73e..a65e3ad4 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -55,11 +55,15 @@ possible_ids <- function(dt, exclude_classes = exclude_classes) ## var names -------- - vars <- filter_by_name(vars, include, exclude) + vars <- filter_by_name(vars, include, exclude, verbose) - # Remove duplicate column names... just in case - vars <- unique(vars) + ## no duplicated vars ------------- + if (anyDuplicated(vars)) { + dupvars <- vars[duplicated(vars)] |> + unique() + cli::cli_abort("vars {.field {dupvars}} are duplicated.") + } if (verbose) { cli::cli_alert_info("Variables to test: {.field {vars}}") @@ -79,7 +83,7 @@ possible_ids <- function(dt, unique_counts <- vapply(dt[, ..vars], fnunique, numeric(1)) vars <- vars[order(unique_counts)] unique_counts <- unique_counts[order(unique_counts)] - n_row <- fnrow(dt) + n_rows <- fnrow(dt) init_index <- 0 @@ -88,7 +92,7 @@ possible_ids <- function(dt, possible_ids_list <- vector("list", max_numb_possible_ids) if (min_combination_size == 1) { - unique_ids <- vars[unique_counts == n_row] + unique_ids <- vars[unique_counts == n_rows] # Add individual unique variables init_index <- length(unique_ids) if (init_index > 0) { @@ -111,12 +115,13 @@ possible_ids <- function(dt, # combinations ----------- # Start testing combinations - start_time <- Sys.time() - min_size <- max(min_combination_size, 2) - max_size <- min(length(vars), max_combination_size) + start_time <- Sys.time() + min_size <- max(min_combination_size, 2) + max_size <- min(length(vars), max_combination_size) + elapsed_time <- 0 # where there is only one variable or not enough vars to combine - if (min_size >= max_size) { + if (min_size > max_size) { if (verbose) { cli::cli_alert_warning( "Can't make combinations of {.field {vars}} if the min number of @@ -132,16 +137,21 @@ possible_ids <- function(dt, # Prune combinations where the product of unique counts is less # than n_rows - combos_to_keep <- vapply(combos, \(combo) { - prod(unique_counts[combo]) >= n_rows - }, logical(1)) + combos_to_keep <- vapply(combos, + \(combo) { + prod(unique_counts[combo]) >= n_rows + }, + logical(1)) combos <- combos[combos_to_keep] # Estimate processing time and prune combinations - est_times <- vapply(combos, function(combo) { - estimate_combination_time(n_rows, unique_counts[combo]) - }, numeric(1)) + est_times <- vapply(combos, + \(combo) { + estimate_combination_time(n_rows, + unique_counts[combo]) + }, + numeric(1)) if (verbose) { cli::cli_progress_bar( @@ -155,7 +165,7 @@ possible_ids <- function(dt, if (is_id(dt, by = combo, verbose = FALSE)) { # This is inefficient... it is copying every time... # I need to think better on how to do it. - possible_ids_list[j] <- combo + possible_ids_list[[j]] <- combo j <- init_index + 1 if (j > max_numb_possible_ids) { if (verbose) { @@ -201,10 +211,8 @@ possible_ids <- function(dt, if (verbose) { cli::cli_alert_warning("No unique identifier found.") } - return(NULL) - } else { - return(remove_null(possible_ids_list)) } + return(remove_null(possible_ids_list)) } @@ -225,9 +233,14 @@ filter_by_class <- function(dt, vars, include_classes, exclude_classes) { vars } -filter_by_name <- function(vars, include, exclude) { +filter_by_name <- function(vars, include, exclude, verbose) { # Apply 'exclude' filter if (!is.null(exclude)) { + wno_exc <- which(!exclude %in% vars) # which not excluded + if (length(wno_exc) > 0 & verbose) { + no_exc <- exclude[wno_exc] + cli::cli_alert_warning("var{?s} {.var {no_exc}} not found in dataframe") + } vars <- setdiff(vars, exclude) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index c062b0ae..618ed0de 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -78,9 +78,10 @@ test_that("get length 0", { test_that("Exclude nothing", { + p1 <- possible_ids(x1) + p2 <- possible_ids(x1, exclude = "rer") - expect_warning(possible_ids(x1, - exclude = "rer")) + expect_equal(p1, p2) }) @@ -91,9 +92,10 @@ test_that("Exclude type and variable", { xx4[, id2 := as.character(id2)] dd <- possible_ids(xx4, - exclude = c("_character", "x")) + exclude_classes = c("character"), + exclude = "x") - expect_equal(c("id1", "t"), dd$V1) + expect_equal(c("id1", "t"), unlist(dd)) }) @@ -104,7 +106,7 @@ test_that("Exclude more than one variable", { dd <- possible_ids(x4, exclude = c("id2", "x")) - expect_equal(c("id1", "t"), dd$V1) + expect_equal(c("id1", "t"), unlist(dd)) }) From a9d99d9ca215691979e3789cf6378f1c9b2a7542 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 15:29:25 -0400 Subject: [PATCH 09/77] document --- R/possible_ids.R | 34 +++++++++++++++++++++++++++++++--- man/possible_ids.Rd | 42 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index a65e3ad4..1cf90c4d 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -3,14 +3,42 @@ #' Identify possible variables uniquely identifying x #' #' @param dt data frame -#' @param exclude character: Exclude variables to be selected as identifiers. It -#' could be either the name of the variables of one type of the variable -#' prefixed by "_". For instance, "_numeric" or "_character". +#' @param exclude character: Exclude variables to be selected as identifiers. #' @param include character: Name of variable to be included, that might belong #' to the group excluded in the `exclude` +#' @param exclude_classes character: classes to exclude from analysis (e.g., +#' "numeric", "integer", "date") +#' @param include_classes character: classes to include from analysis (e.g., +#' "numeric", "integer", "date") +#' @param min_combination_size numeric: Min number of combinations. Default is +#' 1, so all combinations. +#' @param max_combination_size numeric. Max number of combinations. Default is +#' 5. If there is a combinations of identifiers larger than +#' `max_combination_size`, they won't be found +#' @param max_processing_time numeric: Max time to process in seconds. After +#' that, it returns what it found. +#' @param max_numb_possible_ids numeric: Max number of possible IDs to find. See +#' details. +#' @param get_all logical: get all possible combinations based on the parameters +#' above. #' @param verbose logical: If FALSE no message will be displayed. Default is #' TRUE #' +#' @section Number of possible IDs: +#' +#' The number of possible IDs in a dataframe could be very large. This is why, +#' `possible_ids()` makes use of heuristics to return something useful without +#' wasting the time of the user. in addition, we provide multiple parameter so +#' that the user can fine tune their search for possible IDs easily and +#' quickly. +#' +#' Say for instance that you have a dataframe with 10 variables. Testing every +#' possible pair of variables will give you 90 possible unique identifiers for +#' this dataframe. If you want to test all the possible IDs, you will have to +#' test more 5000 combinations. If the dataframe has many rows, it may take a +#' while. +#' +#' #' @return list with possible identifiers #' @export #' diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 72490eb2..c0c91bd7 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -21,15 +21,35 @@ possible_ids( \arguments{ \item{dt}{data frame} -\item{exclude}{character: Exclude variables to be selected as identifiers. It -could be either the name of the variables of one type of the variable -prefixed by "_". For instance, "_numeric" or "_character".} +\item{exclude}{character: Exclude variables to be selected as identifiers.} \item{include}{character: Name of variable to be included, that might belong to the group excluded in the \code{exclude}} +\item{exclude_classes}{character: classes to exclude from analysis (e.g., +"numeric", "integer", "date")} + +\item{include_classes}{character: classes to include from analysis (e.g., +"numeric", "integer", "date")} + \item{verbose}{logical: If FALSE no message will be displayed. Default is TRUE} + +\item{min_combination_size}{numeric: Min number of combinations. Default is +1, so all combinations.} + +\item{max_combination_size}{numeric. Max number of combinations. Default is +5. If there is a combinations of identifiers larger than +\code{max_combination_size}, they won't be found} + +\item{max_processing_time}{numeric: Max time to process in seconds. After +that, it returns what it found.} + +\item{max_numb_possible_ids}{numeric: Max number of possible IDs to find. See +details.} + +\item{get_all}{logical: get all possible combinations based on the parameters +above.} } \value{ list with possible identifiers @@ -37,6 +57,22 @@ list with possible identifiers \description{ Identify possible variables uniquely identifying x } +\section{Number of possible IDs}{ + + +The number of possible IDs in a dataframe could be very large. This is why, +\code{possible_ids()} makes use of heuristics to return something useful without +wasting the time of the user. in addition, we provide multiple parameter so +that the user can fine tune their search for possible IDs easily and +quickly. + +Say for instance that you have a dataframe with 10 variables. Testing every +possible pair of variables will give you 90 possible unique identifiers for +this dataframe. If you want to test all the possible IDs, you will have to +test more 5000 combinations. If the dataframe has many rows, it may take a +while. +} + \examples{ library(data.table) x4 = data.table(id1 = c(1, 1, 2, 3, 3), From ab45f0b45289a4a075d19848e2fef4985397e84c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Fri, 20 Sep 2024 15:31:33 -0400 Subject: [PATCH 10/77] fix documentation --- R/possible_ids.R | 4 ++-- man/possible_ids.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 1cf90c4d..983c0221 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -1,6 +1,6 @@ #' Find possible unique identifies of data frame #' -#' Identify possible variables uniquely identifying x +#' Identify possible combinations of variables that uniquely identifying dt #' #' @param dt data frame #' @param exclude character: Exclude variables to be selected as identifiers. @@ -28,7 +28,7 @@ #' #' The number of possible IDs in a dataframe could be very large. This is why, #' `possible_ids()` makes use of heuristics to return something useful without -#' wasting the time of the user. in addition, we provide multiple parameter so +#' wasting the time of the user. In addition, we provide multiple parameter so #' that the user can fine tune their search for possible IDs easily and #' quickly. #' diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index c0c91bd7..05866bb8 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -55,14 +55,14 @@ above.} list with possible identifiers } \description{ -Identify possible variables uniquely identifying x +Identify possible combinations of variables that uniquely identifying dt } \section{Number of possible IDs}{ The number of possible IDs in a dataframe could be very large. This is why, \code{possible_ids()} makes use of heuristics to return something useful without -wasting the time of the user. in addition, we provide multiple parameter so +wasting the time of the user. In addition, we provide multiple parameter so that the user can fine tune their search for possible IDs easily and quickly. From a633ba477f6d2cce67e58d6f3309f355f1615d96 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Mon, 23 Sep 2024 11:48:06 -0400 Subject: [PATCH 11/77] add old function --- R/possible_ids.R | 201 ++++++++++++++++++++++++++++++++++++++++ man/possible_ids_old.Rd | 41 ++++++++ 2 files changed, 242 insertions(+) create mode 100644 man/possible_ids_old.Rd diff --git a/R/possible_ids.R b/R/possible_ids.R index 983c0221..b19eca95 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -293,3 +293,204 @@ remove_null <- \(x) { y <- vapply(x, \(.) !is.null(.), logical(1)) x[y] } + + + +#' Find possible unique identifies of data frame +#' +#' Identify possible variables uniquely identifying x +#' +#' @param dt data frame +#' @param exclude character: Exclude variables to be selected as identifiers. It +#' could be either the name of the variables of one type of the variable +#' prefixed by "_". For instance, "_numeric" or "_character". +#' @param include character: Name of variable to be included, that might belong +#' to the group excluded in the `exclude` +#' @param verbose logical: If FALSE no message will be displayed. Default is +#' TRUE +#' +#' @return list with possible identifiers +#' @keywords internal +#' +#' @examples +#' library(data.table) +#' x4 = data.table(id1 = c(1, 1, 2, 3, 3), +#' id2 = c(1, 1, 2, 3, 4), +#' t = c(1L, 2L, 1L, 2L, NA_integer_), +#' x = c(16, 12, NA, NA, 15)) +#' possible_ids_old(x4) +possible_ids_old <- function(dt, + exclude = NULL, + include = NULL, + verbose = getOption("possible_ids.verbose")) { + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Check inputs --------- + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if (!is.data.frame(dt)) { + stop("data must be a data frame") + } + + if (is.data.table(dt)) { + dt <- as.data.frame(dt) + } + + + if (is.null(exclude) && !is.null(include)) { + if (verbose) { + cli::cli_alert_warning("Since {.code exclude} is NULL, {.code include} + directive does not make sense. Ignored.", + wrap = TRUE) + } + warning("inconsistent use of `include`") + } + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ## exclude variables from check ------ + + vars <- names(dt) + + ### Exclude variable according to their type --------- + if (!is.null(include)) { + + # Find position of variable to include + ii <- which(names(dt) %in% include) + + } else { + + ii <- NULL + + } + + ### Exclude variable by name --------- + if (!is.null(exclude)) { + + if (any(grepl("^_", exclude))) { + + type_ex <- exclude[grepl("^_", exclude)] + vars_ex <- exclude[!grepl("^_", exclude)] + + type_ex <- match.arg(type_ex, c("_character", "_numeric")) + + # find variable that meet criteria and exclude them, making sure to include + # the variables of the user. + ex <- gsub("^_", "", type_ex) + FUN <- paste0("is.", ex) + + n_cols <- unlist(lapply(dt, FUN)) + n_cols[ii] <- FALSE + + # Exclude variables by name + + if (length(vars_ex) > 0) { + ex <-which(names(dt) %in% vars_ex) + n_cols[ex] <- TRUE + } + + vars <- names(dt)[!n_cols] + + } else { + vars <- vars[!(vars %in% exclude)] + + if (identical(vars, names(dt))) { + if (verbose) { + cli::cli_alert_warning("Variable {.field {exclude}} is not available in data frame. + Nothing is excluded.", wrap = TRUE) + } + + warning("inconsistenty use of `exclude`") + + } + + } + } + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ## check all names are unieuq -------- + dup_var <- duplicated(vars) + + if (any(dup_var)) { + + dvars <- vars[dup_var] + + msg <- "column names must be unique" + hint <- "try changing the names using {.fun make.names}" + problem <- "{.var {dvars}} {?is/are} duplicated" + cli::cli_abort(c( + msg, + i = hint, + x = problem + )) + + } + + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Find duplicates --------- + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + duplicates <- is_id(dt, by = vars, verbose = FALSE) + if (duplicates) { + if (verbose) { + cli::cli_alert_success("There are no duplicates in data frame") + } + } else { + if (verbose) { + cli::cli_alert_warning("Data has duplicates. returning NULL") + } + is_id(dt, by = vars, verbose = TRUE) + return(NULL) + } + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Find ids --------- + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + nvars <- length(vars) + + found <- FALSE + i = 0 + while(i < nvars && found == FALSE) { + i = i + 1 + cm <- utils::combn(vars, m = i) + + lcm <- dim(cm)[2] # number of combinations of size j + + selected_vars <- vector(length = lcm) + for (j in 1:lcm) { + tvars <- cm[, j] # testing vars + selected_vars[j] <- is_id(dt, by = tvars, verbose = FALSE) + } + + sv <- which(selected_vars) + + if (length(sv) > 0) { + + if (length(sv) == 1 && i > 1) { + + lv <- list(V1 = cm[, sv]) + + } else if (i == 1) { + + ee <- as.data.frame(t(cm[, sv])) + lv <- lapply(ee, unique) + + } else { + + ee <- as.data.frame(cm[, sv]) + lv <- lapply(ee, unique) + + } + + found <- TRUE + + } + } + + if (verbose) { + cli::cli_alert("we found {length(lv)} possible id{?s}") + } + + return(lv) + +} diff --git a/man/possible_ids_old.Rd b/man/possible_ids_old.Rd new file mode 100644 index 00000000..11043343 --- /dev/null +++ b/man/possible_ids_old.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/possible_ids.R +\name{possible_ids_old} +\alias{possible_ids_old} +\title{Find possible unique identifies of data frame} +\usage{ +possible_ids_old( + dt, + exclude = NULL, + include = NULL, + verbose = getOption("possible_ids.verbose") +) +} +\arguments{ +\item{dt}{data frame} + +\item{exclude}{character: Exclude variables to be selected as identifiers. It +could be either the name of the variables of one type of the variable +prefixed by "_". For instance, "_numeric" or "_character".} + +\item{include}{character: Name of variable to be included, that might belong +to the group excluded in the \code{exclude}} + +\item{verbose}{logical: If FALSE no message will be displayed. Default is +TRUE} +} +\value{ +list with possible identifiers +} +\description{ +Identify possible variables uniquely identifying x +} +\examples{ +library(data.table) +x4 = data.table(id1 = c(1, 1, 2, 3, 3), + id2 = c(1, 1, 2, 3, 4), + t = c(1L, 2L, 1L, 2L, NA_integer_), + x = c(16, 12, NA, NA, 15)) +possible_ids_old(x4) +} +\keyword{internal} From b3fe0987f88d09fe73d28bbd2b47b9791b7a108c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Tue, 24 Sep 2024 13:58:08 -0400 Subject: [PATCH 12/77] small changes --- R/possible_ids.R | 7 ++++++- tests/testthat/test-possible_ids.R | 9 +++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index b19eca95..7f38d2ac 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -50,6 +50,7 @@ #' x = c(16, 12, NA, NA, 15)) #' possible_ids(x4) possible_ids <- function(dt, + vars = NULL, exclude = NULL, include = NULL, exclude_classes = NULL, @@ -72,7 +73,11 @@ possible_ids <- function(dt, } # Get all variable names - vars <- names(dt) |> copy() + if (is.null(vars)) { + vars <- names(dt) |> copy() + } else { + # check that `vars` are in dt + } # Exclude and include ------- diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 618ed0de..63d1cc3f 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -179,13 +179,18 @@ dt <- copy(dt_large) possible_ids( dt = dt_large, - exclude_types = c("numeric"), verbose = TRUE ) possible_ids( dt = dt_large, exclude_types = c("numeric"), + verbose = TRUE +) + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), exclude = "id", verbose = TRUE ) @@ -193,7 +198,7 @@ possible_ids( uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) pids <- possible_ids( dt = dt_large, - exclude_types = c("logical", "date", "datetime", "numeric"), + exclude_classes = c("logical", "date", "datetime", "numeric"), exclude = "id", include = uniq_vars, verbose = TRUE, From 8992048734b1da79705d068a090e4be2363f96cb Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 14:58:46 -0400 Subject: [PATCH 13/77] small changes --- R/possible_ids.R | 8 +++++++- tests/testthat/test-possible_ids.R | 10 +++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 983c0221..e3564a33 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -50,6 +50,7 @@ #' x = c(16, 12, NA, NA, 15)) #' possible_ids(x4) possible_ids <- function(dt, + vars = NULL exclude = NULL, include = NULL, exclude_classes = NULL, @@ -72,7 +73,12 @@ possible_ids <- function(dt, } # Get all variable names - vars <- names(dt) |> copy() + #vars <- names(dt) |> copy() + if (is.null(vars)) { + vars <- names(dt) |> copy() + } else { + # check that `vars` are in dt + } # Exclude and include ------- diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 618ed0de..5ca88061 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -179,13 +179,13 @@ dt <- copy(dt_large) possible_ids( dt = dt_large, - exclude_types = c("numeric"), + exclude_classes = c("numeric"), verbose = TRUE ) possible_ids( dt = dt_large, - exclude_types = c("numeric"), + exclude_classes = c("numeric"), exclude = "id", verbose = TRUE ) @@ -193,7 +193,7 @@ possible_ids( uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) pids <- possible_ids( dt = dt_large, - exclude_types = c("logical", "date", "datetime", "numeric"), + exclude_classes = c("logical", "date", "datetime", "numeric"), exclude = "id", include = uniq_vars, verbose = TRUE, @@ -213,14 +213,14 @@ dt_large[, id := NULL] possible_ids_list <- possible_ids( dt = dt_large, - exclude_types = c("logical", "date", "datetime"), # Exclude some types for efficiency + exclude_classes = c("logical", "date", "datetime"), # Exclude some types for efficiency verbose = TRUE ) possible_ids_list possible_ids_list <- possible_ids( dt = dt_large, - exclude_types = c("logical", "date", "datetime", "numeric"), # Exclude some types for efficiency + exclude_classes = c("logical", "date", "datetime", "numeric"), # Exclude some types for efficiency max_processing_time = 120, verbose = TRUE ) From a61102cfe6a31532ef2a1b41730ef14b0e9c1d00 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 15:53:05 -0400 Subject: [PATCH 14/77] add vars argument --- R/possible_ids.R | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index e3564a33..c7d30198 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -50,18 +50,18 @@ #' x = c(16, 12, NA, NA, 15)) #' possible_ids(x4) possible_ids <- function(dt, - vars = NULL - exclude = NULL, - include = NULL, - exclude_classes = NULL, - include_classes = NULL, - verbose = getOption("possible_ids.verbose", - default = FALSE), - min_combination_size = 1, - max_combination_size = 5, - max_processing_time = 60, # in seconds - max_numb_possible_ids = 100, - get_all = FALSE) { + vars = NULL, + exclude = NULL, + include = NULL, + exclude_classes = NULL, + include_classes = NULL, + verbose = getOption("possible_ids.verbose", + default = FALSE), + min_combination_size = 1, + max_combination_size = 5, + max_processing_time = 60, # in seconds + max_numb_possible_ids = 100, + get_all = FALSE) { # defenses --------- # Ensure dt is a data.table @@ -77,7 +77,11 @@ possible_ids <- function(dt, if (is.null(vars)) { vars <- names(dt) |> copy() } else { - # check that `vars` are in dt + missing_vars <- setdiff(vars, names(dt)) + if (length(missing_vars) > 0) { + cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") + } + } # Exclude and include ------- From 0ee172bc3811157398612857b7ad39216ba14e65 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 15:55:48 -0400 Subject: [PATCH 15/77] fix documentation --- R/possible_ids.R | 1 + man/merge.Rd | 2 +- man/possible_ids.Rd | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index c7d30198..84a1fe12 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -3,6 +3,7 @@ #' Identify possible combinations of variables that uniquely identifying dt #' #' @param dt data frame +#' @param vars character: A vector of variable names to consider for identifying unique combinations. If `NULL` (default), all variables in the data frame are considered. #' @param exclude character: Exclude variables to be selected as identifiers. #' @param include character: Name of variable to be included, that might belong #' to the group excluded in the `exclude` diff --git a/man/merge.Rd b/man/merge.Rd index 5107d64d..7f748784 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -45,7 +45,7 @@ data from both \code{x} and \code{y} are included in the output.} \item{sort}{logical. If \code{TRUE} (default), the rows of the merged \code{data.table} are sorted by setting the key to the \code{by / by.x} columns. If \code{FALSE}, unlike base R's \code{merge} for which row order is unspecified, the -row order in \code{x} is retained (including retaining the position of missing entries when +row order in \code{x} is retained (including retaining the position of missings when \code{all.x=TRUE}), followed by \code{y} rows that don't match \code{x} (when \code{all.y=TRUE}) retaining the order those appear in \code{y}.} diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 05866bb8..09c1f9a5 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -6,6 +6,7 @@ \usage{ possible_ids( dt, + vars = NULL, exclude = NULL, include = NULL, exclude_classes = NULL, @@ -21,6 +22,8 @@ possible_ids( \arguments{ \item{dt}{data frame} +\item{vars}{character: A vector of variable names to consider for identifying unique combinations. If \code{NULL} (default), all variables in the data frame are considered.} + \item{exclude}{character: Exclude variables to be selected as identifiers.} \item{include}{character: Name of variable to be included, that might belong From c6a8ab779858e6f029aa64854928559d54b6ad41 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 16:07:35 -0400 Subject: [PATCH 16/77] fix msg --- R/possible_ids.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 84a1fe12..a1970db1 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -101,11 +101,11 @@ possible_ids <- function(dt, if (anyDuplicated(vars)) { dupvars <- vars[duplicated(vars)] |> unique() - cli::cli_abort("vars {.field {dupvars}} are duplicated.") + cli::cli_abort("vars {.strongVar {dupvars}} are duplicated.") } if (verbose) { - cli::cli_alert_info("Variables to test: {.field {vars}}") + cli::cli_alert_info("Variables to test: {.strongVar {vars}}") } if (length(vars) == 0) { From 2cb136298d5488cb09026a7a35e3ae6eaa62e881 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 16:49:39 -0400 Subject: [PATCH 17/77] fix error with single var --- R/possible_ids.R | 4 ++++ tests/testthat/test-possible_ids.R | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/R/possible_ids.R b/R/possible_ids.R index a1970db1..5875aeda 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -83,6 +83,10 @@ possible_ids <- function(dt, cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") } + if (length(vars) < 2) { + cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") + } + } # Exclude and include ------- diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 5ca88061..a458210a 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -51,6 +51,14 @@ test_that("error if not dataframe", { }) +test_that("vars provided by user", { + + # single var -raise error + possible_ids(x4, + vars = c("t")) |> + expect_error() + +}) test_that("inconsistent use of `include`", { expect_warning(possible_ids(x1, From 451b0256749e7104b811040834c4a3cfcbc348c0 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 17:15:46 -0400 Subject: [PATCH 18/77] adding tests --- R/possible_ids.R | 1 - tests/testthat/test-possible_ids.R | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 5875aeda..b5605ac6 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -130,7 +130,6 @@ possible_ids <- function(dt, init_index <- 0 - # Initialize list to store possible IDs possible_ids_list <- vector("list", max_numb_possible_ids) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index a458210a..516e30ac 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -58,6 +58,35 @@ test_that("vars provided by user", { vars = c("t")) |> expect_error() + # one or more vars not included in dt + possible_ids(x4, + vars = c("id3", "id2")) |> + expect_error() + + possible_ids(dt, + vars = c("id", "d3", "id2")) + + possible_ids(x4, + vars = c("id3", "id4")) |> + expect_error() + + # check only combs with vars in vars are returned + all_ids_dt <- possible_ids(dt) + + use_ids_dt <- possible_ids(dt, + vars = c("id", "numeric_double_1", "numeric_double_2")) + + + + + + + + + + + + }) test_that("inconsistent use of `include`", { From 35bd6e768676f3187e14b0656e7af39a13464ad7 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 24 Sep 2024 18:35:39 -0400 Subject: [PATCH 19/77] adding tests --- tests/testthat/test-possible_ids.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 516e30ac..88dd8e2d 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -70,21 +70,17 @@ test_that("vars provided by user", { vars = c("id3", "id4")) |> expect_error() - # check only combs with vars in vars are returned + # check combs with vars in vars are returned all_ids_dt <- possible_ids(dt) use_ids_dt <- possible_ids(dt, vars = c("id", "numeric_double_1", "numeric_double_2")) + all(sapply(use_ids_dt, + function(x) { x %in% all_ids_dt })) |> + expect_equal(TRUE) - - - - - - - - + # no errors raised if vars in dt }) From fc298648070b39d45bf7ff92b5697720fd9c66f2 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 25 Sep 2024 11:59:30 -0400 Subject: [PATCH 20/77] adding test/identify probs --- tests/testthat/test-possible_ids.R | 50 +++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 88dd8e2d..80106fd6 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -64,24 +64,46 @@ test_that("vars provided by user", { expect_error() possible_ids(dt, - vars = c("id", "d3", "id2")) - - possible_ids(x4, - vars = c("id3", "id4")) |> + vars = c("id", "id3", "id2")) |> expect_error() - # check combs with vars in vars are returned - all_ids_dt <- possible_ids(dt) + + ids_dt <- possible_ids(dt, get) + + vars = c("id", "numeric_double_1", "numeric_double_2") use_ids_dt <- possible_ids(dt, - vars = c("id", "numeric_double_1", "numeric_double_2")) + vars = vars) all(sapply(use_ids_dt, - function(x) { x %in% all_ids_dt })) |> + function(x) { x %in% ids_dt })) |> + expect_equal(TRUE) + + all(unlist(use_ids_dt) %in% vars) |> expect_equal(TRUE) # no errors raised if vars in dt + # Check if the combination of unique_id1, unique_id2, and unique_id3 uniquely identifies rows + vars <- c("id", "unique_id2", "unique_id3") + + res <- dt_large[, .N, + by = vars][N > 1] |> + nrow() + + # NOTE (RT) --> this test should be correct, but fails because of an issue with the function (?) +# +# if (res == 0) { # if no duplicate rows +# possible_ids(dt, +# vars = vars, +# get_all = TRUE) |> +# unlist() |> +# expect_equal(vars) +# } + + # check that returned ids only include vars in vars + ids <- + }) test_that("inconsistent use of `include`", { @@ -152,7 +174,7 @@ test_that("duplicated names", { }) -# Big data -------------------- +# Auxiliary data: Big data table-------------------- # Set seed for reproducibility set.seed(123) @@ -164,7 +186,7 @@ n_vars <- 50 # Total variables # Initialize an empty data.table dt_large <- data.table(id = 1:n_rows) -# Manually create three variables that uniquely identify the data +## Manually create three variables that uniquely identify the data #### dt_large[, unique_id1 := rep(1:10, each = 1000)] # 1000 unique values repeated 100 times dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)] # Random character variable dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)] # Random integer @@ -198,7 +220,7 @@ for (i in seq_along(var_types)) { } } -# Introduce duplicates in some columns that are NOT the unique identifiers +## Introduce duplicates in some columns that are NOT the unique identifiers #### # For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns # dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)]) @@ -226,9 +248,9 @@ possible_ids( uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) pids <- possible_ids( dt = dt_large, - exclude_classes = c("logical", "date", "datetime", "numeric"), + #exclude_classes = c("logical", "date", "datetime", "numeric"), exclude = "id", - include = uniq_vars, + #vars = uniq_vars, verbose = TRUE, min_combination_size = 3, # max_combination_size = 3, @@ -241,7 +263,7 @@ possible_ids( verbose = TRUE ) -# Remove the 'id' column to simulate data without a clear unique identifier +## Remove the 'id' column to simulate data without a clear unique identifier #### dt_large[, id := NULL] possible_ids_list <- possible_ids( From 694aae4926583fed98d30c3fe4f05ba5b76e8e0d Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 25 Sep 2024 14:48:40 -0400 Subject: [PATCH 21/77] fix relationship between include and vars --- R/possible_ids.R | 12 ++++++++++- tests/testthat/test-possible_ids.R | 32 +++++++++++++++++------------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index b5605ac6..f48b6216 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -76,7 +76,17 @@ possible_ids <- function(dt, # Get all variable names #vars <- names(dt) |> copy() if (is.null(vars)) { - vars <- names(dt) |> copy() + vars <- names(dt) |> + copy() + + # 2 options: + # 1. If include is not null raise an error -user must provide either include or vars + # 2. Remove from include vars in vars: + if (!is.null(include)){ + include <- setdiff(include, + vars) + } + } else { missing_vars <- setdiff(vars, names(dt)) if (length(missing_vars) > 0) { diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 80106fd6..37f615b4 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -68,9 +68,9 @@ test_that("vars provided by user", { expect_error() - ids_dt <- possible_ids(dt, get) + ids_dt <- possible_ids(dt) - vars = c("id", "numeric_double_1", "numeric_double_2") + vars <- c("id", "numeric_double_1", "numeric_double_2") use_ids_dt <- possible_ids(dt, vars = vars) @@ -82,7 +82,6 @@ test_that("vars provided by user", { all(unlist(use_ids_dt) %in% vars) |> expect_equal(TRUE) - # no errors raised if vars in dt # Check if the combination of unique_id1, unique_id2, and unique_id3 uniquely identifies rows vars <- c("id", "unique_id2", "unique_id3") @@ -91,26 +90,31 @@ test_that("vars provided by user", { by = vars][N > 1] |> nrow() - # NOTE (RT) --> this test should be correct, but fails because of an issue with the function (?) -# -# if (res == 0) { # if no duplicate rows -# possible_ids(dt, -# vars = vars, -# get_all = TRUE) |> -# unlist() |> -# expect_equal(vars) -# } + # --if it does, possible_ids should return those vars - # check that returned ids only include vars in vars - ids <- + # NOTE (RT) --> this test should be correct, but fails --> find issue in function (?) + if (res == 0) { # if no duplicate rows + possible_ids(dt, + vars = vars, + get_all = TRUE) |> + unlist() |> + expect_equal(vars) + } }) + test_that("inconsistent use of `include`", { expect_warning(possible_ids(x1, include = "x")) + # this was failing due to wrong relationship between vars and include + possible_ids(x1, + include = c("id", "x")) |> + expect_no_error() + + }) test_that("exclude and include", { From 8a5775fd9518710e807484f3a9132e5fd2092da6 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 25 Sep 2024 18:02:34 -0400 Subject: [PATCH 22/77] small adds --- tests/testthat/test-possible_ids.R | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 37f615b4..d5a47daa 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -114,7 +114,6 @@ test_that("inconsistent use of `include`", { include = c("id", "x")) |> expect_no_error() - }) test_that("exclude and include", { @@ -122,7 +121,27 @@ test_that("exclude and include", { dd <- possible_ids(x3, exclude_classes = c("numeric", "integer"), include = "foo") - expect_equal(unlist(dd), c("id", "foo")) + + expect_equal(unlist(dd), + c("id", "foo")) + + # + + # possible_ids(dt, + # #include_classes = c("integer"), + # exclude = c(paste0("numeric_double", 1:8))) |> unlist() + + # example excluding integers + + possible_ids(dt, + exclude = c("numeric_double_7", "numeric_double_8")) + + # to complete + + possible_ids(dt, + exclude_classes = "numeric") |> unlist() + + }) From 2faa4a81b05cad596e146dbe83d6ba70d58a0a39 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 26 Sep 2024 11:04:49 -0400 Subject: [PATCH 23/77] add more tests and fix issue with vars-include --- R/possible_ids.R | 20 +++++++++++--- tests/testthat/test-possible_ids.R | 43 ++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index f48b6216..30c6bf27 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -58,6 +58,7 @@ possible_ids <- function(dt, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), + return_checked_vars = FALSE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, # in seconds @@ -82,10 +83,17 @@ possible_ids <- function(dt, # 2 options: # 1. If include is not null raise an error -user must provide either include or vars # 2. Remove from include vars in vars: + # if (!is.null(include)){ + # include <- setdiff(include, + # vars) + # } + if (!is.null(include)){ - include <- setdiff(include, - vars) - } + vars <- setdiff(vars, + include) + } + + } else { missing_vars <- setdiff(vars, names(dt)) @@ -110,6 +118,10 @@ possible_ids <- function(dt, ## var names -------- vars <- filter_by_name(vars, include, exclude, verbose) + # return vars to check #### + if (return_checked_vars) { + return(vars) + } ## no duplicated vars ------------- if (anyDuplicated(vars)) { @@ -264,6 +276,8 @@ possible_ids <- function(dt, cli::cli_alert_warning("No unique identifier found.") } } + + # add option to return checked vars: return(remove_null(possible_ids_list)) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index d5a47daa..4cc3d8e0 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -109,7 +109,6 @@ test_that("inconsistent use of `include`", { expect_warning(possible_ids(x1, include = "x")) - # this was failing due to wrong relationship between vars and include possible_ids(x1, include = c("id", "x")) |> expect_no_error() @@ -125,21 +124,43 @@ test_that("exclude and include", { expect_equal(unlist(dd), c("id", "foo")) - # + ## Test combination between include class and exclude vars #### - # possible_ids(dt, - # #include_classes = c("integer"), - # exclude = c(paste0("numeric_double", 1:8))) |> unlist() + checked_vars <- possible_ids(dt, + return_checked_vars = TRUE, + include_classes = c("integer"), + exclude = paste0("numeric_int_", 1:5)) - # example excluding integers + any( + paste0("numeric_int_", 1:5) %in% checked_vars + ) |> + expect_equal(FALSE) - possible_ids(dt, - exclude = c("numeric_double_7", "numeric_double_8")) + all( + paste0("numeric_int_", 6:10) %in% checked_vars + ) |> + expect_equal(TRUE) - # to complete + ## Test combination between include vars and exclude class #### + checked_vars <- possible_ids(dt, + return_checked_vars = TRUE, + include = c("numeric_double_1", + "numeric_double_2"), + exclude_classes = "numeric") - possible_ids(dt, - exclude_classes = "numeric") |> unlist() + all( + paste0("numeric_double_", 1:2) %in% checked_vars + ) |> + expect_equal(TRUE) + + any( + paste0("numeric_double_", 3:10) %in% checked_vars + ) |> + expect_equal(FALSE) + + checked_vars <- possible_ids(x2, + include = "x", + exclude_classes = "numeric") From 1dddea58864af7d2b98b69e76b6d5b5d8f591c08 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 26 Sep 2024 11:35:55 -0400 Subject: [PATCH 24/77] more tests --- tests/testthat/test-possible_ids.R | 31 +++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 4cc3d8e0..a110cfb5 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -158,10 +158,39 @@ test_that("exclude and include", { ) |> expect_equal(FALSE) - checked_vars <- possible_ids(x2, + res_ids <- possible_ids(x2, include = "x", exclude_classes = "numeric") + res_ids |> + unlist() |> + expect_equal(c("t", "x")) + + res_ids <- possible_ids(x3, + include = "id", + exclude_classes = "character") + + res_ids |> + unlist() |> + expect_equal(c("foo", "id")) + + # alert if include and exclude same class #### + possible_ids(dt, + include_classes = "numeric", + exclude_classes = "numeric") |> + expect_message() + + # alert if include and exclude same vars #### + possible_ids(dt, + include = c("id", "unique_id1"), + exclude = c("id", "unique_id1")) |> + expect_message() + + + + + + }) From e086a07e8ea6c01cdcbfdc2449d1f6a802ff4a86 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 26 Sep 2024 14:25:47 -0400 Subject: [PATCH 25/77] small fix --- R/possible_ids.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 30c6bf27..59a669eb 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -58,7 +58,7 @@ possible_ids <- function(dt, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), - return_checked_vars = FALSE, + return_checked_vars = TRUE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, # in seconds @@ -86,12 +86,12 @@ possible_ids <- function(dt, # if (!is.null(include)){ # include <- setdiff(include, # vars) - # } - - if (!is.null(include)){ - vars <- setdiff(vars, - include) - } + # # } + # + # if (!is.null(include)){ + # vars <- setdiff(vars, + # include) + # } From 3d5c28b18ed4c06c868fa3ea3ea09ba1f5e43dfb Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 26 Sep 2024 18:59:31 -0400 Subject: [PATCH 26/77] try again fix vars arg --- R/possible_ids.R | 52 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 59a669eb..8a40fd0a 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -76,9 +76,37 @@ possible_ids <- function(dt, # Get all variable names #vars <- names(dt) |> copy() - if (is.null(vars)) { + + + # Vars argument attempt one ------- + + if (!(is.null(vars))) { #when user provdes vars + + # exclude should not be used + if (!(is.null(exclude) & is.null(exclude_classes))) { + exclude <- NULL + exclude_classes <- NULL + cli::cli_alert_danger("Args {.strongArg `exclude`} and {.strongArg `exclude_classes`} not available when using {.strongArg `vars`}") + } + + # include + if (!is.null(include)) { + vars <- funique(c(vars, setdiff(include, vars))) + } + } else { vars <- names(dt) |> copy() + } + + + + + + + # OLD VERSION ##### + # if (is.null(vars)) { + # vars <- names(dt) |> + # copy() # 2 options: # 1. If include is not null raise an error -user must provide either include or vars @@ -95,17 +123,17 @@ possible_ids <- function(dt, - } else { - missing_vars <- setdiff(vars, names(dt)) - if (length(missing_vars) > 0) { - cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") - } - - if (length(vars) < 2) { - cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") - } - - } + # } else { + # missing_vars <- setdiff(vars, names(dt)) + # if (length(missing_vars) > 0) { + # cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") + # } + # + # if (length(vars) < 2) { + # cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") + # } + # + # } # Exclude and include ------- From 8ffdd2e330f8fd37cd0ad720609cd15d4312834b Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 27 Sep 2024 09:34:44 -0400 Subject: [PATCH 27/77] fix filter by name -duplicates issue --- R/possible_ids.R | 69 ++++++++++++++++++++++-------- tests/testthat/test-possible_ids.R | 19 ++++++++ 2 files changed, 69 insertions(+), 19 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 8a40fd0a..2c5a7777 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -58,7 +58,7 @@ possible_ids <- function(dt, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), - return_checked_vars = TRUE, + return_checked_vars = FALSE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, # in seconds @@ -80,25 +80,52 @@ possible_ids <- function(dt, # Vars argument attempt one ------- - if (!(is.null(vars))) { #when user provdes vars + # if (!(is.null(vars))) { #when user provides vars + # + # # exclude should not be used + # if (!(is.null(exclude) & is.null(exclude_classes))) { + # exclude <- NULL + # exclude_classes <- NULL + # cli::cli_alert_danger("Args {.strongArg `exclude`} and {.strongArg `exclude_classes`} not available when using {.strongArg `vars`}") + # } + # + # # include + # if (!is.null(include)) { + # vars <- funique(c(vars, setdiff(include, vars))) + # } + # } else { + # vars <- names(dt) |> + # copy() + # } - # exclude should not be used - if (!(is.null(exclude) & is.null(exclude_classes))) { - exclude <- NULL - exclude_classes <- NULL - cli::cli_alert_danger("Args {.strongArg `exclude`} and {.strongArg `exclude_classes`} not available when using {.strongArg `vars`}") - } + # Vars arg attempt two -------- - # include - if (!is.null(include)) { - vars <- funique(c(vars, setdiff(include, vars))) + if (is.null(vars)) { + vars <- names(dt) |> + copy() + } else { + # check if all vars are in dt + missing_vars <- setdiff(vars, names(dt)) + + if (length(missing_vars) > 0) { + cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") } - } else { - vars <- names(dt) |> - copy() - } + # check at least 2 vars are provided + + if (length(vars) < 2) { + cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") + } + } + + # to fix duplicates with include + + # if (!is.null(include)){ + # vars <- funique(c(vars, + # setdiff(include, vars))) + # + # } @@ -146,10 +173,12 @@ possible_ids <- function(dt, ## var names -------- vars <- filter_by_name(vars, include, exclude, verbose) + ## NOTE: DUPLICATES ORIGINATE HERE -IN FILTER BY NAME !! + # return vars to check #### - if (return_checked_vars) { - return(vars) - } + # if (return_checked_vars) { + # return(vars) + # } ## no duplicated vars ------------- if (anyDuplicated(vars)) { @@ -335,7 +364,9 @@ filter_by_name <- function(vars, include, exclude, verbose) { no_exc <- exclude[wno_exc] cli::cli_alert_warning("var{?s} {.var {no_exc}} not found in dataframe") } - vars <- setdiff(vars, exclude) + #vars <- setdiff(vars, exclude) + vars <- c(vars, + setdiff(include, vars)) } # Apply 'include' filter diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index a110cfb5..c8931c84 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -67,6 +67,10 @@ test_that("vars provided by user", { vars = c("id", "id3", "id2")) |> expect_error() + possible_ids(dt, + vars = c("id", "numeric_int_1", "character_1"), + verbose = TRUE) |> + expect_no_error() ids_dt <- possible_ids(dt) @@ -104,6 +108,21 @@ test_that("vars provided by user", { }) +test_that("relationship include and vars", { + + possible_ids(x4, + vars = c("id1", "id2"), + include = c("t")) |> + expect_no_error() + + +}) + +test_that("relationship exclude and vars") { + + # add tests +} + test_that("inconsistent use of `include`", { expect_warning(possible_ids(x1, From 1bf176a5d29a8943c78d3ccfa65451bfcac45bae Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 27 Sep 2024 09:45:53 -0400 Subject: [PATCH 28/77] fix again filtering and add tests --- R/possible_ids.R | 4 +++- tests/testthat/test-possible_ids.R | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 2c5a7777..86974ef5 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -370,7 +370,9 @@ filter_by_name <- function(vars, include, exclude, verbose) { } # Apply 'include' filter - c(vars, include) + + c(vars, + setdiff(include, vars)) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index c8931c84..40a9dc12 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -100,8 +100,7 @@ test_that("vars provided by user", { if (res == 0) { # if no duplicate rows possible_ids(dt, - vars = vars, - get_all = TRUE) |> + vars = vars) |> unlist() |> expect_equal(vars) } @@ -115,6 +114,20 @@ test_that("relationship include and vars", { include = c("t")) |> expect_no_error() + possible_ids(dt, + vars = c("id", "unique_id2"), + include = c("id", "factor_1", "factor_2")) |> + expect_no_error() + + possible_ids(x4, + vars = NULL, + include = c("t", "x")) |> + expect_no_error() + + possible_ids(x4, + vars = c("t", "x"), + include = NULL) + }) From be8f101a8b48a7196e807201ea2d588fbf349309 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 27 Sep 2024 10:43:29 -0400 Subject: [PATCH 29/77] add checked vars attempt one --- R/possible_ids.R | 25 ++++++++++++++++++++++--- tests/testthat/test-possible_ids.R | 15 ++++++++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 86974ef5..1c94f451 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -58,7 +58,7 @@ possible_ids <- function(dt, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), - return_checked_vars = FALSE, + store_checked_vars = TRUE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, # in seconds @@ -117,6 +117,13 @@ possible_ids <- function(dt, cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") } + # exclude should not be used + if (!(is.null(exclude) & is.null(exclude_classes))) { + exclude <- NULL + exclude_classes <- NULL + cli::cli_alert_danger("Args {.strongArg `exclude`} and {.strongArg `exclude_classes`} not available when using {.strongArg `vars`}") + } + } # to fix duplicates with include @@ -334,8 +341,20 @@ possible_ids <- function(dt, } } - # add option to return checked vars: - return(remove_null(possible_ids_list)) + ret_list <- remove_null(possible_ids_list) + + if (store_checked_vars == TRUE) { + # store checked vars in env and has an attribute + rlang::env_poke(env = .joynenv, + nm = "possible_ids", + value = vars) + + #add vars as an attribute to the list remove_null(possible_ids_list) + attr(ret_list, "checked_vars") <- vars + } + + #return(remove_null(possible_ids_list)) + return(ret_list) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 40a9dc12..83c63224 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -133,13 +133,22 @@ test_that("relationship include and vars", { test_that("relationship exclude and vars") { - # add tests + possible_ids(x4, + vars = c("t", "x"), + exclude_classes = "character") |> + expect_message() + + possible_ids(x4, + vars = c("id1", "x"), + exclude = "x") |> + expect_message() } test_that("inconsistent use of `include`", { - expect_warning(possible_ids(x1, - include = "x")) + # why is this an inconsistent use of include? + # expect_warning(possible_ids(x1, + # include = "x")) possible_ids(x1, include = c("id", "x")) |> From 4db75b876a82100850b31ff5342e9965d63bfcfe Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Mon, 30 Sep 2024 11:27:08 -0400 Subject: [PATCH 30/77] fixes --- R/possible_ids.R | 110 +++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 75 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 1c94f451..7ab8ca3c 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -3,8 +3,7 @@ #' Identify possible combinations of variables that uniquely identifying dt #' #' @param dt data frame -#' @param vars character: A vector of variable names to consider for identifying unique combinations. If `NULL` (default), all variables in the data frame are considered. -#' @param exclude character: Exclude variables to be selected as identifiers. +#' @param vars character: A vector of variable names to consider for identifying unique combinations. #' @param include character: Name of variable to be included, that might belong #' to the group excluded in the `exclude` #' @param exclude_classes character: classes to exclude from analysis (e.g., @@ -77,27 +76,6 @@ possible_ids <- function(dt, # Get all variable names #vars <- names(dt) |> copy() - - # Vars argument attempt one ------- - - # if (!(is.null(vars))) { #when user provides vars - # - # # exclude should not be used - # if (!(is.null(exclude) & is.null(exclude_classes))) { - # exclude <- NULL - # exclude_classes <- NULL - # cli::cli_alert_danger("Args {.strongArg `exclude`} and {.strongArg `exclude_classes`} not available when using {.strongArg `vars`}") - # } - # - # # include - # if (!is.null(include)) { - # vars <- funique(c(vars, setdiff(include, vars))) - # } - # } else { - # vars <- names(dt) |> - # copy() - # } - # Vars arg attempt two -------- if (is.null(vars)) { @@ -126,49 +104,6 @@ possible_ids <- function(dt, } - # to fix duplicates with include - - # if (!is.null(include)){ - # vars <- funique(c(vars, - # setdiff(include, vars))) - # - # } - - - - - # OLD VERSION ##### - # if (is.null(vars)) { - # vars <- names(dt) |> - # copy() - - # 2 options: - # 1. If include is not null raise an error -user must provide either include or vars - # 2. Remove from include vars in vars: - # if (!is.null(include)){ - # include <- setdiff(include, - # vars) - # # } - # - # if (!is.null(include)){ - # vars <- setdiff(vars, - # include) - # } - - - - # } else { - # missing_vars <- setdiff(vars, names(dt)) - # if (length(missing_vars) > 0) { - # cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") - # } - # - # if (length(vars) < 2) { - # cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") - # } - # - # } - # Exclude and include ------- ## classes ---------- @@ -240,6 +175,17 @@ possible_ids <- function(dt, } } + print(vars) + # add checked vars to .joynenv + if (store_checked_vars == TRUE) { + # store in .joynenv + rlang::env_poke(env = .joynenv, + nm = "checked_ids", + value = vars) + + } + + # combinations ----------- # Start testing combinations @@ -258,6 +204,8 @@ possible_ids <- function(dt, return(remove_null(possible_ids_list)) } + print(vars) + j <- init_index + 1 for (comb_size in min_size:max_size) { @@ -329,6 +277,9 @@ possible_ids <- function(dt, } if (verbose) cli::cli_progress_update() } + + # charcare(0) + print(vars) # Break if all variables are used if (length(vars) == 0 || elapsed_time > max_processing_time) { break @@ -341,19 +292,28 @@ possible_ids <- function(dt, } } + + ret_list <- remove_null(possible_ids_list) + # setattrib("checked_vars" = vars) - if (store_checked_vars == TRUE) { - # store checked vars in env and has an attribute - rlang::env_poke(env = .joynenv, - nm = "possible_ids", - value = vars) + #if (store_checked_vars == TRUE) { - #add vars as an attribute to the list remove_null(possible_ids_list) - attr(ret_list, "checked_vars") <- vars - } + # print(.joynenv) + # print(vars) + # # store checked vars in env and has an attribute + # rlang::env_poke(env = .joynenv, + # possible_ids = vars) + # #value = vars) + # + # #add vars as an attribute to the list remove_null(possible_ids_list) + # attr(ret_list, "checked_vars") <- vars + # #} + + # setattr(x = ret_list, + # name = "checked_vars", + # value = vars) - #return(remove_null(possible_ids_list)) return(ret_list) } From 151d7f43e93806331c6cdaca4dd741333dda7f12 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Mon, 30 Sep 2024 17:32:48 -0400 Subject: [PATCH 31/77] fix issue with storing in joynenv and adding attribute of checked_ids --- R/possible_ids.R | 38 ++++++++++++++++++++---------- tests/testthat/test-possible_ids.R | 1 + 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 7ab8ca3c..cf7ef429 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -76,7 +76,7 @@ possible_ids <- function(dt, # Get all variable names #vars <- names(dt) |> copy() - # Vars arg attempt two -------- + # Vars -------- if (is.null(vars)) { vars <- names(dt) |> @@ -141,6 +141,9 @@ possible_ids <- function(dt, return(NULL) # should this be an error? } + print(vars) + + # Unique values --------- # Sort variables by number of unique values (ascending order) @@ -154,6 +157,19 @@ possible_ids <- function(dt, # Initialize list to store possible IDs possible_ids_list <- vector("list", max_numb_possible_ids) + # add checked vars to .joynenv + if (store_checked_vars == TRUE) { + # store in .joynenv + rlang::env_poke(env = .joynenv, + nm = "checked_ids", + value = vars) + + # add attribute + attr(possible_ids_list, "checked_ids") <- vars + } + + #print(attributes(possible_ids_list)) + if (min_combination_size == 1) { unique_ids <- vars[unique_counts == n_rows] # Add individual unique variables @@ -175,15 +191,15 @@ possible_ids <- function(dt, } } - print(vars) - # add checked vars to .joynenv - if (store_checked_vars == TRUE) { - # store in .joynenv - rlang::env_poke(env = .joynenv, - nm = "checked_ids", - value = vars) - - } + # print(vars) + # # add checked vars to .joynenv + # if (store_checked_vars == TRUE) { + # # store in .joynenv + # rlang::env_poke(env = .joynenv, + # nm = "checked_ids", + # value = vars) + # + # } # combinations ----------- @@ -204,8 +220,6 @@ possible_ids <- function(dt, return(remove_null(possible_ids_list)) } - print(vars) - j <- init_index + 1 for (comb_size in min_size:max_size) { diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 83c63224..6e6208d8 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -346,6 +346,7 @@ dt_large <- dt_large[sample(.N)] # dt_large[, id := .I] dt <- copy(dt_large) + possible_ids( dt = dt_large, exclude_classes = c("numeric"), From a6fee8bb8d5b2734b5afd5cdd1b101fe3e3d7083 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 1 Oct 2024 14:42:15 -0400 Subject: [PATCH 32/77] fix get_all error --- R/possible_ids.R | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index cf7ef429..f772a25f 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -141,7 +141,7 @@ possible_ids <- function(dt, return(NULL) # should this be an error? } - print(vars) + #print(vars) # Unique values --------- @@ -163,12 +163,13 @@ possible_ids <- function(dt, rlang::env_poke(env = .joynenv, nm = "checked_ids", value = vars) + print(.joynenv$checked_ids) # add attribute attr(possible_ids_list, "checked_ids") <- vars + print(attributes(possible_ids_list)) } - #print(attributes(possible_ids_list)) if (min_combination_size == 1) { unique_ids <- vars[unique_counts == n_rows] @@ -218,11 +219,18 @@ possible_ids <- function(dt, combinations is {min_size} and the max is {max_size}") } return(remove_null(possible_ids_list)) + # this returns an empty list -should we raise an error instead? (RT) } j <- init_index + 1 for (comb_size in min_size:max_size) { + # make sure length of vars is >= comb_size + # Skip the iteration if comb_size is larger than the number of variables in vars + if (length(vars) < comb_size) { + next + } + combos <- combn(vars, comb_size, simplify = FALSE) # Prune combinations where the product of unique counts is less @@ -292,8 +300,6 @@ possible_ids <- function(dt, if (verbose) cli::cli_progress_update() } - # charcare(0) - print(vars) # Break if all variables are used if (length(vars) == 0 || elapsed_time > max_processing_time) { break @@ -308,7 +314,7 @@ possible_ids <- function(dt, - ret_list <- remove_null(possible_ids_list) + #ret_list <- remove_null(possible_ids_list) # setattrib("checked_vars" = vars) #if (store_checked_vars == TRUE) { @@ -328,7 +334,7 @@ possible_ids <- function(dt, # name = "checked_vars", # value = vars) - return(ret_list) + return(remove_null(possible_ids_list)) } From ae49f2d79d727c2952bb032fb6d3f87b9c31b8fa Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 1 Oct 2024 14:48:43 -0400 Subject: [PATCH 33/77] another issue of get_all ?? --- R/possible_ids.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index f772a25f..f780d33c 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -264,7 +264,7 @@ possible_ids <- function(dt, # This is inefficient... it is copying every time... # I need to think better on how to do it. possible_ids_list[[j]] <- combo - j <- init_index + 1 + j <- init_index + 1 # RT I think this has to be fixed if (j > max_numb_possible_ids) { if (verbose) { cli::cli_alert_warning( From 5d85558bb5d33e0c54d035edf0f6dd06a5d4fe15 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 1 Oct 2024 14:55:36 -0400 Subject: [PATCH 34/77] fix issue with indexing inside the loop --- R/possible_ids.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index f780d33c..6d93213d 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -264,7 +264,10 @@ possible_ids <- function(dt, # This is inefficient... it is copying every time... # I need to think better on how to do it. possible_ids_list[[j]] <- combo - j <- init_index + 1 # RT I think this has to be fixed + + #j <- init_index + 1 # RT I think this has to be fixed + j <- j + 1 + if (j > max_numb_possible_ids) { if (verbose) { cli::cli_alert_warning( From fdbd3d70cec42b845bf585307d3e1503e9635eed Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 1 Oct 2024 16:31:48 -0400 Subject: [PATCH 35/77] add tests --- R/possible_ids.R | 37 ++++++++++++++++++++---------- tests/testthat/test-possible_ids.R | 31 +++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 6d93213d..77b6faf1 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -159,15 +159,21 @@ possible_ids <- function(dt, # add checked vars to .joynenv if (store_checked_vars == TRUE) { - # store in .joynenv - rlang::env_poke(env = .joynenv, - nm = "checked_ids", - value = vars) - print(.joynenv$checked_ids) - - # add attribute - attr(possible_ids_list, "checked_ids") <- vars - print(attributes(possible_ids_list)) + + checked_ids <- vars + + print("here") + print(checked_ids) + + # # store in .joynenv + # rlang::env_poke(env = .joynenv, + # nm = "checked_ids", + # value = vars) + # print(.joynenv$checked_ids) + # + # # add attribute + # attr(possible_ids_list, "checked_ids") <- vars + # print(attributes(possible_ids_list)) } @@ -317,8 +323,13 @@ possible_ids <- function(dt, - #ret_list <- remove_null(possible_ids_list) - # setattrib("checked_vars" = vars) + ret_list <- remove_null(possible_ids_list) + + if (store_checked_vars) { + attr(ret_list, "checked_ids") <- checked_ids + + } + #if (store_checked_vars == TRUE) { @@ -337,7 +348,9 @@ possible_ids <- function(dt, # name = "checked_vars", # value = vars) - return(remove_null(possible_ids_list)) + #return(remove_null(possible_ids_list)) + + return(ret_list) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 6e6208d8..1b908057 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -243,6 +243,37 @@ test_that("get length 0", { exclude_classes = c("numeric", "integer"), include = "t"), 0) +}) + +test_that("get all works", { + + # no error + possible_ids(x4, + get_all = TRUE) |> + expect_no_error() + + # get all with user selected vars + possible_ids(x4, + vars = c("id1", "t"), + get_all = TRUE) |> + expect_no_error() + + # get all with max number of combinations + possible_ids(x4, + max_combination_size = 3, + get_all = TRUE) |> + expect_no_error() + + # possible_ids(dt, + # get_all = TRUE) |> + # expect_no_error() + + + + # get all with + + + }) From 883bd46b45898e1c2f4ec3a2830abbd2b99f171c Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 1 Oct 2024 16:46:20 -0400 Subject: [PATCH 36/77] test max comb size --- tests/testthat/test-possible_ids.R | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 1b908057..61f087ea 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -269,9 +269,30 @@ test_that("get all works", { # expect_no_error() +}) + + +test_that("Max combination size", { + + res <- possible_ids(dt, + vars = c( "unique_id1", "unique_id2", "unique_id3", + "character_1", "character_2", "character_3", "character_4"), + max_combination_size = 5) + + + sapply(res, function(sublist) { + length(sublist) <= 3}) |> + all() |> + expect_true() - # get all with + res <- possible_ids(x1, + get_all = TRUE, + max_combination_size = 2) + sapply(res, function(sublist) { + length(sublist) <= 2}) |> + all() |> + expect_true() }) From 44c92e971b447aba0429e1eaab5877b0eee92be6 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 2 Oct 2024 10:03:56 -0400 Subject: [PATCH 37/77] clean code --- R/possible_ids.R | 64 +++++------------------------- tests/testthat/test-possible_ids.R | 4 ++ 2 files changed, 15 insertions(+), 53 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 77b6faf1..9ee0fd86 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -115,13 +115,6 @@ possible_ids <- function(dt, ## var names -------- vars <- filter_by_name(vars, include, exclude, verbose) - ## NOTE: DUPLICATES ORIGINATE HERE -IN FILTER BY NAME !! - - # return vars to check #### - # if (return_checked_vars) { - # return(vars) - # } - ## no duplicated vars ------------- if (anyDuplicated(vars)) { dupvars <- vars[duplicated(vars)] |> @@ -143,7 +136,6 @@ possible_ids <- function(dt, #print(vars) - # Unique values --------- # Sort variables by number of unique values (ascending order) @@ -157,23 +149,11 @@ possible_ids <- function(dt, # Initialize list to store possible IDs possible_ids_list <- vector("list", max_numb_possible_ids) - # add checked vars to .joynenv + # store checked ids if (store_checked_vars == TRUE) { checked_ids <- vars - print("here") - print(checked_ids) - - # # store in .joynenv - # rlang::env_poke(env = .joynenv, - # nm = "checked_ids", - # value = vars) - # print(.joynenv$checked_ids) - # - # # add attribute - # attr(possible_ids_list, "checked_ids") <- vars - # print(attributes(possible_ids_list)) } @@ -198,17 +178,6 @@ possible_ids <- function(dt, } } - # print(vars) - # # add checked vars to .joynenv - # if (store_checked_vars == TRUE) { - # # store in .joynenv - # rlang::env_poke(env = .joynenv, - # nm = "checked_ids", - # value = vars) - # - # } - - # combinations ----------- # Start testing combinations @@ -321,34 +290,23 @@ possible_ids <- function(dt, } } - + # ----------------------------- # + # Return #### + # ----------------------------- # ret_list <- remove_null(possible_ids_list) - if (store_checked_vars) { + if (store_checked_vars == TRUE) { + # add attribute attr(ret_list, "checked_ids") <- checked_ids - } - + # store in .joynenv + rlang::env_poke(env = .joynenv, + nm = "checked_ids", + value = checked_ids) - #if (store_checked_vars == TRUE) { - - # print(.joynenv) - # print(vars) - # # store checked vars in env and has an attribute - # rlang::env_poke(env = .joynenv, - # possible_ids = vars) - # #value = vars) - # - # #add vars as an attribute to the list remove_null(possible_ids_list) - # attr(ret_list, "checked_vars") <- vars - # #} - - # setattr(x = ret_list, - # name = "checked_vars", - # value = vars) + } - #return(remove_null(possible_ids_list)) return(ret_list) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 61f087ea..45822f44 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -294,6 +294,10 @@ test_that("Max combination size", { all() |> expect_true() +}) + +test_that("Min combination size", { + }) From c84dfffdc872dc273cbdf5895134510825a13aa1 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 2 Oct 2024 10:26:04 -0400 Subject: [PATCH 38/77] debugging store checked vars --- R/possible_ids.R | 21 +++++++++++++++- tests/testthat/test-possible_ids.R | 40 ++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 9ee0fd86..ee618dcd 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -252,7 +252,26 @@ possible_ids <- function(dt, return(possible_ids_list) } if (!get_all) { - return(remove_null(possible_ids_list)) + + # debug statement + print("not get_all") + print("not get all, checked vars") + print(checked_ids) + + ret_list <- remove_null(possible_ids_list) + + if (store_checked_vars == TRUE) { + # add attribute + attr(ret_list, "checked_ids") <- checked_ids + + # store in .joynenv + rlang::env_poke(env = .joynenv, + nm = "checked_ids", + value = checked_ids) + + } + + return(ret_list) } # Remove variables in the current combo from vars to # avoid redundant checks diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 45822f44..fd2ed907 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -44,6 +44,20 @@ test_that("convert to data.table", { expect_equal(possible_ids(x1), possible_ids(xx1)) }) + +test_that("store checked ids", { + + store_checked_vars <- TRUE + + res <- possible_ids(x4, + store_checked_vars = store_checked_vars) + + + # does not work + + +}) + test_that("error if not dataframe", { m1 <- as.matrix(x1) @@ -298,6 +312,32 @@ test_that("Max combination size", { test_that("Min combination size", { + possible_ids(x4, + min_combination_size = 1, + get_all = FALSE) |> + unlist() |> + length() > 1 |> + expect_true() + + + res <- possible_ids(dt, + min_combination_size = 3, + get_all = FALSE) |> + unlist() + + expect_true(length(res) >= 3) + + + possible_ids(x4, + #min_combination_size = 1, + max_combination_size = 1) |> + expect_message() + + + possible_ids(x4, + min_combination_size = 3, + max_combination_size = 2) |> + expect_message() }) From 1dd5e1355393c9346d7bf5fa8b1325f11c3f087d Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 2 Oct 2024 10:39:51 -0400 Subject: [PATCH 39/77] add tests --- tests/testthat/test-possible_ids.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index fd2ed907..47f71bb8 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -50,10 +50,16 @@ test_that("store checked ids", { store_checked_vars <- TRUE res <- possible_ids(x4, - store_checked_vars = store_checked_vars) + store_checked_vars = store_checked_vars, + get_all = TRUE) + attributes(res)$checked_ids |> + expect_setequal(names(x4)) - # does not work + .joynenv$checked_ids |> + expect_setequal(names(x4)) + + # does not work with get_all = FALSE }) From 97ce951a1c8506cc731bfc96ac6bc4ba86daba81 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 10:19:16 -0400 Subject: [PATCH 40/77] fix store checked vars again --- R/possible_ids.R | 17 +++++++++++++---- tests/testthat/test-possible_ids.R | 11 ++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index ee618dcd..7acb101d 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -154,6 +154,16 @@ possible_ids <- function(dt, checked_ids <- vars + + # add attribute + attr(possible_ids_list, "checked_ids") <- checked_ids + + # store in .joynenv + rlang::env_poke(env = .joynenv, + nm = "checked_ids", + value = checked_ids) + + } @@ -254,9 +264,9 @@ possible_ids <- function(dt, if (!get_all) { # debug statement - print("not get_all") - print("not get all, checked vars") - print(checked_ids) + # print("FALSE get_all") + # print("not get all, checked vars") + # print(checked_ids) ret_list <- remove_null(possible_ids_list) @@ -326,7 +336,6 @@ possible_ids <- function(dt, } - return(ret_list) } diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 47f71bb8..3ac275cd 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -59,7 +59,16 @@ test_that("store checked ids", { .joynenv$checked_ids |> expect_setequal(names(x4)) - # does not work with get_all = FALSE + # with get_all = FALSE + res <- possible_ids(x4, + store_checked_vars = store_checked_vars) + + attributes(res)$checked_ids |> + expect_setequal(names(x4)) + + .joynenv$checked_ids |> + expect_setequal(names(x4)) + }) From 40930d6cbb69b81d47bdb6c23aa832a31ab6bb1b Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 10:29:16 -0400 Subject: [PATCH 41/77] update docuemntation --- R/possible_ids.R | 1 + man/possible_ids.Rd | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 7acb101d..22056685 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -23,6 +23,7 @@ #' above. #' @param verbose logical: If FALSE no message will be displayed. Default is #' TRUE +#' @param store_checked_vars logical: If `TRUE`, stores the variables checked in .joynenv environment and as an attribute of the returned list. Default is `TRUE`. #' #' @section Number of possible IDs: #' diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 09c1f9a5..d85a482f 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -12,6 +12,7 @@ possible_ids( exclude_classes = NULL, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), + store_checked_vars = TRUE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, @@ -22,9 +23,7 @@ possible_ids( \arguments{ \item{dt}{data frame} -\item{vars}{character: A vector of variable names to consider for identifying unique combinations. If \code{NULL} (default), all variables in the data frame are considered.} - -\item{exclude}{character: Exclude variables to be selected as identifiers.} +\item{vars}{character: A vector of variable names to consider for identifying unique combinations.} \item{include}{character: Name of variable to be included, that might belong to the group excluded in the \code{exclude}} @@ -38,6 +37,8 @@ to the group excluded in the \code{exclude}} \item{verbose}{logical: If FALSE no message will be displayed. Default is TRUE} +\item{store_checked_vars}{logical: If \code{TRUE}, stores the variables checked in .joynenv environment and as an attribute of the returned list. Default is \code{TRUE}.} + \item{min_combination_size}{numeric: Min number of combinations. Default is 1, so all combinations.} From aa4674484566876047266df06fa864e7c57951e7 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 16:02:45 -0400 Subject: [PATCH 42/77] aux fun to store and return ids attempt one --- R/possible_ids.R | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 22056685..27174109 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -153,16 +153,17 @@ possible_ids <- function(dt, # store checked ids if (store_checked_vars == TRUE) { - checked_ids <- vars + checked_ids <- vars |> + copy() - # add attribute - attr(possible_ids_list, "checked_ids") <- checked_ids - - # store in .joynenv - rlang::env_poke(env = .joynenv, - nm = "checked_ids", - value = checked_ids) + # # add attribute + # attr(possible_ids_list, "checked_ids") <- checked_ids + # + # # store in .joynenv + # rlang::env_poke(env = .joynenv, + # nm = "checked_ids", + # value = checked_ids) } @@ -215,6 +216,7 @@ possible_ids <- function(dt, # Skip the iteration if comb_size is larger than the number of variables in vars if (length(vars) < comb_size) { next + # or break } combos <- combn(vars, comb_size, simplify = FALSE) @@ -394,3 +396,28 @@ remove_null <- \(x) { y <- vapply(x, \(.) !is.null(.), logical(1)) x[y] } + + +# Function to store checked vars as possible ids + +store_checked_ids <- function(checked_ids, + possible_ids, + env = .joynenv) { + + # Remove null from possible ids + possible_ids <- remove_null(possible_ids) + + # Store checked_ids in environment + rlang::env_poke(env = env, + nm = "checked_ids", + value = checked_ids) + + # Store attribute + attr(possible_ids, + "checked_ids") <- checked_ids + + # Return + return(possible_ids) + + +} From ed5cc75275a717e1b773901e20634ddfdd46fd6f Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 16:34:10 -0400 Subject: [PATCH 43/77] implement it and update docuemntation --- R/possible_ids.R | 77 +++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 27174109..14422335 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -23,7 +23,6 @@ #' above. #' @param verbose logical: If FALSE no message will be displayed. Default is #' TRUE -#' @param store_checked_vars logical: If `TRUE`, stores the variables checked in .joynenv environment and as an attribute of the returned list. Default is `TRUE`. #' #' @section Number of possible IDs: #' @@ -150,11 +149,14 @@ possible_ids <- function(dt, # Initialize list to store possible IDs possible_ids_list <- vector("list", max_numb_possible_ids) + checked_ids <- vars |> + copy() + # store checked ids - if (store_checked_vars == TRUE) { + #if (store_checked_vars == TRUE) { - checked_ids <- vars |> - copy() + # checked_ids <- vars |> + # copy() # # add attribute @@ -166,7 +168,7 @@ possible_ids <- function(dt, # value = checked_ids) - } + #} if (min_combination_size == 1) { @@ -266,25 +268,32 @@ possible_ids <- function(dt, } if (!get_all) { + + ret_list <- store_checked_ids(checked_ids = checked_ids, + possible_ids = possible_ids_list, + ) + + return(ret_list) + # debug statement # print("FALSE get_all") # print("not get all, checked vars") # print(checked_ids) - ret_list <- remove_null(possible_ids_list) - - if (store_checked_vars == TRUE) { - # add attribute - attr(ret_list, "checked_ids") <- checked_ids - - # store in .joynenv - rlang::env_poke(env = .joynenv, - nm = "checked_ids", - value = checked_ids) - - } - - return(ret_list) + #ret_list <- remove_null(possible_ids_list) + + # if (store_checked_vars == TRUE) { + # # add attribute + # attr(ret_list, "checked_ids") <- checked_ids + # + # # store in .joynenv + # rlang::env_poke(env = .joynenv, + # nm = "checked_ids", + # value = checked_ids) + # + # } + # + # return(ret_list) } # Remove variables in the current combo from vars to # avoid redundant checks @@ -326,18 +335,24 @@ possible_ids <- function(dt, # Return #### # ----------------------------- # - ret_list <- remove_null(possible_ids_list) - - if (store_checked_vars == TRUE) { - # add attribute - attr(ret_list, "checked_ids") <- checked_ids - - # store in .joynenv - rlang::env_poke(env = .joynenv, - nm = "checked_ids", - value = checked_ids) - - } + # ret_list <- remove_null(possible_ids_list) + # + # if (store_checked_vars == TRUE) { + # # add attribute + # attr(ret_list, "checked_ids") <- checked_ids + # + # # store in .joynenv + # rlang::env_poke(env = .joynenv, + # nm = "checked_ids", + # value = checked_ids) + # + # } + # + # return(ret_list) + + ret_list <- store_checked_ids(checked_ids = checked_ids, + possible_ids = possible_ids_list, + ) return(ret_list) } From c87080384030c6e9d3c49e2576359d9497598f46 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 16:35:47 -0400 Subject: [PATCH 44/77] documentation again --- man/possible_ids.Rd | 2 -- 1 file changed, 2 deletions(-) diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index d85a482f..108ed6f5 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -37,8 +37,6 @@ to the group excluded in the \code{exclude}} \item{verbose}{logical: If FALSE no message will be displayed. Default is TRUE} -\item{store_checked_vars}{logical: If \code{TRUE}, stores the variables checked in .joynenv environment and as an attribute of the returned list. Default is \code{TRUE}.} - \item{min_combination_size}{numeric: Min number of combinations. Default is 1, so all combinations.} From 668dccfba2e9418993abff169f36bb7b08fabe45 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 16:36:58 -0400 Subject: [PATCH 45/77] update tests --- tests/testthat/test-possible_ids.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 3ac275cd..e32aea0d 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -47,10 +47,9 @@ test_that("convert to data.table", { test_that("store checked ids", { - store_checked_vars <- TRUE + res <- possible_ids(x4, - store_checked_vars = store_checked_vars, get_all = TRUE) attributes(res)$checked_ids |> @@ -60,8 +59,7 @@ test_that("store checked ids", { expect_setequal(names(x4)) # with get_all = FALSE - res <- possible_ids(x4, - store_checked_vars = store_checked_vars) + res <- possible_ids(x4) attributes(res)$checked_ids |> expect_setequal(names(x4)) From 63d01d09acfd303bac5cbea40153e51f86bd8f76 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 16:39:41 -0400 Subject: [PATCH 46/77] clean some code --- R/possible_ids.R | 69 +++--------------------------------------------- 1 file changed, 3 insertions(+), 66 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 14422335..0cbe40c9 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -73,9 +73,7 @@ possible_ids <- function(dt, dt <- as.data.table(dt) } - # Get all variable names - #vars <- names(dt) |> copy() - + # Get variable # Vars -------- if (is.null(vars)) { @@ -134,8 +132,6 @@ possible_ids <- function(dt, return(NULL) # should this be an error? } - #print(vars) - # Unique values --------- # Sort variables by number of unique values (ascending order) @@ -152,25 +148,6 @@ possible_ids <- function(dt, checked_ids <- vars |> copy() - # store checked ids - #if (store_checked_vars == TRUE) { - - # checked_ids <- vars |> - # copy() - - - # # add attribute - # attr(possible_ids_list, "checked_ids") <- checked_ids - # - # # store in .joynenv - # rlang::env_poke(env = .joynenv, - # nm = "checked_ids", - # value = checked_ids) - - - #} - - if (min_combination_size == 1) { unique_ids <- vars[unique_counts == n_rows] # Add individual unique variables @@ -215,7 +192,6 @@ possible_ids <- function(dt, for (comb_size in min_size:max_size) { # make sure length of vars is >= comb_size - # Skip the iteration if comb_size is larger than the number of variables in vars if (length(vars) < comb_size) { next # or break @@ -255,7 +231,6 @@ possible_ids <- function(dt, # I need to think better on how to do it. possible_ids_list[[j]] <- combo - #j <- init_index + 1 # RT I think this has to be fixed j <- j + 1 if (j > max_numb_possible_ids) { @@ -268,32 +243,10 @@ possible_ids <- function(dt, } if (!get_all) { - ret_list <- store_checked_ids(checked_ids = checked_ids, - possible_ids = possible_ids_list, - ) - + possible_ids = possible_ids_list) return(ret_list) - # debug statement - # print("FALSE get_all") - # print("not get all, checked vars") - # print(checked_ids) - - #ret_list <- remove_null(possible_ids_list) - - # if (store_checked_vars == TRUE) { - # # add attribute - # attr(ret_list, "checked_ids") <- checked_ids - # - # # store in .joynenv - # rlang::env_poke(env = .joynenv, - # nm = "checked_ids", - # value = checked_ids) - # - # } - # - # return(ret_list) } # Remove variables in the current combo from vars to # avoid redundant checks @@ -335,24 +288,8 @@ possible_ids <- function(dt, # Return #### # ----------------------------- # - # ret_list <- remove_null(possible_ids_list) - # - # if (store_checked_vars == TRUE) { - # # add attribute - # attr(ret_list, "checked_ids") <- checked_ids - # - # # store in .joynenv - # rlang::env_poke(env = .joynenv, - # nm = "checked_ids", - # value = checked_ids) - # - # } - # - # return(ret_list) - ret_list <- store_checked_ids(checked_ids = checked_ids, - possible_ids = possible_ids_list, - ) + possible_ids = possible_ids_list) return(ret_list) } From 8d8c3d5c0d4efa40ae10f1028e1cd90b48e1ea6f Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 17:22:13 -0400 Subject: [PATCH 47/77] fix error in filter by name --- R/possible_ids.R | 8 +++----- tests/testthat/test-possible_ids.R | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 0cbe40c9..9ef0cce5 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -145,8 +145,8 @@ possible_ids <- function(dt, # Initialize list to store possible IDs possible_ids_list <- vector("list", max_numb_possible_ids) - checked_ids <- vars |> - copy() + checked_ids <- vars + #copy() if (min_combination_size == 1) { unique_ids <- vars[unique_counts == n_rows] @@ -320,9 +320,7 @@ filter_by_name <- function(vars, include, exclude, verbose) { no_exc <- exclude[wno_exc] cli::cli_alert_warning("var{?s} {.var {no_exc}} not found in dataframe") } - #vars <- setdiff(vars, exclude) - vars <- c(vars, - setdiff(include, vars)) + vars <- setdiff(vars, exclude) } # Apply 'include' filter diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index e32aea0d..0230c85b 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -195,9 +195,9 @@ test_that("exclude and include", { ## Test combination between include class and exclude vars #### checked_vars <- possible_ids(dt, - return_checked_vars = TRUE, - include_classes = c("integer"), - exclude = paste0("numeric_int_", 1:5)) + #get_all = TRUE, + #include_classes = c("integer"), + exclude = paste0("numeric_int_", 1:5)) any( paste0("numeric_int_", 1:5) %in% checked_vars From b0f8b7c08612eb82d5948120f052935d83913478 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 3 Oct 2024 17:31:37 -0400 Subject: [PATCH 48/77] fix tests --- tests/testthat/test-possible_ids.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 0230c85b..0b9a3731 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -194,11 +194,13 @@ test_that("exclude and include", { ## Test combination between include class and exclude vars #### - checked_vars <- possible_ids(dt, - #get_all = TRUE, - #include_classes = c("integer"), + res <- possible_ids(dt, + get_all = TRUE, + include_classes = c("integer"), exclude = paste0("numeric_int_", 1:5)) + checked_vars <- attributes(res)$checked_ids + any( paste0("numeric_int_", 1:5) %in% checked_vars ) |> @@ -210,12 +212,16 @@ test_that("exclude and include", { expect_equal(TRUE) ## Test combination between include vars and exclude class #### - checked_vars <- possible_ids(dt, - return_checked_vars = TRUE, + res <- possible_ids(dt, + get_all = TRUE, include = c("numeric_double_1", "numeric_double_2"), exclude_classes = "numeric") + # TODO: Fix from here + + checked_vars <- attributes(res)$checked_ids + all( paste0("numeric_double_", 1:2) %in% checked_vars ) |> From f8d3004b3b848b7c5eac5f789b621460bfd0de02 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 4 Oct 2024 11:25:31 -0400 Subject: [PATCH 49/77] update ret list --- R/possible_ids.R | 33 ++++++++++++++++++++++++--------- man/possible_ids.Rd | 1 - 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 9ef0cce5..6ce47178 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -57,7 +57,6 @@ possible_ids <- function(dt, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), - store_checked_vars = TRUE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, # in seconds @@ -80,6 +79,7 @@ possible_ids <- function(dt, vars <- names(dt) |> copy() } else { + # check if all vars are in dt missing_vars <- setdiff(vars, names(dt)) @@ -105,8 +105,8 @@ possible_ids <- function(dt, # Exclude and include ------- ## classes ---------- - vars <- filter_by_class(dt = dt, - vars = vars, + vars <- filter_by_class(dt = dt, + vars = vars, include_classes = include_classes, exclude_classes = exclude_classes) @@ -138,7 +138,7 @@ possible_ids <- function(dt, unique_counts <- vapply(dt[, ..vars], fnunique, numeric(1)) vars <- vars[order(unique_counts)] unique_counts <- unique_counts[order(unique_counts)] - n_rows <- fnrow(dt) + n_rows <- fnrow(dt) init_index <- 0 @@ -157,13 +157,21 @@ possible_ids <- function(dt, if (verbose) { cli::cli_alert_info("Found unique identifiers: {.code {unique_ids}}") } - if (!get_all) return(remove_null(possible_ids_list)) + if (!get_all) { + ret_list <- store_checked_ids(checked_ids, + possible_ids_list) + return(ret_list) + } + #return(remove_null(possible_ids_list)) # Remove unique identifiers from vars to reduce combinations vars <- setdiff(vars, unique_ids) if (length(vars) == 0) { # All variables are unique identifiers - return(remove_null(possible_ids_list)) + ret_list <- store_checked_ids(checked_ids, + possible_ids_list) + return(ret_list) + #return(remove_null(possible_ids_list)) } unique_counts <- unique_counts[vars] } @@ -184,8 +192,12 @@ possible_ids <- function(dt, "Can't make combinations of {.field {vars}} if the min number of combinations is {min_size} and the max is {max_size}") } - return(remove_null(possible_ids_list)) - # this returns an empty list -should we raise an error instead? (RT) + # ret_list <- store_checked_ids(checked_ids, + # possible_ids_list) + # return(ret_list) + #return(remove_null(possible_ids_list)) + # this returned an empty list - I would raise an error instead (RT) + cli::cli_abort("No unique identifier found.") } j <- init_index + 1 @@ -239,7 +251,10 @@ possible_ids <- function(dt, "Max number of possible IDs ({max_numb_possible_ids}) reached. You may modify it in argument {.arg max_numb_possible_ids}") } - return(possible_ids_list) + #return(possible_ids_list) + ret_list <- store_checked_ids(checked_ids = checked_ids, + possible_ids = possible_ids_list) + return(ret_list) } if (!get_all) { diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 108ed6f5..7a1bdac8 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -12,7 +12,6 @@ possible_ids( exclude_classes = NULL, include_classes = NULL, verbose = getOption("possible_ids.verbose", default = FALSE), - store_checked_vars = TRUE, min_combination_size = 1, max_combination_size = 5, max_processing_time = 60, From 6bb91174cda041f8523d8fc84e2c87c063102f8b Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 4 Oct 2024 11:48:01 -0400 Subject: [PATCH 50/77] update tests on include and exclude plus store checked vars --- tests/testthat/test-possible_ids.R | 56 ++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 0b9a3731..c5010207 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -47,8 +47,6 @@ test_that("convert to data.table", { test_that("store checked ids", { - - res <- possible_ids(x4, get_all = TRUE) @@ -67,7 +65,26 @@ test_that("store checked ids", { .joynenv$checked_ids |> expect_setequal(names(x4)) + # with large dt -get_all FALSE + res <- possible_ids(dt, + vars = paste0("numeric_int_", 1:4)) + + attributes(res)$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + + .joynenv$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + + # with large dt -get_all TRUE + res <- possible_ids(dt, + vars = paste0("numeric_int_", 1:4), + get_all = TRUE) + attributes(res)$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + + .joynenv$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) }) @@ -153,7 +170,27 @@ test_that("relationship include and vars", { possible_ids(x4, vars = c("t", "x"), - include = NULL) + include = NULL) |> + expect_no_error() + + # test checked vars are at least `vars` plus those in `include` + res <- possible_ids(x4, + vars = c("id1", "id2", "t"), + include = "t") + + checked_ids <- attributes(res)$checked_ids + + checked_ids |> + expect_setequal(c("id1", "id2", "t")) + + res <- possible_ids(dt, + vars = c("logical_1", "logical_2", "factor_1", "factor_2"), + include = "unique_id1") + + checked_ids <- attributes(res)$checked_ids + + checked_ids |> + expect_setequal(c("logical_1", "logical_2", "factor_1", "factor_2", "unique_id1")) }) @@ -169,6 +206,12 @@ test_that("relationship exclude and vars") { vars = c("id1", "x"), exclude = "x") |> expect_message() + + possible_ids(dt, + vars = paste0("character_", 1:10), + exclude = c("character_1", "character_2")) |> + expect_message() + } test_that("inconsistent use of `include`", { @@ -213,10 +256,9 @@ test_that("exclude and include", { ## Test combination between include vars and exclude class #### res <- possible_ids(dt, - get_all = TRUE, - include = c("numeric_double_1", - "numeric_double_2"), - exclude_classes = "numeric") + include = c("numeric_double_1", + "numeric_double_2"), + exclude_classes = "numeric") # TODO: Fix from here From fe5c15fa969a4a733f4c6b8b14c67120db39092b Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 4 Oct 2024 12:41:57 -0400 Subject: [PATCH 51/77] more tests --- tests/testthat/test-possible_ids.R | 69 +++++++++++++++++------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index c5010207..f06728df 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -195,7 +195,7 @@ test_that("relationship include and vars", { }) -test_that("relationship exclude and vars") { +test_that("relationship exclude and vars", { possible_ids(x4, vars = c("t", "x"), @@ -211,18 +211,16 @@ test_that("relationship exclude and vars") { vars = paste0("character_", 1:10), exclude = c("character_1", "character_2")) |> expect_message() - -} + }) test_that("inconsistent use of `include`", { - # why is this an inconsistent use of include? - # expect_warning(possible_ids(x1, - # include = "x")) - - possible_ids(x1, - include = c("id", "x")) |> - expect_no_error() + # expect_warning(possible_ids(x1, + # include = "x")) + # + # possible_ids(x1, + # include = c("id", "x")) |> + # expect_no_error() }) @@ -260,8 +258,6 @@ test_that("exclude and include", { "numeric_double_2"), exclude_classes = "numeric") - # TODO: Fix from here - checked_vars <- attributes(res)$checked_ids all( @@ -288,7 +284,7 @@ test_that("exclude and include", { res_ids |> unlist() |> - expect_equal(c("foo", "id")) + expect_setequal(c("foo", "id", "v")) # alert if include and exclude same class #### possible_ids(dt, @@ -302,21 +298,31 @@ test_that("exclude and include", { exclude = c("id", "unique_id1")) |> expect_message() + res <- possible_ids(dt, + exclude_classes = c("integer"), + include = c("numeric_int_1")) - - - - - + attributes(res)$checked_ids |> + expect_setequal(setdiff(names(dt), c(paste0("numeric_int_", 2:10), + "id", + "unique_id1", "unique_id3"))) }) -test_that("get length 0", { +# test_that("get length 0", { +# +# expect_length(possible_ids(x1, +# exclude_classes = c("numeric", "integer"), +# include = "t"), 0) +# +# }) + +test_that("get length 0 -error", { - expect_length(possible_ids(x1, + expect_error(possible_ids(x1, exclude_classes = c("numeric", "integer"), - include = "t"), 0) + include = "t")) }) @@ -339,9 +345,12 @@ test_that("get all works", { get_all = TRUE) |> expect_no_error() - # possible_ids(dt, - # get_all = TRUE) |> - # expect_no_error() + # check get all combs + possible_ids(x3, + get_all = TRUE) |> + unlist() |> + expect_setequal(c("id", "v", "foo")) + }) @@ -373,13 +382,13 @@ test_that("Max combination size", { test_that("Min combination size", { - possible_ids(x4, + res <- possible_ids(x4, min_combination_size = 1, get_all = FALSE) |> - unlist() |> - length() > 1 |> - expect_true() + unlist() + expect_true( + length(res) >= 1) res <- possible_ids(dt, min_combination_size = 3, @@ -392,13 +401,13 @@ test_that("Min combination size", { possible_ids(x4, #min_combination_size = 1, max_combination_size = 1) |> - expect_message() + expect_error() possible_ids(x4, min_combination_size = 3, max_combination_size = 2) |> - expect_message() + expect_error() }) From c38d614a460362cfc4ff8f4e2868e725623dee9a Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 4 Oct 2024 14:47:18 -0400 Subject: [PATCH 52/77] update tests and ensure they pass --- tests/testthat/test-possible_ids.R | 143 ++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 15 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index f06728df..e04639dc 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -1,3 +1,5 @@ +# PREPARATION #### + withr::local_options(joyn.verbose = FALSE) library(data.table) # options(possible_ids.verbose = FALSE) @@ -38,6 +40,118 @@ y4 = data.table(id = c(1, 2, 5, 6, 3), y = c(11L, 15L, 20L, 13L, 10L), x = c(16:20)) +# Auxiliary data: Big data table-------------------- + +# Set seed for reproducibility +set.seed(123) + +# Number of rows and variables +n_rows <- 1e4 # 10,000 rows +n_vars <- 50 # Total variables + +# Initialize an empty data.table +dt_large <- data.table(id = 1:n_rows) + +## Manually create three variables that uniquely identify the data #### +dt_large[, unique_id1 := rep(1:10, each = 1000)] # 1000 unique values repeated 100 times +dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)] # Random character variable +dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)] # Random integer + +# Function to generate random data +generate_random_data <- function(n, type) { + switch(type, + "numeric_int" = sample(1:1e6, n, replace = TRUE), + "numeric_double" = rnorm(n), + "character" = replicate(n, paste0(sample(letters, 5, replace = TRUE), collapse = "")), + "factor" = factor(sample(letters[1:10], n, replace = TRUE)), + "logical" = sample(c(TRUE, FALSE), n, replace = TRUE), + "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE), + "datetime" = as.POSIXct("2000-01-01") + sample(0:(3650*24*60*60), n, replace = TRUE) + ) +} + +# Variable types and counts +var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime") +vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # Total should sum to 50 + +# Generate variables and add to the data.table +var_count <- 0 +for (i in seq_along(var_types)) { + type <- var_types[i] + n_vars_type <- vars_per_type[i] + for (j in 1:n_vars_type) { + var_count <- var_count + 1 + var_name <- paste0(type, "_", j) + dt_large[, (var_name) := generate_random_data(n_rows, type)] + } +} + +## Introduce duplicates in some columns that are NOT the unique identifiers #### +# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns +# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)]) + +# Shuffle the data to avoid ordered data +dt_large <- dt_large[sample(.N)] + + + +# dt_large[, id := .I] +dt <- copy(dt_large) + + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), + verbose = TRUE +) + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), + exclude = "id", + verbose = TRUE +) + +uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) +pids <- possible_ids( + dt = dt_large, + #exclude_classes = c("logical", "date", "datetime", "numeric"), + exclude = "id", + #vars = uniq_vars, + verbose = TRUE, + min_combination_size = 3, + # max_combination_size = 3, + max_processing_time = 240, + get_all = TRUE +) + +possible_ids( + dt = dt_large, + verbose = TRUE +) + +## Remove the 'id' column to simulate data without a clear unique identifier #### +dt_large[, id := NULL] + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_classes = c("logical", "date", "datetime"), # Exclude some types for efficiency + verbose = TRUE +) +possible_ids_list + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_classes = c("logical", "date", "datetime", "numeric"), # Exclude some types for efficiency + max_processing_time = 120, + verbose = TRUE +) +possible_ids_list + + + +# TESTS #### + test_that("convert to data.table", { xx1 <- as.data.frame(x1) @@ -134,19 +248,18 @@ test_that("vars provided by user", { # Check if the combination of unique_id1, unique_id2, and unique_id3 uniquely identifies rows vars <- c("id", "unique_id2", "unique_id3") - res <- dt_large[, .N, + res <- dt[, .N, by = vars][N > 1] |> nrow() # --if it does, possible_ids should return those vars - # NOTE (RT) --> this test should be correct, but fails --> find issue in function (?) - if (res == 0) { # if no duplicate rows possible_ids(dt, - vars = vars) |> + vars = vars, + min_combination_size = 3) |> unlist() |> - expect_equal(vars) + expect_setequal(vars) } }) @@ -213,16 +326,16 @@ test_that("relationship exclude and vars", { expect_message() }) -test_that("inconsistent use of `include`", { - - # expect_warning(possible_ids(x1, - # include = "x")) - # - # possible_ids(x1, - # include = c("id", "x")) |> - # expect_no_error() - -}) +# test_that("inconsistent use of `include`", { +# +# # expect_warning(possible_ids(x1, +# # include = "x")) +# # +# # possible_ids(x1, +# # include = c("id", "x")) |> +# # expect_no_error() +# +# }) test_that("exclude and include", { From 2e26d951a0fbc5a03d93066a1faf0176c3042b7e Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 4 Oct 2024 14:57:53 -0400 Subject: [PATCH 53/77] clean code --- R/possible_ids.R | 23 +++++++++-------------- man/merge.Rd | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 6ce47178..ac39dc1c 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -145,8 +145,8 @@ possible_ids <- function(dt, # Initialize list to store possible IDs possible_ids_list <- vector("list", max_numb_possible_ids) - checked_ids <- vars - #copy() + checked_ids <- vars |> + copy() if (min_combination_size == 1) { unique_ids <- vars[unique_counts == n_rows] @@ -162,7 +162,6 @@ possible_ids <- function(dt, possible_ids_list) return(ret_list) } - #return(remove_null(possible_ids_list)) # Remove unique identifiers from vars to reduce combinations vars <- setdiff(vars, unique_ids) @@ -171,7 +170,6 @@ possible_ids <- function(dt, ret_list <- store_checked_ids(checked_ids, possible_ids_list) return(ret_list) - #return(remove_null(possible_ids_list)) } unique_counts <- unique_counts[vars] } @@ -192,11 +190,7 @@ possible_ids <- function(dt, "Can't make combinations of {.field {vars}} if the min number of combinations is {min_size} and the max is {max_size}") } - # ret_list <- store_checked_ids(checked_ids, - # possible_ids_list) - # return(ret_list) - #return(remove_null(possible_ids_list)) - # this returned an empty list - I would raise an error instead (RT) + cli::cli_abort("No unique identifier found.") } @@ -251,7 +245,6 @@ possible_ids <- function(dt, "Max number of possible IDs ({max_numb_possible_ids}) reached. You may modify it in argument {.arg max_numb_possible_ids}") } - #return(possible_ids_list) ret_list <- store_checked_ids(checked_ids = checked_ids, possible_ids = possible_ids_list) return(ret_list) @@ -337,11 +330,10 @@ filter_by_name <- function(vars, include, exclude, verbose) { } vars <- setdiff(vars, exclude) } - # Apply 'include' filter c(vars, - setdiff(include, vars)) + setdiff(include, vars)) } @@ -363,8 +355,11 @@ remove_null <- \(x) { } -# Function to store checked vars as possible ids - +# Function to store checked vars as possible ids: +# 1. Remove nulls in possible ids list +# 2. Poke environment +# 3. Save checked vars as attribute +# 4. Return possible ids list store_checked_ids <- function(checked_ids, possible_ids, env = .joynenv) { diff --git a/man/merge.Rd b/man/merge.Rd index 7f748784..5107d64d 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -45,7 +45,7 @@ data from both \code{x} and \code{y} are included in the output.} \item{sort}{logical. If \code{TRUE} (default), the rows of the merged \code{data.table} are sorted by setting the key to the \code{by / by.x} columns. If \code{FALSE}, unlike base R's \code{merge} for which row order is unspecified, the -row order in \code{x} is retained (including retaining the position of missings when +row order in \code{x} is retained (including retaining the position of missing entries when \code{all.x=TRUE}), followed by \code{y} rows that don't match \code{x} (when \code{all.y=TRUE}) retaining the order those appear in \code{y}.} From 0b74d2c77b2695f1a72e1b030d134c5eb7c6ae0c Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 8 Oct 2024 16:11:57 -0400 Subject: [PATCH 54/77] attempt one, two and three --- R/possible_ids.R | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ _pkgdown.yml | 83 ---------------------------------------------- 2 files changed, 85 insertions(+), 83 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index ac39dc1c..fee5636d 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -379,5 +379,90 @@ store_checked_ids <- function(checked_ids, # Return return(possible_ids) +} + +# Create variables that uniquely idenitfy a dt +# Attempt one ----------------------------------------------------- +df_test <- data.table(a = 1:50) + +# create_ids <- function(n_rows, numb_ids = 2, prefix = "id") { +# # Initialize the list to store the ID columns +# ids <- vector("list", numb_ids) +# +# remaining_rows <- n_rows +# id_name <- paste0("id", 1:numb_ids) +# +# power_den <- 1 / seq(numb_ids, 1) +# +# for (i in seq_len(numb_ids)) { +# +# # Calculate the number of unique values for the current ID variable +# n_unique <- ceiling(remaining_rows^power_den[i]) +# +# # Generate the ID values, ensuring that they will work together to uniquely identify rows +# ids[[i]] <- rep(seq_len(n_unique), +# length.out = n_rows) +# +# # Update the remaining number of rows to ensure uniqueness with future IDs +# remaining_rows <- remaining_rows / n_unique +# } +# # return +# names(ids) <- id_name +# ids +# } + +# create_ids <- function(n_rows, numb_ids = 2, prefix = "id") { +# # Initialize the list to store the ID columns +# ids <- vector("list", numb_ids) +# +# remaining_rows <- n_rows +# id_name <- paste0("id", 1:numb_ids) +# for (i in seq_len(numb_ids)) { +# +# # Calculate the number of unique values for the current ID variable +# n_unique <- ceiling(remaining_rows^(1 / (numb_ids - i + 1))) +# +# # Generate the ID values, ensuring that they will work together to uniquely identify rows +# ids[[i]] <- rep(1:n_unique, length.out = n_rows) +# +# # Update the remaining number of rows to ensure uniqueness with future IDs +# remaining_rows <- remaining_rows / n_unique +# } +# # return +# names(ids) <- id_name +# ids +# } + +# different ways to use it. +# just IDs +# ids <- create_ids(nrow(df_test), 3) +# +# # create own names +# vars <- c("var1", "var2", "var3") +# dt[, (vars) := create_ids(.N, numb_ids = 3)] +# +# # use create_ids logic (you need to colbind them later) +# dt[, .(as.data.table(create_ids(.N, numb_ids = 3)))] + +# Attempt two ----------- +create_unique_variables_fast <- function(n_rows, X, prefix = "var") { + # Initialize a list to store the generated variables + vars <- vector("list", X) + + # The maximum unique values each variable can have to maintain uniqueness + max_vals <- ceiling(n_rows^(1 / X)) + + # Generate each variable using sequences and vectorization + for (i in seq_len(X)) { + # For the i-th variable, create a repeating sequence of increasing length + repeat_factor <- max_vals^(X - i) + vars[[i]] <- rep(rep(seq_len(max_vals), each = repeat_factor), length.out = n_rows) + } + # Set the names of the variables (e.g., var1, var2, ...) + names(vars) <- paste0(prefix, seq_len(X)) + + # Convert the list of variables into a data frame and return it + return(as.data.frame(vars)) } + diff --git a/_pkgdown.yml b/_pkgdown.yml index 518ad6e0..a273982f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,87 +1,4 @@ url: https://randrescastaneda.github.io/joyn/ template: bootstrap: 5 - bottswatch: cosmo - bslib: - primary: "#87CEFA" - -development: - mode: auto - -home: - title: An R package for joining tables - description: It is a joy to join tables in R with `joyn` - -authors: - R.Andres Castaneda: - href: https://randrescastaneda.rbind.io/ - Zander Prinsloo: - Rossana Tatulli: - -navbar: - bg: primary - structure: - left: - - home - - start here - - reference - - articles - - news - right: - - Dev version - - github - components: - reference: - text: Reference - href: reference/index.html - dev version: - text: dev version - href: dev/ - github: - icon: fab fa-github fa-lg - href: https://github.com/randrescastaneda/joyn/ - -articles: -- title: Usage - navbar: ~ - contents: - - main-functionalities - - adv-functionalities - - dplyr-joins - - merge-wrapper - - messages - - aux-functions - -reference: -- title: "Main function" - desc: > - Since the objective of joyn is to join tables with joy, there is only one - main function in this package - contents: - - joyn -- title: "Dplyr-joins" - contents: - - full_join - - right_join - - left_join - - inner_join - - anti_join -- title: "Merge data tables" - contents: - - merge -- title: "Auxiliary functions" -- subtitle: Information - contents: - - has_concept("messages") -- subtitle: JOYn options - contents: - - has_concept("options") -- subtitle: MISC - contents: - - is_id - - freq_table - - possible_ids - - is_balanced - - rename_to_valid - - joyn_msg From 9eb251987fa3fa94589a77028da18dbc3f9ce46a Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 8 Oct 2024 17:13:15 -0400 Subject: [PATCH 55/77] fix issue with duplicate vars --- R/possible_ids.R | 49 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index fee5636d..988cca16 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -445,24 +445,57 @@ df_test <- data.table(a = 1:50) # dt[, .(as.data.table(create_ids(.N, numb_ids = 3)))] # Attempt two ----------- -create_unique_variables_fast <- function(n_rows, X, prefix = "var") { +create_unique_variables <- function(n_rows, n_ids, prefix = "id") { # Initialize a list to store the generated variables - vars <- vector("list", X) + vars <- vector("list", n_ids) # The maximum unique values each variable can have to maintain uniqueness - max_vals <- ceiling(n_rows^(1 / X)) + max_vals <- ceiling(n_rows^(1 / n_ids)) # Generate each variable using sequences and vectorization - for (i in seq_len(X)) { + for (i in seq_len(n_ids)) { + # For the i-th variable, create a repeating sequence of increasing length - repeat_factor <- max_vals^(X - i) - vars[[i]] <- rep(rep(seq_len(max_vals), each = repeat_factor), length.out = n_rows) + repeat_factor <- max_vals^(n_ids - i) + vars[[i]] <- rep(rep(seq_len(max_vals), + each = repeat_factor), length.out = n_rows) } # Set the names of the variables (e.g., var1, var2, ...) - names(vars) <- paste0(prefix, seq_len(X)) + names(vars) <- paste0(prefix, + seq_len(n_ids)) + + return(vars) # Convert the list of variables into a data frame and return it - return(as.data.frame(vars)) + # return(as.data.frame(vars)) } +# fixing the issue of duplicate rows: +create_unique_ids_2 <- function(n_rows, n_ids, prefix = "id") { + # Initialize a list to store the generated variables + vars <- vector("list", n_ids) + + # The maximum unique values each variable can have to maintain uniqueness + max_vals <- ceiling(n_rows^(1 / n_ids)) + + # Generate a sequence of unique identifiers + all_ids <- expand.grid(lapply(1:n_ids, function(x) seq_len(max_vals))) + + # Randomly sample the unique combinations without replacement + sampled_ids <- all_ids[sample(nrow(all_ids), n_rows), ] + + # Store each unique identifier in the vars list + for (i in seq_len(n_ids)) { + vars[[i]] <- sampled_ids[[i]] + } + + # Set the names of the variables (e.g., id1, id2, ...) + names(vars) <- paste0(prefix, seq_len(n_ids)) + + return(vars) + + #return(as.data.table(vars)) +} + + From 8b2d8fbd623c227d83204891f61676cee54dfb53 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 10:59:21 -0400 Subject: [PATCH 56/77] try again, fixing the issue of more rows than n_rows --- R/possible_ids.R | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 988cca16..e3f9074b 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -472,18 +472,22 @@ create_unique_variables <- function(n_rows, n_ids, prefix = "id") { } # fixing the issue of duplicate rows: -create_unique_ids_2 <- function(n_rows, n_ids, prefix = "id") { - # Initialize a list to store the generated variables - vars <- vector("list", n_ids) - # The maximum unique values each variable can have to maintain uniqueness +create_ids <- function(n_rows, n_ids, prefix = "id") { + + vars <- vector("list", + n_ids) + + # Get max unique values each variable can have to keep uniqueness max_vals <- ceiling(n_rows^(1 / n_ids)) # Generate a sequence of unique identifiers - all_ids <- expand.grid(lapply(1:n_ids, function(x) seq_len(max_vals))) + all_ids <- expand.grid(lapply(1:n_ids, + function(x) seq_len(max_vals))) # Randomly sample the unique combinations without replacement - sampled_ids <- all_ids[sample(nrow(all_ids), n_rows), ] + sampled_ids <- all_ids[sample(nrow(all_ids), + n_rows), ] # Store each unique identifier in the vars list for (i in seq_len(n_ids)) { @@ -491,11 +495,27 @@ create_unique_ids_2 <- function(n_rows, n_ids, prefix = "id") { } # Set the names of the variables (e.g., id1, id2, ...) - names(vars) <- paste0(prefix, seq_len(n_ids)) + names(vars) <- paste0(prefix, + seq_len(n_ids)) return(vars) +} - #return(as.data.table(vars)) +# another version #### +create_random_unique_variables <- function(n_rows, n_ids, max_vals, prefix = "id") { + # Initialize a data frame to store the sampled variables + sampled_vars <- as.data.frame(matrix(NA, nrow = n_rows, ncol = n_ids)) + + # For each variable (column), sample 'n_rows' random values from '1:max_vals' + for (i in seq_len(n_ids)) { + sampled_vars[[i]] <- sample(seq_len(max_vals), n_rows, replace = TRUE) + } + + # Set the names of the variables (e.g., id1, id2, ...) + names(sampled_vars) <- paste0(prefix, seq_len(n_ids)) + + return(sampled_vars) } + From 1be3522fe0743869c0f435b5eddb7abd7d81d540 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 11:01:34 -0400 Subject: [PATCH 57/77] calculate max val insisde the fun instead --- R/possible_ids.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index e3f9074b..8e84a491 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -502,7 +502,10 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { } # another version #### -create_random_unique_variables <- function(n_rows, n_ids, max_vals, prefix = "id") { +create_random_unique_variables <- function(n_rows, n_ids, prefix = "id") { + # Calculate max_vals based on n_rows and n_ids + max_vals <- ceiling(n_rows^(1 / n_ids)) + # Initialize a data frame to store the sampled variables sampled_vars <- as.data.frame(matrix(NA, nrow = n_rows, ncol = n_ids)) From 35236b22e78f7d8545bd43338c7fb42eaee8b338 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 11:35:54 -0400 Subject: [PATCH 58/77] fix issue with single id --- R/possible_ids.R | 52 +++++++++++++++++++----------- tests/testthat/test-possible_ids.R | 13 +++++++- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 8e84a491..a76173eb 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -481,6 +481,12 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { # Get max unique values each variable can have to keep uniqueness max_vals <- ceiling(n_rows^(1 / n_ids)) + # If n_ids is 1, simply generate a sequence of IDs + if (n_ids == 1) { + vars[[1]] <- seq_len(n_rows) + names(vars)[1] <- paste0(prefix, 1) + } + # Generate a sequence of unique identifiers all_ids <- expand.grid(lapply(1:n_ids, function(x) seq_len(max_vals))) @@ -501,24 +507,32 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { return(vars) } -# another version #### -create_random_unique_variables <- function(n_rows, n_ids, prefix = "id") { - # Calculate max_vals based on n_rows and n_ids - max_vals <- ceiling(n_rows^(1 / n_ids)) - - # Initialize a data frame to store the sampled variables - sampled_vars <- as.data.frame(matrix(NA, nrow = n_rows, ncol = n_ids)) - - # For each variable (column), sample 'n_rows' random values from '1:max_vals' - for (i in seq_len(n_ids)) { - sampled_vars[[i]] <- sample(seq_len(max_vals), n_rows, replace = TRUE) - } - - # Set the names of the variables (e.g., id1, id2, ...) - names(sampled_vars) <- paste0(prefix, seq_len(n_ids)) - - return(sampled_vars) -} - +# examples: +# ids <- create_ids(nrow(df_test), 3) +# # create own names +# vars <- c("var1", "var2", "var3") +# dt[, (vars) := create_ids(.N, n_ids = 3)] +# +# # use create_ids logic (you need to colbind them later) +# dt[, .(as.data.table(create_ids(.N, n_ids = 3)))] +# +# # Create a new dataset and generate IDs for it +# df_new <- data.frame(b = 1:15) +# +# # Generate 4 unique ID columns for the new dataset +# df_new_ids <- create_ids(nrow(df_new), 4) +# +# # Bind the IDs to the new dataset +# df_new <- cbind(df_new, df_new_ids) +# +# # with data table +# # Create a data.table for demonstration +# dt <- data.table(a = 1:10) +# +# # Generate unique IDs and convert them into a data.table +# id_dt <- as.data.table(create_ids(.N, n_ids = 3)) +# +# # Bind the new ID columns to the original data.table +# dt <- cbind(dt, id_dt) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index e04639dc..cc9e3ca0 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -1,5 +1,5 @@ -# PREPARATION #### +# PREPARATION #### withr::local_options(joyn.verbose = FALSE) library(data.table) # options(possible_ids.verbose = FALSE) @@ -151,8 +151,19 @@ possible_ids_list # TESTS #### +## Test create ids --------------------- #### + +test_that("create_ids works as intended", { + + # with a single id + res <- as.data.frame(create_ids(n_rows = 50, n_ids = 1)) + + +}) + +## Test possible_ids ------------------- #### test_that("convert to data.table", { xx1 <- as.data.frame(x1) expect_equal(possible_ids(x1), possible_ids(xx1)) From 6eca3d5b7f1bf74ff1bfd0853617d14f746aa594 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 11:44:42 -0400 Subject: [PATCH 59/77] fix again when n_ids is 1 --- R/possible_ids.R | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index a76173eb..01a2e904 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -478,33 +478,41 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { vars <- vector("list", n_ids) - # Get max unique values each variable can have to keep uniqueness - max_vals <- ceiling(n_rows^(1 / n_ids)) - # If n_ids is 1, simply generate a sequence of IDs if (n_ids == 1) { vars[[1]] <- seq_len(n_rows) names(vars)[1] <- paste0(prefix, 1) - } - # Generate a sequence of unique identifiers - all_ids <- expand.grid(lapply(1:n_ids, - function(x) seq_len(max_vals))) + #print(vars) - # Randomly sample the unique combinations without replacement - sampled_ids <- all_ids[sample(nrow(all_ids), - n_rows), ] + return(vars) + } else { + + # Get max unique values each variable can have to keep uniqueness + max_vals <- ceiling(n_rows^(1 / n_ids)) + + # Generate a sequence of unique identifiers + all_ids <- expand.grid(lapply(1:n_ids, + function(x) seq_len(max_vals))) + + # Randomly sample the unique combinations without replacement + sampled_ids <- all_ids[sample(nrow(all_ids), + n_rows), ] + + # Store each unique identifier in the vars list + for (i in seq_len(n_ids)) { + vars[[i]] <- sampled_ids[[i]] + } + + # Set the names of the variables (e.g., id1, id2, ...) + names(vars) <- paste0(prefix, + seq_len(n_ids)) + + return(vars) - # Store each unique identifier in the vars list - for (i in seq_len(n_ids)) { - vars[[i]] <- sampled_ids[[i]] } - # Set the names of the variables (e.g., id1, id2, ...) - names(vars) <- paste0(prefix, - seq_len(n_ids)) - return(vars) } # examples: From cb9ad26ae39e0777bd5f8f48fcd4503a70bf7de1 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 12:21:55 -0400 Subject: [PATCH 60/77] add tests --- tests/testthat/test-possible_ids.R | 39 +++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index cc9e3ca0..bc54790e 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -152,11 +152,48 @@ possible_ids_list # TESTS #### ## Test create ids --------------------- #### +df_test <- data.table(a = 1:50) + test_that("create_ids works as intended", { # with a single id - res <- as.data.frame(create_ids(n_rows = 50, n_ids = 1)) + res <- as.data.frame(create_ids(n_rows = 50, + n_ids = 1)) + + nrow(unique(res)) |> + expect_equal(50) + + class(res) |> + expect_equal("data.frame") + + + # ret class + res <- create_ids(n_rows = 50, + n_ids = 1) + + class(res) |> + expect_equal("list") + + length(res) |> + expect_equal(1) + + # prefix + create_ids(n_rows = 40, n_ids = 4, prefix = "unique_id") |> + names() |> + expect_equal(paste0("unique_id", 1:4)) + + # with more than an id + vars <- c("var1", "var2", "var3") + + res <- df_test[, (vars) := create_ids(.N, + n_ids = 3)] + + nrow(res[, .N, + + by = vars][N > 1]) |> + expect_equal(0) + }) From e28e4d7505468f06b9fc60a2de8d1249ec76ef73 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 12:41:31 -0400 Subject: [PATCH 61/77] add more tests on create ids --- tests/testthat/test-possible_ids.R | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index bc54790e..54d3b050 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -186,7 +186,9 @@ test_that("create_ids works as intended", { # with more than an id vars <- c("var1", "var2", "var3") - res <- df_test[, (vars) := create_ids(.N, + dt <- df_test |> copy() + + res <- dt[, (vars) := create_ids(.N, n_ids = 3)] nrow(res[, .N, @@ -194,6 +196,23 @@ test_that("create_ids works as intended", { by = vars][N > 1]) |> expect_equal(0) + res <- df_test[, .(as.data.table( + create_ids(.N, n_ids = 3) + ))] + + nrow(res[, .N, + + by = c("id1", "id2", "id3")][N > 1]) |> + expect_equal(0) + + dt <- dt_large |> copy() + + new_ids <- as.data.table(create_ids(nrow(dt), + n_ids = 3, + prefix = "new_id")) + + # Bind the new IDs to the existing data.table + dt <- cbind(dt, new_ids) }) From 26415ac28af34ae41b694f6246416e2c083bfed7 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 12:43:04 -0400 Subject: [PATCH 62/77] first cleanup of code --- R/possible_ids.R | 102 ++++++++--------------------------------------- 1 file changed, 16 insertions(+), 86 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 01a2e904..b4247e32 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -385,65 +385,6 @@ store_checked_ids <- function(checked_ids, # Attempt one ----------------------------------------------------- df_test <- data.table(a = 1:50) -# create_ids <- function(n_rows, numb_ids = 2, prefix = "id") { -# # Initialize the list to store the ID columns -# ids <- vector("list", numb_ids) -# -# remaining_rows <- n_rows -# id_name <- paste0("id", 1:numb_ids) -# -# power_den <- 1 / seq(numb_ids, 1) -# -# for (i in seq_len(numb_ids)) { -# -# # Calculate the number of unique values for the current ID variable -# n_unique <- ceiling(remaining_rows^power_den[i]) -# -# # Generate the ID values, ensuring that they will work together to uniquely identify rows -# ids[[i]] <- rep(seq_len(n_unique), -# length.out = n_rows) -# -# # Update the remaining number of rows to ensure uniqueness with future IDs -# remaining_rows <- remaining_rows / n_unique -# } -# # return -# names(ids) <- id_name -# ids -# } - -# create_ids <- function(n_rows, numb_ids = 2, prefix = "id") { -# # Initialize the list to store the ID columns -# ids <- vector("list", numb_ids) -# -# remaining_rows <- n_rows -# id_name <- paste0("id", 1:numb_ids) -# for (i in seq_len(numb_ids)) { -# -# # Calculate the number of unique values for the current ID variable -# n_unique <- ceiling(remaining_rows^(1 / (numb_ids - i + 1))) -# -# # Generate the ID values, ensuring that they will work together to uniquely identify rows -# ids[[i]] <- rep(1:n_unique, length.out = n_rows) -# -# # Update the remaining number of rows to ensure uniqueness with future IDs -# remaining_rows <- remaining_rows / n_unique -# } -# # return -# names(ids) <- id_name -# ids -# } - -# different ways to use it. -# just IDs -# ids <- create_ids(nrow(df_test), 3) -# -# # create own names -# vars <- c("var1", "var2", "var3") -# dt[, (vars) := create_ids(.N, numb_ids = 3)] -# -# # use create_ids logic (you need to colbind them later) -# dt[, .(as.data.table(create_ids(.N, numb_ids = 3)))] - # Attempt two ----------- create_unique_variables <- function(n_rows, n_ids, prefix = "id") { # Initialize a list to store the generated variables @@ -515,32 +456,21 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { } -# examples: -# ids <- create_ids(nrow(df_test), 3) -# # create own names -# vars <- c("var1", "var2", "var3") -# dt[, (vars) := create_ids(.N, n_ids = 3)] -# -# # use create_ids logic (you need to colbind them later) -# dt[, .(as.data.table(create_ids(.N, n_ids = 3)))] -# -# # Create a new dataset and generate IDs for it -# df_new <- data.frame(b = 1:15) -# -# # Generate 4 unique ID columns for the new dataset -# df_new_ids <- create_ids(nrow(df_new), 4) -# -# # Bind the IDs to the new dataset -# df_new <- cbind(df_new, df_new_ids) -# -# # with data table -# # Create a data.table for demonstration -# dt <- data.table(a = 1:10) -# -# # Generate unique IDs and convert them into a data.table -# id_dt <- as.data.table(create_ids(.N, n_ids = 3)) -# -# # Bind the new ID columns to the original data.table -# dt <- cbind(dt, id_dt) +#examples: +ids <- create_ids(nrow(df_test), 3) +# create own names +vars <- c("var1", "var2", "var3") +dt[, (vars) := create_ids(.N, n_ids = 3)] + +# use create_ids logic (you need to colbind them later) +dt[, .(as.data.table(create_ids(.N, n_ids = 3)))] + +# Create a new dataset and generate IDs for it +df_new <- data.frame(b = 1:15) + +# Generate 4 unique ID columns for the new dataset +df_new_ids <- create_ids(nrow(df_new), 4) + + From 041ab525ce323caa523169faf3445640c41a2e62 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 15:27:06 -0400 Subject: [PATCH 63/77] tests --- tests/testthat/test-possible_ids.R | 39 ++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 54d3b050..f1fef210 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -152,8 +152,6 @@ possible_ids_list # TESTS #### ## Test create ids --------------------- #### -df_test <- data.table(a = 1:50) - test_that("create_ids works as intended", { @@ -490,14 +488,6 @@ test_that("exclude and include", { }) -# test_that("get length 0", { -# -# expect_length(possible_ids(x1, -# exclude_classes = c("numeric", "integer"), -# include = "t"), 0) -# -# }) - test_that("get length 0 -error", { expect_error(possible_ids(x1, @@ -634,6 +624,35 @@ test_that("duplicated names", { }) +test_that("identifies ids", { + + vars <- c("var1", "var2", "var3") + dt[, (vars) := create_ids(.N, n_ids = 3)] + + possible_ids(dt, + vars = vars) |> + unlist() |> + expect_equal(vars) + + df_test <- as.data.frame( + create_ids(n_rows = 50, + n_ids = 3) + ) + + possible_ids(df_test, + vars = c("id1", "id2"), + include = "id3") |> + unlist() |> + expect_equal(c("id1", "id2", "id3")) + + possible_ids(df_test, + exclude_classes = "integer", + include = c("id1", "id2", "id3")) |> + unlist() |> + expect_equal(c("id1", "id2", "id3")) + +}) + # Auxiliary data: Big data table-------------------- # Set seed for reproducibility From d689569ad26fdbb868d89c7742e638ebb5229b8c Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 16:11:42 -0400 Subject: [PATCH 64/77] clean code and add documentation --- R/possible_ids.R | 71 ++++++++++++----------------------------------- man/create_ids.Rd | 29 +++++++++++++++++++ 2 files changed, 47 insertions(+), 53 deletions(-) create mode 100644 man/create_ids.Rd diff --git a/R/possible_ids.R b/R/possible_ids.R index b4247e32..1c8a9261 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -381,39 +381,23 @@ store_checked_ids <- function(checked_ids, } -# Create variables that uniquely idenitfy a dt -# Attempt one ----------------------------------------------------- -df_test <- data.table(a = 1:50) - -# Attempt two ----------- -create_unique_variables <- function(n_rows, n_ids, prefix = "id") { - # Initialize a list to store the generated variables - vars <- vector("list", n_ids) - - # The maximum unique values each variable can have to maintain uniqueness - max_vals <- ceiling(n_rows^(1 / n_ids)) - - # Generate each variable using sequences and vectorization - for (i in seq_len(n_ids)) { - - # For the i-th variable, create a repeating sequence of increasing length - repeat_factor <- max_vals^(n_ids - i) - vars[[i]] <- rep(rep(seq_len(max_vals), - each = repeat_factor), length.out = n_rows) - } - - # Set the names of the variables (e.g., var1, var2, ...) - names(vars) <- paste0(prefix, - seq_len(n_ids)) - - return(vars) - - # Convert the list of variables into a data frame and return it - # return(as.data.frame(vars)) -} - -# fixing the issue of duplicate rows: - +#' Create variables that uniquely identify rows in a data table +#' +#' This function generates unique identifier columns for a given number of rows, based on the specified number of identifier variables. +#' +#' @param n_rows An integer specifying the number of rows in the data table for which unique identifiers need to be generated. +#' @param n_ids An integer specifying the number of identifiers to be created. If `n_ids` is 1, a simple sequence of unique IDs is created. If greater than 1, a combination of IDs is generated. +#' @param prefix A character string specifying the prefix for the identifier variable names (default is `"id"`). +#' +#' @return A named list where each element is a vector representing a unique identifier column. The number of elements in the list corresponds to the number of identifier variables (`n_ids`). The length of each element is equal to `n_rows`. +#' +#' @details +#' The function handles two scenarios: +#' 1. When `n_ids` is 1, it simply returns a sequence of integers from 1 to `n_rows`. +#' 2. When `n_ids` is greater than 1, it generates combinations of IDs such that the rows are uniquely identified by the combination of the identifier variables. +#' +#' +#' @keywords internal create_ids <- function(n_rows, n_ids, prefix = "id") { vars <- vector("list", @@ -436,7 +420,7 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { all_ids <- expand.grid(lapply(1:n_ids, function(x) seq_len(max_vals))) - # Randomly sample the unique combinations without replacement + # Randomly sample the unique combinations sampled_ids <- all_ids[sample(nrow(all_ids), n_rows), ] @@ -445,32 +429,13 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { vars[[i]] <- sampled_ids[[i]] } - # Set the names of the variables (e.g., id1, id2, ...) names(vars) <- paste0(prefix, seq_len(n_ids)) return(vars) - } - } -#examples: -ids <- create_ids(nrow(df_test), 3) -# create own names -vars <- c("var1", "var2", "var3") -dt[, (vars) := create_ids(.N, n_ids = 3)] - -# use create_ids logic (you need to colbind them later) -dt[, .(as.data.table(create_ids(.N, n_ids = 3)))] - -# Create a new dataset and generate IDs for it -df_new <- data.frame(b = 1:15) - -# Generate 4 unique ID columns for the new dataset -df_new_ids <- create_ids(nrow(df_new), 4) - - diff --git a/man/create_ids.Rd b/man/create_ids.Rd new file mode 100644 index 00000000..48810703 --- /dev/null +++ b/man/create_ids.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/possible_ids.R +\name{create_ids} +\alias{create_ids} +\title{Create variables that uniquely identify rows in a data table} +\usage{ +create_ids(n_rows, n_ids, prefix = "id") +} +\arguments{ +\item{n_rows}{An integer specifying the number of rows in the data table for which unique identifiers need to be generated.} + +\item{n_ids}{An integer specifying the number of identifiers to be created. If \code{n_ids} is 1, a simple sequence of unique IDs is created. If greater than 1, a combination of IDs is generated.} + +\item{prefix}{A character string specifying the prefix for the identifier variable names (default is \code{"id"}).} +} +\value{ +A named list where each element is a vector representing a unique identifier column. The number of elements in the list corresponds to the number of identifier variables (\code{n_ids}). The length of each element is equal to \code{n_rows}. +} +\description{ +This function generates unique identifier columns for a given number of rows, based on the specified number of identifier variables. +} +\details{ +The function handles two scenarios: +\enumerate{ +\item When \code{n_ids} is 1, it simply returns a sequence of integers from 1 to \code{n_rows}. +\item When \code{n_ids} is greater than 1, it generates combinations of IDs such that the rows are uniquely identified by the combination of the identifier variables. +} +} +\keyword{internal} From bb22bd201a31be94b320d8e02bc4e9f3a2002d34 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 9 Oct 2024 16:27:42 -0400 Subject: [PATCH 65/77] ensure tests pass --- tests/testthat/test-possible_ids.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index f1fef210..4d9041ec 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -154,6 +154,7 @@ possible_ids_list ## Test create ids --------------------- #### test_that("create_ids works as intended", { + df_test <- data.table(a = 1:50) # with a single id res <- as.data.frame(create_ids(n_rows = 50, From 050d3ac6512e2293fe12ece13b82aa81a74b0776 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 10 Oct 2024 13:56:12 -0400 Subject: [PATCH 66/77] remove lapply --- R/possible_ids.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 1c8a9261..bbecac9a 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -413,12 +413,16 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { return(vars) } else { - # Get max unique values each variable can have to keep uniqueness + # Get max unique values each variable can have max_vals <- ceiling(n_rows^(1 / n_ids)) # Generate a sequence of unique identifiers - all_ids <- expand.grid(lapply(1:n_ids, - function(x) seq_len(max_vals))) + + all_ids <- expand.grid(rep(list(seq_len(max_vals)), + n_ids)) + + # nrows smaller ? + # nrows always bigger (?) # Randomly sample the unique combinations sampled_ids <- all_ids[sample(nrow(all_ids), From b65d12baefe2f58cb43fb2a367bf62db972b25ec Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 10 Oct 2024 14:08:40 -0400 Subject: [PATCH 67/77] add condition for when rows is larger -not always! --- R/possible_ids.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index bbecac9a..78f5453d 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -421,16 +421,15 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { all_ids <- expand.grid(rep(list(seq_len(max_vals)), n_ids)) - # nrows smaller ? - # nrows always bigger (?) - - # Randomly sample the unique combinations - sampled_ids <- all_ids[sample(nrow(all_ids), - n_rows), ] + if (nrow(all_ids) > n_rows) { + # Randomly sample the unique combinations + all_ids <- all_ids[sample(nrow(all_ids), + n_rows), ] + } # Store each unique identifier in the vars list for (i in seq_len(n_ids)) { - vars[[i]] <- sampled_ids[[i]] + vars[[i]] <- all_ids[[i]] } names(vars) <- paste0(prefix, From 57b4dbda7e888270b4c962c600a886438da8280f Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 11 Oct 2024 11:08:23 -0400 Subject: [PATCH 68/77] documentation for checked ids --- R/possible_ids.R | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 78f5453d..9bc2f6ca 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -355,11 +355,20 @@ remove_null <- \(x) { } -# Function to store checked vars as possible ids: -# 1. Remove nulls in possible ids list -# 2. Poke environment -# 3. Save checked vars as attribute -# 4. Return possible ids list +#' store checked variables as possible ids +#' +#' This function processes a list of possible IDs by removing any `NULL` entries, +#' storing a set of checked variables as an attribute and in the specified environment, +#' and then returning the updated list of possible IDs. +#' +#' @param checked_ids A vector of variable names that have been checked as possible IDs. +#' @param possible_ids A list containing potential identifiers. This list may contain `NULL` values, which will be removed by the function. +#' @param env An environment where the `checked_ids` will be stored. The default is `.joynenv`. +#' +#' @return A list of possible IDs with `NULL` values removed, and the `checked_ids` stored as an attribute. +#' +#' +#' @keywords internal store_checked_ids <- function(checked_ids, possible_ids, env = .joynenv) { @@ -391,11 +400,6 @@ store_checked_ids <- function(checked_ids, #' #' @return A named list where each element is a vector representing a unique identifier column. The number of elements in the list corresponds to the number of identifier variables (`n_ids`). The length of each element is equal to `n_rows`. #' -#' @details -#' The function handles two scenarios: -#' 1. When `n_ids` is 1, it simply returns a sequence of integers from 1 to `n_rows`. -#' 2. When `n_ids` is greater than 1, it generates combinations of IDs such that the rows are uniquely identified by the combination of the identifier variables. -#' #' #' @keywords internal create_ids <- function(n_rows, n_ids, prefix = "id") { From 75b48cd056bbddcb7e3a55c5a0768874c7e42305 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Fri, 11 Oct 2024 11:11:03 -0400 Subject: [PATCH 69/77] doc again --- man/create_ids.Rd | 7 ------- man/store_checked_ids.Rd | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 man/store_checked_ids.Rd diff --git a/man/create_ids.Rd b/man/create_ids.Rd index 48810703..2197685a 100644 --- a/man/create_ids.Rd +++ b/man/create_ids.Rd @@ -19,11 +19,4 @@ A named list where each element is a vector representing a unique identifier col \description{ This function generates unique identifier columns for a given number of rows, based on the specified number of identifier variables. } -\details{ -The function handles two scenarios: -\enumerate{ -\item When \code{n_ids} is 1, it simply returns a sequence of integers from 1 to \code{n_rows}. -\item When \code{n_ids} is greater than 1, it generates combinations of IDs such that the rows are uniquely identified by the combination of the identifier variables. -} -} \keyword{internal} diff --git a/man/store_checked_ids.Rd b/man/store_checked_ids.Rd new file mode 100644 index 00000000..b0471724 --- /dev/null +++ b/man/store_checked_ids.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/possible_ids.R +\name{store_checked_ids} +\alias{store_checked_ids} +\title{store checked variables as possible ids} +\usage{ +store_checked_ids(checked_ids, possible_ids, env = .joynenv) +} +\arguments{ +\item{checked_ids}{A vector of variable names that have been checked as possible IDs.} + +\item{possible_ids}{A list containing potential identifiers. This list may contain \code{NULL} values, which will be removed by the function.} + +\item{env}{An environment where the \code{checked_ids} will be stored. The default is \code{.joynenv}.} +} +\value{ +A list of possible IDs with \code{NULL} values removed, and the \code{checked_ids} stored as an attribute. +} +\description{ +This function processes a list of possible IDs by removing any \code{NULL} entries, +storing a set of checked variables as an attribute and in the specified environment, +and then returning the updated list of possible IDs. +} +\keyword{internal} From f2c29c66c5c3a973777cb13c988ef1130047e79a Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 15 Oct 2024 16:03:03 -0400 Subject: [PATCH 70/77] small fix --- R/possible_ids.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 9bc2f6ca..c1168442 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -412,8 +412,6 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { vars[[1]] <- seq_len(n_rows) names(vars)[1] <- paste0(prefix, 1) - #print(vars) - return(vars) } else { From b899960fb1fea272c595232d5c3fd811da26dd99 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Thu, 17 Oct 2024 17:29:39 -0400 Subject: [PATCH 71/77] improve is_id and freq_table --- R/freq_table.R | 29 +++++++++++++++++++---------- R/is_id.R | 5 +++-- man/freq_table.Rd | 2 ++ tests/testthat/test-freq_table.R | 4 +--- tests/testthat/test-is_id.R | 9 ++++++--- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/R/freq_table.R b/R/freq_table.R index 51aa79a9..75db07b9 100644 --- a/R/freq_table.R +++ b/R/freq_table.R @@ -12,6 +12,7 @@ if (getRversion() >= '2.15.1') #' @param byvar character: name of variable to tabulate. Use Standard evaluation. #' @param digits numeric: number of decimal places to display. Default is 1. #' @param na.rm logical: report NA values in frequencies. Default is FALSE. +#' @param freq_var_name character: name for frequency variable. Default is "n" #' #' @return data.table with frequencies. #' @export @@ -33,30 +34,38 @@ freq_table <- function(x, if (!is.data.frame(x)) { cli::cli_abort("Argument {.arg x} ({.field {x_name}}) must be a data frame") } + if (isFALSE(is.data.table(x))) { + x <- qDT(x) + } + - fq <- qtab(x[[byvar]], na.exclude = na.rm) - ft <- data.frame(joyn = names(fq), - n = as.numeric(fq)) + fq <- qtab(x[, ..byvar], na.exclude = na.rm, dnn = byvar) + + ft <- fq |> + as.data.table() |> + setnames("N", "n") |> + # filter zeros + fsubset(n > 0) N <- fsum(ft$n) ft <- ft |> ftransform(percent = paste0(round(n / N * 100, digits), "%")) # add row with totals - ft <- rowbind(ft, data.table(joyn = "total", - n = N, - percent = "100%")) |> - # filter zeros - fsubset(n > 0) + total_row <- rep("total", length(byvar)) |> + as.list() |> + as.data.table() |> + setnames(new = byvar) |> + ftransform(n = N, + percent = "100%") + ft <- rowbind(ft, total_row) setrename(ft, - joyn = byvar, n = freq_var_name, .nse = FALSE) } - #' Report frequencies from attributes in report var #' #' @param x dataframe from [joyn_workhorse] diff --git a/R/is_id.R b/R/is_id.R index 8bbd8ccc..0b55a45a 100644 --- a/R/is_id.R +++ b/R/is_id.R @@ -56,7 +56,7 @@ is_id <- function(dt, if (return_report) { # Return the duplicated rows if requested - cli::cli_h3("Duplicates in terms of {.code {by}}") + if (verbose) cli::cli_h3("Duplicates in terms of {.code {by}}") d <- freq_table(x = dt, byvar = by, @@ -64,7 +64,8 @@ is_id <- function(dt, d |> fsubset(copies > 1) |> print() - cli::cli_rule(right = "End of {.field is_id()} report") + + if (verbose) cli::cli_rule(right = "End of {.field is_id()} report") return(invisible(d)) } else { return(is_id) diff --git a/man/freq_table.Rd b/man/freq_table.Rd index 83e68fbd..9ed5cc34 100644 --- a/man/freq_table.Rd +++ b/man/freq_table.Rd @@ -14,6 +14,8 @@ freq_table(x, byvar, digits = 1, na.rm = FALSE, freq_var_name = "n") \item{digits}{numeric: number of decimal places to display. Default is 1.} \item{na.rm}{logical: report NA values in frequencies. Default is FALSE.} + +\item{freq_var_name}{character: name for frequency variable. Default is "n"} } \value{ data.table with frequencies. diff --git a/tests/testthat/test-freq_table.R b/tests/testthat/test-freq_table.R index f579699b..d5467b71 100644 --- a/tests/testthat/test-freq_table.R +++ b/tests/testthat/test-freq_table.R @@ -67,9 +67,7 @@ test_that("correct totals", { tr <- nrow(y4) - j <- freq_table(y4, "id2") - j <- freq_table(y4, "id2") - j |> + freq_table(y4, "id2") |> fsubset(id2 == "total") |> fselect(n) |> reg_elem() |> diff --git a/tests/testthat/test-is_id.R b/tests/testthat/test-is_id.R index 3205ccdd..332c5a2a 100644 --- a/tests/testthat/test-is_id.R +++ b/tests/testthat/test-is_id.R @@ -75,10 +75,13 @@ test_that("returns correct report table", { id = c("c", "b", "c", "a"), y = c(11L, 15L, 18L, 20L) ) - j <- is_id(y, by = "id", return_report = TRUE) + j <- is_id(y, by = "id", return_report = TRUE) |> + roworder(by = "id") - r <- data.table(id = c("c", "b", "a"), - copies = c(2L, 1L, 1L)) + r <- data.table(id = c("c", "b", "a", "total"), + copies = c(2L, 1L, 1L, 4L), + percent = c("50%", "25%", "25%", "100%")) |> + roworder(by = "id") expect_equal(j, r) From 1f655cced12a8cb35e852d366d666416b7d87f85 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Tue, 22 Oct 2024 17:11:27 -0400 Subject: [PATCH 72/77] merge fix is id --- R/possible_ids.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/possible_ids.R b/R/possible_ids.R index c1168442..2690c668 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -423,6 +423,8 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { all_ids <- expand.grid(rep(list(seq_len(max_vals)), n_ids)) + #collapse::fnrow faster? + if (nrow(all_ids) > n_rows) { # Randomly sample the unique combinations all_ids <- all_ids[sample(nrow(all_ids), From 80d38789e2592a95f8cd433749e7d713062cc54d Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 23 Oct 2024 15:22:53 -0400 Subject: [PATCH 73/77] print freq table only if verbose --- R/is_id.R | 9 ++++++--- R/possible_ids.R | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/R/is_id.R b/R/is_id.R index 0b55a45a..c8e3b80a 100644 --- a/R/is_id.R +++ b/R/is_id.R @@ -61,9 +61,12 @@ is_id <- function(dt, d <- freq_table(x = dt, byvar = by, freq_var_name = "copies") - d |> - fsubset(copies > 1) |> - print() + + if (verbose) { + d |> + fsubset(copies > 1) |> + print() + } if (verbose) cli::cli_rule(right = "End of {.field is_id()} report") return(invisible(d)) diff --git a/R/possible_ids.R b/R/possible_ids.R index 2690c668..bc8ceb96 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -425,9 +425,9 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { #collapse::fnrow faster? - if (nrow(all_ids) > n_rows) { + if (fnrow(all_ids) > n_rows) { # Randomly sample the unique combinations - all_ids <- all_ids[sample(nrow(all_ids), + all_ids <- all_ids[sample(fnrow(all_ids), n_rows), ] } From eafc27fbc4a61f74336fdbb9e64e0f64e7e6bbad Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 30 Oct 2024 14:01:22 -0400 Subject: [PATCH 74/77] fix error in vignette --- man/possible_ids.Rd | 1 + vignettes/aux-functions.Rmd | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 05866bb8..cf6a09a5 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -6,6 +6,7 @@ \usage{ possible_ids( dt, + vars = NULL, exclude = NULL, include = NULL, exclude_classes = NULL, diff --git a/vignettes/aux-functions.Rmd b/vignettes/aux-functions.Rmd index 2eab35de..a668bb74 100644 --- a/vignettes/aux-functions.Rmd +++ b/vignettes/aux-functions.Rmd @@ -68,10 +68,10 @@ possible_ids(dt = x1, possible_ids(dt = x1, exclude = "_character") -# Identify possible unique identifiers, excluding character variables but considering variable z +# Identify possible unique identifiers, excluding character variables but considering variable c1 possible_ids(dt = x1, - exclude = "_character", - include = "z") + exclude_classes = "character", + include = "c1") ``` From 4d790203c9b769fc76c731ad790441b8efe5ffec Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 30 Oct 2024 16:11:14 -0400 Subject: [PATCH 75/77] clean code: remove old possible ids --- R/possible_ids.R | 203 ----------------------------- man/possible_ids_old.Rd | 41 ------ tests/testthat/test-possible_ids.R | 1 - 3 files changed, 245 deletions(-) delete mode 100644 man/possible_ids_old.Rd diff --git a/R/possible_ids.R b/R/possible_ids.R index b1a513f1..1fdbbe52 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -444,206 +444,3 @@ create_ids <- function(n_rows, n_ids, prefix = "id") { } - - - - - -#' Find possible unique identifies of data frame -#' -#' Identify possible variables uniquely identifying x -#' -#' @param dt data frame -#' @param exclude character: Exclude variables to be selected as identifiers. It -#' could be either the name of the variables of one type of the variable -#' prefixed by "_". For instance, "_numeric" or "_character". -#' @param include character: Name of variable to be included, that might belong -#' to the group excluded in the `exclude` -#' @param verbose logical: If FALSE no message will be displayed. Default is -#' TRUE -#' -#' @return list with possible identifiers -#' @keywords internal -#' -#' @examples -#' library(data.table) -#' x4 = data.table(id1 = c(1, 1, 2, 3, 3), -#' id2 = c(1, 1, 2, 3, 4), -#' t = c(1L, 2L, 1L, 2L, NA_integer_), -#' x = c(16, 12, NA, NA, 15)) -#' possible_ids_old(x4) -possible_ids_old <- function(dt, - exclude = NULL, - include = NULL, - verbose = getOption("possible_ids.verbose")) { - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Check inputs --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - if (!is.data.frame(dt)) { - stop("data must be a data frame") - } - - if (is.data.table(dt)) { - dt <- as.data.frame(dt) - } - - - if (is.null(exclude) && !is.null(include)) { - if (verbose) { - cli::cli_alert_warning("Since {.code exclude} is NULL, {.code include} - directive does not make sense. Ignored.", - wrap = TRUE) - } - warning("inconsistent use of `include`") - } - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ## exclude variables from check ------ - - vars <- names(dt) - - ### Exclude variable according to their type --------- - if (!is.null(include)) { - - # Find position of variable to include - ii <- which(names(dt) %in% include) - - } else { - - ii <- NULL - - } - - ### Exclude variable by name --------- - if (!is.null(exclude)) { - - if (any(grepl("^_", exclude))) { - - type_ex <- exclude[grepl("^_", exclude)] - vars_ex <- exclude[!grepl("^_", exclude)] - - type_ex <- match.arg(type_ex, c("_character", "_numeric")) - - # find variable that meet criteria and exclude them, making sure to include - # the variables of the user. - ex <- gsub("^_", "", type_ex) - FUN <- paste0("is.", ex) - - n_cols <- unlist(lapply(dt, FUN)) - n_cols[ii] <- FALSE - - # Exclude variables by name - - if (length(vars_ex) > 0) { - ex <-which(names(dt) %in% vars_ex) - n_cols[ex] <- TRUE - } - - vars <- names(dt)[!n_cols] - - } else { - vars <- vars[!(vars %in% exclude)] - - if (identical(vars, names(dt))) { - if (verbose) { - cli::cli_alert_warning("Variable {.field {exclude}} is not available in data frame. - Nothing is excluded.", wrap = TRUE) - } - - warning("inconsistenty use of `exclude`") - - } - - } - } - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ## check all names are unieuq -------- - dup_var <- duplicated(vars) - - if (any(dup_var)) { - - dvars <- vars[dup_var] - - msg <- "column names must be unique" - hint <- "try changing the names using {.fun make.names}" - problem <- "{.var {dvars}} {?is/are} duplicated" - cli::cli_abort(c( - msg, - i = hint, - x = problem - )) - - } - - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Find duplicates --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - duplicates <- is_id(dt, by = vars, verbose = FALSE) - if (duplicates) { - if (verbose) { - cli::cli_alert_success("There are no duplicates in data frame") - } - } else { - if (verbose) { - cli::cli_alert_warning("Data has duplicates. returning NULL") - } - is_id(dt, by = vars, verbose = TRUE) - return(NULL) - } - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Find ids --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nvars <- length(vars) - - found <- FALSE - i = 0 - while(i < nvars && found == FALSE) { - i = i + 1 - cm <- utils::combn(vars, m = i) - - lcm <- dim(cm)[2] # number of combinations of size j - - selected_vars <- vector(length = lcm) - for (j in 1:lcm) { - tvars <- cm[, j] # testing vars - selected_vars[j] <- is_id(dt, by = tvars, verbose = FALSE) - } - - sv <- which(selected_vars) - - if (length(sv) > 0) { - - if (length(sv) == 1 && i > 1) { - - lv <- list(V1 = cm[, sv]) - - } else if (i == 1) { - - ee <- as.data.frame(t(cm[, sv])) - lv <- lapply(ee, unique) - - } else { - - ee <- as.data.frame(cm[, sv]) - lv <- lapply(ee, unique) - - } - - found <- TRUE - - } - } - - if (verbose) { - cli::cli_alert("we found {length(lv)} possible id{?s}") - } - - return(lv) - -} diff --git a/man/possible_ids_old.Rd b/man/possible_ids_old.Rd deleted file mode 100644 index 11043343..00000000 --- a/man/possible_ids_old.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/possible_ids.R -\name{possible_ids_old} -\alias{possible_ids_old} -\title{Find possible unique identifies of data frame} -\usage{ -possible_ids_old( - dt, - exclude = NULL, - include = NULL, - verbose = getOption("possible_ids.verbose") -) -} -\arguments{ -\item{dt}{data frame} - -\item{exclude}{character: Exclude variables to be selected as identifiers. It -could be either the name of the variables of one type of the variable -prefixed by "_". For instance, "_numeric" or "_character".} - -\item{include}{character: Name of variable to be included, that might belong -to the group excluded in the \code{exclude}} - -\item{verbose}{logical: If FALSE no message will be displayed. Default is -TRUE} -} -\value{ -list with possible identifiers -} -\description{ -Identify possible variables uniquely identifying x -} -\examples{ -library(data.table) -x4 = data.table(id1 = c(1, 1, 2, 3, 3), - id2 = c(1, 1, 2, 3, 4), - t = c(1L, 2L, 1L, 2L, NA_integer_), - x = c(16, 12, NA, NA, 15)) -possible_ids_old(x4) -} -\keyword{internal} diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index 3f1b8ade..4d9041ec 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -722,7 +722,6 @@ possible_ids( possible_ids( dt = dt_large, exclude_classes = c("numeric"), - exclude_classes = c("numeric"), exclude = "id", verbose = TRUE ) From 1b1b375addd551bb45e80586ec3293a25a884a7d Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Wed, 30 Oct 2024 17:39:06 -0400 Subject: [PATCH 76/77] fix warning --- R/possible_ids.R | 3 ++- man/possible_ids.Rd | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index 1fdbbe52..b53c3d9a 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -4,11 +4,12 @@ #' #' @param dt data frame #' @param vars character: A vector of variable names to consider for identifying unique combinations. +#' @param exclude character: Names of variables to exclude from analysis #' @param include character: Name of variable to be included, that might belong #' to the group excluded in the `exclude` #' @param exclude_classes character: classes to exclude from analysis (e.g., #' "numeric", "integer", "date") -#' @param include_classes character: classes to include from analysis (e.g., +#' @param include_classes character: classes to include in the analysis (e.g., #' "numeric", "integer", "date") #' @param min_combination_size numeric: Min number of combinations. Default is #' 1, so all combinations. diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 7a1bdac8..484e261c 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -24,13 +24,15 @@ possible_ids( \item{vars}{character: A vector of variable names to consider for identifying unique combinations.} +\item{exclude}{character: Names of variables to exclude from analysis} + \item{include}{character: Name of variable to be included, that might belong to the group excluded in the \code{exclude}} \item{exclude_classes}{character: classes to exclude from analysis (e.g., "numeric", "integer", "date")} -\item{include_classes}{character: classes to include from analysis (e.g., +\item{include_classes}{character: classes to include in the analysis (e.g., "numeric", "integer", "date")} \item{verbose}{logical: If FALSE no message will be displayed. Default is From 23d8eef71f86a65ade08629b1ab4ad3fa1fe5127 Mon Sep 17 00:00:00 2001 From: RossanaTat Date: Thu, 31 Oct 2024 10:23:22 -0400 Subject: [PATCH 77/77] addressing warnings and notes --- R/possible_ids.R | 2 +- R/zzz.R | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/R/possible_ids.R b/R/possible_ids.R index b53c3d9a..faec30b3 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -204,7 +204,7 @@ possible_ids <- function(dt, # or break } - combos <- combn(vars, comb_size, simplify = FALSE) + combos <- utils::combn(vars, comb_size, simplify = FALSE) # Prune combinations where the product of unique counts is less # than n_rows diff --git a/R/zzz.R b/R/zzz.R index d6caf669..dedbe614 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -139,6 +139,13 @@ set_joyn_options <- function(..., } +# ------------------------------------ +# -- Define global variables -- +# ------------------------------------ + +utils::globalVariables(c("..byvar", + "..vars")) + # ------------------------------------------------------------------------------------------ # Define custom .strong {cli} classes to emphasize messages subcomponents # --- to be used when creating/storing {joyn} messages