diff --git a/R/dplyr-joins.R b/R/dplyr-joins.R index 73ca2fab..abcd1c2b 100644 --- a/R/dplyr-joins.R +++ b/R/dplyr-joins.R @@ -59,146 +59,54 @@ left_join <- function( verbose = getOption("joyn.verbose"), ... ) { + clear_joynenv() # Argument checks --------------------------------- - if (is.null(by)) { - by <- intersect( - names(x), - names(y) - ) - } - if (copy == TRUE) { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = "\nargument", - bolded_pale = " copy = TRUE", - pale = "\nis not active in this version of", - bolded_pale = " joyn" - ) - } - if (is.null(suffix) || !length(suffix) == 2 || !is.character(suffix)) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: argument `suffix` must be character vector of length 2" - ) - ) - - } - if (!is.null(keep) & !is.logical(keep)) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: argument `keep` should be one of NULL, TRUE, or FALSE" - ) - ) - } - if (is.null(keep)) { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn,"\nWarning:"), - pale = " joyn does not currently allow inequality joins, so", - bolded_pale = " keep = NULL", - pale = " will retain only keys in x" - ) - keep <- FALSE - } - - na_matches <- match.arg(na_matches) - multiple <- match.arg( - multiple, - choices = c( - "all", - "any", - "first", - "last" - ) - ) - - if (multiple == "any") { - multiple <- "first" - } - - unmatched <- match.arg( - unmatched, - choices = c( - "drop", - "error" - ) - ) - - if (is.null(relationship)) {relationship <- "one-to-one"} - - relationship <- switch( - relationship, - "one-to-one" = "1:1", - "one-to-many" = "1:m", - "many-to-one" = "m:1", - "many-to-many" = "m:m" - ) - if ( - relationship %in% c("1:m", "m:m") & - !multiple == "all" - ) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: if `relationship` is 1:m or m:m then `multiple` should be 'all' " - ) - ) - } - na_matches <- match.arg( - na_matches, - choices = c( - "na", - "never" - ) - ) - if (na_matches == "never") { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " Currently, joyn allows only", - bolded_pale = " na_matches = 'na'" - ) - } - if (is.null(reportvar) || isFALSE(reportvar)) { - dropreport <- TRUE - reportvar <- getOption("joyn.reportvar") - } else{ - dropreport <- FALSE - } + x <- copy(x) + y <- copy(y) + na_matches <- match.arg(na_matches, + choices = c("na","never")) + multiple <- match.arg(multiple, + choices = c("all", + "any", + "first", + "last")) + unmatched <- match.arg(unmatched, + choices = c("drop", + "error")) + + args_check <- arguments_checks(x = x, + y = y, + by = by, + copy = copy, + keep = keep, + suffix = suffix, + na_matches = na_matches, + multiple = multiple, + relationship = relationship, + reportvar = reportvar) + by <- args_check$by + keep <- args_check$keep + na_matches <- args_check$na_matches + multiple <- args_check$multiple + relationship <- args_check$relationship + reportvar <- args_check$reportvar + dropreport <- args_check$dropreport # Column names ----------------------------------- - #xnames <- names(x) - #ynames <- names(y) if (keep == TRUE) { - - x_1 <- copy(x) - y_1 <- copy(y) - - if (length(grep(pattern = "==?", x = by, value = TRUE)) != 0) { - by_y_names <- fix_by_vars(by = by, x_1, y_1)$yby - } else { - by_y_names <- fix_by_vars(by = by, x_1, y_1)$by - } - - ykeys <- y |> - fselect(by_y_names) - names(ykeys) <- paste0(names(ykeys), suffix[2]) - y <- cbind( - ykeys, - y - ) + jn_type <- "left" + modified_cols <- set_col_names(x = x, + y = y, + by = by, + jn_type = jn_type, + suffix = suffix) + x <- modified_cols$x + y <- modified_cols$y } - - # left join checks -------------------------------- - - - # Do left join ------------------------------------ + # Execute left join------------------------------------ lj <- joyn( x = x, y = y, @@ -211,43 +119,21 @@ left_join <- function( update_NAs = update_NAs, reportvar = reportvar, reporttype = reporttype, - keep_common_vars = T, + keep_common_vars = TRUE, sort = sort, verbose = verbose, ... ) - - # Do filter --------------------------------------- - - ### unmatched == "error" + # Unmatched Keys --------------------------------------- if (unmatched == "error") { - if (any( - lj[ - , - get(names(lj)[length(lj)]) - ] == "x" - ) | - any( - lj[ - , - get(names(lj)[length(lj)]) - ] == 1 - ) - ) { - - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: some rows in `y` are not matched - this check is due to - argument `unmatched = 'error'` " - ) - ) - - } - + check_unmatched_keys(x = x, + y = y, + out = lj, + by = by, + jn_type = "left") } - ### if dropreport = T + # Should report be kept--------------------------------- if (dropreport == T) { get_vars(lj, reportvar) <- NULL } @@ -256,7 +142,9 @@ left_join <- function( lj } - +#------------------------------------------------------------------------------- +# RIGHT JOIN -------------------------------------------------------------------- +#------------------------------------------------------------------------------- #' Right join two data frames #' @@ -318,136 +206,50 @@ right_join <- function( clear_joynenv() # Argument checks --------------------------------- - if (is.null(by)) { - by <- intersect( - names(x), - names(y) - ) - } - if (copy == TRUE) { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " argument", - bolded_pale = " copy = TRUE", - pale = " is not active in this version of joyn" - ) - } - if (is.null(suffix) || !length(suffix) == 2 || !is.character(suffix)) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: argument `suffix` must be character vector of length 2" - ) - ) - - } - if (!is.null(keep) & !is.logical(keep)) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: argument `keep` should be one of NULL, TRUE, or FALSE" - ) - ) - } - if (is.null(keep)) { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " joyn does not currently allow inequality joins, so", - bolded_pale = " keep = NULL", - pale = " will retain only keys in x" - ) - keep <- FALSE - } - - na_matches <- match.arg(na_matches) - multiple <- match.arg( - multiple, - choices = c( - "all", - "any", - "first", - "last" - ) - ) - if (multiple == "any") { - multiple <- "first" - } - unmatched <- match.arg( - unmatched, - choices = c( - "drop", - "error" - ) - ) - if (is.null(relationship)) {relationship <- "one-to-one"} - relationship <- switch( - relationship, - "one-to-one" = "1:1", - "one-to-many" = "1:m", - "many-to-one" = "m:1", - "many-to-many" = "m:m" - ) - if ( - relationship %in% c("1:m", "m:m") & - !multiple == "all" - ) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: if `relationship` is 1:m or m:m then `multiple` should be 'all' " - ) - ) - } - na_matches <- match.arg( - na_matches, - choices = c( - "na", - "never" - ) - ) - if (na_matches == "never") { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " Currently, joyn allows only", - bolded_pale = " na_matches = 'na'" - ) - } - if (is.null(reportvar) || isFALSE(reportvar)) { - dropreport <- TRUE - reportvar <- getOption("joyn.reportvar") - } else{ - dropreport <- FALSE - } + x <- copy(x) + y <- copy(y) + na_matches <- match.arg(na_matches, + choices = c("na","never")) + multiple <- match.arg(multiple, + choices = c("all", + "any", + "first", + "last")) + unmatched <- match.arg(unmatched, + choices = c("drop", + "error")) + + args_check <- arguments_checks(x = x, + y = y, + by = by, + copy = copy, + keep = keep, + suffix = suffix, + na_matches = na_matches, + multiple = multiple, + relationship = relationship, + reportvar = reportvar) + by <- args_check$by + keep <- args_check$keep + na_matches <- args_check$na_matches + multiple <- args_check$multiple + relationship <- args_check$relationship + reportvar <- args_check$reportvar + dropreport <- args_check$dropreport # Column names ----------------------------------- if (keep == TRUE) { - - x_1 <- copy(x) - y_1 <- copy(y) - - if (length(grep(pattern = "==?", x = by, value = TRUE)) != 0) { - by_x_names <- fix_by_vars(by = by, x_1, y_1)$xby - } else { - by_x_names <- fix_by_vars(by = by, x_1, y_1)$by - } - - xkeys <- x |> - fselect(by_x_names) - names(xkeys) <- paste0(names(xkeys), suffix[1]) - x <- cbind( - xkeys, - x - ) + jn_type <- "right" + modified_cols <- set_col_names(x = x, + y = y, + by = by, + jn_type = jn_type, + suffix = suffix) + x <- modified_cols$x + y <- modified_cols$y } - - # right join checks -------------------------------- - - - # Do right join ------------------------------------ + # Execute right join ------------------------------------ rj <- joyn( x = x, y = y, @@ -460,43 +262,22 @@ right_join <- function( update_NAs = update_NAs, reportvar = reportvar, reporttype = reporttype, - keep_common_vars = T, + keep_common_vars = TRUE, sort = sort, verbose = verbose, ... ) - - # Do filter --------------------------------------- - - ### unmatched == "error" + # Unmatched Keys --------------------------------------- if (unmatched == "error") { - if (any( - rj[ - , - get(names(rj)[length(rj)]) - ] == "y" - ) | - any( - rj[ - , - get(names(rj)[length(rj)]) - ] == 2 - ) - ) { - - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: some rows in `y` are not matched - this check is due to - argument `unmatched = 'error'` " - ) - ) - - } - + check_unmatched_keys(x = x, + y = y, + out = rj, + by = by, + jn_type = "right") } - ### if dropreport = T + + # Should reportvar be kept if (dropreport == T) { get_vars(rj, reportvar) <- NULL } @@ -507,18 +288,11 @@ right_join <- function( } - - - - #------------------------------------------------------------------------------- # FULL JOIN -------------------------------------------------------------------- #------------------------------------------------------------------------------- - - - #' Full join two data frames #' #' This is a `joyn` wrapper that works in a similar @@ -575,136 +349,55 @@ full_join <- function( verbose = getOption("joyn.verbose"), ... ) { + clear_joynenv() # Argument checks --------------------------------- - if (is.null(by)) { - by <- intersect( - names(x), - names(y) - ) - } - if (copy == TRUE) { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " argument", - bolded_pale = " copy = TRUE", - pale = " is not active in this version of joyn" - ) - } - if (is.null(suffix) || !length(suffix) == 2 || !is.character(suffix)) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: argument `suffix` must be character vector of length 2" - ) - ) - - } - if (!is.null(keep) & !is.logical(keep)) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: argument `keep` should be one of NULL, TRUE, or FALSE" - ) - ) - } - if (is.null(keep)) { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " joyn does not currently allow inequality joins, so", - bolded_pale = " keep = NULL", - pale = " will retain only keys in x. Equivalent to `keep = FALSE`" - ) - keep <- FALSE - } - - na_matches <- match.arg(na_matches) - multiple <- match.arg( - multiple, - choices = c( - "all", - "any", - "first", - "last" - ) - ) - if (multiple == "any") { - multiple <- "first" - } - unmatched <- match.arg( - unmatched, - choices = c( - "drop", - "error" - ) - ) - if (is.null(relationship)) {relationship <- "one-to-one"} - relationship <- switch( - relationship, - "one-to-one" = "1:1", - "one-to-many" = "1:m", - "many-to-one" = "m:1", - "many-to-many" = "m:m" - ) - if ( - relationship %in% c("1:m", "m:m") & - !multiple == "all" - ) { - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: if `relationship` is 1:m or m:m then `multiple` should be 'all' " - ) - ) - } - na_matches <- match.arg( - na_matches, - choices = c( - "na", - "never" - ) - ) - if (na_matches == "never") { - store_msg( - type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " Currently, joyn allows only", - bolded_pale = "na_matches = 'na'" - ) - } - if (is.null(reportvar) || isFALSE(reportvar)) { - dropreport <- TRUE - reportvar <- getOption("joyn.reportvar") - } else{ - dropreport <- FALSE - } + x <- copy(x) + y <- copy(y) + na_matches <- match.arg(na_matches, + choices = c("na","never")) + multiple <- match.arg(multiple, + choices = c("all", + "any", + "first", + "last")) + unmatched <- match.arg(unmatched, + choices = c("drop", + "error")) + + args_check <- arguments_checks(x = x, + y = y, + by = by, + copy = copy, + keep = keep, + suffix = suffix, + na_matches = na_matches, + multiple = multiple, + relationship = relationship, + reportvar = reportvar) + by <- args_check$by + keep <- args_check$keep + na_matches <- args_check$na_matches + multiple <- args_check$multiple + relationship <- args_check$relationship + reportvar <- args_check$reportvar + dropreport <- args_check$dropreport # Column names ----------------------------------- if (keep == TRUE) { - - x_1 <- copy(x) - y_1 <- copy(y) - - if (length(grep(pattern = "==?", x = by, value = TRUE)) != 0) { - by_y_names <- fix_by_vars(by = by, x_1, y_1)$yby - } else { - by_y_names <- fix_by_vars(by = by, x_1, y_1)$by - } - - ykeys <- y |> - fselect(by_y_names) - names(ykeys) <- paste0(names(ykeys), suffix[2]) - y <- cbind( - ykeys, - y - ) + jn_type <- "full" + modified_cols <- set_col_names(x = x, + y = y, + by = by, + jn_type = jn_type, + suffix = suffix) + x <- modified_cols$x + y <- modified_cols$y } - # Do full join ------------------------------------ + # Execute full join ------------------------------------ fj <- joyn( x = x, y = y, @@ -717,43 +410,28 @@ full_join <- function( update_NAs = update_NAs, reportvar = reportvar, reporttype = reporttype, - keep_common_vars = T, + keep_common_vars = TRUE, sort = sort, verbose = verbose, ... ) - - # Do filter --------------------------------------- - - ### unmatched == "error" + # Unmatched Keys---------------------------------------- if (unmatched == "error") { - if (any( - fj[ - , - get(names(fj)[length(fj)]) - ] == "x" - ) | - any( - fj[ - , - get(names(fj)[length(fj)]) - ] == 1 - ) - ) { - - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: some rows in `y` are not matched - this check is due to - argument `unmatched = 'error'` " - ) - ) - } + # Store warning message + store_msg( + type = "warn", + warn = paste(cli::symbol$warn, "\nWarning:"), + pale = "\nargument", + bolded_pale = " warning = error", + pale = "\nis not active in this type of", + bolded_pale = " joyn" + ) } - ### if dropreport = T + + # Should reportvar be kept if (dropreport == T) { get_vars(fj, reportvar) <- NULL } @@ -764,7 +442,9 @@ full_join <- function( } - +#------------------------------------------------------------------------------- +# INNER JOIN -------------------------------------------------------------------- +#------------------------------------------------------------------------------- #' Inner join two data frames @@ -823,24 +503,125 @@ inner_join <- function( verbose = getOption("joyn.verbose"), ... ) { + clear_joynenv() # Argument checks --------------------------------- + x <- copy(x) + y <- copy(y) + na_matches <- match.arg(na_matches, + choices = c("na","never")) + multiple <- match.arg(multiple, + choices = c("all", + "any", + "first", + "last")) + unmatched <- match.arg(unmatched, + choices = c("drop", + "error")) + + args_check <- arguments_checks(x = x, + y = y, + by = by, + copy = copy, + keep = keep, + suffix = suffix, + na_matches = na_matches, + multiple = multiple, + relationship = relationship, + reportvar = reportvar) + by <- args_check$by + keep <- args_check$keep + na_matches <- args_check$na_matches + multiple <- args_check$multiple + relationship <- args_check$relationship + reportvar <- args_check$reportvar + dropreport <- args_check$dropreport + + # Column names ----------------------------------- + if (keep == TRUE) { + jn_type <- "inner" + modified_cols <- set_col_names(x = x, + y = y, + by = by, + jn_type = jn_type, + suffix = suffix) + x <- modified_cols$x + y <- modified_cols$y + } + + # Execute inner join ------------------------------------ + ij <- joyn( + x = x, + y = y, + by = by, + match_type = relationship, + keep = "inner", + y_vars_to_keep = y_vars_to_keep, + suffixes = suffix, + update_values = update_values, + update_NAs = update_NAs, + reportvar = reportvar, + reporttype = reporttype, + keep_common_vars = TRUE, + sort = sort, + verbose = verbose, + ... + ) + + # Unmatched Keys --------------------------------------- + if (unmatched == "error") { + check_unmatched_keys(x = x, + y = y, + out = ij, + by = by, + jn_type = "inner") + } + + ### if dropreport = T + if (dropreport == T) { + get_vars(ij, reportvar) <- NULL + } + + # Return + ij + +} + + +# HELPER FUNCTIONS ------------------------------------------------------------- +## Arguments checks #### + +#' Perform necessary preliminary checks on arguments that are passed to joyn +#' @param x data frame: left table +#' @param y data frame: right table +#' @param by character vector or variables to join by +#' @inheritParams left_join +#' @return list of checked arguments to pass on to the main joyn function +#' @keywords internal +arguments_checks <- function(x, y, by, copy, keep, suffix, na_matches, multiple, + relationship, reportvar) { + # Check by if (is.null(by)) { by <- intersect( names(x), names(y) ) } + + # Check copy if (copy == TRUE) { store_msg( type = "warn", warn = paste(cli::symbol$warn, "\nWarning:"), - pale = " argument", + pale = "\nargument", bolded_pale = " copy = TRUE", - pale = " is not active in this version of joyn" + pale = "\nis not active in this version of", + bolded_pale = " joyn" ) } + + # Check suffix if (is.null(suffix) || !length(suffix) == 2 || !is.character(suffix)) { cli::cli_abort( paste0( @@ -848,8 +629,9 @@ inner_join <- function( " Error: argument `suffix` must be character vector of length 2" ) ) - } + + # Check keep if (!is.null(keep) & !is.logical(keep)) { cli::cli_abort( paste0( @@ -861,7 +643,7 @@ inner_join <- function( if (is.null(keep)) { store_msg( type = "warn", - warn = paste(cli::symbol$warn, "\nWarning:"), + warn = paste(cli::symbol$warn,"\nWarning:"), pale = " joyn does not currently allow inequality joins, so", bolded_pale = " keep = NULL", pale = " will retain only keys in x" @@ -869,27 +651,14 @@ inner_join <- function( keep <- FALSE } - na_matches <- match.arg(na_matches) - multiple <- match.arg( - multiple, - choices = c( - "all", - "any", - "first", - "last" - ) - ) + # Check multiple if (multiple == "any") { multiple <- "first" } - unmatched <- match.arg( - unmatched, - choices = c( - "drop", - "error" - ) - ) + + # Check relationship if (is.null(relationship)) {relationship <- "one-to-one"} + relationship <- switch( relationship, "one-to-one" = "1:1", @@ -908,13 +677,8 @@ inner_join <- function( ) ) } - na_matches <- match.arg( - na_matches, - choices = c( - "na", - "never" - ) - ) + + # Check na_matches if (na_matches == "never") { store_msg( type = "warn", @@ -923,6 +687,8 @@ inner_join <- function( bolded_pale = " na_matches = 'na'" ) } + + # Check reportvar if (is.null(reportvar) || isFALSE(reportvar)) { dropreport <- TRUE reportvar <- getOption("joyn.reportvar") @@ -930,103 +696,238 @@ inner_join <- function( dropreport <- FALSE } - # Column names ----------------------------------- - if (keep == TRUE) { + out <- list(by = by, + copy = copy, + suffix = suffix, + keep = keep, + na_matches = na_matches, + multiple = multiple, + relationship = relationship, + reportvar = reportvar, + dropreport = dropreport) - x_1 <- copy(x) - y_1 <- copy(y) + return(out) - if (length(grep(pattern = "==?", x = by, value = TRUE)) != 0) { - by_y_names <- fix_by_vars(by = by, x_1, y_1)$yby - } else { - by_y_names <- fix_by_vars(by = by, x_1, y_1)$by - } +} - ykeys <- y |> - fselect(by_y_names) - names(ykeys) <- paste0(names(ykeys), suffix[2]) - y <- cbind( - ykeys, - y - ) - } +#' Add x key var and y key var (with suffixes) to x and y +#' -when joining by different variables and keep is true +#' @param x data table: left table +#' @param y data table: right table +#' @param by character vector of variables to join by +#' @param suffix character(2) specifying the suffixes to be used for making non-by column names unique +#' @param jn_type character specifying type of join +#' @return list containing x and y +#' @keywords internal +set_col_names <- function(x, y, by, suffix, jn_type) { - # Do inner join ------------------------------------ - fj <- joyn( - x = x, - y = y, - by = by, - match_type = relationship, - keep = "inner", - y_vars_to_keep = y_vars_to_keep, - suffixes = suffix, - update_values = update_values, - update_NAs = update_NAs, - reportvar = reportvar, - reporttype = reporttype, - keep_common_vars = T, - sort = sort, - verbose = verbose, - ... - ) + x_1 <- copy(x) + y_1 <- copy(y) + # If joining by different variables + if (length(grep(pattern = "==?", x = by, value = TRUE)) != 0) { - # Do filter --------------------------------------- + if (jn_type == "right") { + by_x_names <- fix_by_vars(by = by, x_1, y_1)$xby + } - ### unmatched == "error" - if (unmatched == "error") { - if (any( - fj[ - , - get(names(fj)[length(fj)]) - ] == "x" - ) | - any( - fj[ - , - get(names(fj)[length(fj)]) - ] == 1 - ) - ) { + else if (jn_type == "left" | jn_type == "full" | jn_type == "inner") { + by_y_names <- fix_by_vars(by = by, x_1, y_1)$yby + } - cli::cli_abort( - paste0( - cli::symbol$cross, - " Error: some rows in `y` are not matched - this check is due to - argument `unmatched = 'error'` " - ) - ) + } + # If joining by common var + else { + by_y_names <- by_x_names <- fix_by_vars(by = by, x_1, y_1)$by } - } - ### if dropreport = T - if (dropreport == T) { - get_vars(fj, reportvar) <- NULL - } + # Add key vars with suffix to x and y + if (jn_type == "right") { + xkeys <- x |> + fselect(by_x_names) + names(xkeys) <- paste0(names(xkeys), suffix[1]) + x <- cbind( + xkeys, + x + ) + } else if (jn_type == "left" | jn_type == "full" | jn_type == "inner") { - # Return - fj + ykeys <- y |> + fselect(by_y_names) + names(ykeys) <- paste0(names(ykeys), suffix[2]) + y <- cbind( + ykeys, + y + ) -} + } #close else + return(list(x = x, + y = y)) +} +#' Conduct all unmatched keys checks and return error if necessary +#' +#' @param x left table +#' @param y right table +#' @param out output from join +#' @param by character vector of keys that x and y are joined by +#' @param jn_type character: "left", "right", or "inner" +#' +#' @return error message +#' @keywords internal +check_unmatched_keys <- function(x, y, out, by, jn_type) { + + # Left table -------------------------------------------------------- + if (jn_type == "left" | jn_type == "inner") { + + use_y_input <- process_by_vector(by = by, input = "right") + use_y_out <- process_by_vector(by = by, input = "left") + + if (length(grep("==?", by, value = TRUE)) != 0) { + + if (any(use_y_out %in% colnames(y))) { + + store_msg( + type = "warn", + warn = paste(cli::symbol$warn, "\nWarning:"), + pale = "\nUnmatched = error not active for this joyn -unmatched keys are not detected" + ) + } + + else { + data.table::setnames(y, new = use_y_out, old = use_y_input) + + if (unmatched_keys(x = y, + by = use_y_out, + out = out)) { + cli::cli_abort( + paste0( + cli::symbol$cross, + " Error: some rows in `y` are not matched - this check is due to + argument `unmatched = 'error'` ") + ) + } + } + } + + else { + if (unmatched_keys(x = y, + by = use_y_out, + out = out)) { + cli::cli_abort( + paste0( + cli::symbol$cross, + " Error: some rows in `y` are not matched - this check is due to + argument `unmatched = 'error'` ") + ) + } + } + } + # Right Join -------------------------------------------------------- + if (jn_type == "right" | jn_type == "inner") { + use_x_input <- process_by_vector(by = by, input = "left") + if (unmatched_keys(x = x, + by = use_x_input, + out = out)) { + cli::cli_abort( + paste0( + cli::symbol$cross, + " Error: some rows in `x` are not matched - this check is due to + argument `unmatched = 'error'`. To drop these rows, set `unmatched = 'drop'` ") + ) + } + } + invisible(x) +} +#' Check for unmatched keys +#' +#' Gives TRUE if unmatched keys, FALSE if not. +#' +#' @param x input table to join +#' @param out output of join +#' @param by by argument, giving keys for join +#' +#' @return logical +#' @keywords internal +unmatched_keys <- function(x, out, by) { + + check <- NULL + + # Get all keys from `x` + x_keys <- x |> + fselect(by) |> + as.data.table() + + # get all key combos from `out` + out_keys <- out |> + fselect(by) |> + as.data.table() + + # check that key combos are equal + check <- (data.table::fsetdiff(x_keys, + out_keys) |> + nrow()) > 0 # if true => more unique combos in x + # false => same unique combos of keys + # same number unique keys => + # all matched keys + # because output is result + # of join + check +} +#' Process the `by` vector +#' +#' Gives as output a vector of names to be used for the specified +#' table that correspond to the `by` argument for that table +#' +#' @param by character vector: by argument for join +#' @param input character: either "left" or "right", indicating +#' whether to give the left or right side of the equals ("=") if +#' the equals is part of the `by` vector +#' +#' @return character vector +#' @keywords internal +#' +#' @examples +#' joyn:::process_by_vector(by = c("An = foo", "example"), input = "left") +process_by_vector <- function(by, input = c("left", "right")) { + input <- match.arg(input) + if (input == "left") { + out <- sapply(by, function(x) { + if (grepl("=", x)) { + trimws(gsub("([^=]+)(\\s*==?\\s*)([^=]+)", "\\1", x)) + } else { + x + } + }) + } else if (input == "right") { + out <- sapply(by, function(x) { + if (grepl("=", x)) { + trimws(gsub("([^=]+)(\\s*==?\\s*)([^=]+)", "\\3", x)) + } else { + x + } + }) + } + out |> unname() +} diff --git a/R/joyn-merge.R b/R/joyn-merge.R index bf014c44..ba1426c6 100644 --- a/R/joyn-merge.R +++ b/R/joyn-merge.R @@ -211,9 +211,9 @@ joyn <- function(x, # Initial parameters --------- #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ start_joyn <- Sys.time() - # copy objects if data.tables - x <- copy(x) - y <- copy(y) + x <- copy(x) + y <- copy(y) + ## X and Y ----------- check_xy(x,y) diff --git a/man/arguments_checks.Rd b/man/arguments_checks.Rd new file mode 100644 index 00000000..f3ea6820 --- /dev/null +++ b/man/arguments_checks.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-joins.R +\name{arguments_checks} +\alias{arguments_checks} +\title{Perform necessary preliminary checks on arguments that are passed to joyn} +\usage{ +arguments_checks( + x, + y, + by, + copy, + keep, + suffix, + na_matches, + multiple, + relationship, + reportvar +) +} +\arguments{ +\item{x}{data frame: left table} + +\item{y}{data frame: right table} + +\item{by}{character vector or variables to join by} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{keep}{Should the join keys from both \code{x} and \code{y} be preserved in the +output? +\itemize{ +\item If \code{NULL}, the default, joins on equality retain only the keys from \code{x}, +while joins on inequality retain the keys from both inputs. +\item If \code{TRUE}, all keys from both inputs are retained. +\item If \code{FALSE}, only keys from \code{x} are retained. For right and full joins, +the data in key columns corresponding to rows that only exist in \code{y} are +merged into the key columns from \code{x}. Can't be used when joining on +inequality conditions. +}} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. +Should be a character vector of length 2.} + +\item{na_matches}{Should two \code{NA} or two \code{NaN} values match? +\itemize{ +\item \code{"na"}, the default, treats two \code{NA} or two \code{NaN} values as equal, like +\code{\%in\%}, \code{\link[=match]{match()}}, and \code{\link[=merge]{merge()}}. +\item \code{"never"} treats two \code{NA} or two \code{NaN} values as different, and will +never match them together or to any other values. This is similar to joins +for database sources and to \code{base::merge(incomparables = NA)}. +}} + +\item{multiple}{Handling of rows in \code{x} with multiple matches in \code{y}. +For each row of \code{x}: +\itemize{ +\item \code{"all"}, the default, returns every match detected in \code{y}. This is the +same behavior as SQL. +\item \code{"any"} returns one match detected in \code{y}, with no guarantees on which +match will be returned. It is often faster than \code{"first"} and \code{"last"} +if you just need to detect if there is at least one match. +\item \code{"first"} returns the first match detected in \code{y}. +\item \code{"last"} returns the last match detected in \code{y}. +}} + +\item{relationship}{Handling of the expected relationship between the keys of +\code{x} and \code{y}. If the expectations chosen from the list below are +invalidated, an error is thrown. +\itemize{ +\item \code{NULL}, the default, doesn't expect there to be any relationship between +\code{x} and \code{y}. However, for equality joins it will check for a many-to-many +relationship (which is typically unexpected) and will warn if one occurs, +encouraging you to either take a closer look at your inputs or make this +relationship explicit by specifying \code{"many-to-many"}. + +See the \emph{Many-to-many relationships} section for more details. +\item \code{"one-to-one"} expects: +\itemize{ +\item Each row in \code{x} matches at most 1 row in \code{y}. +\item Each row in \code{y} matches at most 1 row in \code{x}. +} +\item \code{"one-to-many"} expects: +\itemize{ +\item Each row in \code{y} matches at most 1 row in \code{x}. +} +\item \code{"many-to-one"} expects: +\itemize{ +\item Each row in \code{x} matches at most 1 row in \code{y}. +} +\item \code{"many-to-many"} doesn't perform any relationship checks, but is provided +to allow you to be explicit about this relationship if you know it +exists. +} + +\code{relationship} doesn't handle cases where there are zero matches. For that, +see \code{unmatched}.} + +\item{reportvar}{character: Name of reporting variable. Default is ".joyn". +This is the same as variable "_merge" in Stata after performing a merge. If +FALSE or NULL, the reporting variable will be excluded from the final +table, though a summary of the join will be display after concluding.} +} +\value{ +list of checked arguments to pass on to the main joyn function +} +\description{ +Perform necessary preliminary checks on arguments that are passed to joyn +} +\keyword{internal} diff --git a/man/check_unmatched_keys.Rd b/man/check_unmatched_keys.Rd new file mode 100644 index 00000000..ba0528fb --- /dev/null +++ b/man/check_unmatched_keys.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-joins.R +\name{check_unmatched_keys} +\alias{check_unmatched_keys} +\title{Conduct all unmatched keys checks and return error if necessary} +\usage{ +check_unmatched_keys(x, y, out, by, jn_type) +} +\arguments{ +\item{x}{left table} + +\item{y}{right table} + +\item{out}{output from join} + +\item{by}{character vector of keys that x and y are joined by} + +\item{jn_type}{character: "left", "right", or "inner"} +} +\value{ +error message +} +\description{ +Conduct all unmatched keys checks and return error if necessary +} +\keyword{internal} diff --git a/man/joyn-package.Rd b/man/joyn-package.Rd index 1bc98aa3..e10802cc 100644 --- a/man/joyn-package.Rd +++ b/man/joyn-package.Rd @@ -3,6 +3,7 @@ \docType{package} \name{joyn-package} \alias{joyn-package} +\alias{_PACKAGE} \title{joyn: Tool for Diagnosis of Tables Joins and Complementary Join Features} \description{ Tool for diagnosing table joins. It combines the speed of `collapse` and `data.table`, the flexibility of `dplyr`, and the diagnosis and features of the `merge` command in `Stata`. diff --git a/man/merge.Rd b/man/merge.Rd index 7f748784..fa33ff7c 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -35,19 +35,17 @@ If \code{y} has no key columns, this defaults to the key of \code{x}.} \item{all}{logical; \code{all = TRUE} is shorthand to save setting both \code{all.x = TRUE} and \code{all.y = TRUE}.} -\item{all.x}{logical; if \code{TRUE}, rows from \code{x} which have no matching row -in \code{y} are included. These rows will have 'NA's in the columns that are usually -filled with values from \code{y}. The default is \code{FALSE} so that only rows with -data from both \code{x} and \code{y} are included in the output.} +\item{all.x}{logical; if \code{TRUE}, then extra rows will be added to the +output, one for each row in \code{x} that has no matching row in \code{y}. +These rows will have 'NA's in those columns that are usually filled with values +from \code{y}. The default is \code{FALSE}, so that only rows with data from both +\code{x} and \code{y} are included in the output.} \item{all.y}{logical; analogous to \code{all.x} above.} -\item{sort}{logical. If \code{TRUE} (default), the rows of the merged -\code{data.table} are sorted by setting the key to the \code{by / by.x} columns. If -\code{FALSE}, unlike base R's \code{merge} for which row order is unspecified, the -row order in \code{x} is retained (including retaining the position of missings when -\code{all.x=TRUE}), followed by \code{y} rows that don't match \code{x} (when \code{all.y=TRUE}) -retaining the order those appear in \code{y}.} +\item{sort}{logical. If \code{TRUE} (default), the merged \code{data.table} is +sorted by setting the key to the \code{by / by.x} columns. If \code{FALSE}, the +result is not sorted.} \item{suffixes}{A \code{character(2)} specifying the suffixes to be used for making non-\code{by} column names unique. The suffix behaviour works in a similar diff --git a/man/process_by_vector.Rd b/man/process_by_vector.Rd new file mode 100644 index 00000000..78a3b707 --- /dev/null +++ b/man/process_by_vector.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-joins.R +\name{process_by_vector} +\alias{process_by_vector} +\title{Process the \code{by} vector} +\usage{ +process_by_vector(by, input = c("left", "right")) +} +\arguments{ +\item{by}{character vector: by argument for join} + +\item{input}{character: either "left" or "right", indicating +whether to give the left or right side of the equals ("=") if +the equals is part of the \code{by} vector} +} +\value{ +character vector +} +\description{ +Gives as output a vector of names to be used for the specified +table that correspond to the \code{by} argument for that table +} +\examples{ +joyn:::process_by_vector(by = c("An = foo", "example"), input = "left") +} +\keyword{internal} diff --git a/man/set_col_names.Rd b/man/set_col_names.Rd new file mode 100644 index 00000000..997fff56 --- /dev/null +++ b/man/set_col_names.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-joins.R +\name{set_col_names} +\alias{set_col_names} +\title{Add x key var and y key var (with suffixes) to x and y +-when joining by different variables and keep is true} +\usage{ +set_col_names(x, y, by, suffix, jn_type) +} +\arguments{ +\item{x}{data table: left table} + +\item{y}{data table: right table} + +\item{by}{character vector of variables to join by} + +\item{suffix}{character(2) specifying the suffixes to be used for making non-by column names unique} + +\item{jn_type}{character specifying type of join} +} +\value{ +list containing x and y +} +\description{ +Add x key var and y key var (with suffixes) to x and y +-when joining by different variables and keep is true +} +\keyword{internal} diff --git a/man/unmatched_keys.Rd b/man/unmatched_keys.Rd new file mode 100644 index 00000000..198489cd --- /dev/null +++ b/man/unmatched_keys.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-joins.R +\name{unmatched_keys} +\alias{unmatched_keys} +\title{Check for unmatched keys} +\usage{ +unmatched_keys(x, out, by) +} +\arguments{ +\item{x}{input table to join} + +\item{out}{output of join} + +\item{by}{by argument, giving keys for join} +} +\value{ +logical +} +\description{ +Gives TRUE if unmatched keys, FALSE if not. +} +\keyword{internal} diff --git a/tests/testthat/test-dplyr-joins.R b/tests/testthat/test-dplyr-joins.R index 74e2b18e..64ae06b7 100644 --- a/tests/testthat/test-dplyr-joins.R +++ b/tests/testthat/test-dplyr-joins.R @@ -4,39 +4,39 @@ withr::local_options(joyn.verbose = FALSE) # TEST DATA -------------------------------------------------------------------- #------------------------------------------------------------------------------- # options(joyn.verbose = FALSE) -x1 = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_), +x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_), t = c(1L, 2L, 1L, 2L, NA_integer_), x = 11:15) -y1 = data.frame(id = c(1,2, 4), +y1 = data.table(id = c(1,2, 4), y = c(11L, 15L, 16)) -x2 = data.frame(id = c(1, 4, 2, 3, NA), +x2 = data.table(id = c(1, 4, 2, 3, NA), t = c(1L, 2L, 1L, 2L, NA_integer_), x = c(16, 12, NA, NA, 15)) -y2 = data.frame(id = c(1, 2, 5, 6, 3), +y2 = data.table(id = c(1, 2, 5, 6, 3), yd = c(1, 2, 5, 6, 3), y = c(11L, 15L, 20L, 13L, 10L), x = c(16:20)) -y3 <- data.frame(id = c("c","b", "c", "a"), +y3 <- data.table(id = c("c","b", "c", "a"), y = c(11L, 15L, 18L, 20L)) -x3 <- data.frame(id = c("c","b", "d"), +x3 <- data.table(id = c("c","b", "d"), v = 8:10, foo = c(4,2, 7)) -x4 = data.frame(id1 = c(1, 1, 2, 3, 3), +x4 = data.table(id1 = c(1, 1, 2, 3, 3), id2 = c(1, 1, 2, 3, 4), t = c(1L, 2L, 1L, 2L, NA_integer_), x = c(16, 12, NA, NA, 15)) -y4 = data.frame(id = c(1, 2, 5, 6, 3), +y4 = data.table(id = c(1, 2, 5, 6, 3), id2 = c(1, 1, 2, 3, 4), y = c(11L, 15L, 20L, 13L, 10L), x = c(16:20)) -x5 = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_, NA_integer_), +x5 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_, NA_integer_), t = c(1L, 2L, 1L, 2L, NA_integer_, 4L), x = 11:16) -y5 = data.frame(id = c(1,2, 4, NA_integer_, NA_integer_), +y5 = data.table(id = c(1,2, 4, NA_integer_, NA_integer_), y = c(11L, 15L, 16, 17L, 18L)) reportvar = getOption("joyn.reportvar") #------------------------------------------------------------------------------- @@ -141,13 +141,14 @@ test_that("LEFT JOIN - Conducts left join", { ) attr(jn_dplyr, "sorted") <- "id" + attr(jn, "sorted") <- "id" jn_dplyr <- roworder(jn_dplyr, "id", na.last = FALSE) rownames(jn) <- c(1:length(x1$id)) expect_equal( - jn |> fselect(-get(reportvar)), - jn_dplyr, + jn |> fselect(-get(reportvar)) |> as.data.frame(), + jn_dplyr |> as.data.frame(), ignore_attr = ".internal.selfref" ) @@ -242,6 +243,7 @@ test_that("LEFT JOIN - incorrectly specified arguments give errors", { left_join( x = x1, y = y1, + by = "id", relationship = "many-to-one", unmatched = "error" ) @@ -378,13 +380,49 @@ test_that("LEFT JOIN - reportvar works", { test_that("LEFT JOIN - unmatched throws error", { - expect_error( - left_join(x = x1, - y = y1, - relationship = "many-to-one", - by = "id", - unmatched = "error") - ) + left_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id2", + unmatched = "error") |> + expect_no_error() + + left_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id1=id2", + unmatched = "error") |> + expect_error() + + left_join(x = x1, + y = y1, + relationship = "many-to-one", + by = "id", + unmatched = "error") |> + expect_error() + + left_join(x = x4, + y = y4, + relationship = "many-to-many", + by = c("id2", "x"), + unmatched = "error") |> + expect_error() + + left_join(x = x4, + y = y4, + relationship = "many-to-many", + by = c("id2=id"), + unmatched = "error") |> + expect_no_error() + + left_join(x = x4, + y = y4, + relationship = "many-to-many", + by = c("id1=id2", "id2=id"), + unmatched = "error") |> + expect_no_error() + + }) @@ -406,6 +444,7 @@ test_that("LEFT JOIN - NA matches", { }) # TEST RIGHT JOINS ------------------------------------------------------ + test_that("RIGHT JOIN - Conducts right join", { # One way jn_joyn <- right_join( @@ -685,8 +724,48 @@ test_that("RIGHT JOIN - NA matches", { expect_contains("warn") }) +test_that("RIGHT JOIN - unmatched error", { + + right_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id2", + unmatched = "error") |> + expect_no_error() + + right_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id1=id2", + unmatched = "error") |> + expect_no_error() + + right_join(x = x1, + y = y1, + relationship = "many-to-one", + by = "id", + unmatched = "error") |> + expect_error() + + right_join(x = x4, + y = y4, + relationship = "one-to-one", + by = c("id2", "x"), + unmatched = "error") |> + expect_error() + + right_join(x = x4, + y = y4, + relationship = "many-to-one", + by = c("id1=id2", "id2=id"), + unmatched = "error") |> + expect_error() + + +}) + #------------------------------------------------------------------------------- -# TEST FULL JOINS ------------------------------------------------------------- +# TEST FULL JOINS -------------------------------------------------------------- #------------------------------------------------------------------------------- @@ -875,7 +954,7 @@ test_that("FULL JOIN - (correctly) incorrectly specified arguments give (no) err ) ) - expect_error( + expect_no_error( full_join( x = x1, y = y1, @@ -985,7 +1064,7 @@ test_that("FULL JOIN - (correctly) incorrectly specified arguments give (no) err ) ) - expect_error( + expect_no_error( full_join( x = x2, y = y5, @@ -1108,6 +1187,31 @@ test_that("FULL JOIN - NA matches", { expect_contains("warn") }) +test_that("FULL JOIN -unmatched error", { + + full_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id2", + unmatched = "error") |> + expect_no_error() + + full_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id1=id2", + unmatched = "error") |> + expect_no_error() + + full_join(x = x1, + y = y1, + relationship = "many-to-one", + by = "id", + unmatched = "error") |> + expect_no_error() + +}) + #------------------------------------------------------------------------------- # TEST INNER JOINS ------------------------------------------------------------- @@ -1515,3 +1619,42 @@ test_that("INNER JOIN - NA matches", { rlang::env_get(.joynenv, "joyn_msgs")$type |> expect_contains("warn") }) + +test_that("INNER JOIN - unmatched error", { + inner_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id2", + unmatched = "error") |> + expect_no_error() + + inner_join(x = x4, + y = y4, + relationship = "many-to-many", + by = "id1=id2", + unmatched = "error") |> + expect_error() + + inner_join(x = x1, + y = y1, + relationship = "many-to-one", + by = "id", + unmatched = "error") |> + expect_error() + + inner_join(x = x4, + y = y4, + relationship = "one-to-one", + by = c("id2", "x"), + unmatched = "error") |> + expect_error() + + inner_join(x = x4, + y = y4, + relationship = "many-to-one", + by = c("id1=id2", "id2=id"), + unmatched = "error") |> + expect_error() + +}) +