Skip to content

Commit

Permalink
Merge pull request #64 from randrescastaneda/optimize
Browse files Browse the repository at this point in the history
Optimize
  • Loading branch information
randrescastaneda authored Jun 14, 2024
2 parents 6ca6b27 + e4dcd53 commit fc644b5
Show file tree
Hide file tree
Showing 25 changed files with 322 additions and 236 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ doc
Meta

docs
/doc/
/Meta/
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: joyn
Type: Package
Title: Tool for Diagnosis of Tables Joins and Complementary Join Features
Version: 0.2.0.9004
Version: 0.2.0.9007
Authors@R: c(person(given = "R.Andres",
family = "Castaneda",
email = "[email protected]",
Expand Down Expand Up @@ -37,10 +37,12 @@ Imports:
data.table,
cli,
utils,
collapse (>= 2.0.13),
collapse (>= 2.0.15),
lifecycle
Depends:
R (>= 2.10)
RoxygenNote: 7.3.1
Roxygen: list(markdown = TRUE)
VignetteBuilder: knitr
Remotes:
SebKrantz/collapse
8 changes: 7 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,25 @@

* Add information about duplicated obs in `by` variable when match type is `1` rathern than `m`.

* improve ineffciencies in deep copies with `m:m` joins
* improve inefficiencies in deep copies with `m:m` joins

* Replace `m:m` joins from `data.table::merge.data.table` to `collapse::join`. Thanks to @SebKrantz for the suggestion (#58).

* Add information about duplicated obs in `by` variable when match type is `1` rather than `m`.

* Internal: improve storing of joyn messages.

* Improve creation of reporting variable. Now, it is created in [collapse::join] rather than in the `joyn` function. In addition, the reporting variable is created as a factor to improve performance. Thanks to @SebKrantz for the suggestion (#58).

## breaking changes

* Now, by default, `joyn` will not sort the data. This avoids
computation that is unnecessary in most cases.
If the user wants to sort the data, they can use the `sort` argument, which triggers
the sorting mechanism of the `collapse` package.

* The report variable (named `.joyn` by default) is now a factor instead of a character. Users can still get a character variable by setting `reporttype = "character"`.

# joyn 0.2.0

* `joyn` has gained two new authors: Zander Prinsloo and Rossana Tatulli.
Expand Down
10 changes: 5 additions & 5 deletions R/dplyr-joins.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ left_join <- function(
update_values = FALSE,
update_NAs = update_values,
reportvar = getOption("joyn.reportvar"),
reporttype = c("character", "numeric"),
reporttype = c("factor", "character", "numeric"),
roll = NULL,
keep_common_vars = FALSE,
sort = TRUE,
Expand Down Expand Up @@ -207,7 +207,7 @@ right_join <- function(
update_values = FALSE,
update_NAs = update_values,
reportvar = getOption("joyn.reportvar"),
reporttype = c("character", "numeric"),
reporttype = c("factor", "character", "numeric"),
roll = NULL,
keep_common_vars = FALSE,
sort = TRUE,
Expand Down Expand Up @@ -369,7 +369,7 @@ full_join <- function(
update_values = FALSE,
update_NAs = update_values,
reportvar = getOption("joyn.reportvar"),
reporttype = c("character", "numeric"),
reporttype = c("factor", "character", "numeric"),
roll = NULL,
keep_common_vars = FALSE,
sort = TRUE,
Expand Down Expand Up @@ -527,7 +527,7 @@ inner_join <- function(
update_values = FALSE,
update_NAs = update_values,
reportvar = getOption("joyn.reportvar"),
reporttype = c("character", "numeric"),
reporttype = c("factor", "character", "numeric"),
roll = NULL,
keep_common_vars = FALSE,
sort = TRUE,
Expand Down Expand Up @@ -687,7 +687,7 @@ anti_join <- function(
relationship = "many-to-many",
y_vars_to_keep = FALSE,
reportvar = getOption("joyn.reportvar"),
reporttype = c("character", "numeric"),
reporttype = c("factor", "character", "numeric"),
roll = NULL,
keep_common_vars = FALSE,
sort = TRUE,
Expand Down
78 changes: 50 additions & 28 deletions R/freq_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if (getRversion() >= '2.15.1')
#' @param x data frame
#' @param byvar character: name of variable to tabulate. Use Standard evaluation.
#' @param digits numeric: number of decimal places to display. Default is 1.
#' @param na.rm logical: if TRUE remove NAs from calculations. Default is TRUE
#' @param na.rm logical: if TRUE, NA values are excluded from the frequencies. Default is FALSE, so NA values are reported.
#'
#' @return data.table with frequencies.
#' @export
Expand All @@ -26,36 +26,58 @@ if (getRversion() >= '2.15.1')
freq_table <- function(x,
byvar,
digits = 1,
na.rm = TRUE) {
na.rm = FALSE) {

if (!(is.data.table(x))) {
x <- as.data.table(x)
} else {
x <- data.table::copy(x)
x_name <- as.character(substitute(x))
if (!is.data.frame(x)) {
cli::cli_abort("Argument {.arg x} ({.field {x_name}}) must be a data frame")
}

fq <- qtab(x[[byvar]], na.exclude = na.rm)
ft <- data.frame(joyn = names(fq),
n = as.numeric(fq))

# Frequencies and format
d <- x[, .(n = .N), by = byvar
][, percent :=
{
total = sum(n, na.rm = na.rm)
d <- round((n/ total)*100, digits = digits)
d <- as.character(d)
d <- paste0(d, "%")
}
]

# Total row just for completeness
setorderv(d, byvar)
totd <- data.table::data.table(
tempname = "total",
n = d[, sum(n, na.rm = na.rm)],
percent = "100%"
)
N <- fsum(ft$n)
ft <- ft |>
ftransform(percent = paste0(round(n / N * 100, digits), "%"))

# add row with totals
ft <- rowbind(ft, data.table(joyn = "total",
n = N,
percent = "100%")) |>
# filter zeros
fsubset(n > 0)

setrename(ft, joyn = byvar, .nse = FALSE)
}



#' Report frequencies from attributes in report var
#'
#' @param x dataframe from [joyn_workhorse]
#' @param y dataframe from original merge ("right" or "using")
#' @param reportvar character: name given to the report variable column in the returned table
#'
#' @return dataframe with frequencies of report var
#' @keywords internal
report_from_attr <- function(x,y, reportvar) {
# from suggestion by @SebKrantz in #58
# https://github.com/randrescastaneda/joyn/issues/58
m <- attr(x, "join.match")$match

N <- fnrow(x)
nm_x <- attr(m, "N.nomatch") # Number of non-matched x values
nm_y <- fnrow(y) - attr(m, "N.distinct") # Number of non-matched y values. If multiple = FALSE attr(m, "N.distinct") = number of unique matches.


counts <- c(nm_x, nm_y, N-nm_x-nm_y, N)
report <- data.frame(
.joyn1 = c("x", "y", "x & y", "total"),
n = counts,
percent = paste0(round(counts / N * 100, 1), "%")
) |>
fsubset(n > 0)

setrename(report, .joyn1 = reportvar, .nse = FALSE)

setnames(totd, "tempname", byvar)
d <- data.table::rbindlist(list(d, totd),
use.names = TRUE)
return(d)
}
38 changes: 28 additions & 10 deletions R/info_display.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,33 +127,49 @@ store_msg <- function(type, ...) {
#' @param info A character string representing an info message to be stored. Default value is NULL
#'
#' @section How to pass the message string:
#' The function allows for the customization of the message string using {cli} classes to emphasize specific components of the message
#' The function allows for the customization of the message string using cli classes to emphasize specific components of the message
#' Here's how to format the message string:
#' *For variables: .strongVar --example: "{.strongVar {reportvar}}"
#' *For function arguments: .strongArg --example: "{.strongArg {y_vars_to_keep}}"
#' *For dt/df: .strongTable --example: "{.strongTable x}"
#' *For text/anything else: .strong --example: "reportvar is {.strong NOT} returned"
#' *For variables: .strongVar
#' *For function arguments: .strongArg
#' *For dt/df: .strongTable
#' *For text/anything else: .strong
#' *NOTE: By default, the number of seconds specified in timing messages is
#' automatically emphasized using a custom formatting approach.
#' You do not need to apply {cli} classes nor to specify that the number is in seconds.
#' --example usage: store_joyn_msg(timing =
#' paste("The full joyn is executed in", round(time_taken, 6)))
#' You do not need to apply cli classes nor to specify that the number is in seconds.
#'
#'
#'
#' @return invisible TRUE
#'
#' @examples
#' # Timing msg
#' joyn:::store_joyn_msg(timing = paste(" The entire joyn function, including checks,
#' is executed in ", round(1.8423467, 6)))
#'
#' # Error msg
#' joyn:::store_joyn_msg(err = " Input table {.strongTable x} has no columns.")
#'
#' # Info msg
#' joyn:::store_joyn_msg(info = "Joyn's report available in variable {.strongVar .joyn}")
#'
#'
#' @keywords internal
store_joyn_msg <- function(err = NULL,
warn = NULL,
timing = NULL,
info = NULL) {

# Check that only one among err, warn, timing and info is not null, otherwise stop
# Check that only one among err, warn, timing and info is not null,
# otherwise stop
#
# Formals
frm <- formals() |>
names()

cn <- c(err, warn, timing, info)

if (length(cn) != 1) {
cli::cli_abort(c("only one of err, warn, timing, info can be not null",
cli::cli_abort(c("only one of {.or {.arg {frm}}} can be not null",
"i" = "check the arguments"))
}

Expand Down Expand Up @@ -418,7 +434,9 @@ joyn_report <- function(verbose = getOption("joyn.verbose")) {

freq <- rlang::env_get(.joynenv, "freq_joyn")
if (verbose) {
cli::cli_h2("JOYn Report")
print(freq)
cli::cli_rule(right = "End of {.field JOYn} report")
}
return(invisible(freq))
}
Loading

0 comments on commit fc644b5

Please sign in to comment.