From ec8399193de7adfabb5e2a535248504c9835f320 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Thu, 22 Feb 2024 09:13:03 -0500 Subject: [PATCH 1/9] small modifications in yml files --- .github/workflows/R-CMD-check.yaml | 4 ++-- .github/workflows/test-coverage.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0c284bf9..38a50449 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master, website] + branches: [main, DEV] pull_request: - branches: [main, master, website] + branches: [main, DEV] name: R-CMD-check diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 47d0c11b..ddf8b7bd 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master, website] + branches: [main, DEV] pull_request: - branches: [main, master, website] + branches: [main, DEV] name: test-coverage From b2cd508b0abeda20682c24b6e1749c7547f32be9 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Thu, 22 Feb 2024 09:13:13 -0500 Subject: [PATCH 2/9] Increment version number to 0.1.6.9000 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bd7b7ab1..955f1cdc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: joyn Type: Package Title: Tool for Diagnosis of Tables Joins and Complementary Join Features -Version: 0.1.6 +Version: 0.1.6.9000 Authors@R: c(person(given = "R.Andres", family = "Castaneda", email = "acastanedaa@worldbank.org", diff --git a/NEWS.md b/NEWS.md index 3f9dfee0..57f68edb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# joyn (development version) + # joyn 0.1.6 # joyn 0.1.5 From df34efaf868529ac54b615fa06dc0e136d3fc584 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Thu, 22 Feb 2024 11:58:10 -0500 Subject: [PATCH 3/9] add try catch to workhorse --- R/info_display.R | 6 +- R/joyn_workhorse.R | 97 ++- _pkgdown.yml | 7 +- docs/dev/LICENSE-text.html | 13 +- docs/dev/LICENSE.html | 13 +- docs/dev/articles/index.html | 203 ++---- docs/dev/authors.html | 22 +- .../deps/bootstrap-5.3.1/bootstrap.min.css | 4 +- docs/dev/deps/bootstrap-5.3.1/font.css | 115 +++- docs/dev/index.html | 401 ++++++++---- docs/dev/news/index.html | 240 +++---- docs/dev/pkgdown.yml | 13 +- docs/dev/reference/freq_table.html | 268 +++----- docs/dev/reference/index.html | 345 +++++----- docs/dev/reference/is_id.html | 306 ++++----- docs/dev/reference/merge.html | 617 ++++++++---------- docs/dev/reference/possible_ids.html | 286 ++++---- 17 files changed, 1423 insertions(+), 1533 deletions(-) diff --git a/R/info_display.R 
b/R/info_display.R index ffc20a6a..65009360 100644 --- a/R/info_display.R +++ b/R/info_display.R @@ -19,7 +19,8 @@ #' #' joyn_msg("all") -joyn_msg <- function(type = c("all", type_choices())) { +joyn_msg <- function(type = c("all", type_choices()), + msg = NULL) { # Check --------- type_to_use <- match.arg(type, several.ok = TRUE) @@ -39,6 +40,9 @@ joyn_msg <- function(type = c("all", type_choices())) { cli::cli_text(.) }) + if ("err" %in% type_to_use & is.character(msg)) { + cli::cli_abort(msg) + } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Return --------- diff --git a/R/joyn_workhorse.R b/R/joyn_workhorse.R index 4fc03a9d..e4dfff88 100644 --- a/R/joyn_workhorse.R +++ b/R/joyn_workhorse.R @@ -69,33 +69,78 @@ joyn_workhorse <- function( # Do a full join ------------------------------------------------------------- # if not 1:1 => use merge.data.table - if (match_type == "m:m") { - - dt_result <- data.table::merge.data.table( - x = x, - y = y, - by = by, - all = TRUE, - sort = FALSE, - suffixes = suffixes, - allow.cartesian = TRUE - ) - } else { - - # not m:m => use collapse::join() - dt_result <- collapse::join( x = x, - y = y, - how = "full", - on = by, - multiple = TRUE, # matches row in x with m in y - validate = "m:m", # no checks performed - suffix = suffixes, # data.table suffixes - keep.col.order = TRUE, - verbose = 0, - column = NULL - ) - } + # not m:m => use collapse::join() + dt_result <- tryCatch( + expr = { + souce_pkg <- if (match_type == "m:m") "data.table::merge" else "collapse::join" + if (match_type == "m:m") { + data.table::merge.data.table( + x = x, + y = y, + by = by, + all = TRUE, + sort = FALSE, + suffixes = suffixes, + allow.cartesian = TRUE + ) + + } else { + collapse::join( x = x, + y = y, + how = "full", + on = by, + multiple = TRUE, # matches row in x with m in y + validate = "m:m", # no checks performed + suffix = suffixes, # data.table suffixes + keep.col.order = TRUE, + verbose = 0, + column = NULL) + } + }, # end of 
expr section + + error = function(e) { + + + joyn_msg("err", c("{.pkg {souce_pkg}} returned the following:", + x = e$message)) + }, # end of error section + + warning = function(w) { + if (grepl("[Oo]veridentified", w$message)) { + store_msg( + type = "warn", + ok = paste(cli::symbol$warning, "\nWarning: "), + pale = "Your data is overidentified. Below the original message from {.pkg {souce_pkg}}:", + bolded_pale = "\n{w$message}" + ) + } else { + store_msg( + type = "warn", + ok = paste(cli::symbol$warning, "\nWarning: "), + pale = "{.pkg {souce_pkg}} returned the following warning:", + bolded_pale = "\n{w$message}" + ) + } + + # This is inefficient but it is the only way to return the table when + # there is a warning + + collapse::join( x = x, + y = y, + how = "full", + on = by, + multiple = TRUE, # matches row in x with m in y + validate = "m:m", # no checks performed + suffix = suffixes, # data.table suffixes + keep.col.order = TRUE, + verbose = 0, + column = NULL) |> + suppressWarnings() + + } + + ) # End of trycatch # Calculate the time taken end_time <- Sys.time() diff --git a/_pkgdown.yml b/_pkgdown.yml index 19415e08..3be63979 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -27,11 +27,16 @@ navbar: - reference - articles - news - right: github + right: + - Dev version + - github components: reference: text: Reference href: reference/index.html + dev version: + text: dev version + href: dev/ github: icon: fab fa-github fa-lg href: https://github.com/randrescastaneda/joyn/ diff --git a/docs/dev/LICENSE-text.html b/docs/dev/LICENSE-text.html index 0543716b..36f884e6 100644 --- a/docs/dev/LICENSE-text.html +++ b/docs/dev/LICENSE-text.html @@ -10,7 +10,7 @@ joyn - 0.1.4.9005 + 0.1.6.9000 - - joyn - 0.1.2.9000 - - + + - - - - - - + + + + +
+ + +
+ + - - - + diff --git a/docs/dev/authors.html b/docs/dev/authors.html index b95e4cb2..d503f0c0 100644 --- a/docs/dev/authors.html +++ b/docs/dev/authors.html @@ -10,7 +10,7 @@ joyn - 0.1.4.9005 + 0.1.6.9000 - -
-
-
+ +
+ + +
-
- -
+ - - - + diff --git a/docs/dev/pkgdown.yml b/docs/dev/pkgdown.yml index 7203c28a..96f0b6f3 100644 --- a/docs/dev/pkgdown.yml +++ b/docs/dev/pkgdown.yml @@ -2,7 +2,14 @@ pandoc: 3.1.1 pkgdown: 2.0.7 pkgdown_sha: ~ articles: - advanced-use: advanced-use.html - general-use: general-use.html -last_built: 2024-02-02T16:04Z + adv-functionalities: adv-functionalities.html + aux-functions: aux-functions.html + dplyr-joins: dplyr-joins.html + main-functionalities: main-functionalities.html + merge-wrapper: merge-wrapper.html + messages: messages.html +last_built: 2024-02-22T14:37Z +urls: + reference: https://randrescastaneda.github.io/joyn/reference + article: https://randrescastaneda.github.io/joyn/articles diff --git a/docs/dev/reference/freq_table.html b/docs/dev/reference/freq_table.html index 1b6d7d3d..fa1e6779 100644 --- a/docs/dev/reference/freq_table.html +++ b/docs/dev/reference/freq_table.html @@ -1,201 +1,131 @@ - - - - - - - -tabulate simple frequencies — freq_table • joyn +Tabulate simple frequencies — freq_table • joyn + Skip to contents + +
+
+
- - +
+

tabulate one variable frequencies

+
- - - +
+

Usage

+
freq_table(x, byvar, digits = 1, na.rm = TRUE)
+
- - - +
+

Arguments

+
x
+

data frame

+
byvar
+

character: name of variable to tabulate. Use Standard evaluation.

- - +
digits
+

numeric: number of decimal places to display. Default is 1.

- +
na.rm
+

logical: if TRUE remove NAs from calculations. Default is TRUE

- - - +
+
+

Value

+ - - - - - - - -
-
- - - - -
- -
-
- - -
+
-
+ - - - + diff --git a/docs/dev/reference/index.html b/docs/dev/reference/index.html index 415bbc46..a4cc9736 100644 --- a/docs/dev/reference/index.html +++ b/docs/dev/reference/index.html @@ -1,219 +1,184 @@ - - - - - - - -Function reference • joyn - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Function reference • joyn + Skip to contents + - +
+
+
+ +
+

Main function

-
-
-
+

Since the objective of joyn is to join tables with joy, there is only one main function in this package

+
+ + + - +
+ + joyn() +
+
Join two tables
+
+

Dplyr-joins

+ + -
-
- + +
+ + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

Main function

-

Since the objective of joyn is to join tables with joy, there is only one main function in this package

-
-

merge()

-

Merge two tables

-

Auxiliary functions

-

-
-

is_id()

-

Make sure the match type is correct

-

freq_table()

-

tabulate simple frequencies

-

possible_ids()

-

Find possible unique identifies of data frame

-
+
+ + full_join() +
+
Full join two data frames
+
+ + right_join() +
+
Right join two data frames
+
+ + left_join() +
+
Left join two data frames
+
+ + inner_join() +
+
Inner join two data frames
+
+

Merge data tables

+ + - -
+ +
+ + + +
+ + merge() +
+
Merge two data frames
+
+

Auxiliary functions

+ + -
+ + + -
+ + - - - + diff --git a/docs/dev/reference/is_id.html b/docs/dev/reference/is_id.html index 9779185a..12770bbb 100644 --- a/docs/dev/reference/is_id.html +++ b/docs/dev/reference/is_id.html @@ -1,207 +1,167 @@ - - - - - - - -Make sure the match type is correct — is_id • joyn - - - - - - +Check if dt is uniquely identified by by variable — is_id • joyn + Skip to contents + +
+
+
- - - +
+

report if dt is uniquely identified by by var or, if report = TRUE, the duplicates in by variable

+
+
+

Usage

+
is_id(dt, by, verbose = getOption("joyn.verbose"), return_report = FALSE)
+
+
+

Arguments

+
dt
+

either right or left table

- - +
by
+

variable to merge by

- +
verbose
+

logical: if TRUE messages will be displayed

- - - - - - - - - - -
-
- +
+
+

Value

+ - +

logical or data.frame, depending on the value of argument return_report

- -
-
- -
-

Make sure the match type is correct

+
+

Examples

+
library(data.table)
+
+# example with data frame not uniquely identified by `by` var
+
+y <- data.table(id = c("c","b", "c", "a"),
+                 y  = c(11L, 15L, 18L, 20L))
+is_id(y, by = "id")
+#> 
+#> ── Duplicates in terms of `id` 
+#>    copies     n percent
+#>    <char> <int>  <char>
+#> 1:      1     2   66.7%
+#> 2:      2     1   33.3%
+#> 3:  total     3    100%
+#> ─────────────────────────────────────────────────────── End of is_id() report ──
+#> [1] FALSE
+is_id(y, by = "id", return_report = TRUE)
+#> 
+#> ── Duplicates in terms of `id` 
+#>    copies     n percent
+#>    <char> <int>  <char>
+#> 1:      1     2   66.7%
+#> 2:      2     1   33.3%
+#> 3:  total     3    100%
+#> ─────────────────────────────────────────────────────── End of is_id() report ──
+#>        id copies
+#>    <char>  <int>
+#> 1:      c      2
+#> 2:      b      1
+#> 3:      a      1
+
+# example with data frame uniquely identified by `by` var
+
+y1 <- data.table(id = c("1","3", "2", "9"),
+                 y  = c(11L, 15L, 18L, 20L))
+is_id(y1, by = "id")
+#> 
+#> ── Duplicates in terms of `id` 
+#>    copies     n percent
+#>    <char> <int>  <char>
+#> 1:      1     4    100%
+#> 2:  total     4    100%
+#> ─────────────────────────────────────────────────────── End of is_id() report ──
+#> [1] TRUE
+
- -
is_id(dt, by, verbose = TRUE, return_report = FALSE)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
dt

either right of left table

by

by argument in merge

verbose

logical: if TRUE messages will be displayed

return_report

logical: if TRUE, returns data with summary of duplicates. -If FALSE, returns logical value depending on whether dt is uniquely identified -by by

- -

Value

- -

logical or data.frame, depending on the value of argument return_report

- -

Examples

-
is_id(y3, by = "id") -
#>
#> -- Duplicates in terms of `id`
#> copies n percent -#> 1: 1 2 66.7% -#> 2: 2 1 33.3% -#> 3: total 3 100%
#> ------------------------------------------------------- End of is_id() report --
#> [1] FALSE
is_id(y3, by = "id", return_report = TRUE) -
#>
#> -- Duplicates in terms of `id`
#> copies n percent -#> 1: 1 2 66.7% -#> 2: 2 1 33.3% -#> 3: total 3 100%
#> ------------------------------------------------------- End of is_id() report --
#> id copies -#> 1: c 2 -#> 2: b 1 -#> 3: a 1
-
- -
+
-
+ - - - + diff --git a/docs/dev/reference/merge.html b/docs/dev/reference/merge.html index 8bf13e40..81790352 100644 --- a/docs/dev/reference/merge.html +++ b/docs/dev/reference/merge.html @@ -1,398 +1,303 @@ - - - - - - +Merge two data frames — merge • joyn + Skip to contents + -Merge two tables — merge • joyn +
+
+
- - - +
+

This is a joyn wrapper that works in a similar fashion to base::merge and +data.table::merge, which is why merge masks the other two.

+
- - +
+

Usage

+
merge(
+  x,
+  y,
+  by = NULL,
+  by.x = NULL,
+  by.y = NULL,
+  all = FALSE,
+  all.x = all,
+  all.y = all,
+  sort = TRUE,
+  suffixes = c(".x", ".y"),
+  no.dups = TRUE,
+  allow.cartesian = getOption("datatable.allow.cartesian"),
+  match_type = c("m:m", "m:1", "1:m", "1:1"),
+  keep_common_vars = TRUE,
+  ...
+)
+
- - - +
+

Arguments

+
x, y
+

data tables. y is coerced to a data.table if +it isn't one already.

- - - +
by
+

A vector of shared column names in x and y to merge on. +This defaults to the shared key columns between the two tables. +If y has no key columns, this defaults to the key of x.

+
by.x, by.y
+

Vectors of column names in x and y to merge on.

- - +
all
+

logical; all = TRUE is shorthand to save setting both +all.x = TRUE and all.y = TRUE.

- - - - +
all.x
+

logical; if TRUE, rows from x which have no matching row +in y are included. These rows will have 'NA's in the columns that are usually +filled with values from y. The default is FALSE so that only rows with +data from both x and y are included in the output.

- - - - - - - -
-
- +
all.y
+

logical; analogous to all.x above.

- -
+
sort
+

logical. If TRUE (default), the rows of the merged +data.table are sorted by setting the key to the by / by.x columns. If +FALSE, unlike base R's merge for which row order is unspecified, the +row order in x is retained (including retaining the position of missings when +all.x=TRUE), followed by y rows that don't match x (when all.y=TRUE) +retaining the order those appear in y.

-
-
- -
-

This is the main and, basically, the only function in joyn.

-
+
suffixes
+

A character(2) specifying the suffixes to be used for +making non-by column names unique. The suffix behaviour works in a similar +fashion as the merge.data.frame method does.

-
merge(
-  x,
-  y,
-  by = intersect(names(x), names(y)),
-  yvars = TRUE,
-  match_type = c("m:m", "m:1", "1:m", "1:1"),
-  keep = c("full", "left", "master", "right", "using", "inner"),
-  update_values = FALSE,
-  update_NAs = update_values,
-  reportvar = "report",
-  reporttype = c("character", "numeric"),
-  roll = NULL,
-  keep_y_in_x = FALSE,
-  sort = TRUE,
-  verbose = getOption("joyn.verbose")
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
x

data frame: referred to left in R terminology, or master in -Stata terminology.

y

data frame: referred to right in R terminology, or using in -Stata terminology.

by

a character vector of variables to join by. If NULL, the default, -joyn will do a natural join, using all variables with common names across -the two tables. A message lists the variables so that you can check they're -right (to suppress the message, simply explicitly list the variables that -you want to join). To join by different variables on x and y use a vector -of expressions. For example, by = c("a = b", "z") will use "a" in x, "b" -in y, and "z" in both tables.

yvars

character: Vector of variable names that will be kept after the -merge. If TRUE (the default), it keeps all the brings all the variables in -y into x. If FALSE or NULL, it does not bring any variable into x, but a -report will be generated.

match_type

character: one of "m:m", "m:1", "1:m", "1:1". -Default is "m:m" since this is the default generally used in joins in R. -However, following Stata's recommendation, it is better to be explicit and -use any of the other three match types (See details in match types -sections).

keep

character: One of "full", "left", "master", "right", -"using", "inner". Default is "full". Even though this is not the -regular behavior of joins in R, the objective of joyn is to present a -diagnosis of the join, so that it must use by default a full join. Yet, if -"left" or "master", it keeps the observations that matched in both -tables and the ones that did not match in x. The ones in y will be -discarded. If "right" or "using", it keeps the observations that -matched in both tables and the ones that did not match in y. The ones in x -will be discarded. If "inner", it only keeps the observations that -matched both tables.

update_values

logical: If TRUE, it will update all values of variables -in x with the actual of variables in y with the same name as the ones in x. -NAs from y won't be used to update actual values in x.

update_NAs

logical: If TRUE, it will update NA values of all variables -in x with actual values of variables in y that have the same name as the -ones in x. If FALSE, NA values won't be updated.

reportvar

character: Name of reporting variable. Default if "report". + +

no.dups
+

logical indicating that suffixes are also appended to +non-by.y column names in y when they have the same column name +as any by.x.

+ + +
allow.cartesian
+

See allow.cartesian in [.data.table.

+ + +
match_type
+

character: one of "m:m", "m:1", "1:m", "1:1". +Default is "1:1" since this is the most restrictive. However, following +Stata's recommendation, it is better to be explicit and use any of the +other three match types (See details in match types sections).

+ + +
keep_common_vars
+

logical: If TRUE, it will keep the original variable +from y when both tables have common variable names. Thus, the prefix "y." +will be added to the original name to distinguish from the resulting +variable in the joined table.

+ + +
...
+

Arguments passed on to joyn

y_vars_to_keep
+

character: Vector of variable names in y that will be +kept after the merge. If TRUE (the default), it brings all +the variables in y into x. If FALSE or NULL, it does not bring any variable +into x, but a report will be generated.

+ +
reportvar
+

character: Name of reporting variable. Default is ".joyn". This is the same as variable "_merge" in Stata after performing a merge. If FALSE or NULL, the reporting variable will be excluded from the final -table, though a summary of the join will be display after concluding.

reporttype

character: One of "character" or "numeric". Default is -"character". If "numeric", the reporting variable will contain numeric -codes of the source and the contents of each observation in the joined -table.

roll

double: to be implemented

keep_y_in_x

logical: If TRUE, it will keep the original variable from -y when both tables have common variable names. Thus, the prefix "y." will -be added to the original name to distinguish from the resulting variable in -the joined table.

sort

logical: If TRUE, sort by key variables in by. Default is -TRUE.

verbose

logical: if FALSE, it won't display any message (programmer's -option). Default is TRUE.

- -

Value

- -

a data.table joining x and y.

-

match types

+table, though a summary of the join will be displayed after concluding.

+ +
update_NAs
+

logical: If TRUE, it will update NA values of all variables +in x with actual values of variables in y that have the same name as the +ones in x. If FALSE, NA values won't be updated, even if update_values is +TRUE

+ +
update_values
+

logical: If TRUE, it will update all values of variables +in x with the actual values of variables in y with the same name as the ones in x. +NAs from y won't be used to update actual values in x. Yet, by default, +NAs in x will be updated with values in y. To avoid this, make sure to set +update_NAs = FALSE

+ +
verbose
+

logical: if FALSE, it won't display any message (programmer's +option). Default is TRUE.

+ + +
+
+
+

Value

+

data.table merging x and y

+
-

Using the same wording of the Stata manual

-

1:1: specifies a one-to-one match merge. The variables specified in -by uniquely identify single observations in both table.

-

1:m and m:1: specify one-to-many and many-to-one match merges, -respectively. This means that in of the tables the observations are -uniquely identify by the variables in by, while in the other table many -(two or more) of the observations are identify by the variables in by

-

m:m refers to many-to-many merge. variables in by does not uniquely -identify the observations in either table. Matching is performed by -combining observations with equal values in by; within matching values, -the first observation in the master (i.e. left or x) table is matched with -the first matching observation in the using (i.e. right or y) table; the -second, with the second; and so on. If there is an unequal number of -observations within a group, then the last observation of the shorter group -is used repeatedly to match with subsequent observations of the longer -group.

- -

Examples

-
# Simple merge -library(data.table) -x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_), -t = c(1L, 2L, 1L, 2L, NA_integer_), -x = 11:15) - -y1 = data.table(id = 1:2, - y = c(11L, 15L)) - -x2 = data.table(id = c(1, 1, 2, 3, NA), - t = c(1L, 2L, 1L, 2L, NA_integer_), - x = c(16, 12, NA, NA, 15)) - -y2 = data.table(id = c(1, 2, 5, 6, 3), - yd = c(1, 2, 5, 6, 3), - y = c(11L, 15L, 20L, 13L, 10L), - x = c(16:20)) -merge(x1, y1) -
#> > removing key variables `id` from yvars
#>
#> -- JOYn Report --
#>
#> report n percent -#> 1: x 2 40% -#> 2: x & y 3 60% -#> 3: total 5 100%
#> ---------------------------------------------------------- End of JOYn report --
#> id t x y report -#> 1: 1 1 11 11 x & y -#> 2: 1 2 12 11 x & y -#> 3: 2 1 13 15 x & y -#> 4: 3 2 14 NA x -#> 5: NA NA 15 NA x
-# Bad merge for not specifying by argument -merge(x2, y2) -
#> > removing key variables `id` and `x` from yvars
#>
#> -- JOYn Report --
#>
#> report n percent -#> 1: x 4 44.4% -#> 2: x & y 1 11.1% -#> 3: y 4 44.4% -#> 4: total 9 100%
#> ---------------------------------------------------------- End of JOYn report --
#> id x t yd y report -#> 1: 1 12 2 NA NA x -#> 2: 1 16 1 1 11 x & y -#> 3: 2 17 NA 2 15 y -#> 4: 2 NA 1 NA NA x -#> 5: 3 20 NA 3 10 y -#> 6: 3 NA 2 NA NA x -#> 7: 5 18 NA 5 20 y -#> 8: 6 19 NA 6 13 y -#> 9: NA 15 NA NA NA x
-# good merge, ignoring variable x from y -merge(x2, y2, by = "id") -
#> > removing key variables `id` from yvars
#> i variable `x` in table y is ignored because arguments `update_NAs` and -#> `update_values` are FALSE.
#>
#> -- JOYn Report --
#>
#> report n percent -#> 1: x 1 14.3% -#> 2: x & y 4 57.1% -#> 3: y 2 28.6% -#> 4: total 7 100%
#> ---------------------------------------------------------- End of JOYn report --
#> id t x yd y report -#> 1: 1 1 16 1 11 x & y -#> 2: 1 2 12 1 11 x & y -#> 3: 2 1 NA 2 15 x & y -#> 4: 3 2 NA 3 10 x & y -#> 5: 5 NA NA 5 20 y -#> 6: 6 NA NA 6 13 y -#> 7: NA NA 15 NA NA x
-# update NAs in x variable form x -merge(x2, y2, by = "id", update_NAs = TRUE) -
#> > removing key variables `id` from yvars
#>
#> -- JOYn Report --
#>
#> report n percent -#> 1: NA updated 2 28.6% -#> 2: x 1 14.3% -#> 3: x & y 2 28.6% -#> 4: y 2 28.6% -#> 5: total 7 100%
#> ---------------------------------------------------------- End of JOYn report --
#> id t x yd y report -#> 1: 1 1 16 1 11 x & y -#> 2: 1 2 12 1 11 x & y -#> 3: 2 1 17 2 15 NA updated -#> 4: 3 2 20 3 10 NA updated -#> 5: 5 NA 18 5 20 y -#> 6: 6 NA 19 6 13 y -#> 7: NA NA 15 NA NA x
-# Update values in x with variables from y -merge(x2, y2, by = "id", update_values = TRUE) -
#> > removing key variables `id` from yvars
#>
#> -- JOYn Report --
#>
#> report n percent -#> 1: NA updated 2 28.6% -#> 2: not updated 1 14.3% -#> 3: value updated 1 14.3% -#> 4: x & y 1 14.3% -#> 5: y 2 28.6% -#> 6: total 7 100%
#> ---------------------------------------------------------- End of JOYn report --
#> id t x yd y report -#> 1: 1 1 16 1 11 x & y -#> 2: 1 2 16 1 11 value updated -#> 3: 2 1 17 2 15 NA updated -#> 4: 3 2 20 3 10 NA updated -#> 5: 5 NA 18 5 20 y -#> 6: 6 NA 19 6 13 y -#> 7: NA NA 15 NA NA not updated
-
-
- -
+
+

Examples

+
x1 = data.frame(id = c(1L, 1L, 2L, 3L, NA_integer_),
+                t  = c(1L, 2L, 1L, 2L, NA_integer_),
+                x  = 11:15)
+y1 = data.frame(id = c(1,2, 4),
+                y  = c(11L, 15L, 16))
+joyn::merge(x1, y1, by = "id")
+#> 
+#> ── JOYn Report ──
+#> 
+#>     .joyn     n percent
+#>    <char> <int>  <char>
+#> 1:  x & y     3    100%
+#> 2:  total     3    100%
+#> ────────────────────────────────────────────────────────── End of JOYn report ──
+#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
+#>   Joyn's report available in variable .joyn
+#> ⚠ Warning: The keys supplied uniquely identify y therefore a m:1 join is
+#> executed.
+#> ℹ ❯ Removing key variables id from id and y
+#> ● Timing: The full joyn is executed in 0.00457 seconds
+#> ● Timing: The entire joyn function, including checks, is executed in 0.053809
+#> seconds
+#>   id t  x  y .joyn
+#> 1  1 1 11 11 x & y
+#> 2  1 2 12 11 x & y
+#> 3  2 1 13 15 x & y
+# example of using by.x and by.y
+x2 = data.frame(id1 = c(1, 1, 2, 3, 3),
+                id2 = c(1, 1, 2, 3, 4),
+                t   = c(1L, 2L, 1L, 2L, NA_integer_),
+                x   = c(16, 12, NA, NA, 15))
+y2 = data.frame(id  = c(1, 2, 5, 6, 3),
+                id2 = c(1, 1, 2, 3, 4),
+                y   = c(11L, 15L, 20L, 13L, 10L),
+                x   = c(16:20))
+jn <- joyn::merge(x2,
+            y2,
+            match_type = "m:m",
+            all.x = TRUE,
+            by.x = "id1",
+            by.y = "id2")
+#> 
+#> ── JOYn Report ──
+#> 
+#>     .joyn     n percent
+#>    <char> <int>  <char>
+#> 1:  x & y     7    100%
+#> 2:  total     7    100%
+#> ────────────────────────────────────────────────────────── End of JOYn report ──
+#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
+#>   Joyn's report available in variable .joyn
+#> ℹ ❯ Removing key variables keyby1 from id, keyby1, y, and x
+#> ● Timing: The full joyn is executed in 0.0029 seconds
+#> ● Timing: The entire joyn function, including checks, is executed in 0.039003
+#> seconds
+# example with all = TRUE
+jn <- joyn::merge(x2,
+            y2,
+            match_type = "m:m",
+            by.x = "id1",
+            by.y = "id2",
+            all = TRUE)
+#> 
+#> ── JOYn Report ──
+#> 
+#>     .joyn     n percent
+#>    <char> <int>  <char>
+#> 1:  x & y     7   87.5%
+#> 2:      y     1   12.5%
+#> 3:  total     8    100%
+#> ────────────────────────────────────────────────────────── End of JOYn report ──
+#> ⚠ Warning: Supplied both by and by.x/by.y. by argument will be ignored.
+#>   Joyn's report available in variable .joyn
+#> ℹ ❯ Removing key variables keyby1 from id, keyby1, y, and x
+#> ● Timing: The full joyn is executed in 0.00191 seconds
+#> ● Timing: The entire joyn function, including checks, is executed in 0.032904
+#> seconds
+
+
+ - + - - - + diff --git a/docs/dev/reference/possible_ids.html b/docs/dev/reference/possible_ids.html index 79af3200..853af5df 100644 --- a/docs/dev/reference/possible_ids.html +++ b/docs/dev/reference/possible_ids.html @@ -1,203 +1,151 @@ - - - - - - - -Find possible unique identifies of data frame — possible_ids • joyn - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Find possible unique identifies of data frame — possible_ids • joyn + Skip to contents + - +
-
- +
+

Value

+ + +

list with possible identifiers

+
+ +
+

Examples

+
library(data.table)
+x4 = data.table(id1 = c(1, 1, 2, 3, 3),
+                id2 = c(1, 1, 2, 3, 4),
+                t   = c(1L, 2L, 1L, 2L, NA_integer_),
+                x   = c(16, 12, NA, NA, 15))
+possible_ids(x4)
+#>  There are no duplicates in data frame
+#> → we found 5 possible ids
+#> $V1
+#> [1] "id1" "t"  
+#> 
+#> $V2
+#> [1] "id1" "x"  
+#> 
+#> $V3
+#> [1] "id2" "t"  
+#> 
+#> $V4
+#> [1] "id2" "x"  
+#> 
+#> $V5
+#> [1] "t" "x"
+#> 
+
+
+
-
+ - - - + From 4e1c991b4171b456b57a5f58dbdd27ff33e89438 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Thu, 22 Feb 2024 14:30:01 -0500 Subject: [PATCH 4/9] document --- R/info_display.R | 6 ++++-- R/joyn_workhorse.R | 36 +++++++++++++++++++++++++----------- man/joyn_msg.Rd | 9 ++++++--- man/msg_type_dt.Rd | 4 ++-- man/store_msg.Rd | 4 ++-- 5 files changed, 39 insertions(+), 20 deletions(-) diff --git a/R/info_display.R b/R/info_display.R index 65009360..a4fc943a 100644 --- a/R/info_display.R +++ b/R/info_display.R @@ -1,7 +1,9 @@ #' display type of joyn message #' -#' @param type character: one or more of the following: -#' `r cli::format_inline("{.or {.val {type_choices()}}}")` or `all` +#' @param type character: one or more of the following: `r joyn:::type_choices()` +#' cli::format_inline("{.or {.val {type_choices()}}}")` or `all` +#' @param msg character vector to be parsed to [cli::cli_abort()]. Default is +#' NULL. It only works if `"err" %in% type` #' #' @return returns data frame with message invisibly. 
print message in console #' @export diff --git a/R/joyn_workhorse.R b/R/joyn_workhorse.R index e4dfff88..972fc64c 100644 --- a/R/joyn_workhorse.R +++ b/R/joyn_workhorse.R @@ -126,17 +126,31 @@ joyn_workhorse <- function( # This is inefficient but it is the only way to return the table when # there is a warning - collapse::join( x = x, - y = y, - how = "full", - on = by, - multiple = TRUE, # matches row in x with m in y - validate = "m:m", # no checks performed - suffix = suffixes, # data.table suffixes - keep.col.order = TRUE, - verbose = 0, - column = NULL) |> - suppressWarnings() + if (match_type == "m:m") { + data.table::merge.data.table( + x = x, + y = y, + by = by, + all = TRUE, + sort = FALSE, + suffixes = suffixes, + allow.cartesian = TRUE + ) |> + suppressWarnings() + + } else { + collapse::join( x = x, + y = y, + how = "full", + on = by, + multiple = TRUE, # matches row in x with m in y + validate = "m:m", # no checks performed + suffix = suffixes, # data.table suffixes + keep.col.order = TRUE, + verbose = 0, + column = NULL) |> + suppressWarnings() + } } diff --git a/man/joyn_msg.Rd b/man/joyn_msg.Rd index 95e34ecd..0694d0e1 100644 --- a/man/joyn_msg.Rd +++ b/man/joyn_msg.Rd @@ -4,11 +4,14 @@ \alias{joyn_msg} \title{display type of joyn message} \usage{ -joyn_msg(type = c("all", type_choices())) +joyn_msg(type = c("all", type_choices()), msg = NULL) } \arguments{ -\item{type}{character: one or more of the following: -"info", "note", "warn", "timing", or "err" or \code{all}} +\item{type}{character: one or more of the following: info, note, warn, timing, err +cli::format_inline("{.or {.val {type_choices()}}}")\code{or}all`} + +\item{msg}{character vector to be parsed to \code{\link[cli:cli_abort]{cli::cli_abort()}}. Default is +NULL. It only works if \code{"err" \%in\% type}} } \value{ returns data frame with message invisibly. 
print message in console diff --git a/man/msg_type_dt.Rd b/man/msg_type_dt.Rd index 75dbd18e..fff355e9 100644 --- a/man/msg_type_dt.Rd +++ b/man/msg_type_dt.Rd @@ -7,8 +7,8 @@ msg_type_dt(type, ...) } \arguments{ -\item{type}{character: one or more of the following: -"info", "note", "warn", "timing", or "err" or \code{all}} +\item{type}{character: one or more of the following: info, note, warn, timing, err +cli::format_inline("{.or {.val {type_choices()}}}")\code{or}all`} } \value{ data frame with two variables, type and msg diff --git a/man/store_msg.Rd b/man/store_msg.Rd index b56dc660..03563e2c 100644 --- a/man/store_msg.Rd +++ b/man/store_msg.Rd @@ -7,8 +7,8 @@ store_msg(type, ...) } \arguments{ -\item{type}{character: one or more of the following: -"info", "note", "warn", "timing", or "err" or \code{all}} +\item{type}{character: one or more of the following: info, note, warn, timing, err +cli::format_inline("{.or {.val {type_choices()}}}")\code{or}all`} \item{...}{combination of type and text in the form \verb{style1 = text1, style2 = text2}, etc.} } From 0a629d5cbe4caeb548d9eda3441a75cc4337ec9b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda Aguilar" Date: Thu, 22 Feb 2024 14:35:06 -0500 Subject: [PATCH 5/9] Increment version number to 0.1.6.9001 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 955f1cdc..3436cd6c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: joyn Type: Package Title: Tool for Diagnosis of Tables Joins and Complementary Join Features -Version: 0.1.6.9000 +Version: 0.1.6.9001 Authors@R: c(person(given = "R.Andres", family = "Castaneda", email = "acastanedaa@worldbank.org", From 7a04cb125387afea1567e1bb2cd630efc93a321c Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Fri, 23 Feb 2024 15:56:31 -0500 Subject: [PATCH 6/9] correct typo --- R/joyn_workhorse.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/joyn_workhorse.R 
b/R/joyn_workhorse.R index 972fc64c..09a78015 100644 --- a/R/joyn_workhorse.R +++ b/R/joyn_workhorse.R @@ -73,7 +73,7 @@ joyn_workhorse <- function( # not m:m => use collapse::join() dt_result <- tryCatch( expr = { - souce_pkg <- if (match_type == "m:m") "data.table::merge" else "collapse::join" + source_pkg <- if (match_type == "m:m") "data.table::merge" else "collapse::join" if (match_type == "m:m") { data.table::merge.data.table( x = x, @@ -102,7 +102,7 @@ joyn_workhorse <- function( error = function(e) { - joyn_msg("err", c("{.pkg {souce_pkg}} returned the following:", + joyn_msg("err", c("{.pkg {source_pkg}} returned the following:", x = e$message)) }, # end of error section @@ -111,14 +111,14 @@ joyn_workhorse <- function( store_msg( type = "warn", ok = paste(cli::symbol$warning, "\nWarning: "), - pale = "Your data is overidentified. Below the original message from {.pkg {souce_pkg}}:", + pale = "Your data is overidentified. Below the original message from {.pkg {source_pkg}}:", bolded_pale = "\n{w$message}" ) } else { store_msg( type = "warn", ok = paste(cli::symbol$warning, "\nWarning: "), - pale = "{.pkg {souce_pkg}} returned the following warning:", + pale = "{.pkg {source_pkg}} returned the following warning:", bolded_pale = "\n{w$message}" ) } From c897e8312ae2b9db833a2d927bb8e520dcb77ca7 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 26 Feb 2024 13:26:42 -0500 Subject: [PATCH 7/9] fix changing column names of input tables --- R/joyn-merge.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/R/joyn-merge.R b/R/joyn-merge.R index a935dcc4..4f15c4a7 100644 --- a/R/joyn-merge.R +++ b/R/joyn-merge.R @@ -212,12 +212,8 @@ joyn <- function(x, #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ start_joyn <- Sys.time() # copy objects if data.tables - if (any(class(x) == "data.table")) { - x <- copy(x) - } - if (any(class(y) == "data.table")) { - y <- copy(y) - } + x <- copy(x) + y <- copy(y) ## X and Y ----------- 
check_xy(x,y) From 3c188a9e672592c1ce60727bfc6c4757cc800b8c Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 26 Feb 2024 14:18:18 -0500 Subject: [PATCH 8/9] fix order bug --- R/joyn-merge.R | 23 +++++------------------ tests/testthat/test-joyn.R | 4 ++-- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/R/joyn-merge.R b/R/joyn-merge.R index 4f15c4a7..001948ed 100644 --- a/R/joyn-merge.R +++ b/R/joyn-merge.R @@ -234,7 +234,6 @@ joyn <- function(x, y <- as.data.table(y) } - ## Modify BY when is expression --------- fixby <- check_by_vars(by, x, y) by <- fixby$by @@ -388,19 +387,6 @@ joyn <- function(x, #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Update x --------- #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - # if (isTRUE(update_values) || isTRUE(update_NAs)) { - # var_use <- sub( - # pattern = "\\.y$", - # replacement = "", - # x = newyvars[ - # grepl( - # pattern = "\\.y$", - # x = newyvars - # ) - # ] - # ) - # } var_use <- NULL if (isTRUE(update_values) || isTRUE(update_NAs)) { var_use <- common_vars @@ -462,6 +448,11 @@ joyn <- function(x, .yreport = NULL) + if (sort) { + setorderv(x, by, na.last = na.last) + setattr(x, 'sorted', by) + } + ## Rename by variables ----- if (!is.null(fixby$xby)) { @@ -519,10 +510,6 @@ joyn <- function(x, x |> fselect(get(reportvar)) <- NULL } - if (sort) { - setorderv(x, by, na.last = na.last) - setattr(x, 'sorted', by) - } if (verbose == TRUE) { end_joyn <- Sys.time() diff --git a/tests/testthat/test-joyn.R b/tests/testthat/test-joyn.R index 1d33af34..2fd86a6d 100644 --- a/tests/testthat/test-joyn.R +++ b/tests/testthat/test-joyn.R @@ -568,8 +568,8 @@ test_that("different names in key vars are working fine", { ".joyn" = c("x & y", "x & y", "x", "y", "x", "x & y", "y", "y") ) - setorderv(dd, "id1", na.last = TRUE) - setattr(dd, 'sorted', "id1") + setorderv(dd, c("id1", "id2"), na.last = TRUE) + setattr(dd, 'sorted', c("id1", "id2")) expect_equal(df, dd) From 
2699753153dbb2b6f326978de03f8298b4932085 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 26 Feb 2024 14:21:17 -0500 Subject: [PATCH 9/9] Increment version number to 0.1.6.9002 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3436cd6c..a52f48aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: joyn Type: Package Title: Tool for Diagnosis of Tables Joins and Complementary Join Features -Version: 0.1.6.9001 +Version: 0.1.6.9002 Authors@R: c(person(given = "R.Andres", family = "Castaneda", email = "acastanedaa@worldbank.org",