Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cbindlist/mergelist] mergepair workhorse #6437

Draft
wants to merge 1 commit into
base: mergelist-helpers
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions R/mergelist.R
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,116 @@ dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
return(list(ans=ans, irows=irows, xrows=xrows))
}

# atomic join between two tables
mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=names(rhs), copy=TRUE, join.many=TRUE, verbose=FALSE) {
semianti = how=="semi" || how=="anti"
innerfull = how=="inner" || how=="full"
{
if (how!="cross") {
if (is.null(on)) {
if (how=="left" || semianti) on = key(rhs)
else if (how=="right") on = key(lhs)
else if (innerfull) on = onkeys(key(lhs), key(rhs))
if (is.null(on))
stopf("'on' is missing and necessary key is not present")
}
if (any(bad.on <- !on %chin% names(lhs)))
stopf("'on' argument specify columns to join [%s] that are not present in LHS table [%s]", brackify(on[bad.on]), brackify(names(lhs)))
if (any(bad.on <- !on %chin% names(rhs)))
stopf("'on' argument specify columns to join [%s] that are not present in RHS table [%s]", brackify(on[bad.on]), brackify(names(rhs)))
} else if (is.null(on)) {
on = character() ## cross join only
}
} ## on
{
if (how!="right") {
jnfm = lhs; fm.cols = lhs.cols; jnto = rhs; to.cols = rhs.cols
} else {
jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols
}
} ## join-to and join-from tables and columns (right outer join swap)

## ensure symmetric join for inner|full join, apply mult on both tables, bmerge do only 'x' table
cp.i = FALSE ## copy marker of out.i
if ((innerfull) && !is.null(mult) && (mult=="first" || mult=="last")) {
jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) ## might not copy when already unique by 'on'
cp.i = nrow(jnfm)!=nrow(lhs) ## nrow(lhs) bc how='inner|full' so jnfm=lhs
} else if (how=="inner" && (is.null(mult) || mult=="error")) { ## we do this branch only to raise error from bmerge, we cannot use forder to just find duplicates because those duplicates might not have matching rows in another table, full join checks mult='error' during two non-void bmerges
dtmerge(x=jnfm, i=jnto, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many, void=TRUE)
}

## binary merge
ans = dtmerge(x=jnto, i=jnfm, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many)

## make i side
out.i = if (is.null(ans$irows))
.shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on, retain.order=semianti), retain.key=TRUE)
else
.Call(CsubsetDT, jnfm, ans$irows, someCols(jnfm, fm.cols, keep=on, retain.order=semianti))
cp.i = cp.i || !is.null(ans$irows)

## make x side
if (semianti) {
out.x = list(); cp.x = TRUE
} else {
out.x = if (is.null(ans$xrows)) ## as of now xrows cannot be NULL #4409 thus nocov below
internal_error("dtmerge()$xrows returned NULL, #4409 been resolved but related code has not been updated?") #.shallow(jnto, cols=someCols(jnto, to.cols, drop=on), retain.key=TRUE) # nocov ## as of now nocov does not make difference r-lib/covr#279
else
.Call(CsubsetDT, jnto, ans$xrows, someCols(jnto, to.cols, drop=on))
cp.x = !is.null(ans$xrows)
## ensure no duplicated column names in merge results
if (any(dup.i<-names(out.i) %chin% names(out.x)))
stopf("merge result has duplicated column names, use 'cols' argument or rename columns in 'l' tables, duplicated column(s): %s", brackify(names(out.i)[dup.i]))
}

## stack i and x
if (how!="full") {
if (!cp.i && copy) out.i = copy(out.i)
#if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here
out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
if (how=="right") setcolorder(out, neworder=c(on, names(out.x))) ## arrange columns: i.on, x.cols, i.cols
} else { # how=="full"
## we made left join side above, proceed to right join side, so swap tbls
jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols

cp.r = FALSE
if (!is.null(mult) && (mult=="first" || mult=="last")) {
jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE)
cp.r = nrow(jnfm)!=nrow(rhs) ## nrow(rhs) bc jnfm=rhs
} ## mult=="error" check was made on one side already, below we do on the second side, test 101.43

## binary merge anti join
bns = dtmerge(x=jnto, i=jnfm, on=on, how="anti", mult=if (!is.null(mult) && mult!="all") mult, verbose=verbose, join.many=join.many)

## make anti join side
out.r = if (is.null(bns$irows))
.shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on), retain.key=TRUE) ## retain.key is used only in the edge case when !nrow(out.i)
else
.Call(CsubsetDT, jnfm, bns$irows, someCols(jnfm, fm.cols, keep=on))
cp.r = cp.r || !is.null(bns$irows)

## short circuit to avoid rbindlist to empty sets and retains keys
if (!nrow(out.r)) { ## possibly also !nrow(out.i)
if (!cp.i && copy) out.i = copy(out.i)
#if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here
out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
} else if (!nrow(out.i)) { ## but not !nrow(out.r)
if (!cp.r && copy) out.r = copy(out.r)
if (length(add<-setdiff(names(out.i), names(out.r)))) { ## add missing columns of proper types NA
neworder = copy(names(out.i)) #set(out.r, NULL, add, lapply(unclass(out.i)[add], `[`, 1L)) ## 291.04 overalloc exceed fail during set()
out.i = lapply(unclass(out.i)[add], `[`, seq_len(nrow(out.r))) ## could eventually remove this when cbindlist recycle 0 rows up, note that we need out.r not to be copied
out.r = .Call(Ccbindlist, list(out.r, out.i), FALSE)
setcolorder(out.r, neworder=neworder)
}
out = out.r
} else { ## all might have not been copied yet, rbindlist will copy
out.l = .Call(Ccbindlist, list(out.i, out.x), FALSE)
out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE)
}
}
setDT(out)
}

# Previously, we had a custom C implementation here, which is ~2x faster,
# but this is fast enough we don't bother maintaining a new routine.
# Hopefully in the future rep() can recognize the ALTREP and use that, too.
Expand Down
1 change: 1 addition & 0 deletions R/onLoad.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
"datatable.print.trunc.cols"="FALSE", # for print.data.table
"datatable.show.indices"="FALSE", # for print.data.table
"datatable.allow.cartesian"="FALSE", # datatable.<argument name>
"datatable.join.many"="TRUE", # mergelist, [.data.table #4383 #914
"datatable.dfdispatchwarn"="TRUE", # not a function argument
"datatable.warnredundantby"="TRUE", # not a function argument
"datatable.alloccol"="1024L", # argument 'n' of alloc.col. Over-allocate 1024 spare column slots
Expand Down
194 changes: 194 additions & 0 deletions inst/tests/mergelist.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,24 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
} else {
require(data.table)
test = data.table:::test
mergepair = data.table:::mergepair
perhaps.data.table = data.table:::perhaps.data.table
hasindex = data.table:::hasindex
fdistinct = data.table:::fdistinct
forderv = data.table:::forderv
}

addresses = function(x) vapply(x, address, "")
copied = function(ans, l) {
all(!addresses(ans) %chin% unlist(recursive=FALSE, lapply(l, addresses)))
}
notcopied = function(ans, l, how="left", unless=character()) {
if (how %chin% unless) return(copied(ans, l)) ## used during looping tests for easier escape
if (how=="full") return( ## either side, left|right, notcopied is fine
all(addresses(l[[1L]]) %chin% addresses(ans)) || all(addresses(l[[length(l)]]) %chin% addresses(ans))
)
all(addresses(if (how=="right") l[[length(l)]] else l[[1L]]) %chin% addresses(ans))
}

# internal helpers

Expand Down Expand Up @@ -110,6 +121,189 @@ test(13.04, key(ans), "id1")
test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib

# mergepair

## test copy-ness argument in mergepair

### LHS equal to RHS: no copy in all cases
num = 21.000
l = list(
lhs = data.table(id1=1:2, v1=1:2),
rhs = data.table(id1=1:2, v2=1:2)
)
expected = data.table(id1=1:2, v1=1:2, v2=1:2)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected) ## copy=TRUE: no shared columns
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected) ## copy=FALSE: LHS shared but no RHS
test(num<-num+0.001, notcopied(ans, l, how=how))
}
}
### RHS includes LHS: no copy in inner, left, right
num = 22.000
unless = "full"
l = list(
lhs = data.table(id1=1:2, v1=1:2),
rhs = data.table(id1=1:3, v2=1:3)
)
expected = list(
inner = data.table(id1=1:2, v1=1:2, v2=1:2),
left = data.table(id1=1:2, v1=1:2, v2=1:2),
right = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3),
full = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3)
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}
### LHS includes RHS: no copy in left, right, full
num = 23.000
unless = "inner"
l = list(
lhs = data.table(id1=1:3, v1=1:3),
rhs = data.table(id1=1:2, v2=1:2)
)
expected = list(
inner = data.table(id1=1:2, v1=1:2, v2=1:2),
left = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)),
right = data.table(id1=1:2, v1=1:2, v2=1:2),
full = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA))
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}
### LHS single nonmatch RHS on both sides: no copy in left, right
num = 24.000
unless = c("inner","full")
l = list(
lhs = data.table(id1=3:1, v1=1:3),
rhs = data.table(id1=c(4L,2:1), v2=1:3)
)
expected = list(
inner = data.table(id1=2:1, v1=2:3, v2=2:3),
left = data.table(id1=3:1, v1=1:3, v2=c(NA,2:3)),
right = data.table(id1=c(4L,2:1), v1=c(NA,2:3), v2=1:3),
full = data.table(id1=c(3:1,4L), v1=c(1:3,NA), v2=c(NA,2:3,1L))
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}
### LHS zero match RHS: no copy in left, right
num = 25.000
unless = c("inner","full")
l = list(
lhs = data.table(id1=2:1, v1=1:2),
rhs = data.table(id1=3:4, v2=1:2)
)
expected = list(
inner = data.table(id1=integer(), v1=integer(), v2=integer()),
left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
right = data.table(id1=3:4, v1=c(NA_integer_,NA), v2=1:2),
full = data.table(id1=c(2:1,3:4), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2))
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}
### LHS and RHS zero nrow: no copies
num = 26.000
unless = character()
l = list(
lhs = data.table(id1=integer(), v1=integer()),
rhs = data.table(id1=integer(), v2=integer())
)
expected = list(
inner = data.table(id1=integer(), v1=integer(), v2=integer()),
left = data.table(id1=integer(), v1=integer(), v2=integer()),
right = data.table(id1=integer(), v1=integer(), v2=integer()),
full = data.table(id1=integer(), v1=integer(), v2=integer())
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}
### LHS has zero nrow: no copies
num = 27.000
unless = character()
l = list(
lhs = data.table(id1=integer(), v1=integer()),
rhs = data.table(id1=2:1, v2=1:2)
)
expected = list(
inner = data.table(id1=integer(), v1=integer(), v2=integer()),
left = data.table(id1=integer(), v1=integer(), v2=integer()),
right = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2),
full = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}
### RHS has zero nrow
num = 28.000
unless = "inner"
l = list(
lhs = data.table(id1=2:1, v1=1:2),
rhs = data.table(id1=integer(), v2=integer())
)
expected = list(
inner = data.table(id1=integer(), v1=integer(), v2=integer()),
left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
right = data.table(id1=integer(), v1=integer(), v2=integer()),
full = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA))
)
for (how in c("inner","left","right","full")) {
num = trunc(num*10)/10 + 0.1
for (mult in c("all","first","last","error")) {
num = trunc(num*100)/100 + 0.01
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
test(num<-num+0.001, copied(ans, l))
test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
}
}

## fdistinct, another round

dt = data.table(x =
Expand Down
Loading