-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Copy pathmergelist.R
132 lines (123 loc) · 6.3 KB
/
mergelist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Column-bind the elements of list 'l' into a single data.table.
#   l     list of tables to bind; handed as-is to the C routine Ccbindlist
#   copy  forwarded to Ccbindlist (presumably controls whether columns are copied - confirm in C source)
# When the bound result carries duplicated column names, the key ("sorted"
# attribute) is removed and the "index" attribute is reset to an empty
# integer(), because a key or index cannot be trusted to refer to the right
# column once names are ambiguous.
cbindlist = function(l, copy=TRUE) {
  out = .Call(Ccbindlist, l, copy)
  dup_names = anyDuplicated(names(out)) > 0L
  if (dup_names) {
    ## invalidate key and index
    setattr(out, "sorted", NULL)
    setattr(out, "index", integer())
  }
  setDT(out)
  out
}
# Pick join columns from the keys of two tables when 'on' is missing; used
# only for inner and full joins.
#   x, y  key column names (character) of each table, or NULL when unkeyed
# Returns the only available key when one side is NULL, otherwise the columns
# common to both keys, ordered as in the shorter key (ties go to 'y', the rhs).
# Returns NULL when neither side has a key.
onkeys = function(x, y) {
  no_x = is.null(x)
  no_y = is.null(y)
  if (no_x && no_y) return(NULL) # nocov ## internal error is being called later in mergepair
  if (no_x) return(y)
  if (no_y) return(x)
  ## align order to shorter|rhs key
  if (length(x) >= length(y)) intersect(y, x) else intersect(x, y)
}
# Resolve a column selection on 'x': the columns in 'cols' minus those in
# 'drop', with the columns in 'keep' always included (and listed first).
#   x             table whose columns are being selected
#   cols          requested columns
#   drop          columns to remove from 'cols'
#   keep          columns to include regardless of 'cols' and 'drop'
#   retain.order  when TRUE, reorder the result to follow colnamesInt(x, NULL)
# Every selector is normalised by colnamesInt() first (presumably to integer
# column positions - confirm against colnamesInt).
someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
  keep_idx = colnamesInt(x, keep)
  drop_idx = colnamesInt(x, drop)
  cols_idx = colnamesInt(x, cols)
  selected = union(keep_idx, setdiff(cols_idx, drop_idx))
  if (retain.order) intersect(colnamesInt(x, NULL), selected) else selected
}
# Test whether 'x' carries an index on the columns 'by'.
#   x       object whose "index" attribute is inspected
#   by      character vector of column names; the index is looked up under the
#           attribute name made of each column prefixed by "__" and concatenated
#   retGrp  when TRUE, additionally require the stored index to carry a
#           "starts" attribute
# Returns TRUE or FALSE. All attribute lookups use exact matching.
hasindex = function(x, by, retGrp=FALSE) {
  indices = attr(x, "index", exact=TRUE)
  if (is.null(indices)) return(FALSE)
  entry = attr(indices, paste0("__", by, collapse=""), exact=TRUE)
  if (is.null(entry)) return(FALSE)
  if (!retGrp) TRUE else !is.null(attr(entry, "starts", exact=TRUE))
}
# fdistinct implements mult='first' / mult='last': it keeps one row per
# distinct combination of the 'on' columns.
# For mult='first' the result is unique(x, by=on)[, c(on, cols), with=FALSE].
# When copy=FALSE and 'x' is already unique by 'on', the result may share
# memory with 'x' rather than being a copy.
#   x     data.table
#   on    character column names of 'x' defining distinctness
#   mult  "first" or "last": which row of each group to keep
#   cols  integer or character columns to return (NULL means all); the 'on'
#         columns are always kept
#   copy  TRUE or FALSE
fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) {
  if (!perhaps.data.table(x))
    stopf("'x' must be data.table")
  on_ok = is.character(on) && length(on) && !anyNA(on) && all(on %chin% names(x))
  if (!on_ok)
    stopf("'on' must be character column names of 'x' argument")
  mult = match.arg(mult)
  if (is.null(cols)) {
    cols = seq_along(x)
  } else {
    cols_ok = (is.character(cols) || is.integer(cols)) && length(cols) && !anyNA(cols)
    if (!cols_ok)
      stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
  }
  if (!isTRUEorFALSE(copy))
    stopf("'%s' must be TRUE or FALSE", "copy")
  ## Sorting is only strictly needed for mult="last"; for mult="first" an unsorted grouping would do
  ## unless a sorted index is already available. That short circuit requires retGrp=TRUE support and
  ## will only work after #4386:
  #### do_sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
  ## it does not work for the moment (test 302.02), hence always sort
  do_sort = TRUE
  grp = forderv(x, by=on, sort=do_sort, retGrp=TRUE)
  if (attr(grp, "maxgrpn", TRUE) <= 1L) {
    ## no group has more than one row: 'x' is already distinct by 'on'
    ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
    return(if (copy) copy(ans) else ans)
  }
  pick = attr(grp, "starts", exact=TRUE)
  if (mult=="last") {
    if (!do_sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
    ## last of each group: one before the next group's start, and the final row for the last group
    pick = c(pick[-1L]-1L, nrow(x))
  }
  ## translate positions in the ordering into row numbers of 'x' (empty ordering: positions are row numbers already)
  if (length(grp)) pick = grp[pick]
  if (do_sort) {
    ## this rolls back to original order
    back = forderv(pick)
    if (length(back)) pick = pick[back]
  }
  .Call(CsubsetDT, x, pick, someCols(x, cols, keep=on))
}
# Extra layer over bmerge that provides ready-to-use row indices (or NULL, standing for 1:nrow).
# NULL is used to avoid extra copies in downstream code. Avoiding copies precisely turned out to be costly and to
# complicate the code enormously; it needs #4409 and/or handling of 1:nrow in subsetDT.
#
# Arguments, as used in the body:
#   x, i       tables being joined; the 'on' columns are looked up in both via colnamesInt
#   on         character vector of join columns; must be zero-length for how="cross" and non-zero-length otherwise
#   how        one of "inner", "left", "right", "full", "semi", "anti", "cross"
#   mult       forwarded to bmerge; NULL selects a default based on 'how'
#   join.many  when FALSE, a many-to-many match under mult="all" raises an error
#   void       when TRUE, bmerge is run only so that it can raise for mult="error"; invisible NULL is returned
#   verbose    forwarded to bmerge
# Returns list(ans, irows, xrows), where 'ans' is the bmerge result (hand-built for cross join);
# semi and anti joins return list(ans, irows) only. irows=NULL means all rows of 'i' in their original order.
dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
  ## nomatch is forwarded to bmerge: 0L for inner/semi/anti/cross, NA for left/right/full
  nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_)
  nomatch0 = identical(nomatch, 0L)
  ## default 'mult' depends on the join type
  if (is.null(mult))
    mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error")
  if (void && mult!="error")
    internal_error("void must be used with mult='error'") # nocov
  if (how=="cross") { ## short-circuit bmerge results only for cross join
    if (length(on) || mult!="all" || !join.many)
      stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
    if (void)
      internal_error("cross join must be used with void=FALSE") # nocov
    ## hand-built result in the shape of bmerge output: each row of i matches nrow(x) rows of x starting at row 1
    ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer())
  } else {
    if (!length(on))
      stopf("'on' must be non-zero length character vector")
    if (mult=="all" && (how=="semi" || how=="anti"))
      stopf("semi and anti joins must be used with mult!='all'")
    icols = colnamesInt(i, on, check_dups=TRUE)
    xcols = colnamesInt(x, on, check_dups=TRUE)
    ## plain equi-join: ops is all 1L (presumably the code for '==' - confirm in bmerge)
    ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
    if (void) { ## void=TRUE is only for the case when we want to raise an error for mult='error', and that would have happened in the line above
      return(invisible(NULL))
    } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit
      irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows
      if (length(irows)==length(ans$lens)) irows = NULL ## every row of i is selected: NULL stands for all rows
      return(list(ans=ans, irows=irows))
    } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
               !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
               anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_))
    )
      stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
  }
  ## xrows, join-to
  ## when every match has length 1 the starts are used directly, otherwise (start, length) pairs go through vecseq (presumably expanding them to row sequences)
  xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
  ## with nomatch=0L, drop the entries whose match length is zero
  if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)]
  len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here
  ## irows, join-from
  ## left as NULL ('if' without 'else') when all matches have length 1 and no entry was dropped above
  irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens)
  len.i = if (is.null(irows)) nrow(i) else length(irows)
  ## when 'xo' is non-empty, xrows are positions into it and get translated (presumably xo is the ordering of x used by bmerge - confirm)
  if (length(ans$xo) && length(xrows))
    xrows = ans$xo[xrows]
  len.x = length(xrows)
  ## sanity check: both sides must describe the same number of result rows
  if (len.i!=len.x)
    internal_error("dtmerge out len.i != len.x") # nocov
  return(list(ans=ans, irows=irows, xrows=xrows))
}
# Expand a vector of counts into repeated positions: position k appears x[k]
# times, e.g. c(2, 0, 3) gives 1 1 3 3 3.
# There used to be a custom C implementation here that was ~2x faster, but
# rep() is fast enough that maintaining a separate routine is not worthwhile.
# Hopefully rep() will in future recognize the ALTREP sequence and use it, too.
seqexp = function(x) {
  positions = seq_along(x)
  rep(positions, times=x)
}
# Thin R wrapper that hands 'x' to the C routine CperhapsDataTableR and returns
# its result. fdistinct() uses it as the "'x' must be data.table" guard.
# NOTE(review): the exact criteria are implemented in C and not visible here.
perhaps.data.table = function(x) {
  .Call(CperhapsDataTableR, x)
}