diff --git a/R/data.table.R b/R/data.table.R
index 9eb294c41..02ea2a694 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -517,13 +517,22 @@ replace_dot_alias = function(e) {
       if (!byjoin || nqbyjoin) {
         # Really, `anyDuplicated` in base is AWESOME!
         # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
         if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
-        irows = if (allLen1) f__ else vecseq(f__,len__,
-          if (allow.cartesian ||
-              notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
-              !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
-            NULL # #742. If 'i' has no duplicates, ignore
-          } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
+        irows = if (allLen1) f__ else {
+          join.many = getOption("datatable.join.many", TRUE) # #914, default TRUE for backward compatibility
+          anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) {
+            NULL # special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+          } else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
+            !allow.cartesian ||
+            !join.many))
+            as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_)))
+          limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore
+            if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+            else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i))
+            else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
+          }
+          vecseq(f__, len__, limit)
+        } # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
         if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
         # Fix for #1092 and #1074
         # TODO: implement better version of "any"/"all"/"which" to avoid
diff --git a/R/mergelist.R b/R/mergelist.R
index 9606ce0ab..f33cdcf99 100644
--- a/R/mergelist.R
+++ b/R/mergelist.R
@@ -7,3 +7,129 @@ cbindlist = function(l, copy=TRUE) {
   setDT(ans)
   ans
 }
+
+# when 'on' is missing then use keys, used only for inner and full join
+onkeys = function(x, y) {
+  if (is.null(x) && !is.null(y)) y
+  else if (!is.null(x) && is.null(y)) x
+  else if (!is.null(x) && !is.null(y)) {
+    if (length(x)>=length(y)) intersect(y, x) ## align order to shorter|rhs key
+    else intersect(x, y)
+  } else NULL # nocov ## internal error is being called later in mergepair
+}
+# resolve 'cols', 'drop' and 'keep' via colnamesInt; answer is union(keep, cols minus drop)
+# retain.order=TRUE reorders the answer to follow colnamesInt(x, NULL), presumably the column order of 'x'
+someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
+  keep = colnamesInt(x, keep)
+  drop = colnamesInt(x, drop)
+  cols = colnamesInt(x, cols)
+  ans = union(keep, setdiff(cols, drop))
+  if (!retain.order) return(ans)
+  intersect(colnamesInt(x, NULL), ans)
+}
+# TRUE when attr(x, "index") carries an entry named paste0("__", by, collapse="")
+# retGrp=TRUE additionally requires that entry to have a "starts" attribute
+hasindex = function(x, by, retGrp=FALSE) {
+  index = attr(x, "index", TRUE)
+  if (is.null(index)) return(FALSE)
+  idx_name = paste0("__",by,collapse="")
+  idx = attr(index, idx_name, TRUE)
+  if (is.null(idx)) return(FALSE)
+  if (!retGrp) return(TRUE)
+  return(!is.null(attr(idx, "starts", TRUE)))
+}
+
+# fdistinct applies mult='first|last'
+# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
+# it may not copy when copy=FALSE and x is unique by 'on'
+fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) {
+  if (!perhaps.data.table(x))
+    stopf("'x' must be data.table")
+  if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x)))
+    stopf("'on' must be character column names of 'x' argument")
+  mult = match.arg(mult)
+  if (is.null(cols))
+    cols = seq_along(x)
+  else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
+    stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
+  if (!isTRUEorFALSE(copy))
+    stopf("'%s' must be TRUE or FALSE", "copy")
+  ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
+  ## this short circuit will work after #4386 because it requires retGrp=T
+  #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
+  sort = TRUE ## above line does not work for the moment, test 302.02
+  o = forderv(x, by=on, sort=sort, retGrp=TRUE)
+  if (attr(o, "maxgrpn", TRUE) <= 1L) {
+    ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
+    if (copy) ans = copy(ans)
+    return(ans)
+  }
+  f = attr(o, "starts", exact=TRUE)
+  if (mult=="last") {
+    if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
+    f = c(f[-1L]-1L, nrow(x)) ## last of each group
+  }
+  if (length(o)) f = o[f]
+  if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order
+  .Call(CsubsetDT, x, f, someCols(x, cols, keep=on))
+}
+
+# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow)
+# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT
+# returns list(ans, irows, xrows); semi and anti joins return list(ans, irows); void=TRUE returns invisible NULL
+dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
+  nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_)
+  nomatch0 = identical(nomatch, 0L)
+  if (is.null(mult))
+    mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error")
+  if (void && mult!="error")
+    internal_error("void must be used with mult='error'") # nocov
+  if (how=="cross") { ## short-circuit bmerge results only for cross join
+    if (length(on) || mult!="all" || !join.many)
+      stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
+    if (void)
+      internal_error("cross join must be used with void=FALSE") # nocov
+    ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer())
+  } else {
+    if (!length(on))
+      stopf("'on' must be non-zero length character vector")
+    if (mult=="all" && (how=="semi" || how=="anti"))
+      stopf("semi and anti joins must be used with mult!='all'")
+    icols = colnamesInt(i, on, check_dups=TRUE)
+    xcols = colnamesInt(x, on, check_dups=TRUE)
+    ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
+    if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line
+      return(invisible(NULL))
+    } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit
+      irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows
+      if (length(irows)==length(ans$lens)) irows = NULL
+      return(list(ans=ans, irows=irows))
+    } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
+               !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+               anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_))
+    )
+      stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+  }
+
+  ## xrows, join-to
+  xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
+  if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)]
+  len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here
+
+  ## irows, join-from
+  irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens)
+  len.i = if (is.null(irows)) nrow(i) else length(irows)
+
+  if (length(ans$xo) && length(xrows))
+    xrows = ans$xo[xrows]
+  len.x = length(xrows)
+
+  if (len.i!=len.x)
+    internal_error("dtmerge out len.i != len.x") # nocov
+
+  return(list(ans=ans, irows=irows, xrows=xrows))
+}
+
+# thin wrappers over the C routines Cseqexp and CperhapsDataTableR
+seqexp = function(x) .Call(Cseqexp, x)
+perhaps.data.table = function(x) .Call(CperhapsDataTableR, x)
diff --git a/src/init.c b/src/init.c
index e61c9b85e..8dd8277f7 100644
--- a/src/init.c
+++ b/src/init.c
@@ -151,6 +151,7 @@ R_CallMethodDef callMethods[] = {
 {"CconvertDate", (DL_FUNC)&convertDate, -1},
 {"Cnotchin", (DL_FUNC)&notchin, -1},
 {"Ccbindlist", (DL_FUNC) &cbindlist, -1},
+{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
 {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
 {NULL, NULL, 0}
 };