Rdatatable · mattdowle · Mar 26, 2018 · Mar 26, 2018 · Mar 26, 2018 · Mar 26, 2018
@@ -6,6 +6,20 @@
 
 1. In `DT[, variable := value]`, when value is class `POSIXlt`, we automatically coerce it to class `POSIXct` instead, [#1724](https://github.com/Rdatatable/data.table/issues/1724). Thanks to @linzhp for the report, and Benjamin Schwendinger for the fix.
 
+TODO. `[.data.table` gains `old.nonequi=.datatable.old.nonequi` argument :
+    ```
+    TRUE           # this release; i.e. no change yet; old behaviour
+    "warning"      # issues warning for every roll= and non-equi on= query
+    "error"        # issues error for every roll= and non-equi on= query
+    FALSE          # future release; new behaviour
+    ```
+    This future breaking change resolves frequent complaints and confusion from users regarding `roll=` and non-equi join columns, [#1615](https://github.com/Rdatatable/data.table/issues/1615), [#1700](https://github.com/Rdatatable/data.table/issues/1700), [#2006](https://github.com/Rdatatable/data.table/issues/2006), [#2569](https://github.com/Rdatatable/data.table/issues/2569) and related issues [#1469](https://github.com/Rdatatable/data.table/issues/1469), [#1807](https://github.com/Rdatatable/data.table/issues/1807), [#2307](https://github.com/Rdatatable/data.table/issues/2307), [#2595](https://github.com/Rdatatable/data.table/issues/2595), and [#2602](https://github.com/Rdatatable/data.table/issues/2602). New behaviour i) includes both sides of join columns which have been non-equi joined (so the result now has more columns) and ii) for such columns, the data from the `i` table is no longer renamed with the column from `x`. Many thanks to @sritchie73 for the implementation [PR#2706](https://github.com/Rdatatable/data.table/pull/2706) [PR#3093](https://github.com/Rdatatable/data.table/pull/3093) and to many for feedback on the PR.
+    A few packages on CRAN and Bioconductor need to migrate. They will set `.datatable.old.nonequi=FALSE` in their namespace when they have. You can set the variable to `"warning"` or `"error"` in order to find all `roll=` and non-equi-`on=` usage in your code. Once you have identified and migrated all usage, you can set the option to `FALSE`.
+    There is no change to the much more common case of equi-joined columns. For such columns, the data in `i` is still named in the result using the name from `x`.
+    The default will be changed to `FALSE` in future and then the argument and option will eventually be removed; hence the `old.` prefix to convey it is a migration option.
+
+TODO. `fread()` can now read a remote compressed file in one step; `fread("https://domain.org/file.csv.bz2")`. The `file=` argument now supports `.gz` and `.bz2` too; i.e. `fread(file="file.csv.gz")` works now where only `fread("file.csv.gz")` worked in 1.11.8.
+
 ## NEW FEATURES
 
 1. New function `rowwiseDT()` for creating a data.table object "row-wise", often convenient for readability of small, literally-defined tables. Thanks to @shrektan for the suggestion and PR and @tdeenes for the idea of the `name=` syntax. Inspired by `tibble::tribble()`.

@@ -128,7 +128,18 @@
   }
 }
 
-"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive()))
+# A (relatively) fast (uses DT grouping) wrapper for matching two vectors, BUT:
+# it behaves like 'pmatch' but only the 'exact' matching part. That is, a value in
+# 'x' is matched to 'table' only once. No index will be present more than once.
+# This should make it even clearer:
+# chmatch2(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
+# chmatch2(c("a", "a"), c("a", "b")) # 1,NA - the second one doesn't 'see' the first 'a'
+# chmatch2(c("a", "a"), c("a", "a.1")) # 1,NA - this is where it differs from pmatch - we don't need the partial match.
+chmatch2 <- function(x, table, nomatch=NA_integer_) {
+  .Call(Cchmatch2, x, table, as.integer(nomatch)) # this is in 'rbindlist.c' for now.
+}
+
+"[.data.table" <- function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch"), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive()), old.nonequi=mget(".datatable.old.nonequi", envir=parent.frame(), inherits=TRUE, ifnotfound=TRUE)[[1L]])
 {
   # ..selfcount <<- ..selfcount+1  # in dev, we check no self calls, each of which doubles overhead, or could
   # test explicitly if the caller is [.data.table (even stronger test. TO DO.)
@@ -497,9 +508,42 @@
         setnames(i, orignames[leftcols])
         setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
       }
-      i = .shallow(i, retain.key = TRUE)
+      io = if (missing(on)) haskey(i) else identical(unname(on), head(key(i), length(on)))
+      i = .shallow(i, retain.key = io)
       ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
       xo = ans$xo ## to make it available for further use.
+      # Fix for #1615, #1700 and related issues - keep columns used for non-equi joins from both x and i.
+      # keep copies of the full leftcols and rightcols which are needed if by = .EACHI is also used.
+      allleftcols = leftcols
+      allrightcols = rightcols
+      # Drop any non-equi join columns from leftcols and rightcols so they are kept from both x and i
+      if (!missing(on) && !is.na(non_equi)) {
+        if (identical(old.nonequi, "warning")) {
+          warning("old.nonequi=='warning' occurs in on=", call.=TRUE)
+        } else if (identical(old.nonequi, "error")) {
+          # warnings in examples and vignettes do not cause R CMD check to warn
+          stop("old.nonequi=='error' occurs in on=", call.=TRUE)
+        } else if (identical(old.nonequi, FALSE)) {
+          leftcols = leftcols[ops == 1]  # ops > 1 where there is a non-equi opertor
+          rightcols = rightcols[ops == 1]
+        }
+      }
+      # Do the same for rolling joins. The column used for the roll is always the last key column
+      if (roll != 0) {
+        if (identical(old.nonequi, "warning")) {
+          warning("old.nonequi=='warning' occurs in roll=", call.=TRUE)
+        } else if (identical(old.nonequi, "error")) {
+          stop("old.nonequi=='error' occurs in roll=", call.=TRUE)
+        } else if (identical(old.nonequi, FALSE)) {
+          leftcols = leftcols[-length(leftcols)]
+          rightcols = rightcols[-length(rightcols)]
+        }
+      }
+      # If there are only non-equi / roll keys then leftcols and rightcols become integer(0),
+      # which is used as a switch to keep only columns in x. Use NULL instead to signify
+      # keeping all columns in both x and i.
+      if (!length(leftcols)) leftcols = NULL
+      if (!length(rightcols)) rightcols = NULL
       # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
       # 'setorder', as there's another 'setorder' in generating 'irows' below...
       if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices)
@@ -657,7 +701,20 @@
   if (missing(j)) {
     # missingby was already checked above before dealing with i
     if (!length(x)) return(null.data.table())
-    if (!length(leftcols)) {
+    if (is.null(leftcols)) { # Keep all columns for non-equi / roll joins with no equi keys
+      jisvars = names(i)
+      tt = jisvars %chin% names(x)
+      if (length(tt)) jisvars[tt] = paste0("i.",jisvars[tt])
+      nx = names(x)
+      ansvars = make.unique(c(nx, jisvars))
+      icols = seq_along(i)
+      icolsAns = seq.int(length(nx)+1, length.out=ncol(i))
+      xcols = xcolsAns = seq_along(x)
+    } else if (!length(leftcols)) {
+      ## ansvars = nx = names(x)
+      ## jisvars = character()
+      ## xcols = xcolsAns = seq_along(x)
+      ## TODO the commented code above is from #2420 merged Oct 2017 https://github.com/Rdatatable/data.table/blame/sritchie73-non-equi-key/R/data.table.R#L760 so probably not useful anymore because other code below is more recent, from #3213 merged Dec 2018 https://github.com/Rdatatable/data.table/blame/546259ddaba0e8ab1506729113688f85ca2986fd/R/data.table.R#L662 but keeping for now during conflict resolution.
       # basic x[i] subset, #2951
       if (is.null(irows)) return(shallow(x))   # e.g. DT[TRUE] (#3214); otherwise CsubsetDT would materialize a deep copy
       else                return(.Call(CsubsetDT, x, irows, seq_along(x)) )
@@ -1256,6 +1313,17 @@
         xcols = w
         xcolsAns = seq_along(ansvars)
         icols = icolsAns = integer()
+      } else if (is.null(leftcols)) {
+        xcols = w[!wna]
+        xcolsAns = which(!wna)
+        ivars = names(i)
+        w2 = chmatch(ansvars[wna], ivars)
+        if (any(w2na <- is.na(w2))) {
+          ivars = paste0("i.",ivars)
+          w2[w2na] = chmatch(ansvars[wna][w2na], ivars)
+        }
+        icols = w2
+        icolsAns = which(wna)
       } else {
         if (!length(leftcols)) internal_error("column(s) not found: %s", brackify(ansvars[wna])) # nocov
         xcols = w[!wna]
@@ -1279,6 +1347,11 @@
     }
   }  # end of  if !missing(j)
 
+  # Restore full leftcols and rightcols now that we have kept non-equi
+  # and rolling join columns from both x and i.
+  if (!identical(leftcols, integer(0L))) leftcols = allleftcols
+  if (!identical(rightcols, integer(0L))) rightcols = allrightcols
+
   SDenv = new.env(parent=parent.frame())
 
   syms = all.vars(jsub)

@@ -13327,6 +13327,10 @@ setDF(DT)
 test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'),
              error = "must be a data.table")
 
+###################################
+#  Add new tests above this line  #
+###################################
+
 # appearance order of two low-cardinality columns that were squashed in pr#3124
 DT = data.table(A=INT(1,3,2,3,2), B=1:5)  # respect groups in 1st column (3's and 2's)
 test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L))
@@ -14390,13 +14394,131 @@ x = 5L
 test(1998.1, as.IDate(x), output = '1970-01-06')
 test(1998.2, class(x), 'integer')
 
+
+
+#
+# gap in test number for now to avoid merge conflicts. Matt will remove gap when PR merged.
+#
+# PR#2602 -- non-equi joins and roll joins columns are now returned from
+# both data.tables in x[i]. These tests cover previous tests which expect
+# only the column from i (named after the column from x) to be returned.
+.datatable.old.nonequi=FALSE
+
+# Tests 129, 130, 131 but with the roll column returned from both x and i
+TESTDT = data.table(a=3L,v=2L,key="a")  # testing 1-row table
+test(1999.01, TESTDT[J(4),roll=TRUE], data.table(a=3L,v=2L,V1=4L,key="a"))
+test(1999.02, TESTDT[J(4),roll=TRUE,rollends=FALSE], data.table(a=NA_integer_,v=NA_integer_,V1=4L,key="a"))
+test(1999.03, TESTDT[J(-4),roll=TRUE], data.table(a=NA_integer_,v=NA_integer_,V1=-4L,key="a"))
+
+# Tests 148 and 149 with the roll column returned from both x and i
+dt = data.table(a=c(1L,4L,5L), b=1:3, key="a")
+test(1999.04, dt[CJ(2:3),roll=TRUE], data.table(a=c(1L,1L),b=c(1L,1L),V1=c(2L,3L),key="a"))
+test(1999.05, dt[J(2:3),roll=TRUE], data.table(a=c(1L,1L),b=c(1L,1L),V1=c(2L,3L)))
+
+# Test 917 with roll column returned from both x and i
+X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts')
+Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts')
+test(1999.06, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,53,52,NA), i.id=INT(3001,3004,3003,3002), i.ts=c(51,59,59,57), key='state,uid,ts'))
+
+# Test 1317.1 with roll column returned from both x and i
+dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names =
+c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = "x,y")
+dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = "x,y")
+test(1999.07, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-26", "2013-07-26", "2013-07-26")), z=c(dt1$z[1:2], dt1$z[2]), i.y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), w=c(dt2$w[2], dt2$w[5:6]), key="x,y"))
+
+# Test 1317.2 with roll column returned from both x and i
+set.seed(1L)
+dt2 <- dt2[sample(nrow(dt2))] # key should be gone
+test(1999.08, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-26", "2013-07-26", "2013-07-26")), z=c(dt1$z[1:2], dt1$z[2]), i.y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), w=c(dt2$w[1], dt2$w[c(2,6)])))
+
+# Test 1469.2 with roll column returned from both x and i
+DT = data.table(x=c(-5,5), y=1:2, key="x")
+test(1999.09, key(DT[J(c(2,0)), .(V1,y), roll=TRUE]), NULL)
+
+# Test 1470.1, 1470.2, 1470.3 with roll column returned from both x and i
+DT = data.table(x=c(-Inf, 3, Inf), y=1:3, key="x")
+test(1999.10, DT[J(c(2,-Inf,5,Inf)), roll=Inf], data.table(x=c(-Inf, -Inf, 3, Inf), y=c(1L, 1:3), V1=c(2,-Inf,5,Inf)))
+test(1999.11, DT[J(c(2,-Inf,5,Inf)), roll=10], data.table(x=c(NA, -Inf, 3, Inf), y=INT(c(NA, 1, 2, 3)), V1=c(2,-Inf,5,Inf)))
+test(1999.12, DT[SJ(c(2,-Inf,5,Inf)), roll=Inf], data.table(x=c(-Inf, -Inf, 3, Inf), y=c(1L, 1:3), V1=c(-Inf,2,5,Inf), key="x"))
+
+# Test 1540.15 with roll column returned from both x and i
+set.seed(45L)
+DT1 = data.table(x=sample(letters[1:3], 15, TRUE), y=sample(6:10, 15, TRUE),
+                 a=sample(100, 15), b=runif(15))
+DT2 = CJ(x=letters[1:3], y=6:10)[, mul := sample(20, 15)][sample(15L, 5L)]
+DT3 = rbindlist(list(DT2, list(x="d", y=7L, mul=100L)))
+DT3 = DT3[sample(nrow(DT3))]
+DT1.copy = copy(DT1)
+setkey(DT1.copy, y)
+test(1999.13, DT1[DT3, on=c(y="y"), roll=TRUE], DT1.copy[DT3[, c(2,1,3), with=FALSE], roll=TRUE][,c(1:4,6,5,7)])
+
+# Tests 1614.1, 1614.2, 1614.3, and 1614.4 with roll column returned from both x and i
+if (test_bit64) {
+  dt = data.table(x=as.integer64(c(-1000, 0)), y=c(5,10))
+  val = c(-1100,-900,100)
+  test(1999.14, dt[.(val), roll=Inf,  on="x"], data.table(x=as.integer64(c(NA,-1000,0)), y=c(NA,5,10), V1=as.integer64(val)))
+  test(1999.15, dt[.(val), roll=Inf, on="x", rollends=TRUE], data.table(x=as.integer64(c(-1000,-1000,0)), y=c(5,5,10), V1=as.integer64(val)))
+  test(1999.16, dt[.(val), roll=-Inf, on="x"], data.table(x=as.integer64(c(-1000,0,NA)), y=c(5,10,NA), V1=as.integer64(val)))
+  test(1999.17, dt[.(val), roll=-Inf, on="x", rollends=TRUE], data.table(x=as.integer64(c(-1000,0,0)), y=c(5,10,10), V1=as.integer64(val)))
+}
+
+# Tests 1660.1 and 1660.2 with non-equi join columns returned from both x and i
+dt1 = fread('Chr    Start    End    Region
+chr6    3324    3360   Region1
+chr4    2445    2455   Region2
+chr1    1034    1090   Region4')
+dt2 = fread('Site    Chr     Location    Gene
+Site1   chr4    2447        GeneB
+Site2   chr9    1153        GeneT
+Site3   chr6    3350        GeneM
+Site4   chr1    1034        GeneC
+Site5   chr1    2000        GeneU
+Site6   chr6    3359        GeneF
+Site7   chr7    1158        GeneI
+Site8   chr4    2451        GeneO
+Site9   chr6    3367        GeneZ   ')
+test(1999.18, names(dt2[dt1, on=.(Chr, Location>=Start, Location<=End)]), c(names(dt2), "Start", "End", "Region"))
+test(1999.19, names(dt1[dt2, on=.(Chr, Start<=Location, End>=Location)]), c(names(dt1), "Site", "Location", "Gene"))
+
+# Tests 1682.1, 1682.2, 1682.3, 1682.4, 1682.5, 1682.6, and 1682.7 with
+# non-equi join columns returned from both x and i.
+set.seed(45L)
+dt1 = data.table(x=sample(8,20,TRUE), y=sample(8,20,TRUE), z=1:20)
+dt2 = data.table(c(2,5), c(5,7), c(2,4))
+dt3 = data.table(c(12,5), c(15,7), c(2,4))
+test(1999.20, dt1[dt2, .N, by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, .N, by=.(x=V1,y=V2)])
+test(1999.21, dt1[dt2, sum(z), by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, sum(z), by=.(x=V1,y=V2)])
+test(1999.22, dt1[dt2, as.numeric(median(z)), by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, median(z), by=.(x=V1,y=V2)])
+test(1999.23, dt1[dt3, .N, by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt3, on=.(x>=V1, y<=V2)][, .(N=sum(!is.na(z))), by=.(x=V1,y=V2)])
+test(1999.24, dt1[dt3, .N, by=.EACHI, on=.(x>=V1, y<=V2), nomatch=0L], dt1[dt3, on=.(x>=V1, y<=V2), nomatch=0L][, .N, by=.(x=V1,y=V2)])
+test(1999.25, dt1[dt2, on=.(x>=V1, y<=V2), sum(z)*V3, by=.EACHI], dt1[dt2, on=.(x>=V1, y<=V2)][, sum(z)*V3[1L], by=.(x=V1,y=V2)])
+test(1999.26, dt1[dt3, on=.(x>=V1, y<=V2), sum(z)*V3, by=.EACHI], dt1[dt3, on=.(x>=V1, y<=V2)][, sum(z)*V3[1L], by=.(x=V1,y=V2)])
+
+# Tests for 1846.1 and 1846.2 with non-equi join columns returned from both x and i
+d1 <- data.table(a=1L, b=2L)
+d2 <- d1[0L]
+test(1999.27, d2[d1, on=.(a>a, b<b)], data.table(a=NA_integer_, b=NA_integer_, i.a=1L, i.b=2L))
+test(1999.28, d2[d1, on=.(a>a)], data.table(a=NA_integer_, b=NA_integer_, i.a=1L, i.b=2L))
+
+# New test required to maintain code coverage - does i. prefix work in j where there are
+# only non-equi / roll join columns in on?
+d1 <- data.table(id=c("X", "Y", "Z"), val1 = 1:3)
+d2 <- data.table(id=c("a", "b", "c"), val2 = 1:3)
+test(1999.29, d1[d2, on = .(val1 > val2), .(i.id)], data.table(i.id=c("a", "a", "b", "c")))
+
+# Testing completed PR#2602, set old.nonequi back to TRUE (the default for now) for future tests
+.datatable.old.nonequi=TRUE
+
+
 # a single NA at the beginning with no other nomatch would cause incorrect key, #3441
 dx = data.table(id = "A", key = "id")
 di = list(c("D", "A"))
-test(1999.1, key(dx[di]), NULL)
+test(1999.31, key(dx[di]), NULL)
 dx = data.table(id = 1L, key = "id")
 di = list(z=c(2L, 1L))
-test(1999.2, key(dx[di]), NULL)
+test(1999.32, key(dx[di]), NULL)
+
+
 
 # 2000 moved to benchmark.Rraw, #5517