Merge branch 'master' into subset_expression

Rdatatable · Sep 8, 2024 · 3078461 · 3078461
2 parents 694ed5e + f0aaaf9
commit 3078461
Show file tree

Hide file tree

Showing 29 changed files with 542 additions and 134 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -16,6 +16,7 @@
 ^\.devcontainer$
 ^\.graphics$
 ^\.github$
+^\.vscode$
 ^\.zed$
 
 ^\.gitlab-ci\.yml$

diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -158,5 +158,17 @@ test.list <- atime::atime_test_list(
     Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
     Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
 
+  # Issue reported in: https://github.com/Rdatatable/data.table/issues/6286
+  # Fixed in: https://github.com/Rdatatable/data.table/pull/6296
+  "DT[by, verbose = TRUE] improved in #6296" = atime::atime_test(
+    N = 10^seq(1, 9),
+    setup = {
+      dt = data.table(a = 1:N)
+      dt_mod <- copy(dt)
+    },
+    expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
+    Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
+    Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
+
   NULL)
 # nolint end: undesirable_operator_linter.
diff --git a/.dev/cc.R b/.dev/cc.R
@@ -91,9 +91,9 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, path=Sys.getenv(
   if (clean) system("rm *.o *.so")
   OMP = if (omp) "openmp" else "no-openmp"
   if (debug) {
-    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c99\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
+    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c11\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
   } else {
-    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c99\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
+    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c11\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
     # the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem.
     # TODO add -Wextra too?
   }

diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml
@@ -15,6 +15,7 @@ on:
 
 jobs:
   comment:
+    if: github.repository == 'Rdatatable/data.table'
     runs-on: ubuntu-latest
     container: ghcr.io/iterative/cml:0-dvc2-base1
     env:

diff --git a/CODEOWNERS b/CODEOWNERS
@@ -35,6 +35,10 @@
 /src/shift.c @ben-schwen @michaelchirico
 /man/shift.Rd @ben-schwen @michaelchirico
 
+# rbind
+/src/rbindlist.c @ben-schwen
+/man/rbindlist.Rd @ben-schwen
+
 # translations
 /inst/po/ @michaelchirico
 /po/ @michaelchirico

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -89,5 +89,6 @@ Authors@R: c(
   person("Iago", "Giné-Vázquez",   role="ctb"),
   person("Anirban", "Chetia",      role="ctb"),
   person("Doris", "Amoakohene",    role="ctb"),
-  person("Ivan", "Krylov",         role="ctb")
+  person("Ivan", "Krylov",         role="ctb"),
+  person("Michael","Young",        role="ctb")
   )
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ exportClasses(data.table, IDate, ITime)
 ##
 
 export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy)
+export(rowwiseDT)
 export(setindex, setindexv, indices)
 export(as.data.table,is.data.table,test.data.table)
 export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%")

diff --git a/NEWS.md b/NEWS.md
@@ -6,13 +6,64 @@
 
 1. In `DT[, variable := value]`, when value is class `POSIXlt`, we automatically coerce it to class `POSIXct` instead, [#1724](https://github.com/Rdatatable/data.table/issues/1724). Thanks to @linzhp for the report, and Benjamin Schwendinger for the fix.
 
+## NEW FEATURES
+
+1. New function `rowwiseDT()` for creating a data.table object "row-wise", often convenient for readability of small, literally-defined tables. Thanks to @shrektan for the suggestion and PR and @tdeenes for the idea of the `name=` syntax. Inspired by `tibble::tribble()`.
+
+```r
+library(data.table)
+rowwiseDT(
+  a=,b=,c=,  d=,
+  1, 2, "a", 2:3,
+  3, 4, "b", list("e"),
+  5, 6, "c", ~a+b,
+)
+#>        a     b      c      d
+#>    <num> <num> <char> <list>
+#> 1:     1     2      a    2,3
+#> 2:     3     4      b      e
+#> 3:     5     6      c ~a + b
+```
+
+2. Subsetting or aggregating columns of type `expression` works, [#5596](https://github.com/Rdatatable/data.table/issues/5596). Thanks to @tsp for the report, and @ben-schwen for the fix.
+
 ## BUG FIXES
 
 1. Using `print.data.table()` with character truncation using `datatable.prettyprint.char` no longer errors with `NA` entries, [#6441](https://github.com/Rdatatable/data.table/issues/6441). Thanks to @r2evans for the bug report, and @joshhwuu for the fix.
 
 2. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.
 
-3. Subsetting or aggregating columns of type `expression` works, [#5596](https://github.com/Rdatatable/data.table/issues/5596). Thanks to @tsp for the report, and @ben-schwen for the fix.
+3. The data.table-only attribute `$.internal.selfref` is no longer set for data.frames. [#5286](https://github.com/Rdatatable/data.table/issues/5286). Thanks @OfekShilon for the report and fix.
+
+4. Tagging/naming arguments of `c()` in `j=c()` should now more closely follow base R conventions for concatenation of named lists during grouping, [#2311](https://github.com/Rdatatable/data.table/issues/2311). Naming an `lapply(.SD, FUN)` call as an argument of `c()` in `j` will now always cause that tag to get prepended (with a single dot separator) to the resulting column names. Additionally, naming a `list()` call as an argument of `c()` in `j` will now always cause that tag to get prepended to any names specified within the list call. This bug only affected queries with (1) `by=` grouping (2) `getOption("datatable.optimize") >= 1L` and (3) `lapply(.SD, FUN)` in `j`.
+
+    While the names returned by `data.table` when `j=c()` will now mostly follow base R conventions for concatenating lists, note that names which are completely unspecified will still be named positionally, matching the typical behavior in `j` and `data.table()`. according to position in `j` (e.g. `V1`, `V2`).
+
+    Thanks to @franknarf1 for reporting and @myoung3 for the PR.
+
+    ```r
+    # tag 'mean' prepended to lapply()-named columns
+    names(mtcars[, c(mean=lapply(.SD,sum)), by="cyl", .SDcols=c("am", "carb")])
+    # [1] "cyl" "mean.am" "mean.carb"
+
+    # tag 'mean' is prepended to the first named sublist, 'sum' to the second
+    names(mtcars[, c(mean=list(a=mean(hp), b=mean(wt)), sum=lapply(.SD, sum)), by="cyl", .SDcols=c("am", "carb")])
+    # [1] "cyl" "mean.a" "mean.b" "sum.am" "sum.carb"
+
+    # strict base naming would result in names c("", "b", "c") here
+    names(mtcars[, c(list(mean(hp), b=mean(wt)), c=list(mean(cyl)))])
+    # [1] "V1" "b" "c"
+    ```
+
+5. Queries like `DT[, min(x):max(x)]` now work as expected, i.e. the same as `DT[, seq(min(x), max(x))]` or `with(DT, min(x):max(x))`, [#2069](https://github.com/Rdatatable/data.table/issues/2069). Shorthand like `DT[, a:b]` meaning "select from columns `a` through `b`" still works. Thanks to @franknarf1 for reporting and @jangorecki for the fix.
+
+6. Fixed a segfault in `fcase()`, [#6448](https://github.com/Rdatatable/data.table/issues/6448). Thanks @ethanbsmith for reporting with reprex, @aitap for finding the root cause, and @MichaelChirico for the PR.
+
+7. `fread()` performance improves when specifying `Date` among `colClasses`, [#6105](https://github.com/Rdatatable/data.table/issues/6105). One implication of the change is that the column will be an `IDate` (which also inherits from `Date`), which may affect code strongly relying on the column class to be `Date` exactly; computations with `IDate` and `Date` columns should otherwise be the same. If you strongly prefer the `Date` class, run `as.Date()` explicitly following `fread()`. Thanks @scipima for the report and @MichaelChirico for the fix.
+
+8. `dt[, col]` now returns a copy of `col` also when it is a list column, as in any other case, [#4877](https://github.com/Rdatatable/data.table/issues/4877). Thanks to @tlapak for reporting and the PR.
+
+9. `rbindlist` and `rbind` binding `bit64::integer64` columns with `character`/`complex`/`list` columns now works, [#5504](https://github.com/Rdatatable/data.table/issues/5504). Thanks to @MichaelChirico for the request and @ben-schwen for the PR.
 
 ## NOTES
 
@@ -24,6 +75,10 @@
 
 4. The translations submitted for 1.16.0 are now actually shipped with the package -- our deepest apologies to the translators for the omission. We have added a CI check to ensure that the .mo binaries which get shipped with the package are always up-to-date.
 
+5. Some grouping operations run much faster under `verbose=TRUE`, [#6286](https://github.com/Rdatatable/data.table/issues/6286). Thanks @joshhwuu for the report and fix. This overhead was not present on Windows. As a rule, users should expect `verbose=TRUE` operations to run more slowly, as extra statistics might be calculated as part of the report; here was a case where the overhead was particularly high and the fix was particularly easy.
+
+6. `set()` and `:=` now provide some extra guidance for common incorrect approaches to assigning `NULL` to some rows of a list column. The correct way is to put `list(list(NULL))` on the RHS of `:=` (or `.(.(NULL))` for short). Thanks to @MichaelChirico for the suggestion and @Nj221102 for the implementation.
+
 # data.table [v1.16.0](https://github.com/Rdatatable/data.table/milestone/30)  (25 August 2024)
 
 ## BREAKING CHANGES

diff --git a/R/data.table.R b/R/data.table.R
@@ -244,10 +244,10 @@ replace_dot_alias = function(e) {
   if (!missing(j)) {
     jsub = replace_dot_alias(jsub)
     root = root_name(jsub)
-    if (root == ":" ||
-        (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
-        ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) &&
-          root %chin% c("","c","paste","paste0","-","!") &&
+    if ((root == ":" && !is.call(jsub[[2L]]) && !is.call(jsub[[3L]])) ||                        ## x[, V1:V2]; but not x[, (V1):(V2)] #2069
+        (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || ## x[, !(V8:V10)]
+        ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) &&                         ## x[, "V1"]; x[, ..v]
+          root %chin% c("","c","paste","paste0","-","!") &&                                     ## x[, c("V1","V2")]; x[, paste("V",1:2,sep="")]; x[, paste0("V",1:2)]
           missingby )) {   # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed)
       # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find.
       # If variable names do occur, but they are all prefixed with .., then that means look up in calling scope.
@@ -1382,10 +1382,14 @@ replace_dot_alias = function(e) {
         if (jcpy) jval = copy(jval)
       } else if (address(jval) == address(SDenv$.SD)) {
         jval = copy(jval)
-      } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) {
-        for (jidx in jcpy) { if(!is.null(jval[[jidx]])) jval[[jidx]] = copy(jval[[jidx]]) }
-      } else if (jsub %iscall% 'get') {
-        jval = copy(jval) # fix for #1212
+      } else {
+        sd_addresses = vapply_1c(SDenv, address)
+        jcpy = which(!vapply_1b(jval, is.null) & vapply_1c(jval, address) %chin% sd_addresses)
+        if (length(jcpy)) {
+          for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]])
+        } else if (address(jval) %chin% sd_addresses) {
+          jval = copy(jval) # fix for #4877, includes fix for #1212
+        }
       }
     }
 
@@ -1697,12 +1701,30 @@ replace_dot_alias = function(e) {
               deparse_ans = .massageSD(this)
               funi = funi + 1L # Fix for #985
               jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.)
-              jvnames = c(jvnames, deparse_ans[[2L]])
+              jn__ = deparse_ans[[2L]]
+              if (isTRUE(nzchar(names(jsubl)[i_]))) {
+                # Fix for #2311, prepend named arguments of c() to column names of .SD
+                #   e.g. c(mean=lapply(.SD, mean))
+                jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1))
+              }
+              jvnames = c(jvnames, jn__)
             } else if (this[[1L]] == "list") {
               # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen
               if (length(this) > 1L) {
                 jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.)
-                jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
+                if (isTRUE(nzchar(names(jsubl)[i_]))) {
+                  # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.*
+                  njl__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
+                  njl__nonblank = nzchar(names(jl__))
+                  if (length(jl__) > 1L) {
+                    jn__ = paste0(names(jsubl)[i_], seq_along(jl__))
+                  } else {
+                    jn__ = names(jsubl)[i_]
+                  }
+                  jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".")
+                } else {
+                  jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
+                }
                 idx  = unlist(lapply(jl__, function(x) is.name(x) && x == ".I"))
                 if (any(idx))
                   jn__[idx & !nzchar(jn__)] = "I"  # this & is correct not &&
@@ -1967,7 +1989,10 @@ replace_dot_alias = function(e) {
     if (any(ww)) jvnames[ww] = paste0("V",ww)
     setattr(ans, "names", c(bynames, jvnames))
   } else {
-    setnames(ans,seq_along(bynames),bynames)   # TO DO: reinvestigate bynames flowing from dogroups here and simplify
+    nonbynames = names(ans)[-seq_along(bynames)] #related to 2311. make naming of empty columns names more consistent
+    ww = which(!nzchar(nonbynames))
+    if (length(ww)) nonbynames[ww] = paste0("V", ww)
+    setattr(ans, "names", c(bynames, nonbynames))   # TO DO: reinvestigate bynames flowing from dogroups here and simplify
   }
   if (byjoin && keyby && !bysameorder) {
     if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()}

diff --git a/R/rowwiseDT.R b/R/rowwiseDT.R
@@ -0,0 +1,23 @@
+rowwiseDT = function(...) {
+  x = substitute(list(...))[-1L]
+  if (is.null(nms <- names(x)))
+    stopf("Must provide at least one column (use `name=`). See ?rowwiseDT for details")
+  header_pos = which(nzchar(nms))
+  if (any(nzchar(x[header_pos])))
+    stopf("Named arguments must be empty")
+  if (!identical(header_pos, seq_along(header_pos)))
+    stopf("Header must be the first N arguments")
+  header = nms[header_pos]
+  ncols = length(header)
+  body = lapply(x[-header_pos], eval, envir = parent.frame())
+  nrows = length(body) %/% ncols
+  if (length(body) != nrows * ncols)
+    stopf("There are %d columns but the number of cells is %d, which is not an integer multiple of the columns", ncols, length(body))
+  # make all the non-scalar elements to a list
+  needs_list = lengths(body) != 1L
+  body[needs_list] = lapply(body[needs_list], list)
+  body = split(body, rep(seq_len(nrows), each = ncols))
+  ans = rbindlist(body)
+  setnames(ans, header)
+  ans
+}
diff --git a/inst/tests/S4.Rraw b/inst/tests/S4.Rraw
@@ -90,14 +90,14 @@ ans_print = utils::capture.output(print(ans))
 if (TZnotUTC) {
   test(5.2, options=list(datatable.old.fread.datetime.character=NULL),
        fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""),
-       ans, output=ans_print)
+       copy(ans)[, a := as.IDate(a)], output=ans_print)
   test(5.3, options=list(datatable.old.fread.datetime.character=NULL),
        fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""),
-       data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
+       data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
 } else {
   test(5.4, options=list(datatable.old.fread.datetime.character=NULL),
        fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
-       ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
+       ans<-data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
   test(5.5, options=list(datatable.old.fread.datetime.character=NULL),
        fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)),
        ans, output=ans_print)