Skip to content

Commit

Permalink
Merge branch 'master' into subset_expression
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico committed Sep 8, 2024
2 parents 694ed5e + f0aaaf9 commit 3078461
Show file tree
Hide file tree
Showing 29 changed files with 542 additions and 134 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
^\.devcontainer$
^\.graphics$
^\.github$
^\.vscode$
^\.zed$

^\.gitlab-ci\.yml$
Expand Down
12 changes: 12 additions & 0 deletions .ci/atime/tests.R
Original file line number Diff line number Diff line change
Expand Up @@ -158,5 +158,17 @@ test.list <- atime::atime_test_list(
Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue

# Issue reported in: https://github.com/Rdatatable/data.table/issues/6286
# Fixed in: https://github.com/Rdatatable/data.table/pull/6296
"DT[by, verbose = TRUE] improved in #6296" = atime::atime_test(
N = 10^seq(1, 9),
setup = {
dt = data.table(a = 1:N)
dt_mod <- copy(dt)
},
expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue

NULL)
# nolint end: undesirable_operator_linter.
4 changes: 2 additions & 2 deletions .dev/cc.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, path=Sys.getenv(
if (clean) system("rm *.o *.so")
OMP = if (omp) "openmp" else "no-openmp"
if (debug) {
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c99\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c11\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
} else {
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c99\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c11\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
# the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem.
# TODO add -Wextra too?
}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/performance-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ on:

jobs:
comment:
if: github.repository == 'Rdatatable/data.table'
runs-on: ubuntu-latest
container: ghcr.io/iterative/cml:0-dvc2-base1
env:
Expand Down
4 changes: 4 additions & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
/src/shift.c @ben-schwen @michaelchirico
/man/shift.Rd @ben-schwen @michaelchirico

# rbind
/src/rbindlist.c @ben-schwen
/man/rbindlist.Rd @ben-schwen

# translations
/inst/po/ @michaelchirico
/po/ @michaelchirico
Expand Down
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,6 @@ Authors@R: c(
person("Iago", "Giné-Vázquez", role="ctb"),
person("Anirban", "Chetia", role="ctb"),
person("Doris", "Amoakohene", role="ctb"),
person("Ivan", "Krylov", role="ctb")
person("Ivan", "Krylov", role="ctb"),
person("Michael","Young", role="ctb")
)
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ exportClasses(data.table, IDate, ITime)
##

export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy)
export(rowwiseDT)
export(setindex, setindexv, indices)
export(as.data.table,is.data.table,test.data.table)
export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%")
Expand Down
57 changes: 56 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,64 @@

1. In `DT[, variable := value]`, when value is class `POSIXlt`, we automatically coerce it to class `POSIXct` instead, [#1724](https://github.com/Rdatatable/data.table/issues/1724). Thanks to @linzhp for the report, and Benjamin Schwendinger for the fix.

## NEW FEATURES

1. New function `rowwiseDT()` for creating a data.table object "row-wise", often convenient for readability of small, literally-defined tables. Thanks to @shrektan for the suggestion and PR and @tdeenes for the idea of the `name=` syntax. Inspired by `tibble::tribble()`.

```r
library(data.table)
rowwiseDT(
a=,b=,c=, d=,
1, 2, "a", 2:3,
3, 4, "b", list("e"),
5, 6, "c", ~a+b,
)
#> a b c d
#> <num> <num> <char> <list>
#> 1: 1 2 a 2,3
#> 2: 3 4 b e
#> 3: 5 6 c ~a + b
```

2. Subsetting or aggregating columns of type `expression` works, [#5596](https://github.com/Rdatatable/data.table/issues/5596). Thanks to @tsp for the report, and @ben-schwen for the fix.

## BUG FIXES

1. Using `print.data.table()` with character truncation using `datatable.prettyprint.char` no longer errors with `NA` entries, [#6441](https://github.com/Rdatatable/data.table/issues/6441). Thanks to @r2evans for the bug report, and @joshhwuu for the fix.

2. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.

3. Subsetting or aggregating columns of type `expression` works, [#5596](https://github.com/Rdatatable/data.table/issues/5596). Thanks to @tsp for the report, and @ben-schwen for the fix.
3. The data.table-only attribute `$.internal.selfref` is no longer set for data.frames. [#5286](https://github.com/Rdatatable/data.table/issues/5286). Thanks @OfekShilon for the report and fix.

4. Tagging/naming arguments of `c()` in `j=c()` should now more closely follow base R conventions for concatenation of named lists during grouping, [#2311](https://github.com/Rdatatable/data.table/issues/2311). Naming an `lapply(.SD, FUN)` call as an argument of `c()` in `j` will now always cause that tag to get prepended (with a single dot separator) to the resulting column names. Additionally, naming a `list()` call as an argument of `c()` in `j` will now always cause that tag to get prepended to any names specified within the list call. This bug only affected queries with (1) `by=` grouping (2) `getOption("datatable.optimize") >= 1L` and (3) `lapply(.SD, FUN)` in `j`.

While the names returned by `data.table` when `j=c()` will now mostly follow base R conventions for concatenating lists, note that names which are completely unspecified will still be named positionally, matching the typical behavior in `j` and `data.table()`. according to position in `j` (e.g. `V1`, `V2`).

Thanks to @franknarf1 for reporting and @myoung3 for the PR.

```r
# tag 'mean' prepended to lapply()-named columns
names(mtcars[, c(mean=lapply(.SD,sum)), by="cyl", .SDcols=c("am", "carb")])
# [1] "cyl" "mean.am" "mean.carb"

# tag 'mean' is prepended to the first named sublist, 'sum' to the second
names(mtcars[, c(mean=list(a=mean(hp), b=mean(wt)), sum=lapply(.SD, sum)), by="cyl", .SDcols=c("am", "carb")])
# [1] "cyl" "mean.a" "mean.b" "sum.am" "sum.carb"

# strict base naming would result in names c("", "b", "c") here
names(mtcars[, c(list(mean(hp), b=mean(wt)), c=list(mean(cyl)))])
# [1] "V1" "b" "c"
```

5. Queries like `DT[, min(x):max(x)]` now work as expected, i.e. the same as `DT[, seq(min(x), max(x))]` or `with(DT, min(x):max(x))`, [#2069](https://github.com/Rdatatable/data.table/issues/2069). Shorthand like `DT[, a:b]` meaning "select from columns `a` through `b`" still works. Thanks to @franknarf1 for reporting and @jangorecki for the fix.

6. Fixed a segfault in `fcase()`, [#6448](https://github.com/Rdatatable/data.table/issues/6448). Thanks @ethanbsmith for reporting with reprex, @aitap for finding the root cause, and @MichaelChirico for the PR.

7. `fread()` performance improves when specifying `Date` among `colClasses`, [#6105](https://github.com/Rdatatable/data.table/issues/6105). One implication of the change is that the column will be an `IDate` (which also inherits from `Date`), which may affect code strongly relying on the column class to be `Date` exactly; computations with `IDate` and `Date` columns should otherwise be the same. If you strongly prefer the `Date` class, run `as.Date()` explicitly following `fread()`. Thanks @scipima for the report and @MichaelChirico for the fix.

8. `dt[, col]` now returns a copy of `col` also when it is a list column, as in any other case, [#4877](https://github.com/Rdatatable/data.table/issues/4877). Thanks to @tlapak for reporting and the PR.

9. `rbindlist` and `rbind` binding `bit64::integer64` columns with `character`/`complex`/`list` columns now works, [#5504](https://github.com/Rdatatable/data.table/issues/5504). Thanks to @MichaelChirico for the request and @ben-schwen for the PR.

## NOTES

Expand All @@ -24,6 +75,10 @@

4. The translations submitted for 1.16.0 are now actually shipped with the package -- our deepest apologies to the translators for the omission. We have added a CI check to ensure that the .mo binaries which get shipped with the package are always up-to-date.

5. Some grouping operations run much faster under `verbose=TRUE`, [#6286](https://github.com/Rdatatable/data.table/issues/6286). Thanks @joshhwuu for the report and fix. This overhead was not present on Windows. As a rule, users should expect `verbose=TRUE` operations to run more slowly, as extra statistics might be calculated as part of the report; here was a case where the overhead was particularly high and the fix was particularly easy.

6. `set()` and `:=` now provide some extra guidance for common incorrect approaches to assigning `NULL` to some rows of a list column. The correct way is to put `list(list(NULL))` on the RHS of `:=` (or `.(.(NULL))` for short). Thanks to @MichaelChirico for the suggestion and @Nj221102 for the implementation.

# data.table [v1.16.0](https://github.com/Rdatatable/data.table/milestone/30) (25 August 2024)

## BREAKING CHANGES
Expand Down
47 changes: 36 additions & 11 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -244,10 +244,10 @@ replace_dot_alias = function(e) {
if (!missing(j)) {
jsub = replace_dot_alias(jsub)
root = root_name(jsub)
if (root == ":" ||
(root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) &&
root %chin% c("","c","paste","paste0","-","!") &&
if ((root == ":" && !is.call(jsub[[2L]]) && !is.call(jsub[[3L]])) || ## x[, V1:V2]; but not x[, (V1):(V2)] #2069
(root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || ## x[, !(V8:V10)]
( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) && ## x[, "V1"]; x[, ..v]
root %chin% c("","c","paste","paste0","-","!") && ## x[, c("V1","V2")]; x[, paste("V",1:2,sep="")]; x[, paste0("V",1:2)]
missingby )) { # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed)
# When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find.
# If variable names do occur, but they are all prefixed with .., then that means look up in calling scope.
Expand Down Expand Up @@ -1382,10 +1382,14 @@ replace_dot_alias = function(e) {
if (jcpy) jval = copy(jval)
} else if (address(jval) == address(SDenv$.SD)) {
jval = copy(jval)
} else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) {
for (jidx in jcpy) { if(!is.null(jval[[jidx]])) jval[[jidx]] = copy(jval[[jidx]]) }
} else if (jsub %iscall% 'get') {
jval = copy(jval) # fix for #1212
} else {
sd_addresses = vapply_1c(SDenv, address)
jcpy = which(!vapply_1b(jval, is.null) & vapply_1c(jval, address) %chin% sd_addresses)
if (length(jcpy)) {
for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]])
} else if (address(jval) %chin% sd_addresses) {
jval = copy(jval) # fix for #4877, includes fix for #1212
}
}
}

Expand Down Expand Up @@ -1697,12 +1701,30 @@ replace_dot_alias = function(e) {
deparse_ans = .massageSD(this)
funi = funi + 1L # Fix for #985
jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.)
jvnames = c(jvnames, deparse_ans[[2L]])
jn__ = deparse_ans[[2L]]
if (isTRUE(nzchar(names(jsubl)[i_]))) {
# Fix for #2311, prepend named arguments of c() to column names of .SD
# e.g. c(mean=lapply(.SD, mean))
jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1))
}
jvnames = c(jvnames, jn__)
} else if (this[[1L]] == "list") {
# also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen
if (length(this) > 1L) {
jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.)
jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
if (isTRUE(nzchar(names(jsubl)[i_]))) {
# Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.*
njl__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
njl__nonblank = nzchar(names(jl__))
if (length(jl__) > 1L) {
jn__ = paste0(names(jsubl)[i_], seq_along(jl__))
} else {
jn__ = names(jsubl)[i_]
}
jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".")
} else {
jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__)
}
idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I"))
if (any(idx))
jn__[idx & !nzchar(jn__)] = "I" # this & is correct not &&
Expand Down Expand Up @@ -1967,7 +1989,10 @@ replace_dot_alias = function(e) {
if (any(ww)) jvnames[ww] = paste0("V",ww)
setattr(ans, "names", c(bynames, jvnames))
} else {
setnames(ans,seq_along(bynames),bynames) # TO DO: reinvestigate bynames flowing from dogroups here and simplify
nonbynames = names(ans)[-seq_along(bynames)] #related to 2311. make naming of empty columns names more consistent
ww = which(!nzchar(nonbynames))
if (length(ww)) nonbynames[ww] = paste0("V", ww)
setattr(ans, "names", c(bynames, nonbynames)) # TO DO: reinvestigate bynames flowing from dogroups here and simplify
}
if (byjoin && keyby && !bysameorder) {
if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()}
Expand Down
23 changes: 23 additions & 0 deletions R/rowwiseDT.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
rowwiseDT = function(...) {
x = substitute(list(...))[-1L]
if (is.null(nms <- names(x)))
stopf("Must provide at least one column (use `name=`). See ?rowwiseDT for details")
header_pos = which(nzchar(nms))
if (any(nzchar(x[header_pos])))
stopf("Named arguments must be empty")
if (!identical(header_pos, seq_along(header_pos)))
stopf("Header must be the first N arguments")
header = nms[header_pos]
ncols = length(header)
body = lapply(x[-header_pos], eval, envir = parent.frame())
nrows = length(body) %/% ncols
if (length(body) != nrows * ncols)
stopf("There are %d columns but the number of cells is %d, which is not an integer multiple of the columns", ncols, length(body))
# make all the non-scalar elements to a list
needs_list = lengths(body) != 1L
body[needs_list] = lapply(body[needs_list], list)
body = split(body, rep(seq_len(nrows), each = ncols))
ans = rbindlist(body)
setnames(ans, header)
ans
}
6 changes: 3 additions & 3 deletions inst/tests/S4.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ ans_print = utils::capture.output(print(ans))
if (TZnotUTC) {
test(5.2, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""),
ans, output=ans_print)
copy(ans)[, a := as.IDate(a)], output=ans_print)
test(5.3, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""),
data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
} else {
test(5.4, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
ans<-data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
test(5.5, options=list(datatable.old.fread.datetime.character=NULL),
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)),
ans, output=ans_print)
Expand Down
Loading

0 comments on commit 3078461

Please sign in to comment.