diff --git a/DESCRIPTION b/DESCRIPTION index 25a7bd6d37..f94ee86669 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -98,5 +98,6 @@ Authors@R: c( person("Christian", "Wia", role="ctb"), person("Elise", "Maigné", role="ctb"), person("Vincent", "Rocher", role="ctb"), - person("Vijay", "Lulla", role="ctb") + person("Vijay", "Lulla", role="ctb"), + person("Bill", "Evans", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index f203117bad..e601916fc1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -65,6 +65,29 @@ rowwiseDT( 4. `patterns()` in `melt()` combines correctly with user-defined `cols=`, which can be useful to specify a subset of columns to reshape without having to use a regex, for example `patterns("2", cols=c("y1", "y2"))` will only give `y2` even if there are other columns in the input matching `2`, [#6498](https://github.com/Rdatatable/data.table/issues/6498). Thanks to @hongyuanjia for the report, and to @tdhock for the PR. +5. `.SDcols=` now supports a list of expressions. By default, the second and each subsequent expression adds to the columns already selected; if an expression is prefixed with the literal `--`, the columns it matches are instead removed from the columns selected so far. Order matters: columns removed by one `--expr` may be added back by a later expression. All forms of expression currently supported by `.SDcols=` are still allowed within the list. Thanks to @r2evans for the request and PR. 
+ + ```r + DT = data.table(int1=1L, int2=2L, chr1="A", chr2="B", num1=1, num2=2, lgl=TRUE) + DT[, .SD, .SDcols = .(is.logical, !is.numeric)] + # lgl chr1 chr2 + # 1: TRUE A B + DT[, .SD, .SDcols = .(patterns("r2"), c(1L, 1L, 2L, 3L))] + # chr2 int1 int1 int2 chr1 + # 1: B 1 1 2 A + DT[, .SD, .SDcols = .("lgl", is.numeric)] + # lgl int1 int2 num1 num2 + # 1: TRUE 1 2 1 2 + DT[, .SD, .SDcols = .(patterns("1$"), --is.integer)] + # chr1 num1 + # 1: A 1 + DT[, .SD, .SDcols = .(patterns("1$"), !is.integer)] + # int1 chr1 num1 chr2 num2 lgl + # 1: 1 A 1 B 2 TRUE + ``` + + Please note in the last two examples that `--` is distinct from `!`/`-`: with `--is.integer`, already-selected columns that are integers will be _removed_ from the set of columns to return, whereas with `!is.integer`, all columns that are not integers will be _added_ to the set of columns to return. + ## BUG FIXES 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. 
diff --git a/R/data.table.R b/R/data.table.R index 62210bd838..376716095f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1004,7 +1004,16 @@ replace_dot_alias = function(e) { ansvals = chmatchdup(ansvars, names_x) } else { # FR #355 - negative numeric and character indices for SDcols - colsub = substitute(.SDcols) + colsubs = substitute(.SDcols) + # FR #6619 - list of expressions for .SDcols + colsubs = if (colsubs %iscall% c(".", "list")) as.list(colsubs)[-1] else list(colsubs) + combine = function(a, b, subtract = FALSE) if (subtract) a[!a %in% b] else c(a, b[!b %in% a]) + ansvars = sdvars = character() + ansvals = integer() + for (colsub in colsubs) { + # double-minus means to _remove_ columns from selection + rem_cols = (colsub %iscall% "-") && as.list(colsub[-1])[[1]] %iscall% "-" + if (rem_cols) colsub = colsub[[-1]][[-1]] # peel from parentheses before negation so (-1L) works as well: as.data.table(as.list(1:3))[, .SD,.SDcols=(-1L)] #4231 while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] # fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol. 
@@ -1017,48 +1026,52 @@ replace_dot_alias = function(e) { while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] if (colsub %iscall% ':' && length(colsub)==3L && !is.call(colsub[[2L]]) && !is.call(colsub[[3L]])) { # .SDcols is of the format a:b, ensure none of : arguments is a call data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1] #4231 - .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) + colsub = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) } else { if (colsub %iscall% 'patterns') { patterns_list_or_vector = eval_with_cols(colsub, names_x) - .SDcols = if (is.list(patterns_list_or_vector)) { + colsub = if (is.list(patterns_list_or_vector)) { # each pattern gives a new filter condition, intersect the end result Reduce(intersect, patterns_list_or_vector) } else { patterns_list_or_vector } } else { - .SDcols = eval(colsub, parent.frame(), parent.frame()) + colsub = eval(colsub, parent.frame(), parent.frame()) # allow filtering via function in .SDcols, #3950 - if (is.function(.SDcols)) { - .SDcols = lapply(x, .SDcols) - if (any(idx <- lengths(.SDcols) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA))) + if (is.function(colsub)) { + colsub = lapply(x, colsub) + if (any(idx <- lengths(colsub) > 1L | vapply_1c(colsub, typeof) != 'logical' | vapply_1b(colsub, anyNA))) stopf("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. 
However, these conditions were not met for: %s", brackify(names(x)[idx])) - .SDcols = unlist(.SDcols, use.names = FALSE) + colsub = unlist(colsub, use.names = FALSE) } } } - if (anyNA(.SDcols)) - stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) - if (is.logical(.SDcols)) { - if (length(.SDcols)!=length(x)) stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(x)) - ansvals = which_(.SDcols, !negate_sdcols) - ansvars = sdvars = names_x[ansvals] - } else if (is.numeric(.SDcols)) { - .SDcols = as.integer(.SDcols) - # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' - if (length(unique(sign(.SDcols))) > 1L) stopf(".SDcols is numeric but has both +ve and -ve indices") - if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) + if (anyNA(colsub)) + stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(colsub)))) + if (is.logical(colsub)) { + if (length(colsub)!=length(x)) stopf(".SDcols is a logical vector of length %d but there are %d columns", length(colsub), length(x)) + newvars = which_(colsub, !negate_sdcols) + ansvals = combine(ansvals, newvars, rem_cols) + ansvars = sdvars = combine(ansvars, names_x[newvars], rem_cols) + } else if (is.numeric(colsub)) { + colsub = as.integer(colsub) + # if colsub is numeric, use 'dupdiff' instead of 'setdiff' + if (length(unique(sign(colsub))) > 1L) stopf(".SDcols is numeric but has both +ve and -ve indices") + if (any(idx <- abs(colsub)>ncol(x) | abs(colsub)<1L)) stopf(".SDcols is numeric but out of bounds [1, %d] at: %s", ncol(x), brackify(which(idx))) - ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols] - ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols + newvars = if (negate_sdcols) dupdiff(names_x[-colsub], bynames) else names_x[colsub] + ansvars = sdvars = combine(ansvars, newvars, rem_cols) + ansvals = 
combine(ansvals, if (negate_sdcols) setdiff(seq_along(names(x)), c(colsub, which(names(x) %chin% bynames))) else colsub, rem_cols) } else { - if (!is.character(.SDcols)) stopf(".SDcols should be column numbers or names") - if (!all(idx <- .SDcols %chin% names_x)) - stopf("Some items of .SDcols are not column names: %s", brackify(.SDcols[!idx])) - ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols + if (!is.character(colsub)) stopf(".SDcols should be column numbers or names") + if (!all(idx <- colsub %chin% names_x)) + stopf("Some items of .SDcols are not column names: %s", brackify(colsub[!idx])) + newvars = if (negate_sdcols) setdiff(names_x, c(colsub, bynames)) else colsub + ansvars = sdvars = combine(ansvars, newvars, rem_cols) # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove) - ansvals = chmatch(ansvars, names_x) + ansvals = combine(ansvals, chmatch(newvars, names_x), rem_cols) + } } } # fix for long standing FR/bug, #495 and #484 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fef8f1fffb..735941856d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20596,3 +20596,12 @@ test(2295.3, is.data.table(d2)) # #6588: .checkTypos used to give arbitrary strings to stopf as the first argument test(2296, d2[x %no such operator% 1], error = '%no such operator%') + +# #6619: .SDcols supports list of expressions +DT = data.table(int1=1L, int2=2L, chr1="A", chr2="B", num1=1, num2=2, lgl=TRUE) +test(2297.1, DT[, .SD, .SDcols = .(is.logical, !is.numeric)], DT[, .(lgl, chr1, chr2)]) +test(2297.2, DT[, .SD, .SDcols = .(patterns("r2"), c(1L, 1L, 2L, 3L))], DT[, .(chr2, int1, int1, int2, chr1)]) +test(2297.3, DT[, .SD, .SDcols = .("lgl", is.numeric)], DT[, .(lgl, int1, int2, num1, num2)]) +# difference between `--` and `!` +test(2297.4, DT[, .SD, .SDcols = .(patterns("1$"), --is.integer)], DT[, .(chr1, num1)]) +test(2297.5, DT[, 
.SD, .SDcols = .(patterns("1$"), !is.integer)], DT[, .(int1, chr1, num1, chr2 ,num2, lgl)]) diff --git a/man/data.table.Rd b/man/data.table.Rd index 56738a9d3c..deff18ca4e 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -152,7 +152,17 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac Inversion (column dropping instead of keeping) can be accomplished be prepending the argument with \code{!} or \code{-} (there's no difference between these), e.g. \code{.SDcols = !c('x', 'y')}. - Finally, you can filter columns to include in \code{.SD} based on their \emph{names} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can filter columns on \code{values} by passing a function, e.g. \code{.SDcols=\link{is.numeric}}. You can also invert a pattern as usual with \code{.SDcols=!patterns(...)} or \code{.SDcols=!is.numeric}. + You can filter columns to include in \code{.SD} based on their \emph{names} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can filter columns on \code{values} by passing a function, e.g. \code{.SDcols=\link{is.numeric}}. You can also invert a pattern as usual with \code{.SDcols=!patterns(...)} or \code{.SDcols=!is.numeric}. + + Finally, you can combine any of the other column-selection methods in a list of different column-selection methods. Considerations: + + \itemize{ + \item The selection of columns is \emph{additive}, meaning that \code{.SDcols=.(is.numeric, 'x')} will include all columns that are numeric and the column named 'x' (whether or not it contains numeric data). 
+ \item Selection-inversion (selecting columns that do not meet a condition) works the same using the traditional \code{!} or \code{-}. + \item Deduplication. Columns in one element that were selected in previous elements will not be added again. For example, if column \code{x} is a numeric column, then \code{.SDcols=.(is.numeric, 'x')} will return only one instance of the \code{x} column. One can still explicitly choose duplicate columns within each selection, such as \code{.SDcols=.(is.numeric, c(1,1,2,3))}. + \item Removing columns that were selected in previous elements in the list is done with a double-minus, as in \code{--cols}. This supports any of the accepted column-selection methods (integers, strings, function calls, etc.). For example, \code{.SDcols=.(is.numeric, --'x')} will select all numeric columns except the one named 'x'. Because this only removes columns that were previously selected, a \code{--cols} removal as the first element of \code{.SDcols} has no effect. + } + } \item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future.