Merge branch 'master' into master

Rdatatable · Apr 4, 2024 · b6d1cf9 · b6d1cf9
2 parents 52d87cb + b6d6100
commit b6d1cf9
Show file tree

Hide file tree

Showing 11 changed files with 68 additions and 71 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,10 @@
 
 # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30)  (in development)
 
+## BREAKING CHANGES
+
+1. Usage of comma-separated character strings representing multiple columns in the `key=` argument of `data.table()` and `fread()` and in `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, it ultimately introduces inconsistencies in the implementation that are not worth the maintenance burden. Pass a character vector instead, e.g. `key=c("x","y")` rather than `key="x,y"`. NB: this hard deprecation is temporary in the development version. Before release, it will soften into the normal data.table deprecation cycle: first introducing the new behavior behind an option, then changing the option's default with a warning, then upgrading the warning to an error, and finally removing both the option and the error.
+
 ## NEW FEATURES
 
 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix.

diff --git a/R/data.table.R b/R/data.table.R
@@ -62,8 +62,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
   if (!is.null(key)) {
     if (!is.character(key)) stopf("key argument of data.table() must be character")
     if (length(key)==1L) {
-      key = strsplit(key,split=",")[[1L]]
-      # eg key="A,B"; a syntax only useful in key argument to data.table(), really.
+      if (grepl(",", key, fixed=TRUE)) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=") # scalar comma test; comparing key with its strsplit() pieces gives a length>1 (or length-0) condition, so if() itself errors before stopf() can run
     }
     setkeyv(ans,key)
   } else {
@@ -806,8 +805,7 @@ replace_dot_alias = function(e) {
 
         if (mode(bysub) == "character") {
           if (any(grepl(",", bysub, fixed = TRUE))) {
-            if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.", length(bysub))
-            bysub = strsplit(bysub, split=",", fixed=TRUE)[[1L]]
+            stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "by=")
           }
           bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138
           nzidx = nzchar(bysub)

diff --git a/R/fread.R b/R/fread.R
@@ -340,7 +340,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     if (!is.character(key))
       stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
     if (length(key) == 1L) {
-      key = strsplit(key, split = ",", fixed = TRUE)[[1L]]
+      if (grepl(",", key, fixed = TRUE)) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=") # scalar comma test; comparing key with its strsplit() pieces gives a length>1 (or length-0) condition, so if() itself errors before stopf() can run
     }
     setkeyv(ans, key)
   }

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd
@@ -241,7 +241,7 @@ identical(as.ITime("10:45"), methods::as("10:45", "ITime"))
 as.POSIXct("2001-01-01") + as.ITime("10:45")
 
 datetime <- seq(as.POSIXct("2001-01-01"), as.POSIXct("2001-01-03"), by = "5 hour")
-(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = "a,idate,itime"))
+(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = c("a", "idate", "itime")))
 
 af[, mean(a), by = "itime"]
 af[, mean(a), by = list(hour = hour(itime))]

diff --git a/man/data.table.Rd b/man/data.table.Rd
@@ -44,7 +44,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
 
     \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}.}
 
-    \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}.}
+    \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}.}
 
     \item{stringsAsFactors}{Logical (default is \code{FALSE}). Convert all \code{character} columns to \code{factor}s?}
 

diff --git a/man/duplicated.Rd b/man/duplicated.Rd
@@ -88,7 +88,7 @@ If none exists, 0L is returned.
 }
 \examples{
 DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3),
-                  C = rep(1:2, 6), key = "A,B")
+                  C = rep(1:2, 6), key = c("A", "B"))
 duplicated(DT)
 unique(DT)
 
@@ -113,7 +113,7 @@ identical(unique(DT),DT[10]) # FALSE
 
 # fromLast=TRUE
 DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3),
-                 C = rep(1:2, 6), key = "A,B")
+                 C = rep(1:2, 6), key = c("A", "B"))
 duplicated(DT, by="B", fromLast=TRUE)
 unique(DT, by="B", fromLast=TRUE)
 

diff --git a/man/fread.Rd b/man/fread.Rd
@@ -55,7 +55,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC"
   \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. }
   \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. }
   \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
-  \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
+  \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
   \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. Comma-separated notation like \code{index="x,y,z"} is accepted for convenience (note that \code{key} no longer accepts this notation). Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
   \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }
   \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. The default for this argument can be changed with \code{options(datatable.fread.datatable=FALSE)}.}

diff --git a/man/merge.Rd b/man/merge.Rd
@@ -87,16 +87,16 @@ merge(dt1, dt2, all = TRUE)
 (dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A"))
 merge(dt1, dt2, allow.cartesian=TRUE)
 
-(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = "A,B"))
-(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = "A,B"))
+(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = c("A", "B")))
+(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = c("A", "B")))
 merge(dt1, dt2)
 merge(dt1, dt2, by="B", allow.cartesian=TRUE)
 
 # test it more:
-d1 <- data.table(a=rep(1:2,each=3), b=1:6, key="a,b")
+d1 <- data.table(a=rep(1:2,each=3), b=1:6, key=c("a", "b"))
 d2 <- data.table(a=0:1, bb=10:11, key="a")
 d3 <- data.table(a=0:1, key="a")
-d4 <- data.table(a=0:1, b=0:1, key="a,b")
+d4 <- data.table(a=0:1, b=0:1, key=c("a", "b"))
 
 merge(d1, d2)
 merge(d2, d1)

diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd
@@ -83,7 +83,7 @@
   print(DT, row.names = FALSE)
 
   #`print.keys` can alert which columns are currently keys
-  DT <- data.table(a=1:3, b=4:6, c=7:9, key="b,a")
+  DT <- data.table(a=1:3, b=4:6, c=7:9, key=c("b", "a"))
   setindexv(DT, c("a", "b"))
   setindexv(DT, "a")
   print(DT, print.keys=TRUE)

diff --git a/man/setDT.Rd b/man/setDT.Rd
@@ -13,7 +13,7 @@ setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE)
 \arguments{
   \item{x}{ A named or unnamed \code{list}, \code{data.frame} or \code{data.table}. }
   \item{keep.rownames}{ For \code{data.frame}s, \code{TRUE} retains the \code{data.frame}'s row names under a new column \code{rn}. \code{keep.rownames = "id"} names the column \code{"id"} instead. } 
-  \item{key}{Character vector of one or more column names which is passed to \code{\link{setkeyv}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. }
+  \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. }
   \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}. }
 }