Merge branch 'master' into condition-signals

Rdatatable · Apr 9, 2024 · 25e524b · 25e524b
2 parents f18ce75 + b09649b
commit 25e524b
Show file tree

Hide file tree

Showing 24 changed files with 454 additions and 261 deletions.
diff --git a/.dev/cc.R b/.dev/cc.R
@@ -31,6 +31,26 @@ sourceDir = function(path=getwd(), trace = TRUE, ...) {
   if(trace) cat("\n")
 }
 
+# NB: since we only import from default packages, this is rarely needed, but useful for truly minimal dev environments (#6056)
+sourceImports = function(path=getwd(), quiet=FALSE) {
+  nsFile = file.path(path, "NAMESPACE")
+  if (!file.exists(nsFile)) {
+    if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly")
+    return(invisible())
+  }
+  nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function
+  if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports)))
+  for (ii in seq_along(nsParsedImports)) {
+    entry = nsParsedImports[[ii]]
+    # getNamespaceExports includes weird objects but that's intentional, consider evalq(.__C__VIRTUAL, asNamespace("Rcpp")) due to import(methods) in that NAMESPACE
+    imported = if (length(entry) == 1L) getNamespaceExports(entry) else entry[[2L]]
+    # Assign directly to better imitate actual namespace imports. Earlier tried to require(include.only=) these objects, but R doesn't allow multiple such require, meaning we can't add more objects later in tests, see:
+    #   https://stat.ethz.ch/pipermail/r-devel/2024-April/083319.html
+    for (import in imported) assign(import, getExportedValue(entry[[1L]], import), .GlobalEnv)
+  }
+  return(invisible())
+}
+
 cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) {
   if (!missing(cc_dir)) {
     warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead")
@@ -81,6 +101,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys
     .GlobalEnv[[Call$name]] = Call$address
   for (Extern in xx$.External)
     .GlobalEnv[[Extern$name]] = Extern$address
+  sourceImports(path, quiet=quiet)
   sourceDir(file.path(path, "R"), trace=!quiet)
   if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table", "cbind.data.table"), envir=.GlobalEnv) # 3968 follow up
   .GlobalEnv$testDir = function(x) file.path(path,"inst/tests",x)

diff --git a/CODEOWNERS b/CODEOWNERS
@@ -1,9 +1,12 @@
 # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
 * @jangorecki @michaelchirico
 
-# melt
+# reshaping
+/R/fcast.R @tdhock
 /R/fmelt.R @tdhock
+/src/fcast.c @tdhock
 /src/fmelt.c @tdhock
+/man/dcast.data.table.Rd @tdhock
 /man/melt.data.table.Rd @tdhock
 /vignettes/datatable-reshape.Rmd @tdhock
 

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -83,5 +83,6 @@ Authors@R: c(
   person("Dereck","de Mezquita",   role="ctb"),
   person("Michael","Czekanski",    role="ctb"),
   person("Dmitry", "Shemetov",     role="ctb"),
-  person("Nitish", "Jha",          role="ctb")
+  person("Nitish", "Jha",          role="ctb"),
+  person("Joshua", "Wu",           role="ctb")
   )
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,6 @@ useDynLib("data_table", .registration=TRUE)
 
 ## For S4-ization
 importFrom(methods, "S3Part<-", slotNames)
-importMethodsFrom(methods, "[")
 exportClasses(data.table, IDate, ITime)
 ##
 

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,10 @@
 
 # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30)  (in development)
 
+## BREAKING CHANGES
+
+1. Usage of comma-separated character strings representing multiple columns in `data.table()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. NB: this hard deprecation is temporary in the development version. Before release, it will soften into the normal data.table deprecation cycle starting from introducing the new behavior with an option, then changing the default for the option with a warning, then upgrading the warning to an error before finally removing the option and the error.
+
 ## NEW FEATURES
 
 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix.
@@ -26,6 +30,8 @@
 
 7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR.
 
+8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix.
+
 ## BUG FIXES
 
 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.
@@ -38,6 +44,10 @@
 
 5. `fwrite(x, row.names=TRUE)` with `x` a `matrix` writes `row.names` when present, not row numbers, [#5315](https://github.com/Rdatatable/data.table/issues/5315). Thanks to @Liripo for the report, and @ben-schwen for the fix.
 
+6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation.
+
+7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. Any users who were relying on this behavior can change `measure.vars=list("col_name")` (output `variable` was column name, now is column index/integer) to `measure.vars="col_name"` (`variable` still is column name).
+
 ## NOTES
 
 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1.
@@ -56,6 +66,12 @@
 
 8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup.
 
+9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix.
+
+10. `test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR.
+
+11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix.
+
 # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29)  (30 Jan 2024)
 
 ## BREAKING CHANGE

diff --git a/R/AllS4.R b/R/AllS4.R
@@ -3,21 +3,21 @@ if ("package:data.table" %in% search()) stopf("data.table package loaded. When d
 
 ## Allows data.table to be defined as an object of an S4 class,
 ## or even have data.table be a super class of an S4 class.
-setOldClass(c('data.frame'))
-setOldClass(c('data.table', 'data.frame'))
+methods::setOldClass(c('data.frame'))
+methods::setOldClass(c('data.table', 'data.frame'))
 
 ## as(some.data.frame, "data.table")
-setAs("data.frame", "data.table", function(from) {
+methods::setAs("data.frame", "data.table", function(from) {
   as.data.table(from)
 })
 
 ## as(some.data.table, "data.frame")
-setAs("data.table", "data.frame", function(from) {
+methods::setAs("data.table", "data.frame", function(from) {
   as.data.frame(from)
 })
 
-setOldClass("IDate")
-setOldClass("ITime")
+methods::setOldClass("IDate")
+methods::setOldClass("ITime")
 
-setAs("character", "IDate", function(from) as.IDate(from))
-setAs("character", "ITime", function(from) as.ITime(from))
+methods::setAs("character", "IDate", function(from) as.IDate(from))
+methods::setAs("character", "ITime", function(from) as.ITime(from))
diff --git a/R/data.table.R b/R/data.table.R
@@ -12,7 +12,7 @@ dim.data.table = function(x)
 }
 
 .global = new.env()  # thanks to: http://stackoverflow.com/a/12605694/403310
-setPackageName("data.table",.global)
+methods::setPackageName("data.table",.global)
 .global$print = ""
 
 # NB: if adding to/editing this list, be sure to do the following:
@@ -62,8 +62,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
   if (!is.null(key)) {
     if (!is.character(key)) stopf("key argument of data.table() must be character")
     if (length(key)==1L) {
-      key = strsplit(key,split=",")[[1L]]
-      # eg key="A,B"; a syntax only useful in key argument to data.table(), really.
+      if (key != strsplit(key,split=",")[[1L]]) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=")
     }
     setkeyv(ans,key)
   } else {
@@ -806,8 +805,7 @@ replace_dot_alias = function(e) {
 
         if (mode(bysub) == "character") {
           if (any(grepl(",", bysub, fixed = TRUE))) {
-            if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.", length(bysub))
-            bysub = strsplit(bysub, split=",", fixed=TRUE)[[1L]]
+            stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "by=")
           }
           bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138
           nzidx = nzchar(bysub)
@@ -1024,8 +1022,13 @@ replace_dot_alias = function(e) {
             .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame())
           } else {
             if (colsub %iscall% 'patterns') {
-              # each pattern gives a new filter condition, intersect the end result
-              .SDcols = Reduce(intersect, eval_with_cols(colsub, names_x))
+              patterns_list_or_vector = eval_with_cols(colsub, names_x)
+              .SDcols = if (is.list(patterns_list_or_vector)) {
+                # each pattern gives a new filter condition, intersect the end result
+                Reduce(intersect, patterns_list_or_vector)
+              } else {
+                patterns_list_or_vector
+              }
             } else {
               .SDcols = eval(colsub, parent.frame(), parent.frame())
               # allow filtering via function in .SDcols, #3950

diff --git a/R/fmelt.R b/R/fmelt.R
@@ -19,17 +19,18 @@ melt.default = function(data, ..., na.rm = FALSE, value.name = "value") {
   # nocov end
 }
 
-patterns = function(..., cols=character(0L)) {
+patterns = function(..., cols=character(0L), ignore.case=FALSE, perl=FALSE, fixed=FALSE, useBytes=FALSE) {
   # if ... has no names, names(list(...)) will be "";
   #   this assures they'll be NULL instead
   L = list(...)
   p = unlist(L, use.names = any(nzchar(names(L))))
   if (!is.character(p))
     stopf("Input patterns must be of type character.")
-  matched = lapply(p, grep, cols)
+  matched = lapply(p, grep, cols, ignore.case=ignore.case, perl=perl, fixed=fixed, useBytes=useBytes)
   # replace with lengths when R 3.2.0 dependency arrives
   if (length(idx <- which(sapply(matched, length) == 0L)))
     stopf('Pattern(s) not found: [%s]', brackify(p[idx]))
+  if (length(matched) == 1L) return(matched[[1L]])
   matched
 }
 

diff --git a/R/fread.R b/R/fread.R
@@ -340,7 +340,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     if (!is.character(key))
       stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
     if (length(key) == 1L) {
-      key = strsplit(key, split = ",", fixed = TRUE)[[1L]]
+      if (key != strsplit(key,split=",")[[1L]]) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=")
     }
     setkeyv(ans, key)
   }

diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -111,7 +111,11 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
     toprint = toprint_subset(toprint, cols_to_print)
   }
   if (printdots) {
-    toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn))
+    if (isFALSE(row.names)) {
+      toprint = rbind(head(toprint, topn + isTRUE(class)), "---", tail(toprint, topn)) # 4083
+    } else {
+      toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn))
+    }
     rownames(toprint) = format(rownames(toprint), justify="right")
     if (col.names == "none") {
       cut_colnames(print(toprint, right=TRUE, quote=quote))
@@ -229,11 +233,16 @@ format_list_item.default = function(x, ...) {
 
 # FR #1091 for pretty printing of character
 # TODO: maybe instead of doing "this is...", we could do "this ... test"?
+# Current implementation may have issues when dealing with strings that have combinations of full-width and half-width characters,
+# if this becomes a problem in the future, we could consider string traversal instead.
 char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) {
   trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
   if (!is.character(x) || trunc.char <= 0L) return(x)
-  idx = which(nchar(x) > trunc.char)
-  x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...")
+  nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 
+  nchar_chars = nchar(x, 'char')
+  is_full_width = nchar_width > nchar_chars
+  idx = pmin(nchar_width, nchar_chars) > trunc.char
+  x[idx] = paste0(strtrim(x[idx], trunc.char * fifelse(is_full_width[idx], 2L, 1L)), "...")
   x
 }
 

diff --git a/R/test.data.table.R b/R/test.data.table.R
@@ -98,6 +98,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
     datatable.print.trunc.cols = FALSE, #4552
     datatable.rbindlist.check = NULL,
     datatable.integer64 = "integer64",
+    digits = 7L, # ensure printing rounds to the expected number of digits in all sessions, #5285
+    warn = 0L,   # ensure signals are emitted as they are in the code, #5285
     warnPartialMatchArgs = base::getRversion()>="3.6.0", # ensure we don't rely on partial argument matching in internal code, #3664; >=3.6.0 for #3865
     warnPartialMatchAttr = TRUE,
     warnPartialMatchDollar = TRUE,
@@ -416,7 +418,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
     }
   }
   if (!fail && !length(error) && (!length(output) || !missing(y))) {   # TODO test y when output=, too
-    y = try(y,TRUE)
+    capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it
     if (identical(x,y)) return(invisible(TRUE))
     all.equal.result = TRUE
     if (is.data.frame(x) && is.data.frame(y)) {