Merge branch 'master' into condition-signals

Rdatatable · Apr 20, 2024 · 4624817 · 4624817
2 parents f290c38 + 8ae1b2d
commit 4624817
Show file tree

Hide file tree

Showing 30 changed files with 751 additions and 302 deletions.
diff --git a/.dev/cc.R b/.dev/cc.R
@@ -38,6 +38,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) {
     if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly")
     return(invisible())
   }
+  suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv
   nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function
   if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports)))
   for (ii in seq_along(nsParsedImports)) {
@@ -51,7 +52,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) {
   return(invisible())
 }
 
-cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) {
+cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) {
   if (!missing(cc_dir)) {
     warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead")
     path = cc_dir

diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml
@@ -0,0 +1,103 @@
+on:
+  schedule:
+   - cron: '18 13 8 * *' # 8th of month at 13:18 UTC
+
+# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release
+name: R-CMD-check-occasional
+
+jobs:
+  R-CMD-check-occasional:
+    runs-on: ${{ matrix.os }}
+
+    name: ${{ matrix.os }} (${{ matrix.r }})
+
+    strategy:
+      matrix:
+        os: [macOS-latest, windows-latest, ubuntu-latest]
+        r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3']
+        locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502)
+        exclude: # only run non-English locale CI on Ubuntu
+          - os: macOS-latest
+            locale: 'zh_CN.utf8'
+          - os: macOS-latest
+            locale: 'lv_LV.utf8'
+          - os: windows-latest
+            locale: 'zh_CN.utf8'
+          - os: windows-latest
+            locale: 'lv_LV.utf8'
+
+    env:
+      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      TEST_DATA_TABLE_WITH_OTHER_PACKAGES: true
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - name: Set locale
+        if: matrix.locale == 'en_US.utf8'
+        run: |
+          sudo locale-gen en_US
+          echo "LC_ALL=en_US.utf8" >> $GITHUB_ENV
+
+      - name: Set locale
+        if: matrix.locale == 'zh_CN.utf8'
+        run: |
+          sudo locale-gen zh_CN
+          echo "LC_ALL=zh_CN.utf8" >> $GITHUB_ENV
+          echo "LANGUAGE=zh_CN" >> $GITHUB_ENV
+
+      - name: Set locale
+        if: matrix.locale == 'lv_LV.utf8'
+        run: |
+          sudo locale-gen lv_LV
+          echo "LC_ALL=lv_LV.utf8" >> $GITHUB_ENV
+          echo "LANGUAGE=lv_LV" >> $GITHUB_ENV
+
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.r }}
+
+
+      - name: Query dependencies
+        run: |
+          install.packages('remotes')
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+        shell: Rscript {0}
+
+      - name: Restore R package cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
+
+      - name: Install system dependencies
+        if: runner.os == 'Linux'
+        run: |
+          while read -r cmd
+          do
+            eval sudo $cmd
+          done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))')
+
+      - name: Install dependencies
+        run: |
+          remotes::install_deps(dependencies = TRUE)
+          remotes::install_cran("rcmdcheck")
+        shell: Rscript {0}
+
+      - name: Check
+        env:
+          _R_CHECK_CRAN_INCOMING_REMOTE_: false
+        run: |
+          options(crayon.enabled = TRUE)
+          rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
+        shell: Rscript {0}
+
+      - name: Upload check results
+        if: failure()
+        uses: actions/upload-artifact@main
+        with:
+          name: ${{ runner.os }}-r${{ matrix.r }}-results
+          path: check
diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml
@@ -0,0 +1,23 @@
+name: Autocomment atime-based performance regression analysis on PRs
+
+on:
+  pull_request:
+    branches:
+      - '*'
+    types:
+      - opened
+      - reopened
+      - synchronize
+    paths:
+      - 'R/**'
+      - 'src/**'       
+
+jobs:
+  comment:
+    runs-on: ubuntu-latest
+    container: ghcr.io/iterative/cml:0-dvc2-base1
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      repo_token: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: Anirban166/[email protected]
diff --git a/NEWS.md b/NEWS.md
@@ -32,6 +32,12 @@
 
 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix.
 
+9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development.
+
+10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR.
+
+11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods, [#5417](https://github.com/Rdatatable/data.table/issues/5417).
+
 ## BUG FIXES
 
 1. `unique()` returns a copy in the case when `nrow(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.
@@ -68,10 +74,14 @@
 
 9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix.
 
-10. `test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR.
+10. `test.data.table()` runs robustly:
+  + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR.
+  + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix.
 
 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix.
 
+12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix.
+
 # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29)  (30 Jan 2024)
 
 ## BREAKING CHANGE

diff --git a/R/data.table.R b/R/data.table.R
@@ -2452,9 +2452,11 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
     dtq[["i"]] = quote(levs)
     join = TRUE
   }
+  dots = list(...)
+  if (!"sep" %chin% names(dots)) dots$sep = "."
   dtq[["j"]] = substitute(
-    list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=".")),
-    list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD"))
+    list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=.sep)),
+    list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD"), .sep = dots$sep)
   )
   dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`.
     .expr,

diff --git a/R/fmelt.R b/R/fmelt.R
@@ -107,17 +107,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
       stopf("pattern must be character string")
     }
     match.vec = regexpr(pattern, cols, perl=TRUE)
-    measure.vec = which(0 < match.vec)
-    if (length(measure.vec) == 0L) {
+    measure.vec.i = which(0 < match.vec)
+    if (length(measure.vec.i) == 0L) {
       stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern")
     }
-    start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE]
+    start = attr(match.vec, "capture.start")[measure.vec.i, , drop=FALSE]
     if (is.null(start)) {
       stopf("pattern must contain at least one capture group (parenthesized sub-pattern)")
     }
     err.args.groups("number of capture groups in pattern", ncol(start))
-    end = attr(match.vec, "capture.length")[measure.vec,]+start-1L
-    names.mat = matrix(cols[measure.vec], nrow(start), ncol(start))
+    end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L
+    measure.vec <- cols[measure.vec.i]
+    names.mat = matrix(measure.vec, nrow(start), ncol(start))
     substr(names.mat, start, end)
   } else { #pattern not specified, so split using sep.
     if (!is.character(sep)) {
@@ -130,10 +131,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
       stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
     }
     err.args.groups("max number of items after splitting column names", n.groups)
-    measure.vec = which(vector.lengths==n.groups)
-    do.call(rbind, list.of.vectors[measure.vec])
+    measure.vec.i = which(vector.lengths==n.groups)
+    measure.vec = cols[measure.vec.i]
+    do.call(rbind, list.of.vectors[measure.vec.i])
   }
-  err.names.unique("measured columns", cols[measure.vec])
+  err.names.unique("measured columns", measure.vec)
   uniq.mat = unique(group.mat)
   if (nrow(uniq.mat) < nrow(group.mat)) {
     stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat))

diff --git a/R/fread.R b/R/fread.R
@@ -1,5 +1,5 @@
 fread = function(
-input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
+input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
 na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
 skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
 col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
@@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     else if (sep=="auto") sep=""      # sep=="" at C level means auto sep
     else stopifnot( nchar(sep)==1L )  # otherwise an actual character to use as sep
   }
-  stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L )
+  stopifnot( is.character(dec), length(dec)==1L)
+  if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L)
   # handle encoding, #563
   if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) {
     stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")

diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -8,6 +8,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
                print.keys=getOption("datatable.print.keys"),
                trunc.cols=getOption("datatable.print.trunc.cols"),
                quote=FALSE,
+               na.print=NULL,
                timezone=FALSE, ...) {
   # topn  - print the top topn and bottom topn rows with '---' in between (5)
   # nrows - under this the whole (small) table is printed, unless topn is provided (100)
@@ -109,6 +110,13 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
     # When nrow(toprint) = 1, attributes get lost in the subset,
     #   function below adds those back when necessary
     toprint = toprint_subset(toprint, cols_to_print)
+    trunc.cols <- length(not_printed) > 0L
+  }
+  print_default = function(x) {
+    if (col.names != "none") cut_colnames = identity
+    cut_colnames(print(x, right=TRUE, quote=quote, na.print=na.print))
+    # prints names of variables not shown in the print
+    if (trunc.cols) trunc_cols_message(not_printed, abbs, class, col.names)
   }
   if (printdots) {
     if (isFALSE(row.names)) {
@@ -117,30 +125,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
       toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn))
     }
     rownames(toprint) = format(rownames(toprint), justify="right")
-    if (col.names == "none") {
-      cut_colnames(print(toprint, right=TRUE, quote=quote))
-    } else {
-      print(toprint, right=TRUE, quote=quote)
-    }
-    if (trunc.cols && length(not_printed) > 0L)
-      # prints names of variables not shown in the print
-      trunc_cols_message(not_printed, abbs, class, col.names)
-
+    print_default(toprint)
     return(invisible(x))
   }
   if (nrow(toprint)>20L && col.names == "auto")
     # repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them
     #   option to shut this off per request of Oleg Bondar on SO, #1482
-    toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97
-  if (col.names == "none") {
-    cut_colnames(print(toprint, right=TRUE, quote=quote))
-  } else {
-    print(toprint, right=TRUE, quote=quote)
-  }
-  if (trunc.cols && length(not_printed) > 0L)
-    # prints names of variables not shown in the print
-    trunc_cols_message(not_printed, abbs, class, col.names)
-
+    toprint = rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97
+  print_default(toprint)
   invisible(x)
 }