Merge branch 'master' into add-a-test-and-move-to-ci

Rdatatable · Apr 23, 2024 · 2b7fe3a · 2b7fe3a
2 parents 1d8c699 + 6db0eda
commit 2b7fe3a
Show file tree

Hide file tree

Showing 31 changed files with 289 additions and 88 deletions.
diff --git a/.ci/.lintr.R b/.ci/.lintr.R
@@ -0,0 +1,105 @@
+for (f in list.files('ci/linters', full.names=TRUE)) source(f)
+rm(f)
+
+linters = all_linters(
+  packages = "lintr", # TODO(lintr->3.2.0): Remove this.
+  # eq_assignment_linter(),
+  brace_linter(allow_single_line = TRUE),
+  # TODO(michaelchirico): Activate these incrementally. These are the
+  #   parameterizations that match our style guide.
+  # implicit_assignment_linter(allow_lazy = TRUE, allow_scoped = TRUE),
+  # implicit_integer_linter(allow_colon = TRUE),
+  # system_time_linter = undesirable_function_linter(c(
+  #   system.time = "Only run timings in benchmark.Rraw"
+  # )),
+  # undesirable_function_linter(modify_defaults(
+  #   default_undesirable_functions,
+  #   ifelse = "Use fifelse instead.",
+  #   Sys.setenv = NULL,
+  #   library = NULL,
+  #   options = NULL,
+  #   par = NULL,
+  #   setwd = NULL
+  # )),
+  undesirable_operator_linter(modify_defaults(
+    default_undesirable_operators,
+    `<<-` = NULL
+  )),
+  # TODO(lintr#2441): Use upstream implementation.
+  assignment_linter = NULL,
+  # TODO(lintr#2442): Use this once x[ , j, by] is supported.
+  commas_linter = NULL,
+  commented_code_linter = NULL,
+  # TODO(lintr->3.2.0): Activate this.
+  consecutive_assertion_linter = NULL,
+  cyclocomp_linter = NULL,
+  function_argument_linter = NULL,
+  indentation_linter = NULL,
+  infix_spaces_linter = NULL,
+  # TODO(R>3.2.0): Activate this, extending to recognize vapply_1i(x, length).
+  lengths_linter = NULL,
+  line_length_linter = NULL,
+  missing_package_linter = NULL,
+  namespace_linter = NULL,
+  nonportable_path_linter = NULL,
+  object_name_linter = NULL,
+  object_usage_linter = NULL,
+  quotes_linter = NULL,
+  semicolon_linter = NULL,
+  spaces_inside_linter = NULL,
+  spaces_left_parentheses_linter = NULL,
+  # TODO(michaelchirico): Only exclude from vignettes, not sure what's wrong.
+  strings_as_factors_linter = NULL,
+  # TODO(lintr->3.2.0): Fix on a valid TODO style, enforce it, and re-activate.
+  todo_comment_linter = NULL,
+  # TODO(michaelchirico): Enforce these and re-activate them one-by-one. Also stop using '<<-'.
+  brace_linter = NULL,
+  condition_call_linter = NULL,
+  conjunct_test_linter = NULL,
+  fixed_regex_linter = NULL,
+  function_left_parentheses_linter = NULL,
+  if_not_else_linter = NULL,
+  implicit_assignment_linter = NULL,
+  implicit_integer_linter = NULL,
+  keyword_quote_linter = NULL,
+  length_levels_linter = NULL,
+  matrix_apply_linter = NULL,
+  missing_argument_linter = NULL,
+  nzchar_linter = NULL,
+  object_overwrite_linter = NULL,
+  paren_body_linter = NULL,
+  redundant_equals_linter = NULL,
+  rep_len_linter = NULL,
+  repeat_linter = NULL,
+  return_linter = NULL,
+  sample_int_linter = NULL,
+  scalar_in_linter = NULL,
+  seq_linter = NULL,
+  undesirable_function_linter = NULL,
+  unnecessary_concatenation_linter = NULL,
+  unnecessary_lambda_linter = NULL,
+  unnecessary_nesting_linter = NULL,
+  unreachable_code_linter = NULL,
+  unused_import_linter = NULL
+)
+# TODO(lintr#2172): Glob with lintr itself.
+exclusions = local({
+  exclusion_for_dir <- function(dir, exclusions) {
+    files = list.files(dir, pattern = "\\.(R|Rmd)$")
+    stats::setNames(rep(list(exclusions), length(files)), files)
+  }
+  c(
+    exclusion_for_dir("tests", list(
+      quotes_linter = Inf,
+      # TODO(michaelchirico): Enforce these and re-activate them one-by-one.
+      implicit_integer_linter = Inf,
+      infix_spaces_linter = Inf,
+      undesirable_function_linter = Inf
+    )),
+    exclusion_for_dir("vignettes", list(
+      quotes_linter = Inf
+      # strings_as_factors_linter = Inf
+      # system_time_linter = Inf
+    ))
+  )
+})
diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -1,7 +1,7 @@
 # A function to customize R package metadata and source files to facilitate version-specific installation and testing.
 #
-# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) 
-# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. 
+# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
+# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package.
 # It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately.
 #
 # @param old.Package Current name of the package.
@@ -29,7 +29,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
       Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
       new.Package_ <- paste0(Package_, "_", sha)
       pkg_find_replace(
-        "DESCRIPTION", 
+        "DESCRIPTION",
         paste0("Package:\\s+", old.Package),
         paste("Package:", new.Package))
       pkg_find_replace(
@@ -55,13 +55,13 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
     }
 
 # A list of performance tests.
-# 
+#
 # Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments:
 # - N: A numeric sequence of data sizes to vary.
 # - setup: An expression evaluated for every data size before measuring time/memory.
-# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. 
+# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions.
 #         This must call a function from data.table using a syntax with double or triple colon prefix.
-#         The package name before the colons will be replaced by a new package name that uses the commit SHA hash. 
+#         The package name before the colons will be replaced by a new package name that uses the commit SHA hash.
 #         (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table)
 #
 # Optional parameters that may be useful to configure tests:
@@ -70,8 +70,9 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
 # - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions.
 #            For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally).
 # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
+# nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
 test.list <- list(
-  # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 
+  # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
   # Fixed in: https://github.com/Rdatatable/data.table/pull/4440
   "Regression fixed in #4440" = list(
     pkg.edit.fun = pkg.edit.fun,
@@ -88,7 +89,7 @@ test.list <- list(
 
   # Test based on: https://github.com/Rdatatable/data.table/issues/5424
   # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/5463    
+  # Fixed in: https://github.com/Rdatatable/data.table/pull/5463
   "Regression fixed in #5463" = list(
     pkg.edit.fun = pkg.edit.fun,
     N = 10^seq(3, 8),
@@ -101,7 +102,7 @@ test.list <- list(
         key = "g")
       dt_mod <- copy(dt)
     }),
-    expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)),    
+    expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)),
     Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
     Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
     Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits)
@@ -118,3 +119,4 @@ test.list <- list(
     Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
     Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
 )
+# nolint end: undesirable_operator_linter.
diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml
@@ -16,12 +16,19 @@ jobs:
         os: [macOS-latest, windows-latest, ubuntu-latest]
         r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3']
         locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502)
-        exclude:
-          - os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI on Ubuntu
-            locale: ['zh_CN.utf8', 'lv_LV.utf8']
+        exclude: # only run non-English locale CI on Ubuntu
+          - os: macOS-latest
+            locale: 'zh_CN.utf8'
+          - os: macOS-latest
+            locale: 'lv_LV.utf8'
+          - os: windows-latest
+            locale: 'zh_CN.utf8'
+          - os: windows-latest
+            locale: 'lv_LV.utf8'
 
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      TEST_DATA_TABLE_WITH_OTHER_PACKAGES: true
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -0,0 +1,35 @@
+on:
+  push:
+    branches:
+     - master
+  pull_request:
+    branches:
+     - master
+
+name: lint
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: |
+            r-lib/lintr
+            local::.
+          needs: lint
+
+      - name: Lint
+        run: lintr::lint_package()
+        shell: Rscript {0}
+        env:
+          LINTR_ERROR_ON_LINT: true
+          R_LINTR_LINTER_FILE: .ci/.lintr
diff --git a/NEWS.md b/NEWS.md
@@ -36,6 +36,8 @@
 
 10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR.
 
+11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods, [#5417](https://github.com/Rdatatable/data.table/issues/5417).
+
 ## BUG FIXES
 
 1. `unique()` returns a copy in the case when `nrow(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.
@@ -72,7 +74,9 @@
 
 9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix.
 
-10. `test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR.
+10. `test.data.table()` runs robustly:
+  + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR.
+  + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix.
 
 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix.
 

diff --git a/R/bmerge.R b/R/bmerge.R
@@ -187,4 +187,3 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   ans$xo = xo  # for further use by [.data.table
   return(ans)
 }
-
diff --git a/R/data.table.R b/R/data.table.R
@@ -2300,7 +2300,7 @@ transform.data.table = function (`_data`, ...)
 {
   if (!cedta()) return(NextMethod()) # nocov
   `_data` = copy(`_data`)
-  e = eval(substitute(list(...)), `_data`, parent.frame()) 
+  e = eval(substitute(list(...)), `_data`, parent.frame())
   set(`_data`, ,names(e), e)
   `_data`
 }
@@ -2452,9 +2452,11 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
     dtq[["i"]] = quote(levs)
     join = TRUE
   }
+  dots = list(...)
+  if (!"sep" %chin% names(dots)) dots$sep = "."
   dtq[["j"]] = substitute(
-    list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=".")),
-    list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD"))
+    list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=.sep)),
+    list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD"), .sep = dots$sep)
   )
   dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`.
     .expr,

diff --git a/R/duplicated.R b/R/duplicated.R
@@ -118,4 +118,3 @@ uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE)
     length(starts)
   }
 }
-
diff --git a/R/foverlaps.R b/R/foverlaps.R
@@ -247,4 +247,3 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k
 # Tests are added to ensure we cover these aspects (to my knowledge), so that any undesirable change in the future breaks those tests.
 
 # Conclusion: floating point manipulations are hell!
-
diff --git a/R/fwrite.R b/R/fwrite.R
@@ -64,7 +64,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
     length(nThread)==1L && !is.na(nThread) && nThread>=1L
     )
 
-  is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))
+  is_gzip = compress == "gzip" || (compress == "auto" && endsWithAny(file, ".gz"))
 
   file = path.expand(file)  # "~/foo/bar"
   if (append && (file=="" || file.exists(file))) {
@@ -122,4 +122,3 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
 }
 
 haszlib = function() .Call(Cdt_has_zlib)
-
diff --git a/R/onAttach.R b/R/onAttach.R
@@ -21,7 +21,7 @@
     nth = getDTthreads(verbose=FALSE)
     if (dev)
       packageStartupMessagef("data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads).  ", v, d, g, nth, appendLF=FALSE)
-    else 
+    else
       packageStartupMessagef("data.table %s using %d threads (see ?getDTthreads).  ", v, nth, appendLF=FALSE)
     packageStartupMessagef("Latest news: r-datatable.com")
     if (gettext("TRANSLATION CHECK") != "TRANSLATION CHECK")

diff --git a/R/openmp-utils.R b/R/openmp-utils.R
@@ -13,4 +13,3 @@ setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, thr
 getDTthreads = function(verbose=getOption("datatable.verbose")) {
   .Call(CgetDTthreads, verbose)
 }
-
diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -230,7 +230,7 @@ format_list_item.default = function(x, ...) {
 char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) {
   trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
   if (!is.character(x) || trunc.char <= 0L) return(x)
-  nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 
+  nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096
   nchar_chars = nchar(x, 'char')
   is_full_width = nchar_width > nchar_chars
   idx = pmin(nchar_width, nchar_chars) > trunc.char
@@ -272,4 +272,3 @@ trunc_cols_message = function(not_printed, abbs, class, col.names){
     n, brackify(paste0(not_printed, classes))
   )
 }
-
diff --git a/R/setkey.R b/R/setkey.R
@@ -353,4 +353,3 @@ CJ = function(..., sorted = TRUE, unique = FALSE)
   }
   l
 }
-
diff --git a/R/setops.R b/R/setops.R
@@ -290,4 +290,3 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu
   }
   TRUE
 }
-
diff --git a/R/tables.R b/R/tables.R
@@ -60,4 +60,3 @@ tables = function(mb=type_size, order.col="NAME", width=80,
   }
   invisible(info)
 }
-
diff --git a/R/test.data.table.R b/R/test.data.table.R
@@ -7,7 +7,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
   if (length(memtest.id)) {
     if (length(memtest.id)==1L) memtest.id = rep(memtest.id, 2L)  # for convenience of supplying one id rather than always a range
     stopifnot(length(memtest.id)<=2L,  # conditions quoted to user when false so "<=2L" even though following conditions rely on ==2L
-                     !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L]) 
+                     !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L])
     if (memtest==0L) memtest=1L  # using memtest.id implies memtest
   }
   if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
@@ -134,7 +134,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
 
   owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190
   on.exit(setwd(owd))
-  
+
   if (memtest) {
     catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest)
     if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.")

diff --git a/R/timetaken.R b/R/timetaken.R
@@ -12,4 +12,3 @@ timetaken = function(started.at)
   tt = proc.time()-started.at  # diff all 3 times
   paste0(format(tt[3L])," elapsed (", format(tt[1L]), " cpu)")
 }
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -187,4 +187,3 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
		ans$xo = xo # for further use by [.data.table
		return(ans)
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -118,4 +118,3 @@ uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE)
		length(starts)
		}
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -247,4 +247,3 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k
		# Tests are added to ensure we cover these aspects (to my knowledge) to ensure that any undesirable changes in the future breaks those tests.

		# Conclusion: floating point manipulations are hell!
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,4 +13,3 @@ setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, thr
		getDTthreads = function(verbose=getOption("datatable.verbose")) {
		.Call(CgetDTthreads, verbose)
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -353,4 +353,3 @@ CJ = function(..., sorted = TRUE, unique = FALSE)
		}
		l
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -290,4 +290,3 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu
		}
		TRUE
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -60,4 +60,3 @@ tables = function(mb=type_size, order.col="NAME", width=80,
		}
		invisible(info)
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -12,4 +12,3 @@ timetaken = function(started.at)
		tt = proc.time()-started.at # diff all 3 times
		paste0(format(tt[3L])," elapsed (", format(tt[1L]), " cpu)")
		}