From 60828522cb1dbf696ce32a7323464d9d8870b9f6 Mon Sep 17 00:00:00 2001 From: Ricardo Villalba Date: Fri, 20 Dec 2024 23:03:00 -0300 Subject: [PATCH 1/4] add sort_by.data.table --- NAMESPACE | 2 ++ NEWS.md | 3 +++ R/data.table.R | 12 ++++++++++++ inst/tests/tests.Rraw | 7 +++++++ 4 files changed, 24 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 2497f0cf9..3603355f5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -206,3 +206,5 @@ S3method(format_list_item, data.frame) export(fdroplevels, setdroplevels) S3method(droplevels, data.table) + +S3method(sort_by, data.table) diff --git a/NEWS.md b/NEWS.md index fa384dc10..8d8fc0b00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,6 +69,9 @@ rowwiseDT( 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`. +7. Base R generic `sort_by()` (new in R 4.4.0) is implemented for data.table's. It internally uses data.table's `forder()` instead of base R `order()` for efficiency. Hence, it uses C-locale as data.table's conventional sorting (suggested by @rikivillalba). + + ## BUG FIXES 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. diff --git a/R/data.table.R b/R/data.table.R index bac200b8a..0638fae35 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2532,6 +2532,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR } } +sort_by.data.table <- function (x, y, ...) +{ + if (!cedta()) return(NextMethod()) # nocov + if (inherits(y, "formula")) + y <- .formula2varlist(y, x) + if (!is.list(y)) + y <- list(y) + # use forder instead of base 'order' + o <- do.call(forder, c(unname(y), list(...))) + x[o, , drop = FALSE] +} + # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want copy = function(x) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 657478c61..9eed38e3a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20686,6 +20686,13 @@ test(2299.10, data.table(a=1), output="a\ test(2299.11, data.table(a=list(data.frame(b=1))), output="a\n1: ") test(2299.12, data.table(a=list(data.table(b=1))), output="a\n1: ") +# sort_by.data.table +DT1 = data.table(a = c(1, 3, 2, NA, 3) , b = 4:0) +DT2 = data.table(a = c("c", "a", "B")) # data.table uses C-locale and should sort_by if cedta() +test(2300.01, sort_by(DT1, ~ a + b), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L))) +test(2300.02, sort_by(DT1, ~ I(a + b)), data.table(a = c(3,2,1,3,NA), b = c(0L,2L,4L,3L,1L))) +test(2300.03, sort_by(DT2, ~ a), data.table(a = c("B", "a", "c"))) + if (test_bit64) { # Join to integer64 doesn't require integer32 representation, just integer64, #6625 i64_val = .Machine$integer.max + 1 From 1a159022b52c51323c1903ee311d0ef8327dadc8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 29 Dec 2024 21:25:19 -0800 Subject: [PATCH 2/4] style --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 0638fae35..36b7426c9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2532,7 +2532,7 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR } } -sort_by.data.table <- function (x, y, ...) +sort_by.data.table <- function(x, y, ...) { if (!cedta()) return(NextMethod()) # nocov if (inherits(y, "formula")) From bdb2e2db7bb7ba886bd22b7b03677139f3b54569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Demi=C3=A1n=20Villalba?= <32423469+rikivillalba@users.noreply.github.com> Date: Mon, 30 Dec 2024 17:19:11 -0300 Subject: [PATCH 3/4] add R >= 4.4.0 guard for sort_by method --- NAMESPACE | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 3603355f5..b894d63c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -207,4 +207,5 @@ S3method(format_list_item, data.frame) export(fdroplevels, setdroplevels) S3method(droplevels, data.table) -S3method(sort_by, data.table) +# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html +if (getRversion() >= "4.4.0") S3method(sort_by, data.table) From fe8f6f1dff83925f23661ae12961f5888d74e65d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 31 Dec 2024 23:48:27 -0800 Subject: [PATCH 4/4] Tidy up NEWS --- NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8d8fc0b00..ca3132c56 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,8 +69,7 @@ rowwiseDT( 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`. -7. Base R generic `sort_by()` (new in R 4.4.0) is implemented for data.table's. It internally uses data.table's `forder()` instead of base R `order()` for efficiency. Hence, it uses C-locale as data.table's conventional sorting (suggested by @rikivillalba). - +7. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR. ## BUG FIXES