From 60828522cb1dbf696ce32a7323464d9d8870b9f6 Mon Sep 17 00:00:00 2001
From: Ricardo Villalba <rikivillalba@gmail.com>
Date: Fri, 20 Dec 2024 23:03:00 -0300
Subject: [PATCH 1/4] add sort_by.data.table

---
 NAMESPACE             |  2 ++
 NEWS.md               |  3 +++
 R/data.table.R        | 12 ++++++++++++
 inst/tests/tests.Rraw |  7 +++++++
 4 files changed, 24 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index 2497f0cf9..3603355f5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -206,3 +206,5 @@ S3method(format_list_item, data.frame)
 
 export(fdroplevels, setdroplevels)
 S3method(droplevels, data.table)
+
+S3method(sort_by, data.table)
diff --git a/NEWS.md b/NEWS.md
index fa384dc10..8d8fc0b00 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -69,6 +69,9 @@ rowwiseDT(
 
 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`.
 
+7. Base R generic `sort_by()` (new in R 4.4.0) is implemented for data.table's. It internally uses data.table's `forder()` instead of base R `order()` for efficiency. Hence, it uses C-locale as data.table's conventional sorting (suggested by @rikivillalba).
+
+
 ## BUG FIXES
 
 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.
diff --git a/R/data.table.R b/R/data.table.R
index bac200b8a..0638fae35 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -2532,6 +2532,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
   }
 }
 
+sort_by.data.table <- function (x, y, ...)
+{
+  if (!cedta()) return(NextMethod()) # nocov
+  if (inherits(y, "formula"))
+    y <- .formula2varlist(y, x)
+  if (!is.list(y))
+    y <- list(y)
+  # use forder instead of base 'order'
+  o <- do.call(forder, c(unname(y), list(...)))
+  x[o, , drop = FALSE]
+}
+
 # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
 
 copy = function(x) {
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 657478c61..9eed38e3a 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -20686,6 +20686,13 @@ test(2299.10, data.table(a=1),                                        output="a\
 test(2299.11, data.table(a=list(data.frame(b=1))),                    output="a\n1: <data.frame[1x1]>")
 test(2299.12, data.table(a=list(data.table(b=1))),                    output="a\n1: <data.table[1x1]>")
 
+# sort_by.data.table
+DT1 = data.table(a = c(1, 3, 2, NA, 3) , b = 4:0)
+DT2 = data.table(a =  c("c", "a", "B"))  # data.table uses C-locale and should sort_by if cedta()
+test(2300.01, sort_by(DT1, ~ a + b), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+test(2300.02, sort_by(DT1, ~ I(a + b)), data.table(a = c(3,2,1,3,NA), b = c(0L,2L,4L,3L,1L)))
+test(2300.03, sort_by(DT2, ~ a), data.table(a = c("B", "a", "c")))
+
 if (test_bit64) {
   # Join to integer64 doesn't require integer32 representation, just integer64, #6625
   i64_val = .Machine$integer.max + 1

From 1a159022b52c51323c1903ee311d0ef8327dadc8 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Sun, 29 Dec 2024 21:25:19 -0800
Subject: [PATCH 2/4] style

---
 R/data.table.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/data.table.R b/R/data.table.R
index 0638fae35..36b7426c9 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -2532,7 +2532,7 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
   }
 }
 
-sort_by.data.table <- function (x, y, ...)
+sort_by.data.table <- function(x, y, ...)
 {
   if (!cedta()) return(NextMethod()) # nocov
   if (inherits(y, "formula"))

From bdb2e2db7bb7ba886bd22b7b03677139f3b54569 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Demi=C3=A1n=20Villalba?=
 <32423469+rikivillalba@users.noreply.github.com>
Date: Mon, 30 Dec 2024 17:19:11 -0300
Subject: [PATCH 3/4] add R >= 4.4.0 guard for sort_by method

---
 NAMESPACE | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/NAMESPACE b/NAMESPACE
index 3603355f5..b894d63c1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -207,4 +207,5 @@ S3method(format_list_item, data.frame)
 export(fdroplevels, setdroplevels)
 S3method(droplevels, data.table)
 
-S3method(sort_by, data.table)
+# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html
+if (getRversion() >= "4.4.0") S3method(sort_by, data.table)  

From fe8f6f1dff83925f23661ae12961f5888d74e65d Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Tue, 31 Dec 2024 23:48:27 -0800
Subject: [PATCH 4/4] Tidy up NEWS

---
 NEWS.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 8d8fc0b00..ca3132c56 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -69,8 +69,7 @@ rowwiseDT(
 
 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`.
 
-7. Base R generic `sort_by()` (new in R 4.4.0) is implemented for data.table's. It internally uses data.table's `forder()` instead of base R `order()` for efficiency. Hence, it uses C-locale as data.table's conventional sorting (suggested by @rikivillalba).
-
+7. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
 
 ## BUG FIXES