From 95587ec6252b713dbc76bc682a840773c99837a7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 11 Dec 2024 01:18:26 +0000 Subject: [PATCH] better match base::order(method=) and decreasing= --- NEWS.md | 2 ++ R/setkey.R | 20 +++++++++++++++----- inst/tests/tests.Rraw | 11 +++++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index fa384dc10..f5fbd2248 100644 --- a/NEWS.md +++ b/NEWS.md @@ -123,6 +123,8 @@ rowwiseDT( 16. Joins of `integer64` and `double` columns succeed when the `double` column has lossless `integer64` representation, [#4167](https://github.com/Rdatatable/data.table/issues/4167) and [#6625](https://github.com/Rdatatable/data.table/issues/6625). Previously, this only worked when the double column had lossless _32-bit_ integer representation. Thanks @MichaelChirico for the reports and fix. +17. `DT[order(...)]` better matches `base::order()` behavior by (1) recognizing the `method=` argument (and erroring since this is not supported) and (2) accepting a vector of `TRUE`/`FALSE` in `decreasing=` as an alternative to using `-a` to convey "sort `a` decreasing", [#4456](https://github.com/Rdatatable/data.table/issues/4456). Thanks @jangorecki for the FR and @MichaelChirico for the PR. + ## NOTES 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181). diff --git a/R/setkey.R b/R/setkey.R index 11b02ec2d..58c5af5ea 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -160,8 +160,10 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } -forder = function(..., na.last=TRUE, decreasing=FALSE) +forder = function(..., na.last=TRUE, decreasing=FALSE, method="radix") { + if (method != "radix") stopf("data.table has no support for sorting by method='%s'. Use base::order(), not order(), if you really need this.", method) + stopifnot(is.logical(decreasing), length(decreasing) > 0L, !is.na(decreasing)) sub = substitute(list(...)) tt = vapply_1b(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) if (any(tt)) sub[tt] = NULL # remove any NULL or empty arguments; e.g. test 1962.052: forder(DT, NULL) and forder(DT, ) @@ -169,8 +171,8 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) asc = rep.int(1L, length(sub)-1L) # ascending (1) or descending (-1) per column # the idea here is to intercept - (and unusual --+ deriving from built expressions) before vectors in forder(DT, -colA, colB) so that : # 1) - on character vector works; ordinarily in R that fails with type error - # 2) each column/expression can have its own +/- more easily that having to use a separate decreasing=TRUE/FALSE - # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and apply - to every element first + # 2) each column/expression can have its own +/- more easily than having to use a separate decreasing=TRUE/FALSE + # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and negate every element first # We intercept the unevaluated expressions and massage them before evaluating in with(DT) scope or not depending on the first item. for (i in seq.int(2L, length(sub))) { v = sub[[i]] @@ -193,8 +195,16 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) } else { data = eval(sub, parent.frame(), parent.frame()) } - stopifnot(isTRUEorFALSE(decreasing)) - o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last) + if (length(decreasing) > 1L) { + if (any(asc < 0L)) stopf("Mixing '-' with vector decreasing= is not supported.") + if (length(decreasing) != length(asc)) stopf("decreasing= has length %d applied to sorting %d columns.", length(decreasing), length(asc)) + orderArg = fifelse(decreasing, -asc, asc) + } else if (decreasing) { + orderArg = -asc + } else { + orderArg = asc + } + o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=orderArg, na.last=na.last) if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o o } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 657478c61..b930aca76 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13614,8 +13614,9 @@ test(1962.0482, forder(L), 3:1) test(1962.0483, forder(), NULL) setDT(DT) test(1962.049, forder(DT[ , 0L]), error = 'Attempting to order a 0-column') -test(1962.050, forder(DT, decreasing = NA), error = base_messages$stopifnot('isTRUEorFALSE(decreasing)')) -test(1962.051, forder(DT, decreasing = 1.4), error = base_messages$stopifnot('isTRUEorFALSE(decreasing)')) +test(1962.050, forder(DT, decreasing = NA), error = base_messages$stopifnot('!is.na(decreasing)')) +test(1962.0510, forder(DT, decreasing = 1.4), error = base_messages$stopifnot('is.logical(decreasing)')) +test(1962.0511, forder(DT, decreasing=logical()), error=base_messages$stopifnot('length(decreasing) > 0L')) test(1962.052, forder(DT, NULL), 3:1) test(1962.053, forder(DT), 3:1) test(1962.054, forder(DT, ), 3:1) @@ -20697,3 +20698,9 @@ if (test_bit64) { test(2300.3, DT1[DT2, on='id'], error="Incompatible join types") test(2300.4, DT2[DT1, on='id'], error="Incompatible join types") } + +# interpret more arguments to order() correctly when translating to forder(), #4456 +DT = data.table(a=rep(1:3, 4), b=rep(1:2, 6)) +test(2301.1, DT[order(a, method="auto")], error="no support for sorting by method='auto'") +test(2301.2, DT[order(a, b, decreasing=c(TRUE, FALSE))], DT[order(-a, b)]) +test(2301.3, DT[order(a, -b, decreasing=c(TRUE, TRUE))], error="Mixing '-' with vector decreasing")