From ac60ca2d21ea950e1eade5c1929ff2037bdacdc2 Mon Sep 17 00:00:00 2001
From: Michael Chirico
Date: Tue, 9 Apr 2024 13:04:45 -0700
Subject: [PATCH 01/39] Changes to ensure test(number instead of test(name (#6041)

Related to #6040 -- it will be good to keep to a pattern where `test()` always has a numeric literal in the `num=` argument, even if it's a dynamic test where the base number is incremented by a variable amount. Doing so will make the `testPattern=` argument to `test.data.table()` more usable. We can add a linter for this (#5908) to prevent regression later.

Linter to find these:

```r
l=make_linter_from_xpath(
  "//SYMBOL_FUNCTION_CALL[text() = 'test']/parent::expr/following-sibling::expr[1][SYMBOL or expr[1]/SYMBOL]",
  "xxx")
lint("inst/tests/tests.Rraw", l())
```
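For illustration, a minimal sketch of the target pattern (hypothetical test id 9998; `test()` here is the internal helper available inside data.table's test scripts): the base number stays as a numeric literal inside the `test()` call and only an integer counter is incremented, instead of accumulating a floating-point `test_no` that hides the base from static analysis.

```r
# hypothetical dynamic tests numbered 9998.001, 9998.002, ... -- the literal
# base 9998.0 remains visible to the linter and to testPattern= matching
test_no = 0L
for (x in list(1:3, letters[1:3], runif(3))) {
  test_no = test_no + 1L
  test(9998.0 + test_no*0.001, length(x), 3L)
}
```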
--- inst/tests/tests.Rraw | 250 +++++++++++++++++++++--------------------- 1 file changed, 127 insertions(+), 123 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c09d43e90..f1670dd6a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2662,13 +2662,13 @@ for (ne in seq_along(eols)) { lines = capture.output(fwrite(headDT, verbose=FALSE)) cat(paste(lines,collapse=eol), file=f, sep="") # so last line abruptly ends (missing last eol) to test that, otherwise could just pass eol to fwrite # on unix we simulate Windows too. On Windows \n will write \r\n (and \r\n will write \r\r\n) - num = 894 + nr/100 + nc/1000 + ne/10000 + num_major = nr/100 + nc/1000 + ne/10000 # if (isTRUE(all.equal(testIDtail, 0.4103))) browser() - test(num+0.00001, fread(f,na.strings=""), headDT) + test(894+num_major+0.00001, fread(f,na.strings=""), headDT) cat(eol,file=f,append=TRUE) # now a normal file properly ending with final \n - test(num+0.00002, fread(f,na.strings=""), headDT) + test(894+num_major+0.00002, fread(f,na.strings=""), headDT) cat(eol,file=f,append=TRUE) # extra \n should be ignored other than for single columns where it is significant - test(num+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT) + test(894+num_major+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT) unlink(f) }}} if (test_bit64) { @@ -4466,13 +4466,13 @@ colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -test_no = 1223.0 +test_no = 0L oldnfail = nfail for (nvars in seq_along(names(DT))) { signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) combn(names(DT), nvars, simplify=FALSE, function(x) { # simplify=FALSE needed for R 3.1.0 for (i in seq_len(nrow(signs))) { - test_no <<- signif(test_no+.001, 7) + test_no <<- test_no + 1L ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { if (signs[i,j] == 1L) @@ -4485,7 +4485,7 @@ for (nvars in seq_along(names(DT))) { } }) )) - test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) } integer() }) @@ -4617,16 +4617,16 @@ colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -test_no = 1246.0 +test_no = 0L oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { - test_no <<- signif(test_no+.01, 7) # first without key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) - test_no <<- signif(test_no+.01, 7) + test_no <<- test_no + 1L # first without key + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test_no <<- test_no + 1L setkeyv(DT, jj) # with key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce @@ -4645,11 +4645,11 @@ oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { - test_no <<- signif(test_no+.01, 7) # first without key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), 
duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) - test_no <<- signif(test_no+.01, 7) + test_no <<- test_no + 1L # first without key + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test_no <<- test_no + 1L setkeyv(DT, jj) # with key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce @@ -4734,13 +4734,13 @@ setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") ans = vector("list", length(names(DT))) -test_no = 1252 +test_no = 0L oldnfail = nfail for (i in seq_along(names(DT))) { cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) ans[[i]] = combn(names(DT), i, function(x) { tmp = apply(cj, 1, function(y) { - test_no <<- signif(test_no+.001, 7) + test_no <<- test_no + 1L ll = as.call(c(as.name("base_order"), lapply(seq_along(x), function(j) { if (y[j] == 1L) @@ -4754,11 +4754,11 @@ for (i in seq_along(names(DT))) { }) )) ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA - test(test_no, ans1, with(DT, eval(ll))) - test_no <<- signif(test_no+.001, 7) + test(1252.0 + test_no*0.001, ans1, with(DT, eval(ll))) + test_no <<- test_no + 1L ll <- as.call(c(as.list(ll), na.last=NA)) ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here. - test(test_no, ans1[ans1 != 0], with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1[ans1 != 0], with(DT, eval(ll))) }) dim(tmp)=NULL list(tmp) @@ -4885,13 +4885,13 @@ setNumericRounding(old_rounding) # distinguishing small numbers from 0.0 as from v1.9.2, test from Rick # http://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2 old_rounding = getNumericRounding() -test_no = 1278.001 +test_no = 0L for (dround in c(0,2)) { setNumericRounding(dround) # rounding should not affect the result here because although small, it's very accurace (1 s.f.) 
for (i in c(-30:-1,1:30)) { DT = data.table(c(1 * (10^i),2,9999,-1,0,1)) - test(test_no, nrow(DT[, .N, by=V1]), 6L) - test_no = test_no + 0.001 + test_no = test_no + 1L + test(1278.0 + test_no*0.001, nrow(DT[, .N, by=V1]), 6L) } } setNumericRounding(old_rounding) @@ -5697,7 +5697,7 @@ dt = data.table(AA=sample(c(-2:2), 50, TRUE), DD=sample(c(-2:2), 50, TRUE), EE=sample(as.logical(c(-2:2)), 50, TRUE)) if (test_bit64) dt[, DD := as.integer64(DD)] -test_no = 1368.0 +test_no = 0L for (i in seq_along(dt)) { col = dt[[i]] for (j in list(TRUE, FALSE, "keep")) { @@ -5716,10 +5716,10 @@ for (i in seq_along(dt)) { r3 = frankv(col, ties.method=k, na.last=j) r4 = frankv(col, order=-1L, ties.method=k, na.last=j) - test_no = test_no+.0001 - test(test_no, r1, r3) - test_no = test_no+.0001 - test(test_no, r2, r4) + test_no = test_no + 1L + test(1368.0 + test_no*0.0001, r1, r3) + test_no = test_no + 1L + test(1368.0 + test_no*0.0001, r2, r4) } } } @@ -5730,7 +5730,7 @@ dt = data.table(AA=sample(c(-2:2, NA), 50, TRUE), DD=sample(c(-2:2, NA), 50, TRUE), EE=sample(as.logical(c(-2:2, NA)), 50, TRUE)) if (test_bit64) dt[, DD := as.integer64(DD)] -test_no = 1369.0 +test_no = 0L for (i in seq_along(dt)) { col = dt[[i]] # ensure consistency with base::rank ties.methods as advertised @@ -5748,10 +5748,10 @@ for (i in seq_along(dt)) { r3 = frankv(col, ties.method=k, na.last=NA) r4 = frankv(col, order=-1L, ties.method=k, na.last=NA) - test_no = test_no+.0001 - test(test_no, r1, r3) - test_no = test_no+.0001 - test(test_no, r2, r4) + test_no = test_no + 1L + test(1369.0 + test_no*0.0001, r1, r3) + test_no = test_no + 1L + test(1369.0 + test_no*0.0001, r2, r4) } } @@ -5767,20 +5767,20 @@ dt = list(AA=sample(c(NA,-2:2), 50, TRUE), DD=sample(c(NA,-2:2), 50, TRUE), EE=sample(as.logical(c(NA,-2:2)), 50, TRUE)) if (test_bit64) dt[["DD"]] = as.integer64(dt[["DD"]]) -test_no = 1370.0 +test_no = 0L ans = as.list(na.omit(as.data.table(dt))) for (i in seq_along(dt)) { combn(names(dt), i, function(cols) { ans1 = is_na(dt[cols]) ans2 = rowSums(is.na(as.data.table(dt[cols]))) > 0L - test_no <<- test_no+.0001 - test(test_no, ans1, ans2) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, ans1, ans2) # update: tests for any_na - test_no <<- test_no+.0001 - test(test_no, any_na(dt[cols]), TRUE) - test_no <<- test_no+.0001 - test(test_no, any_na(ans[cols]), FALSE) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, any_na(dt[cols]), TRUE) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, any_na(ans[cols]), FALSE) TRUE }) } @@ -5874,7 +5874,7 @@ types=c("any", "within", "start", "end", "equal") # add 'equal' as well mults=c("all", "first", "last") maxgap=-1L; minoverlap=0L # default has changed in IRanges/GenomicRanges :: findOverlaps verbose=FALSE; which=TRUE -test_no = 1372.0 +test_no = 0L load(testDir("test1372.Rdata")) # Regenerated on 17/02/2019 to include type = 'equal'. 
Var 'ans' has all the results saved by running GenomicRanges separately using code above, is a list with names of the format type_mult_run set.seed(123) this = 1L @@ -5903,11 +5903,11 @@ for (run in seq_len(times)) { # data.table overlap join nomatch = if(mult == "all") NULL else NA_integer_ thisans = foverlaps(i, x, mult=mult, type=type, nomatch=nomatch, which=which, verbose=verbose) - test_no = test_no+.01 + test_no = test_no + 1L # cat("test =", test_no, ", run = ", run, ", type = ", type, ", mult = ", mult, "\n", sep="") idx = paste(type, mult, run, sep="_") # ans[[idx]] contains fo(gr(i), gr(x), type=type, select=mult) - test(test_no, thisans, ans[[idx]]) + test(1372.0 + test_no*0.01, thisans, ans[[idx]]) this = this+1L } } @@ -6127,13 +6127,13 @@ DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as # if (test_bit64) { # DT[, e := as.integer64(sample(col,20,TRUE))] # } -test_no = 1394 +test_no = 0L for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { ans1 = na.omit(DT, cols=cols) ans2 = DT[stats::complete.cases(DT[, cols, with=FALSE])] - test_no <<- test_no+.001 - test(test_no, ans1, ans2) + test_no <<- test_no + 1L + test(1394.0 + test_no*0.001, ans1, ans2) 0L }) } @@ -6509,15 +6509,15 @@ for(t in seq_len(nrow(all))){ ansOpt <- DT[eval(parse(text = thisQuery))] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery))] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) ## repeat the test with 'which = TRUE' options("datatable.optimize" = 3L) ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" @@ -6526,8 +6526,8 @@ for(t in seq_len(nrow(all))){ ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) } } } @@ -12905,10 +12905,10 @@ M <- merge(x, y) m <- merge(as.data.frame(x), as.data.frame(y), by="a") test(1913.09, is.data.table(M) && !is.data.table(m)) test(1913.10, all(names(M) %in% union(names(M), names(m)))) -test_no = 1913.11 +test_no = 0L for (name in names(m)) { - test_no = test_no + 0.0001 - test(test_no, M[[name]], m[[name]]) + test_no = test_no + 1L + test(1913.11 + test_no*0.0001, M[[name]], m[[name]]) } # # Original example that smoked out the bug @@ -12923,10 +12923,10 @@ for (i in 1:3) { } test(1913.12, is.data.table(M) && !is.data.table(m)) test(1913.13, all(names(M) %in% union(names(M), names(m)))) -test_no = 1913.14 +test_no = 0L for (name in names(m)) { - test_no = test_no + 0.0001 - test(test_no, M[[name]], m[[name]]) + test_no = test_no + 1L + test(1913.14 + test_no*0.0001, M[[name]], m[[name]]) } # # simple subset maintains keys @@ -12961,10 +12961,10 @@ t2 <- transform(dt, d=c+4, a=sample(c('x', 'y', 'z'), 20, replace=TRUE)) test(1913.23, is.null(key(t2))) # transforming a key column nukes the key ## This is probably not necessary, but let's just check that transforming ## a key column doesn't 
twist around the rows in the result. -test_no = 1913.24 +test_no = 0L for (col in c('b', 'c')) { - test_no = test_no + 0.0001 - test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns + test_no = test_no + 1L + test(1913.24 + test_no*0.0001, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns } # Test 1914 of S4 compatibility was moved to S4.Rraw for #3808 @@ -14695,18 +14695,18 @@ test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data. f = tempfile() for (nNUL in 0:3) { writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f) - test_no = 2025 + (1+nNUL)/10 - test(test_no + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) - test(test_no + .02, fread(f), ans) # auto detect skip and header works too + num_major = (1+nNUL)/10 + test(2025 + num_major + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) + test(2025 + num_major + .02, fread(f), ans) # auto detect skip and header works too writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f) - test(test_no + .03, fread(f, skip=1, header=TRUE), ans) - test(test_no + .04, fread(f), ans) + test(2025 + num_major + .03, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .04, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f) - test(test_no + .05, fread(f, skip=1, header=TRUE), ans) - test(test_no + .06, fread(f), ans) + test(2025 + num_major + .05, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .06, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f) - test(test_no + .07, fread(f, skip=1, header=TRUE), ans) - test(test_no + .08, fread(f), ans) + test(2025 + num_major + .07, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .08, fread(f), ans) } makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, con=f)} # "*" (42) represents NUL makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n") @@ -17614,19 +17614,21 @@ EVAL = function(...) 
{ # cat(e,"\n") # uncomment to check the queries tested eval(parse(text=e)) } -testnum = 2211.0 +testnum = 0L for (col in c("a","b","c")) { - testnum = testnum+0.1 + testnum = testnum + 100L for (fi in seq_along(funs)) { if (col=="c" && fi<=6L) next # first 6 funs don't support type character f = funs[fi] - testnum = testnum+0.001 - test(testnum, EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i - EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first + testnum = testnum + 1L + test(2211.0 + testnum*0.001, + EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i + EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first if (fi<=8L) { - testnum = testnum+0.001 - test(testnum, EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), - EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) + testnum = testnum + 1L + test(2211.0 + testnum*0.001, + EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), + EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) } } } @@ -17730,7 +17732,7 @@ DT2 = data.table(grp = c('a', 'b'), agg = list(c('1' = 4, '2' = 5), c('3' = 6))) test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) # shift integer64 when fill isn't integer32, #4865 -testnum = 2218 +testnum = 0L funs = c(as.integer, as.double, as.complex, as.character, if (test_bit64) as.integer64) # when test_bit64==FALSE these all passed before; now passes with test_bit64==TRUE too # add grouping tests for #5205 @@ -17739,32 +17741,32 @@ options(datatable.optimize = 2L) for (f1 in funs) { DT = data.table(x=f1(1:4), g=g) for (f2 in funs) { - testnum = testnum + 0.001 - test(testnum, DT[, shift(x)], f1(c(NA, 1:3))) - testnum = testnum + 0.001 + testnum = testnum + 1L + test(2218.0 + testnum*0.001, DT[, shift(x)], f1(c(NA, 1:3))) + testnum = testnum + 1L w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" - test(testnum, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) - testnum = testnum + 0.001 + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) + testnum = testnum + 1L if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" - test(testnum, DT[, shift(x, fill="0")], f1(0:3)) + test(2218.0 + testnum*0.001, DT[, shift(x, fill="0")], f1(0:3)) } else { - test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) } - testnum = testnum + 0.001 - test(testnum, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) - testnum = testnum + 0.001 + testnum = testnum + 1L + test(2218.0 + testnum*0.001, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) + testnum = testnum + 1L w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" f = f2(NA) - test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) - testnum = testnum + 0.001 + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) + testnum = testnum + 1L if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" - test(testnum, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) + test(2218.0 + 
testnum*0.001, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) } else { f = f2(0) - test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) } } } @@ -17778,14 +17780,15 @@ if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) options(datatable.optimize = 2L) funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") -testnum = 2220 +testnum = 0L for (fun in funs) { - testnum = testnum + 0.01 - test(testnum, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") - testnum = testnum + 0.01 - test(testnum, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + testnum = testnum + 1L + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + testnum = testnum + 1L + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } -test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +testnum = testnum + 1L +test(2220.0 + testnum*0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -18067,19 +18070,18 @@ test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=I set.seed(10) n = 100 a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) -testnum = 2233.39 for (opt in c(0,Inf)) { options(datatable.optimize=opt) out = if (opt) "GForce.*gsum" else "GForce FALSE" B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle - test(testnum+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= + num_bump = (opt>0)/100 + test(2233.39+num_bump+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) - test(testnum+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) - test(testnum+0.003, any(A[,t1!=t2]), FALSE) - test(testnum+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) - test(testnum+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) - testnum = 2233.40 + test(2233.39+num_bump+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) + test(2233.39+num_bump+0.003, any(A[,t1!=t2]), FALSE) + test(2233.39+num_bump+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) + test(2233.39+num_bump+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) } # test from #5337 n=4; k=2 @@ -18099,22 +18101,24 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -old = options(datatable.verbose=TRUE) -testnum = 2233.43 -for (opt in c(0,Inf)) { - options(datatable.optimize=opt) - out = if (opt) "GForce.*gsum" else "GForce FALSE" - test(testnum, - copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") - ][, n_idT :=dim(.SD)[[1]], by=list(t, id) - ][, sum_v2_id :=sum(v2), by=.(id) - ][, sum_v1_idT:=sum(v1), by=c("id", "t") - ][, sum_v1_id :=sum(v1), by=c("id")], - ans, - output=out) - 
testnum = 2233.44 -} -options(old) +test(2233.43, + options = list(datatable.verbose=TRUE, datatable.optimize=0), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce FALSE") +test(2233.44, + options = list(datatable.verbose=TRUE, datatable.optimize=Inf), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce.*gsum") # optimized := with gforce functions that can return lists #5403 old = options(datatable.verbose=TRUE) DT = data.table(grp=1:2, x=1:4) From 585ec52e28e173c59ae1879d9dc3ade5f9477d95 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 13:06:57 -0700 Subject: [PATCH 02/39] assume PROJ_PATH=. if unset in cc() (#6042) * assume PROJ_PATH=. if unset * ? --- .dev/cc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/cc.R b/.dev/cc.R index f2031ca48..a51021ac7 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -51,7 +51,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) { return(invisible()) } -cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) { +cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) { if (!missing(cc_dir)) { warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead") path = cc_dir From 26c558d395a5d700f6b7c49c7af1e46e35c54978 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 18:58:51 -0700 Subject: [PATCH 03/39] Retain earlier dcast(fill=list(...)) behavior relying on base coercion behavior for lists (#6051) * Retain fill=list(...) 
behavior * refactor to unclutter line for typical usage * test list->int64 coercion too --- inst/tests/tests.Rraw | 7 ++++++- src/fcast.c | 13 ++++++++----- src/utils.c | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f1670dd6a..fc3b1163c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18314,7 +18314,12 @@ test(2247.4, split(dt, ~y+z), list("a.c"=dt[1], "b.c"=dt[2], "a.d"=dt[3], "b.d"= if (test_bit64) { i64v = as.integer64(c(12345678901234, 70, 20, NA)) apple = data.table(id = c("a", "b", "b"), time = c(1L, 1L, 2L), y = i64v[1:3]) - test(2248, dcast(apple, id ~ time, value.var = "y"), data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) + test(2248.1, dcast(apple, id ~ time, value.var = "y"), ans<-data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) + # associated regression test: downtreams used fill=list() which is not directly supported by coerceAs() + DT = data.table(a=1:2, b=2:3, c=3) + test(2248.2, dcast(DT, a ~ b, value.var='c', fill=list(0L)), data.table(a=1:2, `2`=c(3, 0), `3`=c(0, 3), key='a')) + # also ensure list() gets coerced to integer64 correctly + test(2248.3, dcast(apple, id ~ time, value.var = "y", fill=list(NA)), ans) } # Unit tests for DT[, .SD] retaining secondary indices, #1709 diff --git a/src/fcast.c b/src/fcast.c index d049711bf..334dfd7e8 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -21,14 +21,17 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SEXP thisfill = fill; const SEXPTYPE thistype = TYPEOF(thiscol); int nprotect = 0; - if(some_fill){ + if (some_fill) { if (isNull(fill)) { - if (LOGICAL(is_agg)[0]) { - thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; - } else thisfill = VECTOR_ELT(fill_d, i); + if (LOGICAL(is_agg)[0]) { + thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; + } else + thisfill = VECTOR_ELT(fill_d, i); } if (isVectorAtomic(thiscol)) { // defer error handling to below, but also skip on list - thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++; + // #5980: some callers used fill=list(...) and relied on R's coercion mechanics for lists, which are nontrivial, so just dispatch and double-coerce. + if (isNewList(thisfill)) { thisfill = PROTECT(coerceVector(thisfill, TYPEOF(thiscol))); nprotect++; } + thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++; } } switch (thistype) { diff --git a/src/utils.c b/src/utils.c index 1fba47cac..e59cc8208 100644 --- a/src/utils.c +++ b/src/utils.c @@ -322,7 +322,7 @@ SEXP coerceUtf8IfNeeded(SEXP x) { return(ans); } -// class1 is used by coerseAs only, which is used by frollR.c and nafill.c only +// class1 is used by coerceAs only, which is used by frollR.c and nafill.c only const char *class1(SEXP x) { SEXP cl = getAttrib(x, R_ClassSymbol); if (length(cl)) From 9aacf3a32857b2c1cb20d8e11d254e8acf9cdc58 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 21:50:05 -0700 Subject: [PATCH 04/39] Allow running a subset of tests by pattern (#6040) * Changes to ensure test(number instead of test(name * remaining tests * swapped outputs * don't need verbose setting anymore * more tests leading with symbol, now literal * Allow running a subset of tests by pattern * working version with static analysis * R CMD check fixes * nocov * Add an escape to back up to the full suite. 
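A hedged usage sketch of the new argument (the pattern shown is illustrative): `testPattern=` is a regular expression matched against the number expression of each `test()` call, so an anchored prefix selects one block of tests while, as the comment block in the diff below explains, non-test setup lines between retained tests are kept as well.

```r
# run only the tests whose numbers start with 2150 (for example),
# instead of the full tests.Rraw suite; testPattern=NULL (the default) runs everything
data.table::test.data.table(script="tests.Rraw", testPattern="^2150")
```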
--- R/test.data.table.R | 56 ++++++++++++++++++++++++++++++++++++++++-- man/test.data.table.Rd | 2 ++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 7cb573748..aa1c2c2ea 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,4 +1,4 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, testPattern=NULL, memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) memtest = as.integer(memtest) @@ -38,7 +38,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F scripts = scripts[!grepl("bench|other", scripts)] scripts = gsub("[.]bz2$","",scripts) return(sapply(scripts, function(fn) { - err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) + err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress, testPattern=testPattern)) cat("\n"); isTRUE(err) })) @@ -140,6 +140,58 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.") } + # nocov start: only used interactively -- "production" suites should always run in full + if (!is.null(testPattern)) { + # due to how non-hermetic our tests are, the simple approach (pass this to test(), return early if 'numStr' matches testPattern) + # does not work, or at least getting it to work is not much more efficient (see initial commit of #6040). so instead, + # here we parse the file, extract the tests that match the pattern to a new file, and include other setup lines likely required + # to run the tests successfully. two major drawbacks (1) we can only take a guess which lines are required, so this approach + # can't work (or at least, may need a lot of adjustment) for _every_ test, though not working is also a good sign that test + # should be refactored to be more hermetic (2) not all tests have literal test numbers, meaning we can't always match the + # runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the + # full test expression string, i.e., not just limited to numeric literal test numbers. + arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check + pd = setDT(utils::getParseData(parse(fn))) + file_lines = readLines(fn) + # NB: a call looks like (with id/parent tracking) + # + # name + # ( + # ... + # ... 
+ # ) + # + ## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call + test_calls = pd[pd[pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], list(call_lhs_id = id, call_id = x.parent), on=c(id='parent')], .(line1, id), on=c(id='call_id')] + ## all the arguments for each call to test() + test_call_args = test_calls[pd[token == 'expr'], .(call_id = parent, arg_line = i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] + ## 2nd argument is the num= argument + test_num_expr = test_call_args[ , .SD[2L], by="call_id"] + # NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so + keep_test_ids = test_num_expr[grepl(testPattern, substring(file_lines[arg_line], col1, col2)), call_id] + # Now find all tests just previous to the keep tests; we want to keep non-test setup lines between them, e.g. + # test(drop, ...) + # setup_line1 # retain + # setup_line2 # retain + # test(keep, ...) # retain + intertest_ranges = test_calls[!id %in% keep_test_ids][test_calls[id %in% keep_test_ids], .(preceding_line = x.line1, test_start_line = i.line1), on='line1', roll=TRUE] + # TODO(michaelchirico): this doesn't do well with tests inside control statements. + # those could be included by looking for tests with parent!=0, i.e., not-top-level tests, + # and including the full parent for such tests. omitting for now until needed. + keep_lines = intertest_ranges[, sort(unique(unlist(Map(function(l, u) l:u, preceding_line+1L, test_start_line))))] + header_lines = seq_len(test_calls$line1[1L]-1L) + + tryCatch(error = function(c) warningf("Attempt to subset to %d tests matching '%s' failed, running full suite.", length(keep_test_ids), testPattern), { + new_script = file_lines[c(header_lines, keep_lines)] + parse(text = new_script) # as noted above the static approach is not fool-proof (yet?), so force the script to at least parse before continuing. + fn = tempfile() + on.exit(unlink(fn), add=TRUE) + catf("Running %d of %d tests matching '%s'\n", length(keep_test_ids), nrow(test_calls), testPattern) + writeLines(new_script, fn) + }) + } + # nocov end + err = try(sys.source(fn, envir=env), silent=silent) options(oldOptions) diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index c36e5f9d4..37496fddd 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -8,6 +8,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, showProgress = interactive() && !silent, + testPattern = NULL, memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id = NULL) } @@ -17,6 +18,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } +\item{testPattern}{ When present, a regular expression tested againt the number of each test for inclusion. Useful for running only a small portion of a large test script. } \item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. 
} \item{memtest.id}{ An id for which to print memory usage for every sub id. May be a range of ids. } } From a7a12a93f1c56eae339d74ee82f2f7f2a2ccd6d7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 10 Apr 2024 21:08:34 -0700 Subject: [PATCH 05/39] New option env= for test() (#6072) --- R/test.data.table.R | 14 ++++++++- inst/tests/tests.Rraw | 69 +++++++++++++------------------------------ man/test.Rd | 3 +- 3 files changed, 35 insertions(+), 51 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index aa1c2c2ea..4908f7718 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -303,7 +303,19 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) { + if (!is.null(env)) { + old = Sys.getenv(names(env), names=TRUE, unset=NA) + to_unset = !lengths(env) + # NB: Sys.setenv() (no arguments) errors + if (!all(to_unset)) do.call(Sys.setenv, as.list(env[!to_unset])) + Sys.unsetenv(names(env)[to_unset]) + on.exit(add=TRUE, { + is_preset = !is.na(old) + if (any(is_preset)) do.call(Sys.setenv, as.list(old[is_preset])) + Sys.unsetenv(names(old)[!is_preset]) + }) + } if (!is.null(options)) { old_options <- do.call('options', as.list(options)) # as.list(): allow passing named character vector for convenience on.exit(options(old_options), add=TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fc3b1163c..078a7d173 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14290,35 +14290,24 @@ test(1997.06, setDTthreads(percent=NULL), error="but is length 0") test(1997.07, setDTthreads(percent=1:2), error="but is length 2") test(1997.08, setDTthreads(restore_after_fork=21), error="must be TRUE, FALSE, or NULL") old = getDTthreads() # (1) -oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") -oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS") -Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0") -test(1997.09, setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") +test(1997.09, env = c(R_DATATABLE_NUM_THREADS="", R_DATATABLE_NUM_PROCS_PERCENT="3.0"), setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") new = getDTthreads() # old above at (1) may not have been default. new now is. 
test(1997.10, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="1") -test(1997.11, setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") +test(1997.11, env=c(R_DATATABLE_NUM_PROCS_PERCENT="1"), setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") test(1997.12, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") -test(1997.13, setDTthreads(), new) +test(1997.13, env=c(R_DATATABLE_NUM_PROCS_PERCENT="75"), setDTthreads(), new) new = getDTthreads() setDTthreads(percent=75) test(1997.14, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="100") -setDTthreads() +test(1997.15, env=c(R_DATATABLE_NUM_PROCS_PERCENT="100"), setDTthreads(), new) allcpu = getDTthreads() -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") -Sys.setenv(R_DATATABLE_NUM_THREADS=allcpu) -setDTthreads() -test(1997.15, getDTthreads(), allcpu) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv1) -Sys.setenv(R_DATATABLE_NUM_THREADS=oldenv2) -test(1997.16, setDTthreads(old), allcpu) -test(1997.17, getDTthreads(), old) -test(1997.18, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") +test(1997.16, env=c(R_DATATABLE_NUM_PROCS_PERCENT="75", R_DATATABLE_NUM_THREADS=allcpu), setDTthreads(), allcpu) +test(1997.17, getDTthreads(), allcpu) +test(1997.18, setDTthreads(old), allcpu) +test(1997.19, getDTthreads(), old) +test(1997.20, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") setDTthreads(throttle=65536) -test(1997.19, getDTthreads(TRUE), output="throttle==65536") +test(1997.21, getDTthreads(TRUE), output="throttle==65536") setDTthreads(throttle=1024) # test that a copy is being made and output is printed, #3385 after partial revert of #3281 @@ -16429,14 +16418,9 @@ test(2122.2, DT, data.table(V3=5:6)) dt = data.table(SomeNumberA=c(1,1,1),SomeNumberB=c(1,1,1)) test(2123, dt[, .(.N, TotalA=sum(SomeNumberA), TotalB=sum(SomeNumberB)), by=SomeNumberA], data.table(SomeNumberA=1, N=3L, TotalA=1, TotalB=3)) -# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085 -oldtz=Sys.getenv('TZ', unset=NA) -Sys.setenv(TZ='Asia/Jakarta') # UTC+7 -t0 = as.POSIXct('2019-10-01') -test(2124.1, format(as.ITime(t0)), '00:00:00') -test(2124.2, format(as.IDate(t0)), '2019-10-01') -if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) -# careful to unset because TZ="" means UTC whereas unset TZ means local, #4261 and #4464 +# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085, #4261, #4464 +test(2124.1, env=c(TZ='Asia/Jakarta'), format(as.ITime(as.POSIXct('2019-10-01'))), '00:00:00') +test(2124.2, env=c(TZ='Asia/Jakarta'), format(as.IDate(as.POSIXct('2019-10-01'))), '2019-10-01') # trunc.cols in print.data.table, #4074 old_width = options("width" = 40L) @@ -16799,20 +16783,12 @@ if (.Platform$OS.type=="windows") local({ ) x_old = Map(Sys.getlocale, names(x)) invisible(Map(Sys.setlocale, names(x), x)) - old = Sys.getenv('LANGUAGE') - Sys.setenv('LANGUAGE' = 'zh_CN') - on.exit({ - if (nzchar(old)) - Sys.setenv('LANGUAGE' = old) - else - Sys.unsetenv('LANGUAGE') - invisible(Map(Sys.setlocale, names(x_old), x_old)) - }, add = TRUE) + on.exit(Map(Sys.setlocale, names(x_old), x_old)) # triggered segfault here in #4402, Windows-only under translation. 
# test that the argument order changes correctly (the 'item 2' moves to the beginning of the message) # since the argument order changes in this example (and that was the crash) we don't need to test # the display of the Chinese characters here. Thanks to @shrektan for all his help on this. - test(2143, rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") + test(2143, env=c(LANGUAGE='zh_CN'), rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") }) # test back to English (the argument order is back to 1,c,2,1) test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") @@ -16871,18 +16847,13 @@ tmp = tempfile() fwrite(DT, tmp) test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preserving fwrite(DT, tmp, dateTimeAs='write.csv') # as write.csv, writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker -oldtz = Sys.getenv("TZ", unset=NA) -Sys.unsetenv("TZ") -test(2150.021, sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character -test(2150.022, fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC -Sys.setenv(TZ="UTC") -test(2150.023, fread(tmp), DT) # TZ environment variable is also recognized +test(2150.021, env=list(TZ=NULL), sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character +test(2150.022, env=list(TZ=NULL), fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC +test(2150.023, env=c(TZ='UTC'), fread(tmp), DT) # TZ environment variable is also recognized if (.Platform$OS.type!="windows") { - Sys.setenv(TZ="") # on Windows this unsets TZ, see ?Sys.setenv - test(2150.024, fread(tmp), DT) + test(2150.024, env=c(TZ=''), fread(tmp), DT) # on Windows this unsets TZ, see ?Sys.setenv # blank TZ env variable on non-Windows is recognized as UTC consistent with C and R; but R's tz= argument is the opposite and uses "" for local } -Sys.unsetenv("TZ") # Notes: # - from v1.14.0 tz="" needed # - as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) @@ -16891,11 +16862,11 @@ Sys.unsetenv("TZ") # as.POSIXct() failure means 'times' is returned as a character, hence no 'tzone' attribute. # fread() will also throw a warning, one substring of which will be the reproduced base R error. test(2150.025, + env=list(TZ=NULL), attr(fread(tmp, colClasses=list(POSIXct="times"), tz="")$times, "tzone"), if (is.null(base_messages$maybe_invalid_old_posixct)) "" else NULL, warning=base_messages$maybe_invalid_old_posixct) # the times will be different though here because as.POSIXct read them as local time. -if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp) test(2150.03, fread(tmp), DT) fwrite(copy(DT)[ , times := format(times, '%FT%T+0000')], tmp) diff --git a/man/test.Rd b/man/test.Rd index ddf1198bf..d264d98af 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL) + options = NULL, env = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. 
Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -21,6 +21,7 @@ test(num, x, y = TRUE, \item{notOutput}{ Or if you are testing that a feature does \emph{not} print particular console output. Case insensitive (unlike output) so that the test does not incorrectly pass just because the string is not found due to case. } \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to `test()` (usually, `x`, or maybe `y`) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } +\item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. 
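To make the conversion pattern in the hunks above concrete, a brief sketch of the new `env=` argument (hypothetical test ids): a named vector sets environment variables for the duration of the single `test()` call, a list entry of `NULL` unsets the variable, and the previous values are restored on exit.

```r
# hypothetical test ids; mirrors the TZ conversions in the diff above
test(9999.1, env=c(TZ="UTC"), format(as.IDate(as.POSIXct("2019-10-01"))), "2019-10-01")
test(9999.2, env=list(TZ=NULL), Sys.getenv("TZ", unset="unset"), "unset")
```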
From fa6204d35cc77731291a1a0b4574cf5e8e695f60 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 10 Apr 2024 21:34:12 -0700 Subject: [PATCH 06/39] Clean up getRversion overwrite to avoid error (#6073) --- .dev/cc.R | 1 + 1 file changed, 1 insertion(+) diff --git a/.dev/cc.R b/.dev/cc.R index a51021ac7..28f398f16 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -38,6 +38,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) { if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly") return(invisible()) } + suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports))) for (ii in seq_along(nsParsedImports)) { From 94e4be689733ea50b0a0cae5e13e97efd0d4255b Mon Sep 17 00:00:00 2001 From: Anirban Date: Wed, 10 Apr 2024 22:58:22 -0700 Subject: [PATCH 07/39] Added my workflow (Marketplace version) and the two tests I used in the examples --- .github/workflows/autocomment.yml | 21 ++++++++++ inst/atime/tests.R | 69 +++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 .github/workflows/autocomment.yml create mode 100644 inst/atime/tests.R diff --git a/.github/workflows/autocomment.yml b/.github/workflows/autocomment.yml new file mode 100644 index 000000000..94a906a36 --- /dev/null +++ b/.github/workflows/autocomment.yml @@ -0,0 +1,21 @@ +name: Autocomment atime-based performance regression analysis on PRs + +on: + pull_request: + branches: + - '*' + types: + - opened + - reopened + - synchronize + +jobs: + comment: + runs-on: ubuntu-latest + container: ghcr.io/iterative/cml:0-dvc2-base1 + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + repo_token: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + steps: + - uses: Anirban166/Autocomment-atime-results@v1.1.6 \ No newline at end of file diff --git a/inst/atime/tests.R b/inst/atime/tests.R new file mode 100644 index 000000000..7095ae350 --- /dev/null +++ b/inst/atime/tests.R @@ -0,0 +1,69 @@ +pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { + pkg_find_replace <- function(glob, FIND, REPLACE) { + atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) + } + Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE) + Package_ <- gsub(".", "_", old.Package, fixed = TRUE) + new.Package_ <- paste0(Package_, "_", sha) + pkg_find_replace( + "DESCRIPTION", + paste0("Package:\\s+", old.Package), + paste("Package:", new.Package)) + pkg_find_replace( + file.path("src", "Makevars.*in"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + sprintf('packageVersion\\("%s"\\)', old.Package), + sprintf('packageVersion\\("%s"\\)', new.Package)) + pkg_find_replace( + file.path("src", "init.c"), + paste0("R_init_", Package_regex), + paste0("R_init_", gsub("[.]", "_", new.Package_))) + pkg_find_replace( + "NAMESPACE", + sprintf('useDynLib\\("?%s"?', Package_regex), + paste0('useDynLib(', new.Package_)) + } + +test.list <- list( + # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 + # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 + "Test regression 
fixed in #4440" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3,8), + setup = quote({ + set.seed(1L) + dt <- data.table(a = sample(N, N)) + setindex(dt, a) + }), + expr = quote(data.table:::shallow(dt)), + "Before" = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This should be changed later. Currently, the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) + "Regression" = "752012f577f8e268bb6d0084ca39a09fa7fbc1c4", # A commit that is affected by the regression: https://github.com/Rdatatable/data.table/commit/752012f577f8e268bb6d0084ca39a09fa7fbc1c4 + "Fixed" = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # The merge commit in #4440, the PR that fixed the regression: https://github.com/Rdatatable/data.table/commit/9d3b9202fddb980345025a4f6ac451ed26a423be + + # Test based on: https://github.com/Rdatatable/data.table/issues/5424 + # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 + # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 + "Test regression fixed in #5463" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3, 8), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + setup = quote({ + n <- N/100 + set.seed(1L) + dt <- data.table( + g = sample(seq_len(n), N, TRUE), + x = runif(N), + key = "g") + dt_mod <- copy(dt) + }), + "Before" = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # The commit in PR #4491 that comes before the regression introducting commit: https://github.com/Rdatatable/data.table/pull/4491/commits/be2f72e6f5c90622fe72e1c315ca05769a9dc854 + "Regression" = "e793f53466d99f86e70fc2611b708ae8c601a451", # The commit in #4491 that introduced the regression: https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451 + "Fixed" = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in #5463, the PR that fixed the regression: https://github.com/Rdatatable/data.table/pull/5463/commits/58409197426ced4714af842650b0cc3b9e2cb842 +) \ No newline at end of file From 2de34e729b355614c6742ad0cb5fc5311d8bf779 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Apr 2024 07:25:29 -0700 Subject: [PATCH 08/39] Add a deep-but-infrequent GHA (#6076) --- .github/workflows/R-CMD-check-occasional.yaml | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 .github/workflows/R-CMD-check-occasional.yaml diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml new file mode 100644 index 000000000..1358f0538 --- /dev/null +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -0,0 +1,96 @@ +on: + schedule: + - cron: '18 13 8 * *' # 8th of month at 13:18 UTC + +# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release +name: R-CMD-check-occasional + +jobs: + R-CMD-check-occasional: + runs-on: ${{ matrix.os }} + + name: ${{ matrix.os }} (${{ matrix.r }}) + + strategy: + matrix: + os: [macOS-latest, windows-latest, ubuntu-latest] + r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3'] + locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502) + exclude: + - os: ['macOS-latest', 
'windows-latest'] # only run non-English locale CI on Ubuntu + locale: ['zh_CN.utf8', 'lv_LV.utf8'] + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Set locale + if: matrix.locale == 'en_US.utf8' + run: | + sudo locale-gen en_US + echo "LC_ALL=en_US.utf8" >> $GITHUB_ENV + + - name: Set locale + if: matrix.locale == 'zh_CN.utf8' + run: | + sudo locale-gen zh_CN + echo "LC_ALL=zh_CN.utf8" >> $GITHUB_ENV + echo "LANGUAGE=zh_CN" >> $GITHUB_ENV + + - name: Set locale + if: matrix.locale == 'lv_LV.utf8' + run: | + sudo locale-gen lv_LV + echo "LC_ALL=lv_LV.utf8" >> $GITHUB_ENV + echo "LANGUAGE=lv_LV" >> $GITHUB_ENV + + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r }} + + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.r }}-results + path: check From 317139752a219140709b0250ef27df9e824bfb15 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Thu, 11 Apr 2024 07:55:48 -0700 Subject: [PATCH 09/39] Use %s in 'should be TRUE or FALSE' messages (#6075) * refactor should be TF messages * revert R-level message * consistency * changed msg in nafill, changed test 2003.2 to align with consistency --------- Co-authored-by: Michael Chirico --- inst/tests/tests.Rraw | 2 +- src/fastmean.c | 2 +- src/nafill.c | 2 +- src/rbindlist.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 078a7d173..405ccd0a0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14366,7 +14366,7 @@ test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logica #rbindlist coverage test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") -test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE") +test(2003.2, rbindlist(list(), fill=1), error="fill should be TRUE or FALSE") test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4))) test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), diff --git a/src/fastmean.c b/src/fastmean.c index 2fcc6ebd2..1c9b3eb64 100644 --- a/src/fastmean.c +++ b/src/fastmean.c @@ -36,7 +36,7 @@ SEXP fastmean(SEXP 
args) if (length(args)>2) { tmp = CADDR(args); if (!isLogical(tmp) || LENGTH(tmp)!=1 || LOGICAL(tmp)[0]==NA_LOGICAL) - error(_("narm should be TRUE or FALSE")); // # nocov ; [.data.table should construct the .External call correctly + error(_("%s should be TRUE or FALSE"), "narm"); // # nocov ; [.data.table should construct the .External call correctly narm=LOGICAL(tmp)[0]; } PROTECT(ans = allocNAVector(REALSXP, 1)); diff --git a/src/nafill.c b/src/nafill.c index 03aa6d091..5fe81933d 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -100,7 +100,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S bool binplace = LOGICAL(inplace)[0]; if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) - error(_("nan_is_na must be TRUE or FALSE")); // # nocov + error(_("%s must be TRUE or FALSE"), "nan_is_na"); // # nocov bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; SEXP x = R_NilValue; diff --git a/src/rbindlist.c b/src/rbindlist.c index ba19d2c38..d8cd32476 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -5,7 +5,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) { if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL) - error(_("fill= should be TRUE or FALSE")); + error(_("%s should be TRUE or FALSE"), "fill"); if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA if (!length(l)) return(l); From 0711e4004d48db749fa39dd407c87bb06726b1c0 Mon Sep 17 00:00:00 2001 From: Nitish Jha <151559388+Nj221102@users.noreply.github.com> Date: Thu, 11 Apr 2024 22:02:00 +0530 Subject: [PATCH 10/39] Added skip_absent arguement to colnamesInt() (#6068) * Added skip_absent arguement to colnamesInt() * Update NEWS.md * Update NEWS.md * Update utils.c * Update utils.c * Update utils.c * Update utils.c * Update utils.c * added test * Update src/nafill.c Co-authored-by: Michael Chirico * Update src/utils.c Co-authored-by: Michael Chirico * Update src/utils.c Co-authored-by: Michael Chirico * Update src/utils.c Co-authored-by: Michael Chirico * Implemented suggestions * small fix * Update utils.c * minor issues * restore comment for now * Update nafill.Rraw * adjusted any colno. > ncol to 0L * Added test and changed refrence to deep copy * annotate test purpose * More careful about when duplicate() is needed * refine comment * whitespace * Add a new test against duplicates for numeric input * update last test number --------- Co-authored-by: nitish jha Co-authored-by: Michael Chirico --- R/wrappers.R | 2 +- inst/tests/nafill.Rraw | 16 +++++++++++++++- src/data.table.h | 2 +- src/nafill.c | 2 +- src/utils.c | 27 +++++++++++++++++++-------- 5 files changed, 37 insertions(+), 12 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index dcf8ba08e..a018b91ae 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -8,7 +8,7 @@ setcoalesce = function(...) 
.Call(Ccoalesce, list(...), TRUE) fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) -colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) +colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(CcolnamesInt, x, cols, check_dups, skip_absent) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index b72c0b506..cf65f61bf 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -149,8 +149,22 @@ test(4.20, colnamesInt(dt, integer()), integer()) test(4.21, colnamesInt(dt, NULL), seq_along(dt)) test(4.22, colnamesInt("asd", 1), error="must be data.table compatible") test(4.23, colnamesInt(dt, 1, check_dups="a"), error="check_dups") +test(4.24, colnamesInt(dt, c("a", "e"), skip_absent=TRUE), c(1L,0L)) +test(4.25, colnamesInt(dt, c(1L, 4L), skip_absent=TRUE), c(1L,0L)) +test(4.26, colnamesInt(dt, c(1, 4), skip_absent=TRUE), c(1L,0L)) +test(4.27, colnamesInt(dt, c("a", NA), skip_absent=TRUE), c(1L,0L)) +test(4.28, colnamesInt(dt, c(1L, 0L), skip_absent=TRUE), error="received non-existing column*.*0") +test(4.29, colnamesInt(dt, c(1, -5), skip_absent=TRUE), error="received non-existing column*.*-5") +test(4.30, colnamesInt(dt, c(1, 4), skip_absent=NULL), error="skip_absent must be TRUE or FALSE") +test(4.31, colnamesInt(dt, c(1L, 1000L), skip_absent=TRUE), c(1L,0L)) +cols=c(1L,100L) +test(4.32, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) +test(4.33, cols, c(1L, 100L)) # ensure input was not overwritten with output 0 +cols=c(1,100) +test(4.34, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) +test(4.35, cols, c(1, 100)) # ensure input was not overwritten with output 0 names(dt) <- NULL -test(4.24, colnamesInt(dt, "a"), error="has no names") +test(4.36, colnamesInt(dt, "a"), error="has no names") # verbose dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_)) diff --git a/src/data.table.h b/src/data.table.h index 21b7e30e0..297167d46 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -238,7 +238,7 @@ bool isRealReallyInt(SEXP x); SEXP isRealReallyIntR(SEXP x); SEXP isReallyReal(SEXP x); bool allNA(SEXP x, bool errorForBadType); -SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups); +SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); bool INHERITS(SEXP x, SEXP char_); SEXP copyAsPlain(SEXP x); void copySharedColumns(SEXP x); diff --git a/src/nafill.c b/src/nafill.c index 5fe81933d..8d50f32ea 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -114,7 +114,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S obj = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list SET_VECTOR_ELT(obj, 0, obj1); } - SEXP ricols = PROTECT(colnamesInt(obj, cols, ScalarLogical(TRUE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) + SEXP ricols = PROTECT(colnamesInt(obj, cols, /* check_dups= */ ScalarLogical(TRUE), /* skip_absent= */ ScalarLogical(FALSE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++; int *icols = INTEGER(ricols); for (int i=0; inx) || (icols[i]<1)) + for (int i=0; inx) || (icols[i]<1)) error(_("argument specifying columns received non-existing column(s): cols[%d]=%d"), i+1, icols[i]); // 
handles NAs also + else if(bskip_absent && icols[i]>nx) + icols[i] = 0L; } } else if (isString(cols)) { SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++; @@ -133,9 +142,11 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) { error(_("'x' argument data.table has no names")); ricols = PROTECT(chmatch(cols, xnames, 0)); protecti++; int *icols = INTEGER(ricols); - for (int i=0; i Date: Thu, 11 Apr 2024 09:40:56 -0700 Subject: [PATCH 11/39] Automatic detection of dec (. or ,) (#4482) * initial progress on automatic dec=, detection * if sep=, detected, turn off auto-dec * first pass at NEWS and man * add comments, tests * improve man * add verbose output, tests --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fread.R | 5 +++-- inst/tests/tests.Rraw | 36 +++++++++++++++++++++++-------- man/fread.Rd | 8 +++---- src/fread.c | 50 ++++++++++++++++++++++++++++++++++++++----- src/fread.h | 2 ++ src/freadR.c | 5 +++-- 7 files changed, 86 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4fa8d699b..27c35e385 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,8 @@ 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. +9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. 
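A minimal round trip illustrating the `dec='auto'` default announced in the NEWS entry above (toy data and a temporary file are illustrative; this mirrors the tests added further down in this patch):

```r
library(data.table)
DT = data.table(a = letters[1:3], b = c(1.5, 2.25, 3.75))
f = tempfile()
fwrite(DT, f, sep = ";", dec = ",")   # ';'-separated file using ',' as the decimal mark
fread(f)                              # with dec="auto", ',' is detected and b is read back as numeric
fread(f, verbose = TRUE)              # the verbose log reports which decimal separator was chosen
```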
diff --git a/R/fread.R b/R/fread.R index b2e55403d..66bda3fb1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ fread = function( -input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", +input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, @@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (sep=="auto") sep="" # sep=="" at C level means auto sep else stopifnot( nchar(sep)==1L ) # otherwise an actual character to use as sep } - stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L ) + stopifnot( is.character(dec), length(dec)==1L) + if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L) # handle encoding, #563 if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 405ccd0a0..d3a0e37e8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2681,15 +2681,13 @@ if (test_bit64) { test(897, class(DT$b), "integer64") test(898, fread(f), DT) unlink(f) - DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] - DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] - DT[,r:=a/100][,r2:=gsub(" ","",format(r))] - DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 - DT[113, a3:=3.14] - DT[114, a4:="123A"] - DT[115, b2:=1234567890123.45] - DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) - DT[117, r2:="3.14A"] + DT[ , a2 := as.integer64(a)][112L, a2 := as.integer64(12345678901234)] # start on row 112 to avoid the first 100 + DT[ , a3 := as.double(a) ][113L, a3 := 3.14] + DT[ , a4 := as.character(a)][114L, a4 := "123A"] + DT[ , b2 := as.double(b) ][115L, b2 := 1234567890123.45] + DT[ , b3 := as.character(b)][116L, b3 := "12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) + DT[ , r := a/100] + DT[ , r2 := as.character(r)][117L, r2 := "3.14A"] fwrite(DT,f<-tempfile()) test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE), @@ -18432,3 +18430,23 @@ DF <- structure( ) test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5") + +# automatic detection of dec=',' for #2431 +DT = data.table(a = letters, b = 1:26/6, c = 1:26) +## auto-detect dec=',' +fwrite(DT, f <- tempfile(), dec=',', sep=';') +test(2256.1, fread(f), DT) + +fwrite(DT, f, dec=',', sep='|') +test(2256.2, fread(f), DT) + +## auto-detect dec='.' 
+fwrite(DT, f) +test(2256.3, fread(f), DT) + +## verbose output +test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'") + +fwrite(DT, f, dec=',', sep=';') +test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18") +test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ") diff --git a/man/fread.Rd b/man/fread.Rd index 49b187364..d397a441d 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -9,7 +9,7 @@ \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. } \usage{ -fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"", +fread(input, file, text, cmd, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), # due to change to ""; see NEWS stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE), @@ -47,7 +47,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code. } \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". } - \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." (default) then usually ",". See details. } + \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. } \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } @@ -79,9 +79,9 @@ If an empty line is encountered then reading stops there with warning if any tex \bold{Line endings:} All known line endings are detected automatically: \code{\\n} (*NIX including Mac), \code{\\r\\n} (Windows CRLF), \code{\\r} (old Mac) and \code{\\n\\r} (just in case). There is no need to convert input files first. \code{fread} running on any architecture will read a file from any architecture. 
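Returning to the `dec` argument documented above: after this patch the detection can also be observed on inline input, and supplying `dec` explicitly bypasses it. A small sketch, mirroring the tests added earlier in this patch:

```r
library(data.table)
fread("a;b\n1,14;5", verbose = TRUE)  # detection reports dec=','; parses a=1.14 (double), b=5 (integer)
fread("a;b\n1,14;5", dec = ",")       # an explicit dec (anything other than "auto") is used as-is, no detection
```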
Both \code{\\r} and \code{\\n} may be embedded in character strings (including column names) provided the field is quoted. -\bold{Decimal separator and locale:} \code{fread(\dots,dec=",")} should just work. \code{fread} uses C function \code{strtod} to read numeric data; e.g., \code{1.23} or \code{1,23}. \code{strtod} retrieves the decimal separator (\code{.} or \code{,} usually) from the locale of the R session rather than as an argument passed to the \code{strtod} function. So for \code{fread(\dots,dec=",")} to work, \code{fread} changes this (and only this) R session's locale temporarily to a locale which provides the desired decimal separator. +\bold{Decimal separator:} \code{dec} is used to parse numeric fields as the separator between integral and fractional parts. When \code{dec='auto'}, during column type detection, when a field is a candidate for being numeric (i.e., parsing as lower types has already failed), \code{dec='.'} is tried, and, if it fails to create a numeric field, \code{dec=','} is tried. At the end of the sample lines, if more were successfully parsed with \code{dec=','}, \code{dec} is set to \code{','}; otherwise, \code{dec} is set to \code{'.'}. -On Windows, "French_France.1252" is tried which should be available as standard (any locale with comma decimal separator would suffice) and on unix "fr_FR.utf8" (you may need to install this locale on unix). \code{fread()} is very careful to set the locale back again afterwards, even if the function fails with an error. The choice of locale is determined by \code{options()$datatable.fread.dec.locale}. This may be a \emph{vector} of locale names and if so they will be tried in turn until the desired \code{dec} is obtained; thus allowing more than two different decimal separators to be selected. This is a new feature in v1.9.6 and is experimental. In case of problems, turn it off with \code{options(datatable.fread.dec.experiment=FALSE)}. +Automatic detection of \code{sep} occurs \emph{prior} to column type detection -- as such, it is possible that \code{sep} has been inferred to be \code{','}, in which case \code{dec} is set to \code{'.'}. \bold{Quotes:} diff --git a/src/fread.c b/src/fread.c index a1521fb37..e2602e596 100644 --- a/src/fread.c +++ b/src/fread.c @@ -33,6 +33,7 @@ static const char *sof, *eof; static char sep; static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t') static char quote, dec; +static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.' static bool eol_one_r; // only true very rarely for \r-only files // Quote rule: @@ -1206,11 +1207,16 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped skip_white(&ch); if (eol(&ch)) return 0; // empty line int field=0; + const bool autoDec = dec == '\0'; while (field>(%d)"), strlim(ch,20), quoteRule); skip_white(&ch); const char *fieldStart = ch; while (tmpType[field]<=CT_STRING) { + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '\0') { // guess . first + dec = '.'; + } + fun[tmpType[field]](&fctx); if (end_of_field(ch)) break; skip_white(&ch); @@ -1234,9 +1240,19 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped } } ch = fieldStart; + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '.') { // . didn't parse a double; try , + dec = ','; + continue; + } while (++tmpType[field]=eof) break; // The 9th jump could reach the end in the same situation and that's ok. 
As long as the end is sampled is what we want. bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines int jumpLine = 0; // line from this jump point start + linesForDecDot = 0; while(ch0, apply the bumps (if any) at the end of the successfully completed jump sample ASSERT(jump>0, "jump(%d)>0", jump); @@ -1906,7 +1936,17 @@ int freadMain(freadMainArgs _args) { if (args.header==NA_BOOL8) { for (int j=0; j0) for (int j=0; jCT_EMPTY) { args.header=true; diff --git a/src/fread.h b/src/fread.h index 1e2783643..89dea2592 100644 --- a/src/fread.h +++ b/src/fread.h @@ -36,6 +36,8 @@ typedef enum { NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds } colType; +#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT) // types where dec matters + extern int8_t typeSize[NUMTYPE]; extern const char typeName[NUMTYPE][10]; extern const long double pow10lookup[301]; diff --git a/src/freadR.c b/src/freadR.c index 97fbfadac..035c76eda 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -102,9 +102,10 @@ SEXP freadR( error(_("Internal error: freadR sep not a single character. R level catches this.")); // # nocov args.sep = CHAR(STRING_ELT(sepArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level - if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1)) + if (!isString(decArg) || LENGTH(decArg)!=1 || strlen(CHAR(STRING_ELT(decArg,0)))>1) { error(_("Internal error: freadR dec not a single character. R level catches this.")); // # nocov - args.dec = CHAR(STRING_ELT(decArg,0))[0]; + } + args.dec = CHAR(STRING_ELT(decArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level if (IS_FALSE(quoteArg)) { args.quote = '\0'; From 523e3cc90553060f072e647a4ac68e921deb89b3 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 12:12:03 -0700 Subject: [PATCH 12/39] Renamed workflow as per Toby's suggestion --- .github/workflows/{autocomment.yml => performance-tests.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{autocomment.yml => performance-tests.yml} (87%) diff --git a/.github/workflows/autocomment.yml b/.github/workflows/performance-tests.yml similarity index 87% rename from .github/workflows/autocomment.yml rename to .github/workflows/performance-tests.yml index 94a906a36..027854faa 100644 --- a/.github/workflows/autocomment.yml +++ b/.github/workflows/performance-tests.yml @@ -18,4 +18,4 @@ jobs: repo_token: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes steps: - - uses: Anirban166/Autocomment-atime-results@v1.1.6 \ No newline at end of file + - uses: Anirban166/Autocomment-atime-results@v1.1.6 From e07565ceb79fa2153b903aa3440dd0e27adf57c0 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 17:24:46 -0700 Subject: [PATCH 13/39] Made the suggested changes --- inst/atime/tests.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 7095ae350..94844b7b3 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -43,9 +43,9 @@ test.list <- list( setindex(dt, a) }), expr = quote(data.table:::shallow(dt)), - "Before" = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This should be changed later. Currently, the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. 
(This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) - "Regression" = "752012f577f8e268bb6d0084ca39a09fa7fbc1c4", # A commit that is affected by the regression: https://github.com/Rdatatable/data.table/commit/752012f577f8e268bb6d0084ca39a09fa7fbc1c4 - "Fixed" = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # The merge commit in #4440, the PR that fixed the regression: https://github.com/Rdatatable/data.table/commit/9d3b9202fddb980345025a4f6ac451ed26a423be + Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) + Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression + Fixed = "769f02c6fbbb031391a79f46c6042de99f1ea712"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440/commits) # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 @@ -63,7 +63,7 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - "Before" = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # The commit in PR #4491 that comes before the regression introducting commit: https://github.com/Rdatatable/data.table/pull/4491/commits/be2f72e6f5c90622fe72e1c315ca05769a9dc854 - "Regression" = "e793f53466d99f86e70fc2611b708ae8c601a451", # The commit in #4491 that introduced the regression: https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451 - "Fixed" = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in #5463, the PR that fixed the regression: https://github.com/Rdatatable/data.table/pull/5463/commits/58409197426ced4714af842650b0cc3b9e2cb842 -) \ No newline at end of file + Before = "19b7866112614db53eb3e909c097407d91cd6738", # Parent of the regression commit (https://github.com/Rdatatable/data.table/commit/0895fa247afcf6b38044bd5f56c0d209691ddb31), which is the parent of the first commit in the PR that causes the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) + Regression = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # The regression commit is the parent of the first commit in the PR that fixed the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) + Fixed = "1e03fe7b890e63da9651d997ea52548c90b3ae32") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5493/commits) +) From a3d5cf938d6831cda483c79d7b6e4194865dde7c Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 18:10:32 -0700 Subject: [PATCH 14/39] Reverted changes to the 'Fixed' commit SHA for the first test case since the last commit of #4440 failed to check out --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 94844b7b3..e023eec21 
100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -45,7 +45,7 @@ test.list <- list( expr = quote(data.table:::shallow(dt)), Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression - Fixed = "769f02c6fbbb031391a79f46c6042de99f1ea712"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440/commits) + Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 From 3093a35db700441e0cb2b3fca5a67f36a17ddce1 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 19:11:03 -0700 Subject: [PATCH 15/39] Reverted changes to the 'Fixed' commit SHA for the second test case as well since the newly provided commit SHA is wrong --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index e023eec21..6f38660b4 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -65,5 +65,5 @@ test.list <- list( }), Before = "19b7866112614db53eb3e909c097407d91cd6738", # Parent of the regression commit (https://github.com/Rdatatable/data.table/commit/0895fa247afcf6b38044bd5f56c0d209691ddb31), which is the parent of the first commit in the PR that causes the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) Regression = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # The regression commit is the parent of the first commit in the PR that fixed the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) - Fixed = "1e03fe7b890e63da9651d997ea52548c90b3ae32") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5493/commits) + Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) From 48b2dddc551b7a2708cc3f6de061bcabf26513b9 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 19:29:09 -0700 Subject: [PATCH 16/39] Added the suggested path filters --- .github/workflows/performance-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 027854faa..0473dcbc2 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -8,6 +8,9 @@ on: - opened - reopened - synchronize + paths: + - 'R/**' + - 'src/**' jobs: comment: From fb8ca6f696624cee3a13e02cf69356341c6d7763 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 19:42:50 -0700 Subject: [PATCH 17/39] Don't need R to retain the 
source code attributes when parsing and saving functions --- .github/workflows/performance-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 0473dcbc2..9c8cc664f 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -19,6 +19,5 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} repo_token: ${{ secrets.GITHUB_TOKEN }} - R_KEEP_PKG_SOURCE: yes steps: - uses: Anirban166/Autocomment-atime-results@v1.1.6 From cd0331ff7942e3dce4b39a6c68deefd8e406fbfb Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 21:28:50 -0700 Subject: [PATCH 18/39] Added pseudo-roxygen style comments for the pkg.edit.fun function. --- inst/atime/tests.R | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 6f38660b4..cdbc7326e 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -1,3 +1,26 @@ +# A function to customize R package metadata and source files to facilitate version-specific installation and testing. +# +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in `Makevars` and version checking code in `onLoad.R`) +# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. +# It appends a SHA1 hash to the package name (`PKG.SHA`), ensuring each version can be installed and tested separately. +# +# @param old.Package Current name of the package. +# @param new.Package New name of the package, including a SHA hash. +# @param sha SHA1 hash used for differentiating versions. +# @param new.pkg.path Path to the package files. +# +# @details +# The function modifies: +# - DESCRIPTION, updating the package name. +# - Makevars, customizing the shared object file name and adjusting the build settings. +# - R/onLoad.R, adapting custom version checking for package loading operations. +# - NAMESPACE, changing namespace settings for dynamic linking. +# +# @examples +# pkg.edit.fun("data.table", "data.table.some_40_digit_SHA1_hash", "some_40_digit_SHA1_hash", "/path/to/data.table") +# +# @return None (performs in-place file modifications) +# @note This setup is typically unnecessary for most packages but essential for `data.table` due to its unique configuration. pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { pkg_find_replace <- function(glob, FIND, REPLACE) { atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) From a11a2e2840c4a3b8e84c639b24ae777ea62ae74c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Apr 2024 22:49:00 -0700 Subject: [PATCH 19/39] spell out "significant figures" (#6081) --- man/setNumericRounding.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/setNumericRounding.Rd b/man/setNumericRounding.Rd index 87ce2256b..f9e00de27 100644 --- a/man/setNumericRounding.Rd +++ b/man/setNumericRounding.Rd @@ -18,8 +18,8 @@ Computers cannot represent some floating point numbers (such as 0.6) precisely, using base 2. This leads to unexpected behaviour when joining or grouping columns of type 'numeric'; i.e. 'double', see example below. In cases where this is undesirable, data.table allows rounding such data up to -approximately 11 s.f. which is plenty of digits for many cases. This is -achieved by rounding the last 2 bytes off the significand. 
Other possible +approximately 11 significant figures which is plenty of digits for many cases. +This is achieved by rounding the last 2 bytes off the significand. Other possible values are 1 byte rounding, or no rounding (full precision, default). It is bytes rather than bits because it is tied in with the radix sort From 716da67b6045bf57a8049102020d6cd2f2a7e033 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 22:55:33 -0700 Subject: [PATCH 20/39] Made the suggested changes (and reverted to the correct commits for the second test case) --- inst/atime/tests.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index cdbc7326e..a02f07b52 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -86,7 +86,7 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - Before = "19b7866112614db53eb3e909c097407d91cd6738", # Parent of the regression commit (https://github.com/Rdatatable/data.table/commit/0895fa247afcf6b38044bd5f56c0d209691ddb31), which is the parent of the first commit in the PR that causes the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) - Regression = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # The regression commit is the parent of the first commit in the PR that fixed the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) + Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Commit preceding the regression causing commit (https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) + Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) From 047c90f0d6674a18a2a31b33d4a0521f182bf83a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 00:03:06 -0700 Subject: [PATCH 21/39] regression test for #1873 (#6080) --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d3a0e37e8..d33bd72a6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18450,3 +18450,7 @@ test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'") fwrite(DT, f, dec=',', sep=';') test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18") test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ") + +# helpful error about deleting during grouping, #1873 +DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8) +test(2257, DT[ , c("c", "a") := .(a + 1, NULL), by=id], error="it's not possible to delete parts of a column") From b5e1bc79353838ec0810626b25a946d0e89ee3ce Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 08:09:30 -0700 Subject: [PATCH 22/39] clean up style, R CMD check issues in testPattern= code (#6084) --- R/test.data.table.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 4908f7718..748e09512 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -151,7 +151,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", 
silent=F # runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the # full test expression string, i.e., not just limited to numeric literal test numbers. arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check - pd = setDT(utils::getParseData(parse(fn))) + pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE))) file_lines = readLines(fn) # NB: a call looks like (with id/parent tracking) # @@ -162,9 +162,15 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # ) # ## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call - test_calls = pd[pd[pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], list(call_lhs_id = id, call_id = x.parent), on=c(id='parent')], .(line1, id), on=c(id='call_id')] + test_calls = pd[ + pd[ + pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], + list(call_lhs_id=id, call_id=x.parent), + on=c(id='parent')], + list(line1, id), + on=c(id='call_id')] ## all the arguments for each call to test() - test_call_args = test_calls[pd[token == 'expr'], .(call_id = parent, arg_line = i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] + test_call_args = test_calls[pd[token == 'expr'], list(call_id=parent, arg_line=i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] ## 2nd argument is the num= argument test_num_expr = test_call_args[ , .SD[2L], by="call_id"] # NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so @@ -174,7 +180,11 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # setup_line1 # retain # setup_line2 # retain # test(keep, ...) # retain - intertest_ranges = test_calls[!id %in% keep_test_ids][test_calls[id %in% keep_test_ids], .(preceding_line = x.line1, test_start_line = i.line1), on='line1', roll=TRUE] + intertest_ranges = test_calls[!id %in% keep_test_ids][ + test_calls[id %in% keep_test_ids], + list(preceding_line=x.line1, test_start_line=i.line1), + on='line1', + roll=TRUE] # TODO(michaelchirico): this doesn't do well with tests inside control statements. # those could be included by looking for tests with parent!=0, i.e., not-top-level tests, # and including the full parent for such tests. omitting for now until needed. From fde7f43fbf7a8a6b52aee56e2dd41303bd2239f5 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 12 Apr 2024 09:02:31 -0700 Subject: [PATCH 23/39] link measure from the list of valid measure.vars values --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 6dd74291d..53919d359 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -27,7 +27,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } - \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} syntax; multiple patterns will produce multiple columns. 
} + \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } From 90abe53b9f29e9f28e2891337d0c5a05e083fa34 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 09:11:33 -0700 Subject: [PATCH 24/39] missing '}' --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 53919d359..44954e34c 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -27,7 +27,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } - \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). + \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). } } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. 
} From e78341e53934850d65054386d418e57bdbe4f280 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 13:54:51 -0700 Subject: [PATCH 25/39] Use options= to avoid failing to reset verbose (#6088) --- inst/tests/tests.Rraw | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d33bd72a6..886c1e635 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18232,17 +18232,20 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) j=1L -old = options(datatable.optimize=1L) -test(2243.41, dt[, .I[TRUE], x]$V1, 1:4) -test(2243.42, dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) -options(datatable.optimize=2L, datatable.verbose=TRUE) -test(2243.51, dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.52, dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.54, dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.55, dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") -options(old) +test(2243.41, options=c(datatable.optimize=1L), dt[, .I[TRUE], x]$V1, 1:4) +test(2243.42, options=c(datatable.optimize=1L), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) +test(2243.51, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.52, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.54, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.55, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") From 79a7f3555aafd827a2e8f41902daa642cd661eab Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 12 Apr 2024 16:58:35 -0700 Subject: [PATCH 26/39] measure supports cols arg (#6077) * measure supports cols arg * line break to avoid too wide NOTE * measure.vec.i for patterns * test numbering * test measure(pattern,cols) together * rm lcols * measure supports cols arg --- NEWS.md | 2 ++ R/fmelt.R | 18 ++++++++++-------- inst/tests/tests.Rraw | 7 +++++++ man/melt.data.table.Rd | 5 +++++ 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 27c35e385..3d5b2f81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ 9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. 
See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. +10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/fmelt.R b/R/fmelt.R index 23f07c552..5c50ca26c 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -107,17 +107,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na stopf("pattern must be character string") } match.vec = regexpr(pattern, cols, perl=TRUE) - measure.vec = which(0 < match.vec) - if (length(measure.vec) == 0L) { + measure.vec.i = which(0 < match.vec) + if (length(measure.vec.i) == 0L) { stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern") } - start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] + start = attr(match.vec, "capture.start")[measure.vec.i, , drop=FALSE] if (is.null(start)) { stopf("pattern must contain at least one capture group (parenthesized sub-pattern)") } err.args.groups("number of capture groups in pattern", ncol(start)) - end = attr(match.vec, "capture.length")[measure.vec,]+start-1L - names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) + end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L + measure.vec <- cols[measure.vec.i] + names.mat = matrix(measure.vec, nrow(start), ncol(start)) substr(names.mat, start, end) } else { #pattern not specified, so split using sep. 
if (!is.character(sep)) { @@ -130,10 +131,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } err.args.groups("max number of items after splitting column names", n.groups) - measure.vec = which(vector.lengths==n.groups) - do.call(rbind, list.of.vectors[measure.vec]) + measure.vec.i = which(vector.lengths==n.groups) + measure.vec = cols[measure.vec.i] + do.call(rbind, list.of.vectors[measure.vec.i]) } - err.names.unique("measured columns", cols[measure.vec]) + err.names.unique("measured columns", measure.vec) uniq.mat = unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 886c1e635..977b29b5c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3157,6 +3157,13 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) DT_missing_l_2 = data.table(num_1=1, num_2=2, list_1=list(1), list_3=list(3)) test(1035.0186, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", num=1, list=list(1))) test(1035.0187, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=FALSE), data.table(char=c("1","2","3"), num=c(1,2,NA), list=list(1,NA,3))) + # measure supports cols arg, #5063 + expected_without_value = data.table(num_1=1,num_2=2,prefix="list",char=c("1","3"),value=list(1,3)) + test(1035.0188, melt(DT_missing_l_2, measure.vars=measure(prefix, char, sep="_", cols=c("list_1","list_3"))), expected_without_value) + test(1035.0189, melt(DT_missing_l_2, measure.vars=measure(prefix, char, pattern="(.*)_(.*)", cols=c("list_1","list_3"))), expected_without_value) + expected_with_value = data.table(num_1=1,num_2=2,char=c("1","3"),list=list(1,3)) + test(1035.0190, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_", cols=c("list_1","list_3"))), expected_with_value) + test(1035.0191, melt(DT_missing_l_2, measure.vars=measure(value.name, char, pattern="(.*)_(.*)", cols=c("list_1","list_3"))), expected_with_value) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 44954e34c..ad4dfd8dd 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -154,6 +154,11 @@ melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, sep="_ # specifying columns to melt via regex. 
melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="(.)_(.)")) +melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="([dc])_(.)")) + +# cols arg of measure can be used if you do not want to use regex +melt(DT.missing.cols, measure.vars=measure( + value.name, number=as.integer, sep="_", cols=c("d_1","d_2","c_1"))) } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} From 33736725e8d1e48552248f5f8d63628b88ae913a Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 20:24:27 -0700 Subject: [PATCH 27/39] Documented test.list, made some formatting edits to what I documented yesterday (tick removals), added a link to the related atime vignette, tried to write in as much detail as I understand and included optional parameters for test.list --- inst/atime/tests.R | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index a02f07b52..3ef0f8bd7 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -1,8 +1,8 @@ # A function to customize R package metadata and source files to facilitate version-specific installation and testing. # -# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in `Makevars` and version checking code in `onLoad.R`) +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) # to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. -# It appends a SHA1 hash to the package name (`PKG.SHA`), ensuring each version can be installed and tested separately. +# It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately. # # @param old.Package Current name of the package. # @param new.Package New name of the package, including a SHA hash. @@ -17,10 +17,10 @@ # - NAMESPACE, changing namespace settings for dynamic linking. # # @examples -# pkg.edit.fun("data.table", "data.table.some_40_digit_SHA1_hash", "some_40_digit_SHA1_hash", "/path/to/data.table") +# pkg.edit.fun("data.table", "data.table.some_SHA1_hash", "some_SHA1_hash", "/path/to/data.table") # # @return None (performs in-place file modifications) -# @note This setup is typically unnecessary for most packages but essential for `data.table` due to its unique configuration. +# @note This setup is typically unnecessary for most packages but essential for data.table due to its unique configuration. pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { pkg_find_replace <- function(glob, FIND, REPLACE) { atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) @@ -54,6 +54,22 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { paste0('useDynLib(', new.Package_)) } +# A list of performance tests. +# +# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments: +# - N: A numeric sequence of data sizes to vary. +# - setup: An expression evaluated for every data size before measuring time/memory. +# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. +# This must call a function from data.table using a syntax with double or triple colon prefix. 
+# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. +# (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table) +# +# Optional parameters that may be useful to configure tests: +# - times: Number of times each expression is evaluated (default is 10). +# - seconds.limit: The maximum median timing (in seconds) of an expression. No timings for larger N are computed past that threshold. +# - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions. +# For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally). +# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 From a65b08873d3145a0576a545a358d0e1e9f8b69d0 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 20:25:47 -0700 Subject: [PATCH 28/39] Temporarily removing the path filters to run a final check for the current commits to be working (tested locally, but just to ensure..) --- .github/workflows/performance-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 9c8cc664f..5750bf209 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -7,10 +7,7 @@ on: types: - opened - reopened - - synchronize - paths: - - 'R/**' - - 'src/**' + - synchronize jobs: comment: From 1d151e0cd63970fcd6db2a1e31f2a575a8c46459 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 20:46:15 -0700 Subject: [PATCH 29/39] Added back the path filters now that I'm confirmed the commit SHAs are working as expected. --- .github/workflows/performance-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 5750bf209..2fc3a76f5 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -7,7 +7,10 @@ on: types: - opened - reopened - - synchronize + - synchronize + paths: + - 'R/**' + - 'src/**' jobs: comment: From 7268eff60180dac38ecc4b079021a020d3e82db3 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Fri, 12 Apr 2024 21:16:56 -0700 Subject: [PATCH 30/39] Add "na.print" as a new argument to "print.data.table" (#6087) * Added naprint argument to print.data.table * Added corresponding documentation * Simple tests * changed tests, added for when quote=true * Update man/print.data.table.Rd Co-authored-by: Michael Chirico * updated NEWS.md * added tests * review suggestions --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/print.data.table.R | 9 +++++---- inst/tests/tests.Rraw | 17 +++++++++++++++++ man/print.data.table.Rd | 2 ++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3d5b2f81c..bc5147107 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,8 @@ 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). 
Thanks @MichaelChirico for the report and @joshhwuu for the fix. +12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/print.data.table.R b/R/print.data.table.R index dd641f946..7f351fd8d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -8,6 +8,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), print.keys=getOption("datatable.print.keys"), trunc.cols=getOption("datatable.print.trunc.cols"), quote=FALSE, + na.print=NULL, timezone=FALSE, ...) { # topn - print the top topn and bottom topn rows with '---' inbetween (5) # nrows - under this the whole (small) table is printed, unless topn is provided (100) @@ -118,9 +119,9 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), } rownames(toprint) = format(rownames(toprint), justify="right") if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) } else { - print(toprint, right=TRUE, quote=quote) + print(toprint, right=TRUE, quote=quote, na.print=na.print) } if (trunc.cols && length(not_printed) > 0L) # prints names of variables not shown in the print @@ -133,9 +134,9 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # option to shut this off per request of Oleg Bondar on SO, #1482 toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) } else { - print(toprint, right=TRUE, quote=quote) + print(toprint, right=TRUE, quote=quote, na.print=na.print) } if (trunc.cols && length(not_printed) > 0L) # prints names of variables not shown in the print diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 977b29b5c..28532eb59 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18464,3 +18464,20 @@ test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), outpu # helpful error about deleting during grouping, #1873 DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8) test(2257, DT[ , c("c", "a") := .(a + 1, NULL), by=id], error="it's not possible to delete parts of a column") + +# testing printing data.tables with na.print, #3152 +DT = data.table(x=c(NA, "a", "b")) +test(2258.1, capture.output(print(DT, na.print=".")), c(" x", "1: .", "2: a", "3: b")) +test(2258.2, capture.output(print(DT, na.print="_")), c(" x", "1: _", "2: a", "3: b")) +test(2258.3, capture.output(print(DT, na.print="NA")), c(" x", "1: NA", "2: a", "3: b")) +test(2258.4, capture.output(print(DT, na.print=TRUE)), error="invalid 'na.print' specification") +test(2258.5, capture.output(print(DT, na.print=".", quote=TRUE)), c(' "x"', "1: .", '2: "a"', '3: "b"')) +test(2258.6, capture.output(print(DT, na.print=".", right=TRUE)), c(" x", "1: .", "2: a", "3: b")) +# tests for other call sites +# col.names="none" +test(2258.7, capture.output(print(DT, na.print=".", col.names="none")), c("1: .", "2: a", "3: b")) +# table requires splitting, col.names="none" +DT = data.table(x = c(NA, "e", "b", "j", "w", NA)) +test(2258.8, capture.output(print(DT, na.print=".", topn=2, col.names="none")), c(" 1: .", " 2: e", "--- 
", " 5: w", " 6: .")) +# table requires splitting, col.names!="none" +test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: .", " 2: e", "--- ", " 5: w", " 6: .")) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index a39c8c446..f740de9d9 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -26,6 +26,7 @@ print.keys=getOption("datatable.print.keys"), # default: TRUE trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, + na.print=NULL, timezone=FALSE, \dots) format_col(x, \dots) @@ -47,6 +48,7 @@ \item{trunc.cols}{ If \code{TRUE}, only the columns that can be printed in the console without wrapping the columns to new lines will be printed (similar to \code{tibbles}). } \item{quote}{ If \code{TRUE}, all output will appear in quotes, as in \code{print.default}. } \item{timezone}{ If \code{TRUE}, time columns of class POSIXct or POSIXlt will be printed with their timezones (if attribute is available). } + \item{na.print}{ The string to be printed in place of \code{NA} values, as in \code{print.default}. } \item{\dots}{ Other arguments ultimately passed to \code{format}. } } \value{ From 27adaad7fb55687df4920a458564817d50f30564 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 22:29:06 -0700 Subject: [PATCH 31/39] Made the changes Michael suggested --- inst/atime/tests.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 3ef0f8bd7..83bb34ecf 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -79,7 +79,7 @@ test.list <- list( setup = quote({ set.seed(1L) dt <- data.table(a = sample(N, N)) - setindex(dt, a) + setindexv(dt, "a") }), expr = quote(data.table:::shallow(dt)), Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. 
(This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) @@ -92,16 +92,16 @@ test.list <- list( "Test regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), - expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), setup = quote({ n <- N/100 - set.seed(1L) + set.seed(2L) dt <- data.table( g = sample(seq_len(n), N, TRUE), x = runif(N), key = "g") dt_mod <- copy(dt) }), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Commit preceding the regression causing commit (https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) From eaa70106493be5ad4b82052af82f64321889e652 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 22:30:25 -0700 Subject: [PATCH 32/39] Oops forgot one Co-authored-by: Michael Chirico --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 83bb34ecf..68770347f 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -78,7 +78,7 @@ test.list <- list( N = 10^seq(3,8), setup = quote({ set.seed(1L) - dt <- data.table(a = sample(N, N)) + dt <- data.table(a = sample.int(N)) setindexv(dt, "a") }), expr = quote(data.table:::shallow(dt)), From c823c615dd31237e3525da75ff8f3e9fef9c9016 Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 16 Apr 2024 13:37:48 -0700 Subject: [PATCH 33/39] Made the suggested changes that Toby and I discussed this morning --- inst/atime/tests.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 68770347f..a0635d063 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -82,7 +82,7 @@ test.list <- list( setindexv(dt, "a") }), expr = quote(data.table:::shallow(dt)), - Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. 
(This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) + # Before = "", This needs to be updated later as there are two issues here: A) The source of regression (or the particular commit that led to it) is not clear; B) Older versions of data.table are having problems when being installed in this manner (This includes commits from before March 20 2020, when the issue that discovered or first mentioned the regression was created) Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) @@ -102,7 +102,7 @@ test.list <- list( dt_mod <- copy(dt) }), expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), - Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Commit preceding the regression causing commit (https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) + Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) From 47a7f52222ee1a7062701d637a346480d838ef37 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Tue, 16 Apr 2024 17:44:57 -0700 Subject: [PATCH 34/39] Refactor calls to "print.default" within "print.data.table" (#6091) * refactor calls to print.default * better approach? 
* refactor prints using internal helper * review change * brace not needed --------- Co-authored-by: Michael Chirico --- R/print.data.table.R | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/R/print.data.table.R b/R/print.data.table.R index 7f351fd8d..9e33e0c4d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -110,6 +110,13 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # When nrow(toprint) = 1, attributes get lost in the subset, # function below adds those back when necessary toprint = toprint_subset(toprint, cols_to_print) + trunc.cols <- length(not_printed) > 0L + } + print_default = function(x) { + if (col.names != "none") cut_colnames = identity + cut_colnames(print(x, right=TRUE, quote=quote, na.print=na.print)) + # prints names of variables not shown in the print + if (trunc.cols) trunc_cols_message(not_printed, abbs, class, col.names) } if (printdots) { if (isFALSE(row.names)) { @@ -118,30 +125,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) } rownames(toprint) = format(rownames(toprint), justify="right") - if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) - } else { - print(toprint, right=TRUE, quote=quote, na.print=na.print) - } - if (trunc.cols && length(not_printed) > 0L) - # prints names of variables not shown in the print - trunc_cols_message(not_printed, abbs, class, col.names) - + print_default(toprint) return(invisible(x)) } if (nrow(toprint)>20L && col.names == "auto") # repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them # option to shut this off per request of Oleg Bondar on SO, #1482 - toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 - if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) - } else { - print(toprint, right=TRUE, quote=quote, na.print=na.print) - } - if (trunc.cols && length(not_printed) > 0L) - # prints names of variables not shown in the print - trunc_cols_message(not_printed, abbs, class, col.names) - + toprint = rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 + print_default(toprint) invisible(x) } From 304aed5697aaf30fc0bd20ca5e35fe8b285eff82 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 09:17:36 -0700 Subject: [PATCH 35/39] sep= works with by= approach to splitting (#6028) --- NEWS.md | 2 ++ R/data.table.R | 6 ++++-- inst/tests/tests.Rraw | 5 +++++ man/split.Rd | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index bc5147107..e30849114 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,8 @@ 10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. +11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). + ## BUG FIXES 1. `unique()` returns a copy in the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). 
This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 89309e58b..e0cddb38f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2452,9 +2452,11 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR dtq[["i"]] = quote(levs) join = TRUE } + dots = list(...) + if (!"sep" %chin% names(dots)) dots$sep = "." dtq[["j"]] = substitute( - list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=".")), - list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) + list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=.sep)), + list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD"), .sep = dots$sep) ) dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. .expr, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 28532eb59..e00c4ac6e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18481,3 +18481,8 @@ DT = data.table(x = c(NA, "e", "b", "j", "w", NA)) test(2258.8, capture.output(print(DT, na.print=".", topn=2, col.names="none")), c(" 1: .", " 2: e", "--- ", " 5: w", " 6: .")) # table requires splitting, col.names!="none" test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: .", " 2: e", "--- ", " 5: w", " 6: .")) + +# split(by = ., sep = ..) works like split(f= ., sep = ..), #5417 +x = data.table(rep(1:2, each=5L), 1:5, 1:10) +test(2259.1, names(split(x, by = c("V1", "V2"), sep = "|")), sort(names(split(x, list(x$V1, x$V2), sep = "|")))) +test(2259.2, names(split(x, by = c("V1", "V2"), sep = "||")), sort(names(split(x, list(x$V1, x$V2), sep = "||")))) diff --git a/man/split.Rd b/man/split.Rd index 687771f0c..eedbe7d67 100644 --- a/man/split.Rd +++ b/man/split.Rd @@ -18,7 +18,7 @@ \item{sorted}{When default \code{FALSE} it will retain the order of groups we are splitting on. When \code{TRUE} then sorted list(s) are returned. Does not have effect for \code{f} argument.} \item{keep.by}{logical default \code{TRUE}. Keep column provided to \code{by} argument.} \item{flatten}{logical default \code{TRUE} will unlist nested lists of data.tables. When using \code{f} results are always flattened to list of data.tables.} - \item{\dots}{passed to data.frame way of processing when using \code{f} argument.} + \item{\dots}{When using \code{f}, passed to \code{\link[base:split]{split.data.frame}}. When using \code{by}, \code{sep} is recognized as with the default method.} \item{verbose}{logical default \code{FALSE}. When \code{TRUE} it will print to console data.table split query used to split data.} } \details{ From b9d51f1034cc0fdd06d324e62bdd05b9dcb84cef Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 09:32:26 -0700 Subject: [PATCH 36/39] Tests robust to locale sorting (#6074) * Tests robust to locale sorting * NEWS * Comment describing helper * add a TODO for our future selves --- NEWS.md | 4 +++- inst/tests/tests.Rraw | 53 +++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index e30849114..b14b9491c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,7 +74,9 @@ 9. 
`print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. -10. `test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. +10. `test.data.table()` runs robustly: + + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. + + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix. 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e00c4ac6e..e2791ed5d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -192,6 +192,16 @@ base_messages = list( NULL ) +# Ensure an operation uses C-locale sorting (#3502). For test set-ups/comparisons that use base operations, which are +# susceptible to locale-specific sorting issues, but shouldn't be needed for data.table code, which always uses C sorting. +# TODO(R>=3.3.0): use order(method="radix") as a way to avoid needing this helper +with_c_collate = function(expr) { + old = Sys.getlocale("LC_COLLATE") + on.exit(Sys.setlocale("LC_COLLATE", old)) + Sys.setlocale("LC_COLLATE", "C") + expr +} + ########################## .do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. 
test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc ########################## @@ -1834,10 +1844,10 @@ test(609, chorder(character()), base::order(character())) test(610, chorder(""), base::order("")) # Extra tests of chorder and chgroup x = sample(LETTERS) -test(610.1, chorder(x), base::order(x)) +test(610.1, chorder(x), with_c_collate(base::order(x))) test(610.2, chgroup(x), seq_along(x)) x = sample(LETTERS,1000,replace=TRUE) -test(610.3, chorder(x), base::order(x)) +test(610.3, chorder(x), with_c_collate(base::order(x))) test(610.4, unique(x[chgroup(x)]), unique(x)) # := by group @@ -3612,34 +3622,37 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, test(1102.12, dcast(DT, "a ~ c ", value.var="b"), error="not found or of unknown type") test(1102.13, dcast(DT, a ~ a, value.var="c"), error="are not found in 'data'") + # NB: for 1102.{14,15,16}, always supply levels for letters in setup data for locale robustness (#3502) + # fix for #47 - issue when factor columns on formula LHS along with `drop=FALSE` set.seed(1L) - DT = data.table(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]), - b=factor(sample(tail(letters, 5), 10, replace=TRUE))) + DT = data.table(a=factor(sample(letters[1:3], 10L, replace=TRUE), levels=letters[1:5]), + b=factor(sample(letters[22:26], 10L, replace=TRUE), levels=letters[22:26])) test(1102.14, dcast(DT, a~b, drop=FALSE, fun.aggregate=length, value.var="b"), - data.table(a=factor(letters[1:5]), v=INT(0,1,0,0,0), w=INT(1,1,1,0,0), x=INT(0,0,1,0,0), y=INT(2,1,1,0,0), z=INT(0,1,0,0,0), key="a")) + data.table(a=factor(letters[1:5], levels=letters[1:5]), v=INT(0,1,0,0,0), w=INT(1,1,1,0,0), x=INT(0,0,1,0,0), y=INT(2,1,1,0,0), z=INT(0,1,0,0,0), key="a")) # reverse the levels set.seed(1L) - DT = data.table(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[5:1]), - b=factor(sample(tail(letters, 5), 10, replace=TRUE))) + DT = data.table(a=factor(sample(letters[1:3], 10L, replace=TRUE), levels=letters[5:1]), + b=factor(sample(letters[22:26], 10L, replace=TRUE), levels=letters[22:26])) test(1102.15, dcast(DT, a~b, drop=FALSE, value.var="b", fun.aggregate=length), - data.table(a=factor(c("e","d","c","b","a"),levels=levels(DT$a)), v=INT(0,0,0,1,0), w=INT(0,0,1,1,1), x=INT(0,0,1,0,0), y=INT(0,0,1,1,2), z=INT(0,0,0,1,0), key="a")) + data.table(a=factor(c("e","d","c","b","a"), levels=levels(DT$a)), v=INT(0,0,0,1,0), w=INT(0,0,1,1,1), x=INT(0,0,1,0,0), y=INT(0,0,1,1,2), z=INT(0,0,0,1,0), key="a")) # more factor cols set.seed(1L) - DT = data.table(a1=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]), # factor col 1 - a2=factor(sample(letters[6:10], 10, replace=TRUE), letters[6:10]), # factor col 2 - a3=sample(letters[1:3], 10, TRUE), # no factor - b=factor(sample(tail(letters, 5), 10, replace=TRUE))) + DT = data.table(a1=factor(sample(letters[1:3], 10L, replace=TRUE), levels=letters[1:5]), # factor col 1 + a2=factor(sample(letters[6:10], 10L, replace=TRUE), levels=letters[6:10]), # factor col 2 + a3=sample(letters[1:3], 10L, TRUE), # no factor + b=factor(sample(letters[22:26], 10L, replace=TRUE), levels=letters[22:26])) test(1102.16, dcast(DT, a1+a2+a3~b, drop=FALSE, value.var="b")[c(1,21,.N)], - data.table(a1=factor(c("a","b","e"),levels=letters[1:5]), + data.table(a1=factor(c("a","b","e"), levels=letters[1:5]), a2=factor(c("f","g","j"), levels=letters[6:10]), a3=c("a","c","c"), - v=factor(NA, levels=tail(letters,5)), - x=factor(NA, levels=tail(letters,5)), - y=factor(c(NA,"y",NA), levels=tail(letters,5)), - 
z=factor(NA, levels=tail(letters,5)), key=c("a1", "a2", "a3"))) + v=factor(NA, levels=letters[22:26]), + w=factor(NA, levels=letters[22:26]), + x=factor(NA, levels=letters[22:26]), + y=factor(c(NA,"y",NA), levels=letters[22:26]), + z=factor(NA, levels=letters[22:26]), key=c("a1", "a2", "a3"))) # dcast bug fix for 'subset' argument (it doesn't get key set before to run C-fcast): DT = data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA)) @@ -4490,7 +4503,7 @@ for (nvars in seq_along(names(DT))) { } }) )) - test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with_c_collate(with(DT, eval(ll)))) } integer() }) @@ -4759,11 +4772,11 @@ for (i in seq_along(names(DT))) { }) )) ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA - test(1252.0 + test_no*0.001, ans1, with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1, with_c_collate(with(DT, eval(ll)))) test_no <<- test_no + 1L ll <- as.call(c(as.list(ll), na.last=NA)) ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here. - test(1252.0 + test_no*0.001, ans1[ans1 != 0], with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1[ans1 != 0], with_c_collate(with(DT, eval(ll)))) }) dim(tmp)=NULL list(tmp) From 6f3fc8dcd37ac6976050b290e37f85762d0bccb5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 11:04:15 -0700 Subject: [PATCH 37/39] New regression test re: column plonk in magrittr pipeline (#6090) * new regression test re: column plonk in magrittr pipeline * simplify --- inst/tests/other.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 88593bcdf..99169809f 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -722,3 +722,8 @@ if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure } } +if (loaded[["dplyr"]]) { + # regression test for converting character->list column in a magrittr (dplyr) pipe, #2651 + DT = data.table(a = 1, b = 2, c = '1,2,3,4', d = 4) + test(30, DT[, c := strsplit(c, ',', fixed = TRUE) %>% lapply(as.integer) %>% as.list]$c, list(1:4)) +} From 54d6b966ada6950d8977fa80628e2c0dc727c31d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 11:14:04 -0700 Subject: [PATCH 38/39] Expand exclusion array (which isn't valid) (#6096) --- .github/workflows/R-CMD-check-occasional.yaml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml index 1358f0538..cd0fec0bd 100644 --- a/.github/workflows/R-CMD-check-occasional.yaml +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -16,9 +16,15 @@ jobs: os: [macOS-latest, windows-latest, ubuntu-latest] r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3'] locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502) - exclude: - - os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI on Ubuntu - locale: ['zh_CN.utf8', 'lv_LV.utf8'] + exclude: # only run non-English locale CI on Ubuntu + - os: macOS-latest + locale: 'zh_CN.utf8' + - os: macOS-latest + locale: 'lv_LV.utf8' + - os: windows-latest + locale: 'zh_CN.utf8' + - os: windows-latest + locale: 'lv_LV.utf8' env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 8ae1b2d343258e22bd81dfb5bb411eb28b387d5c Mon Sep 17 00:00:00 
2001 From: Michael Chirico Date: Fri, 19 Apr 2024 13:12:25 -0700 Subject: [PATCH 39/39] Be sure to test 'other' in occasional suite (#6095) --- .github/workflows/R-CMD-check-occasional.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml index cd0fec0bd..9a5f48277 100644 --- a/.github/workflows/R-CMD-check-occasional.yaml +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -28,6 +28,7 @@ jobs: env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + TEST_DATA_TABLE_WITH_OTHER_PACKAGES: true GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps:
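
For quick reference, a minimal usage sketch of the two user-facing changes in the patches above: `na.print=` in `print.data.table` and `sep=` honored by `split.data.table` when `by=` is used. The example data and the indicated group names are illustrative only and assume default print options:

```r
library(data.table)

# na.print= is forwarded to print.default, so NA entries can be displayed as any string
DT = data.table(x = c(NA, "a", "b"))
print(DT, na.print = ".")  # the NA in row 1 is shown as "." instead of "NA"

# sep= now takes effect together with by=, mirroring split.data.frame(..., sep=)
DT2 = data.table(g1 = rep(1:2, each = 2L), g2 = rep(1:2, times = 2L), v = 1:4)
names(split(DT2, by = c("g1", "g2"), sep = "|"))
# group names are joined with "|", i.e. "1|1" "1|2" "2|1" "2|2" here
```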