From 96e89fad997cb07d322ac6c68e2f6eac92122b65 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 7 Dec 2024 00:08:14 +0800 Subject: [PATCH] Allow double-integer64 joins when double is in (integer32 , integer64] range (#6626) * Allow double-integer64 joins when double is in (integer32 , integer64] range * rename R-side argument for readability? * Totally drop isReallyReal, just use isRealReallyInt with flavors for 32/64 * Error: flip result when changing to isRealReallyInt* * Further simplify -- first* helpers not needed if we just return bool * logical inversion * Subtle difference vs isReallyReal (type check) * Same subtle difference in .prepareFastSubset * .prepareFastSubset fix * fix test output * Add duplicate bug number to NEWS * amend a new call site for isReallyReal * fix botched merge * add codecov test * add non exported function * isRealReallyInt -> fitsInInt --------- Co-authored-by: Benjamin Schwendinger --- NEWS.md | 2 ++ R/IDateTime.R | 6 ++-- R/bmerge.R | 12 ++++---- R/data.table.R | 6 ++-- R/wrappers.R | 4 +-- inst/tests/tests.Rraw | 67 ++++++++++++++++++++++++++----------------- src/between.c | 4 +-- src/data.table.h | 7 +++-- src/forder.c | 6 ++-- src/frollR.c | 2 +- src/init.c | 4 +-- src/utils.c | 34 ++++++++++++++-------- 12 files changed, 90 insertions(+), 64 deletions(-) diff --git a/NEWS.md b/NEWS.md index e9b89dc115..508833e25f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -117,6 +117,8 @@ rowwiseDT( 15. The auto-printing suppression in `knitr` documents is now done by implementing a method for `knit_print` instead of looking up the call stack, [#6589](https://github.com/Rdatatable/data.table/pull/6589). Thanks to @jangorecki for the report [#6509](https://github.com/Rdatatable/data.table/issues/6509) and @aitap for the fix. +16. Joins of `integer64` and `double` columns succeed when the `double` column has lossless `integer64` representation, [#4167](https://github.com/Rdatatable/data.table/issues/4167) and [#6625](https://github.com/Rdatatable/data.table/issues/6625). Previously, this only worked when the double column had lossless _32-bit_ integer representation. Thanks @MichaelChirico for the reports and fix. + ## NOTES 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181). diff --git a/R/IDateTime.R b/R/IDateTime.R index 84c8f470a9..33ed3c62bd 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -99,9 +99,9 @@ round.IDate = function(x, digits=c("weeks", "months", "quarters", "years"), ...) # TODO: investigate Ops.IDate method a la Ops.difftime if (inherits(e1, "difftime") || inherits(e2, "difftime")) internal_error("difftime objects may not be added to IDate, but Ops dispatch should have intervened to prevent this") # nocov - if (isReallyReal(e1) || isReallyReal(e2)) { + # IDate doesn't support fractional days; revert to base Date + if ((is.double(e1) && !fitsInInt32(e1)) || (is.double(e2) && !fitsInInt32(e2))) { return(`+.Date`(e1, e2)) - # IDate doesn't support fractional days; revert to base Date } if (inherits(e1, "Date") && inherits(e2, "Date")) stopf("binary + is not defined for \"IDate\" objects") @@ -120,7 +120,7 @@ round.IDate = function(x, digits=c("weeks", "months", "quarters", "years"), ...) if (inherits(e2, "difftime")) internal_error("difftime objects may not be subtracted from IDate, but Ops dispatch should have intervened to prevent this") # nocov - if ( isReallyReal(e2) ) { + if ( is.double(e2) && !fitsInInt32(e2) ) { # IDate deliberately doesn't support fractional days so revert to base Date return(base::`-.Date`(as.Date(e1), e2)) # can't call base::.Date directly (last line of base::`-.Date`) as tried in PR#3168 because diff --git a/R/bmerge.R b/R/bmerge.R index 317fe2f645..f814615740 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -4,7 +4,7 @@ mergeType = function(x) { ans = typeof(x) if (ans=="integer") { if (is.factor(x)) ans = "factor" } else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" } - # do not call isReallyReal(x) yet because i) if both types are double we don't need to coerce even if one or both sides + # do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides # are int-as-double, and ii) to save calling it until we really need it ans } @@ -103,23 +103,23 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (x_merge_type=="integer64" || i_merge_type=="integer64") { nm = c(iname, xname) if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce - if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) { - if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L]) + if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) { + if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which has integer64 representation, e.g. no fractions)" else "", nm[2L]) set(w, j=wc, value=bit64::as.integer64(w[[wc]])) - } else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L]) + } else stopf("Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)", nm[2L], nm[1L]) } else { # just integer and double left ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602 if (i_merge_type=="double") { coerce_x = FALSE - if (!isReallyReal(i[[icol]])) { + if (fitsInInt32(i[[icol]])) { coerce_x = TRUE # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys # we've always coerced to int and returned int, for convenience. if (length(ic_idx)>1L) { xc_idx = xcols[ic_idx] for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) { - if (isReallyReal(x[[xb]])) { + if (!fitsInInt32(x[[xb]])) { coerce_x = FALSE break } diff --git a/R/data.table.R b/R/data.table.R index 8fcc0df1ff..6594cb928c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3241,9 +3241,9 @@ is_constantish = function(q, check_singleton=FALSE) { if (length(RHS) != nrow(x)) stopf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x)) return(NULL) # DT[colA == colB] regular element-wise vector scan } - if ( mode(x[[col]]) != mode(RHS) || # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric') - is.factor(x[[col]])+is.factor(RHS) == 1L || # but factor is also mode 'numeric' so treat that separately - is.integer(x[[col]]) && isReallyReal(RHS) ) { # and if RHS contains fractions then don't optimize that as bmerge truncates the fractions to match to the target integer type + if ( (mode(x[[col]]) != mode(RHS)) || # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS&fitsInInt32 are optimized (both sides mode 'numeric') + (is.factor(x[[col]])+is.factor(RHS) == 1L) || # but factor is also mode 'numeric' so treat that separately + (is.integer(x[[col]]) && is.double(RHS) && !fitsInInt32(RHS)) ) { # and if RHS contains fractions then don't optimize that as bmerge truncates the fractions to match to the target integer type # re-direct non-matching type cases to base R, as data.table's binary # search based join is strict in types. #957, #961 and #1361 # the mode() checks also deals with NULL since mode(NULL)=="NULL" and causes this return, as one CRAN package (eplusr 0.9.1) relies on diff --git a/R/wrappers.R b/R/wrappers.R index e93f044b17..80b7a64e99 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -17,7 +17,7 @@ colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(Ccoln testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -isRealReallyInt = function(x) .Call(CisRealReallyIntR, x) -isReallyReal = function(x) .Call(CisReallyReal, x) +fitsInInt32 = function(x) .Call(CfitsInInt32R, x) +fitsInInt64 = function(x) .Call(CfitsInInt64R, x) coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 84d1c8ee82..3d8d2fabfe 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -56,8 +56,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { internal_error = data.table:::internal_error is_na = data.table:::is_na is.sorted = data.table:::is.sorted - isReallyReal = data.table:::isReallyReal - isRealReallyInt = data.table:::isRealReallyInt + fitsInInt32 = data.table:::fitsInInt32 + fitsInInt64 = data.table:::fitsInInt64 is_utc = data.table:::is_utc melt.data.table = data.table:::melt.data.table # for test 1953.4 messagef = data.table:::messagef @@ -7356,22 +7356,23 @@ if (test_bit64) { X = list(a = 4:1, b=runif(4)) test(1513, setkey(as.data.table(X), a), setDT(X, key="a")) -# Adding tests for `isReallyReal` +# Adding tests for `fitsInInt32` x = as.numeric(sample(10)) -test(1514.1, isReallyReal(x), 0L) +test(1514.1, fitsInInt32(x), TRUE) x = as.numeric(sample(c(1:5, NA))) -test(1514.2, isReallyReal(x), 0L) # NAs in numeric can be coerced to integer NA without loss +test(1514.2, fitsInInt32(x), TRUE) # NAs in numeric can be coerced to integer NA without loss x = c(1:2, NaN, NA) -test(1514.3, isReallyReal(x), 3L) +test(1514.3, fitsInInt32(x), FALSE) x = c(1:2, Inf, NA) -test(1514.4, isReallyReal(x), 3L) +test(1514.4, fitsInInt32(x), FALSE) x = c(1:2, -Inf, NA) -test(1514.5, isReallyReal(x), 3L) +test(1514.5, fitsInInt32(x), FALSE) x = runif(2) -test(1514.6, isReallyReal(x), 1L) +test(1514.6, fitsInInt32(x), FALSE) x = numeric() -test(1514.7, isReallyReal(x), 0L) -test(1514.8, isReallyReal(9L), 0L) +test(1514.7, fitsInInt32(x), TRUE) +test(1514.8, fitsInInt32(9L), FALSE) # b/c not double input +test(1514.9, fitsInInt64(9L), FALSE) # #1091 options(datatable.prettyprint.char = 5L) @@ -15150,9 +15151,9 @@ if (test_bit64) { output = "Coercing integer column x.int to type integer64 to match type of i.int64") test(2044.67, dt1[dt2, ..cols, on="doubleInt==int64", nomatch=0L, verbose=TRUE], ans, - output = "Coercing double column x.doubleInt (which contains no fractions) to type integer64 to match type of i.int64") + output = "Coercing double column x.doubleInt (which has integer64 representation, e.g. no fractions) to type integer64 to match type of i.int64") test(2044.68, dt1[dt2, ..cols, on="realDouble==int64", nomatch=0L, verbose=TRUE], - error="Incompatible join types: i.int64 is type integer64 but x.realDouble is type double and contains fractions") + error="Incompatible join types: i.int64 is type integer64 but x.realDouble is type double and cannot be coerced to integer64") # int64 in x test(2044.69, dt1[dt2, ..cols, on="int64==int", nomatch=0L, verbose=TRUE], ans<-data.table(x.int=1:5, x.doubleInt=as.double(1:5), x.realDouble=c(0.5,1.0,1.5,2.0,2.5), x.int64=as.integer64(1:5), @@ -15160,9 +15161,9 @@ if (test_bit64) { output = "Coercing integer column i.int to type integer64 to match type of x.int64") test(2044.70, dt1[dt2, ..cols, on="int64==doubleInt", nomatch=0L, verbose=TRUE], ans, - output = "Coercing double column i.doubleInt (which contains no fractions) to type integer64 to match type of x.int64") + output = "Coercing double column i.doubleInt (which has integer64 representation, e.g. no fractions) to type integer64 to match type of x.int64") test(2044.71, dt1[dt2, ..cols, on="int64==realDouble", nomatch=0L, verbose=TRUE], - error="Incompatible join types: x.int64 is type integer64 but i.realDouble is type double and contains fractions") + error="Incompatible join types: x.int64 is type integer64 but i.realDouble is type double and cannot be coerced to integer64") } # coercion of all-NA dt1 = data.table(a=1, b=NA_character_) @@ -17603,18 +17604,18 @@ test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), # 2205 tested nanotime moved to other.Rraw 27, #5516 -# isRealReallyInt, #3966 -test(2206.01, isRealReallyInt(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE) -test(2206.02, isRealReallyInt(2147483648.0), FALSE) # >INT_MAX -test(2206.03, isRealReallyInt(-2147483648.0), FALSE) # <=INT_MIN since INT_MIN==NA_integer_ -test(2206.04, isRealReallyInt(c(5,-5,2147483648)), FALSE) # test real last position -test(2206.05, isRealReallyInt(NaN), FALSE) -test(2206.06, isRealReallyInt(+Inf), FALSE) -test(2206.07, isRealReallyInt(-Inf), FALSE) -test(2206.08, isRealReallyInt(0.1), FALSE) -test(2206.09, isRealReallyInt(numeric()), TRUE) -test(2206.10, isRealReallyInt(9L), FALSE) # must be type double -test(2206.11, isRealReallyInt(integer()), FALSE) +# fitsInInt32, #3966 +test(2206.01, fitsInInt32(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE) +test(2206.02, fitsInInt32(2147483648.0), FALSE) # >INT_MAX +test(2206.03, fitsInInt32(-2147483648.0), FALSE) # <=INT_MIN since INT_MIN==NA_integer_ +test(2206.04, fitsInInt32(c(5,-5,2147483648)), FALSE) # test real last position +test(2206.05, fitsInInt32(NaN), FALSE) +test(2206.06, fitsInInt32(+Inf), FALSE) +test(2206.07, fitsInInt32(-Inf), FALSE) +test(2206.08, fitsInInt32(0.1), FALSE) +test(2206.09, fitsInInt32(numeric()), TRUE) +test(2206.10, fitsInInt32(9L), FALSE) # must be type double +test(2206.11, fitsInInt32(integer()), FALSE) # dcast supports complex value to cast, #4855 DT = CJ(x=1:3, y=letters[1:2]) @@ -20664,3 +20665,15 @@ test(2299.09, format_list_item(data.table(a=numeric(), b=numeric())), output="= (double)INT64_MIN; } -static R_xlen_t firstNonInt(SEXP x) { +// used to error if not passed type double but this needed extra is.double() calls in calling R code +// which needed a repeat of the argument. Hence simpler and more robust to return false when not type double. +bool fitsInInt32(SEXP x) { + if (!isReal(x)) + return false; R_xlen_t n=xlength(x), i=0; const double *dx = REAL(x); while (i