Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into rbindlist_encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico committed Dec 3, 2024
2 parents efaf11b + 23dac21 commit f85272b
Show file tree
Hide file tree
Showing 32 changed files with 4,182 additions and 102 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.dir-locals.el
.check.translations.R
^\.Rprofile$
^data\.table_.*\.tar\.gz$
^config\.log$
Expand Down
2 changes: 1 addition & 1 deletion .ci/.lintr.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ exclusions = c(local({
infix_spaces_linter = Inf,
undesirable_function_linter = Inf
)),
exclusion_for_dir("vignettes", list(
exclusion_for_dir(c("vignettes", "vignettes/fr"), list(
quotes_linter = Inf,
sample_int_linter = Inf
# strings_as_factors_linter = Inf
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pkgup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
Rscript -e 'tools::write_PACKAGES("public/src/contrib", fields="Revision")'
- name: upload
if: github.ref == 'refs/heads/master'
uses: actions/upload-pages-artifact@v4
uses: actions/upload-pages-artifact@v3
with:
path: "public"
- name: deploy
Expand Down
5 changes: 0 additions & 5 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,6 @@ integration:
- R --version
- *install-deps ## markdown pkg not present in r-pkgdown image
- mkdir -p ./pkgdown/favicon/ && cp .graphics/favicon/* ./pkgdown/favicon/ ## copy favicons
- rm -rf ./vignettes ## r-lib/pkgdown#2383
- Rscript -e 'pkgdown::build_site(override=list(destination="./website"))'
## html manual, vignettes, repos, cran_web, cran_checks
- echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile
Expand Down Expand Up @@ -388,10 +387,6 @@ integration:
- Rscript -e 'check.index("data.table", names(test.jobs))'
## web/checks/check_flavors.html
- Rscript -e 'check.flavors(names(test.jobs))'
## pkgdown vignettes workaround r-lib/pkgdown#2383
- mkdir -p website/articles
- cp bus/integration/cran/library/data.table/doc/*.html website/articles/.
- rm website/articles/index.html
## pkgdown merge
- Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("website","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")'
- mv website/* bus/integration/cran/
Expand Down
8 changes: 7 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,11 @@ Authors@R: c(
person("Ivan", "Krylov", role="ctb"),
person("Angel", "Feliz", role="ctb"),
person("Michael","Young", role="ctb"),
person("Mark", "Seeto", role="ctb")
person("Mark", "Seeto", role="ctb"),
person("Philippe", "Grosjean", role="ctb"),
person("Vincent", "Runge", role="ctb"),
person("Christian", "Wia", role="ctb"),
person("Elise", "Maigné", role="ctb"),
person("Vincent", "Rocher", role="ctb"),
person("Vijay", "Lulla", role="ctb")
)
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,11 @@ rowwiseDT(
10. `DT[1, on=NULL]` now works for returning the first row, [#6579](https://github.com/Rdatatable/data.table/issues/6579). Thanks to @Kodiologist for the report and @tdhock for the PR.
11. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
11. `tables()` now returns the correct size for data.tables over 2GiB, [#6607](https://github.com/Rdatatable/data.table/issues/6607). Thanks to @vlulla for the report and the PR.
12. Joins on multiple columns, such as `x[y, on=c("x1==y1", "x2==y1")]`, could fail during implicit type coercions if `x1` and `x2` had different but still compatible types, [#6602](https://github.com/Rdatatable/data.table/issues/6602). This was particularly unexpected when columns `x1`, `x2`, and `y1` were all of the same class, e.g. `Date`, but differed in their underlying storage types. Thanks to Benjamin Schwendinger for the report and the fix.
13. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
## NOTES
Expand Down
144 changes: 90 additions & 54 deletions R/bmerge.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,25 @@


mergeType = function(x) {
ans = typeof(x)
if (ans=="integer") { if (is.factor(x)) ans = "factor" }
else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" }
# do not call isReallyReal(x) yet because i) if both types are double we don't need to coerce even if one or both sides
# are int-as-double, and ii) to save calling it until we really need it
ans
}

cast_with_atts = function(x, as.f) {
ans = as.f(x)
if (!is.null(attributes(x))) attributes(ans) = attributes(x)
ans
}

coerce_col = function(dt, col, from_type, to_type, from_name, to_name, verbose_msg=NULL) {
if (!is.null(verbose_msg)) catf(verbose_msg, from_type, from_name, to_type, to_name, domain=NULL)
set(dt, j=col, value=cast_with_atts(dt[[col]], match.fun(paste0("as.", to_type))))
}

bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
{
callersi = i
Expand All @@ -25,95 +46,110 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos

supported = c(ORDERING_TYPES, "factor", "integer64")

getClass = function(x) {
ans = typeof(x)
if (ans=="integer") { if (is.factor(x)) ans = "factor" }
else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" }
# do not call isReallyReal(x) yet because i) if both types are double we don't need to coerce even if one or both sides
# are int-as-double, and ii) to save calling it until we really need it
ans
}

if (nrow(i)) for (a in seq_along(icols)) {
# - check that join columns have compatible types
# - do type coercions if necessary on just the shallow local copies for the purpose of join
# - handle factor columns appropriately
# Note that if i is keyed, if this coerces i's key gets dropped by set()
ic = icols[a]
xc = xcols[a]
xclass = getClass(x[[xc]])
iclass = getClass(i[[ic]])
xname = paste0("x.", names(x)[xc])
iname = paste0("i.", names(i)[ic])
if (!xclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, xclass)
if (!iclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, iclass)
if (xclass=="factor" || iclass=="factor") {
icol = icols[a]
xcol = xcols[a]
x_merge_type = mergeType(x[[xcol]])
i_merge_type = mergeType(i[[icol]])
xname = paste0("x.", names(x)[xcol])
iname = paste0("i.", names(i)[icol])
if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type)
if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type)
if (x_merge_type=="factor" || i_merge_type=="factor") {
if (roll!=0.0 && a==length(icols))
stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname)
if (xclass=="factor" && iclass=="factor") {
if (x_merge_type=="factor" && i_merge_type=="factor") {
if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname)
set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values
set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values
next
} else {
if (xclass=="character") {
if (x_merge_type=="character") {
if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname)
set(i, j=ic, value=val<-as.character(i[[ic]]))
set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
set(i, j=icol, value=val<-as.character(i[[icol]]))
set(callersi, j=icol, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
next
} else if (iclass=="character") {
} else if (i_merge_type=="character") {
if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname)
newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L)
if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
set(i, j=ic, value=newvalue)
newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L)
if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
set(i, j=icol, value=newvalue)
next
}
}
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, xclass, iname, iclass)
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type)
}
if (xclass == iclass) {
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, xclass, xname)
# we check factors first to cater for the case when trying to do rolling joins on factors
if (x_merge_type == i_merge_type) {
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname)
next
}
if (xclass=="character" || iclass=="character" ||
xclass=="logical" || iclass=="logical" ||
xclass=="factor" || iclass=="factor") {
if (anyNA(i[[ic]]) && allNA(i[[ic]])) {
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, iclass, xclass, xname)
set(i, j=ic, value=match.fun(paste0("as.", xclass))(i[[ic]]))
cfl = c("character", "logical", "factor")
if (x_merge_type %chin% cfl || i_merge_type %chin% cfl) {
msg = if(verbose) gettext("Coercing all-NA %s column %s to type %s to match type of %s.\n") else NULL
if (anyNA(i[[icol]]) && allNA(i[[icol]])) {
coerce_col(i, icol, i_merge_type, x_merge_type, iname, xname, msg)
next
}
else if (anyNA(x[[xc]]) && allNA(x[[xc]])) {
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, xclass, iclass, iname)
set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]]))
if (anyNA(x[[xcol]]) && allNA(x[[xcol]])) {
coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, msg)
next
}
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, xclass, iname, iclass)
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type)
}
if (xclass=="integer64" || iclass=="integer64") {
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
nm = c(iname, xname)
if (xclass=="integer64") { w=i; wc=ic; wclass=iclass; } else { w=x; wc=xc; wclass=xclass; nm=rev(nm) } # w is which to coerce
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) {
if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L])
set(w, j=wc, value=bit64::as.integer64(w[[wc]]))
} else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L])
} else {
# just integer and double left
if (iclass=="double") {
if (!isReallyReal(i[[ic]])) {
ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602
if (i_merge_type=="double") {
coerce_x = FALSE
if (!isReallyReal(i[[icol]])) {
coerce_x = TRUE
# common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
# we've always coerced to int and returned int, for convenience.
if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname)
val = as.integer(i[[ic]])
if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679
set(i, j=ic, value=val)
set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too.
} else {
if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname)
set(x, j=xc, value=as.double(x[[xc]]))
if (length(ic_idx)>1L) {
xc_idx = xcols[ic_idx]
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
if (isReallyReal(x[[xb]])) {
coerce_x = FALSE
break
}
}
}
if (coerce_x) {
msg = if (verbose) gettext("Coercing %s column %s (which contains no fractions) to type %s to match type of %s.\n") else NULL
coerce_col(i, icol, "double", "integer", iname, xname, msg)
set(callersi, j=icol, value=i[[icol]]) # change the shallow copy of i up in [.data.table to reflect in the result, too.
if (length(ic_idx)>1L) {
xc_idx = xcols[ic_idx]
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
coerce_col(x, xb, "double", "integer", paste0("x.", names(x)[xb]), xname, msg)
}
}
}
}
if (!coerce_x) {
msg = if (verbose) gettext("Coercing %s column %s to type %s to match type of %s which contains fractions.\n") else NULL
coerce_col(x, xcol, "integer", "double", xname, iname, msg)
}
} else {
if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname)
set(i, j=ic, value=as.double(i[[ic]]))
msg = if (verbose) gettext("Coercing %s column %s to type %s for join to match type of %s.\n") else NULL
coerce_col(i, icol, "integer", "double", iname, xname, msg)
if (length(ic_idx)>1L) {
xc_idx = xcols[ic_idx]
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "integer")]) {
coerce_col(x, xb, "integer", "double", paste0("x.", names(x)[xb]), xname, msg)
}
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion R/tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ type_size = function(DT) {
# for speed and ram efficiency, a lower bound by not descending into character string lengths or list items
# if a more accurate and higher estimate is needed then user can pass object.size or alternative to mb=
# in case number of columns is very large (e.g. 1e6 columns) then we use a for() to avoid allocation of sapply()
ans = 0L
ans = 0.0
lookup = c("raw"=1L, "integer"=4L, "double"=8L, "complex"=16L)
for (i in seq_along(DT)) {
col = DT[[i]]
Expand Down
7 changes: 7 additions & 0 deletions inst/tests/other.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -766,3 +766,10 @@ if (loaded[["nanotime"]]) {
# respect dec=',' for nanotime, related to #6446, corresponding to tests 2281.*
test(31, fwrite(data.table(as.nanotime(.POSIXct(0))), dec=',', sep=';'), output="1970-01-01T00:00:00,000000000Z")
}

# tables() with large environment #6607
.e <- new.env() ## to not populate the .GlobalEnv
.e[["DT"]] <- as.data.table(lapply(1:15,function(i) runif(20e6)))
res <- tables(env=.e)
test(32, res[, .(NAME,NROW,NCOL,MB)], data.table(NAME="DT",NROW=20000000L,NCOL=15L,MB=2288.0))
rm(.e, res)
Loading

0 comments on commit f85272b

Please sign in to comment.