From f05893e74604b660fd3a99a8b253aad3c47df789 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 3 Dec 2024 07:20:33 +0100 Subject: [PATCH] rbindlist(l, use.names=TRUE) handle different encodings for column names (#5453) * fix handling of different encodings for column names * improve comment * write special chars in unicode * simplify tests * Fix NEWS numbering * Update NEWS.md Co-authored-by: Michael Chirico * add comments * fix lint --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 12 ++++++++++++ src/rbindlist.c | 16 ++++++++-------- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index ea8f4c4df3..23213934fc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -111,6 +111,8 @@ rowwiseDT( 12. Joins on multiple columns, such as `x[y, on=c("x1==y1", "x2==y1")]`, could fail during implicit type coercions if `x1` and `x2` had different but still compatible types, [#6602](https://github.com/Rdatatable/data.table/issues/6602). This was particularly unexpected when columns `x1`, `x2`, and `y1` were all of the same class, e.g. `Date`, but differed in their underlying storage types. Thanks to Benjamin Schwendinger for the report and the fix. +13. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. + ## NOTES 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 96e85e14ac..451dad6840 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20639,3 +20639,15 @@ test(2297.22, y[x, on=.(d == a, c == a)], data.table(c=1, d=1)) x = data.table(a=1, b=2L) y = data.table(c=1.5, d=1L) test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b") + +# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 +x = data.table(a = 1, b = 2, c = 3) +y = data.table(x = 4, y = 5, z = 6) +# a-umlaut, o-umlaut, u-umlaut +setnames(x , c("\u00e4", "\u00f6", "\u00fc")) +setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) +test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) +test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) +set(y, j="\u00e4", value=NULL) +test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) +test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) diff --git a/src/rbindlist.c b/src/rbindlist.c index 9b65ec4c73..42ba9ad749 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -76,6 +76,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov savetl_init(); int nuniq=0; + // first pass - gather unique column names for (int i=0; i0) savetl(s); uniq[nuniq++] = s; SET_TRUELENGTH(s,-nuniq); } } - if (nuniq>0) { - SEXP *tt = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare - if (!tt) free(uniq); // shrink never fails; just keep codacy happy - uniq = tt; - } + if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare + // now count the dups (if any) and how they're distributed across the items int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector @@ -107,6 +105,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq); // # nocov end } + // second pass - count duplicates for (int i=0; i