From 3e5cbdd2496b51ef4ce9e242c60233c378f334eb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 31 Aug 2022 12:13:09 +0200 Subject: [PATCH 1/8] fix handling of different encodings for column names --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 11 +++++++++++ src/rbindlist.c | 6 +++--- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4f4a2f417a..f3ffedcc4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -552,6 +552,8 @@ 53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance. +54. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e05f522814..d9472efbe9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18814,3 +18814,14 @@ test(2238.6, "a" %notin% integer(), TRUE) test(2238.7, "a" %notin% NULL, TRUE) test(2238.8, NA %notin% 1:5, TRUE) test(2238.9, NA %notin% c(1:5, NA), FALSE) + +# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 +x = data.table(a = 1, b = 2, c = 3) +y = data.table(x = 4, y = 5, z = 6) +setnames(x , c("Ä", "Ö", "Ü")) +setnames(y , iconv(c("Ö", "Ü", "Ä"), from = "UTF-8", to = "latin1")) +test(2239.1, rbindlist(list(x,y), use.names=TRUE), data.table(Ä = c(1,6), Ö=c(2,4), Ü=c(3,5))) +test(2239.2, rbindlist(list(y,x), use.names=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(6,1))) +y[, Ä := NULL] +test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table(Ä = c(1,NA), Ö=c(2,4), Ü=c(3,5))) +test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(NA,1))) diff --git a/src/rbindlist.c b/src/rbindlist.c index 3669028835..89cbcb511b 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -78,7 +78,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (!length(cn)) continue; const SEXP *cnp = STRING_PTR(cn); for (int j=0; j0) savetl(s); uniq[nuniq++] = s; @@ -110,7 +110,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) const SEXP *cnp = STRING_PTR(cn); memset(counts, 0, nuniq*sizeof(int)); for (int j=0; j Date: Wed, 31 Aug 2022 12:15:50 +0200 Subject: [PATCH 2/8] improve comment --- src/rbindlist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index 89cbcb511b..1e7fd60343 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -78,7 +78,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (!length(cn)) continue; const SEXP *cnp = STRING_PTR(cn); for (int j=0; j0) savetl(s); uniq[nuniq++] = s; @@ -110,7 +110,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) const SEXP *cnp = STRING_PTR(cn); memset(counts, 0, nuniq*sizeof(int)); for (int j=0; j Date: Wed, 31 Aug 2022 12:31:37 +0200 Subject: [PATCH 3/8] write special chars in unicode --- inst/tests/tests.Rraw | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d9472efbe9..9658e9b63c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18818,10 +18818,10 @@ test(2238.9, NA %notin% c(1:5, NA), FALSE) # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 x = data.table(a = 1, b = 2, c = 3) y = data.table(x = 4, y = 5, z = 6) -setnames(x , c("Ä", "Ö", "Ü")) -setnames(y , iconv(c("Ö", "Ü", "Ä"), from = "UTF-8", to = "latin1")) -test(2239.1, rbindlist(list(x,y), use.names=TRUE), data.table(Ä = c(1,6), Ö=c(2,4), Ü=c(3,5))) -test(2239.2, rbindlist(list(y,x), use.names=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(6,1))) -y[, Ä := NULL] -test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table(Ä = c(1,NA), Ö=c(2,4), Ü=c(3,5))) -test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(NA,1))) +setnames(x , c("\u00e4", "\u00f6", "\u00fc")) +setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) +test(2239.1, rbindlist(list(x,y), use.names=TRUE), setnames(data.table(a=c(1,6), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc"))) +test(2239.2, rbindlist(list(y,x), use.names=TRUE), setnames(data.table(a=c(4,2), b=c(5,3), c=c(6,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))) +set(y, j="\u00e4", value=NULL) +test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(1,NA), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc"))) +test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(4,2), b=c(5,3), c=c(NA,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))) From f6881a684818356fe15c149504c291f807516bd5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 31 Aug 2022 13:25:36 +0200 Subject: [PATCH 4/8] simplify tests --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9658e9b63c..0fe528a8ed 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18820,8 +18820,8 @@ x = data.table(a = 1, b = 2, c = 3) y = data.table(x = 4, y = 5, z = 6) setnames(x , c("\u00e4", "\u00f6", "\u00fc")) setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) -test(2239.1, rbindlist(list(x,y), use.names=TRUE), setnames(data.table(a=c(1,6), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc"))) -test(2239.2, rbindlist(list(y,x), use.names=TRUE), setnames(data.table(a=c(4,2), b=c(5,3), c=c(6,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))) +test(2239.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) +test(2239.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) set(y, j="\u00e4", value=NULL) -test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(1,NA), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc"))) -test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(4,2), b=c(5,3), c=c(NA,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))) +test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) +test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) From d84a9096cf1a89c49f69c5eeaa639cc3a677d792 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 4 Sep 2024 05:09:42 +0000 Subject: [PATCH 5/8] Fix NEWS numbering --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 847f14f53a..ec626c728e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ 2. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. +3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. + ## NOTES 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix. @@ -133,8 +135,6 @@ 14. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. -8. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. - ## NOTES 1. `transform()` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. From b62073ce49b3dc505fd5cc20ac4d4dfd1708bd9c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sat, 9 Nov 2024 21:58:45 +0100 Subject: [PATCH 6/8] Update NEWS.md Co-authored-by: Michael Chirico --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ec626c728e..d70fec6d7f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,7 +12,7 @@ 2. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. -3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. +3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. ## NOTES From ad1392c2e1e70ed1ec8463931ae334e96827040a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 9 Nov 2024 22:19:26 +0100 Subject: [PATCH 7/8] add comments --- inst/tests/tests.Rraw | 1 + src/rbindlist.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3916c400ad..405bce8a0a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19067,6 +19067,7 @@ test(2281.2, fwrite(data.table(a=.POSIXct(0.0001)), dec=',', sep=';'), output="1 # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 x = data.table(a = 1, b = 2, c = 3) y = data.table(x = 4, y = 5, z = 6) +# a-umlaut, o-umlaut, u-umlaut setnames(x , c("\u00e4", "\u00f6", "\u00fc")) setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) test(2282.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) diff --git a/src/rbindlist.c b/src/rbindlist.c index 597526b7fe..13fc32f3b4 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -74,6 +74,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); savetl_init(); int nuniq=0; + // first pass - gather unique column names for (int i=0; i0) { - SEXP *tt = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare - if (!tt) free(uniq); // shrink never fails; just keep codacy happy - uniq = tt; - } + if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare + // now count the dups (if any) and how they're distributed across the items int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector @@ -105,6 +103,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq); // # nocov end } + // second pass - count duplicates for (int i=0; i Date: Sun, 10 Nov 2024 00:05:25 +0100 Subject: [PATCH 8/8] fix lint --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 8c3bfe814f..5338d42a1b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -105,7 +105,7 @@ rowwiseDT( 10. `DT[1, on=NULL]` now works for returning the first row, [#6579](https://github.com/Rdatatable/data.table/issues/6579). Thanks to @Kodiologist for the report and @tdhock for the PR. -3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. +11. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. ## NOTES