From 3e5cbdd2496b51ef4ce9e242c60233c378f334eb Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Wed, 31 Aug 2022 12:13:09 +0200
Subject: [PATCH 1/8] fix handling of different encodings for column names

---
 NEWS.md               |  2 ++
 inst/tests/tests.Rraw | 11 +++++++++++
 src/rbindlist.c       |  6 +++---
 3 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/NEWS.md b/NEWS.md
index 4f4a2f417a..f3ffedcc4d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -552,6 +552,8 @@
 
 53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance.
 
+54. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
+
 ## NOTES
 
 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index e05f522814..d9472efbe9 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18814,3 +18814,14 @@ test(2238.6, "a" %notin% integer(), TRUE)
 test(2238.7, "a" %notin% NULL, TRUE)
 test(2238.8, NA %notin% 1:5, TRUE)
 test(2238.9, NA %notin% c(1:5, NA), FALSE)
+
+# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452
+x = data.table(a = 1, b = 2, c = 3)
+y = data.table(x = 4, y = 5, z = 6)
+setnames(x , c("Ä", "Ö", "Ü"))
+setnames(y , iconv(c("Ö", "Ü", "Ä"), from = "UTF-8", to = "latin1"))
+test(2239.1, rbindlist(list(x,y), use.names=TRUE), data.table(Ä = c(1,6), Ö=c(2,4), Ü=c(3,5)))
+test(2239.2, rbindlist(list(y,x), use.names=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(6,1)))
+y[, Ä := NULL]
+test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table(Ä = c(1,NA), Ö=c(2,4), Ü=c(3,5)))
+test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(NA,1)))
diff --git a/src/rbindlist.c b/src/rbindlist.c
index 3669028835..89cbcb511b 100644
--- a/src/rbindlist.c
+++ b/src/rbindlist.c
@@ -78,7 +78,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
       if (!length(cn)) continue;
       const SEXP *cnp = STRING_PTR(cn);
       for (int j=0; j<thisncol; j++) {
-        SEXP s = cnp[j];
+        SEXP s = ENC2UTF8(cnp[j]); // ignore encodings for use.names #5452
         if (TRUELENGTH(s)<0) continue;  // seen this name before
         if (TRUELENGTH(s)>0) savetl(s);
         uniq[nuniq++] = s;
@@ -110,7 +110,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
       const SEXP *cnp = STRING_PTR(cn);
       memset(counts, 0, nuniq*sizeof(int));
       for (int j=0; j<thisncol; j++) {
-        SEXP s = cnp[j];
+        SEXP s = ENC2UTF8(cnp[j]); // ignore encodings for use.names #5452
         counts[ -TRUELENGTH(s)-1 ]++;
       }
       for (int u=0; u<nuniq; u++) {
@@ -150,7 +150,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
         const SEXP *cnp = STRING_PTR(cn);
         memset(counts, 0, nuniq*sizeof(int));
         for (int j=0; j<thisncol; j++) {
-          SEXP s = cnp[j];
+          SEXP s = ENC2UTF8(cnp[j]); // ignore encodings for use.names #5452
           int w = -TRUELENGTH(s)-1;
           int wi = counts[w]++; // how many dups have we seen before of this name within this item
           if (uniqMap[w]==-1) {

From bf24383ae7607e3296d023a196e8f37f2a96dfd0 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Wed, 31 Aug 2022 12:15:50 +0200
Subject: [PATCH 2/8] improve comment

---
 src/rbindlist.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/rbindlist.c b/src/rbindlist.c
index 89cbcb511b..1e7fd60343 100644
--- a/src/rbindlist.c
+++ b/src/rbindlist.c
@@ -78,7 +78,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
       if (!length(cn)) continue;
       const SEXP *cnp = STRING_PTR(cn);
       for (int j=0; j<thisncol; j++) {
-        SEXP s = ENC2UTF8(cnp[j]); // ignore encodings for use.names #5452
+        SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
         if (TRUELENGTH(s)<0) continue;  // seen this name before
         if (TRUELENGTH(s)>0) savetl(s);
         uniq[nuniq++] = s;
@@ -110,7 +110,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
       const SEXP *cnp = STRING_PTR(cn);
       memset(counts, 0, nuniq*sizeof(int));
       for (int j=0; j<thisncol; j++) {
-        SEXP s = ENC2UTF8(cnp[j]); // ignore encodings for use.names #5452
+        SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
         counts[ -TRUELENGTH(s)-1 ]++;
       }
       for (int u=0; u<nuniq; u++) {
@@ -150,7 +150,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
         const SEXP *cnp = STRING_PTR(cn);
         memset(counts, 0, nuniq*sizeof(int));
         for (int j=0; j<thisncol; j++) {
-          SEXP s = ENC2UTF8(cnp[j]); // ignore encodings for use.names #5452
+          SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
           int w = -TRUELENGTH(s)-1;
           int wi = counts[w]++; // how many dups have we seen before of this name within this item
           if (uniqMap[w]==-1) {

From 7142335cb31eba4a6b281c1430da7fb5cce02239 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Wed, 31 Aug 2022 12:31:37 +0200
Subject: [PATCH 3/8] write special chars in unicode

---
 inst/tests/tests.Rraw | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index d9472efbe9..9658e9b63c 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18818,10 +18818,10 @@ test(2238.9, NA %notin% c(1:5, NA), FALSE)
 # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452
 x = data.table(a = 1, b = 2, c = 3)
 y = data.table(x = 4, y = 5, z = 6)
-setnames(x , c("Ä", "Ö", "Ü"))
-setnames(y , iconv(c("Ö", "Ü", "Ä"), from = "UTF-8", to = "latin1"))
-test(2239.1, rbindlist(list(x,y), use.names=TRUE), data.table(Ä = c(1,6), Ö=c(2,4), Ü=c(3,5)))
-test(2239.2, rbindlist(list(y,x), use.names=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(6,1)))
-y[, Ä := NULL]
-test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table(Ä = c(1,NA), Ö=c(2,4), Ü=c(3,5)))
-test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table(Ö=c(4,2), Ü=c(5,3), Ä = c(NA,1)))
+setnames(x , c("\u00e4", "\u00f6", "\u00fc"))
+setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))
+test(2239.1, rbindlist(list(x,y), use.names=TRUE), setnames(data.table(a=c(1,6), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc")))
+test(2239.2, rbindlist(list(y,x), use.names=TRUE), setnames(data.table(a=c(4,2), b=c(5,3), c=c(6,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")))
+set(y, j="\u00e4", value=NULL)
+test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(1,NA), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc")))
+test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(4,2),  b=c(5,3), c=c(NA,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")))

From f6881a684818356fe15c149504c291f807516bd5 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Wed, 31 Aug 2022 13:25:36 +0200
Subject: [PATCH 4/8] simplify tests

---
 inst/tests/tests.Rraw | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 9658e9b63c..0fe528a8ed 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18820,8 +18820,8 @@ x = data.table(a = 1, b = 2, c = 3)
 y = data.table(x = 4, y = 5, z = 6)
 setnames(x , c("\u00e4", "\u00f6", "\u00fc"))
 setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))
-test(2239.1, rbindlist(list(x,y), use.names=TRUE), setnames(data.table(a=c(1,6), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc")))
-test(2239.2, rbindlist(list(y,x), use.names=TRUE), setnames(data.table(a=c(4,2), b=c(5,3), c=c(6,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")))
+test(2239.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
+test(2239.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1)))
 set(y, j="\u00e4", value=NULL)
-test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(1,NA), b=c(2,4), c=c(3,5)), c("\u00e4", "\u00f6", "\u00fc")))
-test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), setnames(data.table(a=c(4,2),  b=c(5,3), c=c(NA,1)), iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")))
+test(2239.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
+test(2239.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2),  "\u00fc"=c(5,3), "\u00e4"=c(NA,1)))

From d84a9096cf1a89c49f69c5eeaa639cc3a677d792 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Wed, 4 Sep 2024 05:09:42 +0000
Subject: [PATCH 5/8] Fix NEWS numbering

---
 NEWS.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 847f14f53a..ec626c728e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -12,6 +12,8 @@
 
 2. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.
 
+3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
+
 ## NOTES
 
 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.
@@ -133,8 +135,6 @@
 
 14. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix.
 
-8. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
-
 ## NOTES
 
 1. `transform()` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1.

From b62073ce49b3dc505fd5cc20ac4d4dfd1708bd9c Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com>
Date: Sat, 9 Nov 2024 21:58:45 +0100
Subject: [PATCH 6/8] Update NEWS.md

Co-authored-by: Michael Chirico <chiricom@google.com>
---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index ec626c728e..d70fec6d7f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -12,7 +12,7 @@
 
 2. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.
 
-3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
+3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
 
 ## NOTES
 

From ad1392c2e1e70ed1ec8463931ae334e96827040a Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sat, 9 Nov 2024 22:19:26 +0100
Subject: [PATCH 7/8] add comments

---
 inst/tests/tests.Rraw |  1 +
 src/rbindlist.c       | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 3916c400ad..405bce8a0a 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -19067,6 +19067,7 @@ test(2281.2, fwrite(data.table(a=.POSIXct(0.0001)), dec=',', sep=';'), output="1
 # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452
 x = data.table(a = 1, b = 2, c = 3)
 y = data.table(x = 4, y = 5, z = 6)
+# a-umlaut, o-umlaut, u-umlaut
 setnames(x , c("\u00e4", "\u00f6", "\u00fc"))
 setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))
 test(2282.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
diff --git a/src/rbindlist.c b/src/rbindlist.c
index 597526b7fe..13fc32f3b4 100644
--- a/src/rbindlist.c
+++ b/src/rbindlist.c
@@ -74,6 +74,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
       error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames);
     savetl_init();
     int nuniq=0;
+    // first pass - gather unique column names
     for (int i=0; i<LENGTH(l); i++) {
       SEXP li = VECTOR_ELT(l, i);
       int thisncol=LENGTH(li);
@@ -89,11 +90,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
         SET_TRUELENGTH(s,-nuniq);
       }
     }
-    if (nuniq>0) {
-      SEXP *tt = realloc(uniq, nuniq*sizeof(SEXP));  // shrink to only what we need to release the spare
-      if (!tt) free(uniq);  // shrink never fails; just keep codacy happy
-      uniq = tt;
-    }
+    if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP));  // shrink to only what we need to release the spare
+
     // now count the dups (if any) and how they're distributed across the items
     int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames
     int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
@@ -105,6 +103,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
       error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq);
       // # nocov end
     }
+    // second pass - count duplicates
     for (int i=0; i<LENGTH(l); i++) {
       SEXP li = VECTOR_ELT(l, i);
       int thisncol=length(li);
@@ -143,6 +142,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
     for (int i=0; i<ncol; ++i) {uniqMap[i] = dupLink[i] = -1;}
     int nextCol=0, lastDup=ncol-1;
 
+    // third pass - create final column mapping colMapRaw
     for (int i=0; i<LENGTH(l); ++i) {
       SEXP li = VECTOR_ELT(l, i);
       int thisncol=length(li);

From efaf11b9a0bbfe15db5fca02ba382a42288ee130 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 10 Nov 2024 00:05:25 +0100
Subject: [PATCH 8/8] fix lint

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 8c3bfe814f..5338d42a1b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -105,7 +105,7 @@ rowwiseDT(
 
 10. `DT[1, on=NULL]` now works for returning the first row, [#6579](https://github.com/Rdatatable/data.table/issues/6579). Thanks to @Kodiologist for the report and @tdhock for the PR.
 
-3. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
+11. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
 
 ## NOTES