Merge pull request #622 from SebKrantz/development

Development
SebKrantz · Aug 20, 2024 · 29a5875 · 29a5875
2 parents 188b957 + 10ce005
commit 29a5875
Show file tree

Hide file tree

Showing 21 changed files with 262 additions and 162 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: collapse
 Title: Advanced and Fast Data Transformation
-Version: 2.0.15
-Date: 2024-05-30
+Version: 2.0.16
+Date: 2024-08-20
 Authors@R: c(
            person("Sebastian", "Krantz", role = c("aut", "cre"), 
                   email = "[email protected]", 
@@ -37,7 +37,7 @@ BugReports: https://github.com/SebKrantz/collapse/issues
 License: GPL (>= 2) | file LICENSE
 Encoding: UTF-8
 LazyData: true
-Depends: R (>= 3.3.0)
+Depends: R (>= 3.4.0)
 Imports: Rcpp (>= 1.0.1)
 LinkingTo: Rcpp
 Suggests: fastverse, data.table, magrittr, kit, xts, zoo, plm, fixest, vars, 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,13 @@
+# collapse 2.0.16
+
+* Fixes an installation bug on some Linux systems (conflicting types) (#613). 
+
+* *collapse* now enforces string encoding in `fmatch()` / `join()`, which caused problems if strings being matched had different encodings (#566, #579, and #618). To avoid noticeable performance implications, checks are done heuristically, i.e., the first, middle and last string of a character vector are checked, and if not UTF8, the entire vector is coerced to UTF8 strings *before* the matching process. In general, character vectors in R can contain strings of different encodings, but this is not the case with most regular data. For performance reasons, *collapse* assumes that character vectors are uniform in terms of string encoding. 
+
+* Fixes a bug using qualified names for fast statistical functions inside `across()` (#621, thanks @alinacherkas). 
+
+* *collapse* now depends on R >= 3.4.0 due to the enforcement of `STRICT_R_HEADERS = 1` from R v4.5.0. In particular R API functions were renamed `Calloc -> R_Calloc` and `Free -> R_Free`.
+
 # collapse 2.0.15
 
 * Some changes on the C-side to move the package closer to C API compliance (demanded by R-Core). One notable change is that `gsplit()` no longer supports S4 objects (because `SET_S4_OBJECT` is not part of the API and `asS4()` is too expensive for tight loops). I cannot think of a single example where it would be necessary to split an S4 object, but if you do have applications please file an issue. 

diff --git a/R/fsubset_ftransform_fmutate.R b/R/fsubset_ftransform_fmutate.R
@@ -528,9 +528,9 @@ mutate_funi_simple <- function(i, data, .data_, funs, aplvec, ce, ...) { # g is
     # if(any(...names() == "TRA")) # This down not work because it substitutes setup[[]] from mutate_across !!!
     #   return(unclass(eval(substitute(.FUN_(.data_, ...)), c(list(.data_ = .data_), data), ce)))
     # return(unclass(eval(substitute(.FUN_(.data_, ..., TRA = 1L)), c(list(.data_ = .data_), data), ce)))
-    fcal <- as.call(c(list(as.name(nami), quote(.data_)), as.list(substitute(list(...))[-1L])))
+    fcal <- as.call(c(list(quote(.FUN_), quote(.data_)), as.list(substitute(list(...))[-1L])))
     if(is.null(fcal$TRA)) fcal$TRA <- 1L
-    return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
+    return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
   } else {
     value <- if(missing(...)) .FUN_(.data_) else
       do.call(.FUN_, c(list(.data_), eval(substitute(list(...)), data, ce)), envir = ce) # Object setup not found: eval(substitute(.FUN_(.data_, ...)), c(list(.data_ = .data_), data), ce)
@@ -617,9 +617,9 @@ mutate_funi_grouped <- function(i, data, .data_, funs, aplvec, ce, ...) {
              dots_apply_grouped(.data_, g, .FUN_, eval(substitute(list(...)), data, ce)) # Before: do.call(lapply, c(list(unattrib(.data_), copysplaplfun, g, .FUN_), eval(substitute(list(...)), data, ce)), envir = ce)
   } else if(any(nami == .FAST_STAT_FUN_POLD)) {
     if(missing(...)) return(unclass(.FUN_(.data_, g = g, TRA = 1L)))
-    fcal <- as.call(c(list(as.name(nami), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
+    fcal <- as.call(c(list(quote(.FUN_), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
     if(is.null(fcal$TRA)) fcal$TRA <- 1L
-    return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
+    return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
   } else if(any(nami == .FAST_FUN_MOPS)) {
     if(any(nami == .OPERATOR_FUN)) {
       value <- if(missing(...)) .FUN_(.data_, by = g) else

diff --git a/R/fsummarise.R b/R/fsummarise.R
@@ -34,9 +34,9 @@ smr_funi_simple <- function(i, data, .data_, funs, aplvec, ce, ...) {
     names(value) <- names(.data_)
   } else if(any(nami == .FAST_STAT_FUN_POLD)) {
     if(missing(...)) return(unclass(.FUN_(.data_, drop = FALSE)))
-    fcal <- as.call(c(list(as.name(nami), quote(.data_)), as.list(substitute(list(...))[-1L])))
+    fcal <- as.call(c(list(quote(.FUN_), quote(.data_)), as.list(substitute(list(...))[-1L])))
     fcal$drop <- FALSE
-    return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
+    return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
   } else {
     value <- if(missing(...)) .FUN_(.data_) else
       do.call(.FUN_, c(list(.data_), eval(substitute(list(...)), data, ce)), envir = ce)
@@ -57,9 +57,9 @@ smr_funi_grouped <- function(i, data, .data_, funs, aplvec, ce, ...) {
     names(value) <- names(.data_)
   } else if(any(nami == .FAST_STAT_FUN_POLD)) {
     if(missing(...)) return(unclass(.FUN_(.data_, g = g, use.g.names = FALSE)))
-    fcal <- as.call(c(list(as.name(nami), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
+    fcal <- as.call(c(list(quote(.FUN_), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
     fcal$use.g.names <- FALSE
-    return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
+    return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
   } else {
     value <- dots_apply_grouped_bulk(.data_, g, .FUN_, if(missing(...)) NULL else eval(substitute(list(...)), data, ce))
     value <- .Call(C_rbindlist, unclass(value), FALSE, FALSE, NULL)

diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@
  [![Conda Version](https://img.shields.io/conda/vn/conda-forge/r-collapse.svg)](https://anaconda.org/conda-forge/r-collapse)
  [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/r-collapse.svg)](https://anaconda.org/conda-forge/r-collapse)
 [![Codecov test coverage](https://codecov.io/gh/SebKrantz/collapse/branch/master/graph/badge.svg)](https://app.codecov.io/gh/SebKrantz/collapse?branch=master)
-[![minimal R version](https://img.shields.io/badge/R%3E%3D-3.3.0-6666ff.svg)](https://cran.r-project.org/)
+[![minimal R version](https://img.shields.io/badge/R%3E%3D-3.4.0-6666ff.svg)](https://cran.r-project.org/)
 [![dependencies](https://tinyverse.netlify.app/badge/collapse)](https://CRAN.R-project.org/package=collapse)
 [![DOI](https://zenodo.org/badge/172910283.svg)](https://zenodo.org/badge/latestdoi/172910283)
 [![arXiv](https://img.shields.io/badge/arXiv-2403.05038-0969DA.svg)](https://arxiv.org/abs/2403.05038)

diff --git a/src/ExportSymbols.c b/src/ExportSymbols.c
@@ -166,6 +166,7 @@ void R_init_collapse(DllInfo *dll) {
   R_RegisterCCallable("collapse", "cp_range", (DL_FUNC) &frange);              // frange()
   R_RegisterCCallable("collapse", "cp_dist", (DL_FUNC) &fdist);                // fdist()
   R_RegisterCCallable("collapse", "cp_quantile", (DL_FUNC) &fquantileC);       // .quantile()
+  R_RegisterCCallable("collapse", "cp_match", (DL_FUNC) &fmatchC);             // fmatch()
   R_RegisterCCallable("collapse", "cp_group", (DL_FUNC) &groupVec);            // group(): main hash-based grouping function: for atomic vectors and data frames
   R_RegisterCCallable("collapse", "cp_group_at", (DL_FUNC) &groupAtVec);       // qG(.., sort = FALSE): same but only works with atomic vectors and has option to keep missing values
   R_RegisterCCallable("collapse", "cp_unique", (DL_FUNC) &funiqueC);           // funique.default()

diff --git a/src/base_radixsort.c b/src/base_radixsort.c
@@ -123,7 +123,7 @@ static void savetl(SEXP s)
 #undef warning
 // since it can be turned to error via warn = 2
 #define warning(...) Do not use warning in this file
-/* use malloc/realloc (not Calloc/Realloc) so we can trap errors
+/* use malloc/realloc (not R_Calloc/R_Realloc) so we can trap errors
  and call savetl_end() before the error(). */
 
 static void growstack(uint64_t newlen)

diff --git a/src/base_radixsort.h b/src/base_radixsort.h
@@ -23,6 +23,7 @@
 #define SET_TRLEN(x, v) SET_STDVEC_TRUELENGTH(x, ((int) (v)))
 
 #define MYLEV(x)	(((SEXPREC_partial *)(x))->sxpinfo.gp)
+#define IS_UTF8(x)  (MYLEV(x) & 8)
 #define IS_ASCII(x) (MYLEV(x) & 64) // from data.table.h
 
 #define SETTOF(x,v)	((((SEXPREC_partial *)(x))->sxpinfo.type)=(v))

diff --git a/src/data.table.h b/src/data.table.h
@@ -17,6 +17,10 @@
 #define SEXPPTR(x) ((SEXP *)DATAPTR(x))  // to avoid overhead of looped VECTOR_ELT
 #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x))  // to avoid overhead of looped VECTOR_ELT
 
+// Needed for match.c and join.c
+#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s))
+#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))
+
 // for use with bit64::integer64
 #define NA_INTEGER64  INT64_MIN
 #define MAX_INTEGER64 INT64_MAX
@@ -52,6 +56,8 @@ extern size_t sizes[100];  // max appears to be FUNSXP = 99, see Rinternals.h
 extern size_t typeorder[100];
 
 // data.table_utils.c
+int need2utf8(SEXP x);
+SEXP coerceUtf8IfNeeded(SEXP x);
 SEXP setnames(SEXP x, SEXP nam);
 bool allNA(SEXP x, bool errorForBadType);
 SEXP allNAv(SEXP x, SEXP errorForBadType);

diff --git a/src/data.table_utils.c b/src/data.table_utils.c
@@ -6,6 +6,32 @@
 #include "data.table.h"
 
 
+int need2utf8(SEXP x) {
+  const int xlen = length(x);
+  const SEXP *xd = STRING_PTR_RO(x);
+  // for (int i=0; i<xlen; i++) {
+  //   if (NEED2UTF8(xd[i]))
+  //     return(true);
+  // }
+  // return(false);
+  if (xlen <= 1) return xlen == 1 ? NEED2UTF8(xd[0]) : 0;
+  return NEED2UTF8(xd[0]) || NEED2UTF8(xd[xlen/2]) || NEED2UTF8(xd[xlen-1]);
+}
+
+SEXP coerceUtf8IfNeeded(SEXP x) {
+  if (!need2utf8(x))
+    return(x);
+  const int xlen = length(x);
+  SEXP ans = PROTECT(allocVector(STRSXP, xlen));
+  const SEXP *xd = STRING_PTR_RO(x);
+  for (int i=0; i<xlen; i++) {
+    SET_STRING_ELT(ans, i, ENC2UTF8(xd[i]));
+  }
+  UNPROTECT(1);
+  return(ans);
+}
+
+
 SEXP setnames(SEXP x, SEXP nam) {
   if(TYPEOF(nam) != STRSXP) error("names need to be character typed");
   if(INHERITS(x, char_datatable)) {
@@ -284,17 +310,17 @@ SEXP setcolorder(SEXP x, SEXP o) {
 
   // Double-check here at C level that o[] is a strict permutation of 1:ncol. Reordering columns by reference makes no
   // difference to generations/refcnt so we can write behind barrier in this very special case of strict permutation.
-  bool *seen = Calloc(ncol, bool);
+  bool *seen = R_Calloc(ncol, bool);
   for (int i=0; i != ncol; ++i) {
     if (od[i]==NA_INTEGER || od[i]<1 || od[i]>ncol)
       error("Internal error: o passed to Csetcolorder contains an NA or out-of-bounds");  // # nocov
     if (seen[od[i]-1])
       error("Internal error: o passed to Csetcolorder contains a duplicate");             // # nocov
     seen[od[i]-1] = true;
   }
-  Free(seen);
+  R_Free(seen);
 
-  SEXP *tmp = Calloc(ncol, SEXP), *namesd = SEXPPTR(names);
+  SEXP *tmp = R_Calloc(ncol, SEXP), *namesd = SEXPPTR(names);
   const SEXP *xd = SEXPPTR_RO(x);
   for (int i=0; i != ncol; ++i) tmp[i] = xd[od[i]-1];
   for (int i=0; i != ncol; ++i) SET_VECTOR_ELT(x, i, tmp[i]);
@@ -304,7 +330,7 @@ SEXP setcolorder(SEXP x, SEXP o) {
   for (int i=0; i != ncol; ++i) tmp[i] = namesd[od[i]-1];
   memcpy(namesd, tmp, ncol*sizeof(SEXP));
   // No need to change key (if any); sorted attribute is column names not positions
-  Free(tmp);
+  R_Free(tmp);
   return(R_NilValue);
 }
 

diff --git a/src/fcumsum.c b/src/fcumsum.c
@@ -16,7 +16,7 @@ void fcumsum_double_impl(double *pout, double *px, int ng, int *pg, int narm, in
       }
     }
   } else {
-    double *last = (double*)Calloc(ng+1, double); // Also pass pointer to function ??
+    double *last = (double*)R_Calloc(ng+1, double); // Also pass pointer to function ??
     if(narm <= 0) {
       for(int i = 0; i != l; ++i) last[pg[i]] = pout[i] = last[pg[i]] + px[i];
     } else if(fill) {
@@ -27,7 +27,7 @@ void fcumsum_double_impl(double *pout, double *px, int ng, int *pg, int narm, in
         else last[pg[i]] = pout[i] = last[pg[i]] + px[i];
       }
     }
-    Free(last);
+    R_Free(last);
   }
 }
 
@@ -50,7 +50,7 @@ void fcumsum_double_impl_order(double *pout, double *px, int ng, int *pg, int *p
       }
     }
   } else {
-    double *last = (double*)Calloc(ng+1, double); // Also pass pointer to function ??
+    double *last = (double*)R_Calloc(ng+1, double); // Also pass pointer to function ??
     if(narm <= 0) {
       for(int i = 0, poi; i != l; ++i) {
         poi = po[i]-1;
@@ -68,7 +68,7 @@ void fcumsum_double_impl_order(double *pout, double *px, int ng, int *pg, int *p
         else last[pg[poi]] = pout[poi] = last[pg[poi]] + px[poi];
       }
     }
-    Free(last);
+    R_Free(last);
   }
 }
 
@@ -105,7 +105,7 @@ void fcumsum_int_impl(int *pout, int *px, int ng, int *pg, int narm, int fill, i
     if(ckof > INT_MAX || ckof <= INT_MIN)
       error("Integer overflow. Integers in R are bounded between 2,147,483,647 and -2,147,483,647. Use fcumsum(as.numeric(x)).");
   } else {
-    int *last = (int*)Calloc(ng+1, int); // Also pass pointer to function ??
+    int *last = (int*)R_Calloc(ng+1, int); // Also pass pointer to function ??
     if(narm <= 0) {
       for(int i = 0, lsi; i != l; ++i) {
         if(px[i] == NA_INTEGER) {
@@ -139,7 +139,7 @@ void fcumsum_int_impl(int *pout, int *px, int ng, int *pg, int narm, int fill, i
         }
       }
     }
-    Free(last);
+    R_Free(last);
   }
 }
 
@@ -178,7 +178,7 @@ void fcumsum_int_impl_order(int *pout, int *px, int ng, int *pg, int *po, int na
     if(ckof > INT_MAX || ckof <= INT_MIN)
       error("Integer overflow. Integers in R are bounded between 2,147,483,647 and -2,147,483,647. Use fcumsum(as.numeric(x)).");
   } else {
-    int *last = (int*)Calloc(ng+1, int); // Also pass pointer to function ??
+    int *last = (int*)R_Calloc(ng+1, int); // Also pass pointer to function ??
     if(narm <= 0) {
       for(int i = 0, poi, lsi; i != l; ++i) {
         poi = po[i]-1;
@@ -215,7 +215,7 @@ void fcumsum_int_impl_order(int *pout, int *px, int ng, int *pg, int *po, int na
         }
       }
     }
-    Free(last);
+    R_Free(last);
   }
 }
 

diff --git a/src/fmean.c b/src/fmean.c
@@ -50,7 +50,7 @@ double fmean_double_omp_impl(const double *restrict px, const int narm, const in
 void fmean_double_g_impl(double *restrict pout, const double *restrict px, const int ng, const int *restrict pg, const int *restrict pgs, const int narm, const int l) {
   memset(pout, 0, sizeof(double) * ng);
   if(narm) {
-    int *restrict n = (int*)Calloc(ng, int);
+    int *restrict n = (int*)R_Calloc(ng, int);
     for(int i = 0, gi; i != l; ++i) {
       if(ISNAN(px[i])) continue;
       gi = pg[i]-1;
@@ -61,7 +61,7 @@ void fmean_double_g_impl(double *restrict pout, const double *restrict px, const
       if(n[i] == 0) pout[i] = NA_REAL;
       else pout[i] /= n[i];
     }
-    Free(n);
+    R_Free(n);
   } else {
     --pout;
     for(int i = l; i--; ) pout[pg[i]] += px[i]; // Used to stop loop when all groups passed with NA, but probably no speed gain since groups are mostly ordered.
@@ -121,7 +121,7 @@ double fmean_weights_omp_impl(const double *restrict px, const double *restrict
 }
 
 void fmean_weights_g_impl(double *restrict pout, const double *restrict px, const int ng, const int *restrict pg, const double *restrict pw, const int narm, const int l) {
-  double *restrict sumw = (double*)Calloc(ng, double);
+  double *restrict sumw = (double*)R_Calloc(ng, double);
   memset(pout, 0, sizeof(double) * ng);
   if(narm) {
     for(int i = 0, gi; i != l; ++i) {
@@ -142,7 +142,7 @@ void fmean_weights_g_impl(double *restrict pout, const double *restrict px, cons
     }
     for(int i = ng; i--; ) pout[i] /= sumw[i];
   }
-  Free(sumw);
+  R_Free(sumw);
 }
 
 double fmean_int_impl(const int *restrict px, const int narm, const int l) {
@@ -194,7 +194,7 @@ double fmean_int_omp_impl(const int *restrict px, const int narm, const int l, c
 void fmean_int_g_impl(double *restrict pout, const int *restrict px, const int ng, const int *restrict pg, const int *restrict pgs, const int narm, const int l) {
   memset(pout, 0, sizeof(double) * ng);
   if(narm) {
-    int *restrict n = (int*)Calloc(ng, int);
+    int *restrict n = (int*)R_Calloc(ng, int);
     for(int i = 0, gi; i != l; ++i) {
       if(px[i] == NA_INTEGER) continue;
       gi = pg[i]-1;
@@ -205,7 +205,7 @@ void fmean_int_g_impl(double *restrict pout, const int *restrict px, const int n
       if(n[i] == 0) pout[i] = NA_REAL;
       else pout[i] /= n[i];
     }
-    Free(n);
+    R_Free(n);
   } else {
     --pout;
     for(int i = l; i--; ) pout[pg[i]] += px[i]; // Used to stop loop when all groups passed with NA, but probably no speed gain since groups are mostly ordered.