Skip to content

Commit

Permalink
Merge pull request #622 from SebKrantz/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
SebKrantz authored Aug 20, 2024
2 parents 188b957 + 10ce005 commit 29a5875
Show file tree
Hide file tree
Showing 21 changed files with 262 additions and 162 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: collapse
Title: Advanced and Fast Data Transformation
Version: 2.0.15
Date: 2024-05-30
Version: 2.0.16
Date: 2024-08-20
Authors@R: c(
person("Sebastian", "Krantz", role = c("aut", "cre"),
email = "[email protected]",
Expand Down Expand Up @@ -37,7 +37,7 @@ BugReports: https://github.com/SebKrantz/collapse/issues
License: GPL (>= 2) | file LICENSE
Encoding: UTF-8
LazyData: true
Depends: R (>= 3.3.0)
Depends: R (>= 3.4.0)
Imports: Rcpp (>= 1.0.1)
LinkingTo: Rcpp
Suggests: fastverse, data.table, magrittr, kit, xts, zoo, plm, fixest, vars,
Expand Down
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# collapse 2.0.16

* Fixes an installation bug on some Linux systems (conflicting types) (#613).

* *collapse* now enforces string encoding in `fmatch()` / `join()`, which caused problems if strings being matched had different encodings (#566, #579, and #618). To avoid noticeable performance implications, checks are done heuristically, i.e., the first, middle and last string of a character vector are checked, and if not UTF8, the entire vector is coerced to UTF8 strings *before* the matching process. In general, character vectors in R can contain strings of different encodings, but this is not the case with most regular data. For performance reasons, *collapse* assumes that character vectors are uniform in terms of string encoding.

* Fixes a bug using qualified names for fast statistical functions inside `across()` (#621, thanks @alinacherkas).

* *collapse* now depends on R >= 3.4.0 due to the enforcement of `STRICT_R_HEADERS = 1` from R v4.5.0. In particular R API functions were renamed `Calloc -> R_Calloc` and `Free -> R_Free`.

# collapse 2.0.15

* Some changes on the C-side to move the package closer to C API compliance (demanded by R-Core). One notable change is that `gsplit()` no longer supports S4 objects (because `SET_S4_OBJECT` is not part of the API and `asS4()` is too expensive for tight loops). I cannot think of a single example where it would be necessary to split an S4 object, but if you do have applications please file an issue.
Expand Down
8 changes: 4 additions & 4 deletions R/fsubset_ftransform_fmutate.R
Original file line number Diff line number Diff line change
Expand Up @@ -528,9 +528,9 @@ mutate_funi_simple <- function(i, data, .data_, funs, aplvec, ce, ...) { # g is
# if(any(...names() == "TRA")) # This down not work because it substitutes setup[[]] from mutate_across !!!
# return(unclass(eval(substitute(.FUN_(.data_, ...)), c(list(.data_ = .data_), data), ce)))
# return(unclass(eval(substitute(.FUN_(.data_, ..., TRA = 1L)), c(list(.data_ = .data_), data), ce)))
fcal <- as.call(c(list(as.name(nami), quote(.data_)), as.list(substitute(list(...))[-1L])))
fcal <- as.call(c(list(quote(.FUN_), quote(.data_)), as.list(substitute(list(...))[-1L])))
if(is.null(fcal$TRA)) fcal$TRA <- 1L
return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
} else {
value <- if(missing(...)) .FUN_(.data_) else
do.call(.FUN_, c(list(.data_), eval(substitute(list(...)), data, ce)), envir = ce) # Object setup not found: eval(substitute(.FUN_(.data_, ...)), c(list(.data_ = .data_), data), ce)
Expand Down Expand Up @@ -617,9 +617,9 @@ mutate_funi_grouped <- function(i, data, .data_, funs, aplvec, ce, ...) {
dots_apply_grouped(.data_, g, .FUN_, eval(substitute(list(...)), data, ce)) # Before: do.call(lapply, c(list(unattrib(.data_), copysplaplfun, g, .FUN_), eval(substitute(list(...)), data, ce)), envir = ce)
} else if(any(nami == .FAST_STAT_FUN_POLD)) {
if(missing(...)) return(unclass(.FUN_(.data_, g = g, TRA = 1L)))
fcal <- as.call(c(list(as.name(nami), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
fcal <- as.call(c(list(quote(.FUN_), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
if(is.null(fcal$TRA)) fcal$TRA <- 1L
return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
} else if(any(nami == .FAST_FUN_MOPS)) {
if(any(nami == .OPERATOR_FUN)) {
value <- if(missing(...)) .FUN_(.data_, by = g) else
Expand Down
8 changes: 4 additions & 4 deletions R/fsummarise.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ smr_funi_simple <- function(i, data, .data_, funs, aplvec, ce, ...) {
names(value) <- names(.data_)
} else if(any(nami == .FAST_STAT_FUN_POLD)) {
if(missing(...)) return(unclass(.FUN_(.data_, drop = FALSE)))
fcal <- as.call(c(list(as.name(nami), quote(.data_)), as.list(substitute(list(...))[-1L])))
fcal <- as.call(c(list(quote(.FUN_), quote(.data_)), as.list(substitute(list(...))[-1L])))
fcal$drop <- FALSE
return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
} else {
value <- if(missing(...)) .FUN_(.data_) else
do.call(.FUN_, c(list(.data_), eval(substitute(list(...)), data, ce)), envir = ce)
Expand All @@ -57,9 +57,9 @@ smr_funi_grouped <- function(i, data, .data_, funs, aplvec, ce, ...) {
names(value) <- names(.data_)
} else if(any(nami == .FAST_STAT_FUN_POLD)) {
if(missing(...)) return(unclass(.FUN_(.data_, g = g, use.g.names = FALSE)))
fcal <- as.call(c(list(as.name(nami), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
fcal <- as.call(c(list(quote(.FUN_), quote(.data_), g = quote(.g_)), as.list(substitute(list(...))[-1L])))
fcal$use.g.names <- FALSE
return(unclass(eval(fcal, c(list(.data_ = .data_), data), ce)))
return(unclass(eval(fcal, c(list(.data_ = .data_, .FUN_ = .FUN_), data), ce)))
} else {
value <- dots_apply_grouped_bulk(.data_, g, .FUN_, if(missing(...)) NULL else eval(substitute(list(...)), data, ce))
value <- .Call(C_rbindlist, unclass(value), FALSE, FALSE, NULL)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
[![Conda Version](https://img.shields.io/conda/vn/conda-forge/r-collapse.svg)](https://anaconda.org/conda-forge/r-collapse)
[![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/r-collapse.svg)](https://anaconda.org/conda-forge/r-collapse)
[![Codecov test coverage](https://codecov.io/gh/SebKrantz/collapse/branch/master/graph/badge.svg)](https://app.codecov.io/gh/SebKrantz/collapse?branch=master)
[![minimal R version](https://img.shields.io/badge/R%3E%3D-3.3.0-6666ff.svg)](https://cran.r-project.org/)
[![minimal R version](https://img.shields.io/badge/R%3E%3D-3.4.0-6666ff.svg)](https://cran.r-project.org/)
[![dependencies](https://tinyverse.netlify.app/badge/collapse)](https://CRAN.R-project.org/package=collapse)
[![DOI](https://zenodo.org/badge/172910283.svg)](https://zenodo.org/badge/latestdoi/172910283)
[![arXiv](https://img.shields.io/badge/arXiv-2403.05038-0969DA.svg)](https://arxiv.org/abs/2403.05038)
Expand Down
1 change: 1 addition & 0 deletions src/ExportSymbols.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ void R_init_collapse(DllInfo *dll) {
R_RegisterCCallable("collapse", "cp_range", (DL_FUNC) &frange); // frange()
R_RegisterCCallable("collapse", "cp_dist", (DL_FUNC) &fdist); // fdist()
R_RegisterCCallable("collapse", "cp_quantile", (DL_FUNC) &fquantileC); // .quantile()
R_RegisterCCallable("collapse", "cp_match", (DL_FUNC) &fmatchC); // fmatch()
R_RegisterCCallable("collapse", "cp_group", (DL_FUNC) &groupVec); // group(): main hash-based grouping function: for atomic vectors and data frames
R_RegisterCCallable("collapse", "cp_group_at", (DL_FUNC) &groupAtVec); // qG(.., sort = FALSE): same but only works with atomic vectors and has option to keep missing values
R_RegisterCCallable("collapse", "cp_unique", (DL_FUNC) &funiqueC); // funique.default()
Expand Down
2 changes: 1 addition & 1 deletion src/base_radixsort.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ static void savetl(SEXP s)
#undef warning
// since it can be turned to error via warn = 2
#define warning(...) Do not use warning in this file
/* use malloc/realloc (not Calloc/Realloc) so we can trap errors
/* use malloc/realloc (not R_Calloc/R_Realloc) so we can trap errors
and call savetl_end() before the error(). */

static void growstack(uint64_t newlen)
Expand Down
1 change: 1 addition & 0 deletions src/base_radixsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define SET_TRLEN(x, v) SET_STDVEC_TRUELENGTH(x, ((int) (v)))

#define MYLEV(x) (((SEXPREC_partial *)(x))->sxpinfo.gp)
#define IS_UTF8(x) (MYLEV(x) & 8)
#define IS_ASCII(x) (MYLEV(x) & 64) // from data.table.h

#define SETTOF(x,v) ((((SEXPREC_partial *)(x))->sxpinfo.type)=(v))
Expand Down
6 changes: 6 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
#define SEXPPTR(x) ((SEXP *)DATAPTR(x)) // to avoid overhead of looped VECTOR_ELT
#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped VECTOR_ELT

// Needed for match.c and join.c
#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s))
#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))

// for use with bit64::integer64
#define NA_INTEGER64 INT64_MIN
#define MAX_INTEGER64 INT64_MAX
Expand Down Expand Up @@ -52,6 +56,8 @@ extern size_t sizes[100]; // max appears to be FUNSXP = 99, see Rinternals.h
extern size_t typeorder[100];

// data.table_utils.c
int need2utf8(SEXP x);
SEXP coerceUtf8IfNeeded(SEXP x);
SEXP setnames(SEXP x, SEXP nam);
bool allNA(SEXP x, bool errorForBadType);
SEXP allNAv(SEXP x, SEXP errorForBadType);
Expand Down
34 changes: 30 additions & 4 deletions src/data.table_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,32 @@
#include "data.table.h"


int need2utf8(SEXP x) {
const int xlen = length(x);
const SEXP *xd = STRING_PTR_RO(x);
// for (int i=0; i<xlen; i++) {
// if (NEED2UTF8(xd[i]))
// return(true);
// }
// return(false);
if (xlen <= 1) return xlen == 1 ? NEED2UTF8(xd[0]) : 0;
return NEED2UTF8(xd[0]) || NEED2UTF8(xd[xlen/2]) || NEED2UTF8(xd[xlen-1]);
}

SEXP coerceUtf8IfNeeded(SEXP x) {
if (!need2utf8(x))
return(x);
const int xlen = length(x);
SEXP ans = PROTECT(allocVector(STRSXP, xlen));
const SEXP *xd = STRING_PTR_RO(x);
for (int i=0; i<xlen; i++) {
SET_STRING_ELT(ans, i, ENC2UTF8(xd[i]));
}
UNPROTECT(1);
return(ans);
}


SEXP setnames(SEXP x, SEXP nam) {
if(TYPEOF(nam) != STRSXP) error("names need to be character typed");
if(INHERITS(x, char_datatable)) {
Expand Down Expand Up @@ -284,17 +310,17 @@ SEXP setcolorder(SEXP x, SEXP o) {

// Double-check here at C level that o[] is a strict permutation of 1:ncol. Reordering columns by reference makes no
// difference to generations/refcnt so we can write behind barrier in this very special case of strict permutation.
bool *seen = Calloc(ncol, bool);
bool *seen = R_Calloc(ncol, bool);
for (int i=0; i != ncol; ++i) {
if (od[i]==NA_INTEGER || od[i]<1 || od[i]>ncol)
error("Internal error: o passed to Csetcolorder contains an NA or out-of-bounds"); // # nocov
if (seen[od[i]-1])
error("Internal error: o passed to Csetcolorder contains a duplicate"); // # nocov
seen[od[i]-1] = true;
}
Free(seen);
R_Free(seen);

SEXP *tmp = Calloc(ncol, SEXP), *namesd = SEXPPTR(names);
SEXP *tmp = R_Calloc(ncol, SEXP), *namesd = SEXPPTR(names);
const SEXP *xd = SEXPPTR_RO(x);
for (int i=0; i != ncol; ++i) tmp[i] = xd[od[i]-1];
for (int i=0; i != ncol; ++i) SET_VECTOR_ELT(x, i, tmp[i]);
Expand All @@ -304,7 +330,7 @@ SEXP setcolorder(SEXP x, SEXP o) {
for (int i=0; i != ncol; ++i) tmp[i] = namesd[od[i]-1];
memcpy(namesd, tmp, ncol*sizeof(SEXP));
// No need to change key (if any); sorted attribute is column names not positions
Free(tmp);
R_Free(tmp);
return(R_NilValue);
}

Expand Down
16 changes: 8 additions & 8 deletions src/fcumsum.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ void fcumsum_double_impl(double *pout, double *px, int ng, int *pg, int narm, in
}
}
} else {
double *last = (double*)Calloc(ng+1, double); // Also pass pointer to function ??
double *last = (double*)R_Calloc(ng+1, double); // Also pass pointer to function ??
if(narm <= 0) {
for(int i = 0; i != l; ++i) last[pg[i]] = pout[i] = last[pg[i]] + px[i];
} else if(fill) {
Expand All @@ -27,7 +27,7 @@ void fcumsum_double_impl(double *pout, double *px, int ng, int *pg, int narm, in
else last[pg[i]] = pout[i] = last[pg[i]] + px[i];
}
}
Free(last);
R_Free(last);
}
}

Expand All @@ -50,7 +50,7 @@ void fcumsum_double_impl_order(double *pout, double *px, int ng, int *pg, int *p
}
}
} else {
double *last = (double*)Calloc(ng+1, double); // Also pass pointer to function ??
double *last = (double*)R_Calloc(ng+1, double); // Also pass pointer to function ??
if(narm <= 0) {
for(int i = 0, poi; i != l; ++i) {
poi = po[i]-1;
Expand All @@ -68,7 +68,7 @@ void fcumsum_double_impl_order(double *pout, double *px, int ng, int *pg, int *p
else last[pg[poi]] = pout[poi] = last[pg[poi]] + px[poi];
}
}
Free(last);
R_Free(last);
}
}

Expand Down Expand Up @@ -105,7 +105,7 @@ void fcumsum_int_impl(int *pout, int *px, int ng, int *pg, int narm, int fill, i
if(ckof > INT_MAX || ckof <= INT_MIN)
error("Integer overflow. Integers in R are bounded between 2,147,483,647 and -2,147,483,647. Use fcumsum(as.numeric(x)).");
} else {
int *last = (int*)Calloc(ng+1, int); // Also pass pointer to function ??
int *last = (int*)R_Calloc(ng+1, int); // Also pass pointer to function ??
if(narm <= 0) {
for(int i = 0, lsi; i != l; ++i) {
if(px[i] == NA_INTEGER) {
Expand Down Expand Up @@ -139,7 +139,7 @@ void fcumsum_int_impl(int *pout, int *px, int ng, int *pg, int narm, int fill, i
}
}
}
Free(last);
R_Free(last);
}
}

Expand Down Expand Up @@ -178,7 +178,7 @@ void fcumsum_int_impl_order(int *pout, int *px, int ng, int *pg, int *po, int na
if(ckof > INT_MAX || ckof <= INT_MIN)
error("Integer overflow. Integers in R are bounded between 2,147,483,647 and -2,147,483,647. Use fcumsum(as.numeric(x)).");
} else {
int *last = (int*)Calloc(ng+1, int); // Also pass pointer to function ??
int *last = (int*)R_Calloc(ng+1, int); // Also pass pointer to function ??
if(narm <= 0) {
for(int i = 0, poi, lsi; i != l; ++i) {
poi = po[i]-1;
Expand Down Expand Up @@ -215,7 +215,7 @@ void fcumsum_int_impl_order(int *pout, int *px, int ng, int *pg, int *po, int na
}
}
}
Free(last);
R_Free(last);
}
}

Expand Down
12 changes: 6 additions & 6 deletions src/fmean.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ double fmean_double_omp_impl(const double *restrict px, const int narm, const in
void fmean_double_g_impl(double *restrict pout, const double *restrict px, const int ng, const int *restrict pg, const int *restrict pgs, const int narm, const int l) {
memset(pout, 0, sizeof(double) * ng);
if(narm) {
int *restrict n = (int*)Calloc(ng, int);
int *restrict n = (int*)R_Calloc(ng, int);
for(int i = 0, gi; i != l; ++i) {
if(ISNAN(px[i])) continue;
gi = pg[i]-1;
Expand All @@ -61,7 +61,7 @@ void fmean_double_g_impl(double *restrict pout, const double *restrict px, const
if(n[i] == 0) pout[i] = NA_REAL;
else pout[i] /= n[i];
}
Free(n);
R_Free(n);
} else {
--pout;
for(int i = l; i--; ) pout[pg[i]] += px[i]; // Used to stop loop when all groups passed with NA, but probably no speed gain since groups are mostly ordered.
Expand Down Expand Up @@ -121,7 +121,7 @@ double fmean_weights_omp_impl(const double *restrict px, const double *restrict
}

void fmean_weights_g_impl(double *restrict pout, const double *restrict px, const int ng, const int *restrict pg, const double *restrict pw, const int narm, const int l) {
double *restrict sumw = (double*)Calloc(ng, double);
double *restrict sumw = (double*)R_Calloc(ng, double);
memset(pout, 0, sizeof(double) * ng);
if(narm) {
for(int i = 0, gi; i != l; ++i) {
Expand All @@ -142,7 +142,7 @@ void fmean_weights_g_impl(double *restrict pout, const double *restrict px, cons
}
for(int i = ng; i--; ) pout[i] /= sumw[i];
}
Free(sumw);
R_Free(sumw);
}

double fmean_int_impl(const int *restrict px, const int narm, const int l) {
Expand Down Expand Up @@ -194,7 +194,7 @@ double fmean_int_omp_impl(const int *restrict px, const int narm, const int l, c
void fmean_int_g_impl(double *restrict pout, const int *restrict px, const int ng, const int *restrict pg, const int *restrict pgs, const int narm, const int l) {
memset(pout, 0, sizeof(double) * ng);
if(narm) {
int *restrict n = (int*)Calloc(ng, int);
int *restrict n = (int*)R_Calloc(ng, int);
for(int i = 0, gi; i != l; ++i) {
if(px[i] == NA_INTEGER) continue;
gi = pg[i]-1;
Expand All @@ -205,7 +205,7 @@ void fmean_int_g_impl(double *restrict pout, const int *restrict px, const int n
if(n[i] == 0) pout[i] = NA_REAL;
else pout[i] /= n[i];
}
Free(n);
R_Free(n);
} else {
--pout;
for(int i = l; i--; ) pout[pg[i]] += px[i]; // Used to stop loop when all groups passed with NA, but probably no speed gain since groups are mostly ordered.
Expand Down
Loading

0 comments on commit 29a5875

Please sign in to comment.