From 5537ba7193c987c189f9f8f9784078cd7cb23d68 Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Sat, 7 Dec 2024 23:12:39 +0100 Subject: [PATCH 1/4] Postpone the problem. --- src/R_Defn.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ src/assign.c | 70 +++++++++++++++++++-------------------- src/chmatch.c | 28 ++++++++-------- src/data.table.h | 11 ++++--- src/dogroups.c | 40 +++++++++++----------- src/fmelt.c | 8 ++--- src/forder.c | 26 +++++++-------- src/freadR.c | 12 +++---- src/init.c | 2 +- src/rbindlist.c | 46 +++++++++++++------------- src/subset.c | 8 ++--- src/utils.c | 10 +++--- 12 files changed, 217 insertions(+), 130 deletions(-) create mode 100644 src/R_Defn.h diff --git a/src/R_Defn.h b/src/R_Defn.h new file mode 100644 index 0000000000..41726c6b4e --- /dev/null +++ b/src/R_Defn.h @@ -0,0 +1,86 @@ +#include +#include + +// NOTE: All of this is copied from Defn.h: https://github.com/wch/r-source/blob/28de75af0541f93832c5899139b969d290bf422e/src/include/Defn.h +// We intend to gradually remove the need for this header file + +// Writable vector/string pointer +#define SEXPPTR(x) ((SEXP *)DATAPTR(x)) + +#ifndef NAMED_BITS +# define NAMED_BITS 16 +#endif + +#ifndef SEXPREC_HEADER + +// https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L123 +struct sxpinfo_struct { + SEXPTYPE type : TYPE_BITS; + /* ==> (FUNSXP == 99) %% 2^5 == 3 == CLOSXP + * -> warning: `type' is narrower than values + * of its type + * when SEXPTYPE was an enum */ + unsigned int scalar: 1; + unsigned int obj : 1; + unsigned int alt : 1; + unsigned int gp : 16; + unsigned int mark : 1; + unsigned int debug : 1; + unsigned int trace : 1; /* functions and memory tracing */ + unsigned int spare : 1; /* used on closures and when REFCNT is defined */ + unsigned int gcgen : 1; /* old generation number */ + unsigned int gccls : 3; /* node class */ + unsigned int named : NAMED_BITS; + unsigned int extra : 32 - NAMED_BITS; /* used for immediate bindings */ +}; /* Tot: 64 */ + +// https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L190 +#define SEXPREC_HEADER \ + struct sxpinfo_struct sxpinfo; \ + struct SEXPREC *attrib; \ + struct SEXPREC *gengc_next_node, *gengc_prev_node + +// https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L143 +struct vecsxp_struct { + R_xlen_t length; + R_xlen_t truelength; +}; + +// https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L214 +typedef struct VECTOR_SEXPREC { + SEXPREC_HEADER; + struct vecsxp_struct vecsxp; +} VECTOR_SEXPREC, *VECSEXP; + +#endif + +// https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L197 +typedef struct { + SEXPREC_HEADER; +} SEXPREC_partial; + +// (SET_)TRULENGTH: https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L386 +#define STDVEC_TRUELENGTH(x) (((VECSEXP) (x))->vecsxp.truelength) +// No method to set ALTREP_TRUELENGTH (gives error): https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L388 +#define SET_TRULEN(x, v) (STDVEC_TRUELENGTH(x)=(v)) +// ALTREP_TRUELENGTH is 0: https://github.com/wch/r-source/blob/48f06c1071fea6a6e7e365ad3d745217268e2175/src/main/altrep.c#L345 +#define TRULEN(x) (ALTREP(x) ? 0 : STDVEC_TRUELENGTH(x)) + +// SETLENGTH: https://github.com/wch/r-source/blob/05bb18266d49e87f2477120ecb0ab1440f4e9b40/src/include/Defn.h#L385 +#define STDVEC_LENGTH(x) (((VECSEXP) (x))->vecsxp.length) +#define SETSCAL(x, v) ((((SEXPREC_partial *)(x))->sxpinfo.scalar) = (v)) +#define SET_STDVEC_LENGTH(x,v) do { \ + SEXP __x__ = (x); \ + R_xlen_t __v__ = (v); \ + STDVEC_LENGTH(__x__) = __v__; \ + SETSCAL(__x__, __v__ == 1 ? 1 : 0); \ +} while (0) +// https://github.com/wch/r-source/blob/05bb18266d49e87f2477120ecb0ab1440f4e9b40/src/main/memory.c#L4072 +#define SET_LEN(x, v) SET_STDVEC_LENGTH((x), (v)) + +// LEVELS: https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L228 +#define LEVLS(x) (((SEXPREC_partial *)(x))->sxpinfo.gp) + +// SET_GROWABLE_BIT: https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L374 +#define GROWBLE_MASK ((unsigned short)(1<<5)) +#define SET_GROWBLE_BIT(x) (LEVLS(x) |= GROWBLE_MASK) diff --git a/src/assign.c b/src/assign.c index b280c2259f..2009dde346 100644 --- a/src/assign.c +++ b/src/assign.c @@ -8,7 +8,7 @@ static void finalizer(SEXP p) p = R_ExternalPtrTag(p); if (!isString(p)) internal_error(__func__, "ExternalPtr doesn't see names in tag"); // # nocov l = LENGTH(p); - tl = TRUELENGTH(p); + tl = TRULEN(p); if (l<0 || tll) ? n : l); // e.g. test 848 and 851 in R > 3.0.2 // added (n>l) ? ... for #970, see test 1481. // TO DO: test realloc names if selfrefnamesok (users can setattr(x,"name") themselves for example. - // if (TRUELENGTH(getAttrib(dt,R_NamesSymbol))!=tl) - // internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRUELENGTH(getAttrib(dt,R_NamesSymbol))); // # nocov + // if (TRULEN(getAttrib(dt,R_NamesSymbol))!=tl) + // internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRULEN(getAttrib(dt,R_NamesSymbol))); // # nocov - tl = TRUELENGTH(dt); + tl = TRULEN(dt); // R <= 2.13.2 and we didn't catch uninitialized tl somehow if (tl<0) internal_error(__func__, "tl of class is marked but tl<0"); // # nocov if (tl>0 && tl0) { savetl(s); } else if (tl<0) { // # nocov start - for (int j=0; j=0) { if (!sourceIsFactor && s==NA_STRING) continue; // don't create NA factor level when assigning character to factor; test 2117 if (tl>0) savetl(s); - SET_TRUELENGTH(s, -nTargetLevels-(++nAdd)); + SET_TRULEN(s, -nTargetLevels-(++nAdd)); } // else, when sourceIsString, it's normal for there to be duplicates here } const int nSource = length(source); @@ -858,34 +858,34 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con const int *sourceD = INTEGER(source); for (int i=0; i0) { savetl(s); // R's internal hash (which is positive); save it - SET_TRUELENGTH(s,0); + SET_TRULEN(s,0); } else if (tl<0) { // R 2.14.0+ initializes truelength to 0 (before that it was uninitialized/random). // Now that data.table depends on R 3.1.0+, that is after 2.14.0 too. @@ -75,13 +75,13 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } - if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table + if (tl==0) SET_TRULEN(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } // in future if we need NAs in x not to be matched to NAs in table ... - // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0) - // SET_TRUELENGTH(NA_STRING, 0); + // if (!matchNAtoNA && TRULEN(NA_STRING)<0) + // SET_TRULEN(NA_STRING, 0); if (chmatchdup) { // chmatchdup() is basically base::pmatch() but without the partial matching part. For example : // chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' @@ -100,21 +100,21 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch if (!counts || !map) { // # nocov start free(counts); free(map); - for (int i=0; i #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT #ifndef STRING_PTR_RO -#define STRING_PTR_RO STRING_PTR +#define STRING_PTR_RO SEXPPTR #endif #include // for uint64_t rather than unsigned long long #include // for va_list, va_start #include #include "types.h" #include "po.h" +#include "R_Defn.h" #ifdef WIN32 // positional specifiers (%n$) used in translations; #4402 # define snprintf dt_win_snprintf // see our snprintf.c; tried and failed to link to _sprintf_p on Windows #endif @@ -37,7 +38,7 @@ /* we mean the encoding bits, not CE_NATIVE in a UTF-8 locale */ #define IS_UTF8(x) (getCharCE(x) == CE_UTF8) #define IS_LATIN(x) (getCharCE(x) == CE_LATIN1) -#define IS_ASCII(x) (LEVELS(x) & 64) // API expected in R >= 4.5 +#define IS_ASCII(x) (LEVLS(x) & 64) // API expected in R >= 4.5 #define IS_TRUE(x) (TYPEOF(x)==LGLSXP && LENGTH(x)==1 && LOGICAL(x)[0]==TRUE) #define IS_FALSE(x) (TYPEOF(x)==LGLSXP && LENGTH(x)==1 && LOGICAL(x)[0]==FALSE) #define IS_TRUE_OR_FALSE(x) (TYPEOF(x)==LGLSXP && LENGTH(x)==1 && LOGICAL(x)[0]!=NA_LOGICAL) diff --git a/src/dogroups.c b/src/dogroups.c index cc7554c3e3..e1ef81eed6 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -46,9 +46,9 @@ static bool anySpecialStatic(SEXP x) { if (n==0) return false; if (isVectorAtomic(x)) - return ALTREP(x) || TRUELENGTH(x)<0; + return ALTREP(x) || TRULEN(x)<0; if (isNewList(x)) { - if (TRUELENGTH(x)<0) + if (TRULEN(x)<0) return true; // test 2158 for (int i=0; i maxGrpSize) maxGrpSize = ilens[i]; } defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++; - SET_TRUELENGTH(I, -maxGrpSize); // marker for anySpecialStatic(); see its comments + SET_TRULEN(I, -maxGrpSize); // marker for anySpecialStatic(); see its comments R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" @@ -149,7 +149,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX nameSyms[i] = install(CHAR(STRING_ELT(names, i))); // fixes http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by copyMostAttrib(VECTOR_ELT(dt,INTEGER(dtcols)[i]-1), this); // not names, otherwise test 778 would fail - SET_TRUELENGTH(this, -maxGrpSize); // marker for anySpecialStatic(); see its comments + SET_TRULEN(this, -maxGrpSize); // marker for anySpecialStatic(); see its comments } SEXP xknames = PROTECT(getAttrib(xSD, R_NamesSymbol)); nprotect++; @@ -194,7 +194,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch for (int j=0; j0; diff --git a/src/fmelt.c b/src/fmelt.c index 59e82455b4..deabafbbd1 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -414,10 +414,10 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType for (int k=0; k0) savetl(s); - SET_TRUELENGTH(s,-(++nlevel)); + SET_TRULEN(s,-(++nlevel)); levelsRaw[nlevel-1] = s; } } @@ -425,11 +425,11 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType if (targetd[i]==NA_STRING) { *ansd++ = NA_INTEGER; } else { - int tl = TRUELENGTH(targetd[i]); + int tl = TRULEN(targetd[i]); *ansd++ = tl<0 ? -tl : NA_INTEGER; } } - for (int i=0; i=0) { // another thread may have set it while I was waiting, so check it again - if (TRUELENGTH(s)>0) // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore + if (TRULEN(s)>=0) { // another thread may have set it while I was waiting, so check it again + if (TRULEN(s)>0) // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much. // now save unique SEXP in ustr so i) we can loop through them afterwards and reset TRUELENGTH to 0 and ii) sort uniques when sorting too if (ustr_alloc<=ustr_n) { @@ -320,7 +320,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP)); // # nocov } ustr[ustr_n++] = s; - SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group + SET_TRULEN(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one @@ -351,23 +351,23 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max for (int i=0; iustr_maxlen) ustr_maxlen=LENGTH(s); - if (TRUELENGTH(s)>0) savetl(s); + if (TRULEN(s)>0) savetl(s); } cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8 - SET_TRUELENGTH(ustr3[0], -1); + SET_TRULEN(ustr3[0], -1); int o = -1; for (int i=1; i0) savetl(s); + if (TRULEN(s)<0) continue; // seen this name before + if (TRULEN(s)>0) savetl(s); uniq[nuniq++] = s; - SET_TRUELENGTH(s,-nuniq); + SET_TRULEN(s,-nuniq); } } if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare @@ -99,7 +99,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector if (!counts || !maxdup) { // # nocov start - for (int i=0; i maxdup[u]) maxdup[u] = counts[u]; @@ -134,7 +134,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc if (!colMapRaw || !uniqMap || !dupLink) { // # nocov start - for (int i=0; i0) savetl(s); + if (TRULEN(s)>0) savetl(s); levelsRaw[k] = s; - SET_TRUELENGTH(s,-k-1); + SET_TRULEN(s,-k-1); } for (int i=0; i=last) { // if tl>=0 then also tl>=last because last<=0 if (tl>=0) { snprintf(warnStr, 1000, // not direct warning as we're inside tl region @@ -438,8 +438,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor for (int k=0; k0) savetl(s); + TRULEN(s)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns + if (TRULEN(s)>0) savetl(s); if (allocLevel==nLevel) { // including initial time when allocLevel==nLevel==0 SEXP *tt = NULL; if (allocLevel Date: Sun, 8 Dec 2024 00:39:15 +0100 Subject: [PATCH 2/4] Performance boost by replacing TRUELENGTH -> STDVEC_TRUELENGTH for strings (no ALTREP strings + algorithms wouldn't work anymore). --- src/chmatch.c | 14 +++++++------- src/forder.c | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/chmatch.c b/src/chmatch.c index 6c7fc8a813..99c70d3e71 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -58,7 +58,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch savetl_init(); for (int i=0; i0) { savetl(s); // R's internal hash (which is positive); save it SET_TRULEN(s,0); @@ -75,7 +75,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } if (tl==0) SET_TRULEN(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } @@ -105,12 +105,12 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch error(_("Failed to allocate %"PRIu64" bytes working memory in chmatchdup: length(table)=%d length(unique(table))=%d"), ((uint64_t)tablelen*2+nuniq)*sizeof(int), tablelen, nuniq); // # nocov end } - for (int i=0; i=0) { // another thread may have set it while I was waiting, so check it again - if (TRULEN(s)>0) // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore + if (STDVEC_TRUELENGTH(s)>=0) { // another thread may have set it while I was waiting, so check it again + if (STDVEC_TRUELENGTH(s)>0) // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much. // now save unique SEXP in ustr so i) we can loop through them afterwards and reset TRUELENGTH to 0 and ii) sort uniques when sorting too if (ustr_alloc<=ustr_n) { @@ -351,7 +351,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max for (int i=0; iustr_maxlen) ustr_maxlen=LENGTH(s); - if (TRULEN(s)>0) savetl(s); + if (STDVEC_TRUELENGTH(s)>0) savetl(s); } cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8 SET_TRULEN(ustr3[0], -1); @@ -365,7 +365,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max if (!tl) STOP(_("Failed to alloc tl when converting strings to UTF8")); // # nocov const SEXP *tt = STRING_PTR_RO(ustr2); - for (int i=0; i Date: Sun, 8 Dec 2024 00:51:46 +0100 Subject: [PATCH 3/4] Hard to resist that one... --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7447fed2a2..17ff723b4d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -100,5 +100,6 @@ Authors@R: c( person("Vincent", "Rocher", role="ctb"), person("Vijay", "Lulla", role="ctb"), person("Aljaž", "Sluga", role="ctb"), - person("Bill", "Evans", role="ctb") + person("Bill", "Evans", role="ctb"), + person("Sebastian", "Krantz", role="ctb") ) From 9cecde1deba67769c57af5c31846a402cc6f5317 Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Sun, 8 Dec 2024 19:11:27 +0100 Subject: [PATCH 4/4] Better. --- src/R_Defn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/R_Defn.h b/src/R_Defn.h index 41726c6b4e..111c6d04a4 100644 --- a/src/R_Defn.h +++ b/src/R_Defn.h @@ -76,7 +76,7 @@ typedef struct { SETSCAL(__x__, __v__ == 1 ? 1 : 0); \ } while (0) // https://github.com/wch/r-source/blob/05bb18266d49e87f2477120ecb0ab1440f4e9b40/src/main/memory.c#L4072 -#define SET_LEN(x, v) SET_STDVEC_LENGTH((x), (v)) +#define SET_LEN SET_STDVEC_LENGTH // LEVELS: https://github.com/wch/r-source/blob/2640a203d13473f95c9c7508eb2976fefb5c931c/src/include/Defn.h#L228 #define LEVLS(x) (((SEXPREC_partial *)(x))->sxpinfo.gp)