From 421170eb1a69da0fa6f745830f5493483ebd48e6 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 28 Dec 2024 14:16:27 +0300 Subject: [PATCH 1/7] Introduce the growable vector API --- src/data.table.h | 8 ++++++++ src/growable.c | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 src/growable.c diff --git a/src/data.table.h b/src/data.table.h index 252f5e3b5..f9b8bbe68 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -298,6 +298,14 @@ void hash_set(hashtab *, SEXP key, R_xlen_t value); // Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound. R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound); +// growable.c +// Return a new vector of given type. Initially its xlength() is equal to size. Using growable_resize(), it can be increased to up to max_size. +SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size); +// Return the max_size of a growable vector. Behaviour is undefined if x was not allocated by growable_allocate. +R_xlen_t growable_max_size(SEXP x); +// Resize a growable vector to newsize. Will signal an error if newsize exceeds max_size. +void growable_resize(SEXP x, R_xlen_t newsize); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/growable.c b/src/growable.c new file mode 100644 index 000000000..05069fe80 --- /dev/null +++ b/src/growable.c @@ -0,0 +1,23 @@ +#include "data.table.h" + +SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size) { + SEXP ret = PROTECT(allocVector(type, max_size)); + SET_TRUELENGTH(ret, max_size); + SET_GROWABLE_BIT(ret); + SETLENGTH(ret, size); + UNPROTECT(1); + return ret; +} + +R_xlen_t growable_max_size(SEXP x) { + return TRUELENGTH(x); +} + +void growable_resize(SEXP x, R_xlen_t newsize) { + R_xlen_t max_size; + if (newsize > (max_size = growable_max_size(x))) internal_error( + __func__, "newsize=%g > max_size=%g", + (double)newsize, (double)max_size + ); + SETLENGTH(x, newsize); +} From 3b3185085119cc7262a47fc5680ce00868291192 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 28 Dec 2024 16:21:18 +0300 Subject: [PATCH 2/7] Use growable_* instead of SETLENGTH et al in fread --- src/dogroups.c | 2 +- src/freadR.c | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index c2f84e9f7..e5efb3a2a 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -523,7 +523,7 @@ SEXP growVector(SEXP x, const R_len_t newlen) SEXP newx; R_len_t len = length(x); if (isNull(x)) error(_("growVector passed NULL")); - PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here? + PROTECT(newx = growable_allocate(TYPEOF(x), newlen, newlen)); // may be shrunk later by fread if (newlen < len) len=newlen; // i.e. shrink switch (TYPEOF(x)) { case RAWSXP: memcpy(RAW(newx), RAW(x), len*SIZEOF(x)); break; diff --git a/src/freadR.c b/src/freadR.c index 4f94239ca..735708537 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -257,7 +257,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int if (typeSize[CT_BOOL8_N]!=1) internal_error(__func__, "typeSize[CT_BOOL8_N] != 1"); // # nocov if (typeSize[CT_STRING]!=8) internal_error(__func__, "typeSize[CT_STRING] != 1"); // # nocov colNamesSxp = R_NilValue; - SET_VECTOR_ELT(RCHK, 1, colNamesSxp=allocVector(STRSXP, ncol)); + SET_VECTOR_ELT(RCHK, 1, colNamesSxp=growable_allocate(STRSXP, ncol, ncol)); for (int i=0; i 0) && (newDT || TYPEOF(col) != typeSxp[type[i]] || oldIsInt64 != newIsInt64); int nrowChanged = (allocNrow != dtnrows); if (typeChanged || nrowChanged) { - SEXP thiscol = typeChanged ? allocVector(typeSxp[type[i]], allocNrow) // no need to PROTECT, passed immediately to SET_VECTOR_ELT, see R-exts 5.9.1 + SEXP thiscol = typeChanged ? growable_allocate(typeSxp[type[i]], allocNrow, allocNrow) // no need to PROTECT, passed immediately to SET_VECTOR_ELT, see R-exts 5.9.1 : growVector(col, allocNrow); SET_VECTOR_ELT(DT,resi,thiscol); if (type[i]==CT_INT64) { @@ -519,7 +519,6 @@ size_t allocateDT(int8_t *typeArg, int8_t *sizeArg, int ncolArg, int ndrop, size setAttrib(thiscol, sym_tzone, ScalarString(char_UTC)); // see news for v1.13.0 } - SET_TRUELENGTH(thiscol, allocNrow); DTbytes += SIZEOF(thiscol)*allocNrow; } resi++; @@ -536,9 +535,7 @@ void setFinalNrow(size_t nrow) { return; const int ncol=LENGTH(DT); for (int i=0; i Date: Sat, 28 Dec 2024 19:31:43 +0300 Subject: [PATCH 3/7] Switch shallow() to use growable_allocate() The resulting data.tables now have GROWABLE_BIT set, therefore: - the finalizer is not needed on R >= 3.4 - duplicates of data.tables (which are not over-allocated) now have TRUELENGTH of 0 instead of whatever it was before, which is detected earlier in selfrefok() As a result, assign.c only uses TRUELENGTH on R < 3.4. --- inst/tests/tests.Rraw | 6 ----- src/assign.c | 52 ++++++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 657478c61..8af4d920e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19303,12 +19303,6 @@ test(2290.3, DT[, `:=`(a, c := 3)], error="It looks like you re-used `:=` in arg # partially-named `:=`(...) --> different branch, same error test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in argument 2") -# segfault when selfref is not ok before set #6410 -df = data.frame(a=1:3) -setDT(df) -attr(df, "att") = 1 -test(2291.1, set(df, NULL, "new", "new"), error="attributes .* have been reassigned") - # ns-qualified bysub error, #6493 DT = data.table(a = 1) test(2292.1, DT[, .N, by=base::mget("a")], data.table(a = 1, N = 1L)) diff --git a/src/assign.c b/src/assign.c index a7b083cb8..c36b72d6d 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1,5 +1,6 @@ #include "data.table.h" +#if R_VERSION < R_Version(3,4,0) // not needed with GROWABLE_BIT static void finalizer(SEXP p) { SEXP x; @@ -22,6 +23,7 @@ static void finalizer(SEXP p) UNPROTECT(1); return; } +#endif void setselfref(SEXP x) { if(!INHERITS(x, char_datatable)) return; // #5286 @@ -38,7 +40,9 @@ void setselfref(SEXP x) { R_NilValue )) )); +#if R_VERSION < R_Version(3,4,0) // not needed with GROWABLE_BIT R_RegisterCFinalizerEx(p, finalizer, FALSE); +#endif UNPROTECT(2); /* @@ -126,15 +130,24 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) { tag = R_ExternalPtrTag(v); if (!(isNull(tag) || isString(tag))) internal_error(__func__, ".internal.selfref tag is neither NULL nor a character vector"); // # nocov names = getAttrib(x, R_NamesSymbol); - if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734 + // On R >= 3.4, either + // (1) we allocate the data.table and/or its names, so it has the GROWABLE_BIT set, so copies will have zero TRUELENGTH, or + // (2) someone else creates them from scratch, so (only using the API) will have zero TRUELENGTH. + // We then return false and either re-create the data.table from scratch or signal an error, so the current object having a zero TRUELENGTH is fine. + // R < 3.4 doesn't have the GROWABLE_BIT, so let's reset the TRUELENGTH just in case. +#if R_VERSION < R_Version(3,4,0) + if (names!=tag && isString(names)) SET_TRUELENGTH(names, LENGTH(names)); // R copied this vector not data.table; it's not actually over-allocated. It looks over-allocated // because R copies the original vector's tl over despite allocating length. +#endif prot = R_ExternalPtrProtected(v); if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")). return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r - if (x!=R_ExternalPtrAddr(prot) && !ALTREP(x)) +#if R_VERSION < R_Version(3,4,0) + if (x!=R_ExternalPtrAddr(prot)) SET_TRUELENGTH(x, LENGTH(x)); // R copied this vector not data.table, it's not actually over-allocated +#endif return checkNames ? names==tag : x==R_ExternalPtrAddr(prot); } @@ -151,7 +164,8 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) // called from alloccol where n is checked carefully, or from shallow() at R level // where n is set to truelength (i.e. a shallow copy only with no size change) int protecti=0; - SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? + const int l = isNull(cols) ? length(dt) : length(cols); + SEXP newdt = PROTECT(growable_allocate(VECSXP, l, n)); protecti++; // to do, use growVector here? SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It @@ -169,8 +183,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) setAttrib(newdt, sym_sorted, duplicate(sorted)); SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; - SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; - const int l = isNull(cols) ? LENGTH(dt) : length(cols); + SEXP newnames = PROTECT(growable_allocate(STRSXP, l, n)); protecti++; if (isNull(cols)) { for (int i=0; il) ? n : l); // e.g. test 848 and 851 in R > 3.0.2 // added (n>l) ? ... for #970, see test 1481. // TO DO: test realloc names if selfrefnamesok (users can setattr(x,"name") themselves for example. - // if (TRUELENGTH(getAttrib(dt,R_NamesSymbol))!=tl) - // internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRUELENGTH(getAttrib(dt,R_NamesSymbol))); // # nocov - tl = TRUELENGTH(dt); + tl = growable_max_size(dt); // R <= 2.13.2 and we didn't catch uninitialized tl somehow if (tl<0) internal_error(__func__, "tl of class is marked but tl<0"); // # nocov if (tl>0 && tl Date: Sat, 28 Dec 2024 20:22:09 +0300 Subject: [PATCH 4/7] Use growable_allocate() in subsetDT() --- src/subset.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/subset.c b/src/subset.c index c7653b893..bf9e0ac55 100644 --- a/src/subset.c +++ b/src/subset.c @@ -297,7 +297,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md } int overAlloc = checkOverAlloc(GetOption(install("datatable.alloccol"), R_NilValue)); - SEXP ans = PROTECT(allocVector(VECSXP, LENGTH(cols)+overAlloc)); nprotect++; // doing alloc.col directly here; eventually alloc.col can be deprecated. + SEXP ans = PROTECT(growable_allocate(VECSXP, LENGTH(cols), LENGTH(cols)+overAlloc)); nprotect++; // doing alloc.col directly here; eventually alloc.col can be deprecated. // user-defined and superclass attributes get copied as from v1.12.0 copyMostAttrib(x, ans); @@ -305,8 +305,6 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md // includes row.names (oddly, given other dims aren't) and "sorted" dealt with below // class is also copied here which retains superclass name in class vector as has been the case for many years; e.g. tests 1228.* for #64 - SET_TRUELENGTH(ans, LENGTH(ans)); - SETLENGTH(ans, LENGTH(cols)); int ansn; if (isNull(rows)) { ansn = nrow; @@ -329,9 +327,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md subsetVectorRaw(target, source, rows, anyNA); // parallel within column } } - SEXP tmp = PROTECT(allocVector(STRSXP, LENGTH(cols)+overAlloc)); nprotect++; - SET_TRUELENGTH(tmp, LENGTH(tmp)); - SETLENGTH(tmp, LENGTH(cols)); + SEXP tmp = PROTECT(growable_allocate(STRSXP, LENGTH(cols), LENGTH(cols)+overAlloc)); nprotect++; setAttrib(ans, R_NamesSymbol, tmp); subsetVectorRaw(tmp, getAttrib(x, R_NamesSymbol), cols, /*anyNA=*/false); From 06e7db71c45bd306acde52588b0f8f448a1574ca Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 28 Dec 2024 22:10:55 +0300 Subject: [PATCH 5/7] Use growable_* instead of SETLENGTH in dogroups() Since dogroups() relies on being able to shrink vectors it hasn't allocated, introduce make_growable() and is_growable() to adapt. Since dogroups() relies on SD and SDall having shared columns, use the setgrowable() wrapper on the R side at the time when SD and SDall are being created. (In the ALTREP case, setgrowable() will re-create the columns.) --- R/data.table.R | 2 ++ R/wrappers.R | 2 ++ src/data.table.h | 5 +++++ src/dogroups.c | 20 +++++++++----------- src/growable.c | 17 +++++++++++++++++ src/init.c | 1 + src/wrappers.c | 10 ++++++++++ 7 files changed, 46 insertions(+), 11 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index bac200b8a..0c0d1dfd3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1353,6 +1353,7 @@ replace_dot_alias = function(e) { } if (!with || missing(j)) return(ans) if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013 + setgrowable(ans) SDenv$.SDall = ans SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall SDenv$.N = nrow(ans) @@ -1591,6 +1592,7 @@ replace_dot_alias = function(e) { SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012 if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns] + setgrowable(SDenv$.SDall) SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall } if (nrow(SDenv$.SDall)==0L) { diff --git a/R/wrappers.R b/R/wrappers.R index 80b7a64e9..2a48c9017 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -21,3 +21,5 @@ fitsInInt32 = function(x) .Call(CfitsInInt32R, x) fitsInInt64 = function(x) .Call(CfitsInInt64R, x) coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy) + +setgrowable = function(x) .Call(Csetgrowable, x) diff --git a/src/data.table.h b/src/data.table.h index f9b8bbe68..48c2a68f9 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -305,6 +305,10 @@ SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size); R_xlen_t growable_max_size(SEXP x); // Resize a growable vector to newsize. Will signal an error if newsize exceeds max_size. void growable_resize(SEXP x, R_xlen_t newsize); +// Return TRUE if growable_resize(x) and growable_max_size(x) are valid operations. +Rboolean is_growable(SEXP x); +// Transform x into a growable vector. The return value must be reprotected in place of x. What happens to x is deliberately not specified, but no copying occurs. +SEXP make_growable(SEXP x); // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 @@ -379,4 +383,5 @@ SEXP dt_has_zlib(void); SEXP startsWithAny(SEXP, SEXP, SEXP); SEXP convertDate(SEXP, SEXP); SEXP fastmean(SEXP); +SEXP setgrowable(SEXP x); diff --git a/src/dogroups.c b/src/dogroups.c index e5efb3a2a..db32b9906 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -124,7 +124,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX for (R_len_t i=0; i maxGrpSize) maxGrpSize = ilens[i]; } - defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++; + defineVar(install(".I"), I = PROTECT(growable_allocate(INTSXP, maxGrpSize, maxGrpSize)), env); nprotect++; hash_set(specials, I, -maxGrpSize); // marker for anySpecialStatic(); see its comments R_LockBinding(install(".I"), env); @@ -197,7 +197,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch for (int j=0; j0; diff --git a/src/growable.c b/src/growable.c index 05069fe80..6bba73faf 100644 --- a/src/growable.c +++ b/src/growable.c @@ -3,7 +3,9 @@ SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size) { SEXP ret = PROTECT(allocVector(type, max_size)); SET_TRUELENGTH(ret, max_size); +#if R_VERSION >= R_Version(3, 4, 0) SET_GROWABLE_BIT(ret); +#endif // otherwise perceived memory use will end up higher SETLENGTH(ret, size); UNPROTECT(1); return ret; @@ -21,3 +23,18 @@ void growable_resize(SEXP x, R_xlen_t newsize) { ); SETLENGTH(x, newsize); } + +Rboolean is_growable(SEXP x) { + return isVector(x) && TRUELENGTH(x) >= XLENGTH(x) +#if R_VERSION >= R_Version(3, 4, 0) + && IS_GROWABLE(x) +#endif + ; +} + +// Assuming no ALTREP for now +SEXP make_growable(SEXP x) { + if (TRUELENGTH(x) < XLENGTH(x)) SET_TRUELENGTH(x, XLENGTH(x)); + SET_GROWABLE_BIT(x); + return x; +} diff --git a/src/init.c b/src/init.c index 204dc1088..e87b6ee03 100644 --- a/src/init.c +++ b/src/init.c @@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = { {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, +{"Csetgrowable", (DL_FUNC)&setgrowable, -1}, {NULL, NULL, 0} }; diff --git a/src/wrappers.c b/src/wrappers.c index 6587caa97..fb698bce0 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -124,3 +124,13 @@ SEXP warn_matrix_column_r(SEXP i) { warn_matrix_column(INTEGER(i)[0]); return R_NilValue; } + +SEXP setgrowable(SEXP x) { + for (R_xlen_t i = 0; i < xlength(x); ++i) { + SEXP this = VECTOR_ELT(x, i); + // relying on the rest of data.table machinery to avoid the need for resizing + if (!is_growable(this) && !ALTREP(this)) + SET_VECTOR_ELT(x, i, make_growable(this)); + } + return R_NilValue; +} From 7b9b1417208b669f7c92919f4de8ef4179a2f38d Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 29 Dec 2024 01:06:28 +0300 Subject: [PATCH 6/7] Mark internal_error*() as NORET This helps avoid false positive warnings about unreachable places in the code. --- src/data.table.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 48c2a68f9..613e74f81 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -143,7 +143,7 @@ uint64_t dtwiddle(double x); SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg); // reuseSorting wrapper to forder int getNumericRounding_C(void); -void internal_error_with_cleanup(const char *call_name, const char *format, ...); +NORET void internal_error_with_cleanup(const char *call_name, const char *format, ...); // reorder.c SEXP reorder(SEXP x, SEXP order); @@ -259,7 +259,7 @@ SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg); -void internal_error(const char *call_name, const char *format, ...); +NORET void internal_error(const char *call_name, const char *format, ...); // types.c char *end(char *start); From 61fdc064c2bda8a4b96074588c423c8d395037c3 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 29 Dec 2024 11:10:52 +0300 Subject: [PATCH 7/7] ALTREP implementation of growable_* Currently broken: - 1 errors out of 11637 - won't work with expression vectors at all --- src/assign.c | 4 +- src/data.table.h | 6 ++ src/dogroups.c | 5 +- src/growable.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++- src/init.c | 4 + src/reorder.c | 6 ++ src/utils.c | 11 ++- src/wrappers.c | 16 +++- 8 files changed, 280 insertions(+), 10 deletions(-) diff --git a/src/assign.c b/src/assign.c index c36b72d6d..4be306251 100644 --- a/src/assign.c +++ b/src/assign.c @@ -323,7 +323,7 @@ SEXP shallowwrapper(SEXP dt, SEXP cols) { } SEXP truelength(SEXP x) { - return ScalarInteger(isNull(x) ? 0 : growable_max_size(x)); + return ScalarInteger(is_growable(x) ? growable_max_size(x) : 0); } SEXP selfrefokwrapper(SEXP x, SEXP verbose) { @@ -520,7 +520,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // modify DT by reference. Other than if new columns are being added and the allocVec() fails with // out-of-memory. In that case the user will receive hard halt and know to rerun. if (length(newcolnames)) { - oldtncol = growable_max_size(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more. + oldtncol = is_growable(dt) ? growable_max_size(dt) : 0; // TO DO: oldtncol can be just called tl now, as we won't realloc here any more. if (oldtncol= R_Version(4, 3, 0) +# define USE_GROWABLE_ALTREP +#endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT #include // for uint64_t rather than unsigned long long @@ -309,6 +312,9 @@ void growable_resize(SEXP x, R_xlen_t newsize); Rboolean is_growable(SEXP x); // Transform x into a growable vector. The return value must be reprotected in place of x. What happens to x is deliberately not specified, but no copying occurs. SEXP make_growable(SEXP x); +#if R_VERSION >= R_Version(4, 3, 0) +void register_altrep_classes(DllInfo*); +#endif // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 diff --git a/src/dogroups.c b/src/dogroups.c index db32b9906..9c8c62d96 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -45,11 +45,10 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { // (see data.table.h), and isNewList() is true for NULL if (n==0) return false; + if (hash_lookup(specials, x, 0)<0) return true; // test 2158 if (isVectorAtomic(x)) - return ALTREP(x) || hash_lookup(specials, x, 0)<0; + return ALTREP(x); // see test 2156: ALTREP is a source of sharing we can't trace reliably if (isNewList(x)) { - if (hash_lookup(specials, x, 0)<0) - return true; // test 2158 for (int i=0; i + +static R_altrep_class_t dta_grow_string, dta_grow_integer, dta_grow_logical, dta_grow_real, dta_grow_complex, dta_grow_raw, dta_grow_list; +static Rcomplex NA_COMPLEX = { 0, }; + +/* +ALTREP class layout: +data1 = underlying vector +data2 = its current length stored as a length-1 REALSXP +Unless we implement an Unserialize method, this can be changed any time. +Classes have been released on CRAN with a Serialized_state/Unserialize pair will have to stay as they have been defined in order to keep *.rds files readable. +*/ + +static R_xlen_t altall_Length(SEXP x) { + return (R_xlen_t)REAL(R_altrep_data2(x))[0]; +} + +#define make_inspect_method(classname) \ + static Rboolean alt##classname##_Inspect( \ + SEXP x, int pre, int deep, int pvec, \ + void (*inspect_subtree)(SEXP x, int pre, int deep, int pvec) \ + ) { \ + (void)pre; (void)deep; (void)pvec; (void)inspect_subtree; \ + Rprintf("data.table::growable" #classname "_v0(truelength=%g) ", (double)XLENGTH(R_altrep_data1(x))); \ + return FALSE; \ + } +make_inspect_method(string) +make_inspect_method(integer) +make_inspect_method(logical) +make_inspect_method(real) +make_inspect_method(complex) +make_inspect_method(raw) +make_inspect_method(list) +#undef make_inspect_method + +#define make_dataptr_method(class, accessor) \ + static void * alt##class##_Dataptr(SEXP x, Rboolean writable) { \ + (void)writable; \ + return (void*)accessor(R_altrep_data1(x)); \ + } +make_dataptr_method(string, STRING_PTR_RO) +make_dataptr_method(integer, INTEGER) +make_dataptr_method(logical, LOGICAL) +make_dataptr_method(real, REAL) +make_dataptr_method(complex, COMPLEX) +make_dataptr_method(raw, RAW) +make_dataptr_method(list, DATAPTR_RO) // VECTOR_PTR_RO to appear in R-4.5 +#undef make_dataptr_method + +static const void * altall_Dataptr_or_null(SEXP x) { return DATAPTR_RO(x); } + +// lots of boilerplate, but R calling *_ELT one by one would be far too slow +#define make_extract_subset_method(class, type, accessor, NA) \ + static SEXP alt##class##_Extract_subset(SEXP x, SEXP indx, SEXP call) { \ + (void)call; \ + indx = PROTECT(coerceVector(indx, REALSXP)); \ + double * ii = REAL(indx); \ + R_xlen_t rlen = XLENGTH(indx), mylen = XLENGTH(x); \ + SEXP ret = PROTECT(allocVector(TYPEOF(x), rlen)); \ + type *rdata = accessor(ret), *mydata = accessor(x); \ + for (R_xlen_t i = 0; i < rlen; ++i) \ + rdata[i] = (ii[i] >= 1 && ii[i] <= mylen) ? mydata[(R_xlen_t)ii[i]-1] : NA; \ + UNPROTECT(2); \ + return ret; \ + } +make_extract_subset_method(integer, int, INTEGER, NA_INTEGER) +make_extract_subset_method(logical, int, LOGICAL, NA_LOGICAL) +make_extract_subset_method(real, double, REAL, NA_REAL) +make_extract_subset_method(complex, Rcomplex, COMPLEX, NA_COMPLEX) +make_extract_subset_method(raw, Rbyte, RAW, 0) +// not implementing the string and list methods because those do require the write barrier and are thus no better than calling *_ELT one by one +#undef make_extract_subset_method + +#define make_elt_method(class, accessor) \ + static SEXP alt##class##_Elt(SEXP x, R_xlen_t i) { \ + return accessor(R_altrep_data1(x), i); \ + } +make_elt_method(string, STRING_ELT) +make_elt_method(list, VECTOR_ELT) +#undef make_elt_method + +#define make_set_elt_method(class, accessor) \ + static void alt##class##_Set_elt(SEXP x, R_xlen_t i, SEXP v) { \ + accessor(R_altrep_data1(x), i, v); \ + } +make_set_elt_method(string, SET_STRING_ELT) +make_set_elt_method(list, SET_VECTOR_ELT) +#undef make_set_elt_method + +// liked the Extract_subset methods? say hello to Get_region +#define make_get_region_method(class, type, accessor) \ + static R_xlen_t alt##class##_Get_region( \ + SEXP x, R_xlen_t i, R_xlen_t n, type * buf \ + ) { \ + R_xlen_t j = 0, mylen = XLENGTH(x); \ + type * data = accessor(x); \ + for (; j < n && i < mylen; ++i, ++j) buf[j] = data[i]; \ + return j; \ + } +make_get_region_method(integer, int, INTEGER) +make_get_region_method(logical, int, LOGICAL) +make_get_region_method(real, double, REAL) +make_get_region_method(complex, Rcomplex, COMPLEX) +make_get_region_method(raw, Rbyte, RAW) +#undef make_get_region_method + +void register_altrep_classes(DllInfo * info) { + // Used by the altcomplex_Extract_subset method + NA_COMPLEX = (Rcomplex){ .r = NA_REAL, .i = NA_REAL }; + + dta_grow_string = R_make_altstring_class("growable_string_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_string, altall_Length); + R_set_altrep_Inspect_method(dta_grow_string, altstring_Inspect); + R_set_altvec_Dataptr_method(dta_grow_string, altstring_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_string, altall_Dataptr_or_null); + R_set_altstring_Elt_method(dta_grow_string, altstring_Elt); + R_set_altstring_Set_elt_method(dta_grow_string, altstring_Set_elt); + dta_grow_integer = R_make_altinteger_class("growable_integer_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_integer, altall_Length); + R_set_altrep_Inspect_method(dta_grow_integer, altinteger_Inspect); + R_set_altvec_Dataptr_method(dta_grow_integer, altinteger_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_integer, altall_Dataptr_or_null); + R_set_altvec_Extract_subset_method(dta_grow_integer, altinteger_Extract_subset); + R_set_altinteger_Get_region_method(dta_grow_integer, altinteger_Get_region); + dta_grow_logical = R_make_altlogical_class("growable_logical_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_logical, altall_Length); + R_set_altrep_Inspect_method(dta_grow_logical, altlogical_Inspect); + R_set_altvec_Dataptr_method(dta_grow_logical, altlogical_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_logical, altall_Dataptr_or_null); + R_set_altvec_Extract_subset_method(dta_grow_logical, altlogical_Extract_subset); + R_set_altlogical_Get_region_method(dta_grow_logical, altlogical_Get_region); + dta_grow_real = R_make_altreal_class("growable_real_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_real, altall_Length); + R_set_altrep_Inspect_method(dta_grow_real, altreal_Inspect); + R_set_altvec_Dataptr_method(dta_grow_real, altreal_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_real, altall_Dataptr_or_null); + R_set_altvec_Extract_subset_method(dta_grow_real, altreal_Extract_subset); + R_set_altreal_Get_region_method(dta_grow_real, altreal_Get_region); + dta_grow_complex = R_make_altcomplex_class("growable_complex_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_complex, altall_Length); + R_set_altrep_Inspect_method(dta_grow_complex, altcomplex_Inspect); + R_set_altvec_Dataptr_method(dta_grow_complex, altcomplex_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_complex, altall_Dataptr_or_null); + R_set_altvec_Extract_subset_method(dta_grow_complex, altcomplex_Extract_subset); + R_set_altcomplex_Get_region_method(dta_grow_complex, altcomplex_Get_region); + dta_grow_raw = R_make_altraw_class("growable_raw_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_raw, altall_Length); + R_set_altrep_Inspect_method(dta_grow_raw, altraw_Inspect); + R_set_altvec_Dataptr_method(dta_grow_raw, altraw_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_raw, altall_Dataptr_or_null); + R_set_altvec_Extract_subset_method(dta_grow_raw, altraw_Extract_subset); + R_set_altraw_Get_region_method(dta_grow_raw, altraw_Get_region); + dta_grow_list = R_make_altlist_class("growable_list_v0", "data.table", info); + R_set_altrep_Length_method(dta_grow_list, altall_Length); + R_set_altrep_Inspect_method(dta_grow_list, altlist_Inspect); + R_set_altvec_Dataptr_method(dta_grow_list, altlist_Dataptr); + R_set_altvec_Dataptr_or_null_method(dta_grow_list, altall_Dataptr_or_null); + R_set_altlist_Elt_method(dta_grow_list, altlist_Elt); + R_set_altlist_Set_elt_method(dta_grow_list, altlist_Set_elt); +} + +static R_altrep_class_t dta_grow_string, dta_grow_integer, dta_grow_logical, dta_grow_real, dta_grow_complex, dta_grow_raw, dta_grow_list; + +static R_altrep_class_t type2class(SEXPTYPE type) { + switch(type) { + case STRSXP: + return dta_grow_string; + case INTSXP: + return dta_grow_integer; + case LGLSXP: + return dta_grow_logical; + case REALSXP: + return dta_grow_real; + case CPLXSXP: + return dta_grow_complex; + case RAWSXP: + return dta_grow_raw; + case VECSXP: + case EXPRSXP: + return dta_grow_list; + default: + internal_error(__func__, "Can't create a growable vector of type '%s'", type2char(type)); + } +} + +SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size) { + SEXP ret = PROTECT(R_new_altrep(type2class(type), R_NilValue, R_NilValue)); + R_set_altrep_data1(ret, allocVector(type, max_size)); + R_set_altrep_data2(ret, ScalarReal(size)); + UNPROTECT(1); + return ret; +} + +R_xlen_t growable_max_size(SEXP x) { + return XLENGTH(R_altrep_data1(x)); +} + +void growable_resize(SEXP x, R_xlen_t newsize) { + R_xlen_t max_size; + if (newsize > (max_size = growable_max_size(x))) internal_error( + __func__, "newsize=%g > max_size=%g", + (double)newsize, (double)max_size + ); + REAL(R_altrep_data2(x))[0] = newsize; +} + +Rboolean is_growable(SEXP x) { + switch(TYPEOF(x)) { + case STRSXP: + case INTSXP: + case LGLSXP: + case REALSXP: + case CPLXSXP: + case RAWSXP: + case VECSXP: + return R_altrep_inherits(x, type2class(TYPEOF(x))); + default: + return FALSE; + } +} + +SEXP make_growable(SEXP x) { + SEXP ret = PROTECT(R_new_altrep(type2class(TYPEOF(x)), R_NilValue, R_NilValue)); + R_set_altrep_data1(ret, x); + R_set_altrep_data2(ret, ScalarReal(XLENGTH(x))); + SHALLOW_DUPLICATE_ATTRIB(ret, x); + UNPROTECT(1); + return ret; +} + +#endif diff --git a/src/init.c b/src/init.c index e87b6ee03..f5c2647c8 100644 --- a/src/init.c +++ b/src/init.c @@ -203,8 +203,12 @@ void attribute_visible R_init_data_table(DllInfo *info) SEXP tmp = PROTECT(allocVector(INTSXP,2)); if (LENGTH(tmp)!=2) error(_("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s"), LENGTH(tmp), msg); +#if R_VERSION >= R_Version(4, 3, 0) + register_altrep_classes(info); +#else // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768 if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%lld] is 0 %s"), (long long)TRUELENGTH(tmp), msg); +#endif UNPROTECT(1); // According to IEEE (http://en.wikipedia.org/wiki/IEEE_754-1985#Zero) we can rely on 0.0 being all 0 bits. diff --git a/src/reorder.c b/src/reorder.c index a91b4a2bc..1f00da681 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -24,13 +24,17 @@ SEXP reorder(SEXP x, SEXP order) error(_("Column %d is length %d which differs from length of column 1 (%d). Invalid data.table."), i+1, length(v), nrow); if (SIZEOF(v) > maxSize) maxSize=SIZEOF(v); +#ifndef USE_GROWABLE_ALTREP if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v)); +#endif } copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16 && SIZEOF(x)!=1) error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%zu)"), type2char(TYPEOF(x)), SIZEOF(x)); +#ifndef USE_GROWABLE_ALTREP if (ALTREP(x)) internal_error(__func__, "cannot reorder an ALTREP vector. Please see NEWS item 2 in v1.11.4"); // # nocov +#endif maxSize = SIZEOF(x); nrow = length(x); ncol = 1; @@ -40,7 +44,9 @@ SEXP reorder(SEXP x, SEXP order) if (length(order) != nrow) error("nrow(x)[%d]!=length(order)[%d]", nrow, length(order)); // # notranslate int nprotect = 0; +#ifndef USE_GROWABLE_ALTREP if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand +#endif const int *restrict idx = INTEGER(order); int i=0; diff --git a/src/utils.c b/src/utils.c index 0c9d23fe6..58917e7fc 100644 --- a/src/utils.c +++ b/src/utils.c @@ -202,6 +202,9 @@ inline bool INHERITS(SEXP x, SEXP char_) { return false; } +#ifdef USE_GROWABLE_ALTREP +SEXP copyAsPlain(SEXP x) { return duplicate(x); } +#else SEXP copyAsPlain(SEXP x) { // v1.12.2 and before used standard R duplicate() to do this. But duplicate() is not guaranteed to not return an ALTREP. // e.g. ALTREP 'wrapper' on factor column (with materialized INTSXP) in package VIM under example(hotdeck) @@ -256,6 +259,7 @@ SEXP copyAsPlain(SEXP x) { UNPROTECT(1); return ans; } +#endif void copySharedColumns(SEXP x) { const int ncol = length(x); @@ -266,7 +270,12 @@ void copySharedColumns(SEXP x) { int nShared=0; for (int i=0; i