Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Growable ALTREP vectors #6697

Draft
wants to merge 7 commits into
base: truehash
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use growable_* instead of SETLENGTH in dogroups()
Since dogroups() relies on being able to shrink vectors it hasn't
allocated, introduce make_growable() and is_growable() to adapt. Since
dogroups() relies on SD and SDall having shared columns, use the
setgrowable() wrapper on the R side at the time when SD and SDall are
being created.

(In the ALTREP case, setgrowable() will re-create the columns.)
  • Loading branch information
aitap committed Dec 29, 2024
commit 06e7db71c45bd306acde52588b0f8f448a1574ca
2 changes: 2 additions & 0 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1353,6 +1353,7 @@ replace_dot_alias = function(e) {
}
if (!with || missing(j)) return(ans)
if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013
setgrowable(ans)
SDenv$.SDall = ans
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
SDenv$.N = nrow(ans)
Expand Down Expand Up @@ -1591,6 +1592,7 @@ replace_dot_alias = function(e) {
SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset
if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012
if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
setgrowable(SDenv$.SDall)
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
}
if (nrow(SDenv$.SDall)==0L) {
Expand Down
2 changes: 2 additions & 0 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ fitsInInt32 = function(x) .Call(CfitsInInt32R, x)
fitsInInt64 = function(x) .Call(CfitsInInt64R, x)

coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy)

setgrowable = function(x) .Call(Csetgrowable, x)
5 changes: 5 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,10 @@ SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size);
R_xlen_t growable_max_size(SEXP x);
// Resize a growable vector to newsize. Will signal an error if newsize exceeds max_size.
void growable_resize(SEXP x, R_xlen_t newsize);
// Return TRUE if growable_resize(x) and growable_max_size(x) are valid operations.
Rboolean is_growable(SEXP x);
// Transform x into a growable vector. The return value must be reprotected in place of x. What happens to x is deliberately not specified, but no copying occurs.
SEXP make_growable(SEXP x);

// functions called from R level .Call/.External and registered in init.c
// these now live here to pass -Wstrict-prototypes, #5477
Expand Down Expand Up @@ -379,4 +383,5 @@ SEXP dt_has_zlib(void);
SEXP startsWithAny(SEXP, SEXP, SEXP);
SEXP convertDate(SEXP, SEXP);
SEXP fastmean(SEXP);
SEXP setgrowable(SEXP x);

20 changes: 9 additions & 11 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
for (R_len_t i=0; i<n; ++i) {
if (ilens[i] > maxGrpSize) maxGrpSize = ilens[i];
}
defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++;
defineVar(install(".I"), I = PROTECT(growable_allocate(INTSXP, maxGrpSize, maxGrpSize)), env); nprotect++;
hash_set(specials, I, -maxGrpSize); // marker for anySpecialStatic(); see its comments
R_LockBinding(install(".I"), env);

Expand Down Expand Up @@ -197,7 +197,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP
INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch
for (int j=0; j<length(SDall); ++j) {
SETLENGTH(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
growable_resize(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
defineVar(nameSyms[j], VECTOR_ELT(SDall, j), env);
// Redo this defineVar for each group in case user's j assigned to the column names (env is static) (tests 387 and 388)
// nameSyms pre-stored to save repeated install() for efficiency, though.
Expand Down Expand Up @@ -230,14 +230,14 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// and we hope that reference counting on by default from R 4.0 will avoid costly gc()s.
}
grpn = 1; // it may not be 1 e.g. test 722. TODO: revisit.
SETLENGTH(I, grpn);
growable_resize(I, grpn);
INTEGER(I)[0] = 0;
for (int j=0; j<length(xSD); ++j) {
writeNA(VECTOR_ELT(xSD, j), 0, 1, false);
}
} else {
if (verbose) tstart = wallclock();
SETLENGTH(I, grpn);
growable_resize(I, grpn);
int *iI = INTEGER(I);
if (LENGTH(order)==0) {
const int rownum = grpn ? istarts[i]-1 : -1;
Expand Down Expand Up @@ -318,13 +318,13 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
RHS = VECTOR_ELT(jval,j%LENGTH(jval));
if (isNull(target)) {
// first time adding to new column
if (TRUELENGTH(dt) < colj+1) internal_error(__func__, "Trying to add new column by reference but tl is full; setalloccol should have run first at R level before getting to this point"); // # nocov
if (growable_max_size(dt) < colj+1) internal_error(__func__, "Trying to add new column by reference but table is full; setalloccol should have run first at R level before getting to this point"); // # nocov
target = PROTECT(allocNAVectorLike(RHS, n));
// Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
// contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
// Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
SETLENGTH(dtnames, LENGTH(dtnames)+1);
SETLENGTH(dt, LENGTH(dt)+1);
growable_resize(dtnames, LENGTH(dtnames)+1);
growable_resize(dt, LENGTH(dt)+1);
SET_VECTOR_ELT(dt, colj, target);
UNPROTECT(1);
SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
Expand Down Expand Up @@ -482,11 +482,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// Also reset truelength on specials; see comments in anySpecialStatic().
for (int j=0; j<length(SDall); ++j) {
SEXP this = VECTOR_ELT(SDall,j);
SETLENGTH(this, maxGrpSize);
SET_TRUELENGTH(this, maxGrpSize);
growable_resize(this, maxGrpSize);
}
SETLENGTH(I, maxGrpSize);
SET_TRUELENGTH(I, maxGrpSize);
growable_resize(I, maxGrpSize);
if (verbose) {
if (nblock[0] && nblock[1]) internal_error(__func__, "block 0 [%d] and block 1 [%d] have both run", nblock[0], nblock[1]); // # nocov
int w = nblock[1]>0;
Expand Down
17 changes: 17 additions & 0 deletions src/growable.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size) {
SEXP ret = PROTECT(allocVector(type, max_size));
SET_TRUELENGTH(ret, max_size);
#if R_VERSION >= R_Version(3, 4, 0)
SET_GROWABLE_BIT(ret);
#endif // otherwise perceived memory use will end up higher
SETLENGTH(ret, size);
UNPROTECT(1);
return ret;
Expand All @@ -21,3 +23,18 @@ void growable_resize(SEXP x, R_xlen_t newsize) {
);
SETLENGTH(x, newsize);
}

Rboolean is_growable(SEXP x) {
return isVector(x) && TRUELENGTH(x) >= XLENGTH(x)
#if R_VERSION >= R_Version(3, 4, 0)
&& IS_GROWABLE(x)
#endif
;
}

// Assuming no ALTREP for now
SEXP make_growable(SEXP x) {
if (TRUELENGTH(x) < XLENGTH(x)) SET_TRUELENGTH(x, XLENGTH(x));
SET_GROWABLE_BIT(x);
return x;
}
1 change: 1 addition & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = {
{"CconvertDate", (DL_FUNC)&convertDate, -1},
{"Cnotchin", (DL_FUNC)&notchin, -1},
{"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
{"Csetgrowable", (DL_FUNC)&setgrowable, -1},
{NULL, NULL, 0}
};

Expand Down
10 changes: 10 additions & 0 deletions src/wrappers.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,13 @@ SEXP warn_matrix_column_r(SEXP i) {
warn_matrix_column(INTEGER(i)[0]);
return R_NilValue;
}

SEXP setgrowable(SEXP x) {
for (R_xlen_t i = 0; i < xlength(x); ++i) {
SEXP this = VECTOR_ELT(x, i);
// relying on the rest of data.table machinery to avoid the need for resizing
if (!is_growable(this) && !ALTREP(this))
SET_VECTOR_ELT(x, i, make_growable(this));
}
return R_NilValue;
}