diff --git a/R/IDateTime.R b/R/IDateTime.R index 1e5f27e5a..4bec0f771 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -198,22 +198,7 @@ as.ITime.times = function(x, ms = 'truncate', ...) { } as.character.ITime = format.ITime = function(x, ...) { - # adapted from chron's format.times - # Fix for #811. Thanks to @StefanFritsch for the code snippet - neg = x < 0L - x = abs(unclass(x)) - hh = x %/% 3600L - mm = (x - hh * 3600L) %/% 60L - # #2171 -- trunc gives numeric but %02d requires integer; - # as.integer is also faster (but doesn't handle integer overflow) - # http://stackoverflow.com/questions/43894077 - ss = as.integer(x - hh * 3600L - 60L * mm) - res = sprintf('%02d:%02d:%02d', hh, mm, ss) - # Fix for #1354, so that "NA" input is handled correctly. - if (is.na(any(neg))) res[is.na(x)] = NA - neg = which(neg) - if (length(neg)) res[neg] = paste0("-", res[neg]) - res + .Call("CasCharacterITime", x) } as.data.frame.ITime = function(x, ...) { diff --git a/R/data.table.R b/R/data.table.R index af4f14de0..4294ba81d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1870,10 +1870,9 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { stop("length(rownames)==0 but should be a single column name or number, or NULL") } else { if (isTRUE(rownames)) { - if (length(key(x))>1L) { + if (length(key(x))>1L) warning("rownames is TRUE but key has multiple columns ", brackify(key(x)), "; taking first column x[,1] as rownames") - } rownames = if (length(key(x))==1L) chmatch(key(x),names(x)) else 1L } else if (is.logical(rownames) || is.na(rownames)) { @@ -1883,7 +1882,6 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { else if (is.character(rownames)) { w = chmatch(rownames, names(x)) if (is.na(w)) stop("'", rownames, "' is not a column of x") - rownames = w } else { # rownames is a column number already rownames = as.integer(rownames) @@ -1892,73 +1890,23 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { " which is outside the column number range [1,ncol=", ncol(x), "].") } } - } else if (!is.null(rownames.value)) { - if (length(rownames.value)!=nrow(x)) - stop("length(rownames.value)==", length(rownames.value), - " but should be nrow(x)==", nrow(x)) } - if (!is.null(rownames)) { - # extract that column and drop it. - rownames.value = x[[rownames]] - dm = dim(x) - 0:1 - cn = names(x)[-rownames] - X = x[, .SD, .SDcols = cn] - } else { - dm = dim(x) - cn = names(x) - X = x - } - if (any(dm == 0L)) - return(array(NA, dim = dm, dimnames = list(rownames.value, cn))) - p = dm[2L] - n = dm[1L] - collabs = as.list(cn) + + # Create shallow copy - where each element of the list X is simply a pointer to + # the same column in the data.table x. This means that we do not modify the + # input data.table when dropping rows or coercing columns. See tests #2139.XXX + X = x class(X) = NULL - non.numeric = non.atomic = FALSE - all.logical = TRUE - for (j in seq_len(p)) { - if (is.ff(X[[j]])) X[[j]] = X[[j]][] # nocov to bring the ff into memory, since we need to create a matrix in memory - xj = X[[j]] - if (length(dj <- dim(xj)) == 2L && dj[2L] > 1L) { - if (inherits(xj, "data.table")) - xj = X[[j]] = as.matrix(X[[j]]) - dnj = dimnames(xj)[[2L]] - collabs[[j]] = paste(collabs[[j]], if (length(dnj) > - 0L) - dnj - else seq_len(dj[2L]), sep = ".") - } - if (!is.logical(xj)) - all.logical = FALSE - if (length(levels(xj)) > 0L || !(is.numeric(xj) || is.complex(xj) || is.logical(xj)) || - (!is.null(cl <- attr(xj, "class", exact=TRUE)) && any(cl %chin% - c("Date", "POSIXct", "POSIXlt")))) - non.numeric = TRUE - if (!is.atomic(xj)) - non.atomic = TRUE - } - if (non.atomic) { - for (j in seq_len(p)) { - xj = X[[j]] - if (is.recursive(xj)) { } - else X[[j]] = as.list(as.vector(xj)) - } - } - else if (all.logical) { } - else if (non.numeric) { - for (j in seq_len(p)) { - if (is.character(X[[j]])) next - xj = X[[j]] - miss = is.na(xj) - xj = if (length(levels(xj))) as.vector(xj) else format(xj) - is.na(xj) = miss - X[[j]] = xj - } - } - X = unlist(X, recursive = FALSE, use.names = FALSE) - dim(X) <- c(n, length(X)/n) - dimnames(X) <- list(rownames.value, unlist(collabs, use.names = FALSE)) - X + + # Extract and drop the rownames column, if used + if (!is.null(rownames)) { + rownames.value = X[[rownames]] + X[[rownames]] = NULL + } + + # Remaining type and class coercion is handled in Casmatrix + ans = .Call(Casmatrix, X, rownames.value) + ans } # bug #2375. fixed. same as head.data.frame and tail.data.frame to deal with negative indices diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f6d17a076..6228a66c7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12544,7 +12544,7 @@ mat4 <- matrix(c("a", 1, 5), nrow=1, dimnames=list(c("x"), c("id", "X", "Y"))) test(1899.14, as.matrix(DT[1,], 1), mat[1,,drop=FALSE]) test(1899.15, as.matrix(DT[1,], "id"), mat[1,,drop=FALSE]) test(1899.16, as.matrix(DT[1,], rownames.value="x"), mat4) -test(1899.17, as.matrix(DT[1,], rownames.value=c("x", "y")), error="length(rownames.value)==2 but should be nrow(x)==1") +test(1899.17, as.matrix(DT[1,], rownames.value=c("x", "y")), error="Extracted rownames column or provided rownames.values do not match the number of rows in the matrix") test(1899.18, as.matrix(DT, rownames=TRUE, rownames.value=1:nrow(DT)), error="rownames and rownames.value cannot both be used at the same time") # index argument for fread, #2633 @@ -13687,7 +13687,7 @@ test(1967.526, x[keyby=a], x, warning=c("Ignoring keyby= because j= i test(1967.53, as.matrix(x, rownames = 2:3), error = 'length(rownames)==2 but') test(1967.54, as.matrix(x[0L]), - structure(logical(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b")))) + structure(integer(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b")))) test(1967.55, subset(x, 5L), error = "'subset' must evaluate to logical") @@ -15789,7 +15789,7 @@ DT = data.table(a=1, b=2) test(2074.06, DT[ , c(.SD[1], .SD[1, .SD[1]]), by=a], data.table(a=1, b=2, b=2)) ## as.matrix.data.table when a column has columns (only possible when constructed incorrectly) DT = structure(list(a=1:5, d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame')) -test(2074.07, as.matrix(DT), matrix(1:25, ncol=5L, dimnames=list(NULL, c('a', 'd.b', 'd.c', 'm.1', 'm.2')))) +test(2074.07, as.matrix(DT), matrix(1:25, ncol=5L, dimnames=list(NULL, c('a', 'd.b', 'd.c', 'm.V1', 'm.V2')))) ## can induce !cedta() from base::rownames to get this error test(2074.08, rownames(structure(list(1:5), class='data.table')), error="Has it been created manually") ## default dimnames.data.table @@ -16770,6 +16770,145 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see https://github.com/Rdatatable/data.table/issues/4131.") rm(s1, s2, class2132) +## Check for appropriate errors when giving rownames to as.matrix.data.table that are mutli-column (only possible when constructed incorrectly) or list +DT = structure(list(a=list(1,2:3,4:6,letters[1:5],list(1, "a")), d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame')) +test(2133.1, as.matrix(DT, rownames="d"), error="Extracted rownames column or provided rownames.values are multi-column type") +test(2133.2, as.matrix(DT, rownames=2), error='Extracted rownames column or provided rownames.values are multi-column type') +mat = matrix(6:25, nrow=5, dimnames=list(DT[["a"]], c("d.b", "d.c", "m.V1", "m.V2"))) +test(2133.3, as.matrix(DT, rownames="a"), mat, warning="Extracted rownames column or provided rownames.values are a list column") +test(2133.4, as.matrix(DT, rownames=1), mat, warning='Extracted rownames column or provided rownames.values are a list column') + +## Check C-implementations of basic matrix types not covered by previous tests +lmat = matrix(c(TRUE, FALSE, NA, FALSE), ncol=2, dimnames=list(NULL, c("A", "B"))) +ldt = as.data.table(lmat) +test(2134.1, as.matrix(ldt), lmat) +cmat = matrix(complex(real=1:4, imaginary=0:3), ncol=2, dimnames=list(NULL, c("A", "B"))) +cdt = as.data.table(cmat) +test(2134.2, as.matrix(cdt), cmat) + +## Test bit64 conversion in as.matrix +if (test_bit64) { + # integer64 with logical, raw, or integer should be coerced to integer64 + i64_mat <- matrix(as.integer64(c(1,0,NA,1:6)), nrow=3, ncol=3, dimnames=list(NULL, LETTERS[1:3])) + DT = data.table(A=c(TRUE, FALSE, NA), B=1:3, C=as.integer64(4:6)) + test(2135.1, as.matrix(DT), i64_mat) # should be integer64 aware, but class not added + + # integer64 with raw, numeric and/or complex should be coerced to character - + # large integers (>32 bits) cannot be converted to numeric or complex + mat = matrix(c("0", "1", "0.1", "0.2", "1+0i", "2+1i", "02", "03"), ncol=4, dimnames=list(NULL, LETTERS[1:4])) + DT = data.table(A=as.integer64(0:1), B=c(0.1, 0.2), C=complex(real=1:2, imaginary=0:1), D=as.raw(2:3)) + test(2135.2, as.matrix(DT), mat) + + # But not if any non-atomic columns + DT = data.table(A=as.integer64(0:1), B=c('asd', 'qwe'), C=list(2:3, "A")) + mat = matrix(list(as.integer64(0), as.integer64(1), 'asd', 'qwe', 2:3, "A"), + ncol=3, dimnames=list(NULL, c("A", "B", "C"))) + test(2135.3, as.matrix(DT), mat) +} + +## Test other class and type conversion in as.matrix +mat = matrix(letters[1:4], ncol=2, dimnames=list(NULL, c("A", "B"))) +DT = data.table(A=as.factor(c("a", "b")), B=as.factor(c("c", "d"))) +test(2136.1, as.matrix(DT), mat) + +mat = matrix(as.raw(1:4), ncol=2, dimnames=list(NULL, c("A", "B"))) +DT = data.table(A=as.raw(1:2), B=as.raw(3:4)) +test(2136.2, as.matrix(DT), mat) + +mat = matrix(paste0("2019-01-0", 1:4), ncol=2, dimnames=list(NULL, c("A", "B"))) +DT = data.table(A=as.IDate(paste0("2019-01-0", 1:2)), B=as.IDate(paste0("2019-01-0", 3:4))) +test(2136.3, as.matrix(DT), mat) + +mat = matrix(c(1, 2, 0.1, 0.2, TRUE, FALSE), ncol=3, dimnames=list(NULL, c("A", "B", "C"))) +DT = data.table(A=1:2, B=c(0.1, 0.2), C=c(TRUE, FALSE)) +test(2136.4, as.matrix(DT), mat) + +mat = matrix(c(1, 2, 0.1, 0.2, TRUE, FALSE, complex(1:2, 1:2)), ncol=4, dimnames=list(NULL, LETTERS[1:4])) +DT = data.table(A=1:2, B=c(0.1, 0.2), C=c(TRUE, FALSE), D=complex(1:2, 1:2)) +test(2136.5, as.matrix(DT), mat) + +mat = matrix(c("TRUE", "FALSE", "00", "01"), ncol=2, dimnames=list(NULL, c("A", "B"))) +DT = data.table(A=c(TRUE,FALSE), B=as.raw(0:1)) +test(2136.6, as.matrix(DT), mat) + +mat = matrix(logical(0), nrow=26, ncol=0, dimnames=list(LETTERS, NULL)) +DT = data.table(A=LETTERS) +test(2136.7, as.matrix(DT, rownames="A"), mat) # nrow and rownames should be preserved + +## Tests for complex column types not captured by standard tests. These are things that would usually only +## be possible through incorrect construction of data.tables, e.g. NULL columns, environments, expressions, etc. +DT = structure(list(A=NULL, B=1:2, C=NULL, D=3:4, E=NULL), class=c("data.table", "data.frame")) +mat = matrix(1:4, ncol=2, dimnames=list(NULL, c("B", "D"))) +test(2137.1, as.matrix(DT), mat) + +DT = structure(list(B=1:2, C=NULL, D=3:4, E=NULL), class=c("data.table", "data.frame")) +test(2137.2, as.matrix(DT), mat) + +DT = structure(list(C=NULL, E=NULL), class=c("data.table", "data.frame")) +mat = array(NA, dim = list(0, 0)) +test(2137.3, as.matrix(DT), mat) + +DT = structure(list(A=NULL, B=logical(0), C=NULL, D=character(0)), class=c("data.table", "data.frame")) +mat = matrix(character(0), nrow=0, ncol=2, dimnames = list(NULL, c("B", "D"))) +test(2137.4, as.matrix(DT), mat) + +DT = structure(list(A=1:6, B=1:2), class=c("data.table", "data.frame")) # DT[,B] is c(1,2) +DT2 = data.table(A=1:6, B=1:2) # DT2[,B] is c(1,2,1,2,1,2) +test(2137.5, as.matrix(DT), as.matrix(DT2)) + +DT = structure(list(A=function(){}, B=list(1), C=expression(1+1)), class=c("data.table", "data.frame")) +mat = matrix(c(A=list(function(){}), B=list(1), C=list(expression(1+1))), ncol=3, dimnames=list(NULL, c("A","B","C"))) +test(2137.6, as.matrix(DT), mat) + +DT = data.table(A=function(){}, B=list(1)) +mat = matrix(c(A=function(){}, B=list(1)), ncol=2, dimnames=list(NULL, c("A","B"))) +test(2137.7, as.matrix(DT), mat) + +DT = data.table(A=expression(as.character(system.time())), B=quote(1+1)) +mat = matrix(list(expression(as.character(system.time())), expression(as.character(system.time())), + expression(as.character(system.time())), quote(1+1), quote(1+1), quote(1+1)), + nrow=3, ncol=2, dimnames=list(NULL, c("A","B"))) +test(2137.8, as.matrix(DT), mat) # non-atomic wrapped in list #4196 + +# Test rownames to character conversions +DT = data.table(A=as.IDate(1:3), B=factor(c("A", "B", "A"), levels=c("B", "A"))) +mat1 = matrix(c("A", "B", "A"), nrow=3, ncol=1, dimnames=list(as.character(as.IDate(1:3)), "B")) +mat2 = matrix(as.character(as.IDate(1:3)), nrow=3, ncol=1, dimnames=list(c("A", "B", "A"), "A")) +test(2138.1, as.matrix(DT, rownames="A"), mat1) +test(2138.2, as.matrix(DT, rownames="B"), mat2) + +if (test_bit64) { + # Integer64 as row names + DT = data.table(A=as.integer64(0:1), B=c('asd', 'qwe'), C=list(2:3, "A")) + mat = matrix(list('asd', 'qwe', 2:3, "A"), ncol=2, dimnames=list(c("0", "1"), c("B", "C"))) + test(2138.3, as.matrix(DT, rownames="A"), mat) +} + +# Check input data.table is not modified by reference in as.matrix +dt = data.table(rn=letters[1:3], A=1:3, B=4:6) +dt2 = copy(dt) +mat = as.matrix(dt, rownames="rn") +test(2139.1, dt, dt2) # rn column taken as rownames should still be in dt + +dt = data.table(rn=letters[1:3], A=1:3, B=4:6) +dt2 = copy(dt) +mat = as.matrix(dt) +test(2139.2, dt, dt2) # column type coercion should not modify input dt + +dt = structure(list(A=NULL, B=1:2, C=NULL, D=3:4, E=NULL), class=c("data.table", "data.frame")) +dt2 = structure(list(A=NULL, B=1:2, C=NULL, D=3:4, E=NULL), class=c("data.table", "data.frame")) +mat = as.matrix(dt) +test(2139.3, dt, dt2) # null columns should not be dropped after as.matrix + +dt = structure(list(A=1:6, B=1:2), class=c("data.table", "data.frame")) # DT[,B] is c(1,2) +dt2 = structure(list(A=1:6, B=1:2), class=c("data.table", "data.frame")) # DT[,B] is c(1,2) +mat = as.matrix(dt) +test(2139.4, dt, dt2) # recycling of dt$B to length dt$A in as.matrix should not modify input dt$B. + +dt = structure(list(a=list(1,2:3,4:6,letters[1:5],list(1, "a")), d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame')) +dt2 = structure(list(a=list(1,2:3,4:6,letters[1:5],list(1, "a")), d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame')) +mat = as.matrix(dt) +test(2139.5, dt, dt2) # flattening data.table and matrix columns should not modify input dt ######################## # Add new tests here # diff --git a/src/assign.c b/src/assign.c index 0d5c67369..96cc09aee 100644 --- a/src/assign.c +++ b/src/assign.c @@ -668,7 +668,7 @@ static bool anyNamed(SEXP x) { #define MSGSIZE 1000 static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects -const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname) +const char *memrecycle(const SEXP target, const SEXP where, const int64_t start, const int64_t len, SEXP source, const int64_t sourceStart, const int64_t sourceLen, const int64_t colnum, const char *colname) // like memcpy but recycles single-item source // 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer() // assigns to target[start:start+len-1] or target[where[start:start+len-1]] where start is 0-based @@ -677,15 +677,15 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // sourceLen==1 is used in dogroups to recycle the group values into ans to match the nrow of each group's result; sourceStart is set to each group value row. { if (len<1) return NULL; - const int slen = sourceLen>=0 ? sourceLen : length(source); + const int64_t slen = sourceLen>=0 ? sourceLen : xlength(source); if (slen==0) return NULL; - if (sourceStart<0 || sourceStart+slen>length(source)) - error(_("Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d"), sourceStart, sourceLen, length(source)); // # nocov - if (!length(where) && start+len>length(target)) - error(_("Internal error memrecycle: start=%d len=%d length(target)=%d"), start, len, length(target)); // # nocov - const int soff = sourceStart; + if (sourceStart<0 || sourceStart+slen>xlength(source)) + error(_("Internal error memrecycle: sourceStart=%"PRId64" sourceLen=%"PRId64" length(source)=%"PRId64""), sourceStart, sourceLen, xlength(source)); // # nocov + if (!xlength(where) && start+len>xlength(target)) + error(_("Internal error memrecycle: start=%"PRId64" len=%"PRId64" length(target)=%"PRId64""), start, len, xlength(target)); // # nocov + const int64_t soff = sourceStart; if (slen>1 && slen!=len && (!isNewList(target) || isNewList(source))) - error(_("Internal error: recycle length error not caught earlier. slen=%d len=%d"), slen, len); // # nocov + error(_("Internal error: recycle length error not caught earlier. slen=%"PRId64" len=%"PRId64""), slen, len); // # nocov // Internal error because the column has already been added to the DT, so length mismatch should have been caught before adding the column. // for 5647 this used to limit slen to len, but no longer if (colname==NULL) @@ -726,7 +726,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con const int nlevel = length(getAttrib(target, R_LevelsSymbol)); if (isInteger(source)) { const int *sd = INTEGER(source); - for (int i=0; inlevel) { error(_("Assigning factor numbers to column %d named '%s'. But %d is outside the level range [1,%d]"), colnum, colname, val, nlevel); @@ -734,7 +734,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } } else { const double *sd = REAL(source); - for (int i=0; inlevel)) { error(_("Assigning factor numbers to column %d named '%s'. But %f is outside the level range [1,%d], or is not a whole number."), colnum, colname, val, nlevel); @@ -779,17 +779,17 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con SET_TRUELENGTH(s, -nTargetLevels-(++nAdd)); } // else, when sourceIsString, it's normal for there to be duplicates here } - const int nSource = length(source); + const int64_t nSource = xlength(source); int *newSourceD = INTEGER(newSource); if (sourceIsFactor) { const int *sourceD = INTEGER(source); - for (int i=0; i0 && n>0 && n0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls) - const int *wd = length(where) ? INTEGER(where)+start : NULL; + const int64_t off = xlength(where) ? 0 : start; // off = target offset; e.g. called from rbindlist with where=R_NilValue and start!=0 + const bool mc = xlength(where)==0 && slen>0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls) + const int *wd = xlength(where) ? INTEGER(where)+start : NULL; switch (TYPEOF(target)) { case RAWSXP: { Rbyte *td = RAW(target) + off; @@ -1215,4 +1215,3 @@ SEXP setcharvec(SEXP x, SEXP which, SEXP newx) } return R_NilValue; } - diff --git a/src/data.table.h b/src/data.table.h index f2687f52e..5c59e959a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -73,6 +73,8 @@ extern SEXP char_ITime; extern SEXP char_IDate; extern SEXP char_Date; extern SEXP char_POSIXct; +extern SEXP char_POSIXlt; +extern SEXP char_POSIXt; extern SEXP char_nanotime; extern SEXP char_lens; extern SEXP char_indices; @@ -82,6 +84,7 @@ extern SEXP char_factor; extern SEXP char_ordered; extern SEXP char_datatable; extern SEXP char_dataframe; +extern SEXP char_ff; extern SEXP char_NULL; extern SEXP sym_sorted; extern SEXP sym_index; @@ -158,7 +161,7 @@ SEXP dt_na(SEXP x, SEXP cols); // assign.c SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); -const char *memrecycle(const SEXP target, const SEXP where, const int r, const int len, SEXP source, const int sourceStart, const int sourceLen, const int coln, const char *colname); +const char *memrecycle(const SEXP target, const SEXP where, const int64_t r, const int64_t len, SEXP source, const int64_t sourceStart, const int64_t sourceLen, const int64_t coln, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, @@ -233,6 +236,9 @@ bool islocked(SEXP x); SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); +SEXP asCharacterInteger64(SEXP x, int *nprotect); +SEXP asCharacterITime(SEXP x, int *nprotect); +SEXP callRfun1(const char *name, const char *package, SEXP x, int *nprotect); // types.c char *end(char *start); diff --git a/src/init.c b/src/init.c index aed2da3db..60842ca14 100644 --- a/src/init.c +++ b/src/init.c @@ -10,6 +10,8 @@ SEXP char_ITime; SEXP char_IDate; SEXP char_Date; SEXP char_POSIXct; +SEXP char_POSIXlt; +SEXP char_POSIXt; SEXP char_nanotime; SEXP char_lens; SEXP char_indices; @@ -19,6 +21,7 @@ SEXP char_factor; SEXP char_ordered; SEXP char_datatable; SEXP char_dataframe; +SEXP char_ff; SEXP char_NULL; SEXP sym_sorted; SEXP sym_index; @@ -119,6 +122,9 @@ SEXP lock(); SEXP unlock(); SEXP islockedR(); SEXP allNAR(); +SEXP asmatrix(); +SEXP asCharacterInteger64(); +SEXP asCharacterITime_R(); // .Externals SEXP fastmean(); @@ -211,6 +217,9 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, +{"Casmatrix", (DL_FUNC) &asmatrix, -1}, +{"CasCharacterInteger64", (DL_FUNC) &asCharacterInteger64, -1}, +{"CasCharacterITime", (DL_FUNC) &asCharacterITime_R, -1}, {NULL, NULL, 0} }; @@ -311,6 +320,8 @@ void attribute_visible R_init_datatable(DllInfo *info) char_ITime = PRINTNAME(install("ITime")); char_Date = PRINTNAME(install("Date")); // used for IDate too since IDate inherits from Date char_POSIXct = PRINTNAME(install("POSIXct")); + char_POSIXlt = PRINTNAME(install("POSIXlt")); + char_POSIXt = PRINTNAME(install("POSIXt")); char_nanotime = PRINTNAME(install("nanotime")); char_starts = PRINTNAME(sym_starts = install("starts")); char_lens = PRINTNAME(install("lens")); @@ -321,6 +332,7 @@ void attribute_visible R_init_datatable(DllInfo *info) char_ordered = PRINTNAME(install("ordered")); char_datatable = PRINTNAME(install("data.table")); char_dataframe = PRINTNAME(install("data.frame")); + char_ff = PRINTNAME(install("ff")); char_NULL = PRINTNAME(install("NULL")); if (TYPEOF(char_integer64) != CHARSXP) { diff --git a/src/matrix.c b/src/matrix.c new file mode 100644 index 000000000..2c8117e77 --- /dev/null +++ b/src/matrix.c @@ -0,0 +1,523 @@ +#include "data.table.h" + +/* Fills the allocated matrix (ans) with columns of the input data.table + * (dt) using memcpy assuming all columns have the same SEXPTYPE. This + * macro means we don't have to repeat the loop below for each SEXPTYPE. + * RFUN and CTYPE should be a pair (e.g. INTEGER and int for INTSXP, or + * REAL and double for REALSXP). + */ +#undef OMP_MEMCPY +#define OMP_MEMCPY(RFUN, CTYPE) {{ \ + if (nrow == 1) { /* faster to just assign directly if n=1 */ \ + for (int j=0; j 2) + error("Cannot unpack array column with %d dimensions", dims); + + // How many rows does this column have? + thisnrow = INTEGER(colDim)[0]; + + // if first column initialize maxType and nrow. + if (*ncol == 0) { + *maxType = TYPEOF(thisCol); + *nrow = thisnrow; + } + // If not the first column, compare number of rows to nrow to see if we need to recycle + else if (thisnrow != *nrow) { + *nrow = *nrow > thisnrow ? *nrow : thisnrow; + *recycle = true; // can't determine whether recyclable until we know max nrow + } + + // increment the column counter by the number of columns in the matrix + (*ncol) += INTEGER(colDim)[1]; + + continue; // no more checks for matrixcolumns + } + // From here on, we're always dealing with a column that is a vector/list (doesn't have dimensions) + + // Load ff column into memory + if (INHERITS(thisCol, char_ff)) { + thisCol = callRfun1("[.ff", "ff", thisCol, nprotect); // nprotect incremented in callRfun1 + SET_VECTOR_ELT(*dt, j, thisCol); UNPROTECT(1); (*nprotect)--; // 'thisCol' now PROTECTED by dt + } + + // Coerce factor and date/time to character (always happens regardless of target matrix type) + if (toCharacterFactorPOSIX(&thisCol, nprotect)) { + SET_VECTOR_ELT(*dt, j, thisCol); UNPROTECT(1); (*nprotect)--; // 'thisCol' now PROTECTED by dt + } + + // Now we can determine length and type of this column + thisnrow = xlength(thisCol); + thisType = TYPEOF(thisCol); + thisI64 = INHERITS(thisCol, char_integer64); + + // If this is the first column we use it to initialize maxType and nrow + if (*ncol == 0) { + // initialize nrow and increment column counter + *nrow = thisnrow; + (*ncol)++; + + // initialize maxType to type of column + if (INHERITS(thisCol, char_integer64)) { + *integer64 = true; + *maxType = REALSXP; + } else { + *maxType = thisType; + } + + continue; // no more checks required on first column + } + // For remaining columns we need to check for nrow mismatch and compare column type to maxType + + // Increment column counter + (*ncol)++; + + // Check for nrow mismatch + if (thisnrow != *nrow) { + *nrow = *nrow > thisnrow ? *nrow : thisnrow; + *recycle = true; // can't determine whether recyclable until we know max nrow + } + + // Compare column type to maxType to determine whether maxType needs to change and later column coercion is required + if (TYPEORDER(thisType)>TYPEORDER(VECSXP)) { + // Non-atomic non-list types are wrapped in a list, see #4196 + *maxType=VECSXP; + *coerce = true; + } else if (thisType == VECSXP) { + // This column is a list. In this case, all columns become wrapped in list + // If the maxType isn't already VECSXP, then a previous column is not a list, + // so we need to set the maxType and let asmatrix know we need to coerce these + // columns later. + if (*maxType != VECSXP) { + *maxType = VECSXP; + *coerce = true; + } + } else if (thisType == RAWSXP) { + // raw is handled specially: matrix can only be raw type if all columns are raw + // otherwise columns are coerced to character, or if maxType is VECSXP, then each + // element of the raw column becomes an element of a list. + if (*maxType != RAWSXP && *maxType != VECSXP) { + *maxType = STRSXP; + *coerce = true; + } + } else if (*maxType == RAWSXP && thisType != RAWSXP) { + // a previous column is raw, but this column is a non-raw vector. Must coerce to STRSXP. + *coerce = true; + *maxType = STRSXP; + + // if this column is integer64 we also need to flag that there is at least one integer64 column + if (thisI64) + *integer64 = true; + + } else if (thisI64) { + // This column has class "integer64" (type is REALSXP). Only integer and logical columns + // can be coerced to integer64. Integer64 columns cannot be coerced to numeric or complex, + // so if maxType is either of these, then the maxType becomes STRSXP (integer64 coerced to + // character vector). + if (!*integer64) { + // This column is integer64, but at least one column checked already is not + *coerce = true; + *integer64 = true; + if (*maxType == REALSXP || *maxType == CPLXSXP) + *maxType = STRSXP; // integer64 cannot be coerced to numeric or complex, must coerce to character + else if (*maxType == LGLSXP || *maxType == INTSXP) + *maxType = REALSXP; // logical and integer can be coerced to integer64 + // else *maxType == VECSXP, in which case no change to maxType and integer64 column elements wrapped in list + } + } else if (TYPEORDER(thisType)>TYPEORDER(*maxType)) { + // the type of this column is higher in the typeorder list than maxType, + // so we need to change maxType and flag that we will need to do column coercion + *coerce = true; + + // if any previous column is integer64, then maxType must be STRSXP if this column is numeric or complex + if (*integer64 && (thisType == REALSXP || thisType == CPLXSXP)) + *maxType = STRSXP; + // otherwise the type of this column becomes the maxType + else + *maxType=thisType; + + } else if (TYPEORDER(thisType) 0; ndiv /= 10) buffwidth++; + char buff[buffwidth]; + + // Make names + for (int64_t i = 0; i < n; ++i) { + snprintf(buff, buffwidth, "V%"PRId64"", i+1); + SET_STRING_ELT(names, i, mkChar(buff)); + } + return(names); +} + +// Add a prefix to a character vector, separated by a "." +// I.e. for concatenating names of a multi-dimensional column +SEXP prependNames(SEXP names, SEXP prefixArg, int *nprotect) { + int64_t len = xlength(names); + SEXP ans = PROTECT(allocVector(STRSXP, len)); (*nprotect)++; + const char *prefix = CHAR(prefixArg); + + /* To determine the buffer width of the new column names, we need to + * know the maximum number of characters in any input column name as + * well as the size of the prefix. */ + size_t prefixw = strlen(prefix); // nchar(prefix) + size_t namesmaxw = 0; + for (int64_t i = 0; i < len; ++i) { + const char *namei = CHAR(STRING_ELT(names, i)); + size_t namew = strlen(namei); // nchar(names[i]) + namesmaxw = namew > namesmaxw ? namew : namesmaxw; + } + size_t buffwidth = prefixw + namesmaxw + 2; // size of both buffers + extra for separator and terminating \0 + char buff[buffwidth]; + + // make new composite column name + for (int64_t i = 0; i < len; ++i) { + const char *namei = CHAR(STRING_ELT(names, i)); + snprintf(buff, buffwidth, "%s.%s", prefix, namei); + SET_STRING_ELT(ans, i, mkChar(buff)); + } + return(ans); +} + +/* Recurse through a data.table, extracting any multi dimensional columns + * into their single columns in the top level data.table + */ +void flatten(SEXP *dt, SEXP *dtcn, SEXP *newdt, SEXP *newcn, int64_t *jtarget, int *nprotect) { + for (int64_t j = 0; j < xlength(*dt); ++j) { + SEXP thisCol = VECTOR_ELT(*dt, j); + SEXP colDim = getAttrib(thisCol, R_DimSymbol); + SEXP colName = STRING_ELT(*dtcn, j); + if (isNull(thisCol)) { + // Empty column, do not add to newdt + continue; + } else if (INHERITS(thisCol, char_dataframe) || INHERITS(thisCol, char_datatable)) { + // This column is a data.table or data.frame we will recurse into + // Make composite column names before recursing + SEXP subdtcn = getAttrib(thisCol, R_NamesSymbol); // names(dt[,j]) + SEXP compositecn = prependNames(subdtcn, colName, nprotect); // nprotect incremented + + // recurse into data.table + int recurseflattenstack = 0; + flatten(&thisCol, &compositecn, newdt, newcn, jtarget, &recurseflattenstack); + UNPROTECT(recurseflattenstack); // everything PROTECTED behind 'dt' + UNPROTECT(1); (*nprotect)--; // no longer need to PROTECT compositecn, all elements now PROTECTED in dtcn + } else if (!isNull(colDim)) { + // matrix, we have to split up vector into new columns of length mat nrow + int matnrow = INTEGER(colDim)[0]; + int matncol = INTEGER(colDim)[1]; + int mattype = TYPEOF(thisCol); + SEXP matdm = getAttrib(thisCol, R_DimNamesSymbol); // dimnames(dt[,j]) + SEXP matcn; + + // If no column names we need to make our own of the form V1, V2, ..., VN + int matcnprotect = 0; // below may add 0 or 1 to protect stack + if (isNull(matdm)) { + matcn = makeNames(matncol, &matcnprotect); // matcnprotect incremented + } else { + matcn = STRING_ELT(matdm, 1); + if (isNull(matcn)) { + matcn = makeNames(matncol, &matcnprotect); // matcnprotect incremented + } + } + + // Make composite column names + SEXP compositecn = prependNames(matcn, colName, nprotect); // nprotect incremented + + // Iterate through each column of the matrix, copying its contents into the new column vector + for (int mj = 0; mj < matncol; ++mj) { + SEXP thisNewCol = PROTECT(allocVector(mattype, matnrow)); (*nprotect)++; + int64_t start = (int64_t) mj * matnrow; + memrecycle(thisNewCol, R_NilValue, 0, matnrow, thisCol, start, matnrow, 0, "V1"); + SET_VECTOR_ELT(*newdt, *jtarget, thisNewCol); UNPROTECT(1); (*nprotect)--; + SET_STRING_ELT(*newcn, *jtarget, STRING_ELT(compositecn, mj)); // add new column name + (*jtarget)++; // Increment column index in the new flattened data.table + } + UNPROTECT(1); (*nprotect)--; // pop compositecn from protect stack, now PROTECTED by newcn + UNPROTECT(matcnprotect); // pop any PROTECTED created matrix column names, now PROTECTED in newcn. + } else { + // Single column, add pointer to new flattend data.table in the right spot + SET_VECTOR_ELT(*newdt, *jtarget, thisCol); + SET_STRING_ELT(*newcn, *jtarget, colName); // Add column name + (*jtarget)++; // Increment column index in the new flattened data.table + } + } +} + +SEXP asmatrix(SEXP dt, SEXP rownames) +{ + // PROTECT / UNPROTECT stack counter + int nprotect=0; + + SEXP rncontainer = PROTECT(allocVector(VECSXP, 1)); nprotect++; // PROTECTED container to hold (maybe coerced) rownames + + // Coerce and check rownames - do this first so rownames, if coerced, + // is placed at the bottom of the protection stack. This way we don't + // unintentionally unprotect it when poping the stack, e.g after coercing + // a column and assigning back into the protected list container. + if (!isNull(rownames)) { + int rnstack = 0; // separate protect stack counter because number of coercions may be 0-3 + // If ff vector must be loaded into memory + if (INHERITS(rownames, char_ff)) { + rownames = callRfun1("[.ff", "ff", rownames, &rnstack); // rnstack incremented in function + } + // Convert to string if factor, date, or time + toCharacterFactorPOSIX(&rownames, &rnstack); + // if integer64 convert to character to avoid garbage numeric values + if (INHERITS(rownames, char_integer64)) { + rownames = asCharacterInteger64(rownames, &rnstack); // rnstack incremented in function + } + if (INHERITS(rownames, char_dataframe) || INHERITS(rownames, char_datatable) || + !isNull(getAttrib(rownames, R_DimSymbol))) + error("Extracted rownames column or provided rownames.values are multi-column type (e.g. a matrix or data.table) and cannot be used as rownames"); + else if (TYPEOF(rownames) == VECSXP || TYPEOF(rownames) == LISTSXP) + warning("Extracted rownames column or provided rownames.values are a list column, so will be converted to a character representation"); + SET_VECTOR_ELT(rncontainer, 0, rownames); UNPROTECT(rnstack); // coerced rownames now PROTECTED in rnContainer + rownames = VECTOR_ELT(rncontainer, 0); + } + + // Extract column types and determine type to coerce to + int preprocessstack=0; + int maxType=LGLSXP; // initialised to type of first column by preprocess(), LGLSXP if empty matrix + int64_t nrow=0; // initialised to length of first column by preprocess(), 0 rows if emtpy matrix + int64_t ncol=0; // Incremented by preprocess() based on columns encountered (some may be NULL or multi-column) + bool coerce=false; // if no columns need coercing, can just use memcpy + bool integer64=false; // are we coercing to integer64? + bool unpack=false; // are there any columns to drop or unpack? + bool recycle=false; // do any columns need recyling to nrow? + preprocess(&dt, &preprocessstack, &maxType, &nrow, &ncol, &coerce, &integer64, &unpack, &recycle); + + // Check matrix is not larger than allowed by R + if (ncol > R_LEN_T_MAX || nrow > R_LEN_T_MAX) + error("R does not support matrices with more than %d columns or rows", R_LEN_T_MAX); // # nocov + + // Check rownames length for errors now we know nrow. + if (!isNull(rownames) && nrow != 0 && xlength(rownames) != nrow) { + error("Extracted rownames column or provided rownames.values do not match the number of rows in the matrix"); + } + + // If no columns, nrow is dictated by rownames + if (ncol == 0) + nrow = isNull(rownames) ? 0 : xlength(rownames); + + // allocate matrix + SEXP ans = PROTECT(allocMatrix(maxType, nrow, ncol)); nprotect++; // should only be 'rnContainer' and 'ans' on PROTECT stack + + // if ncol == 0 we can now return + if (ncol == 0) { + if (!isNull(rownames)) { // Add rownames if they exist + SEXP dimnames = PROTECT(allocVector(VECSXP, 2)); nprotect++; + SET_VECTOR_ELT(dimnames, 0, rownames); + SET_VECTOR_ELT(dimnames, 1, R_NilValue); + setAttrib(ans, R_DimNamesSymbol, dimnames); + } + UNPROTECT(nprotect); + return(ans); + } + + // if, somehow, the input data.table lacks names, make them + SEXP colnames = getAttrib(dt, R_NamesSymbol); // names(dt) + if (isNull(colnames)) { + colnames = makeNames(xlength(dt), &nprotect); // nprotect incremented + setAttrib(dt, R_NamesSymbol, colnames); UNPROTECT(1); nprotect--; // PROTECTED by dt now + } + + // Unpack data.table if needed + if (unpack) { + SEXP newdt = PROTECT(allocVector(VECSXP, ncol)); nprotect++; + SEXP newcn = PROTECT(allocVector(STRSXP, ncol)); nprotect++; + int64_t j = 0; + int flprotect=0; // protect stack counter for flatten + flatten(&dt, &colnames, &newdt, &newcn, &j, &flprotect); + setAttrib(newdt, R_NamesSymbol, newcn); + UNPROTECT(flprotect); // everything in flattened now PROTECTED in newdt + UNPROTECT(1); nprotect--; // pop newcn PROTECT call, now PROTECTED by 'newdt' after setAttrib. + dt = newdt; + colnames = getAttrib(dt, R_NamesSymbol); + } + // top of PROTECT stack is either 'ans' or 'newdt' (if unpack was true) + + // Add dimension names + SEXP dimnames = PROTECT(allocVector(VECSXP, 2)); nprotect++; + SET_VECTOR_ELT(dimnames, 0, rownames); + SET_VECTOR_ELT(dimnames, 1, colnames); + setAttrib(ans, R_DimNamesSymbol, dimnames); UNPROTECT(1); nprotect--; // dimnames now PROTECTED by ans + + // If nrow 0 we can now return. + if (nrow == 0) { + UNPROTECT(nprotect); + return(ans); + } + + // recycle columns to match nrow if needed + if (recycle) { + for (int64_t j = 0; j < ncol; ++j) { + SEXP thisCol = VECTOR_ELT(dt, j); + int64_t thisnrow = xlength(thisCol); + if (thisnrow != nrow) { + if (nrow % thisnrow != 0) { // nrow not divisible by thisnrow; error + error("Could not recycle column of length %d into nrow of %d", thisnrow, nrow); + } + // Allocate new column to fill with memrecycle. Add integer64 class if needed. + SEXP newCol = PROTECT(allocVector(TYPEOF(thisCol), nrow)); nprotect++; + if (INHERITS(thisCol, char_integer64)) { + SEXP i64Class = PROTECT(allocVector(STRSXP, 1)); nprotect++; + SET_STRING_ELT(i64Class, 0, char_integer64); + setAttrib(newCol, R_ClassSymbol, i64Class); UNPROTECT(1); nprotect--; // class PROTECTED in newCol + } + // Fill column repeating elements as needed + for (int i = 0; i < (nrow / thisnrow); ++i) { + memrecycle(newCol, R_NilValue, i*thisnrow, thisnrow, thisCol, 0, -1, 0, "V1"); + } + SET_VECTOR_ELT(dt, j, newCol); UNPROTECT(1); nprotect--; // newCol PROTECTED in dt + } + } + } + + // Identify and coerce columns as needed + if (coerce) { + for (int j=0; j