diff --git a/.appveyor.yml b/.appveyor.yml index a283cd2a3..0f9cdf9e6 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -16,18 +16,14 @@ environment: global: CRAN: http://cloud.r-project.org WARNINGS_ARE_ERRORS: 1 - R_CHECK_ARGS: --no-manual --no-multiarch - R_ARCH: i386 -# R_CHECK_ARGS specified in order to turn off --as-cran (on by default) as that can be slow -# multiarch is on by default which (when R_ARCH: x64) compiles and tests both 32bit and 64bit in one x64 job -# --no-multiarch so as to not run both 32bit and 64bit on every commit in PRs to save dev cycle time; GLCI after merge is full-strength -# GHA has MacOS 64bit (test-coverage) and Ubuntu 64bit, therefore picked 32bit for Windows - GCC_PATH: mingw_64 -# Default GCC_PATH appears to be gcc-4.6.3 which is now unsupported as from Rtools.exe v3.4. + R_CHECK_ARGS: --as-cran --no-manual +# --no-manual to avoid error 'pdflatex is not available' +# --as-cran no longer a lot slower (now takes under 6 mins with and without); logs show _R_CHECK_CRAN_INCOMING_=FALSE which could take 5+ mins _R_CHECK_NO_STOP_ON_TEST_ERROR_: true # continue tests even if some script failed _R_CHECK_TESTS_NLINES_: 0 # Block truncation of any error messages in R CMD check +# R is 64-bit only on Windows from 4.2.0 (prior default was build and test both 32bit and 64bit) so we no longer use R_ARCH to pick one to reduce CI time in PRs matrix: diff --git a/.ci/ci.R b/.ci/ci.R index 70e5fa27a..a165de818 100644 --- a/.ci/ci.R +++ b/.ci/ci.R @@ -47,7 +47,8 @@ function (repos, type = getOption("pkgType"), ver) dcf.dependencies <- function(file = "DESCRIPTION", which = NA, - except.priority = "base") { + except.priority = "base", + exclude = NULL) { if (!is.character(file) || !length(file) || !all(file.exists(file))) stop("file argument must be character of filepath(s) to existing DESCRIPTION file(s)") if (!is.character(except.priority)) @@ -79,7 +80,13 @@ function(file = "DESCRIPTION", } x <- unlist(lapply(x, 
local.extract_dependency_package_names)) except <- if (length(except.priority)) c("R", unlist(tools:::.get_standard_package_names()[except.priority], use.names = FALSE)) - setdiff(x, except) + x = setdiff(x, except) + if (length(exclude)) { # to exclude knitr/rmarkdown, 5294 + if (!is.character(exclude) || anyDuplicated(exclude)) + stop("exclude may be NULL or a character vector containing no duplicates") + x = setdiff(x, exclude) + } + x } ## returns additional repositories for dependency packages based on its DESCRIPTION file diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 3d46c94d6..928f0e07f 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -8,6 +8,8 @@ alias gdm='git difftool master &> /dev/null' # If meld has scrolling issues, turn off GTK animation which I don't need: # https://gitlab.gnome.org/GNOME/meld/-/issues/479#note_866040 +alias perfbar=~/build/gtk_perfbar/linux_perfbar # revdep.R; https://github.com/tomkraljevic/gtk_perfbar + alias Rdevel='~/build/R-devel/bin/R --vanilla' alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' @@ -15,7 +17,7 @@ alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' -alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=NULL && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' # use ~/build/R-devel/bin/R at the end of revdepr to use R-devel instead of R-release. # If so, doing a `rm -rf *` in revdeplib first to rebuild everything is easiest way to avoid potential problems later. 
A full rebuild is a good idea periodically anyway. Packages in diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index b010d175f..3442dcb38 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -185,7 +185,7 @@ grep -En "for\s*[(]\s*[a-zA-Z0-9_]+\s*=" src/*.c | grep -Fv "#loop_counter_not_l cd .. R -cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html +cc(test=TRUE, clean=TRUE, CC="gcc-12") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html saf = options()$stringsAsFactors options(stringsAsFactors=!saf) # check tests (that might be run by user) are insensitive to option, #2718 test.data.table() @@ -195,15 +195,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. -R CMD check data.table_1.14.1.tar.gz --as-cran -R CMD INSTALL data.table_1.14.1.tar.gz --html +R CMD check data.table_1.14.9.tar.gz --as-cran +R CMD INSTALL data.table_1.14.9.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.14.1.tar.gz +R CMD check data.table_1.14.9.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -220,16 +220,24 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. 
-PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.1.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.9.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.1.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.9.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) require(data.table) +f1 = tempfile() +f2 = tempfile() +suppressWarnings(try(rm(list=c(".Last",".Random.seed")))) +save.image(f1) test.data.table(script="other.Rraw") test.data.table(script="*.Rraw") test.data.table(verbose=TRUE) # since main.R no longer tests verbose mode +suppressWarnings(try(rm(list=c(".Last",".Random.seed")))) +save.image(f2) +system(paste("diff",f1,f2)) # to detect any changes to .GlobalEnv, #5514 +# print(load(f1)); print(load(f2)) # run if diff found any difference # check example() works on every exported function, with these sticter options too, and also that all help pages have examples options(warn=2, warnPartialMatchArgs=TRUE, warnPartialMatchAttr=TRUE, warnPartialMatchDollar=TRUE) @@ -258,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.14.1.tar.gz +R310 CMD INSTALL ./data.table_1.14.9.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -270,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . 
-R CMD INSTALL data.table_1.14.1.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.14.9.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -278,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.14.1.tar.gz +R CMD check data.table_1.14.9.tar.gz ##################################################### @@ -289,25 +297,30 @@ cd ~/build wget -N https://stat.ethz.ch/R/daily/R-devel.tar.gz rm -rf R-devel rm -rf R-devel-strict-* -tar xvf R-devel.tar.gz +tar xf R-devel.tar.gz mv R-devel R-devel-strict-gcc -tar xvf R-devel.tar.gz +tar xf R-devel.tar.gz mv R-devel R-devel-strict-clang -tar xvf R-devel.tar.gz +tar xf R-devel.tar.gz +sudo apt-get -y build-dep r-base cd R-devel # may be used for revdep testing: .dev/revdep.R. # important to change directory name before building not after because the path is baked into the build, iiuc ./configure CFLAGS="-O0 -Wall -pedantic" make # use latest available `apt-cache search gcc-` or `clang-` +# wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - +# sudo add-apt-repository 'deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-15 main' +# sudo apt-get install clang-15 + cd ~/build/R-devel-strict-clang -./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-15 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-sanitize=alignment -fno-omit-frame-pointer" CFLAGS="-g -O3 -Wall -pedantic" make cd ~/build/R-devel-strict-gcc -# gcc-10 (in dev currently) failed to build R, so using regular gcc-9 (9.3.0 as per 
focal/Pop!_OS 20.04) -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-9 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +# gcc-10 failed to build R-devel at some point, so using regular gcc-9 (9.3.0 as per focal/Pop!_OS 20.04) +./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make # See R-exts#4.3.3 @@ -328,15 +341,23 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.14.1.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.14.1.tar.gz -# Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here -Rdevel-strict-gcc -Rdevel-strict-clang # repeat below with clang and gcc +Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.9.tar.gz +# Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be +# passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R +# So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.9.tar.gz +# Use the (failed) output to get the list of currently needed packages and install them +Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested options(repos = "http://cloud.r-project.org") -install.packages(c("bit64","xts","nanotime","R.utils","yaml")) # minimum packages needed to not skip any tests in test.data.table() -# install.packages(c("curl","knitr")) # for `R CMD check` when not strict. 
Too slow to install when strict +install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", "yaml", "knitr", "rmarkdown", "markdown"), + Ncpus=4) +# Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check +q("no") +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.9.tar.gz +# UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. +find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; + require(data.table) test.data.table(script="*.Rraw") # 7 mins (vs 1min normally) under UBSAN, ASAN and --strict-barrier # without the fix in PR#3515, the --disable-long-double lumped into this build does now work and correctly reproduces the noLD problem @@ -370,7 +391,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.14.1.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.14.9.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -408,7 +429,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . 
../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.14.1.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.14.9.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -469,14 +490,15 @@ shutdown now # doesn't return you to host prompt properly so just kill the win # Downstream dependencies ############################################### -# IF NOT ALREADY INSTALLED +# IF NOT ALREADY INSTALLED, OR AFTER AN OS UPGRADE +# No harm rerunning these commands; they do not reinstall if already latest version sudo apt-get update sudo apt-get -y install htop sudo apt-get -y install r-base r-base-dev sudo apt-get -y build-dep r-base-dev sudo apt-get -y build-dep qpdf sudo apt-get -y install aptitude -sudo aptitude -y build-dep r-cran-rgl # leads to libglu1-mesa-dev +sudo apt-get -y build-dep r-cran-rgl # leads to libglu1-mesa-dev sudo apt-get -y build-dep r-cran-rmpi sudo apt-get -y build-dep r-cran-cairodevice sudo apt-get -y build-dep r-cran-tkrplot @@ -524,6 +546,8 @@ sudo apt-get -y install libgit2-dev # for gert sudo apt-get -y install cmake # for symengine for RxODE sudo apt-get -y install libxslt1-dev # for xslt sudo apt-get -y install flex # for RcppCWB +sudo apt-get -y install libavfilter-dev libsodium-dev libgmp-dev libssh-dev librdf0-dev +sudo apt-get -y install libmariadb-dev mariadb-client # RMySQL for xQTLbiolinks sudo R CMD javareconf # ENDIF @@ -532,6 +556,7 @@ inst() # *** ensure latest dev version of data.table installed into revdeplib run() # prints menu of options status() # includes timestamp of installed data.table that is being tested. 
log() # cats all fail logs to ~/fail.log +cran() # compare packages with error or warning to their status on CRAN # Once all issues resolved with CRAN packages, tackle long-term unfixed bioconductor packages as follows. # 1. Note down all error and warning bioc packages @@ -569,7 +594,7 @@ du -k inst/tests # 0.75MB after R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" Rdevel -q -e "packageVersion('xml2')" # ensure installed -Rdevel CMD check data.table_1.14.0.tar.gz --as-cran # use latest Rdevel as it may have extra checks +Rdevel CMD check data.table_1.14.10.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -596,8 +621,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.7 to 1.14.1, and 1.13.6 to 1.14.0 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.9 to 1.14.11 inc below, 1.14.10 to 1.14.12 above, 1.14.8 to 1.14.10 below 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.14.0 on CRAN. Bump to 1.14.1" -9. Take sha from step 8 and run `git tag 1.14.0 96c..sha..d77` then `git push origin 1.14.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.14.8 on CRAN. Bump to 1.14.10" +9. 
Take sha from step 8 and run `git tag 1.14.8 96c..sha..d77` then `git push origin 1.14.8` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/.dev/cc.R b/.dev/cc.R index 6c278e269..a092aba35 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -22,8 +22,6 @@ # c # test and step between R and C -options(datatable.print.class = TRUE) - sourceDir = function(path=getwd(), trace = TRUE, ...) { # copied verbatim from example(source) in base R for (nm in list.files(path, pattern = "\\.[RrSsQq]$")) { @@ -50,7 +48,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys # Make sure library .so is not loaded (neither installed package nor from dev) dll = unlist(do.call("rbind",getLoadedDLLs())[,"path"]) - dll = grep("data_table.so",dll,value=TRUE) + dll = grep("data_table.so", dll, fixed=TRUE, value=TRUE) sapply(dll, dyn.unload) gc() @@ -63,27 +61,31 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys if (debug) { ret = system(sprintf("MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f%sopenmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data_table.so *.c", CC, OMP)) } else { - ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) + ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -Wstrict-prototypes\\ -isystem\\ /usr/share/R/include\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) + # the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem. # TODO add -Wextra too? 
} if (ret) return() # clang -Weverything includes -pedantic and issues many more warnings than gcc # system("R CMD SHLIB -o data_table.so *.c") - if (any(sapply(objects(envir=.GlobalEnv),function(x){inherits(get(x,.GlobalEnv),"data.table")}))) { - cat("ABOUT TO RELOAD .SO BUT THERE ARE DATA.TABLE OBJECTS IN .GLOBALENV SO FINALIZER MIGHT CRASH\n") + for (obj in ls(.GlobalEnv)) { + if (inherits(.GlobalEnv[[obj]], "data.table")) { + cat("ABOUT TO RELOAD .SO BUT THERE ARE DATA.TABLE OBJECTS IN .GLOBALENV SO FINALIZER MIGHT CRASH\n") + break + } } dyn.load("data_table.so") setwd(old) xx = getDLLRegisteredRoutines("data_table",TRUE) - for (i in seq_along(xx$.Call)) - assign(xx$.Call[[i]]$name, xx$.Call[[i]]$address, envir=.GlobalEnv) - for (i in seq_along(xx$.External)) - assign(xx$.External[[i]]$name, xx$.External[[i]]$address, envir=.GlobalEnv) - sourceDir(paste0(path,"/R")) - if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table","cbind.data.table"), envir=.GlobalEnv) # 3968 follow up - assign("testDir", function(x)paste0(path,"/inst/tests/",x), envir=.GlobalEnv) + for (Call in xx$.Call) + .GlobalEnv[[Call$name]] = Call$address + for (Extern in xx$.External) + .GlobalEnv[[Extern$name]] = Extern$address + sourceDir(file.path(path, "R")) + if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table", "cbind.data.table"), envir=.GlobalEnv) # 3968 follow up + .GlobalEnv$testDir = function(x) file.path(path,"inst/tests",x) .onLoad() - if (is.logical(test) && isTRUE(test)) test.data.table() else if (is.character(test)) test.data.table(script=test) + if (isTRUE(test)) test.data.table() else if (is.character(test)) test.data.table(script=test) gc() invisible() } diff --git a/.dev/revdep.R b/.dev/revdep.R index da90f0c66..0b949da36 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -13,9 +13,10 @@ options(error=quote(utils::dump.frames())) options(width=200) # for cran() output not to wrap # Check that env variables have been set correctly: -# export R_LIBS_SITE=none +# 
export R_LIBS_SITE=NULL # R 4.2.0 changed to NULL but it doesn't appear to work # export R_LIBS=~/build/revdeplib/ # export _R_CHECK_FORCE_SUGGESTS_=true +if (length(.libPaths())==3L) .libPaths(.libPaths()[-2L], include.site=FALSE) # workaround as I couldn't get R_LIBS_SITE=NULL to be effective stopifnot(identical(length(.libPaths()), 2L)) # revdeplib writeable by me, and the pre-installed recommended R library (sudo writeable) stopifnot(identical(.libPaths()[1L], getwd())) tt = file.info(.libPaths())[,"uname"] @@ -96,9 +97,33 @@ update.packages(ask=FALSE, checkBuilt=TRUE) avail = available.packages() # includes CRAN and Bioc, from getOption("repos") set above -avail = avail[-match("cplexAPI",rownames(avail)),] +avail = avail[!rownames(avail) %in% c("cplexAPI","Rcplex"), ] # cplexAPI is suggested by revdeps ivmte and prioritizr. I haven't succeeded to install IBM ILOG CPLEX which requires a license, # so consider cplexAPI not available when resolving missing suggests at the end of status(). +# Update: cplexAPI was removed from CRAN on 5 Nov 2021 so this is now redundant, but leave it in place for future use. +# Update: Rcplex is on CRAN as of 20 Nov 2022 but with install errors, therefore treat it as not available. + +# The presence of packages here in revdeplib which no longer exist on CRAN could explain differences to CRAN. A revdep +# could be running tests using that package when available and failing which may be the very reason that package was removed from CRAN. +# When it is removed from revdeplib to match CRAN, then the revdep might then pass as it will skip its tests using that package. 
+x = installed.packages() +tt = match(rownames(x), rownames(avail)) +removed = rownames(x)[is.na(tt) & is.na(x[,"Priority"])] +cat("Removing",length(removed),"packages which are no longer available on CRAN/Bioc:", paste(removed, collapse=","), "\n") +stopifnot(all(x[removed,"LibPath"] == .libPaths()[1])) +oldn = nrow(x) +remove.packages(removed, .libPaths()[1]) +x = installed.packages() +stopifnot(nrow(x) == oldn-length(removed)) + +# Ensure all installed packages were built with this x.y release of R; i.e. that checkBuilt=TRUE worked above +cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") +cat("Previously installed packages were built using:\n") +print(tt <- table(x[,"Built"], dnn=NULL)) +minorR = paste(strsplit(as.character(getRversion()), split="[.]")[[1]][c(1,2)], collapse=".") +if (any(w<-names(tt)20)paste(" +",length(ns)-20,"more"), "\n"), "\n" ) - assign(if (bioc) ".fail.bioc" else ".fail.cran", c(sort(names(x)[e]), sort(names(x)[w])), envir=.GlobalEnv) + assign(if (bioc) ".fail.bioc" else ".fail.cran", c(sort(names(x)[e]), sort(names(x)[w])), envir=.GlobalEnv) + assign(if (bioc) ".running.bioc" else ".running.cran", sort(names(x)[r]), envir=.GlobalEnv) + # if parallel finished then 'running' means killed; we want to see if status on CRAN (using cran()) shows FAIL with a log showing kill signal (or similar) due to taking too long invisible() } @@ -253,36 +257,44 @@ status = function(bioc=FALSE) { } } if (length(all_sugg_unavail)) { - cat('\nPackages for which all their missing suggests are not available, try:\n', - ' run("',paste(all_sugg_unavail,collapse=" "),'", R_CHECK_FORCE_SUGGESTS=FALSE)\n', sep="") + cat('\nPackages for which all their missing suggests are not available (',length(all_sugg_unavail),'): ', paste(all_sugg_unavail, collapse=" "), "\n", sep="") + cat('Rerunning them with R_CHECK_FORCE_SUGGESTS=FALSE ...\n') + run(all_sugg_unavail, R_CHECK_FORCE_SUGGESTS=FALSE, ask=FALSE) + # the main run() ran with 
TRUE in an attempt to check suggests where possible in case data.table usage is there. It does that as OS level using + # parallel linux command of `R CMD check`. Therefore it would be awkward to rerun with TRUE in that step. Instead we trigger the rerun from + # here in status() afterwards once we know all such packages and can count and log them (the cat() just above). Since run() is + # asynchronous we do have to wait again and run status() again when perfbar shows finished. However, currently there are only + # 37/1315 like this so it only takes a few minutes with my default of 6 at a time. + return(TRUE) # to indicate status() started a run() so that the run() afterwards can be avoided which would otherwise see these as not started and run them simultaneously but with _SUGGESTS=TRUE } # Otherwise, inspect manually each result in fail.log written by log() } - invisible() + invisible(FALSE) } -cran = function() # reports CRAN status of the .cran.fail packages +cran = function() # reports CRAN status of the .fail.cran packages { - if (!length(.fail.cran)) { - cat("No CRAN revdeps in error or warning status\n") + x = c(.fail.cran, .running.cran) + if (!length(x)) { + cat("No CRAN revdeps in error, warning or running status\n") return(invisible()) } require(data.table) p = proc.time() - db = setDT(tools::CRAN_check_results()) + db <<- setDT(tools::CRAN_check_results()) cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") - ans = db[Package %chin% .fail.cran, - .(ERROR=sum(Status=="ERROR"), - WARN =sum(Status=="WARN"), + ans = db[Package %chin% x, + .(ERROR=sum(Status=="ERROR", na.rm=TRUE), + WARN =sum(Status=="WARN", na.rm=TRUE), cran =paste(unique(Version),collapse=";"), - local=as.character(packageVersion(.BY[[1]]))), + local=as.character(tryCatch(packageVersion(.BY[[1]]), error=function(e)"error"))), keyby=Package] ans[local==cran, c("cran","local"):=""] ans[, 
"right_click_in_bash":=paste0("https://cran.r-project.org/web/checks/check_results_",Package,".html")] - ans[] + setkey(ans, Package)[x,] } -run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { +run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL, ask=TRUE) { if (length(pkgs)==1) pkgs = strsplit(pkgs, split="[, ]")[[1]] if (anyDuplicated(pkgs)) stop("pkgs contains dups") if (!length(pkgs)) { @@ -321,13 +333,13 @@ run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { cat("Running",length(pkgs),"packages:", paste(pkgs), "\n") filter = paste0("| grep -E '", paste0(paste0(pkgs,"_"),collapse="|"), "' ") } - if (is.null(choose)) { + if (ask && is.null(choose)) { cat("Proceed? (ctrl-c or enter)\n") scan(quiet=TRUE) } if (!identical(pkgs,"_ALL_")) for (i in pkgs) system(paste0("rm -rf ./",i,".Rcheck")) SUGG = paste0("_R_CHECK_FORCE_SUGGESTS_=",tolower(R_CHECK_FORCE_SUGGESTS)) - cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 ",SUGG," parallel --max-procs 50% ",R," CMD check") + cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 ",SUGG," parallel --max-procs 50% --timeout 1200 ",R," CMD check") # TZ='UTC' because some packages have failed locally for me but not on CRAN or for their maintainer, due to sensitivity of tests to timezone if (as.integer(system("ps -e | grep perfbar | wc -l", intern=TRUE)) < 1) system("perfbar",wait=FALSE) system("touch /tmp/started.flag ; rm -f /tmp/finished.flag") @@ -369,8 +381,7 @@ log = function(bioc=FALSE, fnam="~/fail.log", app="gedit") { } inst() -status() -run(choose=1) # run not-started (i.e. updates to and new revdeps) automatically on revdep startup +if (!status()) run(choose=1) # run pkgs in not-started status; i.e. updates to and new revdeps. Unless status() found any all-suggests-unavail which it then started run() for with _SUGGESTS=FALSE # Now R prompt is ready to fix any problems with CRAN or Bioconductor updates. 
# Then run run(), status() and log() as per section in CRAN_Release.cmd diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/issue_template.md similarity index 86% rename from .github/ISSUE_TEMPLATE.md rename to .github/ISSUE_TEMPLATE/issue_template.md index 3facaa4de..09857e4e6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -1,3 +1,8 @@ +--- +name: Bug report or feature request +about: Report a bug or describe a new requested feature +--- + Click preview tab ^^^ above! By continuing to file this new issue / feature request, I confirm I have : @@ -10,6 +15,6 @@ By continuing to file this new issue / feature request, I confirm I have : #### Thanks! Please remove the text above and include the two items below. -`#` [`Minimal reproducible example`](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) +`#` [`Minimal reproducible example`](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example); please be sure to set `verbose=TRUE` where possible! `#` `Output of sessionInfo()` diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 248f97a30..f0c403793 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,9 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. 
- R_REL_VERSION: "4.1" - R_DEVEL_VERSION: "4.2" - R_OLDREL_VERSION: "4.0" + R_REL_VERSION: "4.2" + R_DEVEL_VERSION: "4.3" + R_OLDREL_VERSION: "4.1" stages: - dependencies @@ -40,19 +40,19 @@ mirror-packages: ## mirror all recursive dependencies, source and win.binary of - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw - stage: dependencies - tags: - - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev - cache: - paths: - - bus/$CI_BUILD_NAME/cran - script: - - echo 'source(".ci/ci.R")' >> .Rprofile - - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' - <<: *artifacts +# mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw; off now #5274 +# stage: dependencies +# tags: +# - linux +# image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev +# cache: +# paths: +# - bus/$CI_BUILD_NAME/cran +# script: +# - echo 'source(".ci/ci.R")' >> .Rprofile +# - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib +# - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' +# <<: *artifacts build: ## build data.table sources as tar.gz archive stage: build @@ -63,6 +63,7 @@ build: ## build data.table sources as tar.gz archive before_script: - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus + - sed -i 
'/^[[:space:]]*$/d' ./DESCRIPTION ## make last line end abruptly; i.e. without a final \n - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION script: - R CMD build . @@ -74,8 +75,6 @@ build: ## build data.table sources as tar.gz archive .test-install-deps: &install-deps - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' -.test-install-deps-win: &install-deps-win - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" .test-cp-src: &cp-src - cp $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) . @@ -96,14 +95,12 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.0/R-4.1.0-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.2.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win - - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.5/R-4.0.5-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait - -.test-install-rtools-win: &install-rtools-win - - curl.exe -s -o 
../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-template: &test stage: test @@ -144,16 +141,17 @@ build: ## build data.table sources as tar.gz archive test-rel-lin: ## most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common -Wunused-result <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-builder - needs: ["mirror-packages","mirror-other-packages","build"] + needs: ["mirror-packages","build"] # "mirror-other-packages" variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" - TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "TRUE" + TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "FALSE" #5274 before_script: - - Rscript -e 'source(".ci/ci.R"); eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(c(dcf.dependencies("DESCRIPTION", which="all"), pkgs), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), quiet=TRUE)' ## does seem to be needed despite 'needs mirror-packages' + ## - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(pkgs, quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' - *cp-src - rm -r bus - mkdir -p ~/.R @@ -170,7 +168,7 @@ test-rel-vanilla-lin: ## minimal, no suggested deps, no vignettes or manuals, me <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev variables: 
- TEST_DATA_TABLE_MEMTEST: "TRUE" + TEST_DATA_TABLE_MEMTEST: "1" before_script: - *cp-src - rm -r bus @@ -245,9 +243,9 @@ test-rel-win: ## R-release on Windows, test and build binaries R_VERSION: "$R_REL_VERSION" before_script: - *install-r-rel-win - - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - *install-deps-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" - *cp-src-win - rm.exe -r bus script: @@ -258,15 +256,15 @@ test-rel-win: ## R-release on Windows, test and build binaries - *rm-src-win - *mv-bin-win -test-dev-win: ## R-devel on Windows +test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related to UCRT and Rtools42 <<: *test-win variables: R_VERSION: "$R_DEVEL_VERSION" before_script: - *install-r-devel-win - - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - *install-deps-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5493-5475.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait + - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus script: @@ -283,9 +281,10 @@ test-old-win: ## R-oldrel on Windows R_VERSION: "$R_OLDREL_VERSION" before_script: - *install-r-oldrel-win - - *install-rtools-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath 
..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait + ## rtools42 doesn't support 32bit so oldrel-win (currently R 4.1) needs rtools40 - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - *install-deps-win + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus script: diff --git a/DESCRIPTION b/DESCRIPTION index fea7936d5..74a4b6e1c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,16 @@ Package: data.table -Version: 1.14.1 +Version: 1.14.9 Title: Extension of `data.frame` +Depends: R (>= 3.1.0) +Imports: methods +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown +SystemRequirements: zlib +Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
+License: MPL-2.0 | file LICENSE +URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table +BugReports: https://github.com/Rdatatable/data.table/issues +VignetteBuilder: knitr +ByteCompile: TRUE Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), person("Arun","Srinivasan", role="aut", email="asrini@pm.me"), @@ -61,7 +71,7 @@ Authors@R: c( person("Vaclav","Tlapak", role="ctb"), person("Kevin","Ushey", role="ctb"), person("Dirk","Eddelbuettel", role="ctb"), - person("Ben","Schwen", role="ctb"), + person("Benjamin","Schwendinger", role="ctb"), person("Tony","Fischetti", role="ctb"), person("Ofek","Shilon", role="ctb"), person("Vadim","Khotilovich", role="ctb"), @@ -69,14 +79,8 @@ Authors@R: c( person("Bennet","Becker", role="ctb"), person("Kyle","Haynes", role="ctb"), person("Boniface Christian","Kamgang", role="ctb"), - person("Olivier","Delmarcell", role="ctb")) -Depends: R (>= 3.1.0) -Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown -SystemRequirements: zlib -Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
-License: MPL-2.0 | file LICENSE -URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table -BugReports: https://github.com/Rdatatable/data.table/issues -VignetteBuilder: knitr -ByteCompile: TRUE + person("Olivier","Delmarcell", role="ctb"), + person("Josh","O'Brien", role="ctb"), + person("Dereck","de Mezquita", role="ctb"), + person("Michael","Czekanski", role="ctb") + ) diff --git a/Makefile b/Makefile index 2be00d3b7..b4d8517df 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.1.tar.gz + $(RM) data.table_1.14.9.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.1.tar.gz + $(R) CMD INSTALL data.table_1.14.9.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.1.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.9.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NAMESPACE b/NAMESPACE index 81c0fce68..ef0aa2d17 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,9 +8,9 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) -export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%") +export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%") export(timetaken) -export(truelength, setalloccol, alloc.col, ":=") +export(truelength, setalloccol, alloc.col, ":=", let) export(setattr, setnames, setcolorder, set, setDT, setDF) export(setorder, setorderv) export(setNumericRounding, getNumericRounding) @@ -60,6 +60,7 @@ 
export(substitute2) export(DT) # mtcars |> DT(i,j,by) #4872 S3method("[", data.table) +export("[.data.table") # so that functional DT() finds it; PR#5176 S3method("[<-", data.table) # S3method("[[", data.table) # S3method("[[<-", data.table) @@ -89,19 +90,18 @@ if (getRversion() >= "4.0.0") { # if we register these (new in v1.12.6) methods always though, the previous workaround no longer works in R<4.0.0. Hence only register in R>=4.0.0. S3method(cbind, data.table) S3method(rbind, data.table) +} else { + # and if we export but don't register in R < 4.0.0 we get this note: + # > Found the following apparent S3 methods exported but not registered: + # > cbind.data.table rbind.data.table + # in addition to errors in tests 324, 326, 414.1, 414.2, 442, 445, 451 + # export(cbind.data.table) + # export(rbind.data.table) + # A revdep using rbind.data.frame() directly before (which data.table changed in base) should change to rbind() generic and that should work + # in all combinations of R before/after 4.0.0 and data.table before/after 1.12.6, so long as data.table is installed using the same major + # version of R (and that is checked in .onLoad with error if not). + export(.rbind.data.table) # only export in R<4.0.0 where it is still used; R-devel now detects it is missing doc, #5600 } -# else { -# # and if we export but don't register in R < 4.0.0 we get this note: -# # > Found the following apparent S3 methods exported but not registered: -# # > cbind.data.table rbind.data.table -# # in addition to errors in tests 324, 326, 414.1, 414.2, 442, 445, 451 -# export(cbind.data.table) -# export(rbind.data.table) -# # A revdep using rbind.data.frame() directly before (which data.table changed in base) should change to rbind() generic and that should work -# # in all combinations of R before/after 4.0.0 and data.table before/after 1.12.6, so long as data.table is installed using the same major -# # version of R (and that is checked in .onLoad with error if not). 
-# } -export(.rbind.data.table) # continue to export for now because it has been exported in the past so it may be depended on S3method(dim, data.table) S3method(dimnames, data.table) S3method("dimnames<-", data.table) @@ -130,8 +130,7 @@ S3method(melt, default) export(melt.data.table, dcast.data.table) import(utils) -S3method(update, dev.pkg) -export(update.dev.pkg) +export(update_dev_pkg) S3method(tail, data.table) S3method(head, data.table) import(stats) @@ -147,7 +146,7 @@ if (getRversion() >= "3.6.0") { # IDateTime support: export(as.IDate,as.ITime,IDateTime) -export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year) +export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year,yearmon,yearqtr) S3method("[", ITime) S3method("+", IDate) @@ -187,6 +186,8 @@ S3method(seq, ITime) S3method(unique, IDate) S3method(unique, ITime) S3method('[<-', IDate) +S3method('min', IDate) +S3method('max', IDate) S3method(edit, data.table) # generics to support custom column formatters @@ -197,3 +198,5 @@ S3method(format_col, expression) export(format_list_item) S3method(format_list_item, default) +export(fdroplevels) +S3method(droplevels, data.table) diff --git a/NEWS.md b/NEWS.md index 03dbe3d4b..025a7651b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,6 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -**Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** - -# data.table [v1.14.1](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.9](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES @@ -99,7 +97,7 @@ 16. `fwrite()` now accepts `sep=""`, [#4817](https://github.com/Rdatatable/data.table/issues/4817). 
The motivation is an example where the result of `paste0()` needs to be written to file but `paste0()` takes 40 minutes due to constructing a very large number of unique long strings in R's global character cache. Allowing `fwrite(, sep="")` avoids the `paste0` and saves 40 mins. Thanks to Jan Gorecki for the request, and Ben Schwen for the PR. -17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns; `format_list_item` is S3-generic for customizing how to print each row of a list column. Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), and @MichaelChirico for implementing. See `?print.data.table` for examples. +17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns and by default defers to the S3 `format` method for the column's class if one exists; e.g. `format.sfc` for geometry columns from the `sf` package, [#2273](https://github.com/Rdatatable/data.table/issues/2273). Similarly, `format_list_item` is S3-generic for customizing how to print each row of list columns (which lack a format method at a column level) and also by default defers to the S3 `format` method for that item's class if one exists. 
Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), @JoshOBrien for improving the output for geometry columns, and @MichaelChirico for implementing. See `?print.data.table` for examples. 18. `tstrsplit(,type.convert=)` now accepts a named list of functions to apply to each part, [#5094](https://github.com/Rdatatable/data.table/issues/5094). Thanks to @Kamgang-B for the request and implementing. @@ -114,7 +112,7 @@ ```R mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` - + When `data.table` queries (either `[...]` or `|> DT(...)`) receive a `data.table`, the operations maintain `data.table`'s attributes such as its key and any indices. For example, if a `data.table` is reordered by `data.table`, or a key column has a value changed by `:=` in `data.table`, its key and indices will either be dropped or reordered appropriately. Some `data.table` operations automatically add and store an index on a `data.table` for reuse in future queries, if `options(datatable.auto.index=TRUE)`, which is `TRUE` by default. `data.table`'s are also over-allocated, which means there are spare column pointer slots allocated in advance so that a `data.table` in the `.GlobalEnv` can have a column added to it truly by reference, like an in-memory database with multiple client sessions connecting to one server R process, as a `data.table` video has shown in the past. But because R and other packages don't maintain `data.table`'s attributes or over-allocation (e.g. 
a subset or reorder by R or another package will create invalid `data.table` attributes) `data.table` cannot use these attributes when it detects that base R or another package has touched the `data.table` in the meantime, even if the attributes may sometimes still be valid. So, please realize that, `DT()` on a `data.table` should realize better speed and memory usage than `DT()` on a `data.frame`. `DT()` on a `data.frame` may still be useful to use `data.table`'s syntax (e.g. sub-queries within group: `|> DT(i, .SD[sub-query], by=grp)`) without needing to convert to a `data.table` first. 23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. @@ -137,7 +135,166 @@ 24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. -25 `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. +25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. + +26. 
`base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. + +27. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. + + ```R + # Usage + shift(1:5, n=-1:1, type="cyclic") + # [[1]] + # [1] 2 3 4 5 1 + # + # [[2]] + # [1] 1 2 3 4 5 + # + # [[3]] + # [1] 5 1 2 3 4 + + # Benchmark + x = sample(1e9) # 3.7 GB + microbenchmark::microbenchmark( + shift(x, 1, type="cyclic"), + c(tail(x, 1), head(x,-1)), + times = 10L, + unit = "s" + ) + # Unit: seconds + # expr min lq mean median uq max neval + # shift(x, 1, type = "cyclic") 1.57 1.67 1.71 1.68 1.70 2.03 10 + # c(tail(x, 1), head(x, -1)) 6.96 7.16 7.49 7.32 7.64 8.60 10 + ``` + +28. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. + +29. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. + +30. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. 
+ + ```R + N = 1e7 + DT = data.table(x=sample(N), y=sample(1e6,N,TRUE)) + shift_no_opt = shift # different name not optimised as a way to compare + microbenchmark( + DT[, c(NA, head(x,-1)), y], + DT[, shift_no_opt(x, 1, type="lag"), y], + DT[, shift(x, 1, type="lag"), y], + times=10L, unit="s") + # Unit: seconds + # expr min lq mean median uq max neval + # DT[, c(NA, head(x, -1)), y] 8.7620 9.0240 9.1870 9.2800 9.3700 9.4110 10 + # DT[, shift_no_opt(x, 1, type = "lag"), y] 20.5500 20.9000 21.1600 21.3200 21.4400 21.5200 10 + # DT[, shift(x, 1, type = "lag"), y] 0.4865 0.5238 0.5463 0.5446 0.5725 0.5982 10 + ``` + + Example from [stackoverflow](https://stackoverflow.com/questions/35179911/shift-in-data-table-v1-9-6-is-slow-for-many-groups) + ```R + set.seed(1) + mg = data.table(expand.grid(year=2012:2016, id=1:1000), + value=rnorm(5000)) + microbenchmark(v1.9.4 = mg[, c(value[-1], NA), by=id], + v1.9.6 = mg[, shift_no_opt(value, n=1, type="lead"), by=id], + v1.14.4 = mg[, shift(value, n=1, type="lead"), by=id], + unit="ms") + # Unit: milliseconds + # expr min lq mean median uq max neval + # v1.9.4 3.6600 3.8250 4.4930 4.1720 4.9490 11.700 100 + # v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040 100 + # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 + ``` + +31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.` + + ```R + DT1 + # A B + # + # 1: 1 5 + # 2: 2 6 + + DT2 + # foo + # + # 1: 3 + # 2: 4 + + rbind(DT1, DT2, fill=TRUE) # no change + # A B foo + # + # 1: 1 5 NA + # 2: 2 6 NA + # 3: NA NA 3 + # 4: NA NA 4 + + rbind(DT1, DT2, fill=TRUE, use.names=FALSE) + + # was: + # A B foo + # + # 1: 1 5 NA + # 2: 2 6 NA + # 3: NA NA 3 + # 4: NA NA 4 + # Warning message: + # In rbindlist(l, use.names, fill, idcol) : + # use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE. 
+ + # now: + # A B + # + # 1: 1 5 + # 2: 2 6 + # 3: 3 NA + # 4: 4 NA + ``` + +32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. + +33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. + +34. `DT[, let(...)]` is a new alias for the functional form of `:=`; i.e. `DT[, ':='(...)]`, [#3795](https://github.com/Rdatatable/data.table/issues/3795). Thanks to Elio Campitelli for requesting, and Benjamin Schwendinger for the PR. + + ```R + DT = data.table(A=1:2) + DT[, let(B=3:4, C=letters[1:2])] + DT + # A B C + # + # 1: 1 3 a + # 2: 2 4 b + ``` + +35. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. + +36. `as.xts.data.table()` now supports non-numeric xts coredata matrixes, [#5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric-only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatibility and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now use `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. + +37.
`unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. + +38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. Thanks to @clerousset, @dcaseykc, @OfekShilon, and @SeanShao98 for testing dev and filing detailed bug reports which were fixed before release and their tests added to the test suite. + +39. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. + + ```R + DT + # V1 V2 + # + # 1: 3 5 + # 2: 4 6 + + DT[, sum(.SD), by=.I] + # I V1 + # + # 1: 1 8 + # 2: 2 10 + ``` + +40. New functions `yearmon()` and `yearqtr()` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. + +41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result.
If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. + +42. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. ## BUG FIXES @@ -145,7 +302,7 @@ 2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could incorrectly display an extra column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the bug report and @MichaelChirico for the PR. -3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR. +3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686), [#4029](https://github.com/Rdatatable/data.table/issues/4029). Thanks to @hongyuanjia and @michaelpaulhirsch for reporting, and Benjamin Schwendinger for the PR. 4. Passing `.SD` to `frankv()` with `ties.method='random'` or with `na.last=NA` failed with `.SD is locked`, [#4429](https://github.com/Rdatatable/data.table/issues/4429). Thanks @smarches for the report. @@ -177,7 +334,7 @@ 18. `as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). 
Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR. -19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. +19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. 20. `uniqueN(DT, by=character())` is now equivalent to `uniqueN(DT)` rather than internal error `'by' is either not integer or is length 0`, [#4594](https://github.com/Rdatatable/data.table/issues/4594). Thanks Marco Colombo for the report, and Michael Chirico for the PR. Similarly for `unique()`, `duplicated()` and `anyDuplicated()`. @@ -253,6 +410,35 @@ # no inconvenient warning ``` + On the same basis, `min` and `max` methods for empty `IDate` input now return `NA_integer_` of class `IDate`, rather than `NA_double_` of class `IDate` together with base R's warning `no non-missing arguments to min; returning Inf`, [#2256](https://github.com/Rdatatable/data.table/issues/2256). 
The type change and warning would cause an error in grouping, see example below. Since `NA` was returned before it seems clear that still returning `NA` but of the correct type and with no warning is appropriate, backwards compatible, and a bug fix. Thanks to Frank Narf for reporting, and Matt Dowle for fixing. + + ```R + DT + # d g + # + # 1: 2020-01-01 a + # 2: 2020-01-02 a + # 3: 2019-12-31 b + + DT[, min(d[d>"2020-01-01"]), by=g] + + # was: + + # Error in `[.data.table`(DT, , min(d[d > "2020-01-01"]), by = g) : + # Column 1 of result for group 2 is type 'double' but expecting type + # 'integer'. Column types must be consistent for each group. + # In addition: Warning message: + # In min.default(integer(0), na.rm = FALSE) : + # no non-missing arguments to min; returning Inf + + # now : + + # g V1 + # + # 1: a 2020-01-02 + # 2: b + ``` + 36. `DT[, min(int64Col), by=grp]` (and `max`) would return incorrect results for `bit64::integer64` columns, [#4444](https://github.com/Rdatatable/data.table/issues/4444). Thanks to @go-see for reporting, and Michael Chirico for the PR. 37. `fread(dec=',')` was able to guess `sep=','` and return an incorrect result, [#4483](https://github.com/Rdatatable/data.table/issues/4483). Thanks to Michael Chirico for reporting and fixing. It was already an error to provide both `sep=','` and `dec=','` manually. @@ -295,6 +481,86 @@ 39. `DT[i, sum(b), by=grp]` (and other optimized-by-group aggregates: `mean`, `var`, `sd`, `median`, `prod`, `min`, `max`, `first`, `last`, `head` and `tail`) could segfault if `i` contained row numbers and one or more were NA, [#1994](https://github.com/Rdatatable/data.table/issues/1994). Thanks to Arun Srinivasan for reporting, and Benjamin Schwendinger for the PR. +40. `identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)` is now TRUE, [#4461](https://github.com/Rdatatable/data.table/issues/4461). 
This is one of 13 numbers in the set of 100,000 between 0.80606 and 0.80607 in 0.0000000001 increments that were not already identical. In all 13 cases R's parser (same as `read.table`) and `fread` straddled the true value by a very similar small amount. `fread` now uses `/10^n` rather than `*10^-n` to match R identically in all cases. Thanks to Gabe Becker for requesting consistency, and Michael Chirico for the PR. + + ```R + for (i in 0:99999) { + s = sprintf("0.80606%05d", i) + r = eval(parse(text=s)) + f = fread(text=paste0("A\n",s,"\n"))$A + if (!identical(r, f)) + cat(s, sprintf("%1.18f", c(r, f, r)), "\n") + } + # input eval & read.table fread before fread now + # 0.8060603509 0.806060350899999944 0.806060350900000055 0.806060350899999944 + # 0.8060614740 0.806061473999999945 0.806061474000000056 0.806061473999999945 + # 0.8060623757 0.806062375699999945 0.806062375700000056 0.806062375699999945 + # 0.8060629084 0.806062908399999944 0.806062908400000055 0.806062908399999944 + # 0.8060632774 0.806063277399999945 0.806063277400000056 0.806063277399999945 + # 0.8060638101 0.806063810099999944 0.806063810100000055 0.806063810099999944 + # 0.8060647118 0.806064711799999944 0.806064711800000055 0.806064711799999944 + # 0.8060658349 0.806065834899999945 0.806065834900000056 0.806065834899999945 + # 0.8060667366 0.806066736599999945 0.806066736600000056 0.806066736599999945 + # 0.8060672693 0.806067269299999944 0.806067269300000055 0.806067269299999944 + # 0.8060676383 0.806067638299999945 0.806067638300000056 0.806067638299999945 + # 0.8060681710 0.806068170999999944 0.806068171000000055 0.806068170999999944 + # 0.8060690727 0.806069072699999944 0.806069072700000055 0.806069072699999944 + # + # remaining 99,987 of these 100,000 were already identical + ``` + +41. `dcast(empty-DT)` now returns an empty `data.table` rather than error `Cannot cast an empty data.table`, [#1215](https://github.com/Rdatatable/data.table/issues/1215). 
Thanks to Damian Betebenner for reporting, and Matt Dowle for fixing. + +42. `DT[factor("id")]` now works rather than error `i has evaluated to type integer. Expecting logical, integer or double`, [#1632](https://github.com/Rdatatable/data.table/issues/1632). `DT["id"]` has worked forever by automatically converting to `DT[.("id")]` for convenience, and joins have worked forever between char/fact, fact/char and fact/fact even when levels mismatch, so it was unfortunate that `DT[factor("id")]` managed to escape the simple automatic conversion to `DT[.(factor("id"))]` which is now in place. Thanks to @aushev for reporting, and Matt Dowle for the fix. + +43. All-NA character key columns could segfault, [#5070](https://github.com/Rdatatable/data.table/issues/5070). Thanks to @JorisChau for reporting and Benjamin Schwendinger for the fix. + +44. In v1.13.2 a version of an old bug was reintroduced where during a grouping operation list columns could retain a pointer to the last group. This affected only attributes of list elements and only if those were updated during the grouping operation, [#4963](https://github.com/Rdatatable/data.table/issues/4963). Thanks to @fujiaxiang for reporting and @avimallu and Václav Tlapák for investigating and the PR. + +45. `shift(xInt64, fill=0)` and `shift(xInt64, fill=as.integer64(0))` (but not `shift(xInt64, fill=0L)`) would error with `INTEGER() can only be applied to a 'integer', not a 'double'` where `xInt64` conveys `bit64::integer64`, `0` is type `double` and `0L` is type integer, [#4865](https://github.com/Rdatatable/data.table/issues/4865). Thanks to @peterlittlejohn for reporting and Benjamin Schwendinger for the PR. + +46. `DT[i, strCol:=classVal]` did not coerce using the `as.character` method for the class, resulting in either an unexpected string value or an error such as `To assign integer64 to a target of type character, please use as.character() for clarity`. 
Discovered during work on the previous issue, [#5189](https://github.com/Rdatatable/data.table/pull/5189). + + ```R + DT + # A + # + # 1: a + # 2: b + # 3: c + DT[2, A:=as.IDate("2021-02-03")] + DT[3, A:=bit64::as.integer64("4611686018427387906")] + DT + # A + # + # 1: a + # 2: 2021-02-03 # was 18661 + # 3: 4611686018427387906 # was error 'please use as.character' + ``` + +47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. + +48. `DT[, prod(int64Col), by=grp]` produced wrong results for `bit64::integer64` due to incorrect optimization, [#5225](https://github.com/Rdatatable/data.table/issues/5225). Thanks to Benjamin Schwendinger for reporting and fixing. + +49. `fintersect(..., all=TRUE)` and `fsetdiff(..., all=TRUE)` could return incorrect results when the inputs had columns named `x` and `y`, [#5255](https://github.com/Rdatatable/data.table/issues/5255). Thanks @Fpadt for the report, and @ben-schwen for the fix. + +50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix. + +51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. + +52. `DT[, c('z','x') := {x=NULL; list(2,NULL)}]` now removes column `x` as expected rather than incorrectly assigning `2` to `x` as well as `z`, [#5284](https://github.com/Rdatatable/data.table/issues/5284). 
The `x=NULL` is superfluous while the `list(2,NULL)` is the final value of `{}` whose items correspond to `c('z','x')`. Thanks @eutwt for the report, and @ben-schwen for the fix. + +53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance. + +54. `data.table(...)` unnamed arguments are deparsed in an attempt to name the columns but when called from `do.call()` the input data itself was deparsed taking a very long time, [#5501](https://github.com/Rdatatable/data.table/pull/5501). Many thanks to @OfekShilon for the report and fix, and @michaelchirico for guidance. Unnamed arguments to `data.table(...)` may now be faster in other cases not involving `do.call()` too; e.g. expressions spanning a lot of lines or other function call constructions that led to the data itself being deparsed. + + ```R + DF = data.frame(a=runif(1e6), b=runif(1e6)) + DT1 = data.table(DF) # 0.02s before and after + DT2 = do.call(data.table, list(DF)) # 3.07s before, 0.02s after + identical(DT1, DT2) # TRUE + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. 
For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -329,18 +595,95 @@ 12. `?merge` and `?setkey` have been updated to clarify that the row order is retained when `sort=FALSE`, and why `NA`s are always first when `sort=TRUE`, [#2574](https://github.com/Rdatatable/data.table/issues/2574) [#2594](https://github.com/Rdatatable/data.table/issues/2594). Thanks to Davor Josipovic and Markus Bonsch for the reports, and Jan Gorecki for the PR. -13. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. - - > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. - -14. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : ``` - The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation.
To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option. + The option 'datatable.nomatch' is being used and is not set to the default NA. This option + is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for + detailed information and motivation. To specify inner join, please specify `nomatch=NULL` + explicitly in your calls rather than changing the default using this option. ``` The message is now upgraded to warning that the option is now ignored. +14. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. + +15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). + + +# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) (17 Feb 2023) + +## NOTES + +1. Test 1613.605 now passes changes to `as.data.frame()` in R-devel, [#5597](https://github.com/Rdatatable/data.table/pull/5597). Thanks to Avraham Adler for reporting. + +2. An out of bounds read when combining non-equi join with `by=.EACHI` has been found and fixed thanks to clang ASAN, [#5598](https://github.com/Rdatatable/data.table/issues/5598). There was no bug or consequence because the read was followed (now preceded) by a bounds test. + +3. `.rbind.data.table` (note the leading `.`) is no longer exported when `data.table` is installed in R>=4.0.0 (Apr 2020), [#5600](https://github.com/Rdatatable/data.table/pull/5600). It was never documented which R-devel now detects and warns about. It is only needed by `data.table` internals to support R<4.0.0; see note 1 in v1.12.6 (Oct 2019) below in this file for more details. 
+ + +# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) + +## BUG FIXES + +1. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now highlights this issue on startup, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8 year old R 3.1.0 (April 2014) to 5 year old R 3.4.0 (April 2017). + +## NOTES + +1. Test 1962.098 has been modified to pass latest changes to `POSIXt` in R-devel. + +2. `test.data.table()` no longer creates `DT` in `.GlobalEnv`, a CRAN policy violation, [#5514](https://github.com/Rdatatable/data.table/issues/5514). No other writes occurred to `.GlobalEnv` and release procedures have been improved to prevent this happening again. + +3. The memory usage of the test suite has been halved, [#5507](https://github.com/Rdatatable/data.table/issues/5507). + + +# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) + +## NOTES + +1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked, [#5476](https://github.com/Rdatatable/data.table/pull/5476). Thanks to CRAN for testing latest versions of compilers. + +2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. + +3. 
Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN, [#5477](https://github.com/Rdatatable/data.table/pull/5477). + +4. `write.csv` in R-devel no longer responds to `getOption("digits.secs")` for `POSIXct`, [#5478](https://github.com/Rdatatable/data.table/issues/5478). This caused our tests of `fwrite(, dateTimeAs="write.csv")` to fail on CRAN's daily checks using latest daily R-devel. While R-devel discussion continues, and currently it seems like the change is intended with further changes possible, this `data.table` release massages our tests to pass on latest R-devel. The idea is to try to get out of the way of R-devel changes in this regard until the new behavior of `write.csv` is released and confirmed. Package updates are not accepted on CRAN if they do not pass the latest daily version of R-devel, even if R-devel changes after the package update is submitted. If the change to `write.csv()` stands, then a future release of `data.table` will be needed to make `fwrite(, dateTimeAs="write.csv")` match `write.csv()` output again in that future version of R onwards. If you use an older version of `data.table` than said future one in the said future version of R, then `fwrite(, dateTimeAs="write.csv")` may not match `write.csv()` if you are using `getOption("digits.secs")` too. However, you can always check that your installation of `data.table` works in your version of R on your platform by simply running `test.data.table()` yourself. Doing so would detect such a situation for you: test 1741 would fail in this case. `test.data.table()` runs the entire suite of tests and is always available to you locally. This way you do not need to rely on our statements about which combinations of versions of R and `data.table` on which platforms we have tested and support; just run `test.data.table()` yourself. 
Having said that, because test 1741 has been relaxed in this release in order to be accepted on CRAN to pass latest R-devel, this won't be true for this particular release in regard to this particular test. + + ```R + $ R --vanilla + R version 4.2.1 (2022-06-23) -- "Funny-Looking Kid" + > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) + > options(digits.secs=0) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45 + > options(digits.secs=3) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45.012 + + $ Rdevel --vanilla + R Under development (unstable) (2022-10-06 r83040) -- "Unsuffered Consequences" + > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) + > options(digits.secs=0) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45.012 + ``` + +5. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). + +6. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. + + > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. + + +# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) + +## NOTES + +1. 
clang 13.0.0 (Sep 2021) requires the system header `omp.h` to be included before R's headers, [#5122](https://github.com/Rdatatable/data.table/issues/5122). Many thanks to Prof Ripley for testing and providing a patch file. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) @@ -348,7 +691,7 @@ 1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. - At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. 
However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://www.rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. @@ -394,7 +737,7 @@ ## NOTES -1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). 
The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). +1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/behind-the-scenes-of-cran/). 2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. 
The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. @@ -1246,7 +1589,7 @@ has a better chance of working on Mac. ## NOTES -1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. +1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary version which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056).
This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. 2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. @@ -1303,7 +1646,7 @@ has a better chance of working on Mac. 4. `fread()` has always accepted literal data; e.g. `fread("A,B\n1,2\n3,4")`. It now gains explicit `text=`; e.g. `fread(text="A,B\n1,2\n3,4")`. Unlike the first general purpose `input=` argument, the `text=` argument accepts multi-line input; e.g. `fread(text=c("A,B","1,2","3,4"))`, [#1423](https://github.com/Rdatatable/data.table/issues/1423). Thanks to Douglas Clark for the request and Hugh Parsonage for the PR. -5. `fread()` has always accepted system commands; e.g. `fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app in not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. 
If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvald's advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://www.datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. +5. `fread()` has always accepted system commands; e.g. `fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. 
Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app is not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvalds's advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or APIs that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings.
Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. 6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. Current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x` and is more akin to a bug fix due to the inconsistency with `data.table()`. diff --git a/R/IDateTime.R b/R/IDateTime.R index 42a6b289a..4e6adf55e 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -71,6 +71,11 @@ unique.IDate = x } +# define min and max to avoid base R's Inf with warning on empty, #2256 +min.IDate = max.IDate = function(x, ...) { + as.IDate(if (!length(x)) NA else NextMethod()) +} + # fix for #1315 as.list.IDate = function(x, ...) 
NextMethod() @@ -304,7 +309,7 @@ clip_msec = function(secs, action) { stopf("Valid options for ms are 'truncate', 'nearest', and 'ceil'.") ) } - + ################################################################### # Date - time extraction functions # Adapted from Hadley Wickham's routines cited below to ensure @@ -333,10 +338,10 @@ hour = function(x) { if (inherits(x, 'ITime')) return(as.integer(x) %/% 3600L %% 24L) as.POSIXlt(x)$hour } -yday = function(x) as.POSIXlt(x)$yday + 1L -wday = function(x) (unclass(as.IDate(x)) + 4L) %% 7L + 1L -mday = function(x) as.POSIXlt(x)$mday -week = function(x) yday(x) %/% 7L + 1L +yday = function(x) convertDate(as.IDate(x), "yday") +wday = function(x) convertDate(as.IDate(x), "wday") +mday = function(x) convertDate(as.IDate(x), "mday") +week = function(x) convertDate(as.IDate(x), "week") isoweek = function(x) { # ISO 8601-conformant week, as described at # https://en.wikipedia.org/wiki/ISO_week_date @@ -351,7 +356,13 @@ isoweek = function(x) { 1L + (nearest_thurs - year_start) %/% 7L } -month = function(x) as.POSIXlt(x)$mon + 1L -quarter = function(x) as.POSIXlt(x)$mon %/% 3L + 1L -year = function(x) as.POSIXlt(x)$year + 1900L +month = function(x) convertDate(as.IDate(x), "month") +quarter = function(x) convertDate(as.IDate(x), "quarter") +year = function(x) convertDate(as.IDate(x), "year") +yearmon = function(x) convertDate(as.IDate(x), "yearmon") +yearqtr = function(x) convertDate(as.IDate(x), "yearqtr") +convertDate = function(x, type) { + type = match.arg(type, c("yday", "wday", "mday", "week", "month", "quarter", "year", "yearmon", "yearqtr")) + .Call(CconvertDate, x, type) +} diff --git a/R/data.table.R b/R/data.table.R index d70e67761..473cf6e76 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -221,9 +221,12 @@ replace_dot_alias = function(e) { # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. 
This does not control cases where the roll is mult, that is always the last one. .unsafe.opt() #3585 missingnomatch = missing(nomatch) - nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353 - if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857 - if (!(is.null(nomatch) || (length(nomatch)==1L && is.na(nomatch)))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") + nomatch0 = identical(nomatch,0) || identical(nomatch,0L) || identical(nomatch, FALSE) # for warning with row-numbers in i; #4353 + if (nomatch0) nomatch=NULL # retain nomatch=0|FALSE backwards compatibility, #857 #5214 + if (!is.null(nomatch)) { + if (!(length(nomatch)==1L && is.na(nomatch))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") + nomatch=NA # convert NA_character_ to NA-logical, PR#5216 + } if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") @@ -307,7 +310,9 @@ replace_dot_alias = function(e) { as.character(jsub[[1L]])[1L] } else "" } - if (root == ":=") { + if (root == ":=" || root == "let") { # let(...) as alias for :=(...) (#3795) + if (root == "let") + jsub[[1L]] = as.symbol(":=") allow.cartesian=TRUE # (see #800) if (!missing(i) && keyby) stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. 
Either change keyby to by or remove i") @@ -436,7 +441,7 @@ replace_dot_alias = function(e) { } } if (is.null(i)) return( null.data.table() ) - if (is.character(i)) { + if (is.character(i) || is.factor(i)) { isnull_inames = TRUE i = data.table(V1=i) # for user convenience; e.g. DT["foo"] without needing DT[.("foo")] } else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) { i = as.data.table(i[[1L]]) } @@ -761,6 +766,12 @@ replace_dot_alias = function(e) { # may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through bysubl = as.list.default(bysub) bysuborig = bysub + if (".I" %in% bysubl) { #1732 + if (!is.symbol(bysub) && (length(bysubl)!=2L || !is.symbol(bysubl[[2L]]) || !(bysubl[[1L]] %chin% c(".","c","list")))) + stopf("'by' contains .I but only the following are currently supported: by=.I, by=.(.I), by=c(.I), by=list(.I)") + bysub = if (is.null(irows)) seq_len(nrow(x)) else irows + bysuborig = as.symbol("I") + } if (is.name(bysub) && !(bysub %chin% names_x)) { # TO DO: names(x),names(i),and i. and x. prefixes bysub = eval(bysub, parent.frame(), parent.frame()) # fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow @@ -1104,7 +1115,7 @@ replace_dot_alias = function(e) { if (is.null(names(jsub))) { # regular LHS:=RHS usage, or `:=`(...) with no named arguments (an error) # `:=`(LHS,RHS) is valid though, but more because can't see how to detect that, than desire - if (length(jsub)!=3L) stopf("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") + if (length(jsub)!=3L) stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named.", if (root == "let") "let" else "`:=`") lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { @@ -1116,7 +1127,7 @@ replace_dot_alias = function(e) { } else { # `:=`(c2=1L,c3=2L,...) 
lhs = names(jsub)[-1L] - if (any(lhs=="")) stopf("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") + if (any(lhs=="")) stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named.", if (root == "let") "let" else "`:=`") names(jsub)="" jsub[[1L]]=as.name("list") } @@ -1298,18 +1309,8 @@ replace_dot_alias = function(e) { # But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT # to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally. # That R level branch was taken out in PR #3213 - - # TO DO: use CsubsetDT twice here and then remove this entire R level branch - for (s in seq_along(icols)) { - target = icolsAns[s] - source = icols[s] - ans[[target]] = .Call(CsubsetVector,i[[source]],ii) # i.e. i[[source]][ii] - } - for (s in seq_along(xcols)) { - target = xcolsAns[s] - source = xcols[s] - ans[[target]] = .Call(CsubsetVector,x[[source]],irows) # i.e. x[[source]][irows], but guaranteed new memory even for singleton logicals from R 3.1.0 - } + ans[icolsAns] = .Call(CsubsetDT, i, ii, icols) + ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols) setattr(ans, "names", ansvars) if (haskey(x)) { keylen = which.first(!key(x) %chin% ansvars)-1L @@ -1376,7 +1377,7 @@ replace_dot_alias = function(e) { } else if (address(jval) == address(SDenv$.SD)) { jval = copy(jval) } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { - for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) + for (jidx in jcpy) { if(!is.null(jval[[jidx]])) jval[[jidx]] = copy(jval[[jidx]]) } } else if (jsub %iscall% 'get') { jval = copy(jval) # fix for #1212 } @@ -1726,13 +1727,15 @@ replace_dot_alias = function(e) { dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? 
# FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with # nomatch=NULL even now.. but not switching it on yet, will deal it separately. - if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { + if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { if (!length(ansvars) && !use.I) { GForce = FALSE - if ( (is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N") ) { + if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { GForce = TRUE if (verbose) catf("GForce optimized j to '%s'\n",deparse(jsub, width.cutoff=200L, nlines=1L)) } + } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N + GForce = FALSE } else { # Apply GForce .gforce_ok = function(q) { @@ -1743,9 +1746,12 @@ replace_dot_alias = function(e) { if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) # ^^ base::startWith errors on NULL unfortunately - # head-tail uses default value n=6 which as of now should not go gforce ... 
^^ - # otherwise there must be three arguments, and only in two cases: - # 1) head/tail(x, 1) or 2) x[n], n>0 + if (length(q)>=2L && q[[1L]] == "shift") { + q_named = match.call(shift, q) + if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) + } + if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE) #3977 + # otherwise there must be three arguments length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) } @@ -1760,13 +1766,13 @@ replace_dot_alias = function(e) { for (ii in seq_along(jsub)[-1L]) { if (dotN(jsub[[ii]])) next; # For #334 jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) - if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 + if (length(jsub[[ii]])>=3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars)) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + if (length(jsub)>=3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); @@ -1845,6 +1851,17 @@ replace_dot_alias = function(e) { gi = if (length(o__)) o__[f__] else f__ g = lapply(grpcols, function(i) groups[[i]][gi]) + # returns all rows instead of one per group + nrow_funs = c("gshift") + .is_nrows = function(q) { + if (!is.call(q)) return(FALSE) + if (q[[1L]] == "list") { + any(vapply(q, 
.is_nrows, FALSE)) + } else { + q[[1L]] %chin% nrow_funs + } + } + # adding ghead/gtail(n) support for n > 1 #5060 #523 q3 = 0 if (!is.symbol(jsub)) { @@ -1862,6 +1879,8 @@ replace_dot_alias = function(e) { if (q3 > 0) { grplens = pmin.int(q3, len__) g = lapply(g, rep.int, times=grplens) + } else if (.is_nrows(jsub)) { + g = lapply(g, rep.int, times=len__) } ans = c(g, ans) } else { @@ -1885,6 +1904,16 @@ replace_dot_alias = function(e) { # Grouping by by: i is by val, icols NULL, o__ may be subset of x, f__ points to o__ (or x if !length o__) # TO DO: setkey could mark the key whether it is unique or not. if (!is.null(lhs)) { + if (GForce) { # GForce should work with := #1414 + vlen = length(ans[[1L]]) + # replicate vals if GForce returns 1 value per group + jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep, times=len__) else tail(ans, -length(g)) # see comment in #4245 for why rep instead of rep.int + jrows = vecseq(f__,len__,NULL) + if (length(o__)) jrows = o__[jrows] + if (length(irows)) jrows = irows[jrows] + if (length(jvals)==1L) jvals = jvals[[1L]] # unwrap single column jvals for assign + .Call(Cassign, x, jrows, lhs, newnames, jvals) + } if (any(names_x[cols] %chin% key(x))) setkey(x,NULL) # fixes #1479. Take care of secondary indices, TODO: cleaner way of doing this @@ -1941,7 +1970,11 @@ DT = function(x, ...) { #4872 options(datatable.optimize=2L) # GForce still on; building and storing indices in .prepareFastSubset off; see long paragraph in news item 22 of v1.14.2 } - ans = `[.data.table`(x, ...) 
+ fun = match.call() + fun[[1L]] = as.name("[.data.table") # hence now exporting [.data.table method otherwise R CMD check can't find it in tests 2212.* + ans = eval(fun, envir=parent.frame(), # for issue 2 in #5129 so that eval(.massagei(isub), x, ienv) finds objects in calling + # env, and likely other places inside [.data.table that look at the calling env + enclos=parent.frame()) # including enclos= too as it has often been needed in the past options(datatable.optimize=old) .global$print = "" # functional form should always print; #5106 ans @@ -2175,15 +2208,9 @@ tail.data.table = function(x, n=6L, ...) { set(x,j=name,value=value) # important i is missing here } -as.data.frame.data.table = function(x, ...) +as.data.frame.data.table = function(x, row.names = NULL, ...) { - ans = copy(x) - setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names - setattr(ans,"class","data.frame") - setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted - setattr(ans,"index",NULL) #4889 #5042 - setattr(ans,".internal.selfref",NULL) - # leave tl intact, no harm, + ans = setDF(copy(x), rownames = row.names) # issue #5319 ans } @@ -2523,8 +2550,10 @@ copy = function(x) { } shallow = function(x, cols=NULL) { - if (!is.data.frame(x)) + if (!is.data.frame(x) && !is.data.table(x)) { + # ^^ some revdeps do class(x)="data.table" without inheriting from data.frame, PR#5210 stopf("x is not a data.table|frame. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table|frame") + } ans = .shallow(x, cols=cols, retain.key=selfrefok(x)) # selfrefok for #5042 ans } @@ -2746,9 +2775,11 @@ address = function(x) .Call(Caddress, eval(substitute(x), parent.frame())) ":=" = function(...) 
{ # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227 - stopf('Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").') + stopf('Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) are defined for use in j, once only and in particular ways. See help(":=").') } +let = function(...) `:=`(...) + setDF = function(x, rownames=NULL) { if (!is.list(x)) stopf("setDF only accepts data.table, data.frame or list of equal length as input") if (anyDuplicated(rownames)) stopf("rownames contains duplicates") @@ -2961,7 +2992,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { # (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function # (3) define the gfun = function() R wrapper gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", - "median", "min", "max", "var", "sd", ".N") # added .N for #334 + "median", "min", "max", "var", "sd", ".N", "shift", "weighted.mean") # added .N for #334 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. 
ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment @@ -2969,12 +3000,28 @@ gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) +gweighted.mean = function(x, w, na.rm=FALSE) { + if (missing(w)) gmean(x, na.rm) + else { + if (na.rm) { # take those indices out of the equation by setting them to 0 + ix <- is.na(x) + x[ix] <- 0 + w[ix] <- 0 + } + gsum((w!=0)*x*w, na.rm=FALSE)/gsum(w, na.rm=FALSE) + } +} gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) gmax = function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm) gvar = function(x, na.rm=FALSE) .Call(Cgvar, x, na.rm) gsd = function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm) +gshift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift", "cyclic")) { + type = match.arg(type) + stopifnot(is.numeric(n)) + .Call(Cgshift, x, as.integer(n), fill, type) +} gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows) .prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){ diff --git a/R/devel.R b/R/devel.R index 9461633ec..8bd7a1466 100644 --- a/R/devel.R +++ b/R/devel.R @@ -17,7 +17,7 @@ dcf.repo = function(pkg, repo, field, type) { dcf[dcf[,"Package"]==pkg, field][[1L]] } -update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { +update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) 
{ # this works for any package, not just data.table pkg = object # perform package upgrade when new Revision present @@ -32,7 +32,7 @@ update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i pkg, field, contrib.url(repo, type=type)) # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) - # update.dev.pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 + # update_dev_pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 on.exit({ if (upg) { unloadNamespace(pkg) ## hopefully will release dll lock on Windows diff --git a/R/duplicated.R b/R/duplicated.R index 4fc7c8d16..901d6e3c0 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -23,7 +23,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ res } -unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { +unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), cols=NULL, ...) 
{ if (!cedta()) return(NextMethod("unique")) # nocov if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") @@ -31,6 +31,9 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (nrow(x) <= 1L) return(x) if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) + if (!is.null(cols)) { + x = .shallow(x, c(by, cols), retain.key=TRUE) + } # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't # as efficient as forderv returning empty o when input is already ordered diff --git a/R/fcast.R b/R/fcast.R index 465ff665d..efe18cf72 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -151,7 +151,6 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys? dat = .Call(CsubsetDT, dat, idx, seq_along(dat)) } - if (!nrow(dat) || !ncol(dat)) stopf("Can not cast an empty data.table") fun.call = m[["fun.aggregate"]] fill.default = NULL if (is.null(fun.call)) { diff --git a/R/fdroplevels.R b/R/fdroplevels.R new file mode 100644 index 000000000..c7025dda0 --- /dev/null +++ b/R/fdroplevels.R @@ -0,0 +1,26 @@ +# 647 fast droplevels.data.table method +fdroplevels = function(x, exclude = if (anyNA(levels(x))) NULL else NA, ...) 
{ + stopifnot(inherits(x, "factor")) + lev = which(tabulate(x, length(levels(x))) & (!match(levels(x), exclude, 0L))) + ans = match(as.integer(x), lev) + setattr(ans, 'levels', levels(x)[lev]) + setattr(ans, 'class', class(x)) + return(ans) +} + +droplevels.data.table = function(x, except = NULL, exclude, in.place = FALSE, ...){ + stopifnot(is.logical(in.place)) + if (nrow(x)==0L) return(x) + ix = vapply(x, is.factor, NA) + if(!is.null(except)){ + stopifnot(is.numeric(except), except <= length(x)) + ix[except] = FALSE + } + if(!sum(ix)) return(x) + if(!in.place) x = copy(x) + for(nx in names(ix)[ix==TRUE]){ + if (missing(exclude)) set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]])) + else set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]], exclude = exclude)) + } + return(x) +} diff --git a/R/fread.R b/R/fread.R index 12f46b57e..f8b025d9c 100644 --- a/R/fread.R +++ b/R/fread.R @@ -98,10 +98,30 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") warningf("File '%s' has size 0. Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame') return(if (data.table) data.table(NULL) else data.frame(NULL)) } - if (w <- endsWithAny(file, c(".gz",".bz2"))) { + + # support zip and tar files #3834 + zip_signature = charToRaw("PK\x03\x04") + file_signature = readBin(file, raw(), 8L) + + if ((w <- endsWithAny(file, c(".zip", ".tar"))) || identical(head(file_signature, 4L), zip_signature)) { + FUN = if (w==2L) untar else unzip + fnames = FUN(file, list=TRUE) + if (is.data.frame(fnames)) fnames = fnames[,1L] + if (length(fnames) > 1L) + stopf("Compressed files containing more than 1 file are currently not supported.") + FUN(file, exdir=tmpdir) + decompFile = file.path(tmpdir, fnames) + file = decompFile + on.exit(unlink(decompFile), add=TRUE) + } + + gz_signature = as.raw(c(0x1F, 0x8B)) + bz2_signature = as.raw(c(0x42, 0x5A, 0x68)) + gzsig = FALSE + if ((w <- endsWithAny(file, c(".gz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), 
gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) { if (!requireNamespace("R.utils", quietly = TRUE)) stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (w==1L) gzfile else bzfile + FUN = if (w==1L || gzsig) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) diff --git a/R/groupingsets.R b/R/groupingsets.R index 4c25b5b65..96940497c 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -73,6 +73,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stopf("Expression passed to grouping sets function must not update by reference. Use ':=' on results of your grouping function.") if (missing(.SDcols)) .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL + if (length(names(by))) by = unname(by) # 0 rows template data.table to keep colorder and type empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field diff --git a/R/merge.R b/R/merge.R index fb0666d5e..cbc9b9e29 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,5 +1,5 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, - all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) { + all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) 
{ if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) @@ -14,7 +14,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL x0 = length(x)==0L y0 = length(y)==0L if (x0 || y0) { - if (x0 && y0) + if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) warningf("Input data.table '%s' has no columns.", "x") @@ -43,9 +43,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } else { if (is.null(by)) by = intersect(key(x), key(y)) - if (is.null(by)) + if (!length(by)) # was is.null() before PR#5183 changed to !length() by = key(x) - if (is.null(by)) + if (!length(by)) by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) stopf("A non-empty vector of column names for `by` is required.") @@ -54,6 +54,15 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = unname(by) by.x = by.y = by } + + # warn about unused arguments #2587 + if (length(list(...))) { + ell = as.list(substitute(list(...)))[-1L] + for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) + unnamed_n = length(ell) - sum(names(ell) != "") + if (unnamed_n) + warningf("Passed %d unknown and unnamed arguments.", unnamed_n) + } # with i. prefix in v1.9.3, this goes away. Left here for now ... 
## sidestep the auto-increment column number feature-leading-to-bug by ## ensuring no names end in ".1", see unit test @@ -72,22 +81,23 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } + # implement incomparables argument #2587 + if (!is.null(incomparables)) { + # %fin% to be replaced when #5232 is implemented/closed + "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table + xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) + yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) + # subset both so later steps still work + x = x[xind] + y = y[yind] + } dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) { - yy = y[missingyidx] - othercolsx = setdiff(nm_x, by) - if (length(othercolsx)) { - tmp = rep.int(NA_integer_, length(missingyidx)) - # TO DO: use set() here instead.. - yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) - } - # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist - # takes care of #24 without having to save names. This is how it should be, IMHO. - dt = rbind(dt, yy, use.names=FALSE) + dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE) } } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. 
diff --git a/R/notin.R b/R/notin.R new file mode 100644 index 000000000..ba5cef502 --- /dev/null +++ b/R/notin.R @@ -0,0 +1,7 @@ +"%notin%" = function(x, table) { + if (is.character(x) && is.character(table)) { + .Call(Cnotchin, x, table) + } else { + match(x, table, nomatch = 0L) == 0L + } +} diff --git a/R/onAttach.R b/R/onAttach.R index 554d2599d..6ff17972b 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -27,7 +27,7 @@ if (gettext("TRANSLATION CHECK") != "TRANSLATION CHECK") packageStartupMessagef("**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) - packageStartupMessagef("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") + packageStartupMessagef("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update_dev_pkg()\n**********") if (!.Call(ChasOpenMP)) { packageStartupMessagef("**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) if (Sys.info()["sysname"] == "Darwin") @@ -35,6 +35,10 @@ else packageStartupMessagef("This is %s. This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. 
If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) } + if (.Call(CbeforeR340)) { + # not base::getRversion()<"3.4.0" in case the user upgrades R but does not reinstall data.table; a reasonable mistake since data.table would seem to be the latest version + packageStartupMessagef("**********\nThis data.table installation was compiled for R < 3.4.0 (Apr 2017) and is known to leak memory. Please upgrade R and reinstall data.table to fix the leak. Maintaining and testing code branches to support very old versions increases development time so please do upgrade R. We intend to bump data.table's dependency from 8 year old R 3.1.0 (Apr 2014) to 5 year old R 3.4.0 (Apr 2017).\n**********") + } } } diff --git a/R/onLoad.R b/R/onLoad.R index 1ee328e99..b4ebeafdf 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -73,10 +73,10 @@ "datatable.optimize"="Inf", # datatable. "datatable.print.nrows"="100L", # datatable. "datatable.print.topn"="5L", # datatable. - "datatable.print.class"="FALSE", # for print.data.table + "datatable.print.class"="TRUE", # for print.data.table "datatable.print.rownames"="TRUE", # for print.data.table "datatable.print.colnames"="'auto'", # for print.data.table - "datatable.print.keys"="FALSE", # for print.data.table + "datatable.print.keys"="TRUE", # for print.data.table "datatable.print.trunc.cols"="FALSE", # for print.data.table "datatable.allow.cartesian"="FALSE", # datatable. "datatable.dfdispatchwarn"="TRUE", # not a function argument diff --git a/R/print.data.table.R b/R/print.data.table.R index 023551074..16950fd11 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -180,10 +180,20 @@ format_list_item = function(x, ...) { UseMethod("format_list_item") } +has_format_method = function(x) { + f = function(y) !is.null(getS3method("format", class=y, optional=TRUE)) + any(sapply(class(x), f)) +} + format_col.default = function(x, ...) 
{ - if (!is.null(dim(x))) return("") - if (is.list(x)) return(vapply_1c(x, format_list_item, ...)) - format(char.trunc(x), ...) # relevant to #37 + if (!is.null(dim(x))) + "" + else if (has_format_method(x) && length(formatted<-format(x, ...))==length(x)) + formatted #PR5224 motivated by package sf where column class is c("sfc_MULTIPOLYGON","sfc") and sf:::format.sfc exists + else if (is.list(x)) + vapply_1c(x, format_list_item, ...) + else + format(char.trunc(x), ...) # relevant to #37 } # #2842 -- different columns can have different tzone, so force usage in output @@ -206,14 +216,19 @@ format_list_item.default = function(x, ...) { if (is.null(x)) # NULL item in a list column "" else if (is.atomic(x) || inherits(x, "formula")) # FR #2591 - format.data.table issue with columns of class "formula" - paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... - else + paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... + else if (has_format_method(x) && length(formatted<-format(x, ...))==1L) { + # the column's class does not have a format method (otherwise it would have been used by format_col and this + # format_list_item would not be reached) but this particular list item does have a format method so use it + formatted + } else { paste0("<", class(x)[1L], paste_dims(x), ">") + } } # FR #1091 for pretty printing of character # TODO: maybe instead of doing "this is...", we could do "this ... test"? 
-char.trunc <- function(x, trunc.char = getOption("datatable.prettyprint.char")) { +char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) idx = which(nchar(x) > trunc.char) diff --git a/R/setops.R b/R/setops.R index 042d0c5f9..1034b0f0f 100644 --- a/R/setops.R +++ b/R/setops.R @@ -59,8 +59,9 @@ fintersect = function(x, y, all=FALSE) { .set_ops_arg_check(x, y, all, .seqn = TRUE) if (!nrow(x) || !nrow(y)) return(x[0L]) if (all) { - x = shallow(x)[, ".seqn" := rowidv(x)] - y = shallow(y)[, ".seqn" := rowidv(y)] + .seqn_id = NULL # to avoid 'no visible binding for global variable' note from R CMD check + x = shallow(x)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=x)] + y = shallow(y)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=y)] jn.on = c(".seqn",setdiff(names(y),".seqn")) # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) y[x, .SD, .SDcols=setdiff(names(y),".seqn"), nomatch=NULL, on=jn.on] @@ -75,8 +76,9 @@ fsetdiff = function(x, y, all=FALSE) { if (!nrow(x)) return(x) if (!nrow(y)) return(if (!all) funique(x) else x) if (all) { - x = shallow(x)[, ".seqn" := rowidv(x)] - y = shallow(y)[, ".seqn" := rowidv(y)] + .seqn_id = NULL # to avoid 'no visible binding for global variable' note from R CMD check + x = shallow(x)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=x)] + y = shallow(y)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=y)] jn.on = c(".seqn",setdiff(names(x),".seqn")) x[!y, .SD, .SDcols=setdiff(names(x),".seqn"), on=jn.on] } else { diff --git a/R/shift.R b/R/shift.R index c73d8b084..064eea20c 100644 --- a/R/shift.R +++ b/R/shift.R @@ -1,5 +1,7 @@ -shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FALSE) { +shift = function(x, n=1L, fill, type=c("lag", "lead", "shift", "cyclic"), give.names=FALSE) { type = 
match.arg(type) + if (type == "cyclic" && !missing(fill)) warning("Provided argument fill=", fill, " will be ignored since type='shift'.") + if (missing(fill)) fill = NA stopifnot(is.numeric(n)) ans = .Call(Cshift, x, as.integer(n), fill, type) if (give.names && is.list(ans)) { @@ -9,7 +11,7 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA else nx = paste0("V", if (is.atomic(x)) 1L else seq_along(x)) } else nx = names(x) - if (type!="shift") { + if (!(type %chin% c("shift", "cyclic"))) { # flip type for negative n, #3223 neg = (n<0L) if (type=="lead" && length(unique(sign(n))) == 3L) neg[ n==0L ] = TRUE # lead_0 should be named lag_0 for consistency (if mixing signs of n, #3832) diff --git a/R/tables.R b/R/tables.R index 99c59f0c4..e47a1a42e 100644 --- a/R/tables.R +++ b/R/tables.R @@ -1,42 +1,62 @@ # globals to pass NOTE from R CMD check, see http://stackoverflow.com/questions/9439256 -MB = NCOL = NROW = NULL +MB = NCOL = NROW = INDICES = NULL -tables = function(mb=TRUE, order.col="NAME", width=80, - env=parent.frame(), silent=FALSE, index=FALSE) +type_size = function(DT) { + # for speed and ram efficiency, a lower bound by not descending into character string lengths or list items + # if a more accurate and higher estimate is needed then user can pass object.size or alternative to mb= + # in case number of columns is very large (e.g. 
1e6 columns) then we use a for() to avoid allocation of sapply() + ans = 0L + lookup = c("raw"=1L, "integer"=4L, "double"=8L, "complex"=16L) + for (i in seq_along(DT)) { + col = DT[[i]] + tt = lookup[storage.mode(col)] + if (is.na(tt)) tt = .Machine$sizeof.pointer + tt = tt*nrow(DT) + if (is.factor(col)) tt = tt + length(levels(col))*.Machine$sizeof.pointer + ans = ans + tt + } + ans + ncol(DT)*.Machine$sizeof.pointer # column name pointers +} + +tables = function(mb=type_size, order.col="NAME", width=80, + env=parent.frame(), silent=FALSE, index=FALSE) { # Prints name, size and colnames of all data.tables in the calling environment by default - all_obj = objects(envir=env, all.names=TRUE) - is_DT = which(vapply_1b(all_obj, function(x) is.data.table(get(x, envir=env)))) - if (!length(is_DT)) { + mb_name = as.character(substitute(mb)) + if (isTRUE(mb)) { mb=type_size; mb_name="type_size" } + names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .) + obj = mget(names, envir=env) # doesn't copy; mget is ok with ... 
unlike get, #5197 + w = which(vapply_1b(obj, is.data.table)) + if (!length(w)) { if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } - DT_names = all_obj[is_DT] - info = rbindlist(lapply(DT_names, function(dt_n){ - DT = get(dt_n, envir=env) # doesn't copy - data.table( # data.table excludes any NULL items (MB and INDICES optional) unlike list() - NAME = dt_n, - NROW = nrow(DT), - NCOL = ncol(DT), - MB = if (mb) round(as.numeric(object.size(DT))/1024^2), # object.size() is slow hence optional; TODO revisit - COLS = list(names(DT)), - KEY = list(key(DT)), - INDICES = if (index) list(indices(DT))) - })) + info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0, COLS=list(), KEY=list(), INDICES=list()) + for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables + DT = obj[[w[i]]] + set(info, i, "NROW", nrow(DT)) + set(info, i, "NCOL", ncol(DT)) + if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1024^2)) + if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done + if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt) + if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt) + } + if (!is.function(mb)) info[,MB:=NULL] + if (!index) info[,INDICES:=NULL] if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names if (!silent) { - # prettier printing on console - pretty_format = function(x, width) { - format(prettyNum(x, big.mark=","), - width=width, justify="right") - } - tt = copy(info) - tt[ , NROW := pretty_format(NROW, width=4L)] - tt[ , NCOL := pretty_format(NCOL, width=4L)] - if (mb) tt[ , MB := pretty_format(MB, width=2L)] - print(tt, class=FALSE, nrows=Inf) - if (mb) catf("Total: %sMB\n", prettyNum(sum(info$MB), 
big.mark=",")) + # add commas into NROW, NCOL and MB when displayed on console + # but this added all these numbers as strings to the character cache which causes the character cache to + # grow especially with a lot of tables, or changing tables over time. Stopped for now to avoid a tipping + # point in RSS in #5520 + # pretty_format = function(x, width) format(prettyNum(x, big.mark=","), width=width, justify="right") + # tt = shallow(info) + # tt[ , NROW := pretty_format(NROW, width=4L)] + # tt[ , NCOL := pretty_format(NCOL, width=4L)] + # if (is.function(mb)) tt[ , MB := pretty_format(MB, width=2L)] + print(info, class=FALSE, nrows=Inf) + if (is.function(mb)) catf("Total: %sMB using %s\n", prettyNum(sum(info$MB), big.mark=","), mb_name) } invisible(info) } diff --git a/R/test.data.table.R b/R/test.data.table.R index b64dfe119..6428bcc72 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,8 +1,19 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent) { +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, + memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) + memtest = as.integer(memtest) + stopifnot(length(memtest)==1L, memtest %in% 0:2) + memtest.id = as.integer(memtest.id) + if (length(memtest.id)) { + if (length(memtest.id)==1L) memtest.id = rep(memtest.id, 2L) # for convenience of supplying one id rather than always a range + stopifnot(length(memtest.id)<=2L, # conditions quoted to user when false so "<=2L" even though following conditions rely on ==2L + !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L]) + if (memtest==0L) memtest=1L # using memtest.id implies memtest + } if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # package developer # nocov start + dev = TRUE if ("package:data.table" 
%chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") subdir = file.path("inst","tests") @@ -10,6 +21,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov end } else { # i) R CMD check and ii) user running test.data.table() + dev = FALSE rootdir = getNamespaceInfo("data.table","path") subdir = "tests" env = new.env(parent=parent.env(.GlobalEnv)) # when user runs test.data.table() we don't want their variables in .GlobalEnv affecting tests, #3705 @@ -28,7 +40,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F return(sapply(scripts, function(fn) { err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) cat("\n"); - identical(err, TRUE) + isTRUE(err) })) # nocov end } @@ -81,7 +93,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F scipen = 0L, # fwrite now respects scipen datatable.optimize = Inf, datatable.alloccol = 1024L, - datatable.print.class = FALSE, # this is TRUE in cc.R and we like TRUE. 
But output= tests need to be updated (they assume FALSE currently) + datatable.print.class = FALSE, # output= tests were written when default was FALSE + datatable.print.keys = FALSE, # output= tests were written when default was FALSE datatable.print.trunc.cols = FALSE, #4552 datatable.rbindlist.check = NULL, datatable.integer64 = "integer64", @@ -111,12 +124,20 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("whichfail", NULL, envir=env) assign("started.at", proc.time(), envir=env) assign("lasttime", proc.time()[3L], envir=env) # used by test() to attribute time inbetween tests to the next test - assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L ), envir=env) # test timings aggregated to integer id - assign("memtest", as.logical(Sys.getenv("TEST_DATA_TABLE_MEMTEST", "FALSE")), envir=env) + assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L, RSS=0.0 ), envir=env) # test timings aggregated to integer id + assign("memtest", memtest, envir=env) + assign("memtest.id", memtest.id, envir=env) assign("filename", fn, envir=env) - assign("inittime", as.integer(Sys.time()), envir=env) # keep measures from various test.data.table runs assign("showProgress", showProgress, envir=env) + owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190 + on.exit(setwd(owd)) + + if (memtest) { + catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest) + if (is.na(rss())) stopf("memtest intended for Linux. 
Step through data.table:::rss() to see what went wrong.") + } + err = try(sys.source(fn, envir=env), silent=silent) options(oldOptions) @@ -152,7 +173,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (inherits(err,"try-error")) { # nocov start if (silent) return(FALSE) - stopf("Failed after test %s before the next test() call in %s", env$prevtest, fn) + stopf("Failed in %s after test %s before the next test() call in %s", timetaken(env$started.at), env$prevtest, fn) # the try() above with silent=FALSE will have already printed the error itself # nocov end } @@ -162,50 +183,40 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (nfail > 0L) { # nocov start stopf( - "%d error(s) out of %d. Search %s for test number(s) %s", - nfail, ntest, names(fn), toString(env$whichfail) + "%d error(s) out of %d. Search %s for test number(s) %s. Duration: %s.", + nfail, ntest, names(fn), toString(env$whichfail), timetaken(env$started.at) ) # important to stopf() here, so that 'R CMD check' fails # nocov end } # There aren't any errors, so we can use up 11 lines for the timings table - timings = env$timings - DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT - if ((x<-sum(timings[["nTest"]])) != ntest) { - warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov + nTest = RSS = NULL # to avoid 'no visible binding' note + timings = env$timings[nTest>0] + if (!memtest) { + ans = head(timings[if (dev) -1L else TRUE][order(-time)], 10L)[,RSS:=NULL] # exclude id 1 in dev as that includes JIT + if ((x<-sum(timings[["nTest"]])) != ntest) { + warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov + } + catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-ans[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) + print(ans, class=FALSE) + } else { + y = head(order(-diff(timings$RSS)), 10L) + ans = timings[, 
diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here + catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") + print(ans, class=FALSE) + get("dev.new")(width=14, height=7) + get("par")(mfrow=c(1,2)) + get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS))) + get("mtext")(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2) + get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)") + get("mtext")(lastRSS, side=4, at=lastRSS, las=1, font=2) } - catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) - print(DT, class=FALSE) catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) - - ## this chunk requires to include new suggested deps: graphics, grDevices - #memtest.plot = function(.inittime) { - # if (!all(requireNamespace(c("graphics","grDevices"), quietly=TRUE))) return(invisible()) - # inittime=PS_rss=GC_used=GC_max_used=NULL - # m = fread("memtest.csv")[inittime==.inittime] - # if (nrow(m)) { - # ps_na = allNA(m[["PS_rss"]]) # OS with no 'ps -o rss R' support - # grDevices::png("memtest.png") - # p = graphics::par(mfrow=c(if (ps_na) 2 else 3, 2)) - # if (!ps_na) { - # m[, graphics::plot(test, PS_rss, pch=18, xlab="test num", ylab="mem MB", main="ps -o rss R")] - # m[, graphics::plot(timestamp, PS_rss, type="l", xlab="timestamp", ylab="mem MB", main="ps -o rss R")] - # } - # m[, graphics::plot(test, GC_used, pch=18, xlab="test num", ylab="mem MB", main="gc used")] - # m[, graphics::plot(timestamp, GC_used, type="l", xlab="timestamp", ylab="mem MB", main="gc used")] - # m[, graphics::plot(test, GC_max_used, pch=18, xlab="test num", ylab="mem MB", main="gc max used")] 
- # m[, graphics::plot(timestamp, GC_max_used, type="l", xlab="timestamp", ylab="mem MB", main="gc max used")] - # graphics::par(p) - # grDevices::dev.off() - # } else { - # warningf("test.data.table runs with memory testing but did not collect any memory statistics.") - # } - #} - #if (memtest<-get("memtest", envir=env)) memtest.plot(get("inittime", envir=env)) - - invisible(nfail==0L) + ans = nfail==0L + attr(ans, "timings") = timings # as attr to not upset callers who expect a TRUE/FALSE result + invisible(ans) } # nocov start @@ -229,19 +240,9 @@ compactprint = function(DT, topn=2L) { INT = function(...) { as.integer(c(...)) } # utility used in tests.Rraw -ps_mem = function() { - # nocov start - cmd = sprintf("ps -o rss %s | tail -1", Sys.getpid()) - ans = tryCatch(as.numeric(system(cmd, intern=TRUE, ignore.stderr=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) - stopifnot(length(ans)==1L) # extra check if other OSes would not handle 'tail -1' properly for some reason - # returns RSS memory occupied by current R process in MB rounded to 1 decimal places (as in gc), ps already returns KB - c("PS_rss"=round(ans / 1024, 1L)) - # nocov end -} - gc_mem = function() { # nocov start - # gc reported memory in MB + # gc reports memory in MB m = apply(gc()[, c(2L, 4L, 6L)], 2L, sum) names(m) = c("GC_used", "GC_gc_trigger", "GC_max_used") m @@ -274,16 +275,22 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) - inittime = get("inittime", parent.frame()) + memtest.id = get("memtest.id", parent.frame()) filename = get("filename", parent.frame()) foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) - time = nTest = NULL # to avoid 'no visible binding' note + time = nTest = RSS = NULL # to avoid 'no visible binding' note if (num>0) on.exit( { - now = 
proc.time()[3L] - took = now-lasttime # so that prep time between tests is attributed to the following test - assign("lasttime", now, parent.frame(), inherits=TRUE) - timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ] + took = proc.time()[3L]-lasttime # so that prep time between tests is attributed to the following test + timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] + if (memtest) { + if (memtest==1L) gc() # see #5515 for before/after + inum = as.integer(num) + timings[inum, RSS:=max(rss(),RSS), verbose=FALSE] # TODO prefix inum with .. for clarity when that works + if (length(memtest.id) && memtest.id[1L]<=inum && inum<=memtest.id[2L]) cat(rss(),"\n") # after 'testing id ...' output; not using between() as it has verbose output when getOption(datatable.verbose) + if (memtest==2L) gc() + } + assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest } ) if (showProgress) # \r can't be in gettextf msg @@ -296,7 +303,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # not be flushed to the output upon segfault, depending on OS). } else { # not `test.data.table` but developer running tests manually; i.e. 
`cc(F); test(...)` - memtest = FALSE # nocov + memtest = 0L # nocov filename = NA_character_ # nocov foreign = FALSE # nocov ; assumes users of 'cc(F); test(...)' has LANGUAGE=en showProgress = FALSE # nocov @@ -326,9 +333,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no actual$message <<- c(actual$message, conditionMessage(m)) m } - if (memtest) { - timestamp = as.numeric(Sys.time()) # nocov - } if (is.null(output) && is.null(notOutput)) { x = suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)) # save the overhead of capture.output() since there are a lot of tests, often called in loops @@ -336,10 +340,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } else { out = capture.output(print(x <- suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)))) } - if (memtest) { - mem = as.list(c(inittime=inittime, filename=basename(filename), timestamp=timestamp, test=num, ps_mem(), gc_mem())) # nocov - fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov - } fail = FALSE if (.test.data.table && num>0) { if (num 1.0, mean(Petal.Length), by = Species] ## Community -`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://www.r-pkg.org/starred) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). +`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. 
It is one of the [top most starred](https://medium.datadriveninvestor.com/most-starred-and-forked-github-repos-for-r-in-data-science-fb87a54d2a6a) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). ### Stay up-to-date diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index bf0bf77e9..04c5c490b 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -168,3 +168,335 @@ test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40")) # Add scaled-up non-ASCII forder test 1896 +# Before #5501 do.call(data.table,) fully deparsed large unnamed args, #5492. +DF = data.frame(a=runif(1e6), b=runif(1e6)) +t1 = system.time(DT1 <- data.table(DF)) # 0.02s before and after +t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s after +test(, identical(DT1, DT2)) +test(, t2["elapsed"]/t1["elapsed"]<2) + +########################################################### +# largest tests by ram usage moved out of tests.Rraw, #5517 +########################################################### + +# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) +# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which +# this tests. But it's well under 10 seconds. +DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) +test(301, nrow(DT[,sum(B),by=C])==100010) +DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) +test(301.1, nrow(DT[,sum(B),by=C])==100010) + +# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. 
+options(datatable.optimize=0L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(637.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(637.3, key(DT[,a:=99L,by=a]), NULL) +options(datatable.optimize=2L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(638.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(638.3, key(DT[,a:=99L,by=a]), NULL) + +# Test X[Y] slowdown, #2216 +# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes +# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benchmark.Rraw. +X = CJ(a=seq_len(1e3),b=seq_len(1e3)) +Y = copy(X) +X[4,b:=3L] # create a dup group, to force allLen1=FALSE +setkey(X) +test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case +test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case + +# test uniqlengths +set.seed(45) +x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) +ox <- forderv(x) +o1 <- uniqlist(list(x), ox) +test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +o1 <- uniqlist(list(x)) +test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +rm(list=c("x","ox","o1")) +gc() + +# Fix for (usually small) memory leak when grouping, #2648. +# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row).
+DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) +before = gc()["Vcells",2] +for (i in 1:50) DT[, sum(B), by=A] +after = gc()["Vcells",2] +test(1157, after < before+3) # +3 = 3MB +# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. + +# Similar for when dogroups writes less rows than allocated, #2648. +DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) +before = gc()["Vcells",2] +for (i in 1:50) DT[ , unlist(.SD), by = 'k'] +after = gc()["Vcells",2] +test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 + +# fix DT[TRUE, :=] using too much working memory for i, #1249 +if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled + f = tempfile() + N = 1000000 # or any large number of rows + DT = data.table(A=1:N, B=rnorm(N)) + DT[TRUE, B := B * 2] # stabilize with initial dummy update + Rprofmem(f) + DT[TRUE, B := B * 2] # or some in-place update + Rprofmem(NULL) + test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only + unlink(f) +} + +if (FALSE) { + # Full range takes too long for CRAN. 
+ dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") + dtsCh = as.character(dts) # 36s + dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 + test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) +} else { + # test on CRAN a reduced but important range + dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") + dtsCh = as.character(dts) + test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) +} +DT = data.table(A=dts, B=as.IDate(dts)) +test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) +test(1739.4, typeof(dts), "double") +f = tempfile() +g = tempfile() # Full range +fwrite(DT,f) # 0.092s +write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s +test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) +test(1739.6, readLines(f), readLines(g)) +unlink(f) +unlink(g) +rm(list=c("dtsCh","dts")) +gc() + +# catch malformed factor in rbindlist, #3315 +set.seed(32940) +NN=7e5; KK=4e4; TT=25 +DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) +test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites + +# print.data.table row id in non-scientific notation, #1167 +DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) +test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) +rm(DT) + +# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because +# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. +# Would need a very complicated construction of embedded new lines in quoted fields, to test that. 
+# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. +DT = as.data.table(CO2) +f = tempfile() +for (i in 0:1000) { + start = nrow(CO2)*i + fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) + if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) +} +test(1835, fread(f, verbose=TRUE), + output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", + warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") +unlink(f) + +# test no memory leak, #2191 and #2284 +# These take a few seconds each, and it's important to run these on CRAN to check no leak +gc(); before = gc()["Vcells","(Mb)"] +for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB +gc(); after = gc()["Vcells","(Mb)"] +test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin +gc(); before = gc()["Vcells","(Mb)"] +DF = data.frame(x=1:20, y=runif(20)) +for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } +gc(); after = gc()["Vcells","(Mb)"] +test(862, after < before+0.5) +gc(); before = gc()["Vcells","(Mb)"] +DT = data.table(x=1:20, y=runif(20)) +for (i in 1:2000) { x <- DT[1:5,]; rm(x) } +gc(); after = gc()["Vcells","(Mb)"] +test(863, after < before+0.5) + +# fread should use multiple threads on single column input. +# tests 2 threads; the very reasonable limit on CRAN +# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) +if (getDTthreads() == 1L) { + cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. 
under UBSAN and ASAN)\n") +} else { + N = if (TRUE) 2e6 else 1e9 # offline speed check + fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) + test(1760.1, file.info(f)$size > 4*1024*1024) + test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") + unlink(f) +} + +# segfault of unprotected var caught with the help of address sanitizer; was test 1509 +# in #5517 I figured this test shouldn't be reduced in size due to its nature +set.seed(1) +val = sample(c(1:5, NA), 1e4L, TRUE) +dt <- setDT(replicate(100L, val, simplify=FALSE)) +## to ensure there's no segfault... +ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) +test(1035.21, ans, ans) + +# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 +# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. +# 2 threads are sufficient to fail before the fix. +N = 20 +DF = data.frame(a=rnorm(N), + b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), + c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) +DT = setDT(DF) # setDT required since data.table() already expanded altrep's +before = sum(gc()[, 2]) +fff = function(aref) { + ff = lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) + return(rbindlist(ff)) +} +for(i in 1:100) { + f = fff("a") + rm("f") +} +gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` + # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. +after = sum(gc()[, 2]) +test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). +# +before = sum(gc()[, 2]) +fff = function(aref) { + DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. 
+ lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) +} +for(i in 1:100) { + fff("a") +} +gc() +after = sum(gc()[, 2]) +test(1912.2, after < before + 10) + +DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) +fwrite(DT, f<-tempfile()) +test(1815, fread(f, nrows=5), DT[1:5]) #2243: nrows small vs large nrow(DT) + +# Better jump sync and run-on in PR#2627 +# +# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627 +# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly +x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) +x[51094]="" +cat(x, file=f<-tempfile(), sep="\n") +test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], + data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), + output="jumps=[0..2)") # ensure jump 1 happened +# +# out-of-sample short lines in the first jump, not near the jump point +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), + warning="Stopped early on line 5021.*<>") +test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,5020L,NA,NA,5042L,102184L)), + output="jumps=[0..2)") +# +# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 +# confirmed fails in master with that error before PR#2627 +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", 
V4="PQRS", V5=1:51093), + warning="Stopped early on line 51094.*<>", + output="jumps=[0..2)") +test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,51093L,NA,NA,51151L,102184L)), + output="jumps=[0..2)") +# +# jump inside a quoted field containing many new lines, to simulate a dirty jump +# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. +# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" +cat(x, file=f, sep="\n") +test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n + data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), + V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), + output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") +# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's +# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) +unlink(f) + +# chmatchdup test from benchmark at the bottom of chmatch.c +set.seed(45L) +x = sample(letters, 1e5, TRUE) +y = sample(letters, 1e6, TRUE) +test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), 
INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) +rm(list=c("x","y")) + +# Add nq tests 1641-1652 here with larger sizes and calls that have been turned off in the past as took too long, and +# restore the exact parameters w.r.t. Jan's comment: https://github.com/Rdatatable/data.table/pull/5520#discussion_r1020180583 + +# issue 2351 +set.seed(1) +DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) +fwrite(DT, file=f<-tempfile(), eol="\r") +test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) +cat("id888,42", file=f, append=TRUE) # without final \r after last line +test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) +unlink(f) + +# segfault when rbindlist is asked to create a DT with more than 2bn rows +DT = data.table(1:1e6) +L = vector("list", 2148) +for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test +test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") +rm(L, DT) +gc() + +# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 +# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too +set.seed(1) +DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary +setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow +test(2201.1, nrow(DT[, .N, by=grp]), 255L) +test(2201.2, nrow(setkey(DT, grp)), 65536L) +set.seed(1) +DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun +test(2201.3, nrow(DT[, .N, by=grp]), 65536L) +test(2201.4, nrow(setkey(DT, grp)), 65536L) +setDTthreads() # restore default throttle + +# print of DT with many columns reordered them, #3306. 
+DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print +out = capture.output(print(DT)) +tt = out[grep("V",out)] +tt = unlist(strsplit(gsub(" ","",tt), "V")) +test(1982.1, tt[1L], "") +tt = as.integer(tt[tt!=""]) +test(1982.2, tt, seq_along(tt)) + +# fread leak, #3292 +# Reads the same large file 10 times and checks that the Vcells entry in column 2 of gc() grows by less than 5%. +# The fixture goes to a tempfile that is unlink()ed afterwards, rather than an "out.tsv" left behind in the working directory. +dummy = rep("1\t2\t3\t4\t5", 10000000) +writeLines(dummy, f<-tempfile(fileext=".tsv")) +start = gc()["Vcells",2] +for (i in 1:10) data.table::fread(f) +end = gc()["Vcells",2] +# every other test() call in this file passes a numeric id first; the id was missing here. NOTE(review): 3292 is the issue number, confirm it is unused +test(3292, end/start < 1.05) +unlink(f) + + diff --git a/inst/tests/multi-file.zip b/inst/tests/multi-file.zip new file mode 100644 index 000000000..6bf27e2a4 Binary files /dev/null and b/inst/tests/multi-file.zip differ diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index e8ea3d7ee..d2ee592cc 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -281,7 +281,7 @@ if (test_bit64) { x = as.integer64(1L) test(10.81, coerceAs(x, 1), 1, output="double[integer64] into double[numeric]") test(10.82, coerceAs(x, 1L), 1L, output="double[integer64] into integer[integer]") - test(10.83, coerceAs(x, "1"), error="please use as.character", output="double[integer64] into character[character]") # not yet implemented + test(10.83, coerceAs(x, "1"), "1", output="double[integer64] into character[character]") test(10.84, coerceAs(1, x), x, output="double[numeric] into double[integer64]") test(10.85, coerceAs(1L, x), x, output="integer[integer] into double[integer64]") test(10.86, coerceAs("1", x), x, output="character[character] into double[integer64]", warning="Coercing.*character") @@ -294,14 +294,15 @@ if (test_nanotime) { x = nanotime(1L) test(10.91, coerceAs(x, 1), 1, output="double[nanotime] into double[numeric]") test(10.92, coerceAs(x, 1L), 1L, output="double[nanotime] into integer[integer]") - test(10.93, coerceAs(x, "1"), error="please use as.character", output="double[nanotime] into character[character]") # not yet implemented + test(10.93, substring(coerceAs(x, "1"),1,11)
%in% c("1","1970-01-01T"), output="double[nanotime] into character[character]") + # ^ https://github.com/eddelbuettel/nanotime/issues/92; %in% so as not to break if nanotime adds as.character method test(10.94, coerceAs(1, x), x, output="double[numeric] into double[nanotime]") test(10.95, coerceAs(1L, x), x, output="integer[integer] into double[nanotime]") test(10.96, coerceAs("1", x), x, output="character[character] into double[nanotime]", warning="Coercing.*character") } options(datatable.verbose=FALSE) test(11.01, coerceAs(list(a=1), 1), error="is not atomic") -test(11.02, coerceAs(1, list(a=1)), error="is not atomic") +test(11.02, coerceAs(1, list(a=1)), list(1)) test(11.03, coerceAs(sum, 1), error="is not atomic") test(11.04, coerceAs(quote(1+1), 1), error="is not atomic") test(11.05, coerceAs(as.name("x"), 1), error="is not atomic") diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index bd9374db2..807a67c19 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,10 +1,16 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. # So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by # users running test.data.table("other.Rraw"). # zoo needs to be before xts for #5101 otherwise xts's dependency zoo gets attached at position 2 if xts is loaded first +# Optional Suggest-ed package tests moved from tests.Rraw to here in #5516. 
Retaining their comments: +# "xts", # we have xts methods in R/xts.R +# "nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though) +# "yaml" # for fread's yaml argument (csvy capability) +# # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though + if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || !"package:data.table" %in% search()) { stop("Usage: R CMD INSTALL; require(data.table); test.data.table('other.Rraw')") @@ -202,3 +208,514 @@ if (loaded[["parallel"]]) { test(14.1, {example(':=', package='data.table', local=TRUE, echo=FALSE); TRUE}) test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE}) +if (loaded[["sf"]]) { #2273 + DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"))) + test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4") +} + +if (loaded[["yaml"]]) { # csvy; #1701. 
Was 2032-2033 in tests.Rraw, #5516 + f = testDir("csvy/test.csvy") + DT = data.table(var1 = c("A", "B"), + var2 = c(1L, 3L), + var3 = c(2.5, 4.3)) + DT_yaml = copy(DT) + setattr(DT_yaml, 'yaml_metadata', + list(name = "my-dataset", + source = "https://github.com/leeper/csvy/tree/master/inst/examples", + schema = list(fields = list( + list(name = "var1", title = "variable 1", type = "string", + description = "explaining var1", + constraints = list(list(required = TRUE))), + list(name = "var2", title = "variable 2", type = "integer"), + list(name = "var3", title = "variable 3", type = "number") + )))) + ## with skip = '__auto__', fread can figure out + ## how to start after the metadata (just ignoring it) + test(16.01, fread(f), DT) + ## should be the same, but with yaml_metadata attribute + test(16.02, fread(f, yaml = TRUE), DT_yaml) + ## testing verbose messaging + test(16.03, fread(f, yaml = TRUE, verbose = TRUE), + DT_yaml, output = 'Processed.*YAML metadata.*') + ## this file is identical, except the body of the + ## YAML header is commented out with # (should read identically) + test(16.04, + fread(testDir('csvy/test_comment.csvy'), yaml = TRUE), + DT_yaml) + ## user input is taken as most intentional & overrides YAML + DT_yaml[ , var2 := as.numeric(var2)] + test(16.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')), + DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2') + ## extraneous/unused fields shouldn't throw off reading + DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE) + test(16.06, names(DT), c('Date', 'WTI')) + test(16.07, attr(DT, 'yaml_metadata'), + list(names = c("Date", "WTI"), class = "data.frame", + title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv", + fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv", + sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D", + source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples", + item 
= "PET", sourcekey = "RWTC", freq = "Daily", + rate = "MID", type = "price", units = "Dollars per Barrel", + latestdate = "2015-08-31", releasedate = "2015-09-02", + nextreleasedate = "2015-09-10", source = "Thomson Reuters", + contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800")) + ## yaml can also handle sep, dec, quote, and na.strings + DT_out = data.table(var1 = c("A", "B"), + var2 = c(1L, NA), + var3 = c(2.5, 4.3)) + meta = + list(name = NULL, + schema = list(fields = list( + list(name = "var1", title = "variable 1", type = "string", + description = "a single-quoted character variable"), + list(name = "var2", title = "variable 2", type = "integer"), + list(name = "var3", title = "variable 3", type = "number", + description = "European-style numeric") + )), + header = TRUE, sep = "|", dec = ",", + quote = "'", na.strings = "@") + attr(DT_out, 'yaml_metadata') = meta + test(16.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out) + ## user-specified attributes can override data from YAML + meta$sep = "-" + setattr(DT_out, 'yaml_metadata', meta) + test(16.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out, + message = 'User-supplied.*sep.*override') + + meta$sep = "|" + setattr(DT_out, 'yaml_metadata', meta) + test(16.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE), + DT_out, message = 'User-supplied.*header.*override') + col.names = c('x', 'y', 'z') + setnames(DT_out, col.names) + test(16.11, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out, + message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override')) + + test(16.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names), + DT_out, message = 'User-supplied.*col.names') + + setnames(DT_out, c('var1', 'var2', 'var3')) + meta$quote = "^" + setattr(DT_out, 'yaml_metadata', meta) + test(16.13, 
fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"), + DT_out, message = 'User-supplied.*quote') + + meta$quote = "'" + meta$dec = "." + setattr(DT_out, 'yaml_metadata', meta) + test(16.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','), + DT_out, message = 'User-supplied.*dec') + + meta$dec = ',' + meta$na.strings = 'NA' + setattr(DT_out, 'yaml_metadata', meta) + test(16.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'), + DT_out, message = 'User-supplied.*na.strings') + + ## error if YAML malformed + test(16.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE), + error = 'Reached the end.*YAML.*valid csvy') + ## use any other CSV in test directory which doesn't have YAML + if (loaded[["R.utils"]]) test(16.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE), + error = 'Encountered.*unskipped.*constitute.*valid YAML') + ## no problem if some fields are missing a type (just + ## resort to standard auto-inferral, i.e., identical to + ## the case of partially-specified colClasses) + DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L), + var3 = c(2.5, 4.3)) + setattr(DT, 'yaml_metadata', + list(name = "my-dataset", source = "https://github.com/leeper/csvy/tree/master/inst/examples", + schema = list(fields = list( + list(name = "var1"), list(name = "var2", type = "integer"), + list(name = "var3", type = "number") + )))) + test(16.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT) + ## skip applies starting after the YAML header + setattr(DT, 'yaml_metadata', + list(schema = list(fields = list( + list(name = "var1", type = "string"), + list(name = "var2", type = "integer"), + list(name = "var3", type = "number") + )))) + test(16.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT) + ## user-supplied col.names override metadata (as for colClasses) + cn = paste0('V', 1:3) + setnames(DT, cn) + test(16.20, fread(testDir('csvy/test_skip.csvy'), + yaml 
= TRUE, skip = 2L, col.names = cn), + DT, message = 'User-supplied column names.*override.*YAML') + ## invalid value fails + test(16.21, fread(f, yaml = 'gobble'), + error = 'isTRUEorFALSE\\(yaml\\) is not TRUE') + + ## warning that skip-as-search doesn't work with yaml + DT_yaml[ , var2 := as.integer(var2)] + test(16.22, fread(f, skip = 'var1,', yaml = TRUE), + DT_yaml, warning = 'Combining a search.*YAML.*') + + # fwrite csvy: #3534 + tmp = tempfile() + DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) + # force eol for platform independence + fwrite(DT, tmp, yaml = TRUE, eol = '\n') + as_read = readLines(tmp) + test(17.01, as_read[c(1L, 24L)], c('---', '---')) + test(17.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) + test(17.03, grepl('creation_time_utc', as_read[3L])) + test(17.04, as_read[4:23], + c("schema:", " fields:", " - name: a", " type: integer", + " - name: b", " type: numeric", " - name: c", " type: character", + "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", + # NB: apparently \n is encoded like this in YAML + "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", + "logical01: no")) + tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") + test(17.05, as_read[25:30], tbl_body) + + # windows eol + fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') + test(17.06, readLines(tmp)[18L], 'eol: "\\r\\n"') + + # multi-class columns + DT[ , t := .POSIXct(1:5, tz = 'UTC')] + fwrite(DT, tmp, yaml = TRUE) + as_read = readLines(tmp) + test(17.07, as_read[13L], " type: POSIXct") + + # ~invertibility~ + # fread side needs to be improved for Hugh's colClasses update + DT[ , t := NULL] + fwrite(DT, tmp, yaml = TRUE) + DT2 = fread(tmp, yaml = TRUE) + # remove metadata to compare + attr(DT2, 'yaml_metadata') = NULL + test(17.08, all.equal(DT, DT2)) + + test(17.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), + output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', 
tbl_body[-1L]), collapse=".*")) + unlink(tmp) # tmp is not used again below; remove it like the other temp files in this file + + # TODO: test gzip'd yaml which is now supported + + # yaml + bom arguments + DT = data.table(l=letters, n=1:26) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 in tests.Rraw + lines = readLines(fcon) + lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows + # remove the blank here so we don't need to change this test if/when that changes in yaml package + test(17.11, length(lines), 48L) + close(fcon) + test(17.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + # re-write to the same file should have same output (overwritten, not appended) + # NOTE(review): this previously wrote to a fresh tempfile(), which could not detect appending and left the first file behind + fwrite(DT, f, bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") + lines = readLines(fcon) + lines = lines[lines!=""] + test(17.13, length(lines), 48L) + close(fcon) + test(17.14, fread(f), DT) + unlink(f) +} + +if (loaded[["xts"]]) { # was 1465 in tests.Rraw, #5516 + # data.table-xts conversion #882 + # Date index + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + dt_xt = as.data.table(xt) + xt_dt = as.xts.data.table(dt) + test(18.01, all.equal(dt, dt_xt, check.attributes = FALSE)) + test(18.02, xt, xt_dt) + # POSIXct index + dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + dt_xt = as.data.table(xt) + xt_dt = as.xts.data.table(dt) + test(18.03, all.equal(dt, dt_xt, check.attributes = FALSE)) + test(18.04, xt, xt_dt) + # index
types returned from to.period + dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index) + xt_w = xts::to.weekly(xt) + xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w)) + xt_m = xts::to.monthly(xt) + xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m)) + xt_q = xts::to.quarterly(xt) + xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q)) + xt_y = xts::to.yearly(xt) + xt_dt_xt_y = as.xts.data.table(as.data.table(xt_y)) + test(18.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE)) + test(18.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE)) + test(18.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE)) + test(18.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE)) + + test(18.09, xts::last(1:5), 5L) # was test 1531 + + # xts issue from Joshua, #1347 + x = as.Date(1:5, origin="2015-01-01") + test(18.10, last(x), tail(x, 1L)) # was test 1559 + + x = xts(1:100, Sys.Date()+1:100) + test(18.11, last(x,10), x[91:100,]) # was test 841 + # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. + # But that isn't tested by R CMD check because xts is loaded above data.table, there. 
+ # So to make this test relevant, run it in a fresh R session directly, after: "require(xts);require(data.table)" + # rather than: "require(data.table);require(xts)" + # Which was the main thrust of bug#2312 fixed in v1.8.3 + + # fix for #1484; was test 1589 + x = xts::as.xts(8, order.by = as.Date("2016-01-03")) + test(18.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE)) + + # IDate support in as.xts.data.table #1499; was test 1663 + dt <- data.table(date = c(as.IDate("2014-12-31"), + as.IDate("2015-12-31"), + as.IDate("2016-12-31")), + nav = c(100,101,99), + key = "date") + dt.xts <- as.xts.data.table(dt) + test(18.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) + + # additional coverage missing uncovered in #3117 + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + test(18.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) + names(xt)[1L] = 'index' + test(18.15, as.data.table(xt), error = 'Input xts object should not') + names(xt)[1L] = 'quantity' + setcolorder(dt, c(3, 1, 2)) + if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286 + test(18.16, as.xts(dt), error = 'data.table must have a time based') + setcolorder(dt, c(2, 3, 1)) + dt[ , char_col := 'a'] + test(18.17, as.xts(dt), xt, warning = 'columns are not numeric') + # drop the temporary as.xts alias assigned above for R < 3.6.0 + if (base::getRversion() < "3.6.0") rm(as.xts) + + # 890 -- key argument for as.data.table.xts + x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) + # verbose off while the printed table is captured; the previous option value is kept in `old` + old = options(datatable.verbose=FALSE) + test(18.18, capture.output(as.data.table(x, key="index")), + c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", + " 3:
1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", + " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", + " 9: 1970-01-10 9", "10: 1970-01-11 10")) + options(old) + + # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 + M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above + test(18.19, inherits(as.data.table(M)$index,"POSIXct")) + + # non-numeric xts coredata, #5268 + x = xts::xts(x=c(TRUE,FALSE), order.by=Sys.Date()+(1:2)) + colnames(x) = "value" # perhaps relates to #4897 + test(18.20, identical(x, as.xts(as.data.table(x), numeric.only=FALSE))) +} + +# was 2108 in tests.Rraw, #5516 +# first and last should no longer load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached +# stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces()) +x = as.POSIXct("2019-09-09")+0:1 +old = options(datatable.verbose=TRUE) +test(19.01, last(x), x[length(x)], output="!is.xts(x)") +test(19.02, first(x), x[1L], output="!is.xts(x)") +if (loaded[["xts"]]) { + xt = xts(1:2, x) + test(19.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)") + test(19.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)") + xt = xts(matrix(1:4, 2L, 2L), x) + test(19.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)") + test(19.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)") +} +# first on empty df now match head(df, n=1L), #3858 +df = data.frame(a=integer(), b=integer()) +test(19.11, first(df), df, output="!is.xts(x)") +test(19.12, last(df), df, output="!is.xts(x)") +options(datatable.verbose=FALSE) # so the as.data.table() doesn't pollute output +# xts last-first dispatch fix #4053 +x = 1:3 +y = as.POSIXct(x, origin="1970-01-01") +df = data.frame(a=1:2, b=3:2) +dt = 
as.data.table(df) +mx = matrix(1:9, 3, 3) +ar = array(1:27, c(3,3,3)) +xt = structure( + c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994, + 140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993, + 94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"), + .Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) +) +options(datatable.verbose=TRUE) +if (loaded[["xts"]]) { + test(19.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + xt_last = structure( + c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(1167955200, tzone = "UTC", tclass = "Date"), + .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + xt_last2 = structure( + c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005, + 141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"), + .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + 
test(19.25, last(xt), xt_last, output="using xts::last: is.xts(x)") + test(19.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)") + test(19.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + xt_first = structure( + c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(1167782400, tzone = "UTC", tclass = "Date"), + .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + xt_first2 = structure( + c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"), + .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + test(19.35, first(xt), xt_first, output="using xts::first: is.xts(x)") + test(19.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)") +} else { + test(19.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.24, last(y, n=1L), y[3L], 
output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.25, last(xt), error="you should have 'xts' installed already") + test(19.26, last(xt, n=2L), error="you should have 'xts' installed already") + test(19.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.35, first(xt), error="you should have 'xts' installed already") + test(19.36, first(xt, n=2L), error="you should have 'xts' installed already") +} +test(19.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(19.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(19.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(19.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +# matrix/array utils::tail behavior is likely to change in future R, Michael is more in the topic +test(19.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using 
utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127 +test(19.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +test(19.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127 +test(19.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +options(old) + +if (loaded[["xts"]]) { # was 2133 in tests.Rraw, #5516 + # keep.rownames in as.data.table.xts() supports a string, #4232 + xts = xts::xts(1:10, structure(1:10, class = "Date")) + colnames(xts) = "VALUE" + DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE") + test(20.1, colnames(DT), c("DATE", "VALUE")) + test(20.2, key(DT), "DATE") + test(20.3, as.data.table(xts, keep.rownames = "VALUE"), + error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. 
Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") + test(20.4, as.data.table(xts, keep.rownames = character()), + error = "keep.rownames must be length 1") + test(20.5, as.data.table(xts, keep.rownames = NA_character_), + error = "keep.rownames must not be NA") +} + +if (loaded[["nanotime"]]) { + + # was 1463.62-65 in tests.Rraw, #5516 + x=nanotime(1:4) + test(21.1, shift(x ), c(nanotime::nanotime(NA), x[1:3])) + test(21.2, shift(x, fill=0L), c(nanotime::nanotime(0L), x[1:3])) + test(21.3, shift(x, 1, type="cyclic"), c(x[4L], x[-4L])) + test(21.4, shift(x, -1, type="cyclic"), c(x[-1L], x[1L])) + + # was 1752 in tests.Rraw, #5516 + DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", + "2016-09-29T23:59:00.000000001Z", + "2016-09-29T23:59:00.000000999Z", + "1970-01-01T00:01:01.000001000Z", + "1970-01-01T00:00:00.000000000Z", + "1969-12-31T23:59:59.999999999Z", + "1969-12-31T23:59:59.000000089Z", + "1969-12-31T12:13:14.000000000Z", + "1969-12-31T12:13:14.999999999Z", + "1969-12-31T12:13:14.000000001Z", + "1967-03-15T00:00:00.300000002Z", + "1967-03-15T23:59:59.300000002Z"))) + test(22, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) + + # was 2060.401-405 in tests.Rraw, #5516 + nt = nanotime(c(1L, 2L, NA_integer_, 4L)) + nt_val = nanotime(1:4) + test(23.1, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46 + test(23.2, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val)) + test(23.3, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val)) + test(23.4, fcoalesce(nt, 1), error='Item 2 has a different class than item 1') + test(23.5, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double') + + # was 2080.01-05 in tests.Rraw, #5516 + n = nanotime(1:4) + n[2L] = NA + opt = options(datatable.verbose=TRUE) + test(24.1, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), 
output="between parallel processing of integer64") + test(24.2, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") + test(24.3, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") + options(opt) + test(24.4, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") + test(24.5, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") + + # was 2085.11 in tests.Rraw, #5516 + n = nanotime(1:4) + test(25, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4])) + + # was 2127.27 in tests.Rraw, #5516 + n = nanotime(1:12) + test(26, fcase(c(-5L:5L<0L,NA), n, c(-5L:5L>0L,NA), n+100), c(n[1L:5L], nanotime(NA), n[7L:11L]+100, as.integer64(NA))) + + # na.omit works for nanotime, #4744. Was 2205 in tests.Rraw, #5516 + DT = data.table(time=nanotime(c(1,NA,3))) + test(27, na.omit(DT), DT[c(1,3)]) + +} + +# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 +DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 ) +test(28.1, DT[,plot(b,f)], NULL) +test(28.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 +test(28.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) +try(graphics.off(),silent=TRUE) + +# test DT$.<- in a data.table-unaware package +# moved from tests.Rraw 1890 to here to save ram of loading stats package and plot, #5517 +DT = data.table(A=1:5) +test(29.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") +# Inside ts.plot is a gpars$ylab<- which happens before its error. 
That dispatches to our $<- which does the alloc.col() +test(29.2, DT, data.table(A=1:5)) + +if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure we need to still test reshape2 + # test dispatch for non-data.table objects, #4864. + if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { + test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + error="The melt generic in data.table has been passed a data.frame") + } else { + # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 + # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) + # 3) in dev locally I have reshape2 installed to run caret in other.Rraw + test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), + warning="The melt generic in data.table has been passed a data.frame") + } +} + diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index bed7bf0db..429545dcb 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -603,3 +603,15 @@ test(103.02, nadt, data.table(x1 = c(1, 2, 0, 0), x2 = c(2, 0, 3, 0), x3 = c(0, test(201.1, substitute2(dt, env=list(dt = data.table(a=1:9, b=1:9))), data.table(a=1:9, b=1:9)) test(201.2, substitute2(dt, env=list(dt = data.table(a=1:9, b=as.character(1:9)))), data.table(a=1:9, b=as.character(1:9))) test(201.3, substitute2(dt, env=list(dt = data.table(a=1:2, b=as.character(1:2)))), data.table(a=1:2, b=as.character(1:2))) + +# ensure env argument is a standard evaluation argument #4994 #4995 +dt = data.table(x=1:2, y=2:1) +jpar = list(.j=list("y")) +test(202.1, dt[, .j, env=jpar], data.table(y=2:1)) +f = function(d, params) { + d[, .j, env=params] +} +test(202.2, f(dt, params=jpar), data.table(y=2:1)) +"." = function(...) 
list(.j=list("x")) +test(202.3, dt[, .j, env=.(.j=list("y"))], data.table(x=1:2)) +rm(list=".") diff --git a/inst/tests/russellCRLF.tar b/inst/tests/russellCRLF.tar new file mode 100644 index 000000000..6d508dcce Binary files /dev/null and b/inst/tests/russellCRLF.tar differ diff --git a/inst/tests/russellCRLF.zip b/inst/tests/russellCRLF.zip new file mode 100644 index 000000000..c5060f1fe Binary files /dev/null and b/inst/tests/russellCRLF.zip differ diff --git a/inst/tests/test2224.Rdata b/inst/tests/test2224.Rdata new file mode 100644 index 000000000..9c6423b9f Binary files /dev/null and b/inst/tests/test2224.Rdata differ diff --git a/inst/tests/test2233-43.Rdata b/inst/tests/test2233-43.Rdata new file mode 100644 index 000000000..6f8456cc0 Binary files /dev/null and b/inst/tests/test2233-43.Rdata differ diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 31a218ea9..9e636c8b7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7,7 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") - DTfun = DT # just in dev-mode, DT() gets overwritten in .GlobalEnv by DT objects here in tests.Rraw; we restore DT() in test 2212 + rm_all = function() {} } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -21,6 +21,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { allNA = data.table:::allNA any_na = data.table:::any_na as.data.table.array = data.table:::as.data.table.array + as.data.table.default = data.table:::as.data.table.default as.IDate.default = data.table:::as.IDate.default as.ITime.default = data.table:::as.ITime.default binary = data.table:::binary @@ -52,6 +53,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { print.data.table = 
data.table:::print.data.table replace_dot_alias = data.table:::replace_dot_alias rollup.data.table = data.table:::rollup.data.table + rss = data.table:::rss selfrefok = data.table:::selfrefok setcoalesce = data.table:::setcoalesce setdiff_ = data.table:::setdiff_ @@ -85,17 +87,34 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { last = data.table::last # xts first = data.table::first # xts, S4Vectors copy = data.table::copy # bit64 v4; bit64 offered to rename though so this is just in case bit64 unoffers + second = data.table::second # lubridate #1135 + minute = data.table::minute # lubridate + hour = data.table::hour # lubridate + yday = data.table::yday # lubridate + wday = data.table::wday # lubridate + mday = data.table::mday # lubridate + week = data.table::week # lubridate + isoweek = data.table::isoweek # lubridate + month = data.table::month # lubridate + quarter = data.table::quarter # lubridate + year = data.table::year # lubridate + yearmon = data.table::yearmon # zoo + yearqtr = data.table::yearqtr # zoo + + rm_all = function(env=parent.frame()) { + tt = setdiff(ls(envir=env), .do_not_rm) + rm(list=tt, envir=env) + gc() + invisible() + } } -# Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN -# The reason for inclusion here is stated next to each package +# Optional suggests are now tested in other.Rraw, #5516. No calls to require() or library() should occur +# in this file other than for methods and data.table above, and these here. +# These are included in code coverage, and on CRAN. The reason for inclusion is stated next to each package. 
sugg = c( "bit64", # if big integers are detected in file, fread reads them as bit64::integer64 if installed (warning if not) - "xts", # we have xts methods in R/xts.R - "nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though) - "R.utils", # for fread to accept .gz and .bz2 files directly - "yaml" # for fread's yaml argument (csvy capability) - # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though + "R.utils" # many fread test input files are compressed to save space; fundamental to test environment ) for (s in sugg) { assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( @@ -146,10 +165,13 @@ base_messages = list( mixed_subscripts = get_msg(letters[-1:1]) ) +########################## +.do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") -test(1.2, tables(silent=TRUE), data.table(NAME="timings", NROW=9999L, NCOL=3L, MB=0, COLS=list(c("ID","time","nTest")), KEY=list(NULL))) +test(1.2, tables(silent=TRUE)[,.(NAME,NROW,MB)], # memtest=TRUE adds some columns so exclude NCOL and COLS here + data.table(NAME="timings", NROW=9999L, MB=0)) TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7) setkey(TESTDT,a,b) @@ -509,11 +531,7 @@ test(164, foo(f), DT[,mean(b),by=d]) test(165, subset(DT,a>2), DT[a>2]) test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2]) -# and that plotting works -test(167.1, DT[,plot(b,f)], NULL) -test(167.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 -test(167.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) -try(graphics.off(),silent=TRUE) +# 167 tested graphics::plot, moved to other.Rraw 28 
to save ram, #5517 # IDateTime conversion methods that ggplot2 uses (it calls as.data.frame method) # Since %b is e.g. "nov." in LC_TIME=fr_FR.UTF-8 locale, we need to @@ -640,7 +658,7 @@ test(211, ncol(TESTDT), 2L) DT = data.table(a=1:6,key="a") test(212, DT[J(3)]$a, 3L) # correct class c("data.table","data.frame") class(DT) = "data.table" # incorrect class, but as from 1.8.1 it works. By accident when moving from colnames() to names(), it was dimnames() doing the check, but rather than add a check that identical(class(DT),c("data.frame","data.table")) at the top of [.data.table, we'll leave it flexible to user (user might not want to inherit from data.frame for some reason). -test(213, DT[J(3)]$a, error="x is not a data.table|frame") # from v1.14.2, data.table must inherit from data.frame (internals are too hard to reason if a data.table may not be data.frame too) +test(213, DT[J(3)]$a, 3L) # setkey now auto coerces double and character for convenience, and # to solve bug #953 @@ -950,13 +968,7 @@ DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) -# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) -# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which -# this tests. But it's well under 10 seconds. 
-DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) -test(301, nrow(DT[,sum(B),by=C])==100010) -DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) -test(301.1, nrow(DT[,sum(B),by=C])==100010) +# 301 moved to benchmark.Rraw, #5517 # Test fast assign DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a") @@ -1082,7 +1094,7 @@ test(353, DT[2,f:="c"], data.table(f=factor(c("a","c","a","b")),x=1:4)) test(354, DT[3,f:=factor("foo")], data.table(f=factor(c("a","c","foo","b")),x=1:4)) # Test growVector logic when adding levels (don't need to grow levels for character cols) -newlevels = as.character(as.hexmode(1:2000)) +newlevels = format(as.hexmode(1:2000)) DT = data.table(f=factor("000"),x=1:2010) test(355, DT[11:2010,f:=newlevels], data.table(f=factor(c(rep("000",10),newlevels)),x=1:2010)) @@ -1169,7 +1181,7 @@ test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error=base_message # test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557. test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3))) -test(383, DT[,{z=10L;b:=z},by=a], error=":= and `:=`(...) 
are defined for use in j, once only and in particular ways") +test(383, DT[,{z=10L;b:=z},by=a], error="defined for use in j, once only and in particular ways") test(384, DT[,{mySD=copy(.SD);mySD[1,b:=99L];mySD},by=a], data.table(a=rep(1:3,1:3),b=c(99L,99L,4L,99L,6L,6L))) # somehow missed testing := on logical subset with mixed TRUE/FALSE, reported by Muhammad Waliji @@ -1808,25 +1820,41 @@ test(610.3, chorder(x), base::order(x)) test(610.4, unique(x[chgroup(x)]), unique(x)) # := by group +options(datatable.optimize=0L) +DT = data.table(a=1:3,b=(1:9)/10) +test(611.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +setkey(DT,a) +test(611.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(611.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +options(datatable.optimize=2L) DT = data.table(a=1:3,b=(1:9)/10) -test(611, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +test(612.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) -test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +test(612.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(612.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(612.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # Assign to subset ok (NA initialized in the other items) ok : test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) -# Combining := by group with i -test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(617, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. 
Plus changes from #759 ans = copy(DT)[,r:=NA_real_] -test(618, copy(DT)[a>3,r:=sum(b)], ans) -test(619, copy(DT)[J(-1),r:=sum(b)], ans) -test(620.1, copy(DT)[NA,r:=sum(b)], ans) -test(620.2, copy(DT)[0,r:=sum(b)], ans) -test(620.3, copy(DT)[NULL,r:=sum(b)], null.data.table()) +options(datatable.optimize=0L) +test(618.1, copy(DT)[a>3,r:=sum(b)], ans) +test(618.2, copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3, copy(DT)[NA,r:=sum(b)], ans) +test(618.4, copy(DT)[0,r:=sum(b)], ans) +test(618.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) +options(datatable.optimize=2L) +test(619.1, copy(DT)[a>3,r:=sum(b)], ans) +test(619.2, copy(DT)[J(-1),r:=sum(b)], ans) +test(619.3, copy(DT)[NA,r:=sum(b)], ans) +test(619.4, copy(DT)[0,r:=sum(b)], ans) +test(619.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) DT = data.table(x=letters, key="x") test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained @@ -1834,7 +1862,10 @@ test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) -test(623, DT[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +options(datatable.optimize=0L) +test(623.1, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +options(datatable.optimize=2L) +test(623.2, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Tests on POSIXct attributes @@ -1863,6 +1894,8 @@ test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list # Test merge with common names and all.y=TRUE, #2011 DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a") DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a") +DT3 = data.table(a=c(2), total=c(5), key="a") +DT4 = data.table(a=c(3), total=c(1), key="a") # 629+630 worked before anyway. 631+632 test the bug fix. 
adf=as.data.frame adt=as.data.table @@ -1875,6 +1908,16 @@ test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a" test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a")) test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a)) +# ensure merge(x,y,all.y) does not alter input y ... +# .. i subset y with 1:nrow(y) +test(631.2, merge(DT1[c(1,3)],DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=NA_real_,total.y=c(5,1,2),key="a")) +test(631.3, DT2, data.table(a=c(2,3,5), total=c(5,1,2), key="a")) +# .. nrow(y)=1, i subset y with 1 and no match with x +test(631.4, merge(DT1,DT3,all.y=TRUE), data.table(a=c(2),total.x=NA_real_,total.y=c(5),key="a")) +test(631.5, DT3, data.table(a=c(2), total=c(5), key="a")) +# .. nrow(y)=1, i subset y with 1 and match with x +test(631.6, merge(DT1,DT4,all.y=TRUE), data.table(a=c(3),total.x=c(1),total.y=c(1),key="a")) +test(631.7, DT4, data.table(a=c(3), total=c(1), key="a")) test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a")) test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) @@ -1888,13 +1931,7 @@ DT = data.table(x=1:3,y=1:3) test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) -# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. 
-set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(637, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(638, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(639, key(DT[,a:=99L,by=a]), NULL) +# 637-639 moved to benchmark.Rraw, #5517 # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) @@ -1902,7 +1939,7 @@ test(640, capture.output(print(DT,class=FALSE)), c(" a b c","1: 8 xy DT=data.table(a=letters,b=1:26) test(641, tail(capture.output(print(DT[1:20], class=FALSE)),2), c("19: s 19","20: t 20")) test(642, tail(capture.output(print(DT[1:21], class=FALSE, nrows=100)),2), c("21: u 21"," a b")) -DT=data.table(a=as.character(as.hexmode(1:500)), b=1:500) +DT=data.table(a=format(as.hexmode(1:500)), b=1:500) test(643, capture.output(print(DT, class=FALSE)), c(" a b"," 1: 001 1"," 2: 002 2"," 3: 003 3"," 4: 004 4"," 5: 005 5"," --- ","496: 1f0 496","497: 1f1 497","498: 1f2 498","499: 1f3 499","500: 1f4 500")) # Test inconsistent length of columns error. @@ -1914,9 +1951,9 @@ test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of # Test faster mean with a lot of very small groups. Example from (now not needed as much) data.table wiki point 3. # benchmarks.Rraw contains the same, to be scaled up. set.seed(9) -n=1e4 # very small n so as not to overload daily CRAN checks. -DT=data.table(grp1=sample(1:150, n, replace=TRUE), - grp2=sample(1:150, n, replace=TRUE), +n=1e3 # very small n (1e4) so as not to overload daily CRAN checks. Then reduced even further to just 1e3, #5517 +DT=data.table(grp1=sample.int(150L, n, replace=TRUE), + grp2=sample.int(150L, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect. 
@@ -1986,18 +2023,32 @@ test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") # Test := keyby does setkey, #2065 +options(datatable.optimize=0L) +DT = data.table(x=1:2, y=1:6) +ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") +test(670.1, DT[,z:=sum(y),keyby=x], ans) +DT = data.table(x=1:2, y=1:6) +test(670.2, DT[,z:=sum(y),keyby="x"], ans) +DT = data.table(x=1:2, y=1:6) +test(670.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), + warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") +DT = data.table(x=1:2, y=1:6) +test(670.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +DT = data.table(x=1:2, y=1:6) +test(670.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +options(datatable.optimize=2L) DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670, DT[,z:=sum(y),keyby=x], ans) +test(671.1, DT[,z:=sum(y),keyby=x], ans) DT = data.table(x=1:2, y=1:6) -test(671, DT[,z:=sum(y),keyby="x"], ans) +test(671.2, DT[,z:=sum(y),keyby="x"], ans) DT = data.table(x=1:2, y=1:6) -test(672, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), +test(671.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. 
To avoid this warning, use by= instead, or provide existing column names to keyby=") DT = data.table(x=1:2, y=1:6) -test(673, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(671.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) DT = data.table(x=1:2, y=1:6) -test(674, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +test(671.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # Test new .() DT = data.table(x=1:2, y=1:6, key="x") @@ -2152,9 +2203,13 @@ test(738, DT[,c("c2", "c1"):=list(c1+1L, NULL)], data.table(c2=2:3)) # `:=`(c1=v1,v2=v2,...) is now valid , #2254 DT = data.table( c1=1:3 ) -test(739, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) -test(740, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named") -test(741, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch +test(739.1, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) +test(739.2, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named") +test(739.3, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch +DT = data.table( c1=1:3 ) +test(739.4, DT[,let(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) +test(739.5, DT[,let(4:6,c3=7:9)], error="all arguments must be named") +test(739.6, DT[,let(4:6,7:9,10:12)], error="all arguments must be named") # that out of bounds LHS is caught, root cause of #2254 test(742, DT[,3:6:=1L], error="outside.*range") @@ -2168,21 +2223,32 @@ test(746, DT["a",c("new1","new2"):=list(4L, 5L)], data.table(a=letters[c(1:3,3L)],new1=INT(4,NA,NA,NA),new2=INT(5,NA,NA,NA),key="a")) test(747.1, DT[,new1:=4:6], error="Supplied 3 items to be assigned to 4 items of column 'new1'") test(747.2, DT[,new1:=INT(4,5,6,4)], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a")) 
-test(748, DT[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) +test(748.1, copy(DT)[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) +test(748.2, copy(DT)[c("c","b"),let(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) # and multiple LHS by group, #1710 DT = data.table(a=rep(6:8,1:3),b=1:6) test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8,1:3),b=1:6,c=rep(1:3,1:3),d=INT(rep(c(1,5,15),1:3)),e=rep(6:8,1:3)*10L)) -test(750, DT[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) +test(750.1, copy(DT)[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) +test(750.2, copy(DT)[a<8,let(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) # varname holding colnames, by group, linked from #2120. 
+options(datatable.optimize=0L) +DT = data.table(a=rep(1:3,1:3),b=1:6) +colname = "newcol" +test(751.1, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +options(datatable.optimize=2L) DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" -test(751, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +test(751.2, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 +options(datatable.optimize=0L) +DT = data.table(a=rep(1:3,2:4),b=1:9) +test(752.1, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +options(datatable.optimize=2L) DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +test(752.2, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) @@ -2271,7 +2337,8 @@ test(783, DT[,.I,by=a]$I, 1:8) test(784, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=INT(2,4,6,8),key="a")) test(785, DT[J(2:4),.I,by=a%%2L], data.table(a=rep(0:1,c(4,2)),I=INT(3,4,7,8,5,6))) test(786, DT[J(c(3,2,4)),list(.I,.GRP),by=.EACHI], data.table(a=rep(c(3L,2L,4L),each=2),I=INT(5,6,3,4,7,8),GRP=rep(1:3,each=2L))) -test(787, DT[J(3:2),`:=`(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) +test(787.1, copy(DT)[J(3:2),`:=`(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) +test(787.2, copy(DT)[J(3:2),let(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) # New not-join (a.k.a. 
not-select, since not just for data.table i but integer, logical and character too) DT = data.table(A=rep(1:3,each=2),B=1:6,key="A") @@ -2343,16 +2410,7 @@ mycols = 2 test(814.12, DT[,!..mycols], ans) test(814.13, DT[,-..mycols], ans) - -# Test X[Y] slowdown, #2216 -# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes -# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw. -X = CJ(a=seq_len(1e3),b=seq_len(1e3)) -Y = copy(X) -X[4,b:=3L] # create a dup group, to force allLen1=FALSE -setkey(X) -test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case -test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case +# 819-820 moved to benchmark.Rraw, #5517 # Optimization of lapply(,"+"), #2212 DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12) @@ -2454,24 +2512,7 @@ i = data.frame(foo=1) test(859, DT[i], DT[J(i)]) test(860, DT[i], DT[data.table(i)]) -# test no memory leak, #2191 and #2284 -# These take a few seconds each, and it's important to run these on CRAN to check no leak -gc(); before = gc()["Vcells","(Mb)"] -for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB -gc(); after = gc()["Vcells","(Mb)"] -test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin - -gc(); before = gc()["Vcells","(Mb)"] -DF = data.frame(x=1:20, y=runif(20)) -for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } -gc(); after = gc()["Vcells","(Mb)"] -test(862, after < before+0.5) - -gc(); before = gc()["Vcells","(Mb)"] -DT = data.table(x=1:20, y=runif(20)) -for (i in 1:2000) { x <- DT[1:5,]; rm(x) } -gc(); after = gc()["Vcells","(Mb)"] -test(863, after < before+0.5) +# 861-863 moved to benchmark.Rraw, #5517 # rbindlist should look for the first non-empty data.table - New changes (from Arun). 
Explanation below: # Even if data.table is empty, as long as there are column names, they should be considered. @@ -2776,7 +2817,8 @@ test(950, fread('A,B,C\n1,+,4\n2,-,5\n3,-,6\n'), data.table(A=1:3,B=c("+","-","- # catching misuse of `:=` x = data.table(a=1:5) -test(951, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways") +test(951.1, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways") +test(951.2, x[,{b=a+3; let(c=b)}], error="defined for use in j, once only and in particular ways") # fread colClasses input = 'A,B,C\n01,foo,3.140\n002,bar,6.28000\n' @@ -2827,7 +2869,8 @@ test(978.3, fread(input, skip=9), data.table(E=9:10, F=11:12)) # mixed add and update in same `:=` bug/crash, #2528 and #2778 DT = data.table(x=rep(1:2, c(3,2)), y=6:10) DT[, z:=.GRP, by=x] # first assignment -test(979, DT[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add +test(979.1, copy(DT)[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add +test(979.2, copy(DT)[, let(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # and example from http://stackoverflow.com/a/14732348/403310 : dt1 = fread("Date,Time,A,B 01/01/2013,08:00,10,30 @@ -2841,13 +2884,18 @@ dt2 = fread("Date,A,B,C 02/01/2013,200,400,2") setkey(dt1, "Date") setkey(dt2, "Date") -test(980, dt1[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], +test(980.1, copy(dt1)[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], + data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) +test(980.2, copy(dt1)[dt2, let(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) DT = data.table(A=1:2,B=3:4,C=5:6) -test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to 
group 1 of size 1 in column 'B'") +test(981.1, copy(DT)[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'") +test(981.2, copy(DT)[,let(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'") DT = data.table(A=1:2,B=3:4,C=5:6) -test(982, DT[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], +test(982.1, copy(DT)[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # Also note that G is not yet iterative. In future: c(12,14) +test(982.2, copy(DT)[, let(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], + data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # rbindlist binding factors, #2650 test(983, rbindlist(list(data.table(factor(c("A","A","B","C","A"))), data.table(factor(c("B","F","A","G"))))), data.table(V1=factor(c("A","A","B","C","A","B","F","A","G")))) @@ -3164,13 +3212,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.20, melt(DT, id.vars=1:2), data.table(A=1:2, B=3:4, variable=factor(rep(1L, 4L), labels="D"), value=5:8)) - # segfault of unprotected var caught with the help of address sanitizer; was test 1509 - set.seed(1) - val = sample(c(1:5, NA), 1e4L, TRUE) - dt <- setDT(replicate(100L, val, simplify=FALSE)) - ## to ensure there's no segfault... 
- ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) - test(1035.21, ans, ans) + # 1035.21 moved to benchmark.Rraw, #5517 # improper levels fix, #1359; was test 1563 dt = data.table(id=1:3, x=NA_character_, y=c('a', NA_character_, 'c')) @@ -3186,6 +3228,12 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.25, melt(dt, id.vars=NULL, measure.vars=-1), error="One or more values in 'measure.vars'") test(1035.26, melt(dt, id.vars=5, measure.vars=-1), error="One or more values in 'id.vars'") test(1035.27, melt(dt, id.vars=1, measure.vars=-1), error="One or more values in 'measure.vars'") + test(1035.28, melt(dt, measure.vars=list("a")), error="One or more values in 'measure.vars'") + test(1035.29, melt(dt, measure.vars=NA_integer_, id.vars="y"), error="One or more values in 'measure.vars'") + test(1035.291, melt(dt, measure.vars=NA_integer_, id.vars=NULL), error="One or more values in 'measure.vars'") + test(1035.30, melt(dt, id.vars=NA_integer_), error="One or more values in 'id.vars'") + test(1035.31, melt(dt, measure.vars=NA_character_), error="One or more values in 'measure.vars'") + test(1035.32, melt(dt, id.vars=NA_character_), error="One or more values in 'id.vars'") if (test_R.utils) { # dup names in variable used to generate malformed factor error and/or segfault, #1754; was test 1570 @@ -3267,18 +3315,8 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") - # test dispatch for non-data.table objects, #4864. 
- if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { - test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - error="The melt generic in data.table has been passed a data.frame") - } else { - # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 - # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) - # 3) in dev locally I have reshape2 installed to run caret in other.Rraw - test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), - warning="The melt generic in data.table has been passed a data.frame") - } + # 1038 moved to other.Rraw, #5517 + } # sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105 @@ -3422,10 +3460,22 @@ test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the # Tests for #2531. `:=` loses POSIXct or ITime attribute: # first test from this SO post: http://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table +set.seed(1) dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] dt[, min.group.date := as.IDate(min(date)), by = group] -test(1084, class(dt$min.group.date), c("IDate", "Date")) +test(1084.1, class(dt$min.group.date), c("IDate", "Date")) + +# min.IDate on empty input NA, #2256 +# non-optimized grouping first: +test(1084.2, dt[, min(date[date>"1999-12-01"]), by=group], data.table(group=1:2, V1=as.IDate(c("1999-12-14",NA)))) +test(1084.3, dt[, max(date[date<"1997-08-01"]), by=group], data.table(group=1:2, V1=as.IDate(c(NA,"1997-07-19")))) +dt[group==2, date:=NA] # make group 2 an all-NA group +# GForce grouping with na.rm=FALSE|TRUE on the all-NA group +test(1084.4, dt[, min(date, na.rm=TRUE), by=group], data.table(group=1:2, V1=as.IDate(c("1997-12-06",NA)))) +test(1084.5, dt[, min(date), by=group], data.table(group=1:2, 
V1=as.IDate(c("1997-12-06",NA)))) +test(1084.6, dt[, max(date, na.rm=TRUE), by=group], data.table(group=1:2, V1=as.IDate(c("1999-12-14",NA)))) +test(1084.7, dt[, max(date), by=group], data.table(group=1:2, V1=as.IDate(c("1999-12-14",NA)))) dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] @@ -3826,7 +3876,10 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) +options(datatable.optimize=0L) test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +options(datatable.optimize=2L) +test(1133.75, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3917,8 +3970,12 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -ans = copy(DT) -test(1143.4, DT[,`:=`(lx=tail(x,1L)), by=y], ans[, lx := x]) +options(datatable.optimize=0L) +test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +options(datatable.optimize=2L) +test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) # FR 
#2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 @@ -3958,7 +4015,8 @@ if (test_longdouble) { old = getNumericRounding() set.seed(6) - x = rnorm(1e6)*1e4 + x = rnorm(1e4)*1e4 # first 1e4 reduced from 1e6 to save ram, #5517 + x = c(x, 11969.235757385, 11969.235757322) # add back 2 numbers from the 1e6 sample whose order is changed in test 1147.3 ans = base::sort.list(x, method="shell") setNumericRounding(0) test(1147.1, ans, forderv(x)) @@ -3992,16 +4050,7 @@ if (test_longdouble) { test(1149.1, forderv(integer(0)), integer(0)) test(1149.2, forderv(numeric(0)), integer(0)) -# test uniqlengths -set.seed(45) -x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) -ox <- forderv(x) -o1 <- uniqlist(list(x), ox) -test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -o1 <- uniqlist(list(x)) -test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -rm(list=c("x","ox","o1")) -gc() +# 1151 moved to benchmark.Rraw, #5517 # #67 fix - grouping with .SDcols gave "symbol not subsettable error" - consequence of FR #355 implementation dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20)) @@ -4041,21 +4090,7 @@ setkey(dt, x) test(1155.4, dt[J(NaN)], dt[is.nan(x)]) test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)]) -# Fix for (usually small) memory leak when grouping, #2648. -# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). -DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) -before = gc()["Vcells",2] -for (i in 1:50) DT[, sum(B), by=A] -after = gc()["Vcells",2] -test(1157, after < before+3) # +3 = 3MB -# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. - -# Similar for when dogroups writes less rows than allocated, #2648. 
-DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) -before = gc()["Vcells",2] -for (i in 1:50) DT[ , unlist(.SD), by = 'k'] -after = gc()["Vcells",2] -test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 +# 1157-1158 moved to benchmark.Rraw, #5517 # tests for 'setDT' - convert list, DF to DT without copy x <- data.frame(a=1:4, b=5:8) @@ -4143,6 +4178,10 @@ DT = data.table(A=c(utf8_strings, latin1_strings), B=1:4) test(1162.21, is.sorted(DT), FALSE) setkey(DT) test(1162.22, is.sorted(DT), TRUE) +# Issue #5070 +DT = data.table(x2 = rep(NA_character_, 2)) +test(1162.23, is.sorted(DT)) +test(1162.24, is.sorted(rep(NA_character_, 2))) # FR #351 - last on length=0 arguments x <- character(0) @@ -4363,48 +4402,46 @@ seed = as.integer(Sys.time()) # sample(9999L, 1L) temporary fix, because all the seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") # no NaN (because it's hard to match with base::order); tested below in 1988.4-8 set.seed(seed) -foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, sep="") +foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, collapse="") i1 = as.integer(sample(c(-100:100), 1e3, TRUE)) i2 = as.integer(sample(c(-100:100, -1e6, 1e6), 1e3, TRUE)) d1 = as.numeric(sample(c(-100:100,Inf,-Inf), 1e3, TRUE)) d2 = as.numeric(rnorm(1e3)) -c1 = sample(c(letters), 1e3, TRUE) -c2 = sample(foo(200), 1e3, TRUE) +c1 = sample(letters, 1e3, TRUE) +c2 = sample(foo(50), 1e3, TRUE) DT = data.table(i1, i2, d1, d2, c1, c2) # randomise col order as well colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -ans = vector("list", length(names(DT))) test_no = 1223.0 oldnfail = nfail -for (i in seq_along(names(DT))) { - cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) - ans[[i]] = combn(names(DT), i, function(x) { - tmp = 
apply(cj, 1, function(y) { +for (nvars in seq_along(names(DT))) { + signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) + combn(names(DT), nvars, simplify=FALSE, function(x) { # simplify=FALSE needed for R 3.1.0 + for (i in seq_len(nrow(signs))) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { - if (y[j] == 1L) + if (signs[i,j] == 1L) as.name(x[j]) else { - if (class(DT[[x[j]]]) =="character") + if (is.character(DT[[x[j]]])) as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j]))))) else as.call(list(as.name("-"), as.name(x[j]))) } }) )) - test(test_no, forderv(DT, by=x, order=y), with(DT, eval(ll))) - }) - dim(tmp)=NULL - list(tmp) + test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + } + integer() }) } -ans = NULL if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce +rm_all() # fix for bug #44 - unique on null data.table should return null data.table test(1224, unique(data.table(NULL)), data.table(NULL)) @@ -4500,7 +4537,7 @@ if (base::getRversion() < "3.3.0") { # Test for optimisation of 'order' to 'forder'. Copied to benchmarks.Rraw too. 
set.seed(45L) -DT = data.table(x=sample(1e2, 1e5, TRUE), y=sample(1e2, 1e5, TRUE)) +DT = data.table(x=sample.int(1e2, 1e3, TRUE), y=sample.int(1e2, 1e3, TRUE)) # 1e5 reduced again to 1e3, #5517 test(1241, DT[order(x,-y)], # optimized to forder() DT[base_order(x,-y)]) # not optimized @@ -4774,7 +4811,7 @@ test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], # Wide range numeric and integer64, to test all bits old_rounding = getNumericRounding() -x = sample( c(seq(-1e100, 1e100, length.out=1e5), c(seq(-1e-100,1e-100,length.out=1e5))) ) +x = sample( c(seq(-1e100, 1e100, length.out=1e3), c(seq(-1e-100,1e-100,length.out=1e3))) ) # 1e5 reduced to 1e3, #5517 setNumericRounding(0) test(1269, forderv(x), base::order(x)) setNumericRounding(2) # not affected by rounding @@ -5096,8 +5133,8 @@ DT = DT[1L] set(DT,1L,"b",FALSE) # passing 1L as i here is needed to avoid column plonk, so changes the logical singleton in place test(1297, as.integer(TRUE[1]), 1L) # In R 3.1, TRUE[1] returns the global TRUE but TRUE doesn't yet (parses as new vector) test(1298, as.integer(TRUE), 1L) -# orignal example, verbatim from James Sams : -upc_table = data.table(upc=1:100000, upc_ver_uc=rep(c(1,2), times=50000), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=25000), product_module_code=rep(1:4, times=25000), ignore.column=2:100001) +# original example, verbatim from James Sams; sizes reduced to save ram in #5517 +upc_table = data.table(upc=1:1000, upc_ver_uc=rep(c(1,2), times=500), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=250), product_module_code=rep(1:4, times=250), ignore.column=2:1001) test(1299, upc_table[, .N, by=list(upc, upc_ver_uc)][,max(N)], 1L) # all size 1 groups test(1300, upc_table[, list(is_PL, product_module_code), keyby=list(upc, upc_ver_uc)][,upc[1:3]], 1:3L) # was warning "internal TRUE value has been modified" rm(list="upc_table") @@ -6587,63 +6624,88 @@ test(1462.3, DT[, sum(unlist(mget(cols, as.environment(-1)))), by=x], DT[, sum(u # test for 'shift' x=1:5
y=factor(x) -test(1463.01, shift(x,1L), as.integer(c(NA, 1:4))) -test(1463.02, shift(x,1:2), list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3)))) -test(1463.03, shift(x,1L, 0L), as.integer(c(0L, 1:4))) -test(1463.04, shift(x,1L, type="lead"), as.integer(c(2:5, NA))) -test(1463.05, shift(x,1:2, type="lead"), list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA)))) -test(1463.06, shift(x,1L, 0L, type="lead"), as.integer(c(2:5, 0L))) -test(1463.07, shift(y,1L), factor(c(NA,1:4), levels=1:5)) -test(1463.08, shift(y,1L, type="lead"), factor(c(2:5, NA), levels=1:5)) +test(1463.01, shift(x,1L), as.integer(c(NA, 1:4))) +test(1463.02, shift(x,1:2), list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3)))) +test(1463.03, shift(x,1L, 0L), as.integer(c(0L, 1:4))) +test(1463.04, shift(x,1L, type="lead"), as.integer(c(2:5, NA))) +test(1463.05, shift(x,1:2, type="lead"), list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA)))) +test(1463.06, shift(x,1L, 0L,type="lead"), as.integer(c(2:5, 0L))) +test(1463.07, shift(y,1L), factor(c(NA,1:4), levels=1:5)) +test(1463.08, shift(y,1L, type="lead"), factor(c(2:5, NA), levels=1:5)) +test(1463.09, shift(x,1L, type="cyclic"), as.integer(c(5, 1:4))) +test(1463.10, shift(x,1:2, type="cyclic"), list(as.integer(c(5, 1:4)), as.integer(c(4:5, 1:3)))) +test(1463.11, shift(x,-1L, type="cyclic"), as.integer(c(2:5, 1))) +test(1463.12, shift(x,-(1:2),type="cyclic"), list(as.integer(c(2:5, 1)), as.integer(c(3:5,1:2)))) x=as.numeric(x) -test(1463.09, shift(x,1L), as.numeric(c(NA, 1:4))) -test(1463.10, shift(x,1:2), list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3)))) -test(1463.11, shift(x,1L, 0L), as.numeric(c(0L, 1:4))) -test(1463.12, shift(x,1L, type="lead"), as.numeric(c(2:5, NA))) -test(1463.13, shift(x,1:2, type="lead"), list(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA)))) -test(1463.14, shift(x,1L, 0L, type="lead"), as.numeric(c(2:5, 0L))) +test(1463.13, shift(x,1L), as.numeric(c(NA, 1:4))) +test(1463.14, shift(x,1:2), 
list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3)))) +test(1463.15, shift(x,1L, 0L), as.numeric(c(0L, 1:4))) +test(1463.16, shift(x,1L, type="lead"), as.numeric(c(2:5, NA))) +test(1463.17, shift(x,1:2, type="lead"), list(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA)))) +test(1463.18, shift(x,1L, 0L,type="lead"), as.numeric(c(2:5, 0L))) +test(1463.19, shift(x,1L, type="cyclic"), as.numeric(c(5, 1:4))) +test(1463.20, shift(x,1:2, type="cyclic"), list(as.numeric(c(5, 1:4)), as.numeric(c(4:5, 1:3)))) +test(1463.21, shift(x,-1L, type="cyclic"), as.numeric(c(2:5, 1))) +test(1463.22, shift(x,-(1:2),type="cyclic"), list(as.numeric(c(2:5, 1)), as.numeric(c(3:5,1:2)))) + if (test_bit64) { x=as.integer64(x) - test(1463.15, shift(x,1L), as.integer64(c(NA, 1:4))) - test(1463.16, shift(x,1:2), list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3)))) - test(1463.17, shift(x,1L, 0L), as.integer64(c(0L, 1:4))) - test(1463.18, shift(x,1L, type="lead"), as.integer64(c(2:5, NA))) - test(1463.19, shift(x,1:2, type="lead"), list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA)))) - test(1463.20, shift(x,1L, 0L, type="lead"), as.integer64(c(2:5, 0L))) + test(1463.23, shift(x,1L), as.integer64(c(NA, 1:4))) + test(1463.24, shift(x,1:2), list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3)))) + test(1463.25, shift(x,1L, 0L), as.integer64(c(0L, 1:4))) + test(1463.26, shift(x,1L, type="lead"), as.integer64(c(2:5, NA))) + test(1463.27, shift(x,1:2, type="lead"), list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA)))) + test(1463.28, shift(x,1L, 0L, type="lead"), as.integer64(c(2:5, 0L))) + test(1463.29, shift(x,1L, type="cyclic"), as.integer64(c(5, 1:4))) + test(1463.30, shift(x,1:2, type="cyclic"), list(as.integer64(c(5, 1:4)), as.integer64(c(4:5, 1:3)))) + test(1463.31, shift(x,-1L, type="cyclic"), as.integer64(c(2:5, 1))) + test(1463.32, shift(x,-(1:2), type="cyclic"), list(as.integer64(c(2:5, 1)), as.integer64(c(3:5,1:2)))) } x=as.character(x) -test(1463.21, 
shift(x,1L), as.character(c(NA, 1:4))) -test(1463.22, shift(x,1:2), list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3)))) -test(1463.23, shift(x,1L, 0L), as.character(c(0L, 1:4))) -test(1463.24, shift(x,1L, type="lead"), as.character(c(2:5, NA))) -test(1463.25, shift(x,1:2, type="lead"), list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA)))) -test(1463.26, shift(x,1L, 0L, type="lead"), as.character(c(2:5, 0L))) +test(1463.33, shift(x,1L), as.character(c(NA, 1:4))) +test(1463.34, shift(x,1:2), list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3)))) +test(1463.35, shift(x,1L, 0L), as.character(c(0L, 1:4))) +test(1463.36, shift(x,1L, type="lead"), as.character(c(2:5, NA))) +test(1463.37, shift(x,1:2, type="lead"), list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA)))) +test(1463.38, shift(x,1L, 0L, type="lead"), as.character(c(2:5, 0L))) +test(1463.39, shift(x,1L, type="cyclic"), as.character(c(5, 1:4))) +test(1463.40, shift(x,1:2, type="cyclic"), list(as.character(c(5, 1:4)), as.character(c(4:5, 1:3)))) +test(1463.41, shift(x,-1L, type="cyclic"), as.character(c(2:5, 1))) +test(1463.42, shift(x,-(1:2), type="cyclic"), list(as.character(c(2:5, 1)), as.character(c(3:5,1:2)))) x=c(TRUE,FALSE,TRUE,FALSE,TRUE) -test(1463.27, shift(x,1L), c(NA, x[-5L])) -test(1463.28, shift(x,1:2), list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)]))) -test(1463.29, shift(x,1L, 0L), c(FALSE, x[-5L])) -test(1463.30, shift(x,1L, type="lead"), c(x[-1L], NA)) -test(1463.31, shift(x,1:2, type="lead"), list(c(x[-1L],NA), c(x[-(1:2)],NA,NA))) -test(1463.32, shift(x,1L, 0L, type="lead"), c(x[-(1)], FALSE)) +test(1463.43, shift(x,1L), c(NA, x[-5L])) +test(1463.44, shift(x,1:2), list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)]))) +test(1463.45, shift(x,1L, 0L), c(FALSE, x[-5L])) +test(1463.46, shift(x,1L, type="lead"), c(x[-1L], NA)) +test(1463.47, shift(x,1:2, type="lead"), list(c(x[-1L],NA), c(x[-(1:2)],NA,NA))) +test(1463.48, shift(x,1L, 0L, type="lead"), c(x[-(1)], FALSE)) +test(1463.49, 
shift(x,1L, type="cyclic"), c(x[5L], x[-5L])) +test(1463.50, shift(x,1:2, type="cyclic"), list(c(x[5L], x[-5L]), c(x[4L:5L], x[-4L:-5L]))) +test(1463.51, shift(x,-1L, type="cyclic"), c(x[-1L], x[1L])) +test(1463.52, shift(x,-(1:2), type="cyclic"), list(c(x[-1L], x[1L]), c(x[-1L:-2L], x[1L:2L]))) # for list of list, #1595 x = data.table(foo = c(list(c("a","b","c")), list(c("b","c")), list(c("a","b")), list(c("a"))), id = c(1,1,2,2)) -test(1463.33, x[, shift(list(foo)), by=id], +test(1463.53, x[, shift(list(foo)), by=id], data.table(id=c(1,1,2,2), V1=list(NA, c("a", "b", "c"), NA, c("a", "b")))) -test(1463.34, x[, shift(list(foo), type="lead", fill=NA_integer_), by=id], +test(1463.54, x[, shift(list(foo), type="lead", fill=NA_integer_), by=id], data.table(id=c(1,1,2,2), V1=list(c("b", "c"), NA_integer_, c("a"), NA_integer_))) +test(1463.55, x[, shift(list(foo), 1, type="cyclic"), by=id], + data.table(id=c(1,1,2,2), V1=list(c("b","c"), c("a","b","c"), c("a"), c("a","b")))) +test(1463.56, x[, shift(list(foo), -1, type="cyclic"), by=id], + data.table(id=c(1,1,2,2), V1=list(c("b","c"), c("a","b","c"), c("a"), c("a","b")))) # Fix for #1009 segfault in shift val = runif(1) -test(1463.35, shift(val, 2L), NA_real_) -test(1463.36, shift(val, 2L, type="lead"), NA_real_) +test(1463.57, shift(val, 2L), NA_real_) +test(1463.58, shift(val, 2L, type="lead"), NA_real_) -test(1463.37, shift(1:5, 1L, fill=c(1:2)), error="fill must be a vector of length") -test(1463.38, shift(mean), error="type 'closure' passed to shift(). Must be a vector, list, data.frame or data.table") +test(1463.59, shift(1:5, 1L, fill=c(1:2)), error="fill must be a vector of length") +test(1463.60, shift(mean), error="type 'closure' passed to shift(). Must be a vector, list, data.frame or data.table") # add tests for date and factor? 
@@ -6652,12 +6714,19 @@ x = 1:10 nm = c("x_lag_1", "x_lag_2") ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8))) setattr(ans, 'names', nm) -test(1463.39, shift(x, 1:2, give.names=TRUE), ans) +test(1463.61, shift(x, 1:2, give.names=TRUE), ans) + +# 1463.62-65 tested nanotime moved to other.Rraw 21, #5516 + +# shift circular +x = 1:5 +test(1463.66, shift(x, 5, type="cyclic"), x) +test(1463.67, shift(x, -5, type="cyclic"), x) +test(1463.68, shift(x, 6, type="cyclic"), shift(x, 1, type="cyclic")) +test(1463.69, shift(x, -6, type="cyclic"), shift(x, -1, type="cyclic")) +# test warning +test(1463.70, shift(x, 1, fill=1, type="cyclic"), c(5L, 1L:4L), warning="Provided argument fill=1 will be ignored since type='shift'.") -if (test_nanotime) { - test(1463.40, shift(nanotime(1:4) ), c(nanotime::nanotime(NA), nanotime::nanotime(1:3))); - test(1463.41, shift(nanotime(1:4), fill=0L), c(nanotime::nanotime(0L), nanotime::nanotime(1:3))); -} # FR #686 DT = data.table(a=rep(c("A", "B", "C", "A", "B"), c(2,2,3,1,2)), foo=1:10) @@ -6680,101 +6749,7 @@ test(1464.12, rleidv(DT, 1:2), ans<-INT(1,2,3,4,5,6,6,6,7,8,8,9,10,11,12,13,14,1 test(1464.13, rleidv(DT, 2:1), ans) test(1464.14, rleidv(DT, c(3,1)), INT(1,1,2,2,3,4,5,5,6,7,8,9,10,11,12,13,14,15,16,17)) -if (test_xts) { - - Sys.unsetenv("_R_CHECK_LENGTH_1_LOGIC2_") - # package xts has an issue with an && clause (https://github.com/joshuaulrich/xts/pull/269). When that is fixed in xts and released to CRAN, we can remove this Sys.unsetenv - # Sys.setenv is called again at the end of this xts branch. The original env variable value was stored at the top of this file and restored at the end. 
- - # data.table-xts conversion #882 - # Date index - dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - dt_xt = as.data.table(xt) - xt_dt = as.xts.data.table(dt) - test(1465.01, all.equal(dt, dt_xt, check.attributes = FALSE)) - test(1465.02, xt, xt_dt) - # POSIXct index - dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - dt_xt = as.data.table(xt) - xt_dt = as.xts.data.table(dt) - test(1465.03, all.equal(dt, dt_xt, check.attributes = FALSE)) - test(1465.04, xt, xt_dt) - # index types returned from to.period - dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index) - xt_w = xts::to.weekly(xt) - xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w)) - xt_m = xts::to.monthly(xt) - xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m)) - xt_q = xts::to.quarterly(xt) - xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q)) - xt_y = xts::to.yearly(xt) - xt_dt_xt_y = as.xts.data.table(as.data.table(xt_y)) - test(1465.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE)) - test(1465.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE)) - test(1465.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE)) - test(1465.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE)) - - test(1465.09, 
xts::last(1:5), 5L) # was test 1531 - - # xts issue from Joshua, #1347 - x = as.Date(1:5, origin="2015-01-01") - test(1465.10, last(x), tail(x, 1L)) # was test 1559 - - x = xts(1:100, Sys.Date()+1:100) - test(1465.11, last(x,10), x[91:100,]) # was test 841 - # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. - # But that isn't tested by R CMD check because xts is loaded above data.table, there. - # So to make this test is relevant, run it in fresh R session directly, after: "require(xts);require(data.table)" - # rather than: "require(data.table);require(xts)" - # Which was the main thrust of bug#2312 fixed in v1.8.3 - - # fix for #1484; was test 1589 - x = xts::as.xts(8, order.by = as.Date("2016-01-03")) - test(1465.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE)) - - # IDate support in as.xts.data.table #1499; was test 1663 - dt <- data.table(date = c(as.IDate("2014-12-31"), - as.IDate("2015-12-31"), - as.IDate("2016-12-31")), - nav = c(100,101,99), - key = "date") - dt.xts <- as.xts.data.table(dt) - test(1465.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) - - # additional coverage missing uncovered in #3117 - dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - test(1465.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) - names(xt)[1L] = 'index' - test(1465.15, as.data.table(xt), error = 'Input xts object should not') - names(xt)[1L] = 'quantity' - setcolorder(dt, c(3, 1, 2)) - if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286 - test(1465.16, as.xts(dt), error = 
'data.table must have a time based') - setcolorder(dt, c(2, 3, 1)) - dt[ , char_col := 'a'] - test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') - if (base::getRversion() < "3.6.0") rm(as.xts) - - # 890 -- key argument for as.data.table.xts - x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) - old = options(datatable.verbose=FALSE) - test(1465.18, capture.output(as.data.table(x, key="index")), - c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", - " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", - " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", - " 9: 1970-01-10 9", "10: 1970-01-11 10")) - options(old) - - # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 - M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above - test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) - - Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) -} +# 1465 tested xts moved to other.Rraw 18, #5516 # as.data.table.default #969 ar <- array(NA, dim=c(10,4),dimnames = list(NULL,paste("col",1:4,sep=""))) @@ -7436,7 +7411,8 @@ test(1536, duplicated(dt, incomparables=TRUE), error = "argument 'incomparables test(1537 , names(melt(dt, id.vars=1L, variable.name = "x", value.name="x")), c("x", "x.1", "x.2"), output = "Duplicate column names") # test for tables() -test(1538, tables(), output = "Total:") +test(1538.1, tables(), output="Total:") +test(1538.2, !is.unsorted(tables(order.col="NROW")$NROW), output="Total:") # uniqueN not support list-of-list: reverted #1224 d1 <- data.table(a = 1:4, l = list(list(letters[1:2]),list(Sys.time()),list(1:10),list(letters[1:2]))) @@ -7574,18 +7550,8 @@ dtab <- data.table(pid = factor(c("i", "nouana")), c("pid", "year")) test(1541, key(dtp[dtab]), c("pid", "year")) -# fix DT[TRUE, :=] using too much working memory for i, #1249 -if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R 
not compiled with memory profiling enabled - f = tempfile() - N = 1000000 # or any large number of rows - DT = data.table(A=1:N, B=rnorm(N)) - DT[TRUE, B := B * 2] # stabilize with initial dummy update - Rprofmem(f) - DT[TRUE, B := B * 2] # or some in-place update - Rprofmem(NULL) - test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only - unlink(f) -} +# 1542.0 moved to benchmark.Rraw, #5517 + # DT[TRUE] should shallow copy as v1.11.8 and earlier did (#3214); in future more will shallow copy too DT = data.table(id = 1:5, key="id") DT1 = DT[TRUE] @@ -7734,10 +7700,7 @@ ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") -# #1167 print.data.table row id in non-scientific notation -DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) -test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) -rm(DT) +# 1549 moved to benchmark.Rraw, #5517 # PR by @dselivanov # fixes #504 - handle nastring while reading (without coercion to character) @@ -7794,10 +7757,14 @@ read_table = function(str, ...) 
{ test(1552.1, fread(str, na.strings="#N/A"), read_table(str, na.strings="#N/A")) test(1552.2, fread(str, na.strings=c("#N/A", "-999")), read_table(str, na.strings=c("#N/A", "-999"))) test(1552.3, fread(str, na.strings=c("#N/A", "-999", "+1")), read_table(str, na.strings=c("#N/A", "-999", "+1"))) -test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")), - error="NAstring <<1>> is recognized as type boolean.*not permitted") +test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")), read_table(str, na.strings=c("#N/A", "-999", "+1", "1"))) # enabled by FR #2927 test(1552.5, fread(str, na.strings=c("#N/A", "-999", "FALSE")), error="NAstring <>.*boolean.*not permitted") test(1552.6, fread("A\n1.0\n2\n-", na.strings=c("-")), data.table(A=c(1.0, 2.0, NA))) +test(1552.7, fread(str, na.strings=c("#N/A", "-999", "+1", "1"), logical01=TRUE), + error="NAstring <<1>> and logical01=TRUE.*not permitted") +str = "a,b,c\n0,1,2\n1,0,2" +test(1552.8, fread(str, na.strings = "0"), data.table(a=c(NA,1L), b=c(1L,NA), c=c(2L,2L))) +test(1552.9, fread(str, na.strings = c("0","1")), data.table(a=c(NA,NA), b=c(NA,NA), c=c(2L,2L))) # FR #1177: 'quote' option of 'print.data.table' DT1 <- data.table(s1=paste(" ",LETTERS[1:5],sep=""),s2=LETTERS[1:5]) @@ -8720,20 +8687,20 @@ test(1613.601, all.equal(data.table(a=1), data.frame(a=1)), "target is data.tabl test(1613.602, all.equal(data.table(a=1), data.frame(a=1), check.attributes = FALSE)) test(1613.603, all.equal(data.table(a=1), list(a=1), check.attributes = FALSE)) test(1613.604, all.equal(data.table(a=1), 1, check.attributes = FALSE)) -test(1613.605, all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE), "target is data.table but current is not and failed to be coerced to it") +test(1613.605, !isTRUE(all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE))) L1 = list(a = data.table(1), b = setattr("foo1613", "tbl", data.table(1))) L2 = list(a 
= 1, b = setattr("foo1613", "tbl", 1)) test(1613.606, all(grepl("target is data.table, current is numeric", all.equal(L1, L2)))) -as.data.table.foo1613 = function(x) { # test as.data.table coerce of 'current' argument +registerS3method("as.data.table", "foo1613", function(x) { # test as.data.table coerce of 'current' argument if (!length(x)) warning("empty foo1613") - as.data.table(unclass(foo1613)) -} -registerS3method("as.data.table", "foo1613", as.data.table.foo1613) + as.data.table(unclass(x)) +}) foo1613 = structure(list(NULL), class="foo1613") test(1613.607, all.equal(data.table(), foo1613, check.attributes=FALSE)) foo1613 = structure(list(), class="foo1613") test(1613.608, all.equal(data.table(), foo1613, check.attributes=FALSE), warning="empty") -rm(as.data.table.foo1613, foo1613) +registerS3method("as.data.table", "foo1613", as.data.table.default) +# search below in this file for "registerS3method" for comments about it DT1 <- data.table(a = 1:4, b = letters[1:4], .seqn = 5L) DT2 <- data.table(a = 4:1, b = letters[4:1], .seqn = 5L) @@ -9134,6 +9101,8 @@ dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") options(datatable.optimize=Inf) +rm_all() + #1389 - split.data.table - big chunk of unit tests set.seed(123) dt = data.table(x1 = rep(letters[1:2], 6), x2 = rep(letters[3:5], 4), x3 = rep(letters[5:8], 3), y = rnorm(12)) @@ -9225,14 +9194,14 @@ test(1639.056, TRUE, all( sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset are expanded -test(1639.057, TRUE, all( - is.list(l), identical(names(l), c("b","a")), - sapply(l, function(x) !is.data.table(x) && is.list(x)), - sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), - identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), 
c=c("e","g","f","h")))), - sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6), - sapply(l, sapply, sapply, ncol) == rep(4L, 24) -)) +# memtest tracing in #5520 showed this split() and the one before 1639.188 (both by 3 columns) account for the RAM usage in 1639. But they should be gc()'d eventually after rm_all(). +test(1639.0571, is.list(l)) +test(1639.0572, names(l), c("b","a")) +test(1639.0573, all(sapply(l, function(x) !is.data.table(x) && is.list(x)))) +test(1639.0574, all(sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)))) +test(1639.0575, lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))) +test(1639.0576, all(sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6))) +test(1639.0577, all(sapply(l, sapply, sapply, ncol) == rep(4L, 24))) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # multi col rev test(1639.058, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), @@ -9597,6 +9566,7 @@ test(1639.141, all(sapply(dtL, truelength) > 1000)) dt <- data.table(x = factor("a"), y = 1) test(1639.142, x = split(dt, by = "x"), y = list(a = dt)) test(1639.143, x = split(dt, by = "y"), y = list(`1` = dt)) +rm_all() # allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax # patch for #1615. 
Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] @@ -9609,10 +9579,10 @@ test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c( # tests for non-equi joins # function to create a random data.table with all necessary columns nq_fun = function(n=100L) { - i1 = sample(sample(n, 10L), n, TRUE) - i2 = sample(-n/2:n/2, n, TRUE) - i3 = sample(-1e6:1e6, n, TRUE) - i4 = sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE) + i1 = sample(sample.int(n, 10L), n, TRUE) + i2 = sample.int(n, n, TRUE) - as.integer(n/2) # this used to be type numeric before #5517 which didn't seem intentional + i3 = sample.int(2e6, n, TRUE) - as.integer(1e6) # used to sample from -1e6:1e6 which if allocated would be 8MB, #5517 + i4 = sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE) d1 = sample(rnorm(10L), n, TRUE) d2 = sample(rnorm(50), n, TRUE) @@ -9624,15 +9594,55 @@ nq_fun = function(n=100L) { dt = data.table(i1,i2,i3,i4, d1,d2,d3,d4, c1,c2) if (test_bit64) { - I1 = as.integer64(sample(sample(n, 10L), n, TRUE)) - I2 = as.integer64(sample(-n/2:n/2, n, TRUE)) - I3 = as.integer64(sample(-1e6:1e6, n, TRUE)) - I4 = as.integer64(sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE)) + I1 = as.integer64(sample(sample.int(n, 10L), n, TRUE)) + I2 = as.integer64(sample.int(n, n, TRUE) - as.integer(n/2)) + I3 = as.integer64(sample.int(2e6, n, TRUE) - as.integer(1e6)) # there used to be another -1e6:1e6 here whose altrep likely allocated when sample accessed it, #5517 + I4 = as.integer64(sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE)) dt = cbind(dt, data.table(I1,I2,I3,I4)) } dt } +construct <- function(cols, vals, ops, x, y) { + expr = lapply(seq_along(cols), function(i) { + GT_or_LT = ops[i]==">" || ops[i]=="<" + if (inherits(vals[[i]], "integer64")) { + if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), 
as.name(cols[[i]]), as.integer(vals[[i]]))) + # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN + } else { + if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) + else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) + } + }) + Reduce(function(x,y)call("&",x,y), expr) +} + +check <- function(x, y, cols, ops, mult="all") { + # gather just row numbers here and then select all rows once afterwards, rather than rbindlist + rowNums = unlist(lapply(1:nrow(y), function(i) { + e = construct(cols, y[i, ..cols], ops, x, y) + rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization + if (!length(rowNums) || mult=="all") + rowNums + else if (mult=="first") + rowNums[1L] + else # mult=="last" + rowNums[length(rowNums)] + })) + x[rowNums] +} + +nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { + sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) + ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] + setnames(ans, gsub("^x[.]", "", names(ans))) + setcolorder(ans, names(x))[] +} + +is_only_na <- function(x) is.na(x) & !is.nan(x) + nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { ops = c("==", ">=", "<=", ">", "<") xclass = sapply(x, class) @@ -9643,42 +9653,6 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { thisops[startsWith(cols, "c")] = "==" thisops }) - is_only_na <- function(x) is.na(x) & !is.nan(x) - construct <- function(cols, vals, ops) { - expr = lapply(seq_along(cols), function(i) { - GT_or_LT = ops[i]==">" || ops[i]=="<" - if (inherits(vals[[i]], "integer64")) { - if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), 
as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) - # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN - } else { - if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) - else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) - } - }) - Reduce(function(x,y)call("&",x,y), expr) - } - check <- function(x, y, cols, ops, mult="all") { - # gather just row numbers here and then select all rows once afterwards, rather than rbindlist - rowNums = unlist(lapply(1:nrow(y), function(i) { - e = construct(cols, y[i, ..cols], ops) - rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization - if (!length(rowNums) || mult=="all") - rowNums - else if (mult=="first") - rowNums[1L] - else # mult=="last" - rowNums[length(rowNums)] - })) - x[rowNums] - } - nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { - sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) - ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] - setnames(ans, gsub("^x[.]", "", names(ans))) - setcolorder(ans, names(x))[] - } for (i in seq_along(runcmb)) { thiscols = runcmb[[i]] thisops = runops[[i]] @@ -9691,7 +9665,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { gc() # no longer needed but left in place just in case, no harm } -dt1 = nq_fun(400L) +dt1 = nq_fun(100L) # 400 reduced to 100, #5517 dt2 = nq_fun(50L) x = na.omit(dt1) y = na.omit(dt2) @@ -9895,7 +9869,7 @@ test(1658.34, fwrite(data.table(id=c("A","B","C"), v=c(1.1,0.0,9.9))), output="i test(1658.35, fwrite(data.table(id=1:3,bool=c(TRUE,NA,FALSE)),na="NA",logical01=TRUE), output="\"id\",\"bool\"\n1,1\n2,NA\n3,0") # POSIXct 
-test(1658.36, fwrite(data.table(D = as.POSIXct(seq.Date(as.Date("2038-01-19"), as.Date("2038-01-20"), by = "day")))), +test(1658.36, fwrite(data.table(D = as.POSIXct(seq(as.Date("2038-01-19"), as.Date("2038-01-20"), by = "day")))), output="D\n2038-01-19T00:00:00Z\n2038-01-20T00:00:00Z") # input is of class matrix @@ -10295,15 +10269,17 @@ test(1692, capture.output(as.data.table(structure(57600L, class = "ITime"))), # testing all time part extraction routines (subsumes #874) t <- "2016-08-03 01:02:03.45" -test(1693.1, second(t), 3L) -test(1693.2, minute(t), 2L) -test(1693.3, hour(t), 1L) -test(1693.4, yday(t), 216L) -test(1693.5, wday(t), 4L) -test(1693.6, week(t), 31L) -test(1693.7, month(t), 8L) -test(1693.8, quarter(t), 3L) -test(1693.9, year(t), 2016L) +test(1693.01, second(t), 3L) +test(1693.02, minute(t), 2L) +test(1693.03, hour(t), 1L) +test(1693.04, yday(t), 216L) +test(1693.05, wday(t), 4L) +test(1693.06, week(t), 31L) +test(1693.07, month(t), 8L) +test(1693.08, quarter(t), 3L) +test(1693.09, year(t), 2016L) +test(1693.10, yearmon(t), 2016+7/12) +test(1693.11, yearqtr(t), 2016.5) # fix for #1740 - sub-assigning NAs for factors dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9) @@ -10820,31 +10796,7 @@ test(1738.3, sapply(DT,typeof), c(A="double",B="integer")) test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L)) -if (FALSE) { - # Full range takes too long for CRAN. 
- dts = seq.Date(as.Date("0000-03-01"),as.Date("9999-12-31"),by="day") - dtsCh = as.character(dts) # 36s - dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 - test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) -} else { - # test on CRAN a reduced but important range - dts = seq.Date(as.Date("1899-12-31"),as.Date("2100-01-01"),by="day") - dtsCh = as.character(dts) - test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) -} -DT = data.table(A=dts, B=as.IDate(dts)) -test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) -test(1739.4, typeof(dts), "double") -f = tempfile() -g = tempfile() # Full range -fwrite(DT,f) # 0.092s -write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s -test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) -test(1739.6, readLines(f), readLines(g)) -unlink(f) -unlink(g) -rm(list=c("dtsCh","dts")) -gc() +# 1739 moved to benchmark.Rraw, #5517 # dateTimeAs DT = data.table( @@ -10904,17 +10856,37 @@ setattr(DT[[4]], "tzone", NULL) setattr(DT[[5]], "tzone", NULL) # format() now supports digits = 0, to display nsmall decimal places. -options(digits.secs=0) +# Oct 2022: R-devel changed write.csv behavior to no longer respect digits.secs, #5478. +# For now we'll get out of the way while R-devel discussion is ongoing so that 1.14.4 can +# be submitted to CRAN. +# These tests test fwrite(, dateTimeAs="write.csv") whose +# very point is to match write.csv. Rather than turn off these tests, we'll for now +# continue to test that at least fwrite continues to work as intended. Otherwise +# coverage will drop and we could miss a plain old crash or error bug. +# Note that tzone has been removed above so these tests output the POSIXct in the +# R session's timezone because here dateTimeAs="write.csv" and that's what write.csv does. 
+# This is the reason `y` can't be fixed strings because depending on the timezone of the +# session which is running test.data.table, the results will be different. +# data.table's fwrite achieves local timezone writing (when dateTimeAs="write.csv") via +# an R call to format.POSIXct in fwriteR.c. By default fwrite writes datetime in UTC for +# consistent and reproducible research, which is different to write.csv. +# TODO: revisit when R-devel has settled w.r.t. write.csv behavior. +format_rows_as_csv = function(DT, digits) apply(sapply(DT, format, digits=digits), 1L, paste0, collapse=",") +old=options(digits.secs=0) test(1741.3, x1<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + c("A,B,C,D,E", format_rows_as_csv(DT, digits=0L))) + # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=3) test(1741.4, x2<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + c("A,B,C,D,E", format_rows_as_csv(DT, digits=3L))) + # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=6) test(1741.5, x3<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + c("A,B,C,D,E", format_rows_as_csv(DT, digits=6L))) + # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) # check that extra digits made it into output test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3))) +options(old) # fread should properly handle NA in colClasses argument #1910 test(1743.01, sapply(fread("a,b\n3,a", colClasses=c(NA, "factor")), class), c(a="integer", b="factor")) @@ -11142,12 +11114,13 @@ test(1750.07, # 0 length `by`, must also use `sets=list()`, so 0L rows result nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = character(), .SDcols=c("amount","value"), sets=list(), id=TRUE)), 0L ) -test(1750.08, all( # for any single value 
from dataset there should be always same aggregate result on any level of grouping - sapply(seq_len(nrow(dt)), function(i) uniqueN( +# for any single value from dataset there should be always be the same aggregate result on any level of grouping +# changed from all(sapply()) to for() to save ram, #5517 +for (i in seq_len(nrow(dt))) { + test(1750.08+i/10000, uniqueN( groupingsets(dt[i], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character())), - by=c("amount","value") - )) == 1L -), TRUE) + by=c("amount","value")) == 1L) +} # all grouping id matches in all totals r = groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character()), id=TRUE) test(1750.09, uniqueN( @@ -11314,23 +11287,7 @@ test(1751.3, capture.output(fwrite(DT,na="NA",verbose=FALSE)), c("\"x\"","NA")) test(1751.4, fread({fwrite(DT, f<-tempfile());f}), DT) # the important thing unlink(f) -if (test_nanotime) { - old = options(warnPartialMatchArgs=FALSE) # option off temporarily pending https://github.com/eddelbuettel/nanotime/pull/49 - DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", - "2016-09-29T23:59:00.000000001Z", - "2016-09-29T23:59:00.000000999Z", - "1970-01-01T00:01:01.000001000Z", - "1970-01-01T00:00:00.000000000Z", - "1969-12-31T23:59:59.999999999Z", - "1969-12-31T23:59:59.000000089Z", - "1969-12-31T12:13:14.000000000Z", - "1969-12-31T12:13:14.999999999Z", - "1969-12-31T12:13:14.000000001Z", - "1967-03-15T00:00:00.300000002Z", - "1967-03-15T23:59:59.300000002Z"))) - options(old) - test(1752, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) -} +# 1752 tested nanotime moved to other.Rraw 22, #5516 # check too many fields error from ,\n line ending highlighted in #2044 test(1753.1, fread("X,Y\n1,2\n3,4\n5,6"), data.table(X=INT(1,3,5),Y=INT(2,4,6))) @@ -11392,18 +11349,7 @@ if (test_R.utils) test(1759, 
fread(testDir("alluniquechar.csv.gz"))[c(1,2,499,50 H=c("tokakysooopwtmlkeimzbgpein","hguwmynjhecsxpxldyzlemavmw", "lyclruzkazfqhyxnppaafwcveo","myfqhltlwzwwxyvshwrzrdmfyq"))) -# fread should use multiple threads on single column input. -# tests 2 threads; the very reasonable limit on CRAN -# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) -if (getDTthreads() == 1L) { - cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n") -} else { - N = if (TRUE) 2e6 else 1e9 # offline speed check - fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) - test(1760.1, file.info(f)$size > 4*1024*1024) - test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") - unlink(f) -} +# 1760 moved to benchmark.Rraw, #5517 # fread single column with superfluous fill=TRUE, #2118 test(1761.1, fread("1\n2\n3", fill=TRUE), data.table(V1=1:3)) @@ -11748,10 +11694,10 @@ ld = sapply(same, as.IDate) test(1779.01, uniqueN(ld)==1L) lt = sapply(same[1:2], as.ITime) # exclude date test(1779.02, uniqueN(lt)==1L) -# some random 1e6 timestamps old defaults vs new methods UTC +# some random timestamps old defaults vs new methods UTC intpx = function(x) as.integer(as.POSIXct(x, origin = "1970-01-01", tz = "UTC")) set.seed(1) -i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e5, TRUE) + intpx("2014-10-12") +i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e3, TRUE) + intpx("2014-10-12") # 1e5 reduced to 1e3, #5517 p = as.POSIXct(i, origin = "1970-01-01", tz = "UTC") test(1779.03, identical(as.ITime.default(p), as.ITime(p))) test(1779.04, identical(as.IDate.default(p), as.IDate(p))) @@ -11823,9 +11769,7 @@ test(1812, fread("A,B\n1,2\n3,4\n", skip="4", verbose=TRUE), data.table(V1=3L, V test(1813, fread("A,B\n1,2\n3,4", skip=10L), error="skip=10 but the input only has 3 lines") test(1814, fread("A,B\n1,2\n3,4\n \n\t", skip=3L), error="skip has been set after 
the last non-whitespace") -DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) -fwrite(DT, f<-tempfile()) -test(1815, fread(f, nrows=5), DT[1:5]) #2243 +# 1815 moved to benchmark.Rraw, #5517 test(1816.1, fread("A,E\n1,2\n5,7\n4,6\n\x1A\x1A", verbose=TRUE), data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L)), @@ -11942,14 +11886,7 @@ fwrite(DT, f) test(1825.22, fread(f, colClasses = c(a = "numeric", b = "integer")), DT, warning="Attempt to override column 2.*ignored") unlink(f) -# issue 2351 -set.seed(1) -DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) -fwrite(DT, file=f<-tempfile(), eol="\r") -test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) -cat("id888,42", file=f, append=TRUE) # without final \r after last line -test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) -unlink(f) +# 1826 moved to benchmark.Rraw, #5517 # Issue 2222 test(1827.1, fread("A,B\n1987,1\n1987,3\n", na.strings=c("1987", "NA")), data.table(A=c(NA,NA),B=c(1L,3L))) @@ -12037,21 +11974,7 @@ if (test_R.utils) { V12=c("AAAAAAAAAAAAA","","AAAAAAA","AAA"))) } -# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because -# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. -# Would need a very complicated construction of embedded new lines in quoted fields, to test that. -# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. -DT = as.data.table(CO2) -f = tempfile() -for (i in 0:1000) { - start = nrow(CO2)*i - fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) - if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) -} -test(1835, fread(f, verbose=TRUE), - output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", - warning = "Stopped.*line 42253. 
Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") -unlink(f) +# 1835 moved to benchmark.Rraw, #5517 test(1836, fread('1,2,"3,a"\n4,5,"6,b"'), data.table(V1=c(1L,4L), V2=c(2L,5L), V3=c("3,a","6,b"))) # 2196 @@ -12156,7 +12079,7 @@ rand_strings = function(n) { apply(M, 1, function(x) paste0(letters[x], collapse="")) } set.seed(123) # the random data here doesn't match the data in issue 2275 because they used stringi::stri_rand_strings which has a different RNG -n = 100000 +n = 1000 # reduced from 100000 to 1000 for #5517 DT1 = data.table(RANDOM_STRING = rand_strings(n), DATE = sample(seq(as.Date('2016-01-01'), as.Date('2016-12-31'), by="day"), n, replace=TRUE)) DT2 = data.table(RANDOM_STRING = rand_strings(n), @@ -12201,13 +12124,7 @@ test(1849.9, fread(f, select=c("Date", "Description", "Balance")), data.table(Date=20150725L,Description="abcd",Balance="$5,006")) unlink(f) -# segfault when rbindlist is asked to create a DT with more than 2bn rows -DT = data.table(1:1e6) -L = vector("list", 2148) -for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test -test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") -rm(list=c("L","DT")) -gc() +# 1850 moved to benchmark.Rraw, #5517 # by=.EACHI missings to list columns, #2300 dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5])) @@ -12419,9 +12336,12 @@ if (test_R.utils) { } # better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526 -test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(A=NA, "100"=c(300L,500L), "200"=c(400L,600L))) -test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L))) +test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) +test(1870.2, 
fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) +test(1870.4, fread("A,B,200\n,300,400\n,500,600"), data.table(A=NA, B=c(300L,500L), "200"=c(400L,600L))) +test(1870.5, fread("A,B,\n,,\n,500,600"), data.table(A=NA, B=c(NA,500L), V3=c(NA,600L))) +test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(NA,300L,500L), V3=c(NA,400L,600L))) # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" @@ -12509,60 +12429,7 @@ fwrite(DT,f<-tempfile()) test(1873, fread(f), DT) unlink(f) -# Better jump sync and run-on in PR#2627 -# -# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627 -# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly -x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) -x[51094]="" -cat(x, file=f<-tempfile(), sep="\n") -test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], - data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), - output="jumps=[0..2)") # ensure jump 1 happened -# -# out-of-sample short lines in the first jump, not near the jump point -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), - warning="Stopped early on line 5021.*<>") -test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,5020L,NA,NA,5042L,102184L)), - 
output="jumps=[0..2)") -# -# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 -# confirmed fails in master with that error before PR#2627 -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093), - warning="Stopped early on line 51094.*<>", - output="jumps=[0..2)") -test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,51093L,NA,NA,51151L,102184L)), - output="jumps=[0..2)") -# -# jump inside a quoted field containing many new lines, to simulate a dirty jump -# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. -# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. 
-x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" -cat(x, file=f, sep="\n") -test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n - data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), - V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), - output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") -# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's -# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) -unlink(f) +# 1874-1875 moved to benchmark.Rraw, #5517 test(1876, fread("http://hkhfsk\nhttp://fhdkf\nhttp://kjfhskd\nhttp://hfkjf", header=FALSE), # data not a download, #2531 data.table(V1=c("http://hkhfsk","http://fhdkf","http://kjfhskd","http://hfkjf"))) @@ -12656,7 +12523,7 @@ DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784 test(1887.3, na.omit(DT), DT[0L]) test(1887.4, na.omit(DT, invert=TRUE), DT) -x = runif(1e4) +x = runif(1e3) # 1e4 reduced to 1e3 in #5517 but really it was the 1e6 just after 1888.5 below which is now 1e3 too test(1888, fsort(x), base::sort(x)) test(1888.1, fsort(x, decreasing = TRUE), base::sort(x, decreasing = TRUE), warning = "New parallel sort has not been implemented for decreasing=TRUE.*one thread") @@ -12670,7 +12537,7 @@ test(1888.4, fsort(x, decreasing = TRUE, na.last = TRUE), base::sort(x, decreasi x <- as.integer(x) test(1888.5, fsort(x), base::sort(x, na.last = FALSE), warning = "Input is not a vector of type double. 
New parallel sort has only been done for double vectors so far.*Using one thread") -x = runif(1e6) +x = runif(1e3) test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") @@ -12683,11 +12550,7 @@ test(1889, chmatch(x,x), 1:1000) rm(list=x) gc() -# test DT$.<- in a data.table-unaware package -DT = data.table(A=1:5) -test(1890.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") -# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col() -test(1890.2, DT, data.table(A=1:5)) +# 1890 used stats::ts.plot, moved to other.Rraw 29 to save ram, #5517 # na="" default, #2524 test(1891.1, fread('A,B,C\n1,foo,4\n2,,5\n3,bar,6\n', na.strings=""), data.table(A=1:3, B=c("foo",NA,"bar"), C=4:6)) @@ -12903,43 +12766,7 @@ test(1911.2, DT[, COL_INT := integer(0)], error = "RHS of assignment to existing column 'COL_INT' is zero length but not NULL.*") -# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 -# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. -# 2 threads are sufficient to fail before the fix. -N = 20 -DF = data.frame(a=rnorm(N), - b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), - c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) -DT = setDT(DF) # setDT required since data.table() already expanded altrep's -before = sum(gc()[, 2]) -fff = function(aref) { - ff = lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) - return(rbindlist(ff)) -} -for(i in 1:100) { - f = fff("a") - rm("f") -} -gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` - # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. -after = sum(gc()[, 2]) -test(1912.1, after < before + 10) # 10MB very wide margin. 
With the gc race, heap usage grew much more which is all we're testing here (no blow up). -# -before = sum(gc()[, 2]) -fff = function(aref) { - DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. - lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) -} -for(i in 1:100) { - fff("a") -} -gc() -after = sum(gc()[, 2]) -test(1912.2, after < before + 10) +# 1912 moved to benchmark.Rraw, #5517 # BEGIN port of old testthat tests, #2740. Issue numbers may be from R-forge. # @@ -13422,13 +13249,34 @@ test(1958.01, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encod test(1958.02, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) test(1958.03, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) test(1958.04, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') -test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747 -test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical())) +test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=integer(), B=integer(), C=integer())) #2747 +test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=integer(), B=integer(), C=integer())) test(1958.07, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 test(1958.08, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) -# 4686 -test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical())) -test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=logical(), B=logical(), C=logical())) +test(1958.09, 
fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=integer(), B=integer(), C=integer())) # nrows=0 vs 0L, 4686 +test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=integer(), B=integer(), C=integer())) +# nrows=0 should perform a full sample to get the empty column types right as documented, #4029 +test(1958.11, fread('A,B,C,D\n1,CHAR,"CHAR",3.1', nrows=0L), data.table(A=integer(), B=character(), C=character(), D=numeric())) +# .. one different type in the middle of under 100 +txt = paste(c("A,B\n1,2\n", rep("3,4\n",48), "3,4.1\n", rep("5,6\n",48)), collapse="") +test(1958.12, fread(text=txt, nrows=0L), data.table(A=integer(), B=numeric())) +test(1958.13, fread(text=txt, nrows=0L, skip=1L), data.table(V1=integer(), V2=numeric())) +test(1958.14, fread(text=txt, nrows=1L), data.table(A=1L, B=2L)) # B integer not numeric because sample is min(nrows,100) when nrows>=1 +test(1958.15, fread(text=txt, nrows=1L, skip=1L), data.table(V1=1L, V2=2L)) +test(1958.16, fread(text=txt, nrows=2L), data.table(A=c(1L,3L), B=c(2L,4L))) +test(1958.17, fread(text=txt, nrows=2L, skip=1L), data.table(V1=c(1L,3L), V2=c(2L,4L))) +# .. one different type on line 148 when there are just under 200 lines +txt = paste(c("A,B\n1,2\n", rep("3,4\n",148), "3,4.1\n", rep("5,6\n",48)), collapse="") +test(1958.18, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=numeric()), + output="Sampled 149 rows.*at 2 jump points") +# .. one different type within sample for large number of lines +txt = paste(c("A,B\n1,2\n", rep("3,4\n",5000), "3,4.1\n", rep("5,6\n",5000)), collapse="") +test(1958.19, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=numeric()), + output="Sampled 1049 rows.*at 11 jump points") +# .. 
one different type out of sample for large number of lines +txt = paste(c("A,B\n1,2\n", rep("3,4\n",5100), "3,4.1\n", rep("5,6\n",4900)), collapse="") +test(1958.20, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=integer()), + output="Sampled 1049 rows.*at 11 jump points") # Skip should work with all types of newlines #3006 eols = c("\n", "\r\n", "\r", "\n\r") @@ -13522,7 +13370,7 @@ test(1962.014, merge(DT1, DT2), data.table(a = integer(0), V = character(0))) setkey(DT1, a) test(1962.015, merge(DT1, DT2), - data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a')) + ans<-data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a')) test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')), error = 'must be of same length') test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'), @@ -13532,8 +13380,8 @@ test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'), error = 'Elements listed in `by.x`') test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'), error = 'Elements listed in `by.y`') -test(1962.020, merge(DT1, DT2, by = character(0L)), - error = 'non-empty vector of column names') +test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183 +test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge() test(1962.021, merge(DT1, DT2, by = 'z'), error = 'must be valid column names in x and y') @@ -13705,8 +13553,7 @@ test(1962.086, dcast(DT, a ~ a, drop = NA), DT = data.table(a = c(1, 1, 2, 2), b = list(1, 2, 3, 4), c = c(4, 4, 2, 2)) test(1962.087, dcast(DT, a ~ b, value.var = 'b'), error = 'Columns specified in formula can not be of type list') -test(1962.088, dcast(DT[0L, ], a ~ c, value.var = 'b'), - error = 'Can not cast an empty data.table') +test(1962.088, dcast(DT[0L, ], a ~ c, value.var = 'b'), data.table(a=numeric(), key="a")) #1215 test(1962.089, dcast(DT, a ~ c, value.var = 'b'), data.table(a = c(1, 2), `2` = c(0L, 2L), `4` = 
c(2L, 0L), key = 'a'), message = 'Aggregate function missing') @@ -13723,10 +13570,7 @@ y = as.ITime('543210', format = '%S%M%H') test(1962.095, y, structure(37974L, class = "ITime")) test(1962.096, capture.output(print(y)), '[1] "10:32:54"') test(1962.097, rep(y, 2L), structure(c(37974L, 37974L), class = "ITime")) -test(1962.098, as.POSIXlt(y, date = '2018-12-01', tz = 'UTC'), - structure(list(sec = 54, min = 32L, hour = 10L, mday = 1L, mon = 11L, - year = 118L, wday = 6L, yday = 334L, isdst = 0L), - class = c("POSIXlt", "POSIXt"), tzone = "UTC")) +test(1962.098, format(as.POSIXlt(y, date='2018-12-01', tz='UTC'), usetz=TRUE), "2018-12-01 10:32:54 UTC") test(1962.099, as.POSIXct(x, y), structure(1533119574, tzone = "UTC", class = c("POSIXct", "POSIXt"))) @@ -13786,6 +13630,11 @@ test(1963.16, shift(DT, -3L, type="lag"), shift(DT, 3L, type="lead")) DT <- data.table(a = 1:3, b = 2:4) test(1963.17, DT[ , shift(.SD, 0:1, give.names = TRUE, type = "lead")], data.table(a_lead_0 = 1:3, a_lead_1 = c(2L, 3L, NA), b_lead_0 = 2:4, b_lead_1 = c(3L, 4L, NA))) +DT = data.table(x = 1:10, y = 10:1) +test(1963.18, shift(DT, 1L, type="cyclic"), list(c(10L, 1L:9L), c(1L, 10L:2L))) +test(1963.19, shift(DT, -1, type="cyclic"), list(c(2L:10L, 1L), c(9L:1L, 10L))) +test(1963.20, shift(DT, 3L, type="cyclic"), shift(DT, -7L, type="cyclic")) +test(1963.21, shift(DT, -3L, type="cyclic"), shift(DT, 7L, type="cyclic")) # 0 column data.table should not have rownames, #3149 M0 = matrix(1:6, nrow=3, ncol=2, dimnames=list(rows=paste0("id",1:3), cols=c("v1","v2"))) @@ -14123,11 +13972,7 @@ test(1977.4, DT["D", -"GRP"], data.table(ID="D", X=NA_real_, key="ID")) test(1977.5, DT["D", c("ID","GRP")], data.table(ID="D", GRP=NA_integer_, key="ID")) test(1977.6, DT[c("A","D"), c("ID","GRP")], data.table(ID=c("A","A","D"), GRP=INT(1,1,NA))) -# catch malformed factor in rbindlist, #3315 -set.seed(32940) -NN=7e5; KK=4e4; TT=25 -DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = 
factor(sample(3, NN, TRUE)) ) -test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites +# 1978 moved to benchmark.Rraw, #5517 # Drop Null Values from `j` list elements #1406 DT = data.table(a = 1:3,b = letters[1:3],c = LETTERS[1:3]) @@ -14138,18 +13983,16 @@ x <- as.array(1:5) test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 +options(datatable.optimize=0L) DT = data.table( id = 1:5 , val = letters[1:5] ) test(1981.1, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +options(datatable.optimize=Inf) +DT = data.table( id = 1:5 , val = letters[1:5] ) +test(1981.3, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") +test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -# print of DT with many columns reordered them, #3306. -DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print -out = capture.output(print(DT)) -tt = out[grep("V",out)] -tt = unlist(strsplit(gsub(" ","",tt), "V")) -test(1982.1, tt[1L], "") -tt = as.integer(tt[tt!=""]) -test(1982.2, tt, seq_along(tt)) +# 1982 moved to benchmark.Rraw, #5517 # parse(text = 'list(`\\phantom{.}`)') fails, #3319 DT <- data.table(x=1, y=1:5) @@ -14398,11 +14241,11 @@ oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS") Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0") -test(1997.09, setDTthreads(), old, warning="Ignoring invalid.*Please remove any.*not a digit") +test(1997.09, setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") new = getDTthreads() # old above at (1) may not have been default. new now is. 
test(1997.10, getDTthreads(), new) Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="1") -test(1997.11, setDTthreads(), new, warning="Ignoring invalid.*integer between 2 and 100") +test(1997.11, setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") test(1997.12, getDTthreads(), new) Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") test(1997.13, setDTthreads(), new) @@ -14438,12 +14281,7 @@ dx = data.table(id = 1L, key = "id") di = list(z=c(2L, 1L)) test(1999.2, key(dx[di]), NULL) -# chmatchdup test from benchmark at the bottom of chmatch.c -set.seed(45L) -x = sample(letters, 1e5, TRUE) -y = sample(letters, 1e6, TRUE) -test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) -rm(list=c("x","y")) +# 2000 moved to benchmark.Rraw, #5517 # rbindlist use.names=TRUE returned random column order when ncol>255; #3373 DT = setDT(replicate(300, rnorm(3L), simplify = FALSE)) @@ -14488,8 +14326,11 @@ test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logica test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE") test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE), - data.table(a=c(1:2,NA,NA), b=c(NA,NA,3:4)), - warning="use.names= cannot be FALSE when fill is TRUE. 
Setting use.names=TRUE") + data.table(a=c(1:4))) +test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), + data.table(a=c(1:4), c=INT(5,6,NA,NA))) +test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE), + data.table(a=c(1:4), V1=INT(NA,NA,5,6))) # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" @@ -14554,7 +14395,7 @@ if (test_bit64) { warning="-1.*integer64.*position 1 taken as 0 when assigning.*raw.*column 3 named 'c'") test(2005.66, DT[2:3, f:=as.integer64(c(NA,"2147483648"))]$f, as.complex(c(-42,NA,2147483648))) DT[,h:=LETTERS[1:3]] - test(2005.67, DT[2:3, h:=as.integer64(1:2)], error="To assign integer64 to.*type character, please use as.character.") + test(2005.67, DT[2:3, h:=as.integer64(1:2)]$h, c("A","1","2")) # PR#5189 } # rbindlist raw type, #2819 @@ -14885,210 +14726,7 @@ test(2030.18, .Last.updated, 0L) # zero match test(2031.01, rbind(data.table(A=1:3, B=7:9), data.table(A=4:6, B=as.list(10:12))), ans<-data.table(A=1:6, B=as.list(7:12))) test(2031.02, rbind(data.table(A=1:3, B=as.list(7:9)), data.table(A=4:6, B=10:12)), ans) -if (test_yaml) { # csvy; #1701 - f = testDir("csvy/test.csvy") - DT = data.table(var1 = c("A", "B"), - var2 = c(1L, 3L), - var3 = c(2.5, 4.3)) - DT_yaml = copy(DT) - setattr(DT_yaml, 'yaml_metadata', - list(name = "my-dataset", - source = "https://github.com/leeper/csvy/tree/master/inst/examples", - schema = list(fields = list( - list(name = "var1", title = "variable 1", type = "string", - description = "explaining var1", - constraints = list(list(required = TRUE))), - list(name = "var2", title = "variable 2", type = "integer"), - list(name = "var3", title = "variable 3", type = "number") - )))) - ## with skip = '__auto__', fread can figure out - ## how to start after the metadata (just ignoring it) - test(2032.01, fread(f), DT) - ## should be the 
same, but with yaml_metadata attribute - test(2032.02, fread(f, yaml = TRUE), DT_yaml) - ## testing verbose messaging - test(2032.03, fread(f, yaml = TRUE, verbose = TRUE), - DT_yaml, output = 'Processed.*YAML metadata.*') - ## this file is identical, except the body of the - ## YAML header is commented out with # (should read identically) - test(2032.04, - fread(testDir('csvy/test_comment.csvy'), yaml = TRUE), - DT_yaml) - ## user input is taken as most intentional & overrides YAML - DT_yaml[ , var2 := as.numeric(var2)] - test(2032.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')), - DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2') - ## extraneous/unused fields shouldn't throw off reading - DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE) - test(2032.06, names(DT), c('Date', 'WTI')) - test(2032.07, attr(DT, 'yaml_metadata'), - list(names = c("Date", "WTI"), class = "data.frame", - title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv", - fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv", - sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D", - source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples", - item = "PET", sourcekey = "RWTC", freq = "Daily", - rate = "MID", type = "price", units = "Dollars per Barrel", - latestdate = "2015-08-31", releasedate = "2015-09-02", - nextreleasedate = "2015-09-10", source = "Thomson Reuters", - contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800")) - ## yaml can also handle sep, dec, quote, and na.strings - DT_out = data.table(var1 = c("A", "B"), - var2 = c(1L, NA), - var3 = c(2.5, 4.3)) - meta = - list(name = NULL, - schema = list(fields = list( - list(name = "var1", title = "variable 1", type = "string", - description = "a single-quoted character variable"), - list(name = "var2", title = "variable 2", type = "integer"), - list(name = "var3", title = "variable 3", type = "number", - 
description = "European-style numeric") - )), - header = TRUE, sep = "|", dec = ",", - quote = "'", na.strings = "@") - attr(DT_out, 'yaml_metadata') = meta - test(2032.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out) - ## user-specified attributes can override data from YAML - meta$sep = "-" - setattr(DT_out, 'yaml_metadata', meta) - test(2032.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out, - message = 'User-supplied.*sep.*override') - - meta$sep = "|" - setattr(DT_out, 'yaml_metadata', meta) - test(2032.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE), - DT_out, message = 'User-supplied.*header.*override') - col.names = c('x', 'y', 'z') - setnames(DT_out, col.names) - test(2032.11, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out, - message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override')) - - test(2032.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names), - DT_out, message = 'User-supplied.*col.names') - - setnames(DT_out, c('var1', 'var2', 'var3')) - meta$quote = "^" - setattr(DT_out, 'yaml_metadata', meta) - test(2032.13, fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"), - DT_out, message = 'User-supplied.*quote') - - meta$quote = "'" - meta$dec = "." 
- setattr(DT_out, 'yaml_metadata', meta) - test(2032.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','), - DT_out, message = 'User-supplied.*dec') - - meta$dec = ',' - meta$na.strings = 'NA' - setattr(DT_out, 'yaml_metadata', meta) - test(2032.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'), - DT_out, message = 'User-supplied.*na.strings') - - ## error if YAML malformed - test(2032.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE), - error = 'Reached the end.*YAML.*valid csvy') - ## use any other CSV in test directory which doesn't have YAML - if (test_R.utils) test(2032.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE), - error = 'Encountered.*unskipped.*constitute.*valid YAML') - ## no problem if some fields are missing a type (just - ## resort to standard auto-inferral, i.e., identical to - ## the case of partially-specified colClasses) - DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L), - var3 = c(2.5, 4.3)) - setattr(DT, 'yaml_metadata', - list(name = "my-dataset", source = "https://github.com/leeper/csvy/tree/master/inst/examples", - schema = list(fields = list( - list(name = "var1"), list(name = "var2", type = "integer"), - list(name = "var3", type = "number") - )))) - test(2032.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT) - ## skip applies starting after the YAML header - setattr(DT, 'yaml_metadata', - list(schema = list(fields = list( - list(name = "var1", type = "string"), - list(name = "var2", type = "integer"), - list(name = "var3", type = "number") - )))) - test(2032.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT) - ## user-supplied col.names override metadata (as for colClasses) - cn = paste0('V', 1:3) - setnames(DT, cn) - test(2032.20, fread(testDir('csvy/test_skip.csvy'), - yaml = TRUE, skip = 2L, col.names = cn), - DT, message = 'User-supplied column names.*override.*YAML') - ## invalid value fails - test(2032.21, fread(f, 
yaml = 'gobble'), - error = 'isTRUEorFALSE\\(yaml\\) is not TRUE') - - ## warning that skip-as-search doesn't work with yaml - DT_yaml[ , var2 := as.integer(var2)] - test(2032.22, fread(f, skip = 'var1,', yaml = TRUE), - DT_yaml, warning = 'Combining a search.*YAML.*') - - # fwrite csvy: #3534 - tmp = tempfile() - DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) - # force eol for platform independence - fwrite(DT, tmp, yaml = TRUE, eol = '\n') - as_read = readLines(tmp) - test(2033.01, as_read[c(1L, 24L)], c('---', '---')) - test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) - test(2033.03, grepl('creation_time_utc', as_read[3L])) - test(2033.04, as_read[4:23], - c("schema:", " fields:", " - name: a", " type: integer", - " - name: b", " type: numeric", " - name: c", " type: character", - "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", - # NB: apparently \n is encoded like this in YAML - "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", - "logical01: no")) - tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") - test(2033.05, as_read[25:30], tbl_body) - - # windows eol - fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') - test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') - - # multi-class columns - DT[ , t := .POSIXct(1:5, tz = 'UTC')] - fwrite(DT, tmp, yaml = TRUE) - as_read = readLines(tmp) - test(2033.07, as_read[13L], " type: POSIXct") - - # ~invertibility~ - # fread side needs to be improved for Hugh's colClasses update - DT[ , t := NULL] - fwrite(DT, tmp, yaml = TRUE) - DT2 = fread(tmp, yaml = TRUE) - # remove metadata to compare - attr(DT2, 'yaml_metadata') = NULL - test(2033.08, all.equal(DT, DT2)) - - test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), - output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*")) - - # TODO: test gzip'd yaml which is now supported - - # yaml + bom arguments - DT = 
data.table(l=letters, n=1:26) - fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) - fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 - lines = readLines(fcon) - lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows - # remove the blank here so we don't need to change this test if/when that changes in yaml package - test(2033.11, length(lines), 48L) - close(fcon) - test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) - # re-write should have same output (not appended) - fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) - fcon = file(f, encoding="UTF-8") - lines = readLines(fcon) - lines = lines[lines!=""] - test(2033.13, length(lines), 48L) - close(fcon) - test(2033.14, fread(f), DT) - unlink(f) -} +# 2032-2033 tested yaml moved to other.Rraw 16-17, #5516 # fcast coverage DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8) @@ -15104,10 +14742,10 @@ test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans) # source() printing edge case; #2369 setup = c('DT = data.table(a = 1)') writeLines(c(setup, 'DT[ , a := 1]'), tmp<-tempfile()) -test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE))) +test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo=TRUE, local=TRUE)), fixed=TRUE))) # local= #5514 ## test force-printing still works writeLines(c(setup, 'DT[ , a := 1][]'), tmp) -test(2036.2, source(tmp, echo = TRUE), output = "1:\\s+1") +test(2036.2, source(tmp, echo=TRUE, local=TRUE), output="1:\\s+1") # more helpful guidance when assigning before setDT() after readRDS(); #1729 DT = data.table(a = 1:3) @@ -15745,16 +15383,7 @@ if (test_bit64) { test(2060.304, fcoalesce(int64, 1), error='Item 2 has a different class than item 1') test(2060.305, fcoalesce(int64, 1L), error = 'Item 2 is type integer but the first item is type double') } -# nanotime tests -if (test_nanotime) { - nt = nanotime(int) - nt_val = 
nanotime(1:4) - test(2060.401, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46 - test(2060.402, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val)) - test(2060.403, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val)) - test(2060.404, fcoalesce(nt, 1), error='Item 2 has a different class than item 1') - test(2060.405, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double') -} +# 2060.401-405 tested nanotime moved to other.Rraw 23, #5516 # setcoalesce x = c(11L, NA, 13L, NA, 15L, NA) y = c(NA, 12L, 5L, NA, NA, NA) @@ -15875,16 +15504,26 @@ test(2067.1, shift(z), c(NA, z[1:2])) test(2067.2, shift(z, type = 'lead'), c(z[2:3], NA)) test(2067.3, shift(z, fill = 1i), c(1i, z[1:2])) test(2067.4, shift(list(z, 1:3)), list(c(NA, z[1:2]), c(NA, 1:2))) +test(2067.5, shift(z, n=1, type = 'cyclic'), c(z[3], z[1:2])) +test(2067.6, shift(z, n=-1, type = 'cyclic'), c(z[2:3], z[1])) +test(2067.7, shift(list(z, 1L:3L), n=1, type = 'cyclic'), list(c(z[3], z[1:2]), c(3L, 1:2))) +test(2067.8, shift(list(z, 1L:3L), n=-1, type = 'cyclic'), list(c(z[2:3], z[1]), c(2:3, 1L))) # support for ordering tables with complex columns, #1444 DT = data.table(a = 2:1, z = complex(0, 0:1)) test(2068.1, setkey(copy(DT), a), data.table(a=1:2, z=complex(0, 1:0), key='a')) test(2068.2, DT[ , abs(z), by=a], data.table(a=2:1, V1=c(0, 1))) -# raw continues not to be supported +# support for ordering tables with raw columns, #5100 DT = data.table(ID=2:1, r=as.raw(0:1)) -test(2068.3, setkey(DT, ID), error="Item 2 of list is type 'raw'") +test(2068.3, setkey(copy(DT), ID), data.table(ID=1:2, r=as.raw(1:0), key='ID')) +DT = data.table(x=c(1, 2, 1), y=raw(3)) +test(2068.4, setkey(copy(DT), x), data.table(x=c(1,1,2), y=raw(3), key='x')) +test(2068.5, DT[, y[.N], x], data.table(x=c(1,2), V1=raw(2))) +# expression continues not to be supported +DT = data.table(ID=2:1, 
r=expression(1, 2)) +test(2068.6, setkey(DT, ID), error="Item 2 of list is type 'expression'") # setreordervec triggers !isNewList branch for coverage -test(2068.4, setreordervec(DT$r, order(DT$ID)), error="reorder accepts vectors but this non-VECSXP") +test(2068.7, setreordervec(DT$r, order(DT$ID)), error="reorder accepts vectors but this non-VECSXP") # forderv (and downstream functions) handles complex vector input, part of #3690 DT = data.table( @@ -16159,7 +15798,9 @@ test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of co # shift naming test(2074.34, shift(list(a=1:5, b=6:10), give.names=TRUE), list(a_lag_1=c(NA, 1:4), b_lag_1=c(NA, 6:9))) +test(2074.345, shift(list(a=1:5, b=6:10), type="cyclic", give.names=TRUE), list(a_cyclic_1=c(5L, 1:4), b_cyclic_1=c(10L, 6:9))) test(2074.35, shift(1:5, 1:2, give.names=TRUE), list(V1_lag_1=c(NA, 1:4), V1_lag_2=c(NA, NA, 1:3))) +test(2074.355, shift(1:5, 1:2, type="cyclic", give.names=TRUE), list(V1_cyclic_1=c(5L, 1:4), V1_cyclic_2=c(4L:5L, 1:3))) # bmerge.c x = data.table(a='a') @@ -16233,18 +15874,7 @@ test(2078.32, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE, NAboun test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA)) test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE)) test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA") -# nanotime support -if (test_nanotime) { - n=nanotime(1:4) - n[2L]=NA - op = options(datatable.verbose=TRUE) - test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") - test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") - test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel 
processing of integer64") - options(op) - test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") - test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") -} +# 2080.01-05 tested nanotime moved to other.Rraw 24, #5516 # use raw type to cover fallback to R in between.R old = options(datatable.verbose=TRUE) test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FALSE, TRUE, TRUE, TRUE, FALSE), output="fallback to slow R") @@ -16288,10 +15918,7 @@ if (test_bit64) { i = as.integer64(1:4)+3e9 test(2085.01, fifelse(c(TRUE,FALSE,NA,TRUE), i, i+100), c(i[1L], i[2L]+100, as.integer64(NA), i[4])) } -if (test_nanotime) { - n = nanotime(1:4) - test(2085.11, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4])) -} +# 2085.11 tested nanotime moved to other.Rraw 25, #5516 test(2085.21, fifelse(c(TRUE,FALSE,NA), 1:3, c(1,2,3)), c(1,2,NA)) test(2085.22, fifelse(c(TRUE,FALSE,NA), c(1,2,3), 1:3), c(1,2,NA)) test(2085.31, fifelse(c(a=TRUE,b=FALSE), list(m=1,n=2), list(x=11,y=12)), list(a=1, b=12)) @@ -16523,109 +16150,7 @@ test(2107.3, names(DT), c('A','b','c')) setnames(DT, -(1:2), toupper) test(2107.4, names(DT), c('A','b','C')) -# first and last should no longer load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached -#stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces()) -x = as.POSIXct("2019-09-09")+0:1 -old = options(datatable.verbose=TRUE) -test(2108.01, last(x), x[length(x)], output="!is.xts(x)") -test(2108.02, first(x), x[1L], output="!is.xts(x)") -if (test_xts) { - xt = xts(1:2, x) - test(2108.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)") - test(2108.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)") - xt = xts(matrix(1:4, 2L, 2L), x) - 
test(2108.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)") - test(2108.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)") -} -# first on empty df now match head(df, n=1L), #3858 -df = data.frame(a=integer(), b=integer()) -test(2108.11, first(df), df, output="!is.xts(x)") -test(2108.12, last(df), df, output="!is.xts(x)") -options(old) -# xts last-first dispatch fix #4053 -x = 1:3 -y = as.POSIXct(x, origin="1970-01-01") -df = data.frame(a=1:2, b=3:2) -dt = as.data.table(df) -mx = matrix(1:9, 3, 3) -ar = array(1:27, c(3,3,3)) -xt = structure( - c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994, - 140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993, - 94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"), - .Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) -) -old = options(datatable.verbose=TRUE) -if (test_xts) { - test(2108.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - xt_last = structure( - c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(1167955200, tzone = "UTC", tclass = "Date"), - .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - 
xt_last2 = structure( - c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005, - 141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"), - .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - test(2108.25, last(xt), xt_last, output="using xts::last: is.xts(x)") - test(2108.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)") - test(2108.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - xt_first = structure( - c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(1167782400, tzone = "UTC", tclass = "Date"), - .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - xt_first2 = structure( - c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"), - .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - test(2108.35, first(xt), xt_first, output="using 
xts::first: is.xts(x)") - test(2108.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)") -} else { - test(2108.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.24, last(y, n=1L), y[3L], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.25, last(xt), error="you should have 'xts' installed already") - test(2108.26, last(xt, n=2L), error="you should have 'xts' installed already") - test(2108.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.35, first(xt), error="you should have 'xts' installed already") - test(2108.36, first(xt, n=2L), error="you should have 'xts' installed already") -} -test(2108.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(2108.62, last(dt), data.table(a=2L, b=2L), output="using 
'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(2108.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(2108.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -# matrix/array utils::tail behavior is likely to change in future R, Michael is more in the topic -test(2108.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127 -test(2108.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -test(2108.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127 -test(2108.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -options(old) +# 2108 tested xts moved to other.Rraw 19, #5516 # error in autonaming by={...}, #3156 DT = data.table(State=c("ERROR", "COMPLETED", "ERROR"), ExitCode=c(1, 0, 2)) @@ -16702,8 +16227,8 @@ g = function(x) { if (x==1L) factor(c("a","b")) else factor(c("a","b","c")) } test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a","b","a","b","c")))) # original test verbatim from the same issue #2199 set.seed(2) -ids = sample(letters, 20) -dates = 1:40 +ids = sample(letters, 10) # reduced from 20 to 10 +dates = 1:10 # and 40 to 10 to save ram, #5517 dt = data.table(CJ(dates, ids, ids)) setnames(dt, c("date", "id1", 
"id2")) dt[, value := rnorm(length(date))] @@ -16714,14 +16239,16 @@ f1 = function(sdt) { melt.data.table(dt1, id.vars = "id1") } res = dt[, f1(.SD), by=date] -test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)][], dt[c(1,.N)]) -test(2114.4, print(res), output="date.*0.433") +test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)], dt[c(1,.N)]) +test(2114.4, print(res), output="date.*-0.830") # and from #2522 DT = data.table(id=1:9, grp=rep(1:3,each=3), val=c("a","b","c", "a","b","c", "a","b","c")) test(2114.5, as.character(DT[, valfactor1 := factor(val), by = grp]$valfactor1), ans<-rep(c("a","b","c"),3)) test(2114.6, as.character(DT[, valfactor2 := factor(val), by = id]$valfactor2), ans) DT = data.table(x = rep(letters[c(3, 1, 2)], each = 2)) -test(2114.7, DT[, `:=`(g=.GRP, f=factor(.GRP)), by = x], +test(2114.7, copy(DT)[, `:=`(g=.GRP, f=factor(.GRP)), by = x], + data.table(x=rep(c("c","a","b"),each=2), g=rep(1:3,each=2), f=factor(rep(as.character(1:3),each=2)))) +test(2114.8, copy(DT)[, let(g=.GRP, f=factor(.GRP)), by = x], data.table(x=rep(c("c","a","b"),each=2), g=rep(1:3,each=2), f=factor(rep(as.character(1:3),each=2)))) # extra tests from #996 for completeness; no warning no-alloc coerce here of 0 and 1 numerics @@ -16973,10 +16500,7 @@ if(test_bit64) { i=as.integer64(1:12)+3e9 test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:11L]+100, as.integer64(NA))) } -if(test_nanotime) { - n=nanotime(1:12) - test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:11L]+100, as.integer64(NA))) -} +# 2127.27 tested nanotime moved to other.Rraw 26, #5516 test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec)) test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec) test(2127.30, fcase(test_vec1, rep("1",11L), test_vec2, rep("0",11L)), as.character(out_vec)) @@ -17080,25 +16604,41 @@ test(2130.102, 
print(DT, timezone=FALSE), notOutput='UTC') # default expression printing can break format_col.default, #3011 test(2130.11, print(data.table(e = expression(1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13))), output = '1 + 2 + 3') -# format_col generic is used -format_col.complex = function(x, ...) sprintf('(%.1f, %.1fi)', Re(x), Im(x)) -registerS3method("format_col", "complex", format_col.complex) +# format_col and format_list_item generics, #2273 for package sf +registerS3method("format_col", "complex", function(x, ...) sprintf('(%.1f, %.1fi)', Re(x), Im(x))) # this registerS3method does seem to be necessary to work within the test.data.table() environment -# assigning the method using <<- probably works too, but we don't want to write to user's environment at all -x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) -test(2130.12, x, output = '(1.0, 3.0i)') -rm(format_col.complex) +# assigning the method in .GlobalEnv might work too, but we don't want to write to user's environment at all (and is disallowed by CRAN policy) +x = data.table(z = c(1+3i, 2-1i, pi+2.718i)) +test(2130.12, x, output="(1.0, 3.0i)") registerS3method("format_col", "complex", format_col.default) -# otherwise it remains registered after test.data.table() and causes test 1610.1 to fail on the next run for example, and user display if they have complex data -# haven't found a way to unregister an S3 method (tried registering NULL but there's an error that NULL isn't a function) - -# format_list_item() generic is used -format_list_item.myclass <- function(x, ...) 
paste0("<", class(x)[1L], ":", x$id, ">") -registerS3method("format_list_item", "myclass", format_list_item.myclass) -DT = data.table(row = 1:2, objs = list(structure(list(id = "foo"), class = "myclass"), structure(list(id = "bar"), class = "myclass"))) -test(2130.13, print(DT), output = "myclass:foo.*myclass:bar") -rm(format_list_item.myclass) -registerS3method("format_list_item", "myclass", format_list_item.default) +# haven't found a way to unregister an S3 method; tried registering NULL but that's an error that NULL isn't a function +# so registering the default method is the only known solution to clean up since the registered method persists after test.data.table() finishes and +# then i) test 1610.1 fails if test.data.table() is rerun, ii) user display of complex data would be affected +# did try wrapping with on.exit(,add=TRUE) but perhaps because this is a script that is sys.source'd, it ran straight away + +# format method for column takes precedence over format method for each list item +registerS3method("format", "myclass2130", function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">")) +DT = data.table(row=1:2, objs=list(structure(list(id="foo"), class="myclass2130"), structure(list(id="bar"), class="myclass2130"))) +test(2130.13, print(DT), output="myclass2130:foo.*myclass2130:bar") +setattr(DT$objs, "class", "foo2130") +registerS3method("format", "foo2130", function(x, ...) "All hail foo") +test(2130.14, print(DT), output="myclass2130:foo.*myclass2130:bar") # because length 1 from format but needs to be length(x) +registerS3method("format", "foo2130", function(x, ...) rep("All hail foo",length(x))) +test(2130.15, print(DT), output="All hail foo") # e.g. 
sf:::format.sfc rather than sf:::format.sfg on each item +setattr(DT$objs, "class", "bar2130_with_no_method") +test(2130.16, print(DT), output="myclass2130:foo.*myclass2130:bar") +registerS3method("format", "myclass2130", format.default) +registerS3method("format", "foo2130", format.default) + +DT = data.table(num = 1:2, + formula = list(as.formula("mpg~cyl")), + model = list(lm(mpg~cyl, mtcars)), + shallow = list(1:3, 4:6), + nested = list(list(1:3), list(4:6))) +test(2130.17, capture.output(DT), + c(" num formula model shallow nested", + "1: 1 mpg ~ cyl 1,2,3 ", + "2: 2 mpg ~ cyl 4,5,6 ")) # .SD from grouping should be unlocked, part of #4159 x = data.table(a=1:3, b=4:6) @@ -17125,20 +16665,8 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") rm(s1, s2, class2132) -if (test_xts) { - # keep.rownames in as.data.table.xts() supports a string, #4232 - xts = xts::xts(1:10, structure(1:10, class = "Date")) - colnames(xts) = "VALUE" - DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE") - test(2133.1, colnames(DT), c("DATE", "VALUE")) - test(2133.2, key(DT), "DATE") - test(2133.3, as.data.table(xts, keep.rownames = "VALUE"), - error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. 
Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") - test(2133.4, as.data.table(xts, keep.rownames = character()), - error = "keep.rownames must be length 1") - test(2133.5, as.data.table(xts, keep.rownames = NA_character_), - error = "keep.rownames must not be NA") -} + +# 2133 tested xts moved to other.Rraw 20, #5516 # friendlier error for common mistake of using := in i instead of j, #4227 DT = data.table(a = 1) @@ -17854,7 +17382,8 @@ if (test_bit64) { # X[Y,,by=.EACHI] when Y contains integer64 also fixed in 1.12.4, #3779 X = data.table(x=1:3) Y = data.table(x=1:2, y=as.integer64(c(10,20))) - test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) + test(2193.2, copy(X)[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) + test(2193.3, copy(X)[Y, let(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } # endsWithAny added in #5097 for internal use replacing one use of base::endsWith (in fread.R) @@ -17912,18 +17441,7 @@ d[1:50, "a"] = d[51:100, "a"] setDT(d) test(2200, nrow(d[a==99]), 2L) -# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 -# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too -set.seed(1) -DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary -setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow -test(2201.1, nrow(DT[, .N, by=grp]), 255L) -test(2201.2, nrow(setkey(DT, grp)), 65536L) -set.seed(1) -DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun -test(2201.3, nrow(DT[, .N, by=grp]), 65536L) -test(2201.4, nrow(setkey(DT, grp)), 65536L) -setDTthreads() # restore default throttle +# 2201 moved to benchmark.Rraw, #5517 # fwrite now allows sep="", #4817 test(2202.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), @@ 
-17963,11 +17481,7 @@ test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty l test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), setnames(setkey(as.data.table(mtcars, keep.rownames = TRUE), rn), 'rn', 'model')) -# na.omit works for nanotime, #4744 -if (test_nanotime) { - DT = data.table(time=nanotime(c(1,NA,3))) - test(2205, na.omit(DT), DT[c(1,3)]) -} +# 2205 tested nanotime moved to other.Rraw 27, #5516 # isRealReallyInt, #3966 test(2206.01, isRealReallyInt(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE) @@ -18064,10 +17578,9 @@ for (col in c("a","b","c")) { } } -# DT() functional form, #4872 #5106 #5107 +# DT() functional form, #4872 #5106 #5107 #5129 if (base::getRversion() >= "4.1.0") { # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 - if (exists("DTfun")) DT=DTfun # just in dev-mode restore DT() in .GlobalEnv as DT object overwrote it in tests above droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below test(2212.011, EVAL("mtcars |> DT(mpg>20, .(mean_hp=round(mean(hp),2)), by=cyl)"), data.frame(cyl=c(6,4), mean_hp=c(110.0, 82.64))) @@ -18076,37 +17589,511 @@ if (base::getRversion() >= "4.1.0") { test(2212.013, EVAL("mtcars |> DT(mpg>20, .SD[hp>mean(hp)])"), droprn(mtcars[ mtcars$mpg>20 & mtcars$hp>mean(mtcars$hp[mtcars$mpg>20]), ])) D = copy(mtcars) - test(2212.02, EVAL("D |> DT(,.SD)"), D) - test(2212.03, EVAL("D |> DT(, .SD, .SDcols=5:8)"), D[,5:8]) - test(2212.04, EVAL("D |> DT(, 5:8)"), droprn(D[,5:8])) - test(2212.05, EVAL("D |> DT(, lapply(.SD, sum))"), as.data.frame(lapply(D,sum))) - test(2212.06, EVAL("D |> DT(, .SD, keyby=cyl) |> setkey(NULL)"), droprn(D[order(D$cyl),c(2,1,3:11)])) - test(2212.07, EVAL("D |> DT(1:20, .SD)"), droprn(D[1:20,])) - test(2212.08, EVAL("D |> DT(, .SD, by=cyl, .SDcols=5:8)"), droprn(D[unlist(tapply(1:32, D$cyl, c)[c(2,1,3)]), c(2,5:8)])) - test(2212.09, EVAL("D |> DT(1:20, .SD, 
.SDcols=5:8)"), droprn(D[1:20, 5:8])) - test(2212.10, EVAL("D |> DT(1:20, .SD, by=cyl, .SDcols=5:8)"), droprn(D[unlist(tapply(1:20, D$cyl[1:20], c)[c(2,1,3)]), c(2,5:8)])) - test(2212.11, EVAL("D |> DT(1:20, lapply(.SD, sum))"), as.data.frame(lapply(D[1:20,],sum))) + test(2212.02, EVAL("D |> DT(,.SD)"), mtcars) + test(2212.03, EVAL("D |> DT(, .SD, .SDcols=5:8)"), mtcars[,5:8]) + test(2212.04, EVAL("D |> DT(, 5:8)"), droprn(mtcars[,5:8])) + test(2212.05, EVAL("D |> DT(, lapply(.SD, sum))"), as.data.frame(lapply(mtcars,sum))) + test(2212.06, EVAL("D |> DT(, .SD, keyby=cyl) |> setkey(NULL)"), droprn(mtcars[order(mtcars$cyl),c(2,1,3:11)])) + test(2212.07, EVAL("D |> DT(1:20, .SD)"), droprn(mtcars[1:20,])) + test(2212.08, EVAL("D |> DT(, .SD, by=cyl, .SDcols=5:8)"), droprn(mtcars[unlist(tapply(1:32, mtcars$cyl, c)[c(2,1,3)]), c(2,5:8)])) + test(2212.09, EVAL("D |> DT(1:20, .SD, .SDcols=5:8)"), droprn(mtcars[1:20, 5:8])) + test(2212.10, EVAL("D |> DT(1:20, .SD, by=cyl, .SDcols=5:8)"), droprn(mtcars[unlist(tapply(1:20, mtcars$cyl[1:20], c)[c(2,1,3)]), c(2,5:8)])) + test(2212.11, EVAL("D |> DT(1:20, lapply(.SD, sum))"), as.data.frame(lapply(mtcars[1:20,],sum))) test(2212.12, droprn(EVAL("D |> DT(1:20, c(N=.N, lapply(.SD, sum)), by=cyl)")[c(1,3),c("cyl","N","carb")]), data.frame(cyl=c(6,8), N=c(6L,8L), carb=c(18,27))) - test(2212.13, EVAL("D |> DT(cyl==4)"), droprn(D[D$cyl==4,])) - test(2212.14, EVAL("D |> DT(cyl==4 & vs==0)"), droprn(D[D$cyl==4 & D$vs==0,])) - test(2212.15, EVAL("D |> DT(cyl==4 & vs>0)"), droprn(D[D$cyl==4 & D$vs>0,])) - test(2212.16, EVAL("D |> DT(cyl>=4)"), droprn(D[D$cyl>=4,])) - test(2212.17, EVAL("D |> DT(cyl!=4)"), droprn(D[D$cyl!=4,])) - test(2212.18, EVAL("D |> DT(cyl!=4 & vs!=0)"), droprn(D[D$cyl!=4 & D$vs!=0,])) + test(2212.13, EVAL("D |> DT(cyl==4)"), droprn(mtcars[mtcars$cyl==4,])) + test(2212.14, EVAL("D |> DT(cyl==4 & vs==0)"), droprn(mtcars[mtcars$cyl==4 & mtcars$vs==0,])) + test(2212.15, EVAL("D |> DT(cyl==4 & vs>0)"), 
droprn(mtcars[mtcars$cyl==4 & mtcars$vs>0,])) + test(2212.16, EVAL("D |> DT(cyl>=4)"), droprn(mtcars[mtcars$cyl>=4,])) + test(2212.17, EVAL("D |> DT(cyl!=4)"), droprn(mtcars[mtcars$cyl!=4,])) + test(2212.18, EVAL("D |> DT(cyl!=4 & vs!=0)"), droprn(mtcars[mtcars$cyl!=4 & mtcars$vs!=0,])) test(2212.19, EVAL("iris |> DT(Sepal.Length==5.0 & Species=='setosa')"), droprn(iris[iris$Sepal.Length==5.0 & iris$Species=="setosa",])) test(2212.20, EVAL("iris |> DT(Sepal.Length==5.0)"), droprn(iris[iris$Sepal.Length==5.0,])) test(2212.21, EVAL("iris |> DT(Species=='setosa')"), droprn(iris[iris$Species=='setosa',])) - test(2212.22, EVAL("D |> DT(, cyl)"), droprn(D[,"cyl"])) - test(2212.23, EVAL("D |> DT(1:2, cyl)"), droprn(D[1:2, "cyl"])) - test(2212.24, EVAL("D |> DT(, list(cyl))"), droprn(D[,"cyl",drop=FALSE])) - test(2212.25, EVAL("D |> DT(1:2, .(cyl))"), droprn(D[1:2, "cyl", drop=FALSE])) - test(2212.26, EVAL("D |> DT(, z:=sum(cyl))"), cbind(D, z=sum(D$cyl))) - test(2212.27, EVAL("D |> DT(, z:=round(mean(mpg),2), by=cyl)"), cbind(D, z=c("6"=19.74, "4"=26.66, "8"=15.10)[as.character(D$cyl)])) - test(2212.28, EVAL("D |> DT(1:3, z:=5, by=cyl)"), cbind(D, z=c(5,5,5,rep(NA,nrow(D)-3)))) + test(2212.22, EVAL("D |> DT(, cyl)"), droprn(mtcars[,"cyl"])) + test(2212.23, EVAL("D |> DT(1:2, cyl)"), droprn(mtcars[1:2, "cyl"])) + test(2212.24, EVAL("D |> DT(, list(cyl))"), droprn(mtcars[,"cyl",drop=FALSE])) + test(2212.25, EVAL("D |> DT(1:2, .(cyl))"), droprn(mtcars[1:2, "cyl", drop=FALSE])) + test(2212.26, EVAL("D |> DT(, z:=sum(cyl))"), cbind(mtcars, z=sum(mtcars$cyl))) + D = copy(mtcars) # D was changed by := so recopy mtcars; TODO: remove this line when #5129 is fully closed + test(2212.27, EVAL("D |> DT(, z:=round(mean(mpg),2), by=cyl)"), cbind(mtcars, z=c("6"=19.74, "4"=26.66, "8"=15.10)[as.character(mtcars$cyl)])) + D = copy(mtcars) # D was changed by := so recopy mtcars; TODO: remove this line when #5129 is fully closed + test(2212.28, EVAL("D |> DT(1:3, z:=5, by=cyl)"), 
cbind(mtcars, z=c(5,5,5,rep(NA,nrow(mtcars)-3)))) + D = copy(mtcars) # D was changed by := so recopy mtcars; TODO: remove this line when #5129 is fully closed test(2212.29, EVAL("D |> DT(1:3, z:=NULL)"), error="When deleting columns, i should not be provided") - test(2212.30, EVAL("D |> DT(data.table(cyl=4), on='cyl')"), droprn(D[D$cyl==4,])) - test(2212.31, EVAL("D |> DT(data.frame(cyl=4), on='cyl')"), droprn(D[D$cyl==4,])) - test(2212.32, EVAL("D |> DT(.(4), on='cyl')"), droprn(D[D$cyl==4,])) + test(2212.30, EVAL("D |> DT(data.table(cyl=4), on='cyl')"), droprn(mtcars[mtcars$cyl==4,])) + test(2212.31, EVAL("D |> DT(data.frame(cyl=4), on='cyl')"), droprn(mtcars[mtcars$cyl==4,])) + test(2212.32, EVAL("D |> DT(.(4), on='cyl')"), droprn(mtcars[mtcars$cyl==4,])) test(2212.33, EVAL("iris |> DT('setosa', on='Species')"), {tt=droprn(iris[iris$Species=="setosa",]); tt$Species=as.character(tt$Species); tt}) + filter = mtcars # mask stats::filter + dt = df = D = as.data.table(filter) # mask stats::D + test(2212.50, EVAL("df |> DT(df[, .I[which.max(mpg)], by=cyl]$V1)"), ans<-dt[c(4,20,25)]) + test(2212.51, EVAL("dt |> DT(dt[, .I[which.max(mpg)], by=cyl]$V1)"), ans) + test(2212.52, EVAL("D |> DT(D[, .I[which.max(mpg)], by=cyl]$V1)"), ans) + test(2212.53, EVAL("filter |> DT(filter[, .I[which.max(mpg)], by=cyl]$V1)"), error="unused.*argument.*by.*cyl") # R's [.data.frame error on filter[...] + test(2212.54, EVAL("filter |> DT((filter |> DT(, .I[which.max(mpg)], by=cyl))$V1)"), as.data.frame(ans)) +} + +# precision powers of 10^(-n), #4461 +test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) + +# droplevels.data.table method, and fdroplevels, #647 +x = factor(letters[1:10]) +DT = data.table(a = x)[1:5] +test(2214.01, fdroplevels(factor()), droplevels(factor())) +test(2214.02, fdroplevels(x[1:5]), droplevels(x[1:5])) +if (base::getRversion() >= "3.4.0") { + # bug fix in R 3.4.0: "droplevels(f) now keeps levels when present." 
+ test(2214.03, fdroplevels(x[1:5], c("b", "d")), droplevels(x[1:5], c("b", "d"))) + test(2214.04, fdroplevels(x[1:5], letters[1:5]), droplevels(x[1:5], letters[1:5])) + test(2214.05, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c("b", "d"))) +} +test(2214.06, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) +test(2214.07, droplevels(DT, 1)[["a"]], x[1:5]) +test(2214.08, droplevels(DT, in.place=TRUE), DT) +# support ordered factors in fdroplevels +o = factor(letters[1:10], ordered=TRUE) +test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) +# edge case for empty table #5184 +test(2214.10, droplevels(DT[0]), DT[0]) +test(2214.11, droplevels(data.table()), data.table()) + + +# factor i should be just like character i and work, #1632 +DT = data.table(A=letters[1:3], B=4:6, key="A") +test(2215.1, DT["b", B], 5L) # has worked forever +test(2215.2, DT[factor("b"), B], 5L) # now works too, joining fact/fact, char/fact and fact/char have plenty of tests + +# segfault on merge keyed all-NA_character_ due to is.sorted, #5070 +DT1 = data.table(x1 = rep(letters[1:4], each=3), x2=NA_character_, key="x2") +DT2 = data.table(x1 = letters[1:3]) +test(2216.1, DT1[DT2, on="x1"][,.(x1,x2)], DT1[1:9]) # segfault in v1.14.0 +test(2216.2, merge(DT1, DT2, by="x1")[,.(x1,x2)], setkey(DT1[1:9], x1)) # ok before but included for completeness verbatim from issue + +# copy attributes assigned to elements of list columns in grouping #4963 +DT1 = data.table(id=1:3, grp=c('a', 'a', 'b'), value=4:6) +DT2 = data.table(grp = c('a', 'b'), agg = list(c('1' = 4, '2' = 5), c('3' = 6))) +test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) + +# shift integer64 when fill isn't integer32, #4865 +testnum = 2218 +funs = c(as.integer, as.double, as.complex, as.character, if (test_bit64) as.integer64) +# when test_bit64==FALSE these all passed before; now passes with test_bit64==TRUE too +# add grouping tests for #5205 +g = rep(c(1,2), each=2) 
+options(datatable.optimize = 2L) +for (f1 in funs) { + DT = data.table(x=f1(1:4), g=g) + for (f2 in funs) { + testnum = testnum + 0.001 + test(testnum, DT[, shift(x)], f1(c(NA, 1:3))) + testnum = testnum + 0.001 + w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" + test(testnum, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) + testnum = testnum + 0.001 + if (identical(f1,as.character) && identical(f2,as.complex)) { + # one special case due to as.complex(0)=="0+0i"!="0" + test(testnum, DT[, shift(x, fill="0")], f1(0:3)) + } else { + test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) + } + + testnum = testnum + 0.001 + test(testnum, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) + testnum = testnum + 0.001 + w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" + f = f2(NA) + test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) + testnum = testnum + 0.001 + if (identical(f1,as.character) && identical(f2,as.complex)) { + # one special case due to as.complex(0)=="0+0i"!="0" + test(testnum, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) + } else { + f = f2(0) + test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) + } + } +} + +# subassign coerce a class to character, part of PR#5189 +DT = data.table(A=letters[1:3]) +test(2219.1, DT[2, A:=as.IDate("2021-02-03")], data.table(A=c("a","2021-02-03","c"))) +if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data.table(A=c("a","2021-02-03","4611686018427387906"))) + +# gforce improve coverage +DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) +options(datatable.optimize = 2L) +funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") +testnum = 2220 +for (fun in funs) { + testnum = 
testnum + 0.01 + test(testnum, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + testnum = testnum + 0.01 + test(testnum, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) +} +test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") + +# tables() error when called from inside a function(...), #5197 +test(2221, (function(...) tables())(), output = "No objects of class data.table exist") + +# some revdeps do class(x)="data.table" without inheriting from data.frame, PR#5210 +DT = data.table(A=1:3) +class(DT) = "data.table" +test(2222, print(DT), output="A.*3") + +# retain nomatch=FALSE backwards compatibility #5214, and nomatch=NA_character_ PR#5216 +DT = data.table(A=1:3, key="A") +test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) +test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) + +# gshift, #5205 +options(datatable.optimize = 2L) +set.seed(123) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE), + l = sample(c(TRUE, FALSE, NA), 20, TRUE), + r = as.raw(sample(1:5, 20, TRUE))) +load(testDir("test2224.Rdata")) # 47KB array 24x8 where each cell contains a length-20 result +if (test_bit64) { + DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))] +} else { + ans = ans[, -match("i64",colnames(ans))] +} +i = 1L +for (col in names(DT)[-1]) { + for (n in list(1, 5, -1, -5, c(1,2), c(-1,1))) { + for (type in c('lag','lead','shift','cyclic')) { + # fill is tested by group in tests 2218.*; see comments in #5205 + # sapply(sapply()) changed to for(for(for())) to save 29MB, #5517 + test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE + 
EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), + ans[[i]]) + i = i+1L + } + } +} +a = 1:2 # fill argument with length > 1 which is not a call +test(2224.2, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") +DT = data.table(x=pairlist(1), g=1) +# unsupported type as argument +test(2224.3, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") + +# groupingsets by named by argument +test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), + data.table(Species=factor(c("setosa", "versicolor", "virginica")), V1=c(250.3, 296.8, 329.4))) +test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), + groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) + +# make gprod work for bit64, #5225 +if (test_bit64) { + test(2226.1, base::prod(2147483647L,2L), 4294967294) # just to illustrate that base returns double + DT = data.table(x=c(lim.integer64(), 2, 1, NA, NA, -2, 4), g=INT(1,2,1,2,1,2,3,3)) + test(2226.2, DT[, prod(x), g], data.table(g=1:3, V1=as.integer64(c(NA,NA,-8L)))) + test(2226.3, DT[, prod(x,na.rm=TRUE), g], data.table(g=1:3, V1=as.integer64(c(NA,"9223372036854775807",-8L)))) +} + +# set ops when DT has column names x and y, #5255 +DT = data.table(x=c(1,2,2,2), y=LETTERS[c(1,2,2,3)]) +test(2227.1, fintersect(DT, DT, all=TRUE), DT) +test(2227.2, fsetdiff(DT, DT, all=TRUE), DT[0]) + +# fwrite POSIXct rounding, #5238 +DT = data.table(as.POSIXct(c(1678296152.99999952316284179688, -118944658.0000004, -.00000004), origin='1970-01-01 00:00:00')) +test(2228, fwrite(DT), output="2023-03-08T17:22:33Z.*1966-03-26T07:49:02Z.*1970-01-01T00:00:00Z") + +# automatically infer known files signatures and attempt auto-unzip #3834 +DT = fread(testDir("russellCRLF.csv")) +test(2229.1, fread(testDir("russellCRLF.zip")), DT) +test(2229.2, fread(testDir("russellCRLF.tar")), DT) +# 
guess binary file type +f = tempfile() +file.copy(testDir("russellCRLF.zip"), f, overwrite=TRUE) +test(2229.3, fread(f), DT) +if (test_R.utils) { + file.copy(testDir("ch11b.dat.bz2"), f, overwrite=TRUE) + test(2229.4, fread(f, logical01=FALSE)[,1], data.table(V1 = 1:100)) + file.copy(testDir("issue_785_fread.txt.gz"), f, overwrite=TRUE) + test(2229.5, fread(f, logical01=FALSE)[,25], data.table(Sv3 = c(10,14,14,15))) +} +unlink(f) +# not supporting multi file zips yet +test(2229.6, fread(testDir("multi-file.zip")), error="Compressed files containing more than 1 file are currently not supported.") + +# merge.data.table ignored incomparables argument without warning, #2587 +x = data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5) +y = data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5) +DT = as.data.table(x) +test(2230.1, setDF(merge(DT, y, by="k2", incomparables=NA)), merge(x, y, by="k2", incomparables=NA)) +test(2230.2, setDF(merge(DT, y, by="k2", incomparables=c(NA,4))), merge(x, y, by="k2", incomparables=c(NA,4))) +test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))), merge(x, y, by="k2", incomparables=c(4,5))) +test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5))) +test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5))) +test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.") +test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L), + merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. 
`by` argument will be ignored.", "Passed 1 unknown and unnamed arguments.")) + +# weighted.mean GForce optimized, #3977 +old = options(datatable.optimize=1L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") +test(2231.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") +test(2231.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2231.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2231.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2231.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE") +test(2231.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE") 
+test(2231.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2231.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE") +test(2231.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2231.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2231.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# same as previous test cases but now GForce optimized +options(datatable.optimize=2L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.31, DT[, 
weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to") +test(2231.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to") +test(2231.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2231.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2231.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2231.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j to") +test(2231.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to") +test(2231.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") 
+test(2231.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to") +test(2231.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2231.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2231.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +options(old) + +# cols argument for unique.data.table, #5243 +DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1) +test(2232.1, unique(DT, by='g', cols='v1'), DT[1:26, !'v2']) +test(2232.2, unique(DT, by='g', 
cols='v2'), DT[1:26, !'v1']) +## no duplicates +test(2232.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) +## invalid columns fail as expected +test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") + +# support := with GForce #1414 +options(datatable.optimize = 2L) +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.01, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to") +# GForce returning full length +test(2233.02, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") +# GForce neither returning 1 per group nor full length +test(2233.03, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +# compare to non GForce version +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.04, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.05, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# with key and grouping by key +DT = data.table(a=1:3,b=(1:9)/10, key="a") +test(2233.06, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10, key="a"), output="GForce optimized j to") +test(2233.07, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10, key="a"), output="GForce optimized j to") +test(2233.08, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10, key="a") +test(2233.09, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.10, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# with key and grouping by nonkey +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.11, DT[, v := min(b), 
a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="c"), output="GForce optimized j to") +test(2233.12, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:9)/10, key="c"), output="GForce optimized j to") +test(2233.13, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.14, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.15, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# with key and keyby by nonkey +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.16, copy(DT)[, v := min(b), keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="a"), output="GForce optimized j to") +test(2233.17, copy(DT)[, v := head(b, 3L), keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:9)/10, key="a"), output="GForce optimized j to") +test(2233.18, copy(DT)[, v := head(b, 2L), keyby=a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.19, copy(DT)[, v := min(b), keyby=a, verbose=TRUE], copy(DT)[, v := base::min(b), keyby=a], output="GForce optimized j to") +test(2233.20, copy(DT)[, v := head(b, 3L), keyby=a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), keyby=a], output="GForce optimized j to") +# with irows +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.21, DT[a==2, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=c(NA,0.2,NA)), output="GForce optimized j to") +test(2233.22, DT[a!=4, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") +test(2233.23, DT[a!=4, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10) 
+test(2233.24, copy(DT)[a==2, v := min(b), a, verbose=TRUE], copy(DT)[a==2, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.25, copy(DT)[a!=4, v := head(b, 3L), a, verbose=TRUE], copy(DT)[a!=4, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# multiple assignments +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.26, DT[, c("v1","v2") := .(min(b), max(b)), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=(1:3)/10, v2=(7:9)/10), output="GForce optimized j to") +test(2233.27, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=(1:9)/10, v2=(1:9)/10), output="GForce optimized j to") +test(2233.28, DT[, c("v1","v2") := .(head(b,3L), tail(b,2L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v2'.") +test(2233.29, DT[, c("v1","v2") := .(head(b,2L), tail(b,3L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v1'.") +test(2233.30, DT[, c("v1","v2") := .(head(b,2L), tail(b,2L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v1'.") +test(2233.31, DT[, c("v1","v2") := .(min(b), max(b)), a, verbose=TRUE], DT[, c("v1","v2") := .(base::min(b), base::max(b)), a ], output="GForce optimized j to") +test(2233.32, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a, verbose=TRUE], DT[, c("v1","v2") := .(utils::head(b,3L), utils::tail(b,3L)), a], output="GForce optimized j to") + +# gforce needs to evaluate variable arguments before calling C part (part of test 101.17 in programming.Rraw) +set.seed(108) +yn = c(1, 5, 10, 20) +ycols = paste0("y", yn) +ydt = data.table(symbol = rep(1:3, each = 100)) +ydt[, date := seq_len(.N), by = symbol] +ydt[, ret := rnorm(.N)] +f = shift +test(2233.33, copy(ydt)[, (ycols) := shift(ret, yn, type = "lead"), by = symbol, verbose=TRUE], copy(ydt)[, (ycols) := f(ret, yn, type = "lead"), by = symbol], output="GForce optimized j to") + +# optimized := by= with out-of-order groups broke in dev before release; #5307, #5326, 
#5337, #5345 +# verbatim from #5307 +DT = data.table(by1=c("a","a","b","b"), by2=c("c","d","c","d"), value=c("ac","ad","bc","bd")) +test(2233.34, copy(DT)[, same_value:=value[1], by=.(by1, by2), verbose=TRUE], ans<-copy(DT)[, same_value:=value], output=out<-"GForce.*g[[]") +test(2233.35, copy(DT)[, same_value:=value[1], by=.(by2, by1), verbose=TRUE], ans, output=out) +test(2233.36, copy(DT)[, same_value:=value[1], keyby=.(by2, by1), verbose=TRUE], setkey(ans,by2,by1), output=out) +# similar to #5307 using integer +DT = data.table(A=INT(2,1,2,1), B=6:3, v=11:14) +test(2233.37, copy(DT)[, val:=v[1L], by=.(A,B), verbose=TRUE], copy(DT)[, val:=11:14], output=out) +test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=INT(1,1,2,2), B=INT(3,5,4,6), v=INT(14,12,13,11), val=INT(14,12,13,11), key=c("A","B")), output=out) +# test from #5326 but with n=100 rather than n=100000; confirmed that n=100 fails tests 2233.403-405 before fix +set.seed(10) +n = 100 +a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) +testnum = 2233.39 +for (opt in c(0,Inf)) { + options(datatable.optimize=opt) + out = if (opt) "GForce.*gsum" else "GForce FALSE" + B = copy(a) + A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle + test(testnum+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= + setorder(A, id1) + test(testnum+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) + test(testnum+0.003, any(A[,t1!=t2]), FALSE) + test(testnum+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) + test(testnum+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) + testnum = 2233.40 +} +# test from #5337 +n=4; k=2 +mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) +ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] +options(datatable.optimize=0) +test(2233.41, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce FALSE") 
+options(datatable.optimize=Inf) +test(2233.42, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce.*gsum") +# test from #5345 +set.seed(1) +DT = data.table( + t = sample(c(1:3), size=15, replace=TRUE), + id = sample(LETTERS[1:3], size=15, replace=TRUE), + v1 = sample(1:10, size=15, replace=TRUE), + v2 = 1 +) +load(testDir("test2233-43.Rdata")) # ans +setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk +old = options(datatable.verbose=TRUE) +testnum = 2233.43 +for (opt in c(0,Inf)) { + options(datatable.optimize=opt) + out = if (opt) "GForce.*gsum" else "GForce FALSE" + test(testnum, + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output=out) + testnum = 2233.44 } +options(old) +# support by=.I; #1732 +DT = data.table(V1=1:5, V2=3:7, V3=5:1) +test(2234.1, DT[, min(.SD), by=.I], setnames(DT[, min(.SD), by=1:nrow(DT)], "nrow", "I")) +test(2234.2, DT[, min(.SD), by=.I], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +# works also with i +test(2234.3, DT[c(1,3,5), min(.SD), by=.I], data.table(I=c(1L, 3L, 5L), V1=c(1L, 3L, 1L))) +test(2234.4, DT[c(4, NA), min(.SD), by=.I], data.table(I=c(4L, NA), V1=c(2L, NA))) +# other writing styles of by=.I +test(2234.5, DT[, min(.SD), by=.(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.6, DT[, min(.SD), by=list(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.7, DT[, min(.SD), by=c(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.8, DT[, min(.SD), by=.I%%2L], error="by.*contains .I.*supported") # would be nice to support in future; i.e. by odd/even rows, and by=(.I+1L)%/%2L for pairs of rows; i.e. 
any expression of .I +test(2234.9, DT[, min(.SD), by=somefun(.I)], error="by.*contains .I.*supported") + +# copying values of j could lead to recycling if j is a list containing NULL #5284 +DT = data.table(x = 1) +test(2235.1, copy(DT)[, c("z", "x") := {x = NULL; list(2, NULL)}], data.table(z = 2)) +test(2235.2, copy(DT)[, c("z", "x") := {list(2, NULL)}], data.table(z = 2)) + +# move IDate from POSIXlt to C, add yearquarter; #649 +x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01") +test(2236.1, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L)) +test(2236.2, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L)) +test(2236.3, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L)) +test(2236.4, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L)) +test(2236.5, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L)) +test(2236.6, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L)) +test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L)) +test(2236.8, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12)) +test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100)) + +# as.data.table() no longer ignores row.names=, #5319 +dt = data.table(a=1:2, b=3:4) +df = structure(list(a=1:2, b=3:4), row.names=c("x", "y"), class="data.frame") +test(2237.1, as.data.frame(dt, row.names=c("x", "y")), df) +df = data.frame(a=1:2, b=3:4) +test(2237.2, as.data.frame(dt, row.names=NULL), df) + +# Test new feature %notin%, #4152 +test(2238.1, 11 %notin% 1:10, TRUE) +test(2238.2, "a" %notin% c(), TRUE) +test(2238.3, "a" %notin% c("a", "b", "c"), FALSE) +test(2238.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) +test(2238.5, "a" %notin% character(), TRUE) +test(2238.6, "a" 
%notin% integer(), TRUE) +test(2238.7, "a" %notin% NULL, TRUE) +test(2238.8, NA %notin% 1:5, TRUE) +test(2238.9, NA %notin% c(1:5, NA), FALSE) diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index 03e464c36..6854f59ae 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -41,13 +41,27 @@ \alias{month} \alias{quarter} \alias{year} +\alias{yearmon} +\alias{yearqtr} \alias{IDate-class} \alias{ITime-class} \title{ Integer based date class } \description{ - Date and time classes with integer storage for fast sorting and - grouping. Still experimental! + Classes (\code{IDate} and \code{ITime}) with \emph{integer} storage + for fast sorting and grouping. + + \code{IDate} inherits from the base class \code{Date}; the main + difference is that the latter uses double storage, allowing e.g. for + fractional dates at the cost of storage & sorting inefficiency. + + Using \code{IDate}, if sub-day granularity is needed, use a second + \code{ITime} column. \code{IDateTime()} facilitates building such + paired columns. + + Lastly, there are date-time helpers for extracting parts of dates as + integers, for example the year (\code{year()}), month + (\code{month()}), or day in the month (\code{mday()}); see Usage and Examples. } \usage{ as.IDate(x, \dots) @@ -81,6 +95,8 @@ isoweek(x) month(x) quarter(x) year(x) +yearmon(x) +yearqtr(x) } @@ -92,7 +108,9 @@ year(x) \item{tz}{time zone (see \code{strptime}).} \item{date}{date object convertible with \code{as.IDate}.} \item{time}{time-of-day object convertible with \code{as.ITime}.} - \item{digits}{really \code{units}; one of the units listed for rounding. May be abbreviated.} + \item{digits}{really \code{units}; one of the units listed for + rounding. May be abbreviated. Named \code{digits} for consistency with + the S3 generic.} \item{units}{one of the units listed for truncating. May be abbreviated.} \item{ms}{ For \code{as.ITime} methods, what should be done with sub-second fractions of input?
Valid values are \code{'truncate'} (floor), \code{'nearest'} (round), and \code{'ceil'} (ceiling). See Details. } } @@ -100,7 +118,13 @@ year(x) \code{IDate} is a date class derived from \code{Date}. It has the same internal representation as the \code{Date} class, except the storage mode is integer. \code{IDate} is a relatively simple wrapper, and it -should work in almost all situations as a replacement for \code{Date}. +should work in almost all situations as a replacement for +\code{Date}. The main limitations of integer storage are (1) fractional + dates are not supported (use \code{IDateTime()} instead) and (2) the + range of supported dates is bounded by \code{.Machine$integer.max} + days away from January 1, 1970 (a rather impractical limitation as + these dates are roughly 6 million years in the future/past, but + consider this your caveat). Functions that use \code{Date} objects generally work for \code{IDate} objects. This package provides specific methods for @@ -113,11 +137,10 @@ hours. Because \code{ITime} is stored in seconds, you can add it to a \code{POSIXct} object, but you should not add it to a \code{Date} object. -Conversions to and from \code{Date} and \code{POSIXct} formats are provided. +We also provide S3 methods to convert to and from \code{Date} and \code{POSIXct}. -\code{ITime} does not account for time zones. When converting -\code{ITime} and \code{IDate} to POSIXct with \code{as.POSIXct}, a time -zone may be specified. +\code{ITime} is time zone-agnostic. When converting \code{ITime} and +\code{IDate} to POSIXct with \code{as.POSIXct}, a time zone may be specified. Inputs like \code{'2018-05-15 12:34:56.789'} are ambiguous from the perspective of an \code{ITime} object -- the method of coercion of the 789 milliseconds is controlled by the \code{ms} argument to relevant methods. The default behavior (\code{ms = 'truncate'}) is to use \code{as.integer}, which has the effect of truncating anything after the decimal.
Alternatives are to round to the nearest integer (\code{ms = 'nearest'}) or to round up (\code{ms = 'ceil'}). @@ -145,11 +168,11 @@ functions \code{weekdays}, \code{months}, and \code{quarters} can also be used, but these return character values, so they must be converted to factors for use with data.table. \code{isoweek} is ISO 8601-consistent. -The \code{round} method for IDate's is useful for grouping and plotting. +The \code{round} method for IDate's is useful for grouping and plotting. It can round to weeks, months, quarters, and years. Similarly, the \code{round} and \code{trunc} methods for ITime's are useful for grouping and plotting. -They can round or truncate to hours and minutes. -Note for ITime's with 30 seconds, rounding is inconsistent due to rounding off a 5. +They can round or truncate to hours and minutes. +Note for ITime's with 30 seconds, rounding is inconsistent due to rounding off a 5. See 'Details' in \code{\link{round}} for more information. } @@ -169,9 +192,14 @@ See 'Details' in \code{\link{round}} for more information. and \code{year} return integer values for second, minute, hour, day of year, day of week, day of month, week, month, quarter, and year, respectively. - - These values are all taken directly from the \code{POSIXlt} representation - of \code{x}, with the notable difference that while \code{yday}, \code{wday}, + \code{yearmon} and \code{yearqtr} return double values representing + respectively `year + (month-1) / 12` and `year + (quarter-1) / 4`. + + \code{second}, \code{minute}, \code{hour} are taken directly from + the \code{POSIXlt} representation. + All other values are computed from the underlying integer representation + and comparable with the values of their \code{POSIXlt} representation + of \code{x}, with the notable difference that while \code{yday}, \code{wday}, and \code{mon} are all 0-based, here they are 1-based. 
} @@ -234,7 +262,7 @@ round(seqdates, "months") (seqtimes <- seq(as.ITime("07:00"), as.ITime("08:00"), by = 20)) round(seqtimes, "hours") trunc(seqtimes, "hours") - + } \keyword{utilities} diff --git a/man/as.xts.data.table.Rd b/man/as.xts.data.table.Rd index 1f42cceab..1328229ed 100644 --- a/man/as.xts.data.table.Rd +++ b/man/as.xts.data.table.Rd @@ -2,13 +2,14 @@ \alias{as.xts.data.table} \title{Efficient data.table to xts conversion} \description{ - Efficient conversion of data.table to xts, data.table must have \emph{POSIXct} or \emph{Date} type in first column. + Efficient conversion of data.table to xts, data.table must have a time based type in first column. See ?xts::timeBased for supported types } \usage{ -as.xts.data.table(x, \dots) +as.xts.data.table(x, numeric.only = TRUE, \dots) } \arguments{ -\item{x}{data.table to convert to xts, must have \emph{POSIXct} or \emph{Date} in the first column. All others non-numeric columns will be omitted with warning.} +\item{x}{data.table to convert to xts, must have a time based first column. As xts objects are indexed matrixes, all columns must be of the same type. If columns of multiple types are selected, standard as.matrix rules are applied during the conversion. } +\item{numeric.only}{If TRUE, only include numeric columns in the conversion and all non-numeric columns will be omitted with warning} \item{\dots}{ignored, just for consistency with generic method.} } \seealso{ \code{\link{as.data.table.xts}} } diff --git a/man/assign.Rd b/man/assign.Rd index f62275560..bb87a5221 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -1,23 +1,31 @@ \name{:=} \alias{:=} \alias{set} +\alias{let} \title{ Assignment by reference } \description{ Fast add, remove and update subsets of columns, by reference. \code{:=} operator can be used in two ways: \code{LHS := RHS} form, and \code{Functional form}. See \code{Usage}. \code{set} is a low-overhead loop-able version of \code{:=}. 
It is particularly useful for repetitively updating rows of certain columns by reference (using a for-loop). See \code{Examples}. It can not perform grouping operations. + \code{let} is an alias for the functional form and behaves exactly like \code{`:=`}. + } \usage{ # 1. LHS := RHS form # DT[i, LHS := RHS, by = ...] # DT[i, c("LHS1", "LHS2") := list(RHS1, RHS2), by = ...] -# 2. Functional form +# 2a. Functional form with `:=` # DT[i, `:=`(LHS1 = RHS1, # LHS2 = RHS2, # ...), by = ...] +# 2b. Functional form with let +# DT[i, let(LHS1 = RHS1, +# LHS2 = RHS2, +# ...), by = ...] + set(x, i = NULL, j, value) } \arguments{ @@ -42,6 +50,7 @@ set(x, i = NULL, j, value) DT[i, (colvector) := val] # same (NOW PREFERRED) shorthand syntax. The parens are enough to stop the LHS being a symbol; same as c(colvector). DT[i, colC := mean(colB), by = colA] # update (or add) column called "colC" by reference by group. A major feature of `:=`. DT[,`:=`(new1 = sum(colB), new2 = sum(colC))] # Functional form + DT[, let(new1 = sum(colB), new2 = sum(colC))] # New alias for functional form. } The \code{\link{.Last.updated}} variable contains the number of rows updated by the most recent \code{:=} or \code{set} calls, which may be useful, for example, in production settings for testing assumptions about the number of rows affected by a statement; see \code{\link{.Last.updated}} for details. 
diff --git a/man/data.table.Rd b/man/data.table.Rd index 7ec8cec3a..ecc79e2a5 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -453,8 +453,8 @@ if (interactive()) { # keep up to date with latest stable version on CRAN update.packages() - # get the latest devel version - update.dev.pkg() + # get the latest devel version that has passed all tests + update_dev_pkg() # read more at: # https://github.com/Rdatatable/data.table/wiki/Installation } diff --git a/man/duplicated.Rd b/man/duplicated.Rd index a9c333beb..daf7c39d5 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -28,7 +28,8 @@ memory efficient. \usage{ \method{duplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) -\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) +\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, +by=seq_along(x), cols=NULL, \dots) \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) @@ -46,6 +47,8 @@ correspond to \code{duplicated = FALSE}.} of columns from \code{x} to use for uniqueness checks. By default all columns are being used. That was changed recently for consistency to data.frame methods. In version \code{< 1.9.8} default was \code{key(x)}.} +\item{cols}{Columns (in addition to \code{by}) from \code{x} to include in the + resulting \code{data.table}.} \item{na.rm}{Logical (default is \code{FALSE}). Should missing values (including \code{NaN}) be removed?} } @@ -59,7 +62,11 @@ handle cases where limitations in floating point representation is undesirable. \code{v1.9.4} introduces \code{anyDuplicated} method for data.tables and is similar to base in functionality. It also implements the logical argument -\code{fromLast} for all three functions, with default value \code{FALSE}. +\code{fromLast} for all three functions, with default value +\code{FALSE}. 
+ +Note: When \code{cols} is specified, the resulting table will have +columns \code{c(by, cols)}, in that order. } \value{ \code{duplicated} returns a logical vector of length \code{nrow(x)} diff --git a/man/fdroplevels.Rd b/man/fdroplevels.Rd new file mode 100644 index 000000000..98334f011 --- /dev/null +++ b/man/fdroplevels.Rd @@ -0,0 +1,48 @@ +\name{fdroplevels} +\alias{fdroplevels} +\alias{droplevels} +\alias{droplevels.data.table} +\title{Fast droplevels} +\description{ + Similar to \code{base::droplevels} but \emph{much faster}. +} + +\usage{ +fdroplevels(x, exclude = if (anyNA(levels(x))) NULL else NA, \dots) + +\method{droplevels}{data.table}(x, except = NULL, exclude, in.place = FALSE, \dots) +} +\arguments{ + \item{x}{ \code{factor} or \code{data.table} where unused levels should be dropped. } + \item{exclude}{ A \code{character} vector of factor levels which are dropped regardless of whether they are present or not. } + \item{except}{ An \code{integer} vector of indices of data.table columns which are not modified by dropping levels. } + \item{in.place}{ logical (default is \code{FALSE}). If \code{TRUE} levels of factors of \code{data.table} are modified in-place. } + \item{\dots}{ further arguments passed to methods } +} + +\value{ + \code{fdroplevels} returns a \code{factor}. + + \code{droplevels} returns a \code{data.table} where unused levels are dropped from factor columns.
+} + +\examples{ +# on vectors +x = factor(letters[1:10]) +fdroplevels(x[1:5]) +# exclude levels from drop +fdroplevels(x[1:5], exclude = c("a", "c")) + +# on data.table +DT = data.table(a = factor(1:10), b = factor(letters[1:10])) +droplevels(head(DT))[["b"]] +# exclude levels +droplevels(head(DT), exclude = c("b", "c"))[["b"]] +# except columns from drop +droplevels(head(DT), except = 2)[["b"]] +droplevels(head(DT), except = 1)[["b"]] +} +\seealso{ + \code{\link{data.table}}, \code{\link{duplicated}}, \code{\link{unique}} +} +\keyword{ data } diff --git a/man/fread.Rd b/man/fread.Rd index c7b7da856..cc96062de 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -271,42 +271,13 @@ system.time(DT <- fread("testbig.csv")) all(mapply(all.equal, DF, DT)) - -# Real data example (Airline data) -# https://stat-computing.org/dataexpo/2009/the-data.html - -download.file("https://stat-computing.org/dataexpo/2009/2008.csv.bz2", - destfile="2008.csv.bz2") -# 109MB (compressed) - -system("bunzip2 2008.csv.bz2") -# 658MB (7,009,728 rows x 29 columns) - -colClasses = sapply(read.csv("2008.csv",nrows=100,stringsAsFactors=FALSE),class) -# 4 character, 24 integer, 1 logical. Incorrect. - -colClasses = sapply(read.csv("2008.csv",nrows=200,stringsAsFactors=FALSE),class) -# 5 character, 24 integer. Correct. Might have missed data only using 100 rows -# since read.table assumes colClasses is correct. - -system.time(DF <- read.table("2008.csv", header=TRUE, sep=",", - quote="",stringsAsFactors=FALSE,comment.char="",nrows=7009730, - colClasses=colClasses)) -# 24.4 secs - -system.time(DT <- fread("2008.csv")) -# 1.9 secs - -table(sapply(DT,class)) -# 5 character and 24 integer columns. Correct without needing to worry about colClasses -# issue above. 
- - # Reads URLs directly : fread("https://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat") # Decompresses .gz and .bz2 automatically : -fread("https://stat-computing.org/dataexpo/2009/1987.csv.bz2") +fread("https://github.com/Rdatatable/data.table/raw/1.14.0/inst/tests/ch11b.dat.bz2") + +fread("https://github.com/Rdatatable/data.table/raw/1.14.0/inst/tests/issue_785_fread.txt.gz") } } diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 870acaac7..ba6eb4751 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -64,7 +64,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writting raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } } \details{ -\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://www.h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. +\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/fast-csv-writing-for-r/}. 
Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. To save space, \code{fwrite} prefers to write wide numeric values in scientific notation -- e.g. \code{10000000000} takes up much more space than \code{1e+10}. Most file readers (e.g. \code{\link{fread}}) understand scientific notation, so there's no fidelity loss. Like in base R, users can control this by specifying the \code{scipen} argument, which follows the same rules as \code{\link[base]{options}('scipen')}. \code{fwrite} will see how much space a value will take to write in scientific vs. decimal notation, and will only write in scientific notation if the latter is more than \code{scipen} characters wider. For \code{10000000000}, then, \code{1e+10} will be written whenever \code{scipen<6}. 
diff --git a/man/merge.Rd b/man/merge.Rd index 6fcbc1086..d8246668c 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE -\dots) +incomparables = NULL, \dots) } \arguments{ @@ -51,6 +51,7 @@ fashion as the \code{\link{merge.data.frame}} method does.} non-\code{by.y} column names in \code{y} when they have the same column name as any \code{by.x}.} \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.} +\item{incomparables}{values which cannot be matched and therefore are excluded from by columns.} \item{\dots}{Not used at this time.} } @@ -125,6 +126,12 @@ setnames(d2, "a", "b") merge(d1, d2, by.x="a", by.y="b") merge(d1, d2, by.x="a", by.y="b", all=TRUE) merge(d2, d1, by.x="b", by.y="a") + +# using incomparables values +d1 <- data.table(a=c(1,2,NA,NA,3,1), z=1:6) +d2 <- data.table(a=c(1,2,NA), z=10:12) +merge(d1, d2, by="a") +merge(d1, d2, by="a", incomparables=NA) } \keyword{ data } diff --git a/man/notin.Rd b/man/notin.Rd new file mode 100644 index 000000000..d84bb2024 --- /dev/null +++ b/man/notin.Rd @@ -0,0 +1,33 @@ +\name{notin} +\alias{\%notin\%} + +\title{ +Convenience operator for checking if an element is not in a set of elements +} + +\description{ +Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[base:match]{\%in\%}}. +} + +\usage{ +x \%notin\% table +} + +\arguments{ + \item{x}{ Vector or \code{NULL}: the values to be matched. } + \item{table}{ Vector or \code{NULL}: the values to be matched against.
} +} + + +\value{ + Logical vector, \code{TRUE} for each element of \code{x} \emph{absent} from \code{table}, and \code{FALSE} for each element of \code{x} \emph{present} in \code{table}. +} + +\seealso{ \code{\link[base]{match}}, \code{\link[data.table]{chmatch}} } + + +\examples{ + 11 \%notin\% 1:10 # TRUE + "a" \%notin\% c("a", "b") # FALSE +} + diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index 234fcd8ff..b4929e789 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -18,10 +18,10 @@ \method{print}{data.table}(x, topn=getOption("datatable.print.topn"), # default: 5 nrows=getOption("datatable.print.nrows"), # default: 100 - class=getOption("datatable.print.class"), # default: FALSE + class=getOption("datatable.print.class"), # default: TRUE row.names=getOption("datatable.print.rownames"), # default: TRUE col.names=getOption("datatable.print.colnames"), # default: "auto" - print.keys=getOption("datatable.print.keys"), # default: FALSE + print.keys=getOption("datatable.print.keys"), # default: TRUE trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, timezone=FALSE, \dots) @@ -57,7 +57,7 @@ \details{ By default, with an eye to the typically large number of observations in a \code{data.table}, only the beginning and end of the object are displayed (specifically, \code{head(x, topn)} and \code{tail(x, topn)} are displayed unless \code{nrow(x) < nrows}, in which case all rows will print). - \code{format_col} is applied at a column level; for example, \code{format_col.POSIXct} is used to tag the time zones of \code{POSIXct} columns. \code{format_list_item} is applied to the elements (rows) of \code{list} columns; see Examples. + \code{format_col} is applied at a column level; for example, \code{format_col.POSIXct} is used to tag the time zones of \code{POSIXct} columns. \code{format_list_item} is applied to the elements (rows) of \code{list} columns; see Examples. 
The default \code{format_col} method uses \code{\link[utils]{getS3method}} to test if a \code{format} method exists for the column, and if so uses it. Otherwise, the default \code{format_list_item} method uses the S3 format method (if one exists) for each item of a \code{list} column. } \seealso{\code{\link{print.default}}} \examples{ diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 192fb5135..2ba39a2a9 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -13,7 +13,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) \arguments{ \item{l}{ A list containing \code{data.table}, \code{data.frame} or \code{list} objects. \code{\dots} is the same but you pass the objects by name separately. } \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} - \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.} + \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} } \details{ diff --git a/man/shift.Rd b/man/shift.Rd index c710ab368..219b8f3d8 100644 --- a/man/shift.Rd +++ b/man/shift.Rd @@ -10,14 +10,14 @@ } \usage{ -shift(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FALSE) +shift(x, n=1L, fill, type=c("lag", "lead", "shift", "cyclic"), give.names=FALSE) } \arguments{ \item{x}{ A vector, list, data.frame or data.table. 
} \item{n}{ integer vector denoting the offset by which to lead or lag the input. To create multiple lead/lag vectors, provide multiple values to \code{n}; negative values of \code{n} will "flip" the value of \code{type}, i.e., \code{n=-1} and \code{type='lead'} is the same as \code{n=1} and \code{type='lag'}. } - \item{fill}{ Value to use for padding when the window goes beyond the input length. } - \item{type}{ default is \code{"lag"} (look "backwards"). The other possible values \code{"lead"} (look "forwards") and \code{"shift"} (behave same as \code{"lag"} except given names). } - \item{give.names}{default is \code{FALSE} which returns an unnamed list. When \code{TRUE}, names are automatically generated corresponding to \code{type} and \code{n}. If answer is an atomic vector, then the argument is ignored. } + \item{fill}{ default is \code{NA}. Value to use for padding when the window goes beyond the input length. } + \item{type}{ default is \code{"lag"} (look "backwards"). The other possible values \code{"lead"} (look "forwards"), \code{"shift"} (behave same as \code{"lag"} except given names) and \code{"cyclic"} where pushed out values are re-introduced at the front/back. } + \item{give.names}{ default is \code{FALSE} which returns an unnamed list. When \code{TRUE}, names are automatically generated corresponding to \code{type} and \code{n}. If answer is an atomic vector, then the argument is ignored. } } \details{ \code{shift} accepts vectors, lists, data.frames or data.tables. It always returns a list except when the input is a \code{vector} and \code{length(n) == 1} in which case a \code{vector} is returned, for convenience. This is so that it can be used conveniently within data.table's syntax. For example, \code{DT[, (cols) := shift(.SD, 1L), by=id]} would lag every column of \code{.SD} by 1 for each group and \code{DT[, newcol := colA + shift(colB)]} would assign the sum of two \emph{vectors} to \code{newcol}. 
@@ -40,6 +40,8 @@ shift(x, n=1:2, fill=0, type="lag") # getting a window by using positive and negative n: shift(x, n = -1:1) shift(x, n = -1:1, type = "shift", give.names = TRUE) +# cyclic shift where pad uses pushed out values +shift(x, n = -1:1, type = "cyclic") # on data.tables DT = data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5]) diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 9bfa72fce..c96cbef5c 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -10,7 +10,7 @@ \alias{.NGRP} \title{ Special symbols } \description{ - \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. + \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. } \details{ @@ -22,13 +22,13 @@ \item{\code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}).} \item{\code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable.} \item{\code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. 
When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}.} - \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}.} + \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. If used in \code{by} it corresponds to applying a function rowwise. } \item{\code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc.} \item{\code{.NGRP} is an integer, length 1, containing the number of groups. } } \code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. - + Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. 
} \seealso{ @@ -43,10 +43,10 @@ X DT[.N] # last row, only special symbol allowed in 'i' DT[, .N] # total number of rows in DT DT[, .N, by=x] # number of rows in each group -DT[, .SD, .SDcols=x:y] # select columns 'x' and 'y' +DT[, .SD, .SDcols=x:y] # select columns 'x' through 'y' DT[, .SD[1]] # first row of all columns -DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x' -DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group +DT[, .SD[1], by=x] # first row of all columns for each group in 'x' +DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum all columns by group DT[, .I[1], by=x] # row number in DT corresponding to each group DT[, .N, by=rleid(v)] # get count of consecutive runs of 'v' DT[, c(.(y=max(y)), lapply(.SD, min)), @@ -58,5 +58,9 @@ X[, DT[.BY, y, on="x"], by=x] # join within each group # .N can be different in i and j DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2}, {cat(sprintf('in j, .N is \%d\n', .N)); mean(a)}] + +# .I can be different in j and by, enabling rowwise operations in by +DT[, .(.I, min(.SD[,-1]))] +DT[, .(min(.SD[,-1])), by=.I] } \keyword{ data } diff --git a/man/tables.Rd b/man/tables.Rd index 5b95edffa..a8a74b0a7 100644 --- a/man/tables.Rd +++ b/man/tables.Rd @@ -5,11 +5,11 @@ Convenience function for concisely summarizing some metadata of all \code{data.table}s in memory (or an optionally specified environment). } \usage{ -tables(mb=TRUE, order.col="NAME", width=80, +tables(mb=type_size, order.col="NAME", width=80, env=parent.frame(), silent=FALSE, index=FALSE) } \arguments{ - \item{mb}{ \code{logical}; \code{TRUE} adds the rough size of each \code{data.table} in megabytes to the output under column \code{MB}. } + \item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. 
By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). A column \code{"MB"} is included in the output unless \code{FALSE} or \code{NULL}. } \item{order.col}{ Column name (\code{character}) by which to sort the output. } \item{width}{ \code{integer}; number of characters beyond which the output for each of the columns \code{COLS}, \code{KEY}, and \code{INDICES} are truncated. } \item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. } @@ -19,9 +19,9 @@ tables(mb=TRUE, order.col="NAME", width=80, \details{ Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt. -Note that on older versions of \R, \code{object.size} may be slow, so setting \code{mb=FALSE} may speed up execution of \code{tables} significantly. +\code{mb = utils::object.size} provides a higher and more accurate estimate of size, but may take longer. Its default \code{units="b"} is appropriate. -Setting \code{silent=TRUE} prints nothing; the metadata are returned as a \code{data.table}, invisibly, whether silent is \code{TRUE} or \code{FALSE}. +Setting \code{silent=TRUE} prints nothing; the metadata is returned as a \code{data.table} invisibly whether \code{silent} is \code{TRUE} or \code{FALSE}. } \value{ A \code{data.table} containing the information printed. 
diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index ba0fe25f9..c36e5f9d4 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -7,7 +7,9 @@ \usage{ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, - showProgress = interactive() && !silent) + showProgress = interactive() && !silent, + memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), + memtest.id = NULL) } \arguments{ \item{script}{ Run arbitrary R test script. } @@ -15,6 +17,8 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } +\item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } +\item{memtest.id}{ An id for which to print memory usage for every sub id. May be a range of ids. } } \details{ Runs a series of tests. These can be used to see features and examples of usage, too. Running test.data.table will tell you the full location of the test file(s) to open. 
diff --git a/man/update.dev.pkg.Rd b/man/update_dev_pkg.Rd similarity index 69% rename from man/update.dev.pkg.Rd rename to man/update_dev_pkg.Rd index 72b6e7b16..3db5b9831 100644 --- a/man/update.dev.pkg.Rd +++ b/man/update_dev_pkg.Rd @@ -1,11 +1,10 @@ -\name{update.dev.pkg} -\alias{update} -\alias{update.dev.pkg} +\name{update_dev_pkg} +\alias{update_dev_pkg} \title{Perform update of development version of a package} \description{ - It will download and install package from devel repository only when new commit is available there, otherwise only PACKAGES file is transferred. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. + Downloads and installs latest development version only when a new commit is available which has also passed all tests. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. } -\usage{\method{update}{dev.pkg}(object="data.table", +\usage{update_dev_pkg(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } @@ -30,7 +29,8 @@ NULL. } \examples{ - # data.table::update.dev.pkg() +\dontshow{ # using if(FALSE) because \dontrun could still be run by --run-dontrun; #5421 } + if (FALSE) data.table::update_dev_pkg() } \seealso{ \code{\link{data.table}} diff --git a/po/R-data.table.pot b/po/R-data.table.pot index ad00f1277..2fe5f0b7c 100644 --- a/po/R-data.table.pot +++ b/po/R-data.table.pot @@ -1567,7 +1567,7 @@ msgstr "" msgid "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. 
This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********" msgstr "" -msgid "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********" +msgid "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update_dev_pkg()\n**********" msgstr "" msgid "**********" diff --git a/po/R-zh_CN.po b/po/R-zh_CN.po index 7e78584fd..105b94145 100644 --- a/po/R-zh_CN.po +++ b/po/R-zh_CN.po @@ -2161,11 +2161,11 @@ msgstr "" msgid "" "**********\n" "This development version of data.table was built more than 4 weeks ago. " -"Please update: data.table::update.dev.pkg()\n" +"Please update: data.table::update_dev_pkg()\n" "**********" msgstr "" "**********这个data.table的开发版本是在4个多星期之前构建的。请更新版本:data." -"table::update.dev.pkg()\n" +"table::update_dev_pkg()\n" "**********" msgid "**********" diff --git a/src/assign.c b/src/assign.c index d0faf337c..61f38a554 100644 --- a/src/assign.c +++ b/src/assign.c @@ -684,6 +684,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) #define MSGSIZE 1000 static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects +const char *targetDesc(const int colnum, const char *colname) { + static char str[501]; // #5463 + snprintf(str, 500, colnum==0 ? 
_("target vector") : _("column %d named '%s'"), colnum, colname); + return str; +} + const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname) // like memcpy but recycles single-item source // 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer() @@ -707,8 +713,6 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (colname==NULL) error(_("Internal error: memrecycle has received NULL colname")); // # nocov *memrecycle_message = '\0'; - static char targetDesc[501]; // from 1.14.1 coerceAs reuses memrecycle for a target vector, PR#4491 - snprintf(targetDesc, 500, colnum==0 ? _("target vector") : _("column %d named '%s'"), colnum, colname); int protecti=0; const bool sourceIsFactor=isFactor(source), targetIsFactor=isFactor(target); const bool sourceIsI64=isReal(source) && INHERITS(source, char_integer64); @@ -730,7 +734,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel) { - error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc, val, nlevel); + error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc(colnum, colname), val, nlevel); } } } else { @@ -738,7 +742,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel)) { - error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc, val, nlevel); + error(_("Assigning factor numbers to %s. 
But %f is outside the level range [1,%d], or is not a whole number."), targetDesc(colnum, colname), val, nlevel); } } } @@ -830,19 +834,19 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } } } else if (isString(source) && !isString(target) && !isNewList(target)) { - warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc); + warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), targetIsI64?"integer64":type2char(TYPEOF(target)), targetDesc(colnum, colname)); // this "Coercing ..." warning first to give context in case coerceVector warns 'NAs introduced by coercion' // and also because 'character' to integer/double coercion is often a user mistake (e.g. wrong target column, or wrong // variable on RHS) which they are more likely to appreciate than find inconvenient source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; } else if (isNewList(source) && !isNewList(target)) { if (targetIsI64) { - error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc); + error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc(colnum, colname)); // because R's coerceVector doesn't know about integer64 } // as in base R; e.g. 
let as.double(list(1,2,3)) work but not as.double(list(1,c(2,4),3)) // relied on by NNS, simstudy and table.express; tests 1294.* - warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc); + warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc(colnum, colname)); source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; } else if ((TYPEOF(target)!=TYPEOF(source) || targetIsI64!=sourceIsI64) && !isNewList(target)) { if (GetVerbose()>=3) { @@ -850,98 +854,106 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con Rprintf(_("Zero-copy coerce when assigning '%s' to '%s' %s.\n"), sourceIsI64 ? "integer64" : type2char(TYPEOF(source)), targetIsI64 ? "integer64" : type2char(TYPEOF(target)), - targetDesc); + targetDesc(colnum, colname)); } // The following checks are up front here, otherwise we'd need them twice in the two branches // inside BODY that cater for 'where' or not. Maybe there's a way to merge the two macros in future. 
// The idea is to do these range checks without calling coerceVector() (which allocates) -#define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO) {{ \ - const STYPE *sd = (const STYPE *)RFUN(source); \ - for (int i=0; i255, "d", "taken as 0") + case INTSXP: CHECK_RANGE(int, INTEGER, val<0 || val>255, "d", "taken as 0", val) case REALSXP: if (sourceIsI64) - CHECK_RANGE(int64_t, REAL, val<0 || val>255, PRId64, "taken as 0") - else CHECK_RANGE(double, REAL, !R_FINITE(val) || val<0.0 || val>256.0 || (int)val!=val, "f", "either truncated (precision lost) or taken as 0") + CHECK_RANGE(int64_t, REAL, val<0 || val>255, PRId64, "taken as 0", val) + else CHECK_RANGE(double, REAL, !R_FINITE(val) || val<0.0 || val>256.0 || (int)val!=val, "f", "either truncated (precision lost) or taken as 0", val) } break; case INTSXP: - if (TYPEOF(source)==REALSXP) { - if (sourceIsI64) - CHECK_RANGE(int64_t, REAL, val!=NA_INTEGER64 && (val<=NA_INTEGER || val>INT_MAX), PRId64, "out-of-range (NA)") - else CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)") + switch (TYPEOF(source)) { + case REALSXP: if (sourceIsI64) + CHECK_RANGE(int64_t, REAL, val!=NA_INTEGER64 && (val<=NA_INTEGER || val>INT_MAX), PRId64, "out-of-range (NA)", val) + else CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)", val) + case CPLXSXP: CHECK_RANGE(Rcomplex, COMPLEX, !((ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)) && + (ISNAN(val.r) || (R_FINITE(val.r) && (int)val.r==val.r))), "f", "either imaginary part discarded or real part truncated (precision lost)", val.r) } break; case REALSXP: - if (targetIsI64 && isReal(source) && !sourceIsI64) { - CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)") + switch (TYPEOF(source)) { + case REALSXP: if (targetIsI64 && !sourceIsI64) + CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", 
"truncated (precision lost)", val) + break; + case CPLXSXP: if (targetIsI64) + CHECK_RANGE(Rcomplex, COMPLEX, !((ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)) && + (ISNAN(val.r) || (R_FINITE(val.r) && (int64_t)val.r==val.r))), "f", "either imaginary part discarded or real part truncated (precision lost)", val.r) + else CHECK_RANGE(Rcomplex, COMPLEX, !(ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)), "f", "imaginary part discarded", val.i) } } } -#undef BODY -#define BODY(STYPE, RFUN, CTYPE, CAST, ASSIGN) {{ \ - const STYPE *sd = (const STYPE *)RFUN(source); \ - if (length(where)) { \ - if (slen==1) { \ - const STYPE val = sd[soff]; \ - const CTYPE cval = CAST; \ - for (int wi=0; wi0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls) @@ -992,6 +1004,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (sourceIsI64) BODY(int64_t, REAL, int, (val==NA_INTEGER64||val>INT_MAX||val<=NA_INTEGER) ? NA_INTEGER : (int)val, td[i]=cval) else BODY(double, REAL, int, ISNAN(val) ? NA_INTEGER : (int)val, td[i]=cval) + case CPLXSXP: BODY(Rcomplex, COMPLEX, int, ISNAN(val.r) ? NA_INTEGER : (int)val.r, td[i]=cval) default: COERCE_ERROR("integer"); // test 2005.4 } } break; @@ -1008,6 +1021,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con memcpy(td, (int64_t *)REAL(source), slen*sizeof(int64_t)); break; } else BODY(int64_t, REAL, int64_t, val, td[i]=cval) } else BODY(double, REAL, int64_t, R_FINITE(val) ? val : NA_INTEGER64, td[i]=cval) + case CPLXSXP: BODY(Rcomplex, COMPLEX, int64_t, ISNAN(val.r) ? 
NA_INTEGER64 : (int64_t)val.r, td[i]=cval) default: COERCE_ERROR("integer64"); } } else { @@ -1022,6 +1036,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con memcpy(td, (double *)REAL(source), slen*sizeof(double)); break; } else BODY(double, REAL, double, val, td[i]=cval) } else BODY(int64_t, REAL, double, val==NA_INTEGER64 ? NA_REAL : val, td[i]=cval) + case CPLXSXP: BODY(Rcomplex, COMPLEX, double, val.r, td[i]=cval) default: COERCE_ERROR("double"); } } @@ -1060,9 +1075,13 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } break; } - if (sourceIsI64) - error(_("To assign integer64 to a target of type character, please use as.character() for clarity.")); // TODO: handle that here as well - source = PROTECT(coerceVector(source, STRSXP)); protecti++; + if (OBJECT(source) && getAttrib(source, R_ClassSymbol)!=R_NilValue) { + // otherwise coerceVector doesn't call the as.character method for Date, IDate, integer64, nanotime, etc; PR#5189 + // this if() is to save the overhead of the R call eval() when we know there can be no method + source = PROTECT(eval(PROTECT(lang2(sym_as_character, source)), R_GlobalEnv)); protecti+=2; + } else { + source = PROTECT(coerceVector(source, STRSXP)); protecti++; + } } BODY(SEXP, STRING_PTR, SEXP, val, SET_STRING_ELT(target, off+i, cval)) } @@ -1175,7 +1194,7 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { static SEXP *saveds=NULL; static R_len_t *savedtl=NULL, nalloc=0, nsaved=0; -void savetl_init() { +void savetl_init(void) { if (nsaved || nalloc || saveds || savedtl) { error(_("Internal error: savetl_init checks failed (%d %d %p %p). please report to data.table issue tracker."), nsaved, nalloc, saveds, savedtl); // # nocov } @@ -1216,7 +1235,7 @@ void savetl(SEXP s) nsaved++; } -void savetl_end() { +void savetl_end(void) { // Can get called if nothing has been saved yet (nsaved==0), or even if _init() hasn't been called yet (pointers NULL). 
Such // as to clear up before error. Also, it might be that nothing needed to be saved anyway. for (int i=0; i #include @@ -6,11 +7,13 @@ # define USE_RINTERNALS // #3301 # define DATAPTR_RO(x) ((const void *)DATAPTR(x)) #endif +#if !defined(R_VERSION) || R_VERSION < R_Version(3, 4, 0) +# define SET_GROWABLE_BIT(x) // #3292 +#endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT #include // for uint64_t rather than unsigned long long #include -#include "myomp.h" #include "types.h" #include "po.h" #ifdef WIN32 // positional specifiers (%n$) used in translations; #4402 @@ -103,6 +106,7 @@ extern SEXP sym_datatable_locked; extern SEXP sym_tzone; extern SEXP sym_old_fread_datetime_character; extern SEXP sym_variable_table; +extern SEXP sym_as_character; extern double NA_INT64_D; extern long long NA_INT64_LL; extern Rcomplex NA_CPLX; // initialized in init.c; see there for comments @@ -111,7 +115,7 @@ extern size_t __typeorder[100]; // __ prefix otherwise if we use these names dir long long DtoLL(double x); double LLtoD(long long x); -int GetVerbose(); +int GetVerbose(void); // cj.c SEXP cj(SEXP base_list); @@ -124,14 +128,14 @@ SEXP growVector(SEXP x, R_len_t newlen); SEXP allocNAVector(SEXPTYPE type, R_len_t n); SEXP allocNAVectorLike(SEXP x, R_len_t n); void writeNA(SEXP v, const int from, const int n, const bool listNA); -void savetl_init(), savetl(SEXP s), savetl_end(); +void savetl_init(void), savetl(SEXP s), savetl_end(void); int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); -int getNumericRounding_C(); +int getNumericRounding_C(void); // reorder.c SEXP reorder(SEXP x, SEXP order); @@ -188,12 +192,12 @@ double iquickselect(int *x, int n); double i64quickselect(int64_t *x, int n); // fread.c -double wallclock(); +double wallclock(void); // openmp-utils.c 
-void initDTthreads(); +void initDTthreads(void); int getDTthreads(const int64_t n, const bool throttle); -void avoid_openmp_hang_within_fork(); +void avoid_openmp_hang_within_fork(void); // froll.c void frollmean(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); @@ -258,3 +262,78 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...); // programming.c SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); + +//negate.c +SEXP notchin(SEXP x, SEXP table); + +// functions called from R level .Call/.External and registered in init.c +// these now live here to pass -Wstrict-prototypes, #5477 +// all arguments must be SEXP since they are called from R level +// where there are no arguments, it must be (void) not () to be a strict prototype +SEXP setattrib(SEXP, SEXP, SEXP); +SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP copy(SEXP); +SEXP alloccolwrapper(SEXP, SEXP, SEXP); +SEXP selfrefokwrapper(SEXP, SEXP); +SEXP truelength(SEXP); +SEXP setcharvec(SEXP, SEXP, SEXP); +SEXP chmatch_R(SEXP, SEXP, SEXP); +SEXP chmatchdup_R(SEXP, SEXP, SEXP); +SEXP chin_R(SEXP, SEXP); +SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP rbindlist(SEXP, SEXP, SEXP, SEXP); +SEXP setlistelt(SEXP, SEXP, SEXP); +SEXP address(SEXP); +SEXP expandAltRep(SEXP); +SEXP fmelt(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP issorted(SEXP, SEXP); +SEXP gforce(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP gsum(SEXP, SEXP); +SEXP gmean(SEXP, SEXP); +SEXP gmin(SEXP, SEXP); +SEXP gmax(SEXP, SEXP); +SEXP setNumericRounding(SEXP); +SEXP getNumericRounding(void); +SEXP binary(SEXP); +SEXP 
subsetDT(SEXP, SEXP, SEXP); +SEXP convertNegAndZeroIdx(SEXP, SEXP, SEXP, SEXP); +SEXP frank(SEXP, SEXP, SEXP, SEXP); +SEXP lookup(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP overlaps(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP whichwrapper(SEXP, SEXP); +SEXP shift(SEXP, SEXP, SEXP, SEXP); +SEXP transpose(SEXP, SEXP, SEXP, SEXP); +SEXP anyNA(SEXP, SEXP); +SEXP setlevels(SEXP, SEXP, SEXP); +SEXP rleid(SEXP, SEXP); +SEXP gmedian(SEXP, SEXP); +SEXP gtail(SEXP, SEXP); +SEXP ghead(SEXP, SEXP); +SEXP glast(SEXP); +SEXP gfirst(SEXP); +SEXP gnthvalue(SEXP, SEXP); +SEXP dim(SEXP); +SEXP gvar(SEXP, SEXP); +SEXP gsd(SEXP, SEXP); +SEXP gprod(SEXP, SEXP); +SEXP gshift(SEXP, SEXP, SEXP, SEXP); +SEXP nestedid(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP setDTthreads(SEXP, SEXP, SEXP, SEXP); +SEXP getDTthreads_R(SEXP); +SEXP nqRecreateIndices(SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP fsort(SEXP, SEXP); +SEXP inrange(SEXP, SEXP, SEXP, SEXP); +SEXP hasOpenMP(void); +SEXP beforeR340(void); +SEXP uniqueNlogical(SEXP, SEXP); +SEXP dllVersion(void); +SEXP initLastUpdated(SEXP); +SEXP allNAR(SEXP); +SEXP test_dt_win_snprintf(void); +SEXP dt_zlib_version(void); +SEXP startsWithAny(SEXP, SEXP, SEXP); +SEXP convertDate(SEXP, SEXP); +SEXP fastmean(SEXP); + diff --git a/src/dogroups.c b/src/dogroups.c index d76889932..5ddd1f672 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -39,6 +39,7 @@ static bool anySpecialStatic(SEXP x) { // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. // Currently the marker is negative truelength. These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. 
+ SEXP attribs, list_el; const int n = length(x); // use length() not LENGTH() because LENGTH() on NULL is segfault in R<3.5 where we still define USE_RINTERNALS // (see data.table.h), and isNewList() is true for NULL @@ -50,8 +51,13 @@ static bool anySpecialStatic(SEXP x) { if (TRUELENGTH(x)<0) return true; // test 2158 for (int i=0; incol; +// convert NA in user-supplied integer vector input to -1 in order to +// trigger error in uniq_diff(). +SEXP na_to_negative(SEXP vec_with_NA){ + SEXP vec_with_negatives = PROTECT(allocVector(INTSXP, length(vec_with_NA))); + for (int i=0; i ncol) - error(_("One or more values in 'id.vars' is invalid.")); - else if (!LOGICAL(booltmp)[i]) targetcols++; - else continue; - } - unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; - u = 0; - for (int i=0; i ncol) - error(_("One or more values in 'id.vars' is invalid.")); - } - idcols = PROTECT(tmp); protecti++; - switch(TYPEOF(measure)) { - case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0)); protecti++; break; - case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; - case INTSXP : tmp2 = measure; break; - case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; - default : error(_("Unknown 'measure.vars' type %s, must be character or integer vector"), type2char(TYPEOF(measure))); - } - tmp = tmp2; - if (isNewList(measure)) { - tmp = PROTECT(unlist_(tmp2)); protecti++; - } - for (int i=0; i> approach to cleanup() on error. 
*/ -static void free_ustr() { +static void free_ustr(void) { for(int i=0; i // ceil, sqrt, isfinite #endif #include -#include "fread.h" #include "freadLookups.h" // Private globals to save passing all of them through to highly iterated field processors @@ -66,8 +67,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL; static lenOff *colNames = NULL; static freadMainArgs args = {0}; // global for use by DTPRINT; static implies ={0} but include the ={0} anyway just in case for valgrind #4639 -const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; -int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; +const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; +int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043 // So we assign them through below init function. @@ -75,7 +76,7 @@ static double NAND; static double INFD; // NAN and INFINITY constants are float, so cast to double once up front. 
-void init() { +void init(void) { NAND = (double)NAN; INFD = (double)INFINITY; } @@ -652,8 +653,8 @@ static void StrtoI64(FieldParseContext *ctx) // TODO: review ERANGE checks and tests; that range outside [1.7e-308,1.7e+308] coerces to [0.0,Inf] /* f = "~/data.table/src/freadLookups.h" -cat("const long double pow10lookup[601] = {\n", file=f, append=FALSE) -for (i in (-300):(299)) cat("1.0E",i,"L,\n", sep="", file=f, append=TRUE) +cat("const long double pow10lookup[301] = {\n", file=f, append=FALSE) +for (i in 0:299) cat("1.0E",i,"L,\n", sep="", file=f, append=TRUE) cat("1.0E300L\n};\n", file=f, append=TRUE) */ @@ -780,12 +781,13 @@ static void parse_double_regular_core(const char **pch, double *target) // fail to be encoded by the compiler, even though the values can actually // be stored correctly. int_fast8_t extra = e < 0 ? e + 300 : e - 300; - r *= pow10lookup[extra + 300]; + r = extra<0 ? r/pow10lookup[-extra] : r*pow10lookup[extra]; e -= extra; } - e += 300; // lookup table is arranged from -300 (0) to +300 (600) - r *= pow10lookup[e]; + // pow10lookup[301] contains 10^(0:300). Storing negative powers there too + // avoids this ternary but is slightly less accurate in some cases, #4461 + r = e < 0 ? r/pow10lookup[-e] : r*pow10lookup[e]; *target = (double)(neg? 
-r : r); *pch = ch; return; @@ -1075,6 +1077,12 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx) *target = NA_FLOAT64; } +static void parse_empty(FieldParseContext *ctx) +{ + int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)]; + *target = NA_BOOL8; +} + /* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */ static void parse_bool_numeric(FieldParseContext *ctx) { @@ -1151,7 +1159,8 @@ static void parse_bool_lowercase(FieldParseContext *ctx) */ typedef void (*reader_fun_t)(FieldParseContext *ctx); static reader_fun_t fun[NUMTYPE] = { - (reader_fun_t) &Field, + (reader_fun_t) &Field, // CT_DROP + (reader_fun_t) &parse_empty, // CT_EMPTY (reader_fun_t) &parse_bool_numeric, (reader_fun_t) &parse_bool_uppercase, (reader_fun_t) &parse_bool_titlecase, @@ -1166,7 +1175,7 @@ static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &Field }; -static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static int disabled_parsers[NUMTYPE] = {0}; static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) { // used in sampling column types and whether column names are present @@ -1275,24 +1284,22 @@ int freadMain(freadMainArgs _args) { while (*nastr) { if (**nastr == '\0') { blank_is_a_NAstring = true; - // if blank is the only one, as is the default, clear NAstrings so that doesn't have to be checked - if (nastr==NAstrings && nastr+1==NULL) NAstrings=NULL; - nastr++; - continue; + } else { + const char *ch = *nastr; + size_t nchar = strlen(ch); + if (isspace(ch[0]) || isspace(ch[nchar-1])) + STOP(_("freadMain: NAstring <<%s>> has whitespace at the beginning or end"), ch); + if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || + strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || + strcmp(ch,"True")==0 || strcmp(ch,"False")==0) + STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); + if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) + STOP(_("freadMain: 
NAstring <<%s>> and logical01=TRUE, this is not permitted."), ch); + char *end; + errno = 0; + (void)strtod(ch, &end); // careful not to let "" get to here as strtod considers "" numeric + if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; } - const char *ch = *nastr; - size_t nchar = strlen(ch); - if (isspace(ch[0]) || isspace(ch[nchar-1])) - STOP(_("freadMain: NAstring <<%s>> has whitespace at the beginning or end"), ch); - if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || - strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || - strcmp(ch,"True")==0 || strcmp(ch,"False")==0 || - strcmp(ch,"1")==0 || strcmp(ch,"0")==0) - STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); - char *end; - errno = 0; - (void)strtod(ch, &end); // careful not to let "" get to here (see continue above) as strtod considers "" numeric - if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; nastr++; } disabled_parsers[CT_BOOL8_N] = !args.logical01; @@ -1315,6 +1322,10 @@ int freadMain(freadMainArgs _args) { DTPRINT(_(" show progress = %d\n"), args.showProgress); DTPRINT(_(" 0/1 column will be read as %s\n"), args.logical01? 
"boolean" : "integer"); } + if (*NAstrings==NULL || // user sets na.strings=NULL + (**NAstrings=='\0' && *(NAstrings+1)==NULL)) { // user sets na.strings="" + NAstrings=NULL; // clear NAstrings to save end_NA_string() dealing with these cases (blank_is_a_NAstring was set to true above) + } stripWhite = args.stripWhite; skipEmptyLines = args.skipEmptyLines; @@ -1577,7 +1588,7 @@ int freadMain(freadMainArgs _args) { int ncol; // Detected number of columns in the file const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names) - int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well. + int jumpLines = nrowLimit==0 ? 100 : (int)umin(100, nrowLimit); // how many lines from each jump point to use. If nrows>0 is supplied, nJumps is later set to 1. 
#4029 { if (verbose) DTPRINT(_("[06] Detect separator, quoting rule, and ncolumns\n")); @@ -1810,7 +1821,7 @@ int freadMain(freadMainArgs _args) { (uint64_t)sz, (uint64_t)jump0size, (uint64_t)(sz/(2*jump0size))); } nJumps++; // the extra sample at the very end (up to eof) is sampled and format checked but not jumped to when reading - if (nrowLimit0) nJumps=1; // when nrows>0 supplied by user, no jumps (not even at the end) and single threaded sampleLines = 0; double sumLen=0.0, sumLenSq=0.0; @@ -1881,8 +1892,7 @@ int freadMain(freadMainArgs _args) { bool bumped=false; detect_types(&ch, tmpType, ncol, &bumped); if (sampleLines>0) for (int j=0; jCT_EMPTY) { args.header=true; if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"), j+1, typeName[type[j]], sampleLines); @@ -2526,9 +2536,8 @@ int freadMain(freadMainArgs _args) { rowSize1 = rowSize4 = rowSize8 = 0; nStringCols = 0; nNonStringCols = 0; - for (int j=0, resj=-1; j -#include "data.table.h" SEXP coerceToRealListR(SEXP obj) { // accept atomic/list of integer/logical/real returns list of real diff --git a/src/fwrite.c b/src/fwrite.c index 2d10d222f..c5f977212 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -60,17 +60,17 @@ inline void write_chars(const char *x, char **pch) *pch = ch; } -void writeBool8(int8_t *col, int64_t row, char **pch) +void writeBool8(const void *col, int64_t row, char **pch) { - int8_t x = col[row]; + int8_t x = ((const int8_t *)col)[row]; char *ch = *pch; *ch++ = '0'+(x==1); *pch = ch-(x==INT8_MIN); // if NA then step back, to save a branch } -void writeBool32(int32_t *col, int64_t row, char **pch) +void writeBool32(const void *col, int64_t row, char **pch) { - int32_t x = col[row]; + int32_t x = ((const int32_t *)col)[row]; char *ch = *pch; if (x==INT32_MIN) { // TODO: when na=='\0' as recommended, use a branchless writer write_chars(na, &ch); @@ -80,9 +80,9 @@ void 
writeBool32(int32_t *col, int64_t row, char **pch) *pch = ch; } -void writeBool32AsString(int32_t *col, int64_t row, char **pch) +void writeBool32AsString(const void *col, int64_t row, char **pch) { - int32_t x = col[row]; + int32_t x = ((const int32_t *)col)[row]; char *ch = *pch; if (x == INT32_MIN) { write_chars(na, &ch); @@ -106,10 +106,10 @@ static inline void reverse(char *upp, char *low) } } -void writeInt32(int32_t *col, int64_t row, char **pch) +void writeInt32(const void *col, int64_t row, char **pch) { char *ch = *pch; - int32_t x = col[row]; + int32_t x = ((const int32_t *)col)[row]; if (x == INT32_MIN) { write_chars(na, &ch); } else { @@ -122,10 +122,10 @@ void writeInt32(int32_t *col, int64_t row, char **pch) *pch = ch; } -void writeInt64(int64_t *col, int64_t row, char **pch) +void writeInt64(const void *col, int64_t row, char **pch) { char *ch = *pch; - int64_t x = col[row]; + int64_t x = ((const int64_t *)col)[row]; if (x == INT64_MIN) { write_chars(na, &ch); } else { @@ -177,7 +177,7 @@ void genLookups() { } */ -void writeFloat64(double *col, int64_t row, char **pch) +void writeFloat64(const void *col, int64_t row, char **pch) { // hand-rolled / specialized for speed // *pch is safely the output destination with enough space (ensured via calculating maxLineLen up front) @@ -187,7 +187,7 @@ void writeFloat64(double *col, int64_t row, char **pch) // ii) no C libary calls such as sprintf() where the fmt string has to be interpretted over and over // iii) no need to return variables or flags. Just writes. // iv) shorter, easier to read and reason with in one self contained place. 
- double x = col[row]; + double x = ((const double *)col)[row]; char *ch = *pch; if (!isfinite(x)) { if (isnan(x)) { @@ -301,9 +301,9 @@ void writeFloat64(double *col, int64_t row, char **pch) *pch = ch; } -void writeComplex(Rcomplex *col, int64_t row, char **pch) +void writeComplex(const void *col, int64_t row, char **pch) { - Rcomplex x = col[row]; + Rcomplex x = ((const Rcomplex *)col)[row]; char *ch = *pch; writeFloat64(&x.r, 0, &ch); if (!ISNAN(x.i)) { @@ -340,8 +340,8 @@ static inline void write_time(int32_t x, char **pch) *pch = ch; } -void writeITime(int32_t *col, int64_t row, char **pch) { - write_time(col[row], pch); +void writeITime(const void *col, int64_t row, char **pch) { + write_time(((const int32_t *)col)[row], pch); } static inline void write_date(int32_t x, char **pch) @@ -394,15 +394,16 @@ static inline void write_date(int32_t x, char **pch) *pch = ch; } -void writeDateInt32(int32_t *col, int64_t row, char **pch) { - write_date(col[row], pch); +void writeDateInt32(const void *col, int64_t row, char **pch) { + write_date(((const int32_t *)col)[row], pch); } -void writeDateFloat64(double *col, int64_t row, char **pch) { - write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, pch); +void writeDateFloat64(const void *col, int64_t row, char **pch) { + double x = ((const double *)col)[row]; + write_date(isfinite(x) ? (int)(x) : INT32_MIN, pch); } -void writePOSIXct(double *col, int64_t row, char **pch) +void writePOSIXct(const void *col, int64_t row, char **pch) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -411,25 +412,27 @@ void writePOSIXct(double *col, int64_t row, char **pch) // All positive integers up to 2^53 (9e15) are exactly representable by double which is relied // on in the ops here; number of seconds since epoch. 
- double x = col[row]; + double x = ((const double *)col)[row]; char *ch = *pch; if (!isfinite(x)) { write_chars(na, &ch); } else { int64_t xi, d, t; - if (x>=0) { - xi = floor(x); + xi = floor(x); + int m = ((x-xi)*10000000); // 7th digit used to round up if 9 + m += (m%10); // 9 is numerical accuracy, 8 or less then we truncate to last microsecond + m /= 10; + int carry = m / 1000000; // Need to know if we rounded up to a whole second + m -= carry * 1000000; + xi += carry; + if (xi>=0) { d = xi / 86400; t = xi % 86400; } else { // before 1970-01-01T00:00:00Z - xi = floor(x); d = (xi+1)/86400 - 1; t = xi - d*86400; // xi and d are both negative here; t becomes the positive number of seconds into the day } - int m = ((x-xi)*10000000); // 7th digit used to round up if 9 - m += (m%10); // 9 is numerical accuracy, 8 or less then we truncate to last microsecond - m /= 10; write_date(d, &ch); *ch++ = 'T'; ch -= squashDateTime; @@ -462,9 +465,9 @@ void writePOSIXct(double *col, int64_t row, char **pch) *pch = ch; } -void writeNanotime(int64_t *col, int64_t row, char **pch) +void writeNanotime(const void *col, int64_t row, char **pch) { - int64_t x = col[row]; + int64_t x = ((const int64_t *)col)[row]; char *ch = *pch; if (x == INT64_MIN) { write_chars(na, &ch); @@ -547,12 +550,12 @@ static inline void write_string(const char *x, char **pch) void writeString(const void *col, int64_t row, char **pch) { - write_string(getString(col, row), pch); + write_string(getString((const SEXP *)col, row), pch); } void writeCategString(const void *col, int64_t row, char **pch) { - write_string(getCategString(col, row), pch); + write_string(getCategString((const SEXP *)col, row), pch); } #ifndef NOZLIB diff --git a/src/fwrite.h b/src/fwrite.h index 6886c7791..0fef0c7f6 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -3,29 +3,30 @@ #else #define STRICT_R_HEADERS #include + #include // for SEXP in writeList() prototype #include "po.h" #define STOP error #define DTPRINT Rprintf #endif 
-typedef void (*writer_fun_t)(const void *, int64_t, char **); +typedef void writer_fun_t(const void *, int64_t, char **); // in the order of writer_fun_t in fwriteR.c -void writeBool8(); -void writeBool32(); -void writeBool32AsString(); -void writeInt32(); -void writeInt64(); -void writeFloat64(); -void writeComplex(); -void writeITime(); -void writeDateInt32(); -void writeDateFloat64(); -void writePOSIXct(); -void writeNanotime(); -void writeString(); -void writeCategString(); -void writeList(); +writer_fun_t writeBool8; +writer_fun_t writeBool32; +writer_fun_t writeBool32AsString; +writer_fun_t writeInt32; +writer_fun_t writeInt64; +writer_fun_t writeFloat64; +writer_fun_t writeComplex; +writer_fun_t writeITime; +writer_fun_t writeDateInt32; +writer_fun_t writeDateFloat64; +writer_fun_t writePOSIXct; +writer_fun_t writeNanotime; +writer_fun_t writeString; +writer_fun_t writeCategString; +writer_fun_t writeList; void write_chars(const char *source, char **dest); @@ -75,7 +76,7 @@ typedef struct fwriteMainArgs int64_t nrow; // a vector of pointers to all-same-length column vectors const void **columns; - writer_fun_t *funs; // a vector of writer_fun_t function pointers + writer_fun_t **funs; // a vector of writer_fun_t function pointers // length ncol vector containing which fun[] to use for each column // one byte to use 8 times less cache lines than a vector of function pointers would do diff --git a/src/fwriteR.c b/src/fwriteR.c index a36e44315..f64768d70 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -19,7 +19,7 @@ static const char *sep2start, *sep2end; // if there are no list columns, set sep2=='\0' // Non-agnostic helpers ... -const char *getString(SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c +const char *getString(const SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c SEXP x = col[row]; return x==NA_STRING ? 
NULL : ENCODED_CHAR(x); } @@ -53,7 +53,7 @@ const char *getCategString(SEXP col, int64_t row) { return x==NA_INTEGER ? NULL : ENCODED_CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } -writer_fun_t funs[] = { +writer_fun_t *funs[] = { &writeBool8, &writeBool32, &writeBool32AsString, @@ -73,8 +73,8 @@ writer_fun_t funs[] = { static int32_t whichWriter(SEXP); -void writeList(SEXP *col, int64_t row, char **pch) { - SEXP v = col[row]; +void writeList(const void *col, int64_t row, char **pch) { + SEXP v = ((const SEXP *)col)[row]; int32_t wf = whichWriter(v); if (TYPEOF(v)==VECSXP || wf==INT32_MIN || isFactor(v)) { error(_("Internal error: getMaxListItemLen should have caught this up front.")); // # nocov @@ -82,7 +82,7 @@ void writeList(SEXP *col, int64_t row, char **pch) { char *ch = *pch; write_chars(sep2start, &ch); const void *data = DATAPTR_RO(v); - writer_fun_t fun = funs[wf]; + writer_fun_t *fun = funs[wf]; for (int j=0; j16) shift=nb/2; // TODO: when we have stress-test off mode, do this - mask = (1<>shift) + 1; + bitshift = nb/2; // /2 so that high and low can be uint16_t, and no limit (even for nb=4) to stress-test. + // bitshift=MAX(nb-8,0); if (bitshift>16) bitshift=nb/2; // TODO: when we have stress-test off mode, do this + mask = (1<>bitshift) + 1; grp = (int *)R_alloc(nrow, sizeof(int)); // TODO: use malloc and made this local as not needed globally when all functions here use gather // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc @@ -86,8 +86,8 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { // TODO: enable stress-test mode in tests only (#3205) which can be turned off by default in release to decrease overhead on small data // if that is established to be biting (it may be fine). 
if (nBatch<1 || batchSize<1 || lastBatchSize<1) { - error(_("Internal error: nrow=%d ngrp=%d nbit=%d shift=%d highSize=%d nBatch=%d batchSize=%d lastBatchSize=%d\n"), // # nocov - nrow, ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize); // # nocov + error(_("Internal error: nrow=%d ngrp=%d nbit=%d bitshift=%d highSize=%d nBatch=%d batchSize=%d lastBatchSize=%d\n"), // # nocov + nrow, ngrp, nb, bitshift, highSize, nBatch, batchSize, lastBatchSize); // # nocov } // initial population of g: #pragma omp parallel for num_threads(getDTthreads(ngrp, false)) @@ -108,9 +108,9 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *restrict op = INTEGER(o); // o is a permutation of 1:nrow int nb = nbit(nrow-1); - int shift = MAX(nb-8, 0); // TODO: experiment nb/2. Here it doesn't have to be /2 currently. - int highSize = ((nrow-1)>>shift) + 1; - //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d shift=%d nBatch=%d\n"), highSize, nb, shift, nBatch); + int bitshift = MAX(nb-8, 0); // TODO: experiment nb/2. Here it doesn't have to be /2 currently. 
+ int highSize = ((nrow-1)>>bitshift) + 1; + //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d bitshift=%d nBatch=%d\n"), highSize, nb, bitshift, nBatch); int *counts = calloc(nBatch*highSize, sizeof(int)); // TODO: cache-line align and make highSize a multiple of 64 int *TMP = malloc(nrow*2l*sizeof(int)); // must multiple the long int otherwise overflow may happen, #4295 if (!counts || !TMP ) error(_("Internal error: Failed to allocate counts or TMP when assigning g in gforce")); @@ -120,7 +120,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *my_o = op + b*batchSize; int *restrict my_counts = counts + b*highSize; for (int i=0; i> shift; + const int w = (my_o[i]-1) >> bitshift; my_counts[w]++; } for (int i=0, cum=0; i> shift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too + const int w = (my_o[i]-1) >> bitshift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too int *p = my_tmp + 2*my_counts[w]++; *p++ = my_o[i]-1; *p = my_g[i]; @@ -172,7 +172,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *my_pg = gp + b*batchSize; const int howMany = b==nBatch-1 ? 
lastBatchSize : batchSize; for (int i=0; i> shift; + const int w = my_pg[i] >> bitshift; my_counts[w]++; my_high[i] = (uint16_t)w; // reduce 4 bytes to 2 } @@ -185,7 +185,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize; memcpy(my_tmpcounts, my_counts, highSize*sizeof(int)); for (int i=0; i> shift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too + const int w = my_pg[i] >> bitshift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too my_low[my_tmpcounts[w]++] = (uint16_t)(my_pg[i] & mask); } // counts is now cumulated within batch (with ending values) and we leave it that way @@ -348,8 +348,7 @@ SEXP gsum(SEXP x, SEXP narmArg) double started = wallclock(); const bool verbose=GetVerbose(); if (verbose) Rprintf(_("This gsum (narm=%s) took ... "), narm?"TRUE":"FALSE"); - if (nrow != n) - error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); bool anyNA=false; SEXP ans; switch(TYPEOF(x)) { @@ -363,7 +362,7 @@ SEXP gsum(SEXP x, SEXP narmArg) if (!anyNA) { #pragma omp parallel for num_threads(getDTthreads(highSize, false)) //schedule(dynamic,1) for (int h=0; h DBL_MAX) ansd[i] = R_PosInf; - else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; - else ansd[i] = (double)s[i]; + SEXP ans = PROTECT(allocVector(REALSXP, ngrp)); + if (INHERITS(x, char_integer64)) { + int64_t *ansd = (int64_t *)REAL(ans); + for (int i=0; iINT64_MAX || s[i]<=INT64_MIN) ? 
NA_INTEGER64 : (int64_t)s[i]; + } + } else { + double *ansd = REAL(ans); + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } } free(s); copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); + return ans; +} + +SEXP gshift(SEXP x, SEXP nArg, SEXP fillArg, SEXP typeArg) { + const bool nosubset = irowslen == -1; + const bool issorted = !isunsorted; + const int n = nosubset ? length(x) : irowslen; + if (nrow != n) error(_("Internal error: nrow [%d] != length(x) [%d] in %s"), nrow, n, "gshift"); + + int nprotect=0; + enum {LAG, LEAD/*, SHIFT*/,CYCLIC} stype = LAG; + if (!(length(fillArg) == 1)) + error(_("fill must be a vector of length 1")); + + if (!isString(typeArg) || length(typeArg) != 1) + error(_("Internal error: invalid type for gshift(), should have been caught before. please report to data.table issue tracker")); // # nocov + if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "lag")) stype = LAG; + else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "lead")) stype = LEAD; + else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "shift")) stype = LAG; + else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "cyclic")) stype = CYCLIC; + else error(_("Internal error: invalid type for gshift(), should have been caught before. 
please report to data.table issue tracker")); // # nocov + + bool lag; + const bool cycle = stype == CYCLIC; + + R_xlen_t nx = xlength(x), nk = length(nArg); + if (!isInteger(nArg)) error(_("Internal error: n must be integer")); // # nocov + const int *kd = INTEGER(nArg); + for (int i=0; i grpn -> jend = jstart */ \ + if (lag) { \ + const int o = ff[i]-1+(grpn-thisn); \ + for (int j=0; j 305) + ++year; + + if (type == YEAR) { + *(int *)out = year; + return; + } + + int leap = !years1 && (years4 || !years100); + + if (type == YDAY || type == WEEK) { + int yday = days + 31 + 28 + leap; + if (yday >= YEARS1 + leap) + yday -= YEARS1 + leap; + *(int *)out = ++yday; + if (type == WEEK) + *(int *)out = (*(int *)out / 7) + 1; + return; + } + + if (type == MONTH || type == YEARMON) { + int i; + if (days==0 && !leap && isLeapYear(year)) { + i = 1; + } else { + i = 2; + while (months[i-2] <= days) { + days -= months[i-2]; + i++; + } + } + if (i >= 12) + i -= 12; + + if (type == MONTH) { + *(int *)out = i + 1; + } else { + *(double *)out = year + i / 12.0; + } + return; + } + + if (type == MDAY) { + if (days==0 && !leap && isLeapYear(year)) { + *(int *)out = 29; + return; + } + int i = 0; + while (months[i] <= days) { + days -= months[i]; + i++; + } + *(int *)out = ++days; + return; + } + + if (type == QUARTER || type == YEARQTR) { + int i = 0; + while (quarter[i] <= days) { + days -= quarter[i]; + i++; + } + if (i >= 4) + i -= 4; + if (type == QUARTER) { + *(int *)out = i + 1; + } else { + *(double *)out = year + (i / 4.0); + } + return; + } +} + +SEXP convertDate(SEXP x, SEXP type) +{ + if (!isInteger(x)) error(_("x must be an integer vector")); + const int *ix = INTEGER(x); + const int n = length(x); + if (!isString(type) || length(type) != 1) + error(_("Internal error: invalid type for convertDate(), should have been caught before. 
please report to data.table issue tracker")); // # nocov + datetype ctype; + bool ansint = true; + if (!strcmp(CHAR(STRING_ELT(type, 0)), "yday")) ctype = YDAY; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "wday")) ctype = WDAY; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "mday")) ctype = MDAY; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "week")) ctype = WEEK; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "month")) ctype = MONTH; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "quarter")) ctype = QUARTER; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "year")) ctype = YEAR; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "yearmon")) { ctype = YEARMON; ansint = false; } + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "yearqtr")) { ctype = YEARQTR; ansint = false; } + else error(_("Internal error: invalid type for convertDate, should have been caught before. please report to data.table issue tracker")); // # nocov + + SEXP ans; + if (ansint) { + ans = PROTECT(allocVector(INTSXP, n)); + int *ansp = INTEGER(ans); + for (int i=0; i < n; ++i) { + convertSingleDate(ix[i], ctype, &ansp[i]); + } + } else { + ans = PROTECT(allocVector(REALSXP, n)); + double *ansp = REAL(ans); + for (int i=0; i < n; ++i) { + convertSingleDate(ix[i], ctype, &ansp[i]); + } + } + UNPROTECT(1); + return ans; +} diff --git a/src/init.c b/src/init.c index 56bf66d41..2cffabd34 100644 --- a/src/init.c +++ b/src/init.c @@ -36,102 +36,13 @@ SEXP sym_datatable_locked; SEXP sym_tzone; SEXP sym_old_fread_datetime_character; SEXP sym_variable_table; +SEXP sym_as_character; double NA_INT64_D; long long NA_INT64_LL; Rcomplex NA_CPLX; size_t __sizes[100]; size_t __typeorder[100]; -// .Calls -SEXP setattrib(); -SEXP bmerge(); -SEXP assign(); -SEXP dogroups(); -SEXP copy(); -SEXP shallowwrapper(); -SEXP alloccolwrapper(); -SEXP selfrefokwrapper(); -SEXP truelength(); -SEXP setcharvec(); -SEXP setcolorder(); -SEXP chmatch_R(); -SEXP chmatchdup_R(); -SEXP chin_R(); -SEXP fifelseR(); -SEXP fcaseR(); -SEXP freadR(); 
-SEXP fwriteR(); -SEXP reorder(); -SEXP rbindlist(); -SEXP vecseq(); -SEXP setlistelt(); -SEXP address(); -SEXP expandAltRep(); -SEXP fmelt(); -SEXP fcast(); -SEXP uniqlist(); -SEXP uniqlengths(); -SEXP forder(); -SEXP issorted(); -SEXP gforce(); -SEXP gsum(); -SEXP gmean(); -SEXP gmin(); -SEXP gmax(); -SEXP isOrderedSubset(); -SEXP setNumericRounding(); -SEXP getNumericRounding(); -SEXP binary(); -SEXP subsetDT(); -SEXP subsetVector(); -SEXP convertNegAndZeroIdx(); -SEXP frank(); -SEXP dt_na(); -SEXP lookup(); -SEXP overlaps(); -SEXP whichwrapper(); -SEXP shift(); -SEXP transpose(); -SEXP anyNA(); -SEXP isReallyReal(); -SEXP setlevels(); -SEXP rleid(); -SEXP gmedian(); -SEXP gtail(); -SEXP ghead(); -SEXP glast(); -SEXP gfirst(); -SEXP gnthvalue(); -SEXP dim(); -SEXP gvar(); -SEXP gsd(); -SEXP gprod(); -SEXP nestedid(); -SEXP setDTthreads(); -SEXP getDTthreads_R(); -SEXP nqRecreateIndices(); -SEXP fsort(); -SEXP inrange(); -SEXP between(); -SEXP hasOpenMP(); -SEXP uniqueNlogical(); -SEXP frollfunR(); -SEXP dllVersion(); -SEXP nafillR(); -SEXP colnamesInt(); -SEXP initLastUpdated(); -SEXP cj(); -SEXP lock(); -SEXP unlock(); -SEXP islockedR(); -SEXP allNAR(); -SEXP test_dt_win_snprintf(); -SEXP dt_zlib_version(); -SEXP startsWithAny(); - -// .Externals -SEXP fastmean(); - static const R_CallMethodDef callMethods[] = { {"Csetattrib", (DL_FUNC) &setattrib, -1}, @@ -196,6 +107,7 @@ R_CallMethodDef callMethods[] = { {"Cgvar", (DL_FUNC) &gvar, -1}, {"Cgsd", (DL_FUNC) &gsd, -1}, {"Cgprod", (DL_FUNC) &gprod, -1}, +{"Cgshift", (DL_FUNC) &gshift, -1}, {"Cnestedid", (DL_FUNC) &nestedid, -1}, {"CsetDTthreads", (DL_FUNC) &setDTthreads, -1}, {"CgetDTthreads", (DL_FUNC) &getDTthreads_R, -1}, @@ -204,6 +116,7 @@ R_CallMethodDef callMethods[] = { {"Cinrange", (DL_FUNC) &inrange, -1}, {"Cbetween", (DL_FUNC) &between, -1}, {"ChasOpenMP", (DL_FUNC) &hasOpenMP, -1}, +{"CbeforeR340", (DL_FUNC) &beforeR340, -1}, {"CuniqueNlogical", (DL_FUNC) &uniqueNlogical, -1}, {"CfrollfunR", (DL_FUNC) 
&frollfunR, -1}, {"CdllVersion", (DL_FUNC) &dllVersion, -1}, @@ -225,6 +138,8 @@ R_CallMethodDef callMethods[] = { {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, +{"CconvertDate", (DL_FUNC)&convertDate, -1}, +{"Cnotchin", (DL_FUNC)¬chin, -1}, {NULL, NULL, 0} }; @@ -234,7 +149,7 @@ R_ExternalMethodDef externalMethods[] = { {NULL, NULL, 0} }; -static void setSizes() { +static void setSizes(void) { for (int i=0; i<100; ++i) { __sizes[i]=0; __typeorder[i]=0; } // only these types are currently allowed as column types : __sizes[LGLSXP] = sizeof(int); __typeorder[LGLSXP] = 0; @@ -366,6 +281,7 @@ void attribute_visible R_init_data_table(DllInfo *info) sym_tzone = install("tzone"); sym_old_fread_datetime_character = install("datatable.old.fread.datetime.character"); sym_variable_table = install("variable_table"); + sym_as_character = install("as.character"); initDTthreads(); avoid_openmp_hang_within_fork(); @@ -393,7 +309,7 @@ inline double LLtoD(long long x) { return u.d; } -int GetVerbose() { +int GetVerbose(void) { // don't call repetitively; save first in that case SEXP opt = GetOption(sym_verbose, R_NilValue); if ((!isLogical(opt) && !isInteger(opt)) || LENGTH(opt)!=1 || INTEGER(opt)[0]==NA_INTEGER) @@ -402,7 +318,7 @@ int GetVerbose() { } // # nocov start -SEXP hasOpenMP() { +SEXP hasOpenMP(void) { // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is // why RPRINTF is capitalized in this comment to avoid that grep. 
@@ -415,6 +331,16 @@ SEXP hasOpenMP() { } // # nocov end +SEXP beforeR340(void) { + // used in onAttach.R for message about fread memory leak fix needing R 3.4.0 + // at C level to catch if user upgrades R but does not reinstall data.table + #if defined(R_VERSION) && R_VERSION= xn) { + if (j>=xn || ixo[j]<=0) { // NA_integer_ = INT_MIN is checked in init.c // j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) inewstarts[i] = inomatch; diff --git a/src/openmp-utils.c b/src/openmp-utils.c index c9003ee07..483a91654 100644 --- a/src/openmp-utils.c +++ b/src/openmp-utils.c @@ -29,7 +29,7 @@ static int getIntEnv(const char *name, int def) static inline int imin(int a, int b) { return a < b ? a : b; } static inline int imax(int a, int b) { return a > b ? a : b; } -void initDTthreads() { +void initDTthreads(void) { // called at package startup from init.c // also called by setDTthreads(threads=NULL) (default) to reread environment variables; see setDTthreads below // No verbosity here in this setter. Verbosity is in getDTthreads(verbose=TRUE) @@ -169,16 +169,16 @@ SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent, SEXP thro static int pre_fork_DTthreads = 0; -void when_fork() { +void when_fork(void) { pre_fork_DTthreads = DTthreads; DTthreads = 1; } -void after_fork() { +void after_fork(void) { if (RestoreAfterFork) DTthreads = pre_fork_DTthreads; } -void avoid_openmp_hang_within_fork() { +void avoid_openmp_hang_within_fork(void) { // Called once on loading data.table from init.c #ifdef _OPENMP pthread_atfork(&when_fork, &after_fork, NULL); diff --git a/src/rbindlist.c b/src/rbindlist.c index 5d0b6547e..366902883 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -12,8 +12,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. 
This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; const bool fill = LOGICAL(fillArg)[0]; - if (fill && usenames!=TRUE) { - if (usenames==FALSE) warning(_("use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.")); // else no warning if usenames==NA (default) + if (fill && usenames==NA_LOGICAL) { usenames=TRUE; } const bool idcol = !isNull(idcolArg); diff --git a/src/reorder.c b/src/reorder.c index c2deea8ae..debdb0217 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -13,7 +13,7 @@ SEXP reorder(SEXP x, SEXP order) ncol = length(x); for (int i=0; i= 0) || (stype == LEAD && kd[j] < 0)) { - // LAG when type = 'lag' and n >= 0 _or_ type = 'lead' and n < 0 - if (tailk > 0) memmove(itmp+thisk, INTEGER(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + // LAG when type %in% c('lag','cyclic') and n >= 0 _or_ type = 'lead' and n < 0 + if (tailk > 0) memmove(itmp+thisk, ielem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(itmp, ielem+tailk, thisk*size); + } else for (int m=0; m=0 _or_ type = 'lag', n<0 - if (tailk > 0) memmove(itmp, INTEGER(elem)+thisk, tailk*size); - for (int m=xrows-thisk; m=0 _or_ type %in% c('lag','cyclic'), n<0 + if (tailk > 0) memmove(itmp, ielem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(itmp+tailk, ielem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) memmove(dtmp+thisk, REAL(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(dtmp+thisk, delem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(dtmp, delem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(dtmp, REAL(elem)+thisk, tailk*size); - for (int m=tailk; m 0) memmove(dtmp, delem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(dtmp+tailk, delem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) 
memmove(ctmp+thisk, COMPLEX(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(ctmp+thisk, celem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ctmp, celem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(ctmp, COMPLEX(elem)+thisk, tailk*size); - for (int m=tailk; m 0) memmove(ctmp, celem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ctmp+tailk, celem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) memmove(ltmp+thisk, LOGICAL(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(ltmp+thisk, lelem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ltmp, lelem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(ltmp, LOGICAL(elem)+thisk, tailk*size); - for (int m=tailk; m 0) memmove(ltmp, lelem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ltmp+tailk, lelem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + for (int m=0; m -#include "data.table.h" /* * find end of a string, used to append verbose messages or warnings diff --git a/src/utils.c b/src/utils.c index e499aced0..fa10fd97c 100644 --- a/src/utils.c +++ b/src/utils.c @@ -345,8 +345,6 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { // copyArg does not update in place, but only IF an object is of the same type-class as class to be coerced, it will return with no copy if (!isVectorAtomic(x)) error(_("'x' is not atomic")); - if (!isVectorAtomic(as)) - error(_("'as' is not atomic")); if (!isNull(getAttrib(x, R_DimSymbol))) error(_("'x' must not be matrix or array")); if (!isNull(getAttrib(as, R_DimSymbol))) @@ -372,7 +370,7 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { #ifndef NOZLIB #include #endif -SEXP 
dt_zlib_version() { +SEXP dt_zlib_version(void) { char out[71]; #ifndef NOZLIB snprintf(out, 70, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); diff --git a/tests/autoprint.Rout.save b/tests/autoprint.Rout.save index 60ae5056f..a2879ff15 100644 --- a/tests/autoprint.Rout.save +++ b/tests/autoprint.Rout.save @@ -1,6 +1,6 @@ -R version 3.1.1 (2014-07-10) -- "Sock it to Me" -Copyright (C) 2014 The R Foundation for Statistical Computing +R version 4.1.1 (2021-08-10) -- "Kick Things" +Copyright (C) 2021 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. @@ -21,104 +21,122 @@ Loading required package: data.table > # Since this tests autoprinting at the console, it needs to use the .Rout.save mechanism in R CMD check > DT = data.table(a=1:2) # Should print at console? > DT # yes - a -1: 1 -2: 2 + a + +1: 1 +2: 2 > DT[1] # yes - a -1: 1 + a + +1: 1 > DT[2,a:=3L] # no > DT # yes - a -1: 1 -2: 3 + a + +1: 1 +2: 3 > DT[FALSE,a:=3L] # no > DT[a==4L,a:=5L] # no > DT[a %in% 4:8, a:=5L] # no > DT # yes - a -1: 1 -2: 3 +Index: + a + +1: 1 +2: 3 > print(DT[2,a:=4L]) # no > print(DT) # yes - a -1: 1 -2: 4 + a + +1: 1 +2: 4 > if (TRUE) DT[2,a:=5L] # no. used to print before v1.9.5 > if (TRUE) if (TRUE) DT[2,a:=6L] # no. used to print before v1.9.5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > DT # no (from v1.9.5+). := suppresses next auto print (can't distinguish just "DT" symbol alone at the prompt) > DT # yes. 2nd time needed, or solutions below - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > DT[] # yes. guaranteed print - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > print(DT) # no. only DT[] is guaranteed print from v1.9.6 and R 3.2.0 > (function(){DT[2,a:=5L][];NULL})() # print NULL NULL > DT # yes. 
i) function needs to add [] after last one, so that "DT" alone is guaranteed anyway - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > (function(){DT[2,a:=5L];DT[];NULL})() # print NULL NULL > DT # yes. ii) or as a separate DT[] after the last := inside the function - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > DT2 = data.table(b=3:4) # no > (function(){DT[2,a:=6L];DT2[1,b:=7L];NULL})() NULL > DT # yes. last := was on DT2 not DT - a -1: 1 -2: 6 + a + +1: 1 +2: 6 > {DT[2,a:=6L];invisible()} # no > print(DT) # no > (function(){print(DT[2,a:=7L]);print(DT);invisible()})() # yes*2 - a -1: 1 -2: 7 - a -1: 1 -2: 7 + a + +1: 1 +2: 7 + a + +1: 1 +2: 7 > {print(DT[2,a:=8L]);print(DT);invisible()} # yes*1 Not within function so as at prompt - a -1: 1 -2: 8 + a + +1: 1 +2: 8 > DT[1][,a:=9L] # no (was too tricky to detect that DT[1] is a new object). Simple rule is that := always doesn't print > DT[2,a:=10L][1] # yes - a -1: 1 + a + +1: 1 > DT[1,a:=10L][1,a:=10L] # no > DT[,a:=as.integer(a)] # no > DT[1,a:=as.integer(a)] # no > DT[1,a:=10L][] # yes. ...[] == oops, forgot print(...) - a -1: 10 -2: 10 + a + +1: 10 +2: 10 > > # Test that error in := doesn't suppress next valid print, bug #2376 > tryCatch(DT[,foo:=ColumnNameTypo], error=function(e) e$message) # error: not found. [1] "object 'ColumnNameTypo' not found" > DT # yes - a -1: 10 -2: 10 + a + +1: 10 +2: 10 > DT # yes - a -1: 10 -2: 10 + a + +1: 10 +2: 10 > > > proc.time() user system elapsed - 3.14 0.10 3.22 + 0.723 0.637 0.217 diff --git a/tests/knitr.Rout.save b/tests/knitr.Rout.save index f97eeb4a4..3d4b0cf72 100644 --- a/tests/knitr.Rout.save +++ b/tests/knitr.Rout.save @@ -1,6 +1,6 @@ -R version 3.1.1 (2014-07-10) -- "Sock it to Me" -Copyright (C) 2014 The R Foundation for Statistical Computing +R version 4.1.1 (2021-08-10) -- "Kick Things" +Copyright (C) 2021 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. 
@@ -33,10 +33,11 @@ DT # yes ``` ``` -## x y -## 1: 1 4 -## 2: 2 5 -## 3: 3 6 +## x y +## +## 1: 1 4 +## 2: 2 5 +## 3: 3 6 ``` ```r @@ -45,10 +46,11 @@ print(DT[, z := 10:12]) # yes ``` ``` -## x y z -## 1: 1 4 10 -## 2: 2 5 11 -## 3: 3 6 12 +## x y z +## +## 1: 1 4 10 +## 2: 2 5 11 +## 3: 3 6 12 ``` ```r @@ -57,10 +59,11 @@ DT # yes ``` ``` -## x y z a -## 1: 1 4 10 1 -## 2: 2 5 11 1 -## 3: 3 6 12 1 +## x y z a +## +## 1: 1 4 10 1 +## 2: 2 5 11 1 +## 3: 3 6 12 1 ``` Some text. @@ -68,4 +71,4 @@ Some text. > > proc.time() user system elapsed - 3.116 0.128 3.257 + 0.742 0.666 0.261 diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index f66f9611f..4b0645e6b 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -6,7 +6,7 @@ output: toc: true number_sections: true vignette: > - %\VignetteIndexEntry{Frequently asked questions} + %\VignetteIndexEntry{Frequently Asked Questions about data.table} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -98,7 +98,7 @@ We _have_ proposed enhancements to R wherever possible, too. One of these was ac > `unique()` and `match()` are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c. -A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://r.789695.n4.nabble.com/suggestion-how-to-use-memcpy-in-duplicate-c-td2019184.html). +A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://stat.ethz.ch/pipermail/r-devel/2010-April/057249.html). 
A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 : diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index c5da5d87d..3a5eda34c 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -38,7 +38,7 @@ Briefly, if you are interested in reducing *programming* and *compute* time trem ## Data {#data} -In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transporation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. +In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transportation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/tidyverse/nycflights13)). The data is available only for Jan-Oct'14. We can use `data.table`'s fast-and-friendly file reader `fread` to load `flights` directly as follows: diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index 917a90413..465052d94 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -138,7 +138,7 @@ head(flights) * Alternatively you can pass a character vector of column names to the function `setkeyv()`. This is particularly useful while designing functions to pass columns to set key on as function arguments. 
-* Note that we did not have to assign the result back to a variable. This is because like the `:=` function we saw in the *"Introduction to data.table"* vignette, `setkey()` and `setkeyv()` modify the input *data.table* *by reference*. They return the result invisibly. +* Note that we did not have to assign the result back to a variable. This is because like the `:=` function we saw in the *"Reference semantics"* vignette, `setkey()` and `setkeyv()` modify the input *data.table* *by reference*. They return the result invisibly. * The *data.table* is now reordered (or sorted) by the column we provided - `origin`. Since we reorder by reference, we only require additional memory of one column of length equal to the number of rows in the *data.table*, and is therefore very memory efficient. diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 46008e704..bf481f06f 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -102,7 +102,7 @@ my_subset = function(data, col, val) { my_subset(iris, Species, "setosa") ``` -Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. 
+Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cran.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. #### Use third party packages diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 2f3457056..33da89bb9 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -92,7 +92,7 @@ It can be used in `j` in two ways: # when you have only one column to assign to you # can drop the quotes and list(), for convenience DT[, colA := valA] - ``` + ``` (b) The functional form @@ -367,4 +367,3 @@ However we could improve this functionality further by *shallow* copying instead So far we have seen a whole lot in `j`, and how to combine it with `by` and little of `i`. Let's turn our attention back to `i` in the next vignette *"Keys and fast binary search based subset"* to perform *blazing fast subsets* by *keying data.tables*. *** - diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index 9c55cdbd0..3f94392fc 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -110,7 +110,7 @@ DT.m1 In the previous section, we saw how to get from wide form to long form. Let's see the reverse operation in this section. 
-#### - How can we get back to the original data table `DT` from `DT.m`? +#### - How can we get back to the original data table `DT` from `DT.m1`? That is, we'd like to collect all *child* observations corresponding to each `family_id, age_mother` together under the same row. We can accomplish it using `dcast` as follows: @@ -126,7 +126,7 @@ dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob") * `dcast` also tries to preserve attributes in result wherever possible. -#### - Starting from `DT.m`, how can we get the number of children in each family? +#### - Starting from `DT.m1`, how can we get the number of children in each family? You can also pass a function to aggregate by in `dcast` with the argument `fun.aggregate`. This is particularly essential when the formula provided does not identify single observation for each cell. @@ -327,4 +327,3 @@ You can also provide *multiple functions* to `fun.aggregate` to `dcast` for *dat # *** - diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index fda2c4751..f84fd6ea6 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -34,7 +34,7 @@ The simpler usage of `.SD` is for column subsetting (i.e., when `.SDcols` is spe ## Loading and Previewing Lahman Data -To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](http://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. +To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://www.seanlahman.com/baseball-archive/statistics/). 
In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. ```{r download_lahman} load('Teams.RData') @@ -46,7 +46,7 @@ setDT(Pitching) Pitching ``` -Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](http://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. +Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. # `.SD` on Ungrouped Data