From 72cb22993f64c3d4bc62e3331ce2042d35f84dff Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 19 May 2020 12:10:09 +0100 Subject: [PATCH 001/588] upgrade CI to R 4.0 (#4431) --- .gitattributes | 1 + .gitlab-ci.yml | 294 ++++++++++++++++++++++++++----------------------- 2 files changed, 159 insertions(+), 136 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..fa1385d99a --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* -text diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f227e79c7..84af05b7a1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,63 +12,84 @@ stages: .artifacts-template: &artifacts artifacts: - expire_in: 2 weeks + expire_in: 4 weeks when: always paths: - bus -mirror-packages: # download all recursive dependencies of data.table suggests and integration suggests from inst/tests/tests-DESCRIPTION +mirror-packages: ## mirror all recursive dependencies, source and win.binary, of data.table suggests and integration suggests from inst/tests/tests-DESCRIPTION stage: dependencies tags: - linux image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev cache: paths: - - bus/$CI_BUILD_NAME/cran + - bus/$CI_BUILD_NAME/cran variables: - R_BIN_VERSION: "3.6" - R_DEVEL_BIN_VERSION: "4.0" + R_BIN_VERSION: "4.0" + R_DEVEL_BIN_VERSION: "4.1" script: - echo 'source(".ci/ci.R")' >> .Rprofile - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - # mirror R dependencies: source, win.binary - Rscript -e 'mirror.packages(dcf.dependencies(c("DESCRIPTION","inst/tests/tests-DESCRIPTION"), "all"), repos=c(Sys.getenv("CRAN_MIRROR"), dcf.repos("inst/tests/tests-DESCRIPTION")), repodir="bus/mirror-packages/cran")' - - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds # fallback to PACKAGES dcf so available.packages 3.4.4 works + - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_BIN_VERSION","R_DEVEL_BIN_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -build: # build data.table sources as tar.gz archive +build: ## build data.table sources as tar.gz archive stage: build tags: - linux image: registry.gitlab.com/jangorecki/dockerfiles/r-builder dependencies: - - mirror-packages - script: + - mirror-packages + before_script: - Rscript -e 'install.packages("knitr", repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION + script: - R CMD build . - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib/. 
- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/build/cran"), fields="Revision", addFiles=TRUE)' - - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds # fallback to PACKAGES dcf so available.packages 3.4.4 works + - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works <<: *artifacts -.test-copy-src: ©-src +.test-install-deps: &install-deps + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' +.test-install-deps-win: &install-deps-win + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" + +.test-cp-src: &cp-src - cp $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) . +.test-cp-src-win: &cp-src-win + - cp.exe $(ls.exe -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head.exe -n 1) . -.test-move-src: &move-src +.test-mv-src: &mv-src - mkdir -p bus/$CI_BUILD_NAME && mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME +.test-mv-src-win: &mv-src-win + - mkdir.exe -p bus/$CI_BUILD_NAME; mv.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) bus/$CI_BUILD_NAME -.test-cleanup-src: &cleanup-src +.test-rm-src: &rm-src - rm $(ls -1t data.table_*.tar.gz | head -n 1) +.test-rm-src-win: &rm-src-win + - rm.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + +.test-mv-bin-win: &mv-bin-win + - mkdir.exe -p cran/bin/windows/contrib/$R_BIN_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_BIN_VERSION + +.test-install-r-rel-win: &install-r-rel-win + - curl.exe -s -o ../R-win.exe https://cloud.r-project.org/bin/windows/base/R-4.0.0-win.exe; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait +.test-install-r-dev-win: &install-r-dev-win + - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait +.test-install-rtools-win: &install-rtools-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait .test-template: &test stage: test dependencies: - - mirror-packages - - build + - mirror-packages + - build <<: *artifacts .test-lin-template: &test-lin @@ -81,160 +102,161 @@ build: # build data.table sources as tar.gz archive variables: _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" - script: - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' - - *copy-src + before_script: + - *install-deps + - *cp-src - rm -r bus - - *move-src + script: + - *mv-src - cd bus/$CI_BUILD_NAME - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - - *cleanup-src + - *rm-src .test-win-template: &test-win <<: *test tags: - windows - - private - before_script: - - export PATH="/c/$R_DIR/bin:/c/Rtools/bin:$PATH" - - rm -rf /tmp/$R_DIR/library && mkdir -p /tmp/$R_DIR/library - - export R_LIBS_USER="/tmp/$R_DIR/library" + - shared-windows .test-osx-template: &test-osx <<: *test tags: - macosx -test-rel-lin: # most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common -Wunused-result +test-rel-lin: ## most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common 
-Wunused-result <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-builder - variables: # unlike CRAN + variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "TRUE" - script: + before_script: - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies(c("DESCRIPTION","inst/tests/tests-DESCRIPTION"), which="all"), quiet=TRUE)' - - *copy-src + - *cp-src - rm -r bus - - *move-src - mkdir -p ~/.R - echo 'CFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + script: + - *mv-src - cd bus/$CI_BUILD_NAME - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - - *cleanup-src + - *rm-src - (! grep "warning:" data.table.Rcheck/00install.out) -test-rel-vanilla-lin: # minimal installation, no suggested deps, no vignettes or manuals, measure memory, using gcc -O0 -fno-openmp +test-rel-vanilla-lin: ## minimal, no suggested deps, no vignettes or manuals, measure memory, using gcc -O0 -fno-openmp <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev variables: TEST_DATA_TABLE_MEMTEST: "TRUE" before_script: + - *cp-src + - rm -r bus - mkdir -p ~/.R - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars script: - - *copy-src - - rm -r bus - - *move-src + - *mv-src - cd bus/$CI_BUILD_NAME - R CMD check --no-manual --ignore-vignettes $(ls -1t data.table_*.tar.gz | head -n 1) - - *cleanup-src + - *rm-src -test-rel-cran-lin: # currently released R on Linux, extra NOTEs check and build pdf manual thus not from cran-lin template +test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual thus not from cran-lin template <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-builder variables: - _R_CHECK_CRAN_INCOMING_: "TRUE" # stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) - _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" # Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 + _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) + _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "1 NOTE" before_script: + - *install-deps + - *cp-src + - rm -r bus - mkdir -p ~/.R - - echo 'CFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2'> ~/.R/Makevars # -g0 because -g increases datatable.so size from 0.5MB to 1.5MB and breaches 'installed package size <= 5MB' note + - echo 'CFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2'> ~/.R/Makevars ## -g0 because -g increases datatable.so size from 0.5MB to 1.5MB and breaches 'installed package size <= 5MB' note - echo 'CXXFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> 
~/.R/Makevars script: - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' - - *copy-src - - rm -r bus - - *move-src + - *mv-src - cd bus/$CI_BUILD_NAME - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - - *cleanup-src + - *rm-src - >- - Rscript -e 'l<-readLines("data.table.Rcheck/00check.log"); if (!identical(l[length(l)], "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), "(size of tarball) but ", shQuote(toString(l[length(l)]))) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (size of tarball) but ", shQuote(l)) else q("no")' -test-dev-cran-lin: # R-devel on Linux, --enable-strict-barrier --disable-long-double +test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double <<: *test-cran-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel -test-310-cran-lin: # test stated R dependency 3.1.0 +test-310-cran-lin: ## R-3.1.0 on Linux, stated dependency of R <<: *test-cran-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-3.1.0 -test-344-cran-lin: # test last R non-altrep version +test-344-cran-lin: ## R-3.4.4 on Linux, last R non-altrep version <<: *test-cran-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-3.4.4 -test-350-cran-lin: # test first R altrep version +test-350-cran-lin: ## R-3.5.0 on Linux, first R altrep version <<: *test-cran-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-3.5.0 -test-rel-win: # windows test and build binaries +test-rel-win: ## R-release on Windows, test and build binaries <<: *test-win variables: - R_BIN_VERSION: "3.6" - R_DIR: "R-3.6.0" + R_BIN_VERSION: "4.0" + before_script: + - *install-r-rel-win + - *install-rtools-win + - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" + - *install-deps-win + - *cp-src-win + - rm.exe -r bus script: - - Rscript -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='all'), quiet=TRUE)" - - *copy-src - - rm -r bus - - *move-src + - *mv-src-win - cd bus/$CI_BUILD_NAME - - R CMD check --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) - - mkdir -p cran/bin/windows/contrib/$R_BIN_VERSION - - mv $(ls -1t data.table_*.zip | head -n 1) cran/bin/windows/contrib/$R_BIN_VERSION - - *cleanup-src + - R.exe CMD check --no-manual $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - *rm-src-win + - *mv-bin-win -test-dev-win: # R-devel on windows +test-dev-win: ## R-devel on Windows <<: *test-win variables: - R_BIN_VERSION: "4.0" - R_DIR: "R-devel" - TEST_DATA_TABLE_MEMTEST: "FALSE" # disabled as described in #3147 - allow_failure: false + R_BIN_VERSION: "4.1" + before_script: + - *install-r-dev-win + - *install-rtools-win + - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" + - *install-deps-win + - *cp-src-win + - rm.exe -r bus script: - - Rscript -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='all'), quiet=TRUE, contriburl=contrib.url(getOption('repos'), 'binary', ver=Sys.getenv('R_BIN_VERSION')))" - - *copy-src - - rm -r bus - - *move-src + - *mv-src-win - cd bus/$CI_BUILD_NAME - - R CMD check --no-manual --ignore-vignettes $(ls -1t data.table_*.tar.gz | head -n 1) - - R CMD INSTALL 
--build $(ls -1t data.table_*.tar.gz | head -n 1) - - mkdir -p cran/bin/windows/contrib/$R_BIN_VERSION - - mv $(ls -1t data.table_*.zip | head -n 1) cran/bin/windows/contrib/$R_BIN_VERSION - - *cleanup-src + - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - *rm-src-win + - *mv-bin-win -.test-rel-osx: # macosx test and build binaries - <<: *test-osx - variables: - R_BIN_VERSION: "3.6" - script: - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), quiet=TRUE)' - - *copy-src - - rm -r bus - - *move-src - - cd bus/$CI_BUILD_NAME - - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) - - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION - - mv $(ls -1t data.table_*.tgz | head -n 1) cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION - - *cleanup-src +#test-rel-osx: ## R-release on MacOS, no macosx runner yet +# variables: +# R_BIN_VERSION: "4.0" +# before_script: +# - *install-deps +# - *cp-src +# - rm -r bus +# script: +# - *mv-src +# - cd bus/$CI_BUILD_NAME +# - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) +# - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) +# - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION +# - mv $(ls -1t data.table_*.tgz | head -n 1) cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION +# - *rm-src -integration: # merging all artifacts to produce single R repository and summaries +integration: ## merging all artifacts to produce single R repository, documentation and website stage: integration image: registry.gitlab.com/jangorecki/dockerfiles/r-builder tags: @@ -242,45 +264,45 @@ integration: # merging all artifacts to produce single R repository and summarie only: - master dependencies: - - mirror-packages - - build - - test-rel-lin - - test-rel-cran-lin - - test-dev-cran-lin - - test-rel-vanilla-lin - - test-310-cran-lin - - test-344-cran-lin - - test-350-cran-lin - - test-rel-win - - test-dev-win - #- test-rel-osx + - mirror-packages + - build + - test-rel-lin + - test-rel-cran-lin + - test-dev-cran-lin + - test-rel-vanilla-lin + - test-310-cran-lin + - test-344-cran-lin + - test-350-cran-lin + - test-rel-win + - test-dev-win + #- test-rel-osx variables: - R_BIN_VERSION: "3.6" - R_DEVEL_BIN_VERSION: "4.0" + R_BIN_VERSION: "4.0" + R_DEVEL_BIN_VERSION: "4.1" script: - # pkgdown installs pkgs from "." so run at start to have clean root dir + ## pkgdown installs pkgs from "." 
so run at start to have clean root dir - apt-get update -qq && apt-get install -y libxml2-dev - mkdir -p /tmp/pkgdown/library - - R_LIBS_USER=/tmp/pkgdown/library Rscript -e 'install.packages("remotes", repos=Sys.getenv("CRAN_MIRROR"), quiet=TRUE); remotes::install_github("r-lib/pkgdown", repos=Sys.getenv("CRAN_MIRROR"), quiet=TRUE); pkgdown::build_site(override=list(destination="./pkgdown"))' - # html manual, vignettes, repos, cran_web, cran_checks + - R_LIBS_USER=/tmp/pkgdown/library Rscript -e 'install.packages("pkgdown", repos=Sys.getenv("CRAN_MIRROR"), quiet=TRUE); pkgdown::build_site(override=list(destination="./pkgdown"))' + ## html manual, vignettes, repos, cran_web, cran_checks - echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile - # list of available test-* jobs dynamically based on bus/test-* directories + ## list of available test-* jobs dynamically based on bus/test-* directories - Rscript -e 'cat("\ntest.jobs <- c(\n"); cat(paste0(" \"",list.files("bus",pattern="^test-"),"\" = \"data.table\""), sep=",\n"); cat(")\n")' >> .Rprofile - Rscript -e 'sapply(names(test.jobs), check.test, pkg="data.table", simplify=FALSE)' - mkdir -p bus/$CI_BUILD_NAME - # delete any existing non-dev version of data.table + ## delete any existing non-dev version of data.table - rm -f bus/mirror-packages/cran/src/contrib/data.table_*.tar.gz - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_BIN_VERSION/data.table_*.zip - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEVEL_BIN_VERSION/data.table_*.zip #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION/data.table_*.tgz #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEVEL_BIN_VERSION/data.table_*.tgz - # merge mirror-packages and R devel packages + ## merge mirror-packages and R devel packages - mv bus/mirror-packages/cran bus/$CI_BUILD_NAME/ - # publish package sources + ## publish package sources - mkdir -p bus/$CI_BUILD_NAME/cran/library bus/$CI_BUILD_NAME/cran/doc - mv $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="source"), type="source", fields="Revision", addFiles=TRUE)' - # publish binaries + ## publish binaries - Rscript -e 'move.bin("test-rel-win", Sys.getenv("R_BIN_VERSION"), os.type="windows")' - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_BIN_VERSION"), os.type="windows", silent=TRUE)' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_BIN_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' @@ -289,31 +311,31 @@ integration: # merging all artifacts to produce single R repository and summarie #- Rscript -e 'move.bin("test-dev-osx", Sys.getenv("R_DEVEL_BIN_VERSION"), os.type="macosx", silent=TRUE)' #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_BIN_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_DEVEL_BIN_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' - # install all pkgs to render html and double check successful installation of all devel packages + ## install all pkgs to render html and double check successful installation of all devel packages - mkdir -p /tmp/opencran/library /tmp/opencran/doc/html - 
Rscript -e 'install.packages("data.table", dependencies=TRUE, lib="/tmp/opencran/library", repos=file.path("file:",normalizePath("bus/integration/cran")), INSTALL_opts="--html", quiet=TRUE)' - Rscript -e 'packageVersion("data.table", lib.loc="/tmp/opencran/library")' - # CRAN style web/CRAN_web.css + ## CRAN style web/CRAN_web.css - wget -q -P bus/integration/cran/web https://cran.r-project.org/web/CRAN_web.css - # web/packages/$pkg/index.html + ## web/packages/$pkg/index.html - Rscript -e 'sapply(rownames(installed.packages(lib.loc="/tmp/opencran/library", priority="NA")), package.index, lib.loc="/tmp/opencran/library")' - # R docs, html, css, icons + ## R docs, html, css, icons - Rscript -e 'doc.copy(repodir="/tmp/opencran")' - # Update packages.html, rewrite file:/ to relative path + ## Update packages.html, fix paths - Rscript -e 'setwd("/tmp/opencran/doc/html"); make.packages.html(lib.loc="../../library", docdir="/tmp/opencran/doc"); tmp<-readLines(f<-"/tmp/opencran/doc/html/packages.html"); writeLines(gsub("file:///../../library","../../library", tmp, fixed=TRUE), f)' - mv /tmp/opencran/doc bus/integration/cran/ - # library html manual, vignettes + ## library html manual, vignettes - Rscript -e 'lib.copy(lib.from="/tmp/opencran/library")' - # web/checks/$pkg/$job: 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png + ## web/checks/$pkg/$job 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png - Rscript -e 'sapply(names(test.jobs), check.copy, simplify=FALSE)' - # web/packages/$pkg/$pkg.pdf + ## web/packages/$pkg/$pkg.pdf - Rscript -e 'pdf.copy("data.table", "test-rel-lin")' - # web/checks/check_results_$pkg.html + ## web/checks/check_results_$pkg.html - Rscript -e 'check.index("data.table", names(test.jobs))' - # pkgdown merge + ## pkgdown merge - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ - # cleanup artifacts from other jobs + ## cleanup artifacts from other jobs - mkdir tmpbus - mv bus/$CI_BUILD_NAME tmpbus - rm -r bus @@ -326,9 +348,9 @@ integration: # merging all artifacts to produce single R repository and summarie - linux image: docker services: - - docker:dind + - docker:dind dependencies: - - build + - build before_script: - sed "s/SRC_IMAGE_NAME/$SRC_IMAGE_NAME/" < .ci/Dockerfile.in > Dockerfile - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY @@ -337,7 +359,7 @@ integration: # merging all artifacts to produce single R repository and summarie - docker run --rm "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" Rscript -e 'cat(R.version.string, "\ndata.table revision", read.dcf(system.file("DESCRIPTION", package="data.table"), fields="Revision")[[1L]], "\n"); require(data.table); test.data.table()' - docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" -docker-r-release: # publish docker image of data.table on R-release +docker-r-release: ## data.table on R-release only: - master variables: @@ -346,7 +368,7 @@ docker-r-release: # publish docker image of data.table on R-release IMAGE_TAG: "latest" <<: *docker -docker-r-release-builder: # publish on R-release and OS dependencies for building Rmd vignettes +docker-r-release-builder: ## data.table on R-release 
extended for Rmd vignettes build dependencies only: - master variables: @@ -355,7 +377,7 @@ docker-r-release-builder: # publish on R-release and OS dependencies for buildin IMAGE_TAG: "latest" <<: *docker -docker-r-devel: # publish docker image of data.table on R-devel +docker-r-devel: ## data.table on R-devel only: - master variables: @@ -364,7 +386,7 @@ docker-r-devel: # publish docker image of data.table on R-devel IMAGE_TAG: "latest" <<: *docker -docker-tags: # publish only on tagged commits, we use tags for version +docker-tags: ## data.table on R-release fixed version images only: - tags variables: @@ -373,7 +395,7 @@ docker-tags: # publish only on tagged commits, we use tags for version IMAGE_TAG: $CI_COMMIT_TAG <<: *docker -pages: # publish R repository, test jobs summaries, html documentation of all packages in repo, pkgdown +pages: ## publish R repository, test jobs summaries, html documentation of all packages in repo, pkgdown stage: deploy environment: production tags: @@ -382,12 +404,12 @@ pages: # publish R repository, test jobs summaries, html documentation of all pa - master image: ubuntu dependencies: - - integration + - integration script: - mkdir -p public - cp -r bus/integration/cran/* public - cat public/src/contrib/PACKAGES - artifacts: # publish when no failure - expire_in: 2 weeks + artifacts: ## publish only when no failure + expire_in: 4 weeks paths: - public From df93c3d5b79d81edc99d250067abdc45f4a0e402 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 19 May 2020 13:16:53 +0100 Subject: [PATCH 002/588] remove old talk (#4279) --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index 0e96ab1d35..fe85fd8164 100644 --- a/README.md +++ b/README.md @@ -16,14 +16,6 @@ `data.table` provides a high-performance version of [base R](https://www.r-project.org/about.html)'s `data.frame` with syntax and feature enhancements for ease of use, convenience and programming speed. ---- - -**30 January 2020
-List-columns in data.table - Tyson Barrett, [rstudio::conf(2020L)](https://rstudio.com/conference/)** -
- ---- - ## Why `data.table`? * concise syntax: fast to type, fast to read From f477fac43eec644f8ba54514cdf89f44870eed86 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 19 May 2020 23:50:23 +0100 Subject: [PATCH 003/588] fixes R-devel foverlaps failures due to c.POSIXct change (#4428) --- NEWS.md | 2 ++ R/foverlaps.R | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 026f06c86e..7ecd4bc5af 100644 --- a/NEWS.md +++ b/NEWS.md @@ -147,6 +147,8 @@ unit = "s") 7. Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363). +8. Change of `c.POSIXct` method planned for R 4.1.0 impacted `foverlaps` function that could raise `'origin' must be supplied` error. Fix for planned change has been provided in [#4428](https://github.com/Rdatatable/data.table/pull/4428). + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) diff --git a/R/foverlaps.R b/R/foverlaps.R index d4c8a2ae12..8028482abb 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -109,6 +109,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k setattr(icall, 'names', icols) mcall = make_call(mcols, quote(c)) if (type %chin% c("within", "any")) { + if (isposix) mcall[[2L]] = call("unclass", mcall[[2L]]) # fix for R-devel change in c.POSIXct mcall[[3L]] = substitute( # datetimes before 1970-01-01 are represented as -ve numerics, #3349 if (isposix) unclass(val)*(1L + sign(unclass(val))*dt_eps()) @@ -128,7 +129,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k within =, equal = yintervals) call = construct(head(ynames, -2L), uycols, type) if (verbose) {last.started.at=proc.time();cat("unique() + setkey() operations done in ...");flush.console()} - uy = unique(y[, eval(call)]) + uy = unique(y[, eval(call)]) # this started to fail from R 4.1 due to c(POSIXct, numeric) setkey(uy)[, `:=`(lookup = list(list(integer(0L))), type_lookup = list(list(integer(0L))), count=0L, type_count=0L)] if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} matches = function(ii, xx, del, ...) { From 4bda6dac7ede429434412ede26b71cfada76f33d Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 21 May 2020 22:45:40 +0100 Subject: [PATCH 004/588] Remove 'typedef R_xlen_t RLEN' in data.table.h (#4465) --- src/data.table.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 90ff7fb6fc..9be142086b 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -16,13 +16,6 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -// data.table depends on R>=3.0.0 when R_xlen_t was introduced -// Before R 3.0.0, RLEN used to be switched to R_len_t as R_xlen_t wasn't available. -// We could now replace all RLEN with R_xlen_t directly. Or keep RLEN for the shorter -// name so as not to have to check closely one letter difference R_xlen_t/R_len_t. We -// might also undefine R_len_t to ensure not to use it. 
-typedef R_xlen_t RLEN; - #define IS_UTF8(x) (LEVELS(x) & 8) #define IS_ASCII(x) (LEVELS(x) & 64) #define IS_LATIN(x) (LEVELS(x) & 4) From d47a83fb2e25582e508f191f87a31ca81b736b57 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 22 May 2020 18:27:57 -0600 Subject: [PATCH 005/588] dev script updates only: CRAN_Release, and added my .bash_aliases and .Rprofile --- .dev/.Rprofile | 14 ++++++++++++++ .dev/.bash_aliases | 21 +++++++++++++++++++++ .dev/CRAN_Release.cmd | 11 ++++++----- 3 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 .dev/.Rprofile create mode 100644 .dev/.bash_aliases diff --git a/.dev/.Rprofile b/.dev/.Rprofile new file mode 100644 index 0000000000..7d4ab3239d --- /dev/null +++ b/.dev/.Rprofile @@ -0,0 +1,14 @@ +# Matt's ~/.Rprofile is a link to this file at ~/GitHub/data.table/.dev/.Rprofile + +# options(repos = c(CRAN="http://cran.stat.ucla.edu")) +# options(repos = c(CRAN=c("http://cran.stat.ucla.edu", "http://cloud.r-project.org"))) # both needed for revdep checks sometimes +options(repos = c(CRAN="http://cloud.r-project.org")) + +options(help_type="html") +options(error=quote(dump.frames())) +options(width=200) +options(digits.secs=3) # for POSIXct to print milliseconds +suppressWarnings(RNGversion("3.5.0")) # so when I create tests in dev there isn't a mismatch when run by cc() + +Sys.setenv(PROJ_PATH=path.expand("~/GitHub/data.table")) +source(paste0(Sys.getenv("PROJ_PATH"),"/.dev/cc.R")) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases new file mode 100644 index 0000000000..93ea44ed5c --- /dev/null +++ b/.dev/.bash_aliases @@ -0,0 +1,21 @@ +# Matt's ~/.bash_aliases is a link to this file ~/GitHub/data.table/.dev/.bash_aliases + +# One off configure meld as difftool: +# git config --global diff.tool meld +# git config --global difftool.prompt false +alias gd='git difftool &> /dev/null' +alias gdm='git difftool master &> /dev/null' + +alias Rdevel='~/build/R-devel/bin/R --vanilla' +alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' +alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' +alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' +alias R310='~/build/R-3.1.0/bin/R --vanilla' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false' +alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R' + +export R_PROFILE_USER='~/.Rprofile' +# there's a .Rprofile in ~/GitHub/data.table/ so Matt sets R_PROFILE_USER here to always use ~/.Rprofile +# even when starting R in ~/GitHub/data.table +# Matt's ~/.Rprofile as a link to ~/GitHub/data.table/.dev/.Rprofile + diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index e629ee980b..f9d435455e 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -304,7 +304,7 @@ cd R-devel # used for revdep testing: .dev/revdep.R. 
./configure CFLAGS="-O2 -Wall -pedantic" make -# use latest available below `apt cache search gcc-` or `clang-` +# use latest available below `apt-cache search gcc-` or `clang-` cd ../R-devel-strict-clang ./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-8 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make @@ -479,7 +479,7 @@ sudo apt-get -y install r-base r-base-dev sudo apt-get -y build-dep r-base-dev sudo apt-get -y build-dep qpdf sudo apt-get -y install aptitude -sudo aptitude build-dep r-cran-rgl # leads to libglu1-mesa-dev +sudo aptitude -y build-dep r-cran-rgl # leads to libglu1-mesa-dev sudo apt-get -y build-dep r-cran-rmpi sudo apt-get -y build-dep r-cran-cairodevice sudo apt-get -y build-dep r-cran-tkrplot @@ -490,8 +490,7 @@ sudo apt-get -y install libv8-dev sudo apt-get -y install gsl-bin libgsl0-dev sudo apt-get -y install libgtk2.0-dev netcdf-bin sudo apt-get -y install libcanberra-gtk-module -sudo apt-get -y install git -sudo apt-get -y install openjdk-8-jdk +sudo apt-get -y install openjdk-11-jdk # solves "fatal error: jni.h: No such file or directory"; change 11 to match "java --version" sudo apt-get -y install libnetcdf-dev udunits-bin libudunits2-dev sudo apt-get -y install tk8.6-dev sudo apt-get -y install clustalo # for package LowMACA @@ -512,7 +511,7 @@ sudo apt-get -y install libmagick++-dev # for magick sudo apt-get -y install libjq-dev libprotoc-dev libprotobuf-dev and protobuf-compiler # for protolite sudo apt-get -y install python-dev # for PythonInR sudo apt-get -y install gdal-bin libgeos-dev # for rgdal/raster tested via lidR -sudo apt-get build-dep r-cran-rsymphony # for Rsymphony: coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev coinor-libsymphony-dev +sudo apt-get -y build-dep r-cran-rsymphony # for Rsymphony: coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev coinor-libsymphony-dev sudo apt-get -y install libtesseract-dev libleptonica-dev tesseract-ocr-eng # for tesseract sudo apt-get -y install libssl-dev libsasl2-dev sudo apt-get -y install biber # for ctsem @@ -520,6 +519,8 @@ sudo apt-get -y install libopenblas-dev # for ivmte (+ local R build with defau sudo apt-get -y install libhiredis-dev # for redux used by nodbi sudo apt-get -y install libzmq3-dev # for rzmq sudo apt-get -y install libimage-exiftool-perl # for camtrapR +sudo apt-get -y install parallel # for revdepr.R +sudo apt-get -y install pandoc-citeproc # for basecallQC sudo R CMD javareconf # ENDIF From f6bc553e73f3a5c8df1c7cfefb577fa7dcdebe48 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 25 May 2020 16:54:45 -0600 Subject: [PATCH 006/588] relaxed test 1590 given change in R-devel on ordering encodings (#4492) --- inst/tests/tests.Rraw | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ed17470383..3a6148221d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8120,7 +8120,7 @@ test(1588.7, dt[ch>"c"], dt[4:6]) # coverage of a return(NULL) in .prepareFastS # data.table operates consistently independent of locale, but it's R that changes and is sensitive to it. # Because keys/indexes depend on a sort order. 
If a data.table is stored on disk with a key -# created in a locale-sensitive order and then loaded by another R session in a different locale, the ability to re-use existing sortedness +# created in a locale-sensitive order and then loaded by another R session in a different locale, the ability to reuse existing sortedness # will break because the order would depend on the locale. Which is why data.table is deliberately C-locale only. For consistency and simpler # internals for robustness to reduce the change of errors and to avoid that class of bug. It would be possible to have locale-sensitive keys # and indexes but we've, so far, decided not to, for those reasons. @@ -8137,12 +8137,20 @@ Encoding(x1) = "latin1" x2 = iconv(x1, "latin1", "UTF-8") test(1590.01, identical(x1,x2)) test(1590.02, x1==x2) -test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given data.table's needs -test(1590.04, base::order(c(x2,x1,x1,x2)), INT(1,4,2,3)) # different result in base R under C locale even though identical(x1,x2) +test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given identical(x1, x2) + # ^^ data.table consistent over time regardless of which version of R or locale +baseR = base::order(c(x2,x1,x1,x2)) + # Even though C locale and identical(x1,x2), base R considers the encoding too; i.e. orders the same-encoding together. + # In R <= 4.0.0, base R put x2 (UTF-8) before x1 (latin1). + # Then in R-devel around May 2020, R-devel on Windows started putting x1 before x2. + # Jan emailed R-devel on 23 May 2020. PR#4492 retained this test of base R but relaxed the encoding to be in either order. + # It's good to know that baseR changed. We still want to know in future if base R changes again (so we relaxed 1590.04 and 1590.07 rather than remove them). +test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4))) Encoding(x2) = "unknown" test(1590.05, x1!=x2) test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below -test(1590.07, base::order(c(x2,x1,x1,x2)), INT(2,3,1,4)) # different result; base R is encoding-sensitive in C-locale +baseR = base::order(c(x2,x1,x1,x2)) +test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4))) Sys.setlocale("LC_CTYPE", ctype) Sys.setlocale("LC_COLLATE", collate) test(1590.08, Sys.getlocale(), oldlocale) # checked restored locale fully back to how it was before this test From cacdc92df71b777369a217b6c902c687cf35a70d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 25 May 2020 20:25:59 -0600 Subject: [PATCH 007/588] further relaxation of 1590.04 and 1590.07; base R ordering of identical strings in different encodings (#4494) --- inst/tests/tests.Rraw | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3a6148221d..f51f2b641f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8140,17 +8140,17 @@ test(1590.02, x1==x2) test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given identical(x1, x2) # ^^ data.table consistent over time regardless of which version of R or locale baseR = base::order(c(x2,x1,x1,x2)) - # Even though C locale and identical(x1,x2), base R considers the encoding too; i.e. orders the same-encoding together. - # In R <= 4.0.0, base R put x2 (UTF-8) before x1 (latin1). - # Then in R-devel around May 2020, R-devel on Windows started putting x1 before x2. 
- # Jan emailed R-devel on 23 May 2020. PR#4492 retained this test of base R but relaxed the encoding to be in either order. - # It's good to know that baseR changed. We still want to know in future if base R changes again (so we relaxed 1590.04 and 1590.07 rather than remove them). -test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4))) + # Even though C locale and identical(x1,x2), base R<=4.0.0 considers the encoding too; i.e. orders the encoding together x2 (UTF-8) before x1 (latin1). + # Then around May 2020, R-devel (but just on Windows) started either respecting identical() like data.table has always done, or put latin1 before UTF-8. + # Jan emailed R-devel on 23 May 2020. + # We relaxed 1590.04 and 1590.07 (tests of base R behaviour) rather than remove them, PR#4492 and its follow-up. But these two tests + # are so relaxed now that they barely testing anything. It appears base R behaviour is undefined in this rare case of identical strings in different encodings. +test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4)) Encoding(x2) = "unknown" test(1590.05, x1!=x2) test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below baseR = base::order(c(x2,x1,x1,x2)) -test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4))) +test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4)) Sys.setlocale("LC_CTYPE", ctype) Sys.setlocale("LC_COLLATE", collate) test(1590.08, Sys.getlocale(), oldlocale) # checked restored locale fully back to how it was before this test From d3bd408ab3c2f46365b9e0d94a508d7874034d01 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 29 May 2020 01:01:05 -0600 Subject: [PATCH 008/588] is.sorted now multi-column and does not use forderv (#4508) --- NEWS.md | 2 + R/data.table.R | 2 +- R/setkey.R | 13 ++-- inst/tests/tests.Rraw | 24 ++++++- src/data.table.h | 10 +-- src/forder.c | 153 +++++++++++++++++++++++++++++++----------- src/init.c | 24 +++---- src/uniqlist.c | 1 + 8 files changed, 162 insertions(+), 67 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7ecd4bc5af..c90352ed87 100644 --- a/NEWS.md +++ b/NEWS.md @@ -109,6 +109,8 @@ unit = "s") 13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). +14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. 
You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. diff --git a/R/data.table.R b/R/data.table.R index 98651d6e0b..08db908db5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1336,7 +1336,7 @@ replace_dot_alias = function(e) { if (is.data.table(jval)) { setattr(jval, 'class', class(x)) # fix for #64 - if (haskey(x) && all(key(x) %chin% names(jval)) && suppressWarnings(is.sorted(jval, by=key(x)))) # TO DO: perhaps this usage of is.sorted should be allowed internally then (tidy up and make efficient) + if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) setattr(jval, 'sorted', key(x)) if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov } diff --git a/R/setkey.R b/R/setkey.R index 334ca1e801..1f3763b1f6 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -155,20 +155,15 @@ setreordervec = function(x, order) .Call(Creorder, x, order) # The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for # consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors. -is.sorted = function(x, by=seq_along(x)) { +is.sorted = function(x, by=NULL) { if (is.list(x)) { - warning("Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.") - # could pass through a flag for forderv to return early on first FALSE. But we don't need that internally - # since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in. - # This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by) - 0L == length(forderv(x,by,retGrp=FALSE,sort=TRUE)) + if (missing(by)) by = seq_along(x) # wouldn't make sense when x is a vector; hence by=seq_along(x) is not the argument default + if (is.character(by)) by = chmatch(by, names(x)) } else { if (!missing(by)) stop("x is vector but 'by' is supplied") - .Call(Cfsorted, x) } - # Cfsorted could be named CfIsSorted, but since "sorted" is an adjective not verb, it's clear; e.g., Cfsort would sort it ("sort" is verb). + .Call(Cissorted, x, as.integer(by)) # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) - # Important to call forder.c::fsorted here, for consistent character ordering and numeric/integer64 twiddling. 
} ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f51f2b641f..fd43a6119c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4014,8 +4014,28 @@ test(1162.09, length(forderv(DT, by=2:3)), 0L) setkey(DT) # test number 1162.10 skipped because if it fails it confusingly prints out as 1662.1 not 1662.10 test(1162.10, length(forderv(DT, by=1:3)), 0L) -test(1162.11, is.sorted(DT, by=1:3), TRUE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted") -test(1162.12, is.sorted(DT, by=2:1), FALSE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted") +test(1162.11, is.sorted(DT, by=1:3), TRUE) +test(1162.12, is.sorted(DT, by=2:1), FALSE) +test(1162.13, is.sorted(DT), TRUE) +DT = data.table(A=INT(1,1,2), B=c(NA,"a",NA)) +test(1162.14, is.sorted(DT), TRUE) +test(1162.15, is.sorted(DT, by=c("B","A")), FALSE) +DT = data.table(A=INT(1,1,2), B=c("a",NA,NA)) +test(1162.16, is.sorted(DT), FALSE) +test(1162.17, is.sorted(DT, by=2), FALSE) +if (test_bit64) { + DT[, A:=as.integer64(A)] + test(1162.18, is.sorted(DT, by="A"), TRUE) # tests the single-column special case + test(1162.19, is.sorted(DT), FALSE) # tests the 2-column case branch for integer64 + DT[2, B:="b"] + test(1162.20, is.sorted(DT), TRUE) +} +utf8_strings = c("\u00a1tas", "\u00de") +latin1_strings = iconv(utf8_strings, from="UTF-8", to="latin1") +DT = data.table(A=c(utf8_strings, latin1_strings), B=1:4) +test(1162.21, is.sorted(DT), FALSE) +setkey(DT) +test(1162.22, is.sorted(DT), TRUE) # FR #351 - last on length=0 arguments x <- character(0) diff --git a/src/data.table.h b/src/data.table.h index 9be142086b..fca63a0d69 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -23,8 +23,8 @@ #define IS_FALSE(x) (TYPEOF(x)==LGLSXP && LENGTH(x)==1 && LOGICAL(x)[0]==FALSE) #define IS_TRUE_OR_FALSE(x) (TYPEOF(x)==LGLSXP && LENGTH(x)==1 && LOGICAL(x)[0]!=NA_LOGICAL) -#define SIZEOF(x) sizes[TYPEOF(x)] -#define TYPEORDER(x) typeorder[x] +#define SIZEOF(x) __sizes[TYPEOF(x)] +#define TYPEORDER(x) __typeorder[x] #ifdef MIN # undef MIN @@ -92,8 +92,8 @@ extern SEXP sym_datatable_locked; extern double NA_INT64_D; extern long long NA_INT64_LL; extern Rcomplex NA_CPLX; // initialized in init.c; see there for comments -extern size_t sizes[100]; // max appears to be FUNSXP = 99, see Rinternals.h -extern size_t typeorder[100]; +extern size_t __sizes[100]; // max appears to be FUNSXP = 99, see Rinternals.h +extern size_t __typeorder[100]; // __ prefix otherwise if we use these names directly, the SIZEOF define ends up using the local one long long DtoLL(double x); double LLtoD(long long x); @@ -115,7 +115,7 @@ int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); -uint64_t dtwiddle(void *p, int i); +uint64_t dtwiddle(const void *p, int i); SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); int getNumericRounding_C(); diff --git a/src/forder.c b/src/forder.c index ea0be76d04..1ee8c74ce3 100644 --- a/src/forder.c +++ b/src/forder.c @@ -215,7 +215,7 @@ int StrCmp(SEXP x, SEXP y) if (x == y) return 0; // same cached pointer (including NA_STRING==NA_STRING) if (x == NA_STRING) return -1; // xy - return strcmp(CHAR(ENC2UTF8(x)), CHAR(ENC2UTF8(y))); + return strcmp(CHAR(ENC2UTF8(x)), CHAR(ENC2UTF8(y))); // TODO: always calling ENC2UTF8 here could be expensive } /* ENC2UTF8 handles encoding issues by converting all marked non-utf8 encodings alone to 
utf8 first. The function could be wrapped in the first if-statement already instead of at the last stage, but this is to ensure that all-ascii cases are handled with maximum efficiency. @@ -394,7 +394,7 @@ int getNumericRounding_C() // for signed integers it's easy: flip sign bit to swap positives and negatives; the resulting unsigned is in the right order with INT_MIN ending up as 0 // for floating point finite you have to flip the other bits too if it was signed: http://stereopsis.com/radix.html -uint64_t dtwiddle(void *p, int i) +uint64_t dtwiddle(const void *p, int i) { union { double d; @@ -1257,50 +1257,127 @@ void radix_r(const int from, const int to, const int radix) { } -SEXP fsorted(SEXP x) +SEXP issorted(SEXP x, SEXP by) { // Just checks if ordered and returns FALSE early if not. Does not return ordering if so, unlike forder. // Always increasing order with NA's first - // Similar to base:is.unsorted but accepts NA at the beginning (standard in data.table and considered sorted) rather than returning NA when NA present. + // Similar to base:is.unsorted but accepts NA at the beginning (standard in data.table and considered sorted) rather than + // returning NA when NA present, and is multi-column. // TODO: test in big steps first to return faster if unsortedness is at the end (a common case of rbind'ing data to end) - // These are all sequential access to x, so very quick and cache efficient. Could be parallel by checking continuity at batch boundaries. - const int n = length(x); - if (n <= 1) return(ScalarLogical(TRUE)); - if (!isVectorAtomic(x)) STOP(_("is.sorted (R level) and fsorted (C level) only to be used on vectors. If needed on a list/data.table, you'll need the order anyway if not sorted, so use if (length(o<-forder(...))) for efficiency in one step, or equivalent at C level")); - int i=1; - switch(TYPEOF(x)) { - case INTSXP : case LGLSXP : { - int *xd = INTEGER(x); - while (i=xd[i-1]) i++; - } break; - case REALSXP : - if (inherits(x,"integer64")) { - int64_t *xd = (int64_t *)REAL(x); + // These are all sequential access to x, so quick and cache efficient. Could be parallel by checking continuity at batch boundaries. 
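(For reference, a minimal R-level sketch of the question this C routine answers, taken from the 1162.x tests added in this same patch; `is.sorted` is an internal helper, hence the `data.table:::` access, and this assumes a build of this branch.)

```r
library(data.table)
DT = data.table(A = c(1L, 1L, 2L), B = c(NA, "a", NA))
data.table:::is.sorted(DT)                    # TRUE: rows are non-decreasing over (A, B), with NA sorting first
data.table:::is.sorted(DT, by = c("B", "A"))  # FALSE: ordered by B first, the NA in row 3 would have to come before "a"
```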
+ + if (!isNull(by) && !isInteger(by)) STOP(_("Internal error: issorted 'by' must be NULL or integer vector")); + if (isVectorAtomic(x) || length(by)==1) { + // one-column special case is very common so specialize it by avoiding column-type switches inside the row-loop later + if (length(by)==1) { + if (INTEGER(by)[0]<1 || INTEGER(by)[0]>length(x)) STOP(_("issorted 'by' [%d] out of range [1,%d]"), INTEGER(by)[0], length(x)); + x = VECTOR_ELT(x, INTEGER(by)[0]-1); + } + const int n = length(x); + if (n <= 1) return(ScalarLogical(TRUE)); + if (!isVectorAtomic(x)) STOP(_("is.sorted does not work on list columns")); + int i=1; + switch(TYPEOF(x)) { + case INTSXP : case LGLSXP : { + int *xd = INTEGER(x); while (i=xd[i-1]) i++; - } else { - double *xd = REAL(x); - while (i=dtwiddle(xd,i-1)) i++; + } break; + case REALSXP : + if (inherits(x,"integer64")) { + int64_t *xd = (int64_t *)REAL(x); + while (i=xd[i-1]) i++; + } else { + double *xd = REAL(x); + while (i=dtwiddle(xd,i-1)) i++; // TODO: change to loop over any NA or -Inf at the beginning and then proceed without dtwiddle() (but rounding) + } + break; + case STRSXP : { + SEXP *xd = STRING_PTR(x); + i = 0; + while (i1 + // pre-save lookups to save deep switch later for each column type + size_t *sizes = (size_t *)R_alloc(ncol, sizeof(size_t)); + const char **ptrs = (const char **)R_alloc(ncol, sizeof(char *)); + int *types = (int *)R_alloc(ncol, sizeof(int)); + for (int j=0; jlength(x)) STOP(_("issorted 'by' [%d] out of range [1,%d]"), c, length(x)); + SEXP col = VECTOR_ELT(x, c-1); + sizes[j] = SIZEOF(col); + switch(TYPEOF(col)) { + case INTSXP: case LGLSXP: + types[j] = 0; + ptrs[j] = (const char *)INTEGER(col); + break; + case REALSXP: + types[j] = inherits(col, "integer64") ? 2 : 1; + ptrs[j] = (const char *)REAL(col); + break; + case STRSXP: + types[j] = 3; + ptrs[j] = (const char *)STRING_PTR(col); + break; + default: + STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(col))); // # nocov + } + } + for (R_xlen_t i=1; ip[-1]; + } break; + case 1: { // regular double in REALSXP + const double *p = (const double *)colp; + ok = dtwiddle(p,0)>dtwiddle(p,-1); // TODO: avoid dtwiddle by looping over any NA at the beginning, and remove NumericRounding. + } break; + case 2: { // integer64 in REALSXP + const int64_t *p = (const int64_t *)colp; + ok = p[0]>p[-1]; + } break; + case 3 : { // STRSXP + const SEXP *p = (const SEXP *)colp; + if (*p==NA_STRING) { + ok = false; // previous value not NA (otherwise memcmp would have returned equal above) so can't be ordered + } else { + ok = (NEED2UTF8(p[0]) || NEED2UTF8(p[-1]) ? 
// TODO: provide user option to choose ascii-only mode + strcmp(CHAR(ENC2UTF8(p[0])), CHAR(ENC2UTF8(p[-1]))) : + strcmp(CHAR(p[0]), CHAR(p[-1]))) >= 0; + } + } break; + default : + STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x))); // # nocov + } + if (!ok) return ScalarLogical(FALSE); // not sorted so return early + break; // this item is greater than previous in this column so ignore any remaining columns on this row } - } break; - default : - STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x))); } - return ScalarLogical(i==n); + return ScalarLogical(TRUE); } SEXP isOrderedSubset(SEXP x, SEXP nrowArg) diff --git a/src/init.c b/src/init.c index aed2da3dbd..916db3ab57 100644 --- a/src/init.c +++ b/src/init.c @@ -33,8 +33,8 @@ SEXP sym_datatable_locked; double NA_INT64_D; long long NA_INT64_LL; Rcomplex NA_CPLX; -size_t sizes[100]; -size_t typeorder[100]; +size_t __sizes[100]; +size_t __typeorder[100]; // .Calls SEXP setattrib(); @@ -66,7 +66,7 @@ SEXP fcast(); SEXP uniqlist(); SEXP uniqlengths(); SEXP forder(); -SEXP fsorted(); +SEXP issorted(); SEXP gforce(); SEXP gsum(); SEXP gmean(); @@ -152,7 +152,7 @@ R_CallMethodDef callMethods[] = { {"Cuniqlist", (DL_FUNC) &uniqlist, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, {"Cforder", (DL_FUNC) &forder, -1}, -{"Cfsorted", (DL_FUNC) &fsorted, -1}, +{"Cissorted", (DL_FUNC) &issorted, -1}, {"Cgforce", (DL_FUNC) &gforce, -1}, {"Cgsum", (DL_FUNC) &gsum, -1}, {"Cgmean", (DL_FUNC) &gmean, -1}, @@ -221,15 +221,15 @@ R_ExternalMethodDef externalMethods[] = { }; static void setSizes() { - for (int i=0; i<100; ++i) { sizes[i]=0; typeorder[i]=0; } + for (int i=0; i<100; ++i) { __sizes[i]=0; __typeorder[i]=0; } // only these types are currently allowed as column types : - sizes[LGLSXP] = sizeof(int); typeorder[LGLSXP] = 0; - sizes[RAWSXP] = sizeof(Rbyte); typeorder[RAWSXP] = 1; - sizes[INTSXP] = sizeof(int); typeorder[INTSXP] = 2; // integer and factor - sizes[REALSXP] = sizeof(double); typeorder[REALSXP] = 3; // numeric and integer64 - sizes[CPLXSXP] = sizeof(Rcomplex); typeorder[CPLXSXP] = 4; - sizes[STRSXP] = sizeof(SEXP *); typeorder[STRSXP] = 5; - sizes[VECSXP] = sizeof(SEXP *); typeorder[VECSXP] = 6; // list column + __sizes[LGLSXP] = sizeof(int); __typeorder[LGLSXP] = 0; + __sizes[RAWSXP] = sizeof(Rbyte); __typeorder[RAWSXP] = 1; + __sizes[INTSXP] = sizeof(int); __typeorder[INTSXP] = 2; // integer and factor + __sizes[REALSXP] = sizeof(double); __typeorder[REALSXP] = 3; // numeric and integer64 + __sizes[CPLXSXP] = sizeof(Rcomplex); __typeorder[CPLXSXP] = 4; + __sizes[STRSXP] = sizeof(SEXP *); __typeorder[STRSXP] = 5; + __sizes[VECSXP] = sizeof(SEXP *); __typeorder[VECSXP] = 6; // list column if (sizeof(char *)>8) error(_("Pointers are %d bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); // One place we need the largest sizeof is the working memory malloc in reorder.c } diff --git a/src/uniqlist.c b/src/uniqlist.c index 447b3ea057..e4cdfaa0e5 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -114,6 +114,7 @@ SEXP uniqlist(SEXP l, SEXP order) // fix for #469, when key is set, duplicated calls uniqlist, where encoding // needs to be taken care of. b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings. 
+ // TODO: surely faster way than this two deep STRING_ELT() case REALSXP : ulv = (unsigned long long *)REAL(v); b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) From 8b93bb22715b45d38acf185f40d573bda8748cb4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 1 Jun 2020 18:22:03 -0600 Subject: [PATCH 009/588] bmerge.c compile warning in dev (#4518) --- src/bmerge.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bmerge.c b/src/bmerge.c index 15d7d6f4f7..5273ae59b9 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -205,7 +205,7 @@ static union { SEXP s; } ival, xval; -static uint64_t i64twiddle(void *p, int i) +static uint64_t i64twiddle(const void *p, int i) { return ((uint64_t *)p)[i] ^ 0x8000000000000000; // Always ascending and NA first (0) when used by bmerge @@ -343,7 +343,7 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg double *dic = REAL(ic); double *dxc = REAL(xc); isInt64 = INHERITS(xc, char_integer64); - uint64_t (*twiddle)(void *, int) = isInt64 ? &i64twiddle : &dtwiddle; + uint64_t (*twiddle)(const void *, int) = isInt64 ? &i64twiddle : &dtwiddle; // TODO: remove this last remaining use of i64twiddle. ival.ull = twiddle(dic, ir); while(xlow < xupp-1) { From cb44a1e41ade6150ec2ea7a403d1eeaf757efa1e Mon Sep 17 00:00:00 2001 From: JenspederM <37183160+JenspederM@users.noreply.github.com> Date: Tue, 2 Jun 2020 03:06:17 +0200 Subject: [PATCH 010/588] Updated my name (#4336) * Updated my name * Removed white space --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5dd73e284c..0e52423574 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,7 +57,7 @@ Authors@R: c( person("David","Simons", role="ctb"), person("Elliott","Sales de Andrade", role="ctb"), person("Cole","Miller", role="ctb"), - person("@JenspederM","", role="ctb")) + person("Jens Peder","Meldgaard", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml From 389a5f89683c364d9fac64452b0b008a146840b4 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 2 Jun 2020 02:13:40 +0100 Subject: [PATCH 011/588] build and install stuff (#4510) --- Makefile | 3 +++ configure | 24 ++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 5cd797ca75..634c823d9a 100644 --- a/Makefile +++ b/Makefile @@ -42,3 +42,6 @@ test: check: _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.12.9.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error +.PHONY: revision +revision: + echo "Revision: $(shell git rev-parse HEAD)" >> DESCRIPTION diff --git a/configure b/configure index e29430ad77..4aef525edc 100755 --- a/configure +++ b/configure @@ -1,4 +1,18 @@ #!/bin/sh + +# Find R compilers +CC=`${R_HOME}/bin/R CMD config CC` +CFLAGS=`${R_HOME}/bin/R CMD config CFLAGS` +# compiler and flags to 'cc' file +echo "CC=${CC}" > inst/cc +echo "CFLAGS=${CFLAGS}" >> inst/cc + +# gcc compiler info to output #3291 +case $CC in gcc*) + GCCV=`${CC} -dumpfullversion -dumpversion` + echo "$CC $GCCV" +esac + # Let's keep this simple. If pkg-config is available, use it. Otherwise print # the helpful message to aid user if compilation does fail. 
Note 25 of R-exts: # "[pkg-config] is available on the machines used to produce the CRAN binary packages" @@ -51,10 +65,6 @@ fi version=`pkg-config --modversion zlib` echo "zlib ${version} is available ok" -# Find R compilers -CC=`${R_HOME}/bin/R CMD config CC` -CFLAGS=`${R_HOME}/bin/R CMD config CFLAGS` - # Test if we have a OPENMP compatible compiler # Aside: ${SHLIB_OPENMP_CFLAGS} does not appear to be defined at this point according to Matt's testing on # Linux, and R CMD config SHLIB_OPENMP_CFLAGS also returns 'no information for variable'. That's not @@ -77,10 +87,4 @@ else sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars fi -# compiler info to output #3291 -if [ "$CC"=~"gcc" ]; then - GCCV=`${CC} -dumpfullversion -dumpversion` - echo "$CC $GCCV" -fi - exit 0 From 8a57236439d2c0be78bc2f9a40b881822fea8135 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 2 Jun 2020 02:23:30 +0100 Subject: [PATCH 012/588] amend doc for C exports, #4504 (#4507) --- NEWS.md | 2 +- man/cdt.Rd | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c90352ed87..75460bd6bb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,7 @@ 1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). -2. The C function `CsubsetDT` is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). +2. The C function `CsubsetDT` is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organisation of our C interface will be changed in next release. 3. `print` method for `data.table`s gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. 
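As a rough R sketch of the `trunc.cols` behaviour described just above (the 80-column table here is invented purely for illustration):

    library(data.table)
    DT = as.data.table(matrix(1L, nrow=2L, ncol=80L))  # wider than a typical console
    print(DT, trunc.cols=TRUE)                 # prints only the columns that fit, plus a message naming the columns not shown
    options(datatable.print.trunc.cols=TRUE)   # or enable the same behaviour globally
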
diff --git a/man/cdt.Rd b/man/cdt.Rd index 13fa58b64d..ea7c3a76eb 100644 --- a/man/cdt.Rd +++ b/man/cdt.Rd @@ -2,6 +2,7 @@ \alias{cdatatable} \title{ data.table exported C routines } \description{ + Note that this interface is going to be changed in next release. Some of internally used C routines are now exported. This interface should be considered experimental. List of exported C routines and their signatures are provided below in the usage section. } \usage{ From 09fb6c3181b60f5c2e4201aca0b66accb46a4485 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 2 Jun 2020 02:47:28 +0100 Subject: [PATCH 013/588] unload namespace before reinstalling, #4403 (#4404) --- NEWS.md | 2 ++ R/devel.R | 31 +++++++++++++++++++++---------- man/update.dev.pkg.Rd | 17 ++++++----------- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index 75460bd6bb..514ea3bf62 100644 --- a/NEWS.md +++ b/NEWS.md @@ -151,6 +151,8 @@ unit = "s") 8. Change of `c.POSIXct` method planned for R 4.1.0 impacted `foverlaps` function that could raise `'origin' must be supplied` error. Fix for planned change has been provided in [#4428](https://github.com/Rdatatable/data.table/pull/4428). +9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) diff --git a/R/devel.R b/R/devel.R index 8db74e47ce..b0dfb71858 100644 --- a/R/devel.R +++ b/R/devel.R @@ -7,7 +7,7 @@ dcf.lib = function(pkg, field, lib.loc=NULL){ if (nzchar(dcf)) read.dcf(dcf, fields=field)[1L] else NA_character_ } -dcf.repo = function(pkg, repo, field, type){ +dcf.repo = function(pkg, repo, field, type) { # get DESCRIPTION metadata field from remote PACKAGES file stopifnot(is.character(pkg), is.character(field), length(pkg)==1L, length(field)==1L, is.character(repo), length(repo)==1L, field!="Package") idx = file(file.path(contrib.url(repo, type=type),"PACKAGES")) @@ -17,22 +17,33 @@ dcf.repo = function(pkg, repo, field, type){ dcf[dcf[,"Package"]==pkg, field][[1L]] } -update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...){ +update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { + # this works for any package, not just data.table pkg = object # perform package upgrade when new Revision present stopifnot(is.character(pkg), length(pkg)==1L, !is.na(pkg), is.character(repo), length(repo)==1L, !is.na(repo), is.character(field), length(field)==1L, !is.na(field), is.null(lib) || (is.character(lib) && length(lib)==1L && !is.na(lib))) + # get Revision field from remote repository PACKAGES file una = is.na(ups<-dcf.repo(pkg, repo, field, type)) - upg = una | !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) - if (upg) utils::install.packages(pkg, repos=repo, type=type, lib=lib, ...) - if (una) cat(sprintf("No commit information found in DESCRIPTION file for %s package. 
Unsure '%s' is correct field name in PACKAGES file in your devel repository '%s'.\n", pkg, field, file.path(repo, "src","contrib","PACKAGES"))) - cat(sprintf("R %s package %s %s (%s)\n", - pkg, - c("is up-to-date at","has been updated to")[upg+1L], - dcf.lib(pkg, field, lib.loc=lib), - utils::packageVersion(pkg, lib.loc=lib))) + if (una) + cat(sprintf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. Otherwise package will be re-installed every time, proceeding to installation.\n", + pkg, field, contrib.url(repo, type=type))) + # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo + upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) + # update.dev.pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 + on.exit({ + if (upg) { + unloadNamespace(pkg) ## hopefully will release dll lock on Windows + utils::install.packages(pkg, repos=repo, type=type, lib=lib, ...) + } + cat(sprintf("R %s package %s %s (%s)\n", + pkg, + c("is up-to-date at","has been updated to")[upg+1L], + unname(read.dcf(system.file("DESCRIPTION", package=pkg, lib.loc=lib, mustWork=TRUE), fields=field)[, field]), + utils::packageVersion(pkg, lib.loc=lib))) + }) } # non-exported utility when using devel version #3272: data.table:::.git() diff --git a/man/update.dev.pkg.Rd b/man/update.dev.pkg.Rd index f4802641cc..96f87c296d 100644 --- a/man/update.dev.pkg.Rd +++ b/man/update.dev.pkg.Rd @@ -3,15 +3,9 @@ \alias{update.dev.pkg} \title{Perform update of development version of a package} \description{ - It will download and install package from devel repository only when new commit is - available there, otherwise only PACKAGES file is transferred. Defaults are set to update \code{data.table}, other - packages can be used. Their repository has to include git commit - information in PACKAGES file. + It will download and install package from devel repository only when new commit is available there, otherwise only PACKAGES file is transferred. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. } - -\usage{\method{update}{dev.pkg}(object="data.table", -repo="https://Rdatatable.gitlab.io/data.table", field="Revision", -type=getOption("pkgType"), lib=NULL, \dots) + \usage{\method{update}{dev.pkg}(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } \arguments{ \item{object}{ character scalar, package name. } @@ -25,9 +19,10 @@ type=getOption("pkgType"), lib=NULL, \dots) \item{\dots}{ passed to \code{\link[utils]{install.packages}}. } } \details{ - In case if devel repository does not provide package binaries user has - have development tools installed for package compilation to use - this function. + In case if a devel repository does not provide binaries user will need development tools installed for package compilation, like \emph{Rtools} on Windows, and eventually set \code{type="source"}. +} +\note{ + Package namespace is unloaded before attempting to install newer version. } \value{ NULL. 
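A minimal usage sketch for the updater reworked above, assuming the devel repository publishes a Revision field in its PACKAGES file as the defaults expect:

    data.table::update.dev.pkg()                             # reinstalls only when a newer Revision is available upstream
    data.table::update.dev.pkg("data.table", type="source")  # e.g. force a source build where no binaries are published
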
From b1f73cfb5e3542b95e4b727736176d6ddea5a7b5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 1 Jun 2020 20:42:58 -0600 Subject: [PATCH 014/588] .Rd 90 char width warning; follow-up to #4404 --- man/update.dev.pkg.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/update.dev.pkg.Rd b/man/update.dev.pkg.Rd index 96f87c296d..72b6e7b166 100644 --- a/man/update.dev.pkg.Rd +++ b/man/update.dev.pkg.Rd @@ -5,7 +5,9 @@ \description{ It will download and install package from devel repository only when new commit is available there, otherwise only PACKAGES file is transferred. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. } - \usage{\method{update}{dev.pkg}(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) +\usage{\method{update}{dev.pkg}(object="data.table", + repo="https://Rdatatable.gitlab.io/data.table", + field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } \arguments{ \item{object}{ character scalar, package name. } From 738289daa62ae2e6e1da3c8c3e9770a045c5f219 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 2 Jun 2020 07:37:27 +0100 Subject: [PATCH 015/588] all.equal ignore row order, fix #4422 (#4423) --- NEWS.md | 2 ++ R/setops.R | 11 +++++------ inst/tests/tests.Rraw | 5 +++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 514ea3bf62..68d69b8d1a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -111,6 +111,8 @@ unit = "s") 14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. +15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422). + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. 
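A small sketch of the case targeted by NEWS item 15 above, mirroring the regression test added later in this patch:

    d1 = data.table(a=1:2, b=c(1L,NA))
    d2 = data.table(a=1:2, b=1:2)
    all.equal(d1, d2, ignore.row.order=TRUE)  # previously could come back TRUE; now reports rows of 'current' not present in 'target'
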
diff --git a/R/setops.R b/R/setops.R index 4c65773117..1ac949601b 100644 --- a/R/setops.R +++ b/R/setops.R @@ -216,13 +216,12 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu tolerance = 0 } jn.on = copy(names(target)) # default, possible altered later on - char.cols = vapply_1c(target,typeof)=="character" - if (!identical(tolerance, 0)) { # handling character columns only for tolerance!=0 - if (all(char.cols)) { - msg = c(msg, "Both datasets have character columns only, together with ignore.row.order this force 'tolerance' argument to 0, for character columns it does not have effect") + dbl.cols = vapply_1c(target,typeof)=="double" + if (!identical(tolerance, 0)) { + if (!any(dbl.cols)) { # dbl.cols handles (removed) "all character columns" (char.cols) case as well tolerance = 0 - } else if (any(char.cols)) { # character col cannot be the last one during rolling join - jn.on = jn.on[c(which(char.cols), which(!char.cols))] + } else { + jn.on = jn.on[c(which(!dbl.cols), which(dbl.cols))] # double column must be last for rolling join } } if (target_dup && current_dup) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fd43a6119c..6ce67d4dbb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16881,3 +16881,8 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# all.equal ignore row order improperly handle NAs #4422 +d1 = data.table(a=1:2, b=c(1L,NA)) +d2 = data.table(a=1:2, b=1:2) +test(2139.1, all.equal(d1, d2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target'") From 3feb33b936913a36b0e3e38acfb48d2cbbd83a74 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 2 Jun 2020 05:48:13 -0400 Subject: [PATCH 016/588] Non-equi joins are allow.cartesian = TRUE (#4493) --- NEWS.md | 2 ++ R/data.table.R | 3 +++ inst/tests/tests.Rraw | 10 ++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 68d69b8d1a..99b369b540 100644 --- a/NEWS.md +++ b/NEWS.md @@ -113,6 +113,8 @@ unit = "s") 15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422). +16. Non-equi joins now automatically set `allow.cartesian=TRUE`, [4489](https://github.com/Rdatatable/data.table/issues/4489). Thanks to @Henrik-P for reporting. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. 
This is why in the last license change, all contributors of code were consulted and each had a veto. diff --git a/R/data.table.R b/R/data.table.R index 08db908db5..20d7cfa396 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -428,6 +428,9 @@ replace_dot_alias = function(e) { on_ops = .parse_on(substitute(on), isnull_inames) on = on_ops[[1L]] ops = on_ops[[2L]] + if (any(ops > 1L)) { ## fix for #4489; ops = c("==", "<=", "<", ">=", ">", "!=") + allow.cartesian = TRUE + } # TODO: collect all '==' ops first to speeden up Cnestedid rightcols = colnamesInt(x, names(on), check_dups=FALSE) leftcols = colnamesInt(i, unname(on), check_dups=FALSE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6ce67d4dbb..85a9dbe771 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16882,7 +16882,13 @@ test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) -# all.equal ignore row order improperly handle NAs #4422 +# all.equal ignore row order improperly handle NAs, #4422 d1 = data.table(a=1:2, b=c(1L,NA)) d2 = data.table(a=1:2, b=1:2) -test(2139.1, all.equal(d1, d2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target'") +test(2139, all.equal(d1, d2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target'") + +# Set allow.cartesian = TRUE when non-equi, #4489 +dt = data.table(time = 1:8, v = INT(5,7,6,1,8,4,2,3)) +dt[time == 2L, v := 2L] +dt[time == 7L, v := 7L] +test(2140, dt[dt, on=.(time>time, v>v), .N, by=.EACHI], data.table(time=1:8, v=INT(5,2,6,1,8,4,7,3), N=INT(3,5,2,4,0,1,0,0))) From 17ac2d23467bdc01bf2a66edd92f06bb2723ee3a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 2 Jun 2020 17:56:16 +0800 Subject: [PATCH 017/588] Runlock max (#4183) --- NEWS.md | 2 +- R/data.table.R | 9 ++++++--- inst/tests/tests.Rraw | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 99b369b540..efc1a43ca6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -89,7 +89,7 @@ unit = "s") 3. Dispatch of `first` and `last` functions now properly works again for `xts` objects, [#4053](https://github.com/Rdatatable/data.table/issues/4053). Thanks to @ethanbsmith for reporting. -4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). +4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks also to @mllg for detecting a problem with the initial fix here during the dev release [#4173](https://github.com/Rdatatable/data.table/issues/4173). 5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). diff --git a/R/data.table.R b/R/data.table.R index 20d7cfa396..c668fc8020 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1769,10 +1769,13 @@ replace_dot_alias = function(e) { ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) } # unlock any locked data.table components of the answer, #4159 - runlock = function(x) { - if (is.recursive(x)) { + # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 + # TODO: is there an efficient way to get around this MAX_DEPTH limit? 
+ MAX_DEPTH = 5L + runlock = function(x, current_depth = 1L) { + if (is.recursive(x) && current_depth <= MAX_DEPTH) { if (inherits(x, 'data.table')) .Call(C_unlock, x) - else return(lapply(x, runlock)) + else return(lapply(x, runlock, current_depth = current_depth + 1L)) } return(invisible()) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 85a9dbe771..ee09fc3517 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16811,8 +16811,19 @@ test(2130.03, print(DT), output=c(" x y", "1: 1 ", # .SD from grouping should be unlocked, part of #4159 x = data.table(a=1:3, b=4:6) -test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.locked'), - list(NULL, NULL, NULL)) +test(2131.1, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.locked'), + list(NULL, NULL, NULL)) +## truly recursive object (contains itself) can cause infinite recursion, #4173 +f = function(data) { + x = new.env() + x$a = 2 + x$b = x + x +} + +dt = data.table(x = rep(1:3, each = 3), y = runif(9)) +out = dt[, list(evaluated = list(f(copy(.SD)))), by = x] +test(2131.2, class(out$evaluated[[1L]]), 'environment') # S4 object not suported in fifelse and fcase, #4135 class2132 = setClass("class2132", slots=list(x="numeric")) From fbd4a3343cae3be177898e3643303e46a1a10618 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 3 Jun 2020 01:33:31 +0800 Subject: [PATCH 018/588] fix regression in detecting g[[ (#4414) --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index c668fc8020..75b6b290ed 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1541,7 +1541,7 @@ replace_dot_alias = function(e) { # g[[ only applies to atomic input, for now, was causing #4159 subopt = length(jsub) == 3L && (jsub[[1L]] == "[" || - (jsub[[1L]] == "[[" && eval(call('is.atomic', jsub[[2L]]), envir = x))) && + (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), envir = x))) && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ee09fc3517..c71b27ca02 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8079,6 +8079,13 @@ dt = data.table(a=1:3) dt[ , l := .(list(1, 2, 3))] test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], dt[ , l := unlist(l)], output='(GForce FALSE)') +# make sure not to apply when `[[` is applied to a nested call, #4413 +DT = data.table(f1=c("a","b"), f2=c("x","y")) +l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) +test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], + data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) +test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], + data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) From 43465a408d618957b8c3bda6b6c53cdf221558a3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 3 Jun 2020 21:05:11 -0600 Subject: [PATCH 019/588] Support positional specifiers on Windows for translations (#4523) --- src/data.table.h | 5 +++++ src/dt_stdio.h | 1 + src/fmelt.c | 4 ++-- src/freadR.c | 2 +- src/rbindlist.c | 6 +++--- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index fca63a0d69..1cf975e68b 100644 --- a/src/data.table.h +++ 
b/src/data.table.h @@ -13,6 +13,11 @@ #include "myomp.h" #include "types.h" #include "po.h" +#ifdef WIN32 // positional specifiers (%n$) used in translations; #4402 +//# define snprintf _sprintf_p // the non-n one in Windows takes n anyway so there's no separate _snprintf_f +#endif +#define sprintf USE_SNPRINTF_NOT_SPRINTF // prevent use of sprintf in data.table source; force us to use n always + // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); diff --git a/src/dt_stdio.h b/src/dt_stdio.h index 4e69e0d87e..f652da9805 100644 --- a/src/dt_stdio.h +++ b/src/dt_stdio.h @@ -23,6 +23,7 @@ #define DT_STDIO_H #if defined(__MINGW32__) || (defined __MINGW64__) #define __USE_MINGW_ANSI_STDIO 1 + #define _XOPEN_SOURCE 1 #include #define PRId64 "lld" #define PRIu64 "llu" diff --git a/src/fmelt.c b/src/fmelt.c index 3d1effa8a9..6824768e69 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -526,7 +526,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; if (thislen==0) continue; // so as not to bump level char buff[20]; - sprintf(buff, "%d", level++); + snprintf(buff, 20, "%d", level++); SEXP str = PROTECT(mkChar(buff)); for (int k=0; knarm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; if (thislen==0) continue; // so as not to bump level char buff[20]; - sprintf(buff, "%d", nlevel+1); + snprintf(buff, 20, "%d", nlevel+1); SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels for (int k=0; k=last) { // if tl>=0 then also tl>=last because last<=0 if (tl>=0) { - sprintf(warnStr, // not direct warning as we're inside tl region + snprintf(warnStr, 1000, // not direct warning as we're inside tl region _("Column %d of item %d is an ordered factor but level %d ['%s'] is missing from the ordered levels from column %d of item %d. " \ "Each set of ordered factor levels should be an ordered subset of the first longest. A regular factor will be created for this column."), w+1, i+1, k+1, CHAR(s), longestW+1, longestI+1); } else { - sprintf(warnStr, + snprintf(warnStr, 1000, _("Column %d of item %d is an ordered factor with '%s'<'%s' in its levels. But '%s'<'%s' in the ordered levels from column %d of item %d. " \ "A regular factor will be created for this column due to this ambiguity."), w+1, i+1, CHAR(levelsD[k-1]), CHAR(s), CHAR(s), CHAR(levelsD[k-1]), longestW+1, longestI+1); From 2063ed78580334e51a7509f3a014596594e53f67 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 3 Jun 2020 21:54:09 -0600 Subject: [PATCH 020/588] CRAN_Release script update only --- .dev/CRAN_Release.cmd | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index f9d435455e..7b1e0a3a94 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -304,13 +304,14 @@ cd R-devel # used for revdep testing: .dev/revdep.R. 
./configure CFLAGS="-O2 -Wall -pedantic" make -# use latest available below `apt-cache search gcc-` or `clang-` -cd ../R-devel-strict-clang -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-8 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +# use latest available `apt-cache search gcc-` or `clang-` +cd ~/build/R-devel-strict-clang +./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-10 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make -cd ../R-devel-strict-gcc -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-8 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +cd ~/build/R-devel-strict-gcc +# gcc-10 (in dev currently) failed to build R, so using regular gcc-9 (9.3.0 as per focal/Pop!_OS 20.04) +./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-9 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make # See R-exts#4.3.3 From 343656849f5e2a3d35856d9d4223424ac4fe5efc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 5 Jun 2020 01:21:45 -0600 Subject: [PATCH 021/588] attempt to add test of Chinese error message (#4524) --- inst/tests/tests.Rraw | 37 +++++++++++++++ src/data.table.h | 6 ++- src/dt_stdio.h | 1 - src/init.c | 2 + src/snprintf.c | 103 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 src/snprintf.c diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c71b27ca02..e28885fb4e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -25,6 +25,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { binary = data.table:::binary bmerge = data.table:::bmerge brackify = data.table:::brackify + Ctest_dt_win_snprintf = data.table:::Ctest_dt_win_snprintf chmatchdup = data.table:::chmatchdup compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table @@ -16910,3 +16911,39 @@ dt = data.table(time = 1:8, v = INT(5,7,6,1,8,4,2,3)) dt[time == 2L, v := 2L] dt[time == 7L, v := 7L] test(2140, dt[dt, on=.(time>time, v>v), .N, by=.EACHI], data.table(time=1:8, v=INT(5,2,6,1,8,4,7,3), N=INT(3,5,2,4,0,1,0,0))) + +# repeat of test 450 for #4402 +test(2141, .Call(Ctest_dt_win_snprintf), NULL) +DT = data.table(a=1:3,b=4:6) +test(2142, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") +if (.Platform$OS.type=="windows") local({ + x = list( + LC_COLLATE = "Chinese (Simplified)_China.936", + LC_CTYPE = "Chinese (Simplified)_China.936", + LC_MONETARY = "Chinese (Simplified)_China.936", + LC_NUMERIC = "C", + LC_TIME = "Chinese (Simplified)_China.936" + ) + for (i in seq_along(x)) { + lc = names(x)[[i]] + old = Sys.getlocale(lc) + Sys.setlocale(lc, x[[i]]) + on.exit(Sys.setlocale(lc, old), add = TRUE) + } + old = Sys.getenv('LANGUAGE') + Sys.setenv('LANGUAGE' = 'zh_CN') + on.exit({ + if (nzchar(old)) + Sys.setenv('LANGUAGE' = old) + else + Sys.unsetenv('LANGUAGE') + }, add = TRUE) + # triggered segfault here in #4402, Windows-only under translation. 
+ # test that the argument order changes correctly (the 'item 2' moves to the beginning of the message) + # since the argument order changes in this example (and that was the crash) we don't need to test + # the display of the Chinese characters here. Thanks to @shrektan for all his help on this. + test(2143, rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") +}) +# test back to English (the argument order is back to 1,c,2,1) +test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") + diff --git a/src/data.table.h b/src/data.table.h index 1cf975e68b..aff0088ac0 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -14,7 +14,7 @@ #include "types.h" #include "po.h" #ifdef WIN32 // positional specifiers (%n$) used in translations; #4402 -//# define snprintf _sprintf_p // the non-n one in Windows takes n anyway so there's no separate _snprintf_f +# define snprintf dt_win_snprintf // see our snprintf.c; tried and failed to link to _sprintf_p on Windows #endif #define sprintf USE_SNPRINTF_NOT_SPRINTF // prevent use of sprintf in data.table source; force us to use n always @@ -243,3 +243,7 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k); //fifelse.c SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); + +//snprintf.c +int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...); + diff --git a/src/dt_stdio.h b/src/dt_stdio.h index f652da9805..4e69e0d87e 100644 --- a/src/dt_stdio.h +++ b/src/dt_stdio.h @@ -23,7 +23,6 @@ #define DT_STDIO_H #if defined(__MINGW32__) || (defined __MINGW64__) #define __USE_MINGW_ANSI_STDIO 1 - #define _XOPEN_SOURCE 1 #include #define PRId64 "lld" #define PRIu64 "llu" diff --git a/src/init.c b/src/init.c index 916db3ab57..d650a64661 100644 --- a/src/init.c +++ b/src/init.c @@ -119,6 +119,7 @@ SEXP lock(); SEXP unlock(); SEXP islockedR(); SEXP allNAR(); +SEXP test_dt_win_snprintf(); // .Externals SEXP fastmean(); @@ -211,6 +212,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, +{"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1}, {NULL, NULL, 0} }; diff --git a/src/snprintf.c b/src/snprintf.c new file mode 100644 index 0000000000..44f7848d6a --- /dev/null +++ b/src/snprintf.c @@ -0,0 +1,103 @@ +// For translations (#4402) we need positional specifiers (%n$), a non-C99 POSIX extension. +// On Linux and Mac, standard snprintf supports positional specifiers. +// On Windows, we tried many things but just couldn't achieve it. This may be why R uses +// a third party library, trio, on Windows. But R does not expose trio for use by packages. +// So ... +// Rather than require compile flags (such as _XOPEN_SOURCE or POSIX_C_SOURCE), or require +// linking to particular Windows libraries which may be fragile over time depending on +// user's environments, we use the standard C99 features here. To do so, we simulate +// positionals via format massage. Just on Windows, all snprintf calls are replaced with +// this dt_win_snprintf via a #define in data.table.h. The goal of this massage is to be +// as light and minimal as possible. +// In C it is not possible, portably, to reorder a va_list (sadly). +// In C you must past the correct type to va_arg(), so even to navigate va_list you +// must parse and rely on fmt. But we don't want to reimplement all the types and modifiers. 
+// Hence, reordering the specifiers, passing the va_list to the library, and then +// putting the output strings into the desired order afterwards. +// NB: must be thread-safe + +#include "data.table.h" +#include +#undef snprintf // on Windows, just in this file, we do want to use the C library's snprintf + +int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + const char *ch = strstr(fmt, "%1$"); + if (ch==NULL) { + // no positionals present, just pass on to the C library vsnprintf as-is + int ans = vsnprintf(dest, n, fmt, ap); + va_end(ap); + return ans; + } + // Standards say that if one specifier uses position, they all must. Good. + // We will not allow repeats though; must be a permutation. + // As in C, there are few checks; wrong/mismatching positionals will be a crash. + // This is for messages/errors, so time should not be spent on a fast solution. + char *buff = (char *)malloc(n); // not R_alloc as we need to be thread-safe + if (!buff) error("Unable to allocate %d bytes for buffer in dt_win_snprintf", n); + int pos=1; + // Use dest as temp to write the reordered specifiers + char *ch2=dest; + #define NDELIM 2 + const char delim[NDELIM+1] = "\x7f\x7f"; // tokenize using 2 DELs + while (ch!=NULL) { // ch is resting on start of %pos$ in fmt + // Find end of %[parameter][flags][width][.precision][length]type + // https://en.wikipedia.org/wiki/Printf_format_string#Syntax + const char *start = strchr(ch, '$')+1; // look for $ since pos could be > 9 or potentially > 99 + const char *end = strpbrk(start,"diufFeEgGxXoscpaA"); // last character of specifier + *ch2++ = '%'; + strncpy(ch2, start, end-start+1); // write the specifer in order without the n$ part + ch2 += end-start+1; + strcpy(ch2, delim); // includes '\0' + ch2 += NDELIM; // now resting on the '\0' + char posstr[15]; // 15 to avoid C compiler warnings + snprintf(posstr, 15, "%%%d$", ++pos); // snprintf was #undef above, so this is the C library one + ch = strstr(fmt, posstr); + } + int narg = pos-1; + vsnprintf(buff, n, dest, ap); // dest used as tmp here, holds reordered specifiers same order as ap + // All the hard formatting work and va_arg type navigation has now been done by the C library + // Now we just need to put the string results for each argument back into the desired positions + // First create lookups so we can loop through fmt once replacing the specifiers as they appear + const char *arg[narg]; + int len[narg]; + ch = buff; + for (int i=0; i'9') error("When positional %n$ is used, all specifiers must include positional"); + int pos = atoi(ch+1); + ch = strpbrk(ch,"diufFeEgGxXoscpaA")+1; // move to the end of the specifier + strncpy(ch2, arg[pos-1], len[pos-1]); // write the result of the appropriate argument + ch2 += len[pos-1]; + } + *ch2='\0'; + free(buff); + va_end(ap); + return ch2-dest; +} + +SEXP test_dt_win_snprintf() +{ + char buff[50]; + dt_win_snprintf(buff, 50, "No pos %d%%%d ok", 42, -84); + if (strcmp(buff, "No pos 42%-84 ok")) error("dt_win_snprintf test 1 failed: %s", buff); + dt_win_snprintf(buff, 50, "With pos %1$d%%%2$d ok", 42, -84); + if (strcmp(buff, "With pos 42%-84 ok")) error("dt_win_snprintf test 2 failed: %s", buff); + dt_win_snprintf(buff, 50, "With pos %2$d%%%1$d ok", 42, -84); + if (strcmp(buff, "With pos -84%42 ok")) error("dt_win_snprintf test 3 failed: %s", buff); + dt_win_snprintf(buff, 50, "%3$s %1$d %4$10s %2$03d$", -99, 12, "hello%2$d", "short"); + if (strcmp(buff, "hello%2$d -99 short 012$")) error("dt_win_snprintf test 4 
failed: %s", buff); + return R_NilValue; +} + From 3a5ed9b4eda89b47c7bbcfe96930947d067d199e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 7 Jun 2020 09:33:40 +0800 Subject: [PATCH 022/588] undef sprintf before blocking it (#4530) --- src/data.table.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/data.table.h b/src/data.table.h index aff0088ac0..eed481be69 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -16,6 +16,9 @@ #ifdef WIN32 // positional specifiers (%n$) used in translations; #4402 # define snprintf dt_win_snprintf // see our snprintf.c; tried and failed to link to _sprintf_p on Windows #endif +#ifdef sprintf +#undef sprintf +#endif #define sprintf USE_SNPRINTF_NOT_SPRINTF // prevent use of sprintf in data.table source; force us to use n always // #include // the debugging machinery + breakpoint aidee From fc9e4453c35c8639f57e528dd0ed89db337c7230 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 6 Jun 2020 21:18:34 -0600 Subject: [PATCH 023/588] Reworked dt_win_snprintf (#4531) --- src/snprintf.c | 230 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 186 insertions(+), 44 deletions(-) diff --git a/src/snprintf.c b/src/snprintf.c index 44f7848d6a..076b637971 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -18,86 +18,228 @@ #include "data.table.h" #include +#include // isdigit #undef snprintf // on Windows, just in this file, we do want to use the C library's snprintf int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) { + if (n<1) return 0; va_list ap; va_start(ap, fmt); - const char *ch = strstr(fmt, "%1$"); - if (ch==NULL) { + const char *strp[99]={NULL}; + int strl[99]={0}; + int narg=0; + // are any positional specifiers present? + // previously used strstr(fmt, "%1$") here but that could match to %%1$ and then + // what if there's another %1$ as well as the %%1$. Hence a more complicated + // loop here with more robust checks as well to catch mistakes in fmt + bool posSpec=false, nonPosSpec=false; + int specAlloc=0; // total characters of specifiers for alloc + const char *ch = fmt; + while (*ch!='\0') { + if (*ch!='%') {ch++; continue;} + if (ch[1]=='%') {ch+=2; continue; } // %% means literal % + // Find end of %[parameter][flags][width][.precision][length]type + // https://en.wikipedia.org/wiki/Printf_format_string#Syntax + // These letters do not appear in flags or length modifiers, just type + const char *end = strpbrk(ch,"diufFeEgGxXoscpaA"); + if (!end) { + // an error() call is not thread-safe; placing error in dest is better than a crash. This way + // we have a better chance of the user reporting the strange error and we'll see it's a fmt issue + // in the message itself. + snprintf(dest, n, "snprintf %-5s does not end with recognized type letter", ch); + return -1; + } + const char *d = ch+1; + if (*d=='-') d++; // to give helpful outside-range message for %-1$ too + while (isdigit(*d)) d++; + if (*d=='$') { + posSpec=true; + int pos = atoi(ch+1); + if (pos<1 || pos>99) { + // up to 99 supported here; should not need more than 99 in a message + snprintf(dest, n, "snprintf %.*s outside range [1,99]", (int)(d-ch+1), ch); + return -1; + } + if (pos>narg) narg=pos; + if (strp[pos-1]) { + // no dups allowed because it's reasonable to not support dups, but this wrapper + // could not cope with the same argument formatted differently; e.g. 
"%1$d %1$5d" + snprintf(dest, n, "snprintf %%%d$ appears twice", pos); + return -1; + } + strp[pos-1] = strchr(ch, '$')+1; + strl[pos-1] = end-strp[pos-1]+1; + specAlloc += strl[pos-1]+1; // +1 for leading '%' + } else { + nonPosSpec=true; + } + ch = end+1; + } + if (posSpec && nonPosSpec) { + // Standards state that if one specifier uses position, they all must; good. + snprintf(dest, n, "snprintf some %%n$ but not all"); + return -1; + } + if (!posSpec) { // no positionals present, just pass on to the C library vsnprintf as-is int ans = vsnprintf(dest, n, fmt, ap); va_end(ap); return ans; } - // Standards say that if one specifier uses position, they all must. Good. - // We will not allow repeats though; must be a permutation. - // As in C, there are few checks; wrong/mismatching positionals will be a crash. - // This is for messages/errors, so time should not be spent on a fast solution. - char *buff = (char *)malloc(n); // not R_alloc as we need to be thread-safe - if (!buff) error("Unable to allocate %d bytes for buffer in dt_win_snprintf", n); - int pos=1; - // Use dest as temp to write the reordered specifiers - char *ch2=dest; #define NDELIM 2 - const char delim[NDELIM+1] = "\x7f\x7f"; // tokenize using 2 DELs - while (ch!=NULL) { // ch is resting on start of %pos$ in fmt - // Find end of %[parameter][flags][width][.precision][length]type - // https://en.wikipedia.org/wiki/Printf_format_string#Syntax - const char *start = strchr(ch, '$')+1; // look for $ since pos could be > 9 or potentially > 99 - const char *end = strpbrk(start,"diufFeEgGxXoscpaA"); // last character of specifier + const char delim[NDELIM+1] = "\x7f\x7f"; // tokenize temporary using 2 DELs + specAlloc += narg*NDELIM + 1; // +1 for final '\0' + char *spec = (char *)malloc(specAlloc); // not R_alloc as we need to be thread-safe + if (!spec) { + // # nocov start + snprintf(dest, n, "snprintf: %d byte spec alloc failed", (int)specAlloc); + return -1; + // # nocov end + } + char *ch2 = spec; + for (int i=0; i=n) { + // 0.01% likely: n wasn't big enough to hold result; test 9 covers this + // C99 standard states that vsnprintf returns the size that would be big enough + char *new = realloc(buff, res+1); + if (!new) { + // # nocov start + snprintf(dest, n, "snprintf: %d byte buff realloc failed", (int)res+1); + free(spec); + free(buff); + return -1; + // # nocov end + } + buff = new; + int newres = vsnprintf(buff, res+1, spec, ap); // try again; test 9 + if (newres!=res) { + // # nocov start + snprintf(dest, n, "snprintf: second vsnprintf %d != %d", newres, res); + free(spec); + free(buff); + return -1; + // # nocov end + } + } else if (res<1) { // negative is error, cover 0 as error too here + // # nocov start + snprintf(dest, n, "snprintf: clib error %d", res); + free(spec); + free(buff); + // # nocov end + } + // now we just need to put the string results for each arg back into the desired positions + // create lookups so we can loop through fmt once replacing the specifiers as they appear ch = buff; for (int i=0; i'9') error("When positional %n$ is used, all specifiers must include positional"); - int pos = atoi(ch+1); - ch = strpbrk(ch,"diufFeEgGxXoscpaA")+1; // move to the end of the specifier - strncpy(ch2, arg[pos-1], len[pos-1]); // write the result of the appropriate argument - ch2 += len[pos-1]; + const int space = nc>=n-1 ? 
0 : n-1-nc; // space remaining + if (*ch!='%') { if (space) *ch2++=*ch; ch++; nc++; continue; } // copy non-specifier to the result as-is + if (ch[1]=='%') { if (space) *ch2++='%'; ch+=2; nc++; continue; } // interpret %% as a single % + const int pos = atoi(ch+1); // valid position already checked above + nc += strl[pos-1]; + const int nWrite = MIN(strl[pos-1], space); // potentially write half of this field to fill up n + strncpy(ch2, strp[pos-1], nWrite); + ch2 += nWrite; + ch = strpbrk(ch,"diufFeEgGxXoscpaA")+1; // move to the end of the specifier; valid checked earlier } *ch2='\0'; + free(spec); free(buff); va_end(ap); - return ch2-dest; + return nc; } SEXP test_dt_win_snprintf() { char buff[50]; + dt_win_snprintf(buff, 50, "No pos %d%%%d ok", 42, -84); - if (strcmp(buff, "No pos 42%-84 ok")) error("dt_win_snprintf test 1 failed: %s", buff); + if (strcmp(buff, "No pos 42%-84 ok")) error("dt_win_snprintf test 1 failed: %s", buff); + dt_win_snprintf(buff, 50, "With pos %1$d%%%2$d ok", 42, -84); - if (strcmp(buff, "With pos 42%-84 ok")) error("dt_win_snprintf test 2 failed: %s", buff); + if (strcmp(buff, "With pos 42%-84 ok")) error("dt_win_snprintf test 2 failed: %s", buff); + dt_win_snprintf(buff, 50, "With pos %2$d%%%1$d ok", 42, -84); - if (strcmp(buff, "With pos -84%42 ok")) error("dt_win_snprintf test 3 failed: %s", buff); + if (strcmp(buff, "With pos -84%42 ok")) error("dt_win_snprintf test 3 failed: %s", buff); + dt_win_snprintf(buff, 50, "%3$s %1$d %4$10s %2$03d$", -99, 12, "hello%2$d", "short"); - if (strcmp(buff, "hello%2$d -99 short 012$")) error("dt_win_snprintf test 4 failed: %s", buff); + if (strcmp(buff, "hello%2$d -99 short 012$")) error("dt_win_snprintf test 4 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d %s", 9, "foo"); + if (strcmp(buff, "snprintf some %n$ but not all")) error("dt_win_snprintf test 5 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%%1$foo%d", 9); // The %1$f is not a specifier because % is doubled + if (strcmp(buff, "%1$foo9")) error("dt_win_snprintf test 6 failed: %s", buff); + + dt_win_snprintf(buff, 40, "long format string more than n==%d chopped", 40); // regular library (no %n$) chops to 39 chars + '/0' + if (strlen(buff)!=39 || strcmp(buff, "long format string more than n==40 chop")) error("dt_win_snprintf test 7 failed: %s", buff); + + dt_win_snprintf(buff, 40, "long %3$s %2$s more than n==%1$d chopped", 40, "string", "format"); // same with dt_win_snprintf + if (strlen(buff)!=39 || strcmp(buff, "long format string more than n==40 chop")) error("dt_win_snprintf test 8 failed: %s", buff); + + int res = dt_win_snprintf(buff, 10, "%4$d%2$d%3$d%5$d%1$d", 111, 222, 33, 44, 555); // fmt longer than n + if (strlen(buff)!=9 || strcmp(buff, "442223355")) error("dt_win_snprintf test 9 failed: %s", buff); + if (res!=13) /* should return what would have been written if not chopped */ error("dt_win_snprintf test 10 failed: %d", res); + + dt_win_snprintf(buff, 47, "%l", 3); + if (strlen(buff)!=46 || strcmp(buff, "snprintf %l does not end with recognized ty")) error("dt_win_snprintf test 11 failed: %s", buff); + + dt_win_snprintf(buff, 19, "%l", 3); + if (strlen(buff)!=18 || strcmp(buff, "snprintf %l doe")) error("dt_win_snprintf test 12 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d == %0$d", 1, 2); + if (strcmp(buff, "snprintf %0$ outside range [1,99]")) error("dt_win_snprintf test 13 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d == %$d", 1, 2); + if (strcmp(buff, "snprintf %$ outside range [1,99]")) error("dt_win_snprintf 
test 14 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d == %100$d", 1, 2); + if (strcmp(buff, "snprintf %100$ outside range [1,99]")) error("dt_win_snprintf test 15 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d == %-1$d", 1, 2); + if (strcmp(buff, "snprintf %-1$ outside range [1,99]")) error("dt_win_snprintf test 16 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d == %3$d", 1, 2, 3); + if (strcmp(buff, "snprintf %2$ missing")) error("dt_win_snprintf test 17 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d == %1$d", 42); + if (strcmp(buff, "snprintf %1$ appears twice")) error("dt_win_snprintf test 18 failed: %s", buff); + + dt_win_snprintf(buff, 50, "%1$d + %3$d - %2$d == %3$d", 1, 1, 2); + if (strcmp(buff, "snprintf %3$ appears twice")) error("dt_win_snprintf test 19 failed: %s", buff); + return R_NilValue; } From 2440d710970345af59f8ab1d3d1aaa8afbdf8f28 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 6 Jun 2020 22:42:37 -0600 Subject: [PATCH 024/588] mention _sprintf_p in top comment & shorten test string to vertically align tests; #4531 --- src/snprintf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/snprintf.c b/src/snprintf.c index 076b637971..62a823ae7c 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -1,16 +1,16 @@ // For translations (#4402) we need positional specifiers (%n$), a non-C99 POSIX extension. // On Linux and Mac, standard snprintf supports positional specifiers. -// On Windows, we tried many things but just couldn't achieve it. This may be why R uses -// a third party library, trio, on Windows. But R does not expose trio for use by packages. -// So ... -// Rather than require compile flags (such as _XOPEN_SOURCE or POSIX_C_SOURCE), or require -// linking to particular Windows libraries which may be fragile over time depending on -// user's environments, we use the standard C99 features here. To do so, we simulate +// On Windows, we tried many things but just couldn't achieve linking to _sprintf_p. Even +// if we managed that on AppVeyor we may have fragility in the future on Windows given +// varying Windows versions, compile environments/flags, and dll libraries. This may be +// why R uses a third party library, trio, on Windows. But R does not expose trio for use +// by packages. +// So, in this file we use standard C99 features to support %n$. We simulate // positionals via format massage. Just on Windows, all snprintf calls are replaced with // this dt_win_snprintf via a #define in data.table.h. The goal of this massage is to be // as light and minimal as possible. // In C it is not possible, portably, to reorder a va_list (sadly). -// In C you must past the correct type to va_arg(), so even to navigate va_list you +// In C you must pass the correct type to va_arg(). So even to navigate va_list you // must parse and rely on fmt. But we don't want to reimplement all the types and modifiers. // Hence, reordering the specifiers, passing the va_list to the library, and then // putting the output strings into the desired order afterwards. @@ -148,6 +148,7 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) 
snprintf(dest, n, "snprintf: clib error %d", res); free(spec); free(buff); + return -1; // # nocov end } // now we just need to put the string results for each arg back into the desired positions @@ -213,8 +214,8 @@ SEXP test_dt_win_snprintf() if (strlen(buff)!=9 || strcmp(buff, "442223355")) error("dt_win_snprintf test 9 failed: %s", buff); if (res!=13) /* should return what would have been written if not chopped */ error("dt_win_snprintf test 10 failed: %d", res); - dt_win_snprintf(buff, 47, "%l", 3); - if (strlen(buff)!=46 || strcmp(buff, "snprintf %l does not end with recognized ty")) error("dt_win_snprintf test 11 failed: %s", buff); + dt_win_snprintf(buff, 39, "%l", 3); + if (strlen(buff)!=38 || strcmp(buff, "snprintf %l does not end with recog")) error("dt_win_snprintf test 11 failed: %s", buff); dt_win_snprintf(buff, 19, "%l", 3); if (strlen(buff)!=18 || strcmp(buff, "snprintf %l doe")) error("dt_win_snprintf test 12 failed: %s", buff); From 88bd1eb1f32acf69ba8e4ca1d39f0acf5543c0db Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 7 Jun 2020 06:23:55 +0100 Subject: [PATCH 025/588] add more checks to detect internal error more early, #4364 (#4365) --- R/data.table.R | 2 ++ src/dogroups.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index 75b6b290ed..48c0782ed8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1391,6 +1391,8 @@ replace_dot_alias = function(e) { jisvars = if (any(c("get", "mget") %chin% av)) names_i else intersect(gsub("^i[.]","", setdiff(av, xjisvars)), names_i) # JIS (non join cols) but includes join columns too (as there are named in i) if (length(jisvars)) { + if (!nrow(i)) + stop("internal error: doing byjoin but i has 0 rows") # nocov #4364 tt = min(nrow(i),1L) SDenv$.iSD = i[tt,jisvars,with=FALSE] for (ii in jisvars) { diff --git a/src/dogroups.c b/src/dogroups.c index e07057b325..2c5bdb8eb6 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -47,6 +47,8 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX N = PROTECT(findVar(install(".N"), env)); nprotect++; // PROTECT for rchk GRP = PROTECT(findVar(install(".GRP"), env)); nprotect++; iSD = PROTECT(findVar(install(".iSD"), env)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS) + if (length(iSD) && !length(VECTOR_ELT(iSD, 0))) + error("internal error dogroups: iSD is a zero rows data.table"); // # nocov xSD = PROTECT(findVar(install(".xSD"), env)); nprotect++; R_len_t maxGrpSize = 0; const int *ilens = INTEGER(lens), n=LENGTH(lens); From 7a629d468f8b6ddcee415252852d74566c958813 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 8 Jun 2020 21:41:58 +0100 Subject: [PATCH 026/588] news entry for #4528 (#4532) --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index efc1a43ca6..939e99742a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -157,6 +157,7 @@ unit = "s") 9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. +10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. 
As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528). # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) From 9fd131d767f66728499ffd525ed6c25f3f78d462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Mon, 8 Jun 2020 23:51:01 +0200 Subject: [PATCH 027/588] Fixing crash when attempting to join on character(0) (#4272) --- NEWS.md | 2 ++ R/data.table.R | 2 +- R/merge.R | 4 ++-- inst/tests/tests.Rraw | 13 ++++++++++++- src/bmerge.c | 4 +++- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 939e99742a..d35bd68b1e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -115,6 +115,8 @@ unit = "s") 16. Non-equi joins now automatically set `allow.cartesian=TRUE`, [4489](https://github.com/Rdatatable/data.table/issues/4489). Thanks to @Henrik-P for reporting. +17. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. diff --git a/R/data.table.R b/R/data.table.R index 48c0782ed8..ccfc327b6f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3039,7 +3039,7 @@ isReallyReal = function(x) { onsub = as.call(c(quote(c), onsub)) } on = eval(onsub, parent.frame(2L), parent.frame(2L)) - if (!is.character(on)) + if (length(on) == 0L || !is.character(on)) stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") ## extract the operators and potential variable names from 'on'. ## split at backticks to take care about variable names like `col1<=`. diff --git a/R/merge.R b/R/merge.R index 31f322fce5..fe3bdb4549 100644 --- a/R/merge.R +++ b/R/merge.R @@ -21,8 +21,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!missing(by) && !missing(by.x)) warning("Supplied both `by` and `by.x/by.y`. 
`by` argument will be ignored.") if (!is.null(by.x)) { - if ( !is.character(by.x) || !is.character(by.y)) - stop("A non-empty vector of column names are required for `by.x` and `by.y`.") + if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) + stop("A non-empty vector of column names is required for `by.x` and `by.y`.") if (!all(by.x %chin% names(x))) stop("Elements listed in `by.x` must be valid column names in x.") if (!all(by.y %chin% names(y))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e28885fb4e..a010b5b995 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15912,7 +15912,7 @@ test(2074.31, dcast(DT, V1 ~ z, fun.aggregate=eval(quote(length)), value.var='z' test(2074.32, fwrite(DT, logical01=TRUE, logicalAsInt=TRUE), error="logicalAsInt has been renamed") # merge.data.table -test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of column names are required") +test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of column names is required") # shift naming test(2074.34, shift(list(a=1:5, b=6:10), give.names=TRUE), list(a_lag_1=c(NA, 1:4), b_lag_1=c(NA, 6:9))) @@ -16682,6 +16682,9 @@ options(old_width) DT = data.table(A="a", key="A") test(2126.1, DT[J(NULL)], DT[0]) test(2126.2, DT[data.table()], DT[0]) +# additional segfault when i is NULL and roll = 'nearest' +test(2126.3, DT[J(NULL), roll = 'nearest'], DT[0]) +test(2126.4, DT[data.table(), roll = 'nearest'], DT[0]) # fcase, #3823 test_vec1 = -5L:5L < 0L @@ -16947,3 +16950,11 @@ if (.Platform$OS.type=="windows") local({ # test back to English (the argument order is back to 1,c,2,1) test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") +# Attempting to join on character(0) shouldn't crash R +A = data.table(A='a') +B = data.table(B='b') +test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector") +test(2145.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.") +test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required") +# Also shouldn't crash when using internal functions +test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty') diff --git a/src/bmerge.c b/src/bmerge.c index 5273ae59b9..4c13f14b95 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -47,6 +47,8 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE i = iArg; x = xArg; // set globals so bmerge_r can see them. 
if (!isInteger(icolsArg)) error(_("Internal error: icols is not integer vector")); // # nocov if (!isInteger(xcolsArg)) error(_("Internal error: xcols is not integer vector")); // # nocov + if ((LENGTH(icolsArg) == 0 || LENGTH(xcolsArg) == 0) && LENGTH(i) > 0) // We let through LENGTH(i) == 0 for tests 2126.* + error(_("Internal error: icols and xcols must be non-empty integer vectors.")); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) error(_("Internal error: length(icols) [%d] > length(xcols) [%d]"), LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov icols = INTEGER(icolsArg); xcols = INTEGER(xcolsArg); @@ -68,7 +70,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE roll = 0.0; rollToNearest = FALSE; if (isString(rollarg)) { if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'")); - if (TYPEOF(VECTOR_ELT(i, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); + if (ncol > 0 && TYPEOF(VECTOR_ELT(i, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later } else { if (!isReal(rollarg)) error(_("Internal error: roll is not character or double")); // # nocov From dc5e11a45bf2dd78bfbed0d26bbe52b1872bf3eb Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 8 Jun 2020 16:08:17 -0600 Subject: [PATCH 028/588] Added Vaclav to contributor list in DESCRIPTION; #4272 --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0e52423574..fbee40d7f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,7 +57,8 @@ Authors@R: c( person("David","Simons", role="ctb"), person("Elliott","Sales de Andrade", role="ctb"), person("Cole","Miller", role="ctb"), - person("Jens Peder","Meldgaard", role="ctb")) + person("Jens Peder","Meldgaard", role="ctb"), + person("Vaclav","Tlapak", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml From 65f0516a055266b1c0538318d54604a7a100d34e Mon Sep 17 00:00:00 2001 From: Kevin Ushey Date: Mon, 8 Jun 2020 18:54:49 -0700 Subject: [PATCH 029/588] check if R CMD SHLIB can compile OpenMP directly (#4374) --- configure | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/configure b/configure index 4aef525edc..c2fd7a48aa 100755 --- a/configure +++ b/configure @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env sh # Find R compilers CC=`${R_HOME}/bin/R CMD config CC` @@ -71,11 +71,28 @@ echo "zlib ${version} is available ok" # inconsistent with R-exts$1.2.1.1, though, which states it's 'available for use in Makevars' (so not # necessarily here in configure). Hence use -fopenmp directly for this detection step. # printf not echo to pass checkbashisms w.r.t. to the \n -printf "#include <omp.h>\nint main () { return omp_get_num_threads(); }" | ${CC} ${CFLAGS} -fopenmp -xc - >/dev/null 2>&1 || R_NO_OPENMP=1; -rm a.out >/dev/null 2>&1 + +cat <<EOF > test-omp.c +#include <omp.h> +int main() { + return omp_get_num_threads(); +} +EOF + +# First, try R CMD SHLIB to see if R can already compile +# things using OpenMP without any extra help from data.table +"${R_HOME}/bin/R" CMD SHLIB test-omp.c >/dev/null 2>&1 || R_NO_OPENMP=1 + +if [ "$R_NO_OPENMP" = "1" ]; then + # Compilation failed -- try forcing -fopenmp instead. + "${CC}" "${CFLAGS}" -fopenmp test-omp.c || R_NO_OPENMP=1 +fi + +# Clean up.
+rm -f test-omp.{c,o,so} a.out # Write to Makevars -if [ $R_NO_OPENMP ]; then +if [ "$R_NO_OPENMP" = "1" ]; then echo "*** OpenMP not supported! data.table uses OpenMP to automatically" echo "*** parallelize operations like sorting, grouping, file reading, etc." echo "*** For details on how to install the necessary toolchains on your OS see:" From 0cb9e955e982b6d1686ddc9306ca0665614414c2 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 8 Jun 2020 20:07:37 -0600 Subject: [PATCH 030/588] Added Kevin to contributor list in DESCRIPTION and made the change he suggested to pass checkbashisms; #4374 --- DESCRIPTION | 3 ++- configure | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fbee40d7f1..9c4a3c4e98 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -58,7 +58,8 @@ Authors@R: c( person("Elliott","Sales de Andrade", role="ctb"), person("Cole","Miller", role="ctb"), person("Jens Peder","Meldgaard", role="ctb"), - person("Vaclav","Tlapak", role="ctb")) + person("Vaclav","Tlapak", role="ctb"), + person("Kevin","Ushey", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml diff --git a/configure b/configure index c2fd7a48aa..a0746dee00 100755 --- a/configure +++ b/configure @@ -89,7 +89,7 @@ if [ "$R_NO_OPENMP" = "1" ]; then fi # Clean up. -rm -f test-omp.{c,o,so} a.out +rm -f test-omp.* a.out # Write to Makevars if [ "$R_NO_OPENMP" = "1" ]; then From 0e563838c43b891fdd659fd02d1b0a053a1dc88b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 9 Jun 2020 00:05:28 -0600 Subject: [PATCH 031/588] fixed iSD recycle when empty, and added test (#4537) --- R/data.table.R | 4 +--- inst/tests/tests.Rraw | 6 ++++++ src/dogroups.c | 4 +--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index ccfc327b6f..f03efb77ff 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1391,9 +1391,7 @@ replace_dot_alias = function(e) { jisvars = if (any(c("get", "mget") %chin% av)) names_i else intersect(gsub("^i[.]","", setdiff(av, xjisvars)), names_i) # JIS (non join cols) but includes join columns too (as there are named in i) if (length(jisvars)) { - if (!nrow(i)) - stop("internal error: doing byjoin but i has 0 rows") # nocov #4364 - tt = min(nrow(i),1L) + tt = min(nrow(i),1L) # min() is here for when nrow(i)==0 SDenv$.iSD = i[tt,jisvars,with=FALSE] for (ii in jisvars) { assign(ii, SDenv$.iSD[[ii]], SDenv) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a010b5b995..f902d563db 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16958,3 +16958,9 @@ test(2145.2, merge(A, B, by=character(0) ), test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required") # Also shouldn't crash when using internal functions test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty') + +# nrow(i)==0 by-join, #4364 (broke in dev 1.12.9) +d0 = data.table(id=integer(), n=integer()) +d2 = data.table(id=1:2) +test(2146, d2[d0, i.n, on="id", by=.EACHI], data.table(id=integer(), i.n=integer())) + diff --git a/src/dogroups.c b/src/dogroups.c index 2c5bdb8eb6..776d6f1ad7 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -47,8 +47,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX N = PROTECT(findVar(install(".N"), env)); nprotect++; // PROTECT for rchk GRP = PROTECT(findVar(install(".GRP"), 
env)); nprotect++; iSD = PROTECT(findVar(install(".iSD"), env)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS) - if (length(iSD) && !length(VECTOR_ELT(iSD, 0))) - error("internal error dogroups: iSD is a zero rows data.table"); // # nocov xSD = PROTECT(findVar(install(".xSD"), env)); nprotect++; R_len_t maxGrpSize = 0; const int *ilens = INTEGER(lens), n=LENGTH(lens); @@ -127,7 +125,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX defineVar(xknameSyms[j], VECTOR_ELT(xSD, j), env); } - for (int j=0; j Date: Tue, 9 Jun 2020 02:46:06 -0400 Subject: [PATCH 032/588] By colon key fix (#4376) --- NEWS.md | 2 ++ R/data.table.R | 3 ++- inst/tests/tests.Rraw | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d35bd68b1e..0ba902c877 100644 --- a/NEWS.md +++ b/NEWS.md @@ -117,6 +117,8 @@ unit = "s") 17. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. +18. `by=col1:col4` gave an incorrect result if `key(DT)==c("col1","col4")`, [#4285](https://github.com/Rdatatable/data.table/issues/4285). Thanks to @cbilot for reporting, and Cole Miller for the PR. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. diff --git a/R/data.table.R b/R/data.table.R index f03efb77ff..e71b0a161d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -752,7 +752,8 @@ replace_dot_alias = function(e) { allbyvars = intersect(all.vars(bysub), names_x) orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__) bysameorder = byindex = FALSE - if (all(vapply_1b(bysubl, is.name))) { + if (!bysub %iscall% ":" && ##Fix #4285 + all(vapply_1b(bysubl, is.name))) { bysameorder = orderedirows && haskey(x) && length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))) # either bysameorder or byindex can be true but not both. 
TODO: better name for bysameorder might be bykeyx if (!bysameorder && keyby && !length(irows) && isTRUE(getOption("datatable.use.index"))) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f902d563db..1def357eb0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16964,3 +16964,8 @@ d0 = data.table(id=integer(), n=integer()) d2 = data.table(id=1:2) test(2146, d2[d0, i.n, on="id", by=.EACHI], data.table(id=integer(), i.n=integer())) +# by=col1:col4 wrong result when key(DT)==c('col1','col4'), #4285 +DT = data.table(col1=c(1,1,1), col2=c("a","b","a"), col3=c("A","B","A"), col4=c(2,2,2)) +setkey(DT, col1, col4) +test(2147.1, DT[, .N, by = col1:col4], ans<-data.table(col1=1, col2=c("a","b"), col3=c("A","B"), col4=2, N=INT(2,1))) +test(2147.2, DT[, .N, by = c("col1", "col2", "col3", "col4")], ans) From 89830e97b1582c1f4603b6b8f0260e0945876a27 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 18 Jun 2020 09:56:31 +0100 Subject: [PATCH 033/588] throttle threads for iterated small data tasks (#4484) --- NEWS.md | 2 ++ R/openmp-utils.R | 6 +++--- inst/tests/tests.Rraw | 6 +++++- man/openmp-utils.Rd | 3 ++- src/between.c | 16 +++++++-------- src/cj.c | 12 +++++------ src/coalesce.c | 8 ++++---- src/data.table.h | 2 +- src/fifelse.c | 8 ++++---- src/forder.c | 35 ++++++++++++++++--------------- src/froll.c | 8 ++++---- src/frollR.c | 2 +- src/frolladaptive.c | 16 +++++++-------- src/fsort.c | 6 +++--- src/gsumm.c | 40 ++++++++++++++++++------------------ src/nafill.c | 2 +- src/openmp-utils.c | 48 ++++++++++++++++++++++++++----------------- src/reorder.c | 6 +++--- src/subset.c | 6 +++--- src/types.c | 2 +- 20 files changed, 126 insertions(+), 108 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0ba902c877..98484687c2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,8 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. A new throttle feature has been introduced to speed up small data tasks that are repeated in a loop, [#3175](https://github.com/Rdatatable/data.table/issues/3175) [#3438](https://github.com/Rdatatable/data.table/issues/3438) [#3205](https://github.com/Rdatatable/data.table/issues/3205) [#3735](https://github.com/Rdatatable/data.table/issues/3735) [#3739](https://github.com/Rdatatable/data.table/issues/3739) [#4284](https://github.com/Rdatatable/data.table/issues/4284) [#4527](https://github.com/Rdatatable/data.table/issues/4527) [#4294](https://github.com/Rdatatable/data.table/issues/4294) [#1120](https://github.com/Rdatatable/data.table/issues/1120). The default throttle of 1024 means that a single thread will be used when nrow<=1024, two threads when nrow<=2048, etc. To change the default, use `setDTthreads(throttle=)`. Or use the new environment variable `R_DATATABLE_THROTTLE`. If you use `Sys.setenv()` in a running R session to change this environment variable, be sure to run an empty `setDTthreads()` call afterwards for the change to take effect; see `?setDTthreads`. The word *throttle* is used to convey that the number of threads is restricted (throttled) for small data tasks. Reducing throttle to 1 will turn off throttling and should revert behaviour to past versions (i.e. using many threads even for small data). Increasing throttle to, say, 65536 will utilize multi-threading only for larger datasets. The value 1024 is a guess. 
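For illustration, a minimal sketch of adjusting and inspecting the throttle (the values shown are arbitrary examples, not recommendations):

```r
library(data.table)
getDTthreads(verbose=TRUE)     # reports the current thread count and throttle
setDTthreads(throttle=65536)   # reserve multi-threading for larger tasks
DT = data.table(g=sample(3L, 1e3, TRUE), x=runif(1e3))
DT[, mean(x), by=g]            # 1,000 rows <= throttle, so throttled regions use one thread
setDTthreads(throttle=1024)    # back to the default
```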
We welcome feedback and test results indicating what the best default should be. + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/R/openmp-utils.R b/R/openmp-utils.R index 5e11222c5c..9df55f1148 100644 --- a/R/openmp-utils.R +++ b/R/openmp-utils.R @@ -1,12 +1,12 @@ -setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL) { +setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, throttle=NULL) { if (!missing(percent)) { if (!missing(threads)) stop("Provide either threads= or percent= but not both") if (length(percent)!=1) stop("percent= is provided but is length ", length(percent)) percent=as.integer(percent) if (is.na(percent) || percent<2L || percent>100L) stop("percent==",percent," but should be a number between 2 and 100") - invisible(.Call(CsetDTthreads, percent, restore_after_fork, TRUE)) + invisible(.Call(CsetDTthreads, percent, restore_after_fork, TRUE, as.integer(throttle))) } else { - invisible(.Call(CsetDTthreads, threads, restore_after_fork, FALSE)) + invisible(.Call(CsetDTthreads, as.integer(threads), restore_after_fork, FALSE, as.integer(throttle))) } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1def357eb0..12790ed92a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14188,7 +14188,7 @@ test(1996.2, d[, eval(qcall)], data.table(a=1L, b=3)) # setDTthreads; #3435 test(1997.01, setDTthreads(NULL, percent=75), error="Provide either threads= or percent= but not both") test(1997.02, setDTthreads(1L, percent=75), error="Provide either threads= or percent= but not both") -test(1997.03, setDTthreads(-1L), error="must be either NULL or a single integer >= 0") +test(1997.03, setDTthreads(-1L), error="threads= must be either NULL or a single number >= 0") test(1997.04, setDTthreads(percent=101), error="should be a number between 2 and 100") test(1997.05, setDTthreads(percent=1), error="should be a number between 2 and 100") test(1997.06, setDTthreads(percent=NULL), error="but is length 0") @@ -14211,6 +14211,10 @@ test(1997.14, getDTthreads(), new) Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv) test(1997.15, setDTthreads(old), new) test(1997.16, getDTthreads(), old) +test(1997.17, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") +setDTthreads(throttle=65536) +test(1997.18, getDTthreads(TRUE), output="throttle==65536") +setDTthreads(throttle=1024) # test that a copy is being made and output is printed, #3385 after partial revert of #3281 x = 5L diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index 8bb6dccc2b..b8d014976e 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -8,13 +8,14 @@ Set and get number of threads to be used in \code{data.table} functions that are parallelized with OpenMP. The number of threads is initialized when \code{data.table} is first loaded in the R session using optional envioronment variables. Thereafter, the number of threads may be changed by calling \code{setDTthreads}. If you change an environment variable using \code{Sys.setenv} you will need to call \code{setDTthreads} again to reread the environment variables. 
} \usage{ - setDTthreads(threads = NULL, restore_after_fork = NULL, percent = NULL) + setDTthreads(threads = NULL, restore_after_fork = NULL, percent = NULL, throttle = NULL) getDTthreads(verbose = getOption("datatable.verbose")) } \arguments{ \item{threads}{ NULL (default) rereads environment variables. 0 means to use all logical CPUs available. Otherwise a number >= 1 } \item{restore_after_fork}{ Should data.table be multi-threaded after a fork has completed? NULL leaves the current setting unchanged which by default is TRUE. See details below. } \item{percent}{ If provided it should be a number between 2 and 100; the percentage of logical CPUs to use. By default on startup, 50\%. } + \item{throttle}{ 1024 (default) means that, roughly speaking, a single thread will be used when nrow(DT)<=1024, 2 threads when nrow(DT)<=2048, etc. The throttle is to speed up small data tasks (especially when repeated many times) by not incurring the overhead of managing multiple threads. Hence the number of threads is throttled (restricted) for small tasks. } \item{verbose}{ Display the value of relevant OpenMP settings plus the \code{restore_after_fork} internal option. } } \value{ diff --git a/src/between.c b/src/between.c index b4444d968c..c5d91b30c0 100644 --- a/src/between.c +++ b/src/between.c @@ -64,14 +64,14 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S error(_("Item %d of lower (%d) is greater than item %d of upper (%d)"), (i&lowMask)+1, l, (i&uppMask)+1, u); } if (NAbounds) { // default NAbounds==TRUE => NA bound means TRUE; i.e. asif lower=-Inf or upper==Inf) - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(longest, true)) for (int i=0; i= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) } } else { - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(longest, true)) for (int i=0; i0; // flag to re-run with NA support if NAs detected if (!truehasna || !narm) { - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=k-1; i0; if (!truehasna || !narm) { - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=k-1; i1) schedule(auto) collapse(2) num_threads(getDTthreads()) + #pragma omp parallel for if (ialgo==0) schedule(dynamic) collapse(2) num_threads(getDTthreads(nx*nk, false)) for (R_len_t i=0; idbl_v[i] = cs[i]/k[i]; // current obs window width exactly same as obs position in a vector @@ -82,7 +82,7 @@ void fadaptiverollmeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fi cs[i] = (double) w; // cumsum, na.rm=TRUE always, NAs handled using cum NA counter cn[i] = nc; // cum NA counter } - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = fill; @@ -114,7 +114,7 @@ void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollmeanExact", (uint64_t)nx, hasna, (int) narm); bool truehasna = hasna>0; // flag to re-run if NAs detected if (!truehasna || !narm) { // narm=FALSE handled here as NAs properly propagated in exact algo - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for 
num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = fill; // partial window @@ -231,7 +231,7 @@ void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fil cs[i] = (double) w; } if (R_FINITE((double) w)) { - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = cs[i]; @@ -271,7 +271,7 @@ void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fil cs[i] = (double) w; cn[i] = nc; } - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = fill; @@ -298,7 +298,7 @@ void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fi snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollsumExact", (uint64_t)nx, hasna, (int) narm); bool truehasna = hasna>0; if (!truehasna || !narm) { - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(nx, true)) for (uint64_t i=0; idbl_v[i] = fill; diff --git a/src/fsort.c b/src/fsort.c index d3c695eac3..00c7e5c10b 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -117,7 +117,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { // allocate early in case fails if not enough RAM // TODO: document this is much cheaper than a copy followed by in-place. - int nth = getDTthreads(); + int nth = getDTthreads(xlength(x), true); int nBatch=nth*2; // at least nth; more to reduce last-man-home; but not too large to keep counts small in cache if (verbose) Rprintf(_("nth=%d, nBatch=%d\n"),nth,nBatch); @@ -131,7 +131,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { t[1] = wallclock(); double mins[nBatch], maxs[nBatch]; const double *restrict xp = REAL(x); - #pragma omp parallel for schedule(dynamic) num_threads(nth) + #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nBatch, false)) for (int batch=0; batch1) num_threads(getDTthreads()) + #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nx, false)) for (R_len_t i=0; i // errno #include // isspace -static int DTthreads = -1; // Never read directly hence static; use getDTthreads(). -1 so we know for sure initDTthreads() ran and set it >= 1. +static int DTthreads = -1; // Never read directly hence static; use getDTthreads(n, /*throttle=*/0|1). -1 so we know for sure initDTthreads() ran and set it >= 1. +static int DTthrottle = -1; // Thread 1 is assigned DTthrottle iterations before a 2nd thread is utilized; #4484. static bool RestoreAfterFork = true; // see #2885 in v1.12.0 static int getIntEnv(const char *name, int def) @@ -50,12 +51,19 @@ void initDTthreads() { ans = imin(ans, getIntEnv("OMP_THREAD_LIMIT", INT_MAX)); // user might expect `Sys.setenv(OMP_THREAD_LIMIT=2);setDTthreads()` to work. Satisfy this ans = imin(ans, getIntEnv("OMP_NUM_THREADS", INT_MAX)); // expectation by reading them again now. OpenMP just reads them on startup (quite reasonably) DTthreads = ans; + DTthrottle = imax(1, getIntEnv("R_DATATABLE_THROTTLE", 1024)); // 2nd thread is used only when n>1024, 3rd thread when n>2048, etc } -int getDTthreads() { - // this is the main getter used by all parallel regions; they specify num_threads(getDTthreads()) - // Therefore keep it light, simple and robust. Local static variable. 
initDTthreads() ensures 1 <= DTthreads <= omp_get_num_proc() - return DTthreads; +int getDTthreads(const int64_t n, const bool throttle) { + // this is the main getter used by all parallel regions; they specify num_threads(n, true|false). + // Keep this light, simple and robust. initDTthreads() ensures 1 <= DTthreads <= omp_get_num_proc() + // throttle introduced in 1.12.10 (see NEWS item); #4484 + // throttle==true : a number of iterations per thread (DTthrottle) is applied before a second thread is utilized + // throttle==false : parallel region is already pre-chunked such as in fread; e.g. two batches intended for two threads + if (n<1) return 1; // 0 or negative could be deliberate in calling code for edge cases where loop is not intended to run at all + int64_t ans = throttle ? 1+(n-1)/DTthrottle : // 1 thread for n<=1024, 2 thread for n<=2048, etc + n; // don't use 20 threads for just one or two batches + return ans>=DTthreads ? DTthreads : (int)ans; // apply limit in static local DTthreads saved there by initDTthreads() and setDTthreads() } static const char *mygetenv(const char *name, const char *unset) { @@ -75,40 +83,42 @@ SEXP getDTthreads_R(SEXP verbose) { Rprintf(_(" omp_get_num_procs() %d\n"), omp_get_num_procs()); Rprintf(_(" R_DATATABLE_NUM_PROCS_PERCENT %s\n"), mygetenv("R_DATATABLE_NUM_PROCS_PERCENT", "unset (default 50)")); Rprintf(_(" R_DATATABLE_NUM_THREADS %s\n"), mygetenv("R_DATATABLE_NUM_THREADS", "unset")); + Rprintf(_(" R_DATATABLE_THROTTLE %s\n"), mygetenv("R_DATATABLE_THROTTLE", "unset (default 1024)")); Rprintf(_(" omp_get_thread_limit() %d\n"), omp_get_thread_limit()); Rprintf(_(" omp_get_max_threads() %d\n"), omp_get_max_threads()); Rprintf(_(" OMP_THREAD_LIMIT %s\n"), mygetenv("OMP_THREAD_LIMIT", "unset")); // CRAN sets to 2 Rprintf(_(" OMP_NUM_THREADS %s\n"), mygetenv("OMP_NUM_THREADS", "unset")); Rprintf(_(" RestoreAfterFork %s\n"), RestoreAfterFork ? "true" : "false"); - Rprintf(_(" data.table is using %d threads. See ?setDTthreads.\n"), getDTthreads()); + Rprintf(_(" data.table is using %d threads with throttle==%d. See ?setDTthreads.\n"), getDTthreads(INT_MAX, false), DTthrottle); } - return ScalarInteger(getDTthreads()); + return ScalarInteger(getDTthreads(INT_MAX, false)); } -SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent) { +SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent, SEXP throttle) { if (!isNull(restore_after_fork)) { if (!isLogical(restore_after_fork) || LOGICAL(restore_after_fork)[0]==NA_LOGICAL) { error(_("restore_after_fork= must be TRUE, FALSE, or NULL (default). getDTthreads(verbose=TRUE) reports the current setting.\n")); } RestoreAfterFork = LOGICAL(restore_after_fork)[0]; // # nocov } + if (length(throttle)) { + if (!isInteger(throttle) || LENGTH(throttle)!=1 || INTEGER(throttle)[0]<1) + error(_("'throttle' must be a single number, non-NA, and >=1")); + DTthrottle = INTEGER(throttle)[0]; + } int old = DTthreads; - if (isNull(threads)) { + if (!length(threads) && !length(throttle)) { initDTthreads(); // Rerun exactly the same function used on startup (re-reads env variables); this is now default setDTthreads() behavior from 1.12.2 // Allows robust testing of environment variables using Sys.setenv() to experiment. // Default is now (as from 1.12.2) threads=NULL which re-reads environment variables. // If a CPU has been unplugged (high end servers allow live hardware replacement) then omp_get_num_procs() will // reflect that and a call to setDTthreads(threads=NULL) will update DTthreads. 
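// Worked example of the throttle arithmetic above (hypothetical call values): with the default DTthrottle==1024, getDTthreads(4096, /*throttle=*/true) computes 1+(4096-1)/1024 == 4 by integer division and is then capped at DTthreads, while getDTthreads(512, true) gives 1, so small tasks stay single-threaded.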
- } else { - int n=0, protecti=0; - if (length(threads)!=1) error(_("threads= must be either NULL (default) or a single number. It has length %d"), length(threads)); - if (isReal(threads)) { threads = PROTECT(coerceVector(threads, INTSXP)); protecti++; } - if (!isInteger(threads)) error(_("threads= must be either NULL (default) or type integer/numeric")); - if ((n=INTEGER(threads)[0]) < 0) { // <0 catches NA too since NA is negative (INT_MIN) - error(_("threads= must be either NULL or a single integer >= 0. See ?setDTthreads.")); + } else if (length(threads)) { + int n=0; + if (length(threads)!=1 || !isInteger(threads) || (n=INTEGER(threads)[0]) < 0) { // <0 catches NA too since NA is negative (INT_MIN) + error(_("threads= must be either NULL or a single number >= 0. See ?setDTthreads.")); } - UNPROTECT(protecti); int num_procs = imax(omp_get_num_procs(), 1); // max just in case omp_get_num_procs() returns <= 0 (perhaps error, or unsupported) if (!isLogical(percent) || length(percent)!=1 || LOGICAL(percent)[0]==NA_LOGICAL) { error(_("Internal error: percent= must be TRUE or FALSE at C level")); // # nocov @@ -124,8 +134,8 @@ SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent) { DTthreads = imax(n, 1); // imax just in case // Do not call omp_set_num_threads() here. Any calls to omp_set_num_threads() affect other // packages and R itself too which has some OpenMP usage. Instead we set our own DTthreads - // static variable and read that from getDTthreads(). - // All parallel regions should include num_threads(getDTthreads()) and this is ensured via + // static variable and read that from getDTthreads(n, throttle). + // All parallel regions should include num_threads(getDTthreads(n, true|false)) and this is ensured via // a grep in CRAN_Release.cmd. } return ScalarInteger(old); diff --git a/src/reorder.c b/src/reorder.c index da3784e94d..c2deea8ae9 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -64,7 +64,7 @@ SEXP reorder(SEXP x, SEXP order) if (size==4) { const int *restrict vd = DATAPTR_RO(v); int *restrict tmp = (int *)TMP; - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(end, true)) for (int i=start; i<=end; ++i) { tmp[i-start] = vd[idx[i]-1]; // copies 4 bytes; e.g. INTSXP and also SEXP pointers on 32bit (STRSXP and VECSXP) } @@ -75,14 +75,14 @@ SEXP reorder(SEXP x, SEXP order) } else if (size==8) { const double *restrict vd = DATAPTR_RO(v); double *restrict tmp = (double *)TMP; - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(end, true)) for (int i=start; i<=end; ++i) { tmp[i-start] = vd[idx[i]-1]; // copies 8 bytes; e.g. 
REALSXP and also SEXP pointers on 64bit (STRSXP and VECSXP) } } else { // size 16; checked up front const Rcomplex *restrict vd = DATAPTR_RO(v); Rcomplex *restrict tmp = (Rcomplex *)TMP; - #pragma omp parallel for num_threads(getDTthreads()) + #pragma omp parallel for num_threads(getDTthreads(end, true)) for (int i=start; i<=end; ++i) { tmp[i-start] = vd[idx[i]-1]; } diff --git a/src/subset.c b/src/subset.c index d9fea2800c..91a4018e2c 100644 --- a/src/subset.c +++ b/src/subset.c @@ -13,13 +13,13 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA) #define PARLOOP(_NAVAL_) \ if (anyNA) { \ - _Pragma("omp parallel for num_threads(getDTthreads())") \ + _Pragma("omp parallel for num_threads(getDTthreads(n, true))") \ for (int i=0; i1) schedule(auto) collapse(2) num_threads(getDTthreads()) + #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(getDTthreads(nx*nk, false)) for (R_len_t i=0; i Date: Thu, 18 Jun 2020 09:58:14 +0100 Subject: [PATCH 034/588] we dont use private windows runners anymore (#4522) --- .ci/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/README.md b/.ci/README.md index ddf76d3d80..3464e20b25 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,6 +1,6 @@ # data.table continuous integration and deployment -On each Pull Request opened in GitHub we run Travis CI and Appveyor to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch. It tests more environments and different configurations. It publish variety of artifacts. Windows jobs are being run on our private windows CI runner. +On each Pull Request opened in GitHub we run Travis CI and Appveyor to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch. It tests more environments and different configurations. It publish variety of artifacts. ## Environments From 26ffb8a8d2799dc7bb7a18fa3d1ea5a8705b97ac Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 18 Jun 2020 09:59:22 +0100 Subject: [PATCH 035/588] proper docker urls (#4550) --- .ci/publish.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/publish.R b/.ci/publish.R index 147a397538..fd95947ed4 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -61,7 +61,7 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { images = c("r-release","r-devel","r-release-builder") images.title = c("Base R release", "Base R development", "R release package builder") tags = rep("latest", 3) - docker.dl = sprintf(" %s:
docker pull %s/%s/%s/%s:%s
", images.title, registry, namespace, project, images, tags) + docker.dl = sprintf(" %s:
docker pull %s/%s/%s/%s:%s
", images.title, tolower(registry), tolower(namespace), tolower(project), tolower(images), tags) } index.file = file.path(repodir, "web/packages", pkg, "index.html") if (!dir.exists(dirname(index.file))) dir.create(dirname(index.file), recursive=TRUE) From cb2fe72c568f790fd129f953755f2aac5fa43003 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 18 Jun 2020 10:04:53 +0100 Subject: [PATCH 036/588] alloc on heap to avoid stack overflow (#4542) --- src/frollR.c | 6 +++--- src/nafill.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/frollR.c b/src/frollR.c index f8ff977817..131dfeca1b 100644 --- a/src/frollR.c +++ b/src/frollR.c @@ -95,7 +95,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX } } } - int* ikl[nk]; // pointers to adaptive window width + int **ikl = (int**)R_alloc(nk, sizeof(int*)); // to not recalculate `length(x[[i]])` we store it in extra array if (badaptive) { for (int j=0; j Date: Thu, 18 Jun 2020 10:39:18 +0100 Subject: [PATCH 037/588] set print opt in tests, closes #4552 (#4553) --- R/test.data.table.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index 14d5ae83bf..4c86299cbd 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -75,6 +75,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F datatable.optimize = Inf, datatable.alloccol = 1024L, datatable.print.class = FALSE, # this is TRUE in cc.R and we like TRUE. But output= tests need to be updated (they assume FALSE currently) + datatable.print.trunc.cols = FALSE, #4552 datatable.rbindlist.check = NULL, datatable.integer64 = "integer64", warnPartialMatchArgs = base::getRversion()>="3.6.0", # ensure we don't rely on partial argument matching in internal code, #3664; >=3.6.0 for #3865 From c152ced0e5799acee1589910c69c1a2c6586b95d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Thu, 18 Jun 2020 12:48:14 +0200 Subject: [PATCH 038/588] Added explicit copy of colnames to Ops.data.table when result is.matrix (#4328) --- NEWS.md | 2 ++ R/data.table.R | 4 ++-- inst/tests/tests.Rraw | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 98484687c2..4d1c16aebf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -121,6 +121,8 @@ unit = "s") 18. `by=col1:col4` gave an incorrect result if `key(DT)==c("col1","col4")`, [#4285](https://github.com/Rdatatable/data.table/issues/4285). Thanks to @cbilot for reporting, and Cole Miller for the PR. +19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. 
They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. diff --git a/R/data.table.R b/R/data.table.R index e71b0a161d..a6b731fde6 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2261,8 +2261,8 @@ is.na.data.table = function (x) { Ops.data.table = function(e1, e2 = NULL) { ans = NextMethod() - if (cedta() && is.data.frame(ans)) - ans = as.data.table(ans) + if (cedta() && is.data.frame(ans)) ans = as.data.table(ans) + else if (is.matrix(ans)) colnames(ans) = copy(colnames(ans)) ans } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 12790ed92a..a3399987e1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16973,3 +16973,10 @@ DT = data.table(col1=c(1,1,1), col2=c("a","b","a"), col3=c("A","B","A"), col4=c( setkey(DT, col1, col4) test(2147.1, DT[, .N, by = col1:col4], ans<-data.table(col1=1, col2=c("a","b"), col3=c("A","B"), col4=2, N=INT(2,1))) test(2147.2, DT[, .N, by = c("col1", "col2", "col3", "col4")], ans) + +# Result matrix of comparison operators could have its colnames changed by reference, #4323 +A = data.table(x=1:2) +B = data.table(x=1:2) +X = A == B +A[, y := 3:4] +test(2148, colnames(X), c('x')) From 9f60eb7c5522e29f0ab22fadf881dbed2518d259 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 18 Jun 2020 18:38:27 +0100 Subject: [PATCH 039/588] fix current win failures (#4556) --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 84af05b7a1..97f30a0266 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -79,7 +79,7 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_BIN_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_BIN_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-win.exe https://cloud.r-project.org/bin/windows/base/R-4.0.0-win.exe; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-win.exe https://cloud.r-project.org/bin/windows/base/old/R-4.0.0-win.exe; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-dev-win: &install-r-dev-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win From 3c788b7914918e5b6e3c1d76bed42fcb1e61a69d Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 18 Jun 2020 19:44:45 +0100 Subject: [PATCH 040/588] proper link (#4559) --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 97f30a0266..51faefe280 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -79,7 +79,7 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_BIN_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_BIN_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-win.exe https://cloud.r-project.org/bin/windows/base/old/R-4.0.0-win.exe; Start-Process -FilePath 
..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-win.exe https://cloud.r-project.org/bin/windows/base/old/4.0.0/R-4.0.0-win.exe; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-dev-win: &install-r-dev-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win From 2b238c3bf29fff18970bf7973d693c83dd395b7e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Jun 2020 14:06:03 +0800 Subject: [PATCH 041/588] ITime utc in test (#4473) --- inst/tests/tests.Rraw | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a3399987e1..4b50a3613b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16882,20 +16882,29 @@ cols = c('x', 'y') test(2136, dt[, (cols) := lapply(.SD[get("x") == 1],function(x){x + 2L}), .SDcols = cols ,by = z], data.table(x = 1L + 2L, y = 2L + 2L, z = 3L)) # round, trunc should all be 'integer' and and have class 'ITime', #4207 -DT = data.table(hour31 = as.ITime(seq(as.POSIXct("2020-01-01 07:00:40"), by = "31 min", length.out = 9)), - hour30 = as.ITime(seq(as.POSIXct("2020-01-01 07:00:00"), by = "30 min", length.out = 9)), - minute31 = as.ITime(seq(as.POSIXct("2020-01-01 07:00:00"), by = "31 sec", length.out = 9)), - minute30 = as.ITime(seq(as.POSIXct("2020-01-01 07:00:00"), by = "30 sec", length.out = 9))) -test(2137.01, TRUE, DT[, all(sapply(.SD, class) == "ITime")]) -test(2137.02, TRUE, DT[, all(sapply(.SD, typeof) == "integer")]) -test(2137.03, FALSE, DT[, all(round(hour30, "hours") == as.ITime(c("07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00", "11:00")))]) -test(2137.04, TRUE, DT[, all(round(hour31, "hours") == as.ITime(c("07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00", "11:00")))]) -test(2137.05, FALSE, DT[, all(round(minute30, "minutes") == as.ITime(c("07:00:00", "07:01:00", "07:01:00", "07:02:00", "07:02:00", "07:03:00", "07:03:00", "07:04:00", "07:04:00")))]) -test(2137.06, TRUE, DT[, all(round(minute31, "minutes") == as.ITime(c("07:00:00", "07:01:00", "07:01:00", "07:02:00", "07:02:00", "07:03:00", "07:03:00", "07:04:00", "07:04:00")))]) -test(2137.07, TRUE, DT[, all(trunc(hour30, "hours") == as.ITime(c("07:00", "07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00")))]) -test(2137.08, TRUE, DT[, all(trunc(hour31, "hours") == as.ITime(c("07:00", "07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00")))]) -test(2137.09, TRUE, DT[, all(trunc(minute30, "minutes") == as.ITime(c("07:00:00", "07:00:00", "07:01:00", "07:01:00", "07:02:00", "07:02:00", "07:03:00", "07:03:00", "07:04:00")))]) -test(2137.10, TRUE, DT[, all(trunc(minute31, "minutes") == as.ITime(c("07:00:00", "07:00:00", "07:01:00", "07:01:00", "07:02:00", "07:02:00", "07:03:00", "07:03:00", "07:04:00")))]) +start_time = as.POSIXct("2020-01-01 07:00:00", tz='UTC') +l = list( + hour31 = as.ITime(seq(start_time+40, by = "31 min", length.out = 9L)), + hour30 = as.ITime(seq(start_time, by = "30 min", length.out = 9L)), + minute31 = as.ITime(seq(start_time, by = "31 sec", length.out = 9L)), + minute30 = as.ITime(seq(start_time, by = "30 sec", length.out = 9L)) +) +ans = list( + a = as.ITime(c("07:00", "08:00", "08:00", "09:00", "09:00", 
"10:00", "10:00", "11:00", "11:00")), + b = as.ITime(c("07:00", "07:01", "07:01", "07:02", "07:02", "07:03", "07:03", "07:04", "07:04")), + c = as.ITime(c("07:00", "07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00")), + d = as.ITime(c("07:00", "07:00", "07:01", "07:01", "07:02", "07:02", "07:03", "07:03", "07:04")) +) +test(2137.01, all(sapply(l, inherits, "ITime"))) +test(2137.02, all(sapply(l, typeof) == "integer")) +test(2137.03, which(round(l$hour30, "hours") != ans$a), c(4L, 8L)) +test(2137.04, round(l$hour31, "hours"), ans$a) +test(2137.05, which(round(l$minute30, "minutes") != ans$b), c(2L, 6L)) +test(2137.06, round(l$minute31, "minutes"), ans$b) +test(2137.07, trunc(l$hour30, "hours"), ans$c) +test(2137.08, trunc(l$hour31, "hours"), ans$c) +test(2137.09, trunc(l$minute30, "minutes"), ans$d) +test(2137.10, trunc(l$minute31, "minutes"), ans$d) # Complex to character conversion in rbindlist, #4202 A = data.table(A=complex(real = 1:3, imaginary=c(0, -1, 1))) From 936a4e3e7e602718e53be21566744ef7d2842848 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 19 Jun 2020 00:53:47 -0600 Subject: [PATCH 042/588] snprintf test9 (#4555) --- src/snprintf.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/src/snprintf.c b/src/snprintf.c index 62a823ae7c..52f7ea37c6 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -21,7 +21,7 @@ #include // isdigit #undef snprintf // on Windows, just in this file, we do want to use the C library's snprintf -int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) +int dt_win_snprintf(char *dest, const size_t n, const char *fmt, ...) { if (n<1) return 0; va_list ap; @@ -47,7 +47,7 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) // an error() call is not thread-safe; placing error in dest is better than a crash. This way // we have a better chance of the user reporting the strange error and we'll see it's a fmt issue // in the message itself. - snprintf(dest, n, "snprintf %-5s does not end with recognized type letter", ch); + snprintf(dest, n, "0 %-5s does not end with recognized type letter", ch); return -1; } const char *d = ch+1; @@ -58,14 +58,14 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) int pos = atoi(ch+1); if (pos<1 || pos>99) { // up to 99 supported here; should not need more than 99 in a message - snprintf(dest, n, "snprintf %.*s outside range [1,99]", (int)(d-ch+1), ch); + snprintf(dest, n, "1 %.*s outside range [1,99]", (int)(d-ch+1), ch); return -1; } if (pos>narg) narg=pos; if (strp[pos-1]) { // no dups allowed because it's reasonable to not support dups, but this wrapper // could not cope with the same argument formatted differently; e.g. "%1$d %1$5d" - snprintf(dest, n, "snprintf %%%d$ appears twice", pos); + snprintf(dest, n, "2 %%%d$ appears twice", pos); return -1; } strp[pos-1] = strchr(ch, '$')+1; @@ -78,7 +78,7 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) } if (posSpec && nonPosSpec) { // Standards state that if one specifier uses position, they all must; good. - snprintf(dest, n, "snprintf some %%n$ but not all"); + snprintf(dest, n, "3 some %%n$ but not all"); return -1; } if (!posSpec) { @@ -93,7 +93,7 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) 
char *spec = (char *)malloc(specAlloc); // not R_alloc as we need to be thread-safe if (!spec) { // # nocov start - snprintf(dest, n, "snprintf: %d byte spec alloc failed", (int)specAlloc); + snprintf(dest, n, "4 %d byte spec alloc failed", (int)specAlloc); return -1; // # nocov end } @@ -101,7 +101,7 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) for (int i=0; i=n) { - // 0.01% likely: n wasn't big enough to hold result; test 9 covers this + // n wasn't big enough to hold result; test 9 covers this unlikely event // C99 standard states that vsnprintf returns the size that would be big enough char *new = realloc(buff, res+1); if (!new) { // # nocov start - snprintf(dest, n, "snprintf: %d byte buff realloc failed", (int)res+1); + snprintf(dest, n, "7 %d byte buff realloc failed", (int)res+1); free(spec); free(buff); return -1; // # nocov end } buff = new; + va_start(ap, fmt); // to use ap again must reset it; #4545 int newres = vsnprintf(buff, res+1, spec, ap); // try again; test 9 + va_end(ap); if (newres!=res) { // # nocov start - snprintf(dest, n, "snprintf: second vsnprintf %d != %d", newres, res); + snprintf(dest, n, "8 %d %d second vsnprintf", newres, res); free(spec); free(buff); return -1; @@ -145,7 +148,7 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) } } else if (res<1) { // negative is error, cover 0 as error too here // # nocov start - snprintf(dest, n, "snprintf: clib error %d", res); + snprintf(dest, n, "9 %d clib error", res); free(spec); free(buff); return -1; @@ -178,7 +181,6 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...) *ch2='\0'; free(spec); free(buff); - va_end(ap); return nc; } @@ -199,7 +201,7 @@ SEXP test_dt_win_snprintf() if (strcmp(buff, "hello%2$d -99 short 012$")) error("dt_win_snprintf test 4 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d %s", 9, "foo"); - if (strcmp(buff, "snprintf some %n$ but not all")) error("dt_win_snprintf test 5 failed: %s", buff); + if (strcmp(buff, "3 some %n$ but not all")) error("dt_win_snprintf test 5 failed: %s", buff); dt_win_snprintf(buff, 50, "%%1$foo%d", 9); // The %1$f is not a specifier because % is doubled if (strcmp(buff, "%1$foo9")) error("dt_win_snprintf test 6 failed: %s", buff); @@ -215,31 +217,31 @@ SEXP test_dt_win_snprintf() if (res!=13) /* should return what would have been written if not chopped */ error("dt_win_snprintf test 10 failed: %d", res); dt_win_snprintf(buff, 39, "%l", 3); - if (strlen(buff)!=38 || strcmp(buff, "snprintf %l does not end with recog")) error("dt_win_snprintf test 11 failed: %s", buff); + if (strlen(buff)!=38 || strcmp(buff, "0 %l does not end with recognized t")) error("dt_win_snprintf test 11 failed: %s", buff); dt_win_snprintf(buff, 19, "%l", 3); - if (strlen(buff)!=18 || strcmp(buff, "snprintf %l doe")) error("dt_win_snprintf test 12 failed: %s", buff); + if (strlen(buff)!=18 || strcmp(buff, "0 %l does not e")) error("dt_win_snprintf test 12 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d == %0$d", 1, 2); - if (strcmp(buff, "snprintf %0$ outside range [1,99]")) error("dt_win_snprintf test 13 failed: %s", buff); + if (strcmp(buff, "1 %0$ outside range [1,99]")) error("dt_win_snprintf test 13 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d == %$d", 1, 2); - if (strcmp(buff, "snprintf %$ outside range [1,99]")) error("dt_win_snprintf test 14 failed: %s", buff); + if (strcmp(buff, "1 %$ outside range [1,99]")) error("dt_win_snprintf test 14 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d == %100$d", 1, 2); - if 
(strcmp(buff, "snprintf %100$ outside range [1,99]")) error("dt_win_snprintf test 15 failed: %s", buff); + if (strcmp(buff, "1 %100$ outside range [1,99]")) error("dt_win_snprintf test 15 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d == %-1$d", 1, 2); - if (strcmp(buff, "snprintf %-1$ outside range [1,99]")) error("dt_win_snprintf test 16 failed: %s", buff); + if (strcmp(buff, "1 %-1$ outside range [1,99]")) error("dt_win_snprintf test 16 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d == %3$d", 1, 2, 3); - if (strcmp(buff, "snprintf %2$ missing")) error("dt_win_snprintf test 17 failed: %s", buff); + if (strcmp(buff, "5 %2$ missing")) error("dt_win_snprintf test 17 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d == %1$d", 42); - if (strcmp(buff, "snprintf %1$ appears twice")) error("dt_win_snprintf test 18 failed: %s", buff); + if (strcmp(buff, "2 %1$ appears twice")) error("dt_win_snprintf test 18 failed: %s", buff); dt_win_snprintf(buff, 50, "%1$d + %3$d - %2$d == %3$d", 1, 1, 2); - if (strcmp(buff, "snprintf %3$ appears twice")) error("dt_win_snprintf test 19 failed: %s", buff); + if (strcmp(buff, "2 %3$ appears twice")) error("dt_win_snprintf test 19 failed: %s", buff); return R_NilValue; } From 752012f577f8e268bb6d0084ca39a09fa7fbc1c4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 19 Jun 2020 20:09:59 -0600 Subject: [PATCH 043/588] R_DATATABLE_NUM_THREADS env variable limit fixed (#4562) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 22 ++++++++++++++++------ src/openmp-utils.c | 27 +++++++++++++++------------ 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4d1c16aebf..05e595a768 100644 --- a/NEWS.md +++ b/NEWS.md @@ -123,6 +123,8 @@ unit = "s") 19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. +20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. 
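A short sketch of the corrected behaviour in item 20 above; it assumes a machine with 8 logical CPUs, so the counts will differ on other hardware:

```r
Sys.setenv(R_DATATABLE_NUM_THREADS="6")
library(data.table)    # or call setDTthreads() in an already-loaded session to re-read the variable
getDTthreads()         # 6 with this fix; previously capped at 4 by the default 50% of 8 CPUs
Sys.unsetenv("R_DATATABLE_NUM_THREADS")
setDTthreads()         # re-read environment variables and return to the default
```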
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4b50a3613b..5c607bcb27 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14195,7 +14195,9 @@ test(1997.06, setDTthreads(percent=NULL), error="but is length 0") test(1997.07, setDTthreads(percent=1:2), error="but is length 2") test(1997.08, setDTthreads(restore_after_fork=21), error="must be TRUE, FALSE, or NULL") old = getDTthreads() # (1) -oldenv = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") +oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") +oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS") +Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0") test(1997.09, setDTthreads(), old, warning="Ignoring invalid.*Please remove any.*not a digit") new = getDTthreads() # old above at (1) may not have been default. new now is. @@ -14208,12 +14210,20 @@ test(1997.13, setDTthreads(), new) new = getDTthreads() setDTthreads(percent=75) test(1997.14, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv) -test(1997.15, setDTthreads(old), new) -test(1997.16, getDTthreads(), old) -test(1997.17, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") +Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="100") +setDTthreads() +allcpu = getDTthreads() +Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") +Sys.setenv(R_DATATABLE_NUM_THREADS=allcpu) +setDTthreads() +test(1997.15, getDTthreads(), allcpu) +Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv1) +Sys.setenv(R_DATATABLE_NUM_THREADS=oldenv2) +test(1997.16, setDTthreads(old), allcpu) +test(1997.17, getDTthreads(), old) +test(1997.18, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") setDTthreads(throttle=65536) -test(1997.18, getDTthreads(TRUE), output="throttle==65536") +test(1997.19, getDTthreads(TRUE), output="throttle==65536") setDTthreads(throttle=1024) # test that a copy is being made and output is printed, #3385 after partial revert of #3281 diff --git a/src/openmp-utils.c b/src/openmp-utils.c index cfd0e3806f..51393f3b7c 100644 --- a/src/openmp-utils.c +++ b/src/openmp-utils.c @@ -20,7 +20,7 @@ static int getIntEnv(const char *name, int def) long int ans = strtol(val, &end, 10); // ignores leading whitespace. If it fully consumed the string, *end=='\0' and isspace('\0')==false while (isspace(*end)) end++; // ignore trailing whitespace if (errno || (size_t)(end-val)!=nchar || ans<1 || ans>INT_MAX) { - warning(_("Ignoring invalid %s==\")%s\". Not an integer >= 1. Please remove any characters that are not a digit [0-9]. See ?data.table::setDTthreads."), name, val); + warning(_("Ignoring invalid %s==\"%s\". Not an integer >= 1. Please remove any characters that are not a digit [0-9]. See ?data.table::setDTthreads."), name, val); return def; } return (int)ans; @@ -33,23 +33,26 @@ void initDTthreads() { // called at package startup from init.c // also called by setDTthreads(threads=NULL) (default) to reread environment variables; see setDTthreads below // No verbosity here in this setter. Verbosity is in getDTthreads(verbose=TRUE) - int ans = omp_get_num_procs(); // starting point is all logical CPUs. This is a hard limit; user cannot achieve more than this. - // ifndef _OPENMP then myomp.h defines this to be 1 - int perc = getIntEnv("R_DATATABLE_NUM_PROCS_PERCENT", 50); // use "NUM_PROCS" to use the same name as the OpenMP function this uses - // 50% of logical CPUs by default; half of 8 is 4 on laptop with 4 cores. 
Leaves plenty of room for other processes: #3395 & #3298 - if (perc<=1 || perc>100) { - warning(_("Ignoring invalid R_DATATABLE_NUM_PROCS_PERCENT==%d. If used it must be an integer between 2 and 100. Default is 50. See ?setDTtheads."), perc); - // not allowing 1 is to catch attempts to use 1 or 1.0 to represent 100%. - perc = 50; + int ans = getIntEnv("R_DATATABLE_NUM_THREADS", INT_MIN); + if (ans>=1) { + ans = imin(ans, omp_get_num_procs()); // num_procs is a hard limit; user cannot achieve more. ifndef _OPENMP then myomp.h defines this to be 1 + } else { + // Only when R_DATATABLE_NUM_THREADS is unset (or <=0) do we use PROCS_PERCENT; #4514 + int perc = getIntEnv("R_DATATABLE_NUM_PROCS_PERCENT", 50); // use "NUM_PROCS" to use the same name as the OpenMP function this uses + // 50% of logical CPUs by default; half of 8 is 4 on laptop with 4 cores. Leaves plenty of room for other processes: #3395 & #3298 + if (perc<=1 || perc>100) { + warning(_("Ignoring invalid R_DATATABLE_NUM_PROCS_PERCENT==%d. If used it must be an integer between 2 and 100. Default is 50. See ?setDTtheads."), perc); + // not allowing 1 is to catch attempts to use 1 or 1.0 to represent 100%. + perc = 50; + } + ans = imax(omp_get_num_procs()*perc/100, 1); // imax for when formula would result in 0. } - ans = imax(ans*perc/100, 1); ans = imin(ans, omp_get_thread_limit()); // honors OMP_THREAD_LIMIT when OpenMP started; e.g. CRAN sets this to 2. Often INT_MAX meaning unlimited/unset ans = imin(ans, omp_get_max_threads()); // honors OMP_NUM_THREADS when OpenMP started, plus reflects any omp_set_* calls made since - ans = imax(ans, 1); // just in case omp_get_* returned <= 0 for any reason // max_threads() -vs- num_procs(): https://software.intel.com/en-us/forums/intel-visual-fortran-compiler-for-windows/topic/302866 - ans = imin(ans, getIntEnv("R_DATATABLE_NUM_THREADS", INT_MAX)); ans = imin(ans, getIntEnv("OMP_THREAD_LIMIT", INT_MAX)); // user might expect `Sys.setenv(OMP_THREAD_LIMIT=2);setDTthreads()` to work. Satisfy this ans = imin(ans, getIntEnv("OMP_NUM_THREADS", INT_MAX)); // expectation by reading them again now. OpenMP just reads them on startup (quite reasonably) + ans = imax(ans, 1); // just in case omp_get_* returned <=0 for any reason, or the env variables above are set <=0 DTthreads = ans; DTthrottle = imax(1, getIntEnv("R_DATATABLE_THROTTLE", 1024)); // 2nd thread is used only when n>1024, 3rd thread when n>2048, etc } From ad7b67c80a551b7a1e2ef8b73d6162ed7737c934 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 21 Jun 2020 18:21:20 +0100 Subject: [PATCH 044/588] no need re-order for equi join (#4548) --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index a6b731fde6..e91a47d861 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -463,7 +463,7 @@ replace_dot_alias = function(e) { allLen1 = ans$allLen1 f__ = ans$starts len__ = ans$lens - allGrp1 = FALSE # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time. + allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time. indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix # length of input nomatch (single 0 or NA) is 1 in both cases. 
# When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5c607bcb27..11168db5c5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14584,7 +14584,7 @@ test(2023.6, DT[, .N, by = CLASS], data.table(CLASS=c("aaaa","dddd","gggg","eeee # more verbose timings #1265 DT = data.table(x=c("a","b","c","b","a","c"), y=c(1,3,6,1,6,3), v=1:6) setindex(DT, y) -test(2024, DT[y==6, v:=10L, verbose=TRUE], output=c("Constructing irows for.*", "Reorder irows for.*")) +test(2024, DT[y==6, v:=10L, verbose=TRUE], output="Constructing irows for.*") # fread embedded '\0', #3400 test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6))) From 9d3b9202fddb980345025a4f6ac451ed26a423be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Fri, 26 Jun 2020 03:24:34 +0200 Subject: [PATCH 045/588] Remove deep copy of indices from shallow() (#4440) --- NEWS.md | 3 +++ inst/tests/tests.Rraw | 33 ++++++++++++++++++++++++--------- src/assign.c | 20 ++++++++++++++++---- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 05e595a768..2d31090120 100644 --- a/NEWS.md +++ b/NEWS.md @@ -169,6 +169,9 @@ unit = "s") 10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528). +11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. + + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) ## NEW FEATURES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 11168db5c5..7b3902a2b5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12871,30 +12871,32 @@ ids <- sample(letters[1:3], 10, replace=TRUE) scores <- rnorm(10) dt <- data.table(id=ids, score=scores) dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) -test(1914.1, isS4(dt.s4)) -test(1914.2, inherits(dt.s4, 'data.table')) +test(1914.01, isS4(dt.s4)) +test(1914.02, inherits(dt.s4, 'data.table')) +# Test possible regression. 
shallow() needs to preserve the S4 bit to support S4 classes that contain data.table +test(1914.03, isS4(shallow(dt.s4))) ## pull out data from S4 as.list, and compare to list from dt dt.s4.list <- dt.s4@.Data names(dt.s4.list) <- names(dt.s4) -test(1914.3, dt.s4.list, as.list(dt)) # Underlying data not identical +test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical # simple S4 conversion-isms work df = data.frame(a=sample(letters, 10), b=1:10) dt = as.data.table(df) -test(1914.4, identical(as(df, 'data.table'), dt)) -test(1914.5, identical(as(dt, 'data.frame'), df)) +test(1914.05, identical(as(df, 'data.table'), dt)) +test(1914.06, identical(as(dt, 'data.frame'), df)) # data.table can be used in an S4 slot dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) -test(1914.6, dt.comp@data, dt) +test(1914.07, dt.comp@data, dt) # S4 methods dispatch properly on data.table slots" dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) -test(1914.7, dtGet(dt.comp), dt) # actually -test(1914.8, identical(dtGet(dt.comp, 1), dt[[1]])) -test(1914.9, identical(dtGet(dt.comp, 'b'), dt$b)) +test(1914.08, dtGet(dt.comp), dt) # actually +test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]])) +test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b)) removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev removeClass("S4Composition") # END port of old testthat tests @@ -16999,3 +17001,16 @@ B = data.table(x=1:2) X = A == B A[, y := 3:4] test(2148, colnames(X), c('x')) + +# shallow() shouldn't take a deep copy of indices, #4311 +dt <- data.table(a = c(3, 1)) +setindex(dt, a) +dt2 <- shallow(dt) +test(2149.1, address(attr(attr(dt, 'index'), '__a')), address(attr(attr(dt2, 'index'), '__a'))) +# Testing possible future regression. shallow() needs to copy the names of indices and keys. +setnames(dt2, 'a', 'A') +test(2149.2, indices(dt), 'a') +setkey(dt, a) +dt2 <- shallow(dt) +setnames(dt2, 'a', 'A') +test(2149.3, key(dt), 'a') diff --git a/src/assign.c b/src/assign.c index 1392079e72..88fc260655 100644 --- a/src/assign.c +++ b/src/assign.c @@ -152,13 +152,25 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) R_len_t i,l; int protecti=0; SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? - //copyMostAttrib(dt, newdt); // including class - DUPLICATE_ATTRIB(newdt, dt); + SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt))); + SET_OBJECT(newdt, OBJECT(dt)); + IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt); // To support S4 objects that incude data.table + //SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0 + // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It // also increases truelength. Perhaps make that distinction, then, and split out, but marked // so that the next change knows to duplicate. - // Does copyMostAttrib duplicate each attrib or does it point? It seems to point, hence DUPLICATE_ATTRIB - // for now otherwise example(merge.data.table) fails (since attr(d4,"sorted") gets written by setnames). 
+ // keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing + // some of its elements. + + // We copy all attributes that refer to column names so that calling setnames on either + // the original or the shallow copy doesn't break anything. + SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++; + setAttrib(newdt, sym_index, shallow_duplicate(index)); + + SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++; + setAttrib(newdt, sym_sorted, duplicate(sorted)); + SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; if (isNull(cols)) { From 6f360be0b2a6cf425f6df751ca9a99ec5d35ed93 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 26 Jun 2020 20:43:10 +0100 Subject: [PATCH 046/588] more ignores (#4565) --- .Rbuildignore | 9 +++++++++ .gitignore | 13 ++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index 2b3483fa0e..a910621f52 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,6 +1,13 @@ ^\.Rprofile$ ^data\.table_.*\.tar\.gz$ ^vignettes/plots/figures$ +^\.Renviron$ +^[^/]+\.R$ +^[^/]+\.csv$ +^[^/]+\.csvy$ +^[^/]+\.RDS$ +^[^/]+\.diff$ +^[^/]+\.patch$ ^\.ci$ ^\.dev$ @@ -30,3 +37,5 @@ ^bus$ ^pkgdown$ +^lib$ +^library$ diff --git a/.gitignore b/.gitignore index 35a25bc087..51cc13cd69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -# Source: https://github.com/github/gitignore/blob/master/R.gitignore # History files .RData .Rhistory @@ -29,7 +28,19 @@ vignettes/plots/figures *.so *.dll +# temp files *~ .DS_Store .idea *.sw[op] + +# common devel objects +.Renviron +lib +library +*.R +*.csv +*.csvy +*.RDS +*.diff +*.patch From ba32f3cba38ec270587e395f6e6c26a80be36be6 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 26 Jun 2020 23:05:25 +0100 Subject: [PATCH 047/588] towards #4200, overhead in dogroups (#4558) --- src/subset.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/subset.c b/src/subset.c index 91a4018e2c..f9c66e2df8 100644 --- a/src/subset.c +++ b/src/subset.c @@ -11,23 +11,41 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA) // negatives, zeros and out-of-bounds have already been dealt with in convertNegAndZero so we can rely // here on idx in range [1,length(ans)]. + const int nth = getDTthreads(n, /*throttle=*/true); + // For small n such as 2,3,4 etc we had hoped OpenMP would be sensible inside it and not create a team + // with each thread doing just one item. Otherwise, call overhead would be too high for highly iterated + // calls on very small subsets. Timings were tested in #3175. However, the overhead does seem to add up + // significantly. Hence the throttle was introduced, #4484. And not having the OpenMP region at all here + // when nth==1 (the ifs below in PARLOOP) seems to help too, #4200. + // To stress test the code for correctness by forcing multi-threading on for small data, the throttle can + // be turned off using setDThreads() or R_DATATABLE_THROTTLE environment variable. 
+ #define PARLOOP(_NAVAL_) \ if (anyNA) { \ - _Pragma("omp parallel for num_threads(getDTthreads(n, true))") \ - for (int i=0; i1) { \ + _Pragma("omp parallel for num_threads(nth)") \ + for (int i=0; i1) { \ + _Pragma("omp parallel for num_threads(nth)") \ + for (int i=0; i Date: Tue, 30 Jun 2020 23:35:58 +0100 Subject: [PATCH 048/588] R-devel check detects soft issues now (#4584) --- .ci/README.md | 6 +++--- .gitlab-ci.yml | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index 3464e20b25..7b8ea3d2a9 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -9,7 +9,7 @@ On each Pull Request opened in GitHub we run Travis CI and Appveyor to provide p Test jobs: - `test-rel-lin` - `r-release` on Linux, most comprehensive test environment, `-O3 -flto -fno-common -Wunused-result`, extra check for no compilation warnings, includes testing [_with other packages_](./../inst/tests/other.Rraw) ([extended suggests](./../inst/tests/tests-DESCRIPTION)) - `test-rel-cran-lin` - `--as-cran` on Linux, `-g0`, extra check for final status of `R CMD check` where we allow one NOTE (_size of tarball_). -- `test-dev-cran-lin` - `r-devel` and `--as-cran` on Linux, `--enable-strict-barrier --disable-long-double` +- `test-dev-cran-lin` - `r-devel` and `--as-cran` on Linux, `--with-recommended-packages --enable-strict-barrier --disable-long-double`, tests for compilation warnings in pkg install and new NOTEs/Warnings in pkg check, and because it is R-devel it is marked as allow_failure - `test-rel-vanilla-lin` - `r-release` on Linux, no suggested deps, no OpenMP, `-O0`, tracks memory usage during tests - `test-310-cran-lin` - R 3.1.0 on Linux - `test-344-cran-lin` - R 3.4.4 on Linux @@ -27,7 +27,7 @@ Artifacts: - sources - Windows binaries for `r-release` and `r-devel` - [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) -- [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) - note that all artifacts, including this page, are being published only when all test jobs successfully pass, thus one will not see an _ERROR_ status there (unless `allow_failure` option has been used in a job). +- [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) - note that all artifacts, including check results page, are being published only when all test jobs successfully pass, thus one will not see an _ERROR_ status there (unless error happened on a job marked as `allow_failure`). - [docker images](https://gitlab.com/Rdatatable/data.table/container_registry) - copy/paste-able `docker pull` commands can be found at the bottom of our [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) ### [Travis CI](./../.travis.yml) @@ -64,7 +64,7 @@ Base R implemented helper script to orchestrate generation of most artifacts. It Template file to produce `Dockerfile` for, as of now, three docker images. Docker images are being built and published in [_deploy_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). 
- `r-base-dev` using `r-release`: publish docker image of `data.table` on R-release - `r-builder` using `r-release`: publish on R-release and OS dependencies for building Rmarkdown vignettes -- `r-devel`: publish docker image of `data.table` on R-devel +- `r-devel`: publish docker image of `data.table` on R-devel built with `--with-recommended-packages --enable-strict-barrier --disable-long-double` ### [`deploy.sh`](./deploy.sh) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 51faefe280..dfda08355b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -186,9 +186,25 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (size of tarball) but ", shQuote(l)) else q("no")' -test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double - <<: *test-cran-lin +test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure + <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel + allow_failure: true + variables: + _R_CHECK_CRAN_INCOMING_: "TRUE" + _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" + before_script: + - *install-deps + - *cp-src + - rm -r bus + script: + - *mv-src + - cd bus/$CI_BUILD_NAME + - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src + - (! grep "warning:" data.table.Rcheck/00install.out) + - >- + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, installed package size, top-level files) but ", shQuote(l)) else q("no")' test-310-cran-lin: ## R-3.1.0 on Linux, stated dependency of R <<: *test-cran-lin From 862d8ac897489e92d2d2db1ad3bc59498098280f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 14 Jul 2020 17:51:55 +0800 Subject: [PATCH 049/588] Add support for native parsing of iso8601 dates/timestamps in fread (#4464) --- .dev/.bash_aliases | 2 + NEWS.md | 5 ++ R/fread.R | 5 +- R/test.data.table.R | 28 ++++--- inst/tests/tests.Rraw | 74 +++++++++++++++++- man/fread.Rd | 6 +- src/data.table.h | 4 + src/fread.c | 170 ++++++++++++++++++++++++++++++++++++++---- src/fread.h | 22 +++--- src/freadLookups.h | 115 ++++++++++++++++++++++++++++ src/freadR.c | 63 +++++++++++++--- src/freadR.h | 7 +- src/init.c | 9 +++ 13 files changed, 457 insertions(+), 53 deletions(-) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 93ea44ed5c..d9e2b6a387 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -5,6 +5,8 @@ # git config --global difftool.prompt false alias gd='git difftool &> /dev/null' alias gdm='git difftool master &> /dev/null' +# If meld has scrolling issues, turn off GTK animation (which I don't need anyway): +# https://gitlab.gnome.org/GNOME/meld/-/issues/479#note_866040 alias Rdevel='~/build/R-devel/bin/R --vanilla' alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' diff --git a/NEWS.md b/NEWS.md index 2d31090120..b6298b1f5f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,11 @@ # data.table [v1.12.9](https://github.com/Rdatatable/data.table/milestone/19) (in development) +## POTENTIALLY BREAKING CHANGES + +1. 
`fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour should you need it: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. For example, calls already using `colClasses="POSIXct"` will now use the faster parser if the `Z` is present, otherwise R's `as.POSIXct` will be used as before which interprets datetimes that are missing the UTC marker to be in the local timezone. + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, and UTC-marked datetimes. + ## NEW FEATURES 1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). diff --git a/R/fread.R b/R/fread.R index d57d2cd6fd..9a627613c7 100644 --- a/R/fread.R +++ b/R/fread.R @@ -295,7 +295,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) "complex" = as.complex(v), "raw" = as_raw(v), # Internal implementation "Date" = as.Date(v), - "POSIXct" = as.POSIXct(v), + "POSIXct" = as.POSIXct(v), # test 2150.14 covers this by setting the option to restore old behaviour. Otherwise types that + # are recognized by freadR.c (e.g. POSIXct; #4464) result in user-override-bump at C level before reading so do not reach this switch + # see https://github.com/Rdatatable/data.table/pull/4464#discussion_r447275278. + # Aside: as(v,"POSIXct") fails with error in R so has to be caught explicitly above # finally: methods::as(v, new_class)) }, diff --git a/R/test.data.table.R b/R/test.data.table.R index 4c86299cbd..9c895c69a4 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -50,11 +50,13 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F } fn = setNames(file.path(fulldir, fn), file.path(subdir, fn)) + # These environment variables are restored to their previous state (including not defined) after sourcing test script + oldEnv = Sys.getenv(c("_R_CHECK_LENGTH_1_LOGIC2_", "TZ"), unset=NA_character_) # From R 3.6.0 onwards, we can check that && and || are using only length-1 logicals (in the test suite) # rather than relying on x && y being equivalent to x[[1L]] && y[[1L]] silently. 
- orig__R_CHECK_LENGTH_1_LOGIC2_ = Sys.getenv("_R_CHECK_LENGTH_1_LOGIC2_", unset = NA_character_) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) - # This environment variable is restored to its previous state (including not defined) after sourcing test script + # TZ is not changed here so that tests run under the user's timezone. But we save and restore it here anyway just in case + # the test script stops early during a test that changes TZ (e.g. 2124 referred to in PR #4464). oldRNG = suppressWarnings(RNGversion("3.5.0")) # sample method changed in R 3.6 to remove bias; see #3431 for links and notes @@ -81,7 +83,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F warnPartialMatchArgs = base::getRversion()>="3.6.0", # ensure we don't rely on partial argument matching in internal code, #3664; >=3.6.0 for #3865 warnPartialMatchAttr = TRUE, warnPartialMatchDollar = TRUE, - width = max(getOption('width'), 80L) # some tests (e.g. 1066, 1293) rely on capturing output that will be garbled with small width + width = max(getOption('width'), 80L), # some tests (e.g. 1066, 1293) rely on capturing output that will be garbled with small width + datatable.old.fread.datetime.character = FALSE ) cat("getDTthreads(verbose=TRUE):\n") # for tracing on CRAN; output to log before anything is attempted @@ -115,10 +118,11 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F err = try(sys.source(fn, envir=env), silent=silent) options(oldOptions) - if (is.na(orig__R_CHECK_LENGTH_1_LOGIC2_)) { - Sys.unsetenv("_R_CHECK_LENGTH_1_LOGIC2_") - } else { - Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = orig__R_CHECK_LENGTH_1_LOGIC2_) # nocov + for (i in oldEnv) { + if (is.na(oldEnv[i])) + Sys.unsetenv(names(oldEnv)[i]) + else + do.call("Sys.setenv", as.list(oldEnv[i])) # nocov } # Sys.setlocale("LC_CTYPE", oldlocale) suppressWarnings(do.call("RNGkind",as.list(oldRNG))) @@ -129,14 +133,16 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # of those 13 line and give a better chance of seeing more of the output before it. Having said that, CRAN # does show the full file output these days, so the 13 line limit no longer bites so much. It still bit recently # when receiving output of R CMD check sent over email, though. 
+ tz = Sys.getenv("TZ", unset=NA) cat("\n", date(), # so we can tell exactly when these tests ran on CRAN to double-check the result is up to date " endian==", .Platform$endian, ", sizeof(long double)==", .Machine$sizeof.longdouble, ", sizeof(pointer)==", .Machine$sizeof.pointer, - ", TZ=", suppressWarnings(Sys.timezone()), - ", locale='", Sys.getlocale(), "'", - ", l10n_info()='", paste0(names(l10n_info()), "=", l10n_info(), collapse="; "), "'", - ", getDTthreads()='", paste0(gsub("[ ][ ]+","==",gsub("^[ ]+","",capture.output(invisible(getDTthreads(verbose=TRUE))))), collapse="; "), "'", + ", TZ==", if (is.na(tz)) "unset" else paste0("'",tz,"'"), + ", Sys.timezone()=='", suppressWarnings(Sys.timezone()), "'", + ", Sys.getlocale()=='", Sys.getlocale(), "'", + ", l10n_info()=='", paste0(names(l10n_info()), "=", l10n_info(), collapse="; "), "'", + ", getDTthreads()=='", paste0(gsub("[ ][ ]+","==",gsub("^[ ]+","",capture.output(invisible(getDTthreads(verbose=TRUE))))), collapse="; "), "'", "\n", sep="") if (inherits(err,"try-error")) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7b3902a2b5..abf5df09ed 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8645,6 +8645,8 @@ if (test_R.utils) { # fix for #1573 ans1 = fread(testDir("issue_1573_fill.txt"), fill=TRUE, na.strings="") ans2 = setDT(read.table(testDir("issue_1573_fill.txt"), header=TRUE, fill=TRUE, stringsAsFactors=FALSE, na.strings="")) +date_cols = c('SD2', 'SD3', 'SD4') +ans2[ , (date_cols) := lapply(.SD, as.IDate), .SDcols = date_cols] test(1622.1, ans1, ans2) test(1622.2, ans1, fread(testDir("issue_1573_fill.txt"), fill=TRUE, sep=" ", na.strings="")) @@ -10756,7 +10758,9 @@ test(1743.08, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "int test(1743.09, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex")) test(1743.10, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), c(a="Date",b="integer",c="complex",d="integer")) test(1743.11, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), c(a="Date",b="integer",c="complex",d="raw")) -test(1743.12, x = vapply(fread("a,b\n2015-01-01,2015-01-01", colClasses = c(NA, "IDate")), inherits, what = "IDate", FUN.VALUE = logical(1)), y = c(a=FALSE, b=TRUE)) +test(1743.121, sapply(fread("a,b\n2015-01-01,2015-01-01", colClasses=c(NA,"IDate")), inherits, what="IDate"), c(a=TRUE, b=TRUE)) +test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.Date("2015-01-01"))) +test(1743.123, fread("a,b\n1+3i,2015-01-01", colClasses=c(NA,"IDate")), data.table(a="1+3i", b=as.IDate("2015-01-01"))) ## Attempts to impose incompatible colClasses is a warning (not an error) ## and does not change the value of the columns @@ -16611,12 +16615,13 @@ dt = data.table(SomeNumberA=c(1,1,1),SomeNumberB=c(1,1,1)) test(2123, dt[, .(.N, TotalA=sum(SomeNumberA), TotalB=sum(SomeNumberB)), by=SomeNumberA], data.table(SomeNumberA=1, N=3L, TotalA=1, TotalB=3)) # system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085 -oldtz=Sys.getenv('TZ') +oldtz=Sys.getenv('TZ', unset=NA) Sys.setenv(TZ='Asia/Jakarta') # UTC+7 t0 = as.POSIXct('2019-10-01') test(2124.1, format(as.ITime(t0)), '00:00:00') test(2124.2, format(as.IDate(t0)), '2019-10-01') -Sys.setenv(TZ=oldtz) +if (is.na(oldtz)) Sys.unsetenv("TZ") else 
Sys.setenv(TZ=oldtz) +# careful to unset because TZ="" means UTC whereas unset TZ means local # trunc.cols in print.data.table, #4074 old_width = options("width" = 40) @@ -17014,3 +17019,66 @@ setkey(dt, a) dt2 <- shallow(dt) setnames(dt2, 'a', 'A') test(2149.3, key(dt), 'a') + +# native reading of [-]?[0-9]+[-][0-9]{2}[-][0-9]{2} dates and +# [T ][0-9]{2}[:][0-9]{2}[:][0-9]{2}(?:[.][0-9]+)?(?:Z|[+-][0-9]{2}[:]?[0-9]{2})? timestamps +dates = as.IDate(c(9610, 19109, 19643, 20385, -1413, 9847, 4116, -11145, -2327, 1760)) +times = .POSIXct(tz = 'UTC', c( + 937402277.067304, -626563403.382897, -506636228.039861, -2066740882.02417, + -2398617863.28256, -1054008563.60793, 1535199547.55902, 2075410085.54399, + 1201364458.72486, 939956943.690777 +)) +DT = data.table(dates, times) +tmp = tempfile() +## ISO8601 format (%FT%TZ) by default +fwrite(DT, tmp) +test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preserving +fwrite(DT, tmp, dateTimeAs='write.csv') # writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker +test(2150.021, sapply(fread(tmp), typeof), c(dates="integer", times="character")) # as before v1.13.0, datetime with missing timezone read as character +oldtz = Sys.getenv("TZ", unset=NA) +Sys.setenv(TZ="UTC") +# as before v1.13.0, dispatches to as.POSIXct() which interprets as local time, so we need to set TZ here to get the original UTC times from the write.csv version +tt = fread(tmp, colClasses=list(POSIXct="times")) +test(2150.022, attr(tt$times, "tzone"), "") # as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) +setattr(tt$times, "tzone", "UTC") +test(2150.023, tt, DT) +if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) +fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp) +test(2150.03, fread(tmp), DT) +fwrite(copy(DT)[ , times := format(times, '%FT%T+0000')], tmp) +test(2150.04, fread(tmp), DT) +fwrite(copy(DT)[ , times := format(times, '%FT%T+0115')], tmp) +test(2150.05, fread(tmp), copy(DT)[ , times := times - 4500]) +fwrite(copy(DT)[ , times := format(times, '%FT%T+01')], tmp) +test(2150.06, fread(tmp), copy(DT)[ , times := times - 3600]) +## invalid tz specifiers +fwrite(copy(DT)[ , times := format(times, '%FT%T+3600')], tmp) +test(2150.07, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+3600')]) +fwrite(copy(DT)[ , times := format(times, '%FT%T+36')], tmp) +test(2150.08, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+36')]) +fwrite(copy(DT)[ , times := format(times, '%FT%T+XXX')], tmp) +test(2150.09, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+XXX')]) +fwrite(copy(DT)[ , times := format(times, '%FT%T+00:XX')], tmp) +test(2150.10, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+00:XX')]) +# allow colClasses='POSIXct' to force YMD column to read as POSIXct +test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local time for backwards compatibility + data.table(a=as.POSIXct("2015-01-01"), b=as.POSIXct("2015-01-01"))) +test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage + data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01"))) +test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2"), # no Z so as character as before v1.13.0 + data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2))) +# some rows are date-only, some rows UTC-timestamp --> read the date-only in UTC too +test(2150.14, 
fread("a,b\n2015-01-01,1.1\n2015-01-02T01:02:03Z,1.2"), + data.table(a = .POSIXct(1420070400 + c(0, 90123), tz="UTC"), b = c(1.1, 1.2))) +old = options(datatable.old.fread.datetime.character=TRUE) +test(2150.15, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03T01:02:03Z"), + data.table(a="2015-01-01", b="2015-01-02", c="2015-01-03T01:02:03Z")) +test(2150.16, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), + ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03"))) +ans_print = capture.output(print(ans)) +options(datatable.old.fread.datetime.character=NULL) +test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), + ans, output=ans_print) +test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), + data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) +options(old) diff --git a/man/fread.Rd b/man/fread.Rd index 3a6daa083b..c11013b710 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -2,11 +2,11 @@ \alias{fread} \title{ Fast and friendly file finagler } \description{ - Similar to \code{read.table} but faster and more convenient. All controls such as \code{sep}, \code{colClasses} and \code{nrows} are automatically detected. \code{bit64::integer64} types are also detected and read directly without needing to read as character before converting. + Similar to \code{read.table} but faster and more convenient. All controls such as \code{sep}, \code{colClasses} and \code{nrows} are automatically detected. - Dates are read as character currently. They can be converted afterwards using the excellent \code{fasttime} package or standard base functions. + \code{bit64::integer64}, \code{\link{IDate}}, and \code{\link{POSIXct}} types are also detected and read directly without needing to read as character before converting. - `fread` is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. + \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. 
} \usage{ fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"", diff --git a/src/data.table.h b/src/data.table.h index 6f907bfa76..c6e4bbf61e 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -77,6 +77,8 @@ extern SEXP char_ITime; extern SEXP char_IDate; extern SEXP char_Date; extern SEXP char_POSIXct; +extern SEXP char_POSIXt; +extern SEXP char_UTC; extern SEXP char_nanotime; extern SEXP char_lens; extern SEXP char_indices; @@ -97,6 +99,8 @@ extern SEXP sym_verbose; extern SEXP SelfRefSymbol; extern SEXP sym_inherits; extern SEXP sym_datatable_locked; +extern SEXP sym_tzone; +extern SEXP sym_old_fread_datetime_character; extern double NA_INT64_D; extern long long NA_INT64_LL; extern Rcomplex NA_CPLX; // initialized in init.c; see there for comments diff --git a/src/fread.c b/src/fread.c index c94aeac069..1dfb8ae77e 100644 --- a/src/fread.c +++ b/src/fread.c @@ -66,8 +66,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL; static lenOff *colNames = NULL; static freadMainArgs args; // global for use by DTPRINT -const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "string"}; -int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 8 }; +const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; +int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043 // So we assign them through below init function. @@ -571,11 +571,9 @@ static void Field(FieldParseContext *ctx) } } - -static void StrtoI32(FieldParseContext *ctx) +static void str_to_i32_core(const char **pch, int32_t *target) { - const char *ch = *(ctx->ch); - int32_t *target = (int32_t*) ctx->targets[sizeof(int32_t)]; + const char *ch = *pch; if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return; bool neg = *ch=='-'; @@ -605,12 +603,17 @@ static void StrtoI32(FieldParseContext *ctx) // (acc==0 && ch-start==1) ) { if ((sf || ch>start) && sf<=10 && acc<=INT32_MAX) { *target = neg ? -(int32_t)acc : (int32_t)acc; - *(ctx->ch) = ch; + *pch = ch; } else { *target = NA_INT32; // empty field ideally, contains NA and fall through to check if NA (in which case this write is important), or just plain invalid } } +static void StrtoI32(FieldParseContext *ctx) +{ + str_to_i32_core(ctx->ch, (int32_t*) ctx->targets[sizeof(int32_t)]); +} + static void StrtoI64(FieldParseContext *ctx) { @@ -669,11 +672,10 @@ cat("1.0E300L\n};\n", file=f, append=TRUE) * of precision, for example `1.2439827340958723094785103` will not be parsed * as a double. */ -static void parse_double_regular(FieldParseContext *ctx) +static void parse_double_regular_core(const char **pch, double *target) { #define FLOAT_MAX_DIGITS 18 - const char *ch = *(ctx->ch); - double *target = (double*) ctx->targets[sizeof(double)]; + const char *ch = *pch; if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return; bool neg, Eneg; @@ -784,13 +786,16 @@ static void parse_double_regular(FieldParseContext *ctx) r *= pow10lookup[e]; *target = (double)(neg? 
-r : r); - *(ctx->ch) = ch; + *pch = ch; return; fail: *target = NA_FLOAT64; } +static void parse_double_regular(FieldParseContext *ctx) { + parse_double_regular_core(ctx->ch, (double*) ctx->targets[sizeof(double)]); +} /** @@ -937,6 +942,136 @@ static void parse_double_hexadecimal(FieldParseContext *ctx) *target = NA_FLOAT64; } +/* +f = 'src/freadLookups.h' +cat('const uint8_t cumDaysCycleYears[401] = {\n', file=f, append=TRUE) +t = format(as.double(difftime(as.Date(sprintf('%04d-01-01', 1600:1999)), .Date(0), units='days'))) +rows = paste0(apply(matrix(t, ncol = 4L, byrow = TRUE), 1L, paste, collapse = ', '), ',\n') +cat(rows, sep='', file=f, append=TRUE) +cat(146097, '// total days in 400 years\n};\n', sep = '', file=f, append=TRUE) +*/ +static void parse_iso8601_date_core(const char **pch, int32_t *target) +{ + const char *ch = *pch; + + int32_t year, month, day; + + str_to_i32_core(&ch, &year); + + // .Date(.Machine$integer.max*c(-1, 1)): + // -5877641-06-24 -- 5881580-07-11 + // rather than fiddle with dates within those terminal years (unlikely + // to be showing up in data sets any time soon), just truncate towards 0 + if (year == NA_INT32 || year < -5877640 || year > 5881579 || *ch != '-') + goto fail; + + // Multiples of 4, excluding 3/4 of centuries + bool isLeapYear = year % 4 == 0 && (year % 100 != 0 || year/100 % 4 == 0); + ch++; + + str_to_i32_core(&ch, &month); + if (month == NA_INT32 || month < 1 || month > 12 || *ch != '-') + goto fail; + ch++; + + str_to_i32_core(&ch, &day); + if (day == NA_INT32 || day < 1 || + (day > (isLeapYear ? leapYearDays[month-1] : normYearDays[month-1]))) + goto fail; + + *target = + (year/400 - 4)*cumDaysCycleYears[400] + // days to beginning of 400-year cycle + cumDaysCycleYears[year % 400] + // days to beginning of year within 400-year cycle + (isLeapYear ? 
cumDaysCycleMonthsLeap[month-1] : cumDaysCycleMonthsNorm[month-1]) + // days to beginning of month within year + day-1; // day within month (subtract 1: 1970-01-01 -> 0) + + *pch = ch; + return; + + fail: + *target = NA_FLOAT64; +} + +static void parse_iso8601_date(FieldParseContext *ctx) { + parse_iso8601_date_core(ctx->ch, (int32_t*) ctx->targets[sizeof(int32_t)]); +} + +static void parse_iso8601_timestamp(FieldParseContext *ctx) +{ + const char *ch = *(ctx->ch); + double *target = (double*) ctx->targets[sizeof(double)]; + + int32_t date, hour=0, minute=0, tz_hour=0, tz_minute=0; + double second=0; + + parse_iso8601_date_core(&ch, &date); + if (date == NA_INT32) + goto fail; + if (*ch != ' ' && *ch != 'T') + goto date_only; + // allows date-only field in a column with UTC-marked datetimes to be parsed as UTC too; test 2150.13 + ch++; + + str_to_i32_core(&ch, &hour); + if (hour == NA_INT32 || hour < 0 || hour > 23 || *ch != ':') + goto fail; + ch++; + + str_to_i32_core(&ch, &minute); + if (minute == NA_INT32 || minute < 0 || minute > 59 || *ch != ':') + goto fail; + ch++; + + parse_double_regular_core(&ch, &second); + if (second == NA_FLOAT64 || second < 0 || second >= 60) + goto fail; + + if (*ch == 'Z') { + ch++; // "Zulu time"=UTC + } else { + if (*ch == ' ') + ch++; + if (*ch == '+' || *ch == '-') { + const char *start = ch; // facilitates distinguishing +04, +0004, +0000, +00:00 + // three recognized formats: [+-]AA:BB, [+-]AABB, and [+-]AA + str_to_i32_core(&ch, &tz_hour); + if (tz_hour == NA_INT32) + goto fail; + if (ch - start == 5 && tz_hour != 0) { // +AABB + if (abs(tz_hour) > 2400) + goto fail; + tz_minute = tz_hour % 100; + tz_hour /= 100; + } else if (ch - start == 3) { + if (abs(tz_hour) > 24) + goto fail; + if (*ch == ':') { + ch++; + str_to_i32_core(&ch, &tz_minute); + if (tz_minute == NA_INT32) + goto fail; + } + } + } else { + goto fail; + // if neither Z nor UTC offset is present, then it's local time and that's not directly supported yet; see news for v1.13.0 + // if local time is UTC (TZ="" or TZ=="UTC") then it's UTC though and that could be fairly easily checked here + // tz= could also be added as new argument of fread to allow user to specify datetime is UTC where the Z or offset is missing from the data + } + } + + date_only: + + //Rprintf("date=%d\thour=%d\tz_hour=%d\tminute=%d\ttz_minute=%d\tsecond=%.1f\n", date, hour, tz_hour, minute, tz_minute, second); + // cast upfront needed to prevent silent overflow + *target = 86400*(double)date + 3600*(hour - tz_hour) + 60*(minute - tz_minute) + second; + + *(ctx->ch) = ch; + return; + + fail: + *target = NA_FLOAT64; +} /* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */ static void parse_bool_numeric(FieldParseContext *ctx) @@ -1005,7 +1140,13 @@ static void parse_bool_lowercase(FieldParseContext *ctx) } - +/* How to register a new parser + * (1) Write the parser + * (2) Add it to fun array here + * (3) Extend disabled_parsers, typeName, and typeSize here as appropriate + * (4) Extend colType typdef in fread.h as appropriate + * (5) Extend typeSxp, typeRName, typeEnum in freadR.c as appropriate + */ typedef void (*reader_fun_t)(FieldParseContext *ctx); static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &Field, @@ -1018,10 +1159,12 @@ static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &parse_double_regular, (reader_fun_t) &parse_double_extended, (reader_fun_t) &parse_double_hexadecimal, + (reader_fun_t) &parse_iso8601_date, + (reader_fun_t) &parse_iso8601_timestamp, (reader_fun_t) &Field }; -static 
int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) { // used in sampling column types and whether column names are present @@ -1151,6 +1294,7 @@ int freadMain(freadMainArgs _args) { nastr++; } disabled_parsers[CT_BOOL8_N] = !args.logical01; + disabled_parsers[CT_ISO8601_DATE] = disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS if (verbose) { if (*NAstrings == NULL) { DTPRINT(_(" No NAstrings provided.\n")); diff --git a/src/fread.h b/src/fread.h index 60894a873f..1a7403868b 100644 --- a/src/fread.h +++ b/src/fread.h @@ -13,19 +13,21 @@ // Ordered hierarchy of types typedef enum { - NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management - CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule - CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. + NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management + CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule + CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, - CT_INT32, // int32_t - CT_INT64, // int64_t - CT_FLOAT64, // double (64-bit IEEE 754 float) - CT_FLOAT64_EXT, // double, with NAN/INF literals - CT_FLOAT64_HEX, // double, in hexadecimal format - CT_STRING, // lenOff struct below - NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds + CT_INT32, // int32_t + CT_INT64, // int64_t + CT_FLOAT64, // double (64-bit IEEE 754 float) + CT_FLOAT64_EXT, // double, with NAN/INF literals + CT_FLOAT64_HEX, // double, in hexadecimal format + CT_ISO8601_DATE, // integer, as read from a date in ISO-8601 format + CT_ISO8601_TIME, // double, as read from a timestamp in ISO-8601 time + CT_STRING, // lenOff struct below + NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds } colType; extern int8_t typeSize[NUMTYPE]; diff --git a/src/freadLookups.h b/src/freadLookups.h index bb736a60ac..80c4861014 100644 --- a/src/freadLookups.h +++ b/src/freadLookups.h @@ -26,7 +26,122 @@ const uint8_t hexdigits[256] = { 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99 // 0xF0 - 0xFF }; +const uint8_t normYearDays[12] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; +const uint16_t cumDaysCycleMonthsNorm[12] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 +}; +const uint8_t leapYearDays[12] = {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; +const uint16_t cumDaysCycleMonthsLeap[12] = { + 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335 +}; +// pattern of leap days repeats every 400 years, starting with e.g. 1600. +// to facilitate working with 1970-01-01 UTC epoch time, days are expressed +// relative to 1970 % 400. 
401st element is 365*400+100-3, +// the total number of days in 400 years [100 leap years - 3 non-leap centuries] +const int32_t cumDaysCycleYears[401] = { +-135140, -134774, -134409, -134044, +-133679, -133313, -132948, -132583, +-132218, -131852, -131487, -131122, +-130757, -130391, -130026, -129661, +-129296, -128930, -128565, -128200, +-127835, -127469, -127104, -126739, +-126374, -126008, -125643, -125278, +-124913, -124547, -124182, -123817, +-123452, -123086, -122721, -122356, +-121991, -121625, -121260, -120895, +-120530, -120164, -119799, -119434, +-119069, -118703, -118338, -117973, +-117608, -117242, -116877, -116512, +-116147, -115781, -115416, -115051, +-114686, -114320, -113955, -113590, +-113225, -112859, -112494, -112129, +-111764, -111398, -111033, -110668, +-110303, -109937, -109572, -109207, +-108842, -108476, -108111, -107746, +-107381, -107015, -106650, -106285, +-105920, -105554, -105189, -104824, +-104459, -104093, -103728, -103363, +-102998, -102632, -102267, -101902, +-101537, -101171, -100806, -100441, +-100076, -99710, -99345, -98980, + -98615, -98250, -97885, -97520, + -97155, -96789, -96424, -96059, + -95694, -95328, -94963, -94598, + -94233, -93867, -93502, -93137, + -92772, -92406, -92041, -91676, + -91311, -90945, -90580, -90215, + -89850, -89484, -89119, -88754, + -88389, -88023, -87658, -87293, + -86928, -86562, -86197, -85832, + -85467, -85101, -84736, -84371, + -84006, -83640, -83275, -82910, + -82545, -82179, -81814, -81449, + -81084, -80718, -80353, -79988, + -79623, -79257, -78892, -78527, + -78162, -77796, -77431, -77066, + -76701, -76335, -75970, -75605, + -75240, -74874, -74509, -74144, + -73779, -73413, -73048, -72683, + -72318, -71952, -71587, -71222, + -70857, -70491, -70126, -69761, + -69396, -69030, -68665, -68300, + -67935, -67569, -67204, -66839, + -66474, -66108, -65743, -65378, + -65013, -64647, -64282, -63917, + -63552, -63186, -62821, -62456, + -62091, -61726, -61361, -60996, + -60631, -60265, -59900, -59535, + -59170, -58804, -58439, -58074, + -57709, -57343, -56978, -56613, + -56248, -55882, -55517, -55152, + -54787, -54421, -54056, -53691, + -53326, -52960, -52595, -52230, + -51865, -51499, -51134, -50769, + -50404, -50038, -49673, -49308, + -48943, -48577, -48212, -47847, + -47482, -47116, -46751, -46386, + -46021, -45655, -45290, -44925, + -44560, -44194, -43829, -43464, + -43099, -42733, -42368, -42003, + -41638, -41272, -40907, -40542, + -40177, -39811, -39446, -39081, + -38716, -38350, -37985, -37620, + -37255, -36889, -36524, -36159, + -35794, -35428, -35063, -34698, + -34333, -33967, -33602, -33237, + -32872, -32506, -32141, -31776, + -31411, -31045, -30680, -30315, + -29950, -29584, -29219, -28854, + -28489, -28123, -27758, -27393, + -27028, -26662, -26297, -25932, + -25567, -25202, -24837, -24472, + -24107, -23741, -23376, -23011, + -22646, -22280, -21915, -21550, + -21185, -20819, -20454, -20089, + -19724, -19358, -18993, -18628, + -18263, -17897, -17532, -17167, + -16802, -16436, -16071, -15706, + -15341, -14975, -14610, -14245, + -13880, -13514, -13149, -12784, + -12419, -12053, -11688, -11323, + -10958, -10592, -10227, -9862, + -9497, -9131, -8766, -8401, + -8036, -7670, -7305, -6940, + -6575, -6209, -5844, -5479, + -5114, -4748, -4383, -4018, + -3653, -3287, -2922, -2557, + -2192, -1826, -1461, -1096, + -731, -365, 0, 365, + 730, 1096, 1461, 1826, + 2191, 2557, 2922, 3287, + 3652, 4018, 4383, 4748, + 5113, 5479, 5844, 6209, + 6574, 6940, 7305, 7670, + 8035, 8401, 8766, 9131, + 9496, 9862, 10227, 10592, +146097// total 
days in 400 years +}; const long double pow10lookup[601] = { 1.0E-300L, 1.0E-299L, diff --git a/src/freadR.c b/src/freadR.c index 6419e4e71e..fafd8fd283 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -24,9 +24,9 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards -static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, STRSXP, REALSXP, STRSXP }; -static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "character", "numeric", "CLASS" }; -static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_STRING, CT_FLOAT64, CT_STRING}; +static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; +static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; +static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; static colType readInt64As=CT_INT64; static SEXP selectSxp; static SEXP dropSxp; @@ -44,6 +44,7 @@ static int ncol = 0; static int64_t dtnrows = 0; static bool verbose = false; static bool warningsAreErrors = false; +static bool oldNoDateTime = false; SEXP freadR( @@ -128,6 +129,11 @@ SEXP freadR( } args.logical01 = LOGICAL(logical01Arg)[0]; + { + SEXP tt = PROTECT(GetOption(sym_old_fread_datetime_character, R_NilValue)); + args.oldNoDateTime = oldNoDateTime = isLogical(tt) && LENGTH(tt)==1 && LOGICAL(tt)[0]==TRUE; + UNPROTECT(1); + } args.skipNrow=-1; args.skipString=NULL; if (isString(skipArg)) { @@ -305,6 +311,11 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int if (length(colClassesSxp)) { SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT)); for (int i=0; i #include "po.h" -#define FREAD_MAIN_ARGS_EXTRA_FIELDS +#define FREAD_MAIN_ARGS_EXTRA_FIELDS \ + bool oldNoDateTime; #define FREAD_PUSH_BUFFERS_EXTRA_FIELDS \ - int nStringCols; \ - int nNonStringCols; + int nStringCols; \ + int nNonStringCols; // Before error() [or warning() with options(warn=2)] call freadCleanup() to close mmp and fix : // http://stackoverflow.com/questions/18597123/fread-data-table-locks-files diff --git a/src/init.c b/src/init.c index d650a64661..4e7c5ec313 100644 --- a/src/init.c +++ b/src/init.c @@ -10,6 +10,8 @@ SEXP char_ITime; SEXP char_IDate; SEXP char_Date; SEXP char_POSIXct; +SEXP char_POSIXt; +SEXP char_UTC; SEXP char_nanotime; SEXP char_lens; SEXP char_indices; @@ -30,6 +32,8 @@ SEXP sym_verbose; SEXP SelfRefSymbol; SEXP sym_inherits; SEXP sym_datatable_locked; +SEXP sym_tzone; +SEXP sym_old_fread_datetime_character; double NA_INT64_D; long long NA_INT64_LL; Rcomplex NA_CPLX; @@ -311,8 +315,11 @@ void attribute_visible R_init_datatable(DllInfo *info) // either use PRINTNAME(install()) or R_PreserveObject(mkChar()) here. 
char_integer64 = PRINTNAME(install("integer64")); char_ITime = PRINTNAME(install("ITime")); + char_IDate = PRINTNAME(install("IDate")); char_Date = PRINTNAME(install("Date")); // used for IDate too since IDate inherits from Date char_POSIXct = PRINTNAME(install("POSIXct")); + char_POSIXt = PRINTNAME(install("POSIXt")); + char_UTC = PRINTNAME(install("UTC")); char_nanotime = PRINTNAME(install("nanotime")); char_starts = PRINTNAME(sym_starts = install("starts")); char_lens = PRINTNAME(install("lens")); @@ -346,6 +353,8 @@ void attribute_visible R_init_datatable(DllInfo *info) SelfRefSymbol = install(".internal.selfref"); sym_inherits = install("inherits"); sym_datatable_locked = install(".data.table.locked"); + sym_tzone = install("tzone"); + sym_old_fread_datetime_character = install("datatable.old.fread.datetime.character"); initDTthreads(); avoid_openmp_hang_within_fork(); From fc2779995f92ff74eca37873d5eb151558c2c0e4 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 15 Jul 2020 03:33:07 +0300 Subject: [PATCH 050/588] clarify startup message (#4516) --- R/onAttach.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/onAttach.R b/R/onAttach.R index 57007b417c..75b48eb394 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -25,9 +25,13 @@ if (dev && (Sys.Date() - as.Date(d))>28L) packageStartupMessage("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") if (!.Call(ChasOpenMP)) - packageStartupMessage("**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.", - " If this is a Mac, please ensure you are using R>=3.4.0 and have followed our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation.", - " This warning message should not occur on Windows or Linux. If it does, please file a GitHub issue.\n**********") + packageStartupMessage("**********\n", + "This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", + if (Sys.info()["sysname"]=="Darwin") + "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux." + else + paste0("This is ", Sys.info()["sysname"], ". This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue."), + "\n**********") } } From a34762353e3e46da38567e933248dbcbf07f4b55 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 15 Jul 2020 19:01:51 -0600 Subject: [PATCH 051/588] removed old.unique.by.key option (#4602) --- NEWS.md | 2 ++ R/duplicated.R | 20 -------------------- R/onLoad.R | 5 +++-- inst/tests/tests.Rraw | 14 ++++---------- 4 files changed, 9 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index b6298b1f5f..0bfa0a0b01 100644 --- a/NEWS.md +++ b/NEWS.md @@ -176,6 +176,8 @@ unit = "s") 11. Internal function `shallow()` no longer makes a deep copy of secondary indices. 
This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. +12. The `datatable.old.unique.by.key` option has been removed as per the 4 year schedule detailed in note 10 of v1.12.4 (Oct 2019), note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). It has been generating a helpful warning for 2 years, and helpful error for 1 year. + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) diff --git a/R/duplicated.R b/R/duplicated.R index ba19dd42cd..1ae7e8a6e4 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -1,16 +1,8 @@ - -error_oldUniqueByKey = "The deprecated option 'datatable.old.unique.by.key' is being used. Please stop using it and pass 'by=key(DT)' instead for clarity. For more information please search the NEWS file for this option." -# remove this option in June 2020 (see note 10 from 1.12.4 in May 2019 which said one year from then ) - duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { if (!cedta()) return(NextMethod("duplicated")) #nocov if (!identical(incomparables, FALSE)) { .NotYetUsed("incomparables != FALSE") } - if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) { #1284 - by = key(x) - stop(error_oldUniqueByKey) - } if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0L)) # fix for bug #28 if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE") query = .duplicated.helper(x, by) @@ -39,10 +31,6 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon .NotYetUsed("incomparables != FALSE") } if (nrow(x) <= 1L) return(x) - if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) { - by = key(x) - stop(error_oldUniqueByKey) - } o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't @@ -99,10 +87,6 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon # This is just a wrapper. That being said, it should be incredibly fast on data.tables (due to data.table's fast forder) anyDuplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { if (!cedta()) return(NextMethod("anyDuplicated")) # nocov - if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) { - by = key(x) - stop(error_oldUniqueByKey) - } dups = duplicated(x, incomparables, fromLast, by, ...) if (fromLast) idx = tail(which(dups), 1L) else idx = head(which(dups), 1L) if (!length(idx)) idx=0L @@ -114,10 +98,6 @@ anyDuplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=s # we really mean `.SD` - used in a grouping operation # TODO: optimise uniqueN further with GForce. 
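# Illustrative usage sketch, not code from this patch: with the datatable.old.unique.by.key option
# removed, key-based uniqueness is requested explicitly via by=key(DT); the default remains all
# columns (by=seq_along(x)), as tests 1862.1 and 1862.2 below exercise.
#   DT = data.table(A=c(1L,1L,2L), B=3:5, key="A")
#   unique(DT)               # all columns considered: 3 rows
#   unique(DT, by=key(DT))   # only the key column: 2 rows, the old option's behaviour made explicit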
uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) { # na.rm, #1455 - if (missing(by) && is.data.table(x) && isTRUE(getOption("datatable.old.unique.by.key"))) { - by = key(x) - stop(error_oldUniqueByKey) - } if (is.null(x)) return(0L) if (!is.atomic(x) && !is.data.frame(x)) stop("x must be an atomic vector or data.frames/data.tables") diff --git a/R/onLoad.R b/R/onLoad.R index cc667e65e1..230929c4b6 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -86,8 +86,7 @@ "datatable.alloccol"="1024L", # argument 'n' of alloc.col. Over-allocate 1024 spare column slots "datatable.auto.index"="TRUE", # DT[col=="val"] to auto add index so 2nd time faster "datatable.use.index"="TRUE", # global switch to address #1422 - "datatable.prettyprint.char" = NULL, # FR #1091 - "datatable.old.unique.by.key" = "FALSE" # TODO: remove in May 2020 + "datatable.prettyprint.char" = NULL # FR #1091 ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste0("options(",i,"=",opts[i],")"))) @@ -95,6 +94,8 @@ if (!is.null(getOption("datatable.old.bywithoutby"))) warning("Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") + if (!is.null(getOption("datatable.old.unique.by.key"))) + warning("Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") # Test R behaviour that changed in v3.1 and is now depended on x = 1L:3L diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index abf5df09ed..21caa1fd26 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12101,9 +12101,7 @@ test(1861, address(unique(DT)) != address(DT), TRUE) # New warning for deprecated old behaviour option setkey(DT,A) -options(datatable.old.unique.by.key=TRUE) -test(1862.1, unique(DT), error="deprecated option") -options(datatable.old.unique.by.key=NULL) +test(1862.1, unique(DT), DT) test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A")) # fix for -ve indices issue in gmedian (2046) and gvar (2111) @@ -13289,14 +13287,10 @@ gs = groupingsets(d, j = sum(val), by = c("a", "b", "c"), test(1961, cb, gs) # coverage tests -## duplicated.R -old = options("datatable.old.unique.by.key" = TRUE) -DT = data.table(x = c(1, 1, 3, 2), key = 'x') -test(1962.001, duplicated(DT), error = 'deprecated option') -test(1962.0021, anyDuplicated(DT), error = 'deprecated option') -test(1962.0022, uniqueN(DT), error = 'deprecated option') -options(old) +# tests 1962.001 and 1962.002 were testing now removed option datatable.old.unique.by.key; see NEWS items over 4 years + +DT = data.table(x = c(1, 1, 3, 2), key = 'x') test(1962.003, duplicated(DT, fromLast = NA), error = 'must be TRUE or FALSE') test(1962.004, duplicated(DT, by = -1L), From 460b919ed97f2b04079c69a606fe76dd2cd07e1f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Jul 2020 16:39:15 -0600 Subject: [PATCH 052/588] add rmarkdown to suggests (#4605) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9c4a3c4e98..945b0accca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,7 @@ Authors@R: c( person("Kevin","Ushey", role="ctb")) Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml +Suggests: bit64, curl, R.utils, xts, nanotime, zoo, yaml, knitr, rmarkdown SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 
100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE From 4d849e03c3c0331a37ead3d9884406bf3715e716 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 17 Jul 2020 01:52:23 -0600 Subject: [PATCH 053/588] Add tz= to fread (#4608) --- NEWS.md | 5 +++-- R/fread.R | 10 ++++++++-- inst/tests/tests.Rraw | 46 ++++++++++++++++++++++++++++++----------- man/fread.Rd | 7 ++++--- src/fread.c | 7 ++++--- src/fread.h | 3 +++ src/freadR.c | 4 +++- 7 files changed, 59 insertions(+), 23 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0bfa0a0b01..9c1d3cbc43 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,8 +6,9 @@ ## POTENTIALLY BREAKING CHANGES -1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour should you need it: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. For example, calls already using `colClasses="POSIXct"` will now use the faster parser if the `Z` is present, otherwise R's `as.POSIXct` will be used as before which interprets datetimes that are missing the UTC marker to be in the local timezone. - The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, and UTC-marked datetimes. +1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then be read by `fread` as character, as before.
If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. + Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour should you need it: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. ## NEW FEATURES diff --git a/R/fread.R b/R/fread.R index 9a627613c7..95e5c4a45a 100644 --- a/R/fread.R +++ b/R/fread.R @@ -5,7 +5,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("d col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), -yaml=FALSE, autostart=NA, tmpdir=tempdir()) +yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="") { if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.") input_has_vars = length(all.vars(substitute(input)))>0L # see news for v1.11.6 @@ -267,8 +267,14 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) if (is.integer(skip)) skip = skip + n_read } warnings2errors = getOption("warn") >= 2 + stopifnot(identical(tz,"UTC") || identical(tz,"")) + if (tz=="") { + tt = Sys.getenv("TZ", unset=NA_character_) + if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local + tz="UTC" + } ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, - fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros) + fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) require_bit64_if_needed(ans) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 21caa1fd26..97af3c7201 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -40,6 +40,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { is_na = data.table:::is_na 
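# Illustrative sketch, not part of the patch: how fread()'s new tz argument resolves, mirroring the
# R-level logic added to R/fread.R above. tz="" (the default) falls back to the session timezone;
# an *unset* TZ environment variable means local time, whereas TZ="" or a UTC name means the session
# already runs in UTC, so unmarked datetimes can safely use the fast UTC parser. The helper name
# resolve_tz is hypothetical and used only for illustration.
resolve_tz = function(tz = "") {
  stopifnot(identical(tz, "UTC") || identical(tz, ""))
  if (tz == "") {
    TZ = Sys.getenv("TZ", unset = NA_character_)
    if (identical(TZ, "") || isTRUE(TZ %in% c("UTC", "GMT")))  # data.table:::is_utc() accepts a few more spellings
      tz = "UTC"
  }
  tz
}
resolve_tz()       # depends on the session's TZ environment variable
resolve_tz("UTC")  # always "UTC": unmarked datetimes are read as UTC by the native parser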
is.sorted = data.table:::is.sorted isReallyReal = data.table:::isReallyReal + is_utc = data.table:::is_utc melt.data.table = data.table:::melt.data.table # for test 1953.4 null.data.table = data.table:::null.data.table print.data.table = data.table:::print.data.table @@ -10811,11 +10812,17 @@ test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b test(1743.242, fread("a,b,c\n2,2,f", colClasses = c("integer", "integer", "factor"), drop="a"), data.table(b=2L, c=factor("f"))) ## POSIXct -test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character")), data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae")) -test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE), +tt = Sys.getenv("TZ", unset=NA) +TZnotUTC = !identical(tt,"") && !is_utc(tt) +if (TZnotUTC) { + # from v1.13.0 these tests work when running under non-UTC because they compare to as.POSIXct which reads these unmarked datetime in local + # the new tests 2150.* cover more cases + test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character")), data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae")) + test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE), ans<-data.table(c=as.POSIXct("2015-06-01 11:00:00"), d="a", e=1.5, f="M", g=9L, h=FALSE)) -test(1743.27, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character=2), drop=c("a","b"), logical01=TRUE), + test(1743.27, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character=2), drop=c("a","b"), logical01=TRUE), ans) +} ## raw same behaviour as read.csv test(1743.28, sapply(fread("a,b\n05,05", colClasses = c("raw", "integer")), class), sapply(read.csv(text ="a,b\n05,05", colClasses = c("raw", "integer")), class)) @@ -17027,15 +17034,22 @@ tmp = tempfile() ## ISO8601 format (%FT%TZ) by default fwrite(DT, tmp) test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preserving -fwrite(DT, tmp, dateTimeAs='write.csv') # writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker -test(2150.021, sapply(fread(tmp), typeof), c(dates="integer", times="character")) # as before v1.13.0, datetime with missing timezone read as character +fwrite(DT, tmp, dateTimeAs='write.csv') # as write.csv, writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker oldtz = Sys.getenv("TZ", unset=NA) +Sys.unsetenv("TZ") +test(2150.021, sapply(fread(tmp), typeof), c(dates="integer", times="character")) # as before v1.13.0, datetime with missing timezone read as character +test(2150.022, fread(tmp,tz="UTC"), DT) # user can tell fread to interpret the unmarked datetimes as UTC Sys.setenv(TZ="UTC") -# as before v1.13.0, dispatches to as.POSIXct() which interprets as local time, so we need to set TZ here to get the original UTC times from the write.csv version +test(2150.023, fread(tmp), DT) # TZ environment variable is also recognized +if (.Platform$OS.type!="windows") { + Sys.setenv(TZ="") # on Windows this unsets TZ, see ?Sys.setenv + test(2150.024, fread(tmp), DT) + # blank TZ env variable on non-Windows is recognized as UTC consistent with C and R; but R's tz= argument is the opposite and uses "" for local +}
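# A usage sketch, not part of the patch, assuming a writable tempdir() and a session whose
# timezone is not UTC. It walks through the behaviours covered by tests 2150.01-2150.022 above
# and the temporary datatable.old.fread.datetime.character option described in the NEWS item.
library(data.table)
DT  = data.table(times = as.POSIXct("2015-01-02 01:02:03", tz = "UTC"))
tmp = tempfile()
fwrite(DT, tmp)                             # default: ISO 8601 with the trailing Z (UTC-marked)
fread(tmp)                                  # read straight back as POSIXct in UTC
fwrite(DT, tmp, dateTimeAs = "write.csv")   # unmarked datetime, as write.csv would produce
fread(tmp)                                  # character column when the session is not UTC (pre-v1.13.0 behaviour)
fread(tmp, tz = "UTC")                      # tell fread the unmarked values are UTC: POSIXct again
options(datatable.old.fread.datetime.character = TRUE)
fread(tmp)                                  # temporary escape hatch: old character behaviour forced
options(datatable.old.fread.datetime.character = NULL)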
+Sys.unsetenv("TZ") tt = fread(tmp, colClasses=list(POSIXct="times")) -test(2150.022, attr(tt$times, "tzone"), "") # as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) -setattr(tt$times, "tzone", "UTC") -test(2150.023, tt, DT) +test(2150.025, attr(tt$times, "tzone"), "") # as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) +# the times will be different though here because as.POSIXct read them as local time. if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp) test(2150.03, fread(tmp), DT) @@ -17060,7 +17074,8 @@ test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01"))) test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2"), # no Z so as character as before v1.13.0 - data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2))) + if (TZnotUTC) data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2)) + else data.table(a=setattr(c(as.POSIXct("2015-01-01",tz="UTC"), as.POSIXct("2015-01-02 01:02:03",tz="UTC")),"tzone","UTC"), b=c(1.1, 1.2))) # some rows are date-only, some rows UTC-timestamp --> read the date-only in UTC too test(2150.14, fread("a,b\n2015-01-01,1.1\n2015-01-02T01:02:03Z,1.2"), data.table(a = .POSIXct(1420070400 + c(0, 90123), tz="UTC"), b = c(1.1, 1.2))) @@ -17071,8 +17086,15 @@ test(2150.16, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClass ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03"))) ans_print = capture.output(print(ans)) options(datatable.old.fread.datetime.character=NULL) -test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), +if (TZnotUTC) { + test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), ans, output=ans_print) -test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), + test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) +} else { + test(2150.19, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), + ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print) + test(2150.20, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), + ans, output=ans_print) +} options(old) diff --git a/man/fread.Rd b/man/fread.Rd index c11013b710..2dea746d84 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -24,7 +24,7 @@ data.table=getOption("datatable.fread.datatable", TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS keepLeadingZeros = getOption("datatable.keepLeadingZeros", FALSE), -yaml=FALSE, autostart=NA, tmpdir=tempdir() +yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="" ) } \arguments{ @@ -34,9 +34,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir() \item{cmd}{ A shell command that pre-processes the file; e.g. \code{fread(cmd=paste("grep",word,"filename")}. See Details. 
} \item{sep}{ The separator between columns. Defaults to the character in the set \code{[,\\t |;:]} that separates the sample of rows into the most number of lines with the same number of fields. Use \code{NULL} or \code{""} to specify no separator; i.e. each line a single character column like \code{base::readLines} does.} \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;}], other than \code{sep}, that exists inside each field outside quoted regions in the sample. NB: \code{sep2} is not yet implemented. } - \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. `nrows=0` returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. } + \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. \code{nrows=0} returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. } \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. } - \item{na.strings}{ A character vector of strings which are to be interpreted as \code{NA} values. By default, \code{",,"} for columns of all types, including type `character` is read as \code{NA} for consistency. \code{,"",} is unambiguous and read as an empty string. To read \code{,NA,} as \code{NA}, set \code{na.strings="NA"}. To read \code{,,} as blank string \code{""}, set \code{na.strings=NULL}. When they occur in the file, the strings in \code{na.strings} should not appear quoted since that is how the string literal \code{,"NA",} is distinguished from \code{,NA,}, for example, when \code{na.strings="NA"}. } + \item{na.strings}{ A character vector of strings which are to be interpreted as \code{NA} values. By default, \code{",,"} for columns of all types, including type \code{character} is read as \code{NA} for consistency. \code{,"",} is unambiguous and read as an empty string. To read \code{,NA,} as \code{NA}, set \code{na.strings="NA"}. To read \code{,,} as blank string \code{""}, set \code{na.strings=NULL}. When they occur in the file, the strings in \code{na.strings} should not appear quoted since that is how the string literal \code{,"NA",} is distinguished from \code{,NA,}, for example, when \code{na.strings="NA"}. } \item{stringsAsFactors}{ Convert all character columns to factors? } \item{verbose}{ Be chatty and report timings? } \item{skip}{ If 0 (default) start on the first line and from there finds the first row with a consistent number of columns. 
This automatically avoids irregular header information before the column names row. \code{skip>0} means ignore the first \code{skip} rows manually. \code{skip="string"} searches for \code{"string"} in the file (e.g. a substring of the column names row) and starts on that line (inspired by read.xls in package gdata). } @@ -64,6 +64,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir() \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base:tempfile]{base::tempdir}}. } + \item{tz}{ Relevant to datetime values which have no Z or UTC-offset at the end, i.e. \emph{unmarked} datetime, as written by \code{\link[utils]{write.csv}}. The default \code{tz=""} means interpret unmarked datetime in the timezone of the R session, for consistency with R's \code{as.POSIXct()} and backwards compatibility. Set \code{tz="UTC"} to read unmarked datetime in UTC. Note that \code{fwrite()} by default writes datetime in UTC including the final Z (i.e. UTC-marked datetime) and \code{fwrite}'s output will be read by \code{fread} consistently and quickly without needing to use \code{tz=} or \code{colClasses=}. If the TZ environment variable is set to \code{"UTC"} (or \code{""} on non-Windows where unset vs \code{""} is significant) then R's timezone is already UTC, the default \code{tz=""} means UTC, and unmarked datetime will be read as UTC. The TZ environment variable being unset, however, means local time, in both C and R, and is quite different from the TZ environment variable being set to \code{""} on non-Windows which means UTC not local. You can use \code{Sys.setenv(TZ="UTC")}, and \code{Sys.unsetenv("TZ")}, too, and \code{fread} will use the latest value. } } \details{ diff --git a/src/fread.c b/src/fread.c index 1dfb8ae77e..e9ae0288be 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1053,10 +1053,11 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx) } } } else { - goto fail; + if (!args.noTZasUTC) + goto fail; // if neither Z nor UTC offset is present, then it's local time and that's not directly supported yet; see news for v1.13.0 - // if local time is UTC (TZ="" or TZ=="UTC") then it's UTC though and that could be fairly easily checked here - // tz= could also be added as new argument of fread to allow user to specify datetime is UTC where the Z or offset is missing from the data + // but user can specify that the unmarked datetimes are UTC by passing tz="UTC" + // if local time is UTC (env variable TZ is "" or "UTC", not unset) then local time is UTC, and that's caught by fread at R level too } } diff --git a/src/fread.h b/src/fread.h index 1a7403868b..e5eff12263 100644 --- a/src/fread.h +++ b/src/fread.h @@ -140,6 +140,9 @@ typedef struct freadMainArgs bool logical01; bool keepLeadingZeros; + + // should datetime with no Z or UTC-offset be read as UTC?
+ bool noTZasUTC; char _padding[1]; diff --git a/src/freadR.c b/src/freadR.c index fafd8fd283..bab178dc39 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -72,7 +72,8 @@ SEXP freadR( SEXP colClassesArg, SEXP integer64Arg, SEXP encodingArg, - SEXP keepLeadingZerosArgs + SEXP keepLeadingZerosArgs, + SEXP noTZasUTC ) { verbose = LOGICAL(verboseArg)[0]; warningsAreErrors = LOGICAL(warnings2errorsArg)[0]; @@ -161,6 +162,7 @@ SEXP freadR( args.verbose = verbose; args.warningsAreErrors = warningsAreErrors; args.keepLeadingZeros = LOGICAL(keepLeadingZerosArgs)[0]; + args.noTZasUTC = LOGICAL(noTZasUTC)[0]; // === extras used for callbacks === if (!isString(integer64Arg) || LENGTH(integer64Arg)!=1) error(_("'integer64' must be a single character string")); From 1a8a39074e73a857f95f628ba0c26d820b0cee39 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 18 Jul 2020 00:21:05 +0800 Subject: [PATCH 054/588] Translations update (#4610) --- R/data.table.R | 2 +- R/xts.R | 2 +- inst/po/en@quot/LC_MESSAGES/R-data.table.mo | Bin 89020 -> 91028 bytes inst/po/en@quot/LC_MESSAGES/data.table.mo | Bin 139310 -> 139702 bytes inst/po/zh_CN/LC_MESSAGES/R-data.table.mo | Bin 87135 -> 89247 bytes inst/po/zh_CN/LC_MESSAGES/data.table.mo | Bin 138485 -> 138861 bytes inst/tests/tests.Rraw | 4 +- po/R-data.table.pot | 59 +- po/R-zh_CN.po | 119 +- po/data.table.pot | 1130 +++++++++--------- po/zh_CN.po | 1189 ++++++++++--------- 11 files changed, 1304 insertions(+), 1201 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index e91a47d861..e95420b6e0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2925,7 +2925,7 @@ isReallyReal = function(x) { RHS = eval(stub[[3L]], x, enclos) if (is.list(RHS)) RHS = as.character(RHS) # fix for #961 if (length(RHS) != 1L && !operator %chin% c("%in%", "%chin%")){ - if (length(RHS) != nrow(x)) stop("RHS of ", operator, " is length ",length(RHS)," which is not 1 or nrow (",nrow(x),"). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %in% instead.") + if (length(RHS) != nrow(x)) stop(gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x), domain="R-data.table"), domain=NA) return(NULL) # DT[colA == colB] regular element-wise vector scan } if ( mode(x[[col]]) != mode(RHS) || # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric') diff --git a/R/xts.R b/R/xts.R index 81395cefce..bfb6f813a7 100644 --- a/R/xts.R +++ b/R/xts.R @@ -7,7 +7,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { r = setDT(as.data.frame(x, row.names=NULL)) if (identical(keep.rownames, FALSE)) return(r[]) index_nm = if (is.character(keep.rownames)) keep.rownames else "index" - if (index_nm %chin% names(x)) stop(sprintf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index col name.", index_nm, index_nm)) + if (index_nm %chin% names(x)) stop(gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. 
Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm, domain="R-data.table"), domain=NA) r[, c(index_nm) := zoo::index(x)] setcolorder(r, c(index_nm, setdiff(names(r), index_nm))) # save to end to allow for key=index_nm
diff --git a/inst/po/en@quot/LC_MESSAGES/R-data.table.mo b/inst/po/en@quot/LC_MESSAGES/R-data.table.mo index 76d9bd3c920062171cb0e44495a9f7c5313a05bc..3f5d477edd3f3e64e839d2e3a02cf8e9d889874a 100644 GIT binary patch delta 15473 [base85 binary delta not reproduced] delta 13617 [base85 binary delta not reproduced]
diff --git a/inst/po/en@quot/LC_MESSAGES/data.table.mo b/inst/po/en@quot/LC_MESSAGES/data.table.mo index f88de8edf3658ac0de0066f31f1d52e78ebd041d..c89d3bf1dec90c2ef526210be91a9d37b71b8955 100644 GIT binary patch delta 20689 [base85 binary delta not reproduced]
zxobC>_s!rlEX@Z4Nze0vJ;YI@{$@Qz+BWi)NXI33;*j433n|}9$|V0fsXNz^qdR4C zR3R=>SuI;B&#(S>$Tv7CnS2B(gp^5qn>2-dY4ZYPjW4ARGhRkqio7qWj#<{9vSRYz z;U`#)n|_Kv5X5IYNI3)tNI~S!k&cqzM0$}llGL0s9T$*x zlYB`HDJw(rAvuW)NLNUWiT&_noJtx>elyB37pr5K85{jyn}Q9b=^_aeV)=QPC@lU28g@XE|rsU;_A@wnD z*o?9X#KW;2DUJL*Od-u7{|dfG8cVuPEJq;uE2Np^dz#mg@?S`6D3@b|h1n9W^2VRa zUNZASD5$Ldp}N(P zGSixzqEEHi9dUY!Emco->ehq=J;s@kmgq5|Bn{;a|2&2rH_o~TC=Rl4&xExooRg7;Y=kZC-Nv6*((>;b8%N&vP~bFWOGCf zlbffeB|B)0)1gnz&||Hs*3eYz)C60L6k6+;W{*qb?39>fds3>LkYr7f=hj1}Ik`e& zl5~@Lvcx2Nf-N~zA23bNaHi3PHY%QI5H}cOcLc>cXBquB*zQP657v_sY*spU8lR<0 z``J_br%k0*DQPzM{ix?lXP=Jl^Ztcf;vvv{j-d2_rY+;dWoTI$%6 zt*P{UL|1dL=ntlaJvjU2RZU#M`qZ>k=_#ipAwy@Z?MAI!JjF&IF+$04X^A#RYOu_t zC}&El?wpota|Ejnh_@!uRg6r$%^?HK+)I@%lnYF=Cv$d;GcnPs2iij8LiOGw$2)AZ zn2XYyu!*|erDhbDlsY|Fr+V_^`k+vKnCik>5jC-sSKkwG%3~2khO@-4x72M z$>WImVN8!uXF_afn#2BVnk{r{M(7M%_TZUKEbX(MGlR;_uw?}H9vS4`ItB3id*QHCamBqckivjRiu`qpaJwtjp%&9X96X4z?6 zWD>K{=pAc{(S?b|dQ3@6N^%Azr-tgijJ{0{kyWC4fsr%LNlT*4mLh$dJv?TtOOIp% zBr`u|GPM%)44dpNj9{YGk)bEsWVsu$Y!@s|tI^lAlMzgmi6)N}tlLvrg{k&LW-?>M zoUtV{1CyL~b_yqRGBw_6>?^F$cxCx{d3R=dUtQ)|RG_yo_ut!@^B(QGo+As3cN{1_ ze75jJfp4gQ$i!&E^ zwk+~w<*ECPpzxqc@OO>KD5WPdTHT_%;Lod3s0=@6s)B!MJta;k31R+o<=t|Lfv!H z?a4YqKhY}B=9lP)f3{*>?)}eZTs(i1H$9t)|(JpGdkbFJC@m z?$$E6S*r>QcBy@vD{JnaH?5_!4}0|Not{AVcOF0AioLC}ZEK7{cH7A{+*N*lr<6PG zw`0ck9r_?jw(rabjoiB)yi%#Ucm0tkx?5g+TK1g#jq+2=T8>(bDP^Y>x@U!3w#3lT z(1|j`_(;(~`X#gQ#D4YAoFt8VH1oXKD+ zasgM{?{OLZ$#vDt^e$Q9&0WH@;kg+NrVXFxvL$Ob7U$(j->u!oWj*IM6dyj~%|GQ? z`qJZ8mSpC5vhuw-$7#)ChQs_!?h)35KEC(J|J>|#&%%v`t~0s+`I5MEdI@)zQCYac zdvLov@V4E>d5662b>0nmo>hCihjtciKP=}MliYY6%uufbdD=D0pL`#%qKa4Md$QK6 z#;ZorxvUOad)p^-UQV;}mhAk1@|L$sHDi0H&XeW#E?HK1cCQ{7-G4-oUb60-r(h4W zm>oY@&v|*UrJ)ZSe6=s9&%0dKHa%6;Gk2N(pS$`$ zXKp^me3WiFd7NoQE4+(W7oIvOGnBUJs-@Y1zx6U^rW6&PSgO`~(dHAL1#ZvbCB-`z zl;j*P*|C97d6({HV_@;hvOIaL@Z_?>lMA?hNp2?Fjd%GX`7Ea~a{upE@xsuB!%C|;Xru+E)+m4?7bxEyy l|5n0-tY9mZQ5X`L|3Mc^;^Qqozr#RFickK#gDo4%{SO=@5I6t; delta 13621 zcma*scYICv|G@FjNeGe<39-eE5E4lQi5(gCRx7ndTuF$83|lX?_eeFAmfEF8dl7q7 z(J!rCRcbZ~p{kUc)!*xVKSzIl-@kt6(RZKk_vf5@#%G*+v2su08ygGTu6q?)qS%@{ zDOCx>yp$^JtW<|;(yG+JP^AJf4(sDw)1&A@{SphKQHQ`H*c6LlF>Hk;unUIZ0IZJl z(O)T>+Ch>*!&R&;2iDb2!m`xc(FeanKA`Sk87xsxsnJ*+=i{f?3xmQ4oQ`g2$BK9e zOW{?lj1RFY*H^{E^^=F9FFQKoX-vf~*o?=ni3?C}Xg9u(XR#3mG*F68Q1LjEF8Byr zQ&(mu&!nsK@_7Y3mpbuW}BnTTa@ z8wTTX^uQ-r91BG&H4DA42ChKq>Tj_j-o}ntF-G4=QVjj?OT%;;Wb8MhobWQr7~aM5 z=-phm*G4OKq*<@S8Pq$`z=#%laz)}IYAZ_nGc1HJuqeJknNvkt(*JS+|CV}kL?OMU zx}y^=MtWH-!2x&-IVG37tZlil5jd7bu`)iaN8P`s!@i+m; z;d2{_7fCt;EeA|TS%x#Q1Fl2qy2mJ8T8l>6-whcpH3BWT9hq0^9QvUf{VY#f8-1}M zGCfppjK^#|j<#1M&LrP-Qff0EM^=(b>#S$}dJLdGWBLNUsr|a>8*YR$1WoV&9!1$d znVDGkiP-qgFBQgx}{$3&ciOkni{ zM`OM2`of!$PE|)xdc?hl9=cW-OuZhZCoW1v z;dmA0Nj>}P6Sl(P)SHpsR^bQT%ovQMUWantJX1@YUR`~#IPJ^g z=>IV!8)#tOC=31Qi6c?&d`y6F%L|bio-Hx{^RgH!%C_Qo>Z{bxOjR%J4RS=$_m(2idK)ZjU{!;3Xnbgy8 z0=~kAI3`Koz*or1R3}k-tn5&IgF|g3!8B~gc6c3qFkqNow{0+xdMH-GCD;OYVp)8N za)VyUdfm@PInP$~#TV$06&W8nPYg!ja8ui6k{B8aFc^bU^a*-ns1sv|GUk^?@bbXG zk=kh}Lv<4i;bW8`dWJG|wbFEt)x}imz9>WdJ$As#=^P{Le<+EJ(Z^T>w_yP0=mXR( zvo4DJtv$wSdNe7e<64c zmC`4ji~pd^weJ%h4tp><1$e8Z^s^Z4`pbh81FXNX0A;iu!sif(o^V)f1o?QLRnr# z=P|B02?x*LFN%vD6h7>Diu!Nz_Nt3xgNy7giMhL7ikH;e4uh zmbNcepze=_aV%Cr8wTS_biyA{p5QD7VEK<&V%Qv=aTNyPT9mnP4#)7pn;5_adu8+c zk9OOXrTPa&`(=6z6H&S>8<|h)9G1Z@44e!_0{Y+v49DYG3twX?thrqGU{jPEYmM^x zktjDZ6K`WS)|B->ZiOD}H5kZ_A5kWY%*f(U4x6DrN>5qQ9}`ecJRgVQI;@SJpDDEq zqfwUMbCmNWuGBp{3YlhV5vFr}RbiEWtcIaP6 zO8>W|p(_pLa0Rx&ofv~Ju@y#a(|?$pitf}4QRc{}C?~#v4bXkNwgncVo`7|6GRlMO zKv(?9^tO#emfHjL#TsAfU#o3VF7yR; zG#VKd)dvkM%9rX?Hs7P4 
zbR&kce+L%FJgkp@Vq*;4t5=N`n^O-%nOobiq^$odBxPy1kIXCOyic$5Z77rTTlB({ zC|!LGrK|6t48c8=x$_tU&~3kdp@m}vbppzT*P$F=fWz?-hH`y1;DGLuh3LT(UPTM- z_YdkGa6hCC!z#4*MS0>0*a5eo+~`ZJj_R;J-ruwnN{@^}nXFr|5$2=Ko5XTNk70cr zPu&|WcnMSS8CJuDZ}hC6i_+EKp!CF9T!j}tF_2a6S6s35>%6td8xE>$x-%J*YoJIc_U5+01W$*Rztaa!L%D&iSP^%lEUSxH1#hB^wfc{~p+NMZY>zyg8j9_) z_DQ`e#-n37qYS|=tbq@0BsEFOp5g-rOADi@=YFq;(J? zcogLZE1%Xs(<87G^-@g1JElF)=v8DpMlzEffoJs-*>NfLHRN5PMxE1L7;s+qM0?Dn zJpt3@IFv4a?}EPJbd)+9rK|T~3p|5fSo)&wks8Pl*;Ee_*;aD0!KpDy`lvS}GyW$~~9`^cKF38^p;8Kjb!erSvNb zEPa(bXAVT-4C<6$lp2VCAMSbPpB1t^b7Mj`eB(24xPs zK>2*dU-c@9!3xyzC=c|(uk?R5$s`(fV7XlV1V5m3bsl=-Ba|m7lBZ7`jwPsLu@MeI z>A_W41y5lF-p6_vkgr!yKO8|l8yn!Oe4GAFR_~4;(_`3;9hWg2gYN2OGze=@A3!;A z9tNOafqvqSSc`fp%9C$5J&(1ipI{hPyQgoYm+5pHiIp98Y=hk|&bmek8oPJ9Cg zpxbZyk5I#~JGILL{X?Y>MpA!<(lgg_0Y1R0IPrJ=0$PRX)LXDP+I%1CC$Eh%TVrq* zEhENhS?9 zusZhmQ-58KM|tu?ruQ+By6h7@Ib(1eeaUE8( zTwoBk#d+xK#IIcJME%0aP<^nCv*GysT9l!Kql~eKtL}kMu?Y2dSP{>n^w@70j)885 zV`vAOF0qkFSD!@bk^)S_*BFnf?uI&o7jQf-;U}5_)yOQ$KykQ_sKAP-N&@$Fg(rRQx?BqK>$o9;pB%35CfLg6R}4neulaP-ExI2l)C zHdZTTI9@FKO+!l?j`zV@Y{ZRT#OoYaqKx7A*RcoKm+PzWvign|qjcRa?12Z-ilxgL zj<4BqIDvW&{(?ozGX$82GAAM`7>?OJ73D>A1nc4>T!_^w8jin!+=ZL)4hkjRrSM;Yt8C_VBJOJa3@Lk(m-*T;rjF`FOv zTWLSb-z3Qk$`)ui-uX*0hWZ2!Lf2Y`lnYKpUp$QRg1cRd^&db|Hb~F@Xe>?L z9V=iGmcpqhH?kD@{!m|F6vdto=}WsvpYd2B4hZ7Q*_E zBl(1eXnc(_$(q$R9RHNE4?UL^IX0>5|DPlV_Ld5tLj19#_)-Lp{Usx7owP^qX!U)}g%s<=tN~ z+;Dt3g`0QMJ1dFr8WCev!%jQdc=D4~h& zvJX+7XaP#Q9i8zsHpBC1LC>bTXX>KNiGdi6vv3HWGTZA$7;25I|2`x#1plBcpUTbj zw^d`5G5i?ieXtp266T`Z$WxT%Sw2$V_!w+My$I!kSFkr$iZUEOW=Erp{S=JEg;6U|-*^DZg{EO&T#WLq_z&u(Ya-)dPj$di7J(xv;c6+T1hnP%CyY0^?8eS(OvJ4G87$A=GaK=jJDMz z@{{W^%JO@T{^zkZmpd|G_nQ3KNKu zocS|cPMd7<9(bQT+EMYFfP9u?AM39WiFAc*gUID3{IRv8g+IBF`w$fgS*9`!JqdZr zx7$!NNx|_P$JdBr@~g(&hzxHixoo%ao|(%-PnGr0n^9Hcz*KUXbhW4_5fx~2Cgh)C zr&HI#TkMxDntTXtvc=(LVlVk%LcTFq5XT7FdJ%6N8E;jND^5i4dD-OpY8Vakh)<+o zn@YYI%X84TxR6>lSMx)X2N7G$cBx-VPB41vIjK9x@(a%K$9h(+T20KQ{RqlCrajR? 
zt}nB41Mzk%#Q`%YsuF7n-pY>cCI25x{yA}8H~stnc-ndsdw(9cg-n;&Sxjx6$V#3Xh{5(minvU98snc=eCC*;-qGwqX!gXHoCl&uK$ zI$TIBB8CxFIqzQDmyl1vAF&MKxc+e(l8L{m4-*q5sH+oK$bZ9KDC>F~`BOqR`C68( zfH*|kZM-H8?`*MVzKH!U#6QGI;_a68Tg{@5IUp~Ee&&wTDBon?B46wOZez`?G;JTh z)6N&6f*$hsax4)g{goHRzF|#0$QI)OH%=WwHmy%1+|#7Q;co zL_Olaw=|B+Hjn)Q<;5j0msvzJjxS|?rVDi(AzK%Y8-hMW1IK=z|0{~6?C49}B!5MG zNWL9qvyi_bniDR%N&P|o0g*#E5f#k#!sN1T)~I6CkIC2L_k>)tmbuTDq6*(Zj#uy= z8lRKbBV?<{&LF~_T)v!|(B2j=nftqw%QjBq_(w)bJ@`xk$7GZ9HSmA766||owsbYM z4Ipu0$6XHohA1J@c7Tv~zidlrTd7mWpXL8U{VQ>UkgW=zm#rJ|HPMKAm$^^c1IhVL zQzr-~^Vp{Nos7RL2Uv($cAmuH#8z@&97Aj&E>g=j0iWV8#0>I3iI2$(n30WP5vk^$`c1p&^;5L+;JVH(*!u!=Kas zf;dI}wb}NgX(w#XzJX?4g}jW}Bc$q2`QJNR4~~;R6ZV(!529#5u^C^#bI=&-nZ&QO zxpA`7xQe(+TO?74+>`bN_;w4X-HT!yVX*Jq+Z||sCx4q3a{BlA-!^HhaiTBHre!#a z`a|^Mq~(Yy=Dw!1uOeO%Nknt@?ZjS$Y`Mf|mu()G_ybfkoaq z@Duh`qPAlc`$iF!$=^fSo)Nu?eZ*x#w)^a_s8e;6{0R9&lEP8G(% zf4ptQJoBJR*q8c0L<;!>qCfd{3@7f8zui))Wm`l1P825^60&{4z9+=*L{;J(vDLAM zTkJ+5+cn&Y9Wf896F(6{&Hbb*#>~A;hhrc&pTxdMvnNK=)|x0yJ)3q<^5xjj+)wKG z8Gi`xNf$*B-x1R|@B*=tsL8%nNTar@JGpuog?YCQ9 zH|(d{JT$Vtzv*hYX5E{aZr|H(inHCX^O!=;acS19OWj&zE$`MSt6z5?d&BNU-R-Rh zK6Y-Ak(iWj;lDUb$GG?)%ZOxaT)Nd_O-av8vs%WcW?J~DB{MxKCBf3QSwy{%u#iy4 zEhWZ{vRa0uCMTzkwq_Nc{k}bX@MY)1NhyP^W3oIFTbNmo#Ok3Dt)e@0YdO&Iskp&| zxnqv843A66j7v@)Yl-99DXAHW)-+3eYI5fA6#MeTIQJ}TW-WVz%nyvL{LxG7-eb!; z+jmcB=AHAMA0N5t(Q#=ha$Cc#>FIF^R!e$fYG(3a3+Kw}*Qr8Sh^6fiOHzhq zaH=)^-{%^Vlx*cuTPJ0-$sELEr)OG2Tm%2z?7il^a7w6=G{jOf*_x7&kyta;(lnUI zOdD)X8&$JK_n<*z0|EjpAt50(q}5^_G9)QJ$(j;BmdCQBrf?S-))7IL^i)URIXWp- z9&xlaIoZN_xjS~IWmpG?*gMYeU&TdV$G%}pcPC?P*0RmX_O;t@yW8g<*zc6pZi1V= z>EZew_V#D@xCWW?A#d^A|D6w(f++{`4oxfgd|&?3ZI;1t8F3*Qaf6brcH` z#>65rNq(L5q5Z+NZH9frjVv&gWZm4}F^d;l|tphwgl|@y;hxyWW{KGk?XG=Dgsj zyoGb~vv(Ct-kEoBQ|{jB`7;;g?)#Dx54?w0j=i)QEVnwdX&8~2m9{=@vwR^+dkr5`tc!NT0VYwzq^_U>%quy^K5PLQv$ z*~v!V=q~wx&YU1i?!I-o2M=+Nxw|LjZ&{bSdrSVDbzJ%G!6|v`4$xKkOJ?4gKjYsX z)wh;=VAb72OY>K3%AdbDZ|?&3(vrK!bLHLnIe7=O>F~S-OABT!NX@zAXRPrlR1w1oWzX?4D(ATLFqRehAA-leF8}}l diff --git a/inst/po/zh_CN/LC_MESSAGES/data.table.mo b/inst/po/zh_CN/LC_MESSAGES/data.table.mo index c5d63fb7d333e36e882685abafccd4c609d72aa0..d8636f5bbc844d58d252de5f86f2b367fce443ad 100644 GIT binary patch delta 20536 zcma*u2YgT0|M>BHBlea>1Rol)VE4p!h;|dwM(uY_SOZ6b3AD+h`8RPeZy#nVJJQG8P>;dQ7*u>kJ1kv66i`M2;0{UVm z`ePA}ei~b#jCH<#hLH;^V=7j|0L;QX5@+CxV@-6&7FY#);UJvSpZ;h57&i&XiTwts z12#vQY(dxv!_W=apbPHCs+f#2S8icxe1MFCk!v6a$M;d{f5J6*8@J#j&b}Ng4yOMH zlR3zr(O7DT>hgIgS9%!BW124K2~}NM3nktIKg4*fi;q#dywXs^Xo~Gn=E`)ehkI}$ zK0q1LP)C^R`Y|YzYadG2-bHT4Fv8X39Egu8&PUE+OtPzXx3C4}H)zAaVd_E#V{^(* zEQgn{JU&Aia*yGL!E`emO~^=BF2=Ta2tCmpp)RbXb^ywlt=DEF*J2EcP=1B3l&i4j z>R=7rgF~>I$?%O*7u-Bj^~fMRDeHfs4)kO(+t^_mN>^<}nf+I>A?A-(ch(YR{f@r<{WPF=ttl(CzRzPMR?PS-!d%EUd#tDy`;xq%3jJ9laiq3nMbyJOx7^gk=Y=u1XS z!NKUk6qPO7&RA%MVKl+M7=+7Ej&l`zVYiu_uo#)mGkLS&er$x_BJ;(_JBta4<5BMDFRX+C zv(*D;D9TXXMd?znIchRSpf}~&SO+(tw7ZNl7Yfc*=V{^~(}6%7O2bqP#K+hiYt2*7 z`WR#-8C$Ri7N4&sQ#jV9yawgK7qAw-M%lmm0un}Iaa@JN@G~rdjuH!17x|)e(GZm7 zu?gGa57-_{EmG}<;x5W-Q7)*#gfEbVVYFJO zdgur?qn!H_!#E;wlpD#`PFrsnjGmD@Nexi~*5>@iaWXye3AV;I85Kh5JJc%+_>TFhqp(C8YGBVxpPn1d6VT+n1lThyT zJ_cZ|t!nZ`VROpIaR?gQ)PpA!J5pSLbgFR$Yhm1WDVe^Xs<+W{ zlso+e2ch3*YMIQz4=I0%y)j^iI?*x=rhFS^+1B2v?l=NlQ{Ih(@i|J5_4-_`hD01d z`RmW=f7!A8E_Da}um$CrC>L@Ht6=WkYK&{*6w2du`2_}1ZvKUOX-&r_ln-Dle1_Gq z!5-D+Ls1?)PV9tP4l*;z)KBKIiHC6pw%V(Dk{`kpxzxFP#!>|usSB845{N=G7ZTTI;h5|HOdvnqYclZTE};G~cJF>(rzuvWcxvv|G_4d>wcr&4biw( z+F@&KdWQa&E1OE@Tl@)|;^wo45si0I?kM=2`o{YZgDKy^p6GvGUC2yqL-{Ip$FdjH z>>rIEP)};yob_bQPw=RZgRJ{9S39GTMQ$2=0tDG?su5!SPP5c*Z31&Mj5IdchwC! 
zo|0)spvXNni#uU+%JVP;&!UW3x%>Rg!U)X6iQ1>}5TV>HonJDPpI}AG<^%OH>Wgxs zE?Otnqm0$_*a!;=p>p(p#ACH}$EN zgqJuLL6dJwh5K+4fr2X|r+-a$uu zGL`;P6;UWBT90+{Ec)O-x?K8ib)ntRgZKwn857VQ51{OS0i*CGPDT4`{UpSelsy?w zx#2!<=zqDA=>+6}r_mE{Vk!IsrR$3Pqkg&6N4bOk*b^6_w7Z9Lz}F}jR5@GydqFoW zOF0Z(aGEa9#5M~&L)F(rSIV8R0tTZ)?mUi6MNC5J!jrneLzEL2@l<1NLz&efC|x=anWqF~vb@1cSgxdLt=raEl5!Zz*(f)%73B`kpLi#QN7v1WO7pAAe`nbyZ;9h3_j zk6yS2WmWCPFua4@o5N^b+2on(Bhipv>xDa3GeeWm+$_$tV}@#G;s@cZ-yT);r>H0@SwPW(K|c?va9v%hQu*1vRVV*>J4 z8;WOfDas@q*ig-l@yKrrV9Y1y7VaDTVU73`BSAf}L?F4#QoT4?UZj#!f7UiFg4M(B90n zeyY8|0hBugsJGu1lsR_MK}Od9QH<$4Wex#n%D9$b!!WaNP`0UzQzlqcA{R;tU=Q119Ilq+r* zq#jfe*qHKal=zP*v;2kDyS2LWK$JevMagGp_BThAgP7 z8p;pRhw~eo$t2?mlqcD^Zl?9mkBn(2lkDX+sa%!&N{)myak0MnRG{1wV%oixz2{(*AcKnKf&hD`>k z&*5%^RaYHFxzk)j)Y#WU>H3~1%P0or#A~rS{(>?jUZLv4qzlSq-i^iZ1!4i7LX>Czc9gf^36wGa6Xi8r$F3ZR za$$2&>W`va*h`d0xg%hhx}zC5oWLIJh~C3h57<#w!3vZ~b`a%`e@A(=mL8$TelW@% z&P5s9btv!sCn(D`e}uK(jSeU`Gz*!m4z7*A2Iu=+?|i)hFGSZ$PPFfEMPxSaB> zNIt8yi^iMQKfx|T zd3Jk`RXwl=n^Jy*GHV-;Q!k$-D9`@yP~P(m#;YgWGL*@D6T{Gdg6i4j*n#p_C|zE7 zqMGC(*uh%=WOx)Bk5E1=s(+xm@B>Vsd;#0xpbymrZNVLs@8J|&JV{w(vg)CQD95Wf zMa_*NC{NNvl%aWmA=re;?~uu`n2gNk%h(kwOjRF|V^G%hUX*2bOP5Q0q)yNb>CV?H0=9tU6si(0Wa0{Yw|_BOsH@ZHDTB_9#27K=~DW59LAv zW}4Q&ehtU*lrN!7rcSfe{*zJ4dvFk@p*+fiW~(8cg7W=v2IaVU=WxgJ$Q(LHJLqYl><XZJx!d%j=@_On;Ofyo3|6T!Lx+ zQ|v07K-qnXI*wx@nXv?J;baV1sv7)&viv$Z)pKF1w&*g|BjZu-AQf%+9Hl2cm#bMH zjB@8wP}YAc$_>52!Ps$ywV%WIl#Cqc9LgNXm8fQWBa{z@sVI~0435M%D0e<$rE0ev z<=K7{W!8tTQa!N_W$aI(JXg%s>c*O&^msD1mi2#=jJzf*e5@u#ca+(@17))KtWocT zekf!53Cbh)50nQ=wYBO5T~RK0wk}^pSxx!Zsro)BlX3~lBz=t5|Nd9u6ZJaniPD7* zlsi6%(q+G)EUPB#ReY#+1PK-Ne$5klm|{G%H*rHK|Q!ep_G3>hule# zjp_;42Ia}N7UfP|H>nHii?RxwXiYklA?Qp-wak<)yX{U0#EakI19jZ&0f3DtUXHX_f;a$qMS_irlKY+4)E}+bf zw!77Z&%mE~`TT;n@$??G{I>2@bLtt&7gN`L>Z|$#lu4N2(3wLhW1fcPu;hOA)@p?F zDQ`gOiJB><^-rs-Fp_fK18Q!>qYTX&tcqt)hVT!pg&hv6p_+&TDDOj=jE)kA)K91} zIF!J2?1>q;3Y#2OPqrWNEAFho5%t^c+f?<7Cky2ZC-JEIMB9dPoYBYB2g)UEP5D=p z8}vV}=9C@zHgp)Dkm*H*>zC?ZJp188%0o_=#&+z?SzqG|lrDSrwX)1fHP-F4^RPDc z7qATegYuy9Ii-HkOu|y!z)!fD_~X;kbFBXr-R&qkLYWg@ z=hU49V_C|}QI^|Yl)3Q+OJIrf>dt+&fjE!&`#2l#;anVff%9;F;}0_Har(EaVWscX z2gxp6L%jY)^-Jdmv<~#Ws=tTjsUPu!dJfFf9>8hDGf>_E5tr11Y9Gp^y^3<-oS9o;BTK$$BGt~=CZNhKgXa0BIzj2r3>>!FNkJWBm+EQdRE{5<+l zzKt&U8l_z}p2tc&6y(mIpe)BSH`R$_Q7&+!gN*g|LwWY+x}}C70P9d5iS2MLcEofY zuYFtX7lx&Ye~fZRDJUQ2? zPc65BxR~;6lmmF(S3gjK(3kQKloS7eh0rxkeQ*@VDwG$Xyd(CZ%(0(U++pNTS6y8P zi&7Db!8i?D<0Z_6B{S5V@Ije8bx`KQD3rOe5M^0zMjL*OGFj76X1(o!dR`2|Zj_VI zdjCHm(~>}gO!Xw3fYM+d%H+9@-LZ0(8ncO5l5!HtqxKL=SEr- t^EV*7i} zlsg{wP<>}OQHJsaUX%4-_*eDi^FZtKo4WGuC{MfvC_}Ly<$eD<$^m_TS6$ixWf_mg zdbkYN;~5C9o{BPt*HPw- z`Be2tX_WdPl%X1evI@dcE_g1=aZ^x+>?iH3r>uWDvG+6eKCg~}lp|2q|2CB8!%?h- z*HC&Q{~zi=Em2lU7|I2$KpBGFC_V8L7Qlkf)g<&pS$55}!=F3U2^JHOb^a+zSEgb- z-bHzo_I{z3*)^;{`8k%sk}uT-HNZ&9(Krro=y;D;>Ui_81@X^OR>>oj9xLzoQ~g*B z#6ko%pnP`k!WmfbFZDW2z*x%9Fa(GFtuE*=+9+SayZ8#18ux2B7Wi!}sG zPxsa=i)wMR12T>qOSH+_Zm!N#4?m-!< zQn_8MITCax5-qo}V(>?`tjNh#!8l5#frfT`tEr`vC(?4|Qk=UAr;@(7iQ z_brK@Gwzc%k>}xO{jZ>8jbBMl;+;u9NPw*x^+zeoHl7qp`ij_h#9ormj~yw?He0uo zd_@&=7zK6ZdsHTnSXl;7WF`zBl{A#)BHJvzb0g|fb^ej6vi^GL*kc`=tdA?te%ZpP zU!r5%blVUY#{UO`S4r|#lWnSQlslIy&(mdVlBkgNpVIj2gUACYRmbz`wxRel^@DYr zDU_MtZ?~g5_nSQb%Tu`yPb16T+T8goFQFe*(fWIdzZ0kr(QVRoU4E?#7yJwPcusVO zHcdz+bz9xUctzb(QkK3LM-?*VbYU5d<;7sf-*xPBd`Zg1!LL!bp1RwVSCaflvhBb_ zD*N_tePZG7oLt_7@=hs9d>NKTFBe&VY|Tl$C5<+u(xf?*d(l9)J9tuMt-s~?J4e^? 
z5Hs57yi^V({~D{~c+zpweW_skmbUMcZjwHtJk*M+^Bb}Z+f#88=i@P}lAqq>e3!sh_%rDr(q7svL|1xJzN&|ltot8Cg}x4hPXMD5 z=>X{)>I_ zY{&2Q&KI@v`VAy6FQLBFpThwp9_Q8;Mcp%%H3D@k8-LOH#@df@g3j;b`@?$4)Fr^j zitz!-mwXBbT2K0dvTX7KNhJRXb$^iZSW)#~#+IO*v#p{{DZS4k;=@w}vF zq~gR!5nIgnhw(mzB+@@rbihtjR3I-KAG5{{%tKtZWQFy&E`NIyUxTT-{X5l0T(GzM+#-%kL4Bn4VjDRj(_SS48eyw&(A>XmD->BceiP}Tnzb0wtjF-r$Ix?&z=5F z-5gse@vUXdBy}Lafz+1Twv_)uJLx#Fdnn&sIUC=1#y(;}*ot6vXbH3NEfM> zEmnJ1`$#)V7IzhPk}u3)oJ+;`q%q_lqrBX^;w_wsLrEvdXOgQ$Ajc>3C6Wuj^`I3+g)Q@|VUA|7++&Q8n4Af9g}@OAblxQ^&cIK(Y{YbDe`}lHj}?j8bsa)Wy?@le@Aj6 zH@#mWee!pzt$$pQ@)+Hw7Hu|C{u_Hrd(Lld;o4XFAfIY8Fj?>HiZyip0&zFpM(=9< z>vBm7?P&89>r;OL!R< zGbH(KRfbqz>d)g?Jfz$DU^KCuZ3X2X6n0?|{1E5KcWm8_k zfzn9Lblq_BDb%&nCy2#C)U6{{16_5zH-197moCfhHrmMc6?Tz#an9C*KyL~=ahBfU zEPhM*0ZF#yl+&<0<=S|O@&L^`<^p7hRUh{W^A2$9yRNt@riAvbz2`I%W}AOnOS= zC%6p#b%ST*lXX6ty03Nq3-Yd{)1-Z*yTtQiH`>o8e?_+=WOOJ0khpB|Qb){|tLOxi`Ij&?FP3gvOJbVHvYUTdVzL7aaCnHi*&q#dN=q#sF{Bv-Di zB&iuGm=sQ$Kw3uHO*&7yL;8c1uMI1N)QA*98bO*wN+4|_9Vh)r$|Sue72~B+y)EP4 zf=mc$1Zff}fwYNqfb=6Nlk}QYl0mFTYEKF$O(iWOZ6h5aT_mNEPF-c`E--6 zD7zh{u8ebMd#}J>$=P&m&gGRVp#A|+_ROpI=tWHGWy|T7F$J8zwD$J=ges=qe|jgq zfYZB;x1$SB?L4}w1qC`|9iC+b)BcK zn@F8}w@Q1PiaD}V2v+@XFx%v%n$u98sd54Jh~>1)iNz7CtM@Ig&86EPqE1dETTZ{H z5|eR}O=dxM_VDcN;zeDov6aRptYCI_En@k}%hsJF7ciGtZY!dGi%BfFn2Yt$mvDXE zZZ)yyx?RqRZ#W0GE$x@Txcc{FsgxFSgfL>#PqI}bN$d+}QrprlWlK1Zw)OVtRYLva zuw0%DiZnRn{H<;2+}YF=aXxA56%gb`jyl=OkfcpCvHx8W4r0{rK+5E)tWAk&Ty()eV7s3+FsQR_fS&VmUu3D(G1G zGA?hw@#JYIT^LNPvNN=;SJe;6$vl-U=XMVhla{izAjuwi$|eu(ct5}A;MOfe_`g=2 z+YAnA-=kYdNar>-n=QFPNPy?08HIyvFW zwy3dj@wTCMTc|BAB6@hF-4;DIYN$Q7x-B{;x<=QQ)or2C;kLj4|K!E?%Rb3lriHpE z=Sy_S+t3zekBYU2O$>{)H?hUVjExPm_lysXjpsbN*x4Q}g-Cn!@c0o`R1-gE_cd-c z{{iKPVzUNf*Ut(}k)7^O_I$uEe*wK*@VWFI;rY&Ry9c>@a z@xy8M&KShUMuf%NM%!bf#>R)nN5n+iVusoLn?xti`2Kidhdp8(m&y4fqDPO7k85fh z9cd4Zvs>K|Igz2V*~i(V^$^OQ^y1hkdvtspd)R8$tXbU_784y85f{%`OMPU_a4soQ zqTvy8y6U!Ad)(MatA1Q)y;H$u)D8kz=Ex;}Ttqn>9-Ochvg(h7Xjk_DJzBZa&KEbQxYE@tCWb zH)UukbAF!02{yBktIX__xi+(K39lai_mz_}r;a(#rAYPvESr=P_06avwfHh`u~3su_}!J$8r8&E6P=tR7&iJ=55!M`BTkp zh4a5%REg2&Os~X>i_8Tn(-xVrxhuWx;q=ARxGA+f)Xm$h)sZo0MdG4GX1T;0o6M$( z)t8z5%c@mEFJvu0mVRgp{Up;hbLs-=DE}1x>&{BKQmU;qkGZ7yeQd_LJ02aG<6Eav z_@g6p*~#YXUn$&{m2^<{>)P_skp-FiXQt0Sl(}Gf`p0{1{r#)^)~ysikT|DKJFqBy z=Ki$9D`=lJdtv&fh4-gU%UH2nhG5as^h1l&kImFq%L1}ShVYz3L)BIZjSI6!r)=0| z_AcVcShP7~+5Yr7yX9&b*36^M^qHGEUgoZp^n@j{bO`UvSe%e{^yBo8_R}4-QvD)l z-L*LFz*JStS~E3$!OZl8&$6Z-&OEv#V{KB}VUD~hW5rIcJMHl1^u_bicO6VWv@9#> z5Le0NK4^c~nXzsSvxW+F(v0M788a4V9bTFF`6hkbq{A627d=S&{K4T>bV+)`!t|Yq z95H>%w0{><;>%QXa*FpcdN$?SC9`Ycl<{}XErop24lK-Awku=7N8EpA(l$C>78IRM zIq}Btrf>eYgPmCQH?#WwM`p2nDUDy6&GIIi*`|l9HKo0?&Hlx6_lOy9^L1{&eZpBZ z)x+tVTHo1cT65DcCBB$Tc)`Nz1X)S@GLshgCi)C9y%X0QHK!!rSZ(?wzFKWoOw1f? 
URxIGh713pxv-hT4;d9OWKQjkYZU6uP delta 20306 zcmajn2YgQV-~aJ*B1Y{UVqJD(?_ETU*n7`NE`&%l605lERjYzfYEz>qQHfO@Xtib+ zZI!Adq7)rk`cwX|_w_wj>%Q;*{kYHL@yYYM&iamXPNKix&&P8-I+9~~VNa;PWi78| zS$>vftq-xRD)=_`YDI!=Y6r`*aT=CE5AMf}*dDufv@92{#nN~b%i%Xz9iQSrtkTJ{ z7T`P_h50&L)_e|UO(NltKd$GeVd|JboqFUxAo`K?wYR52BGWt*`UUPfJzUvJBzldOW+ z0EeOOC<)i%LF|v6`dC(d+=1$mYfk<->iG5h8V5Lf&=X3)Jg}u0nq4?7f z`hOgW4&>D3$52;#56fbaVJ6=Y)uqEw<+Jfs+>UjzY`CfKjZMiS>05=BWo!%!?T(%2t0R&P2!Mb2#v9A$hT{mEC1vaC8-1rOq2 ztZLKctUcXOvoWS82I6V*i=2G7Xzzu4EDs5F(I(Uoe1#1$8;e_A+Z?qj;;=k!!BThz z%V8QapRDXLmX(0Ru@YXz;`kh^WARw#5w^p9ScGw}ul0Y5gx2X_sIIE;3TqhKV;4M( z<1uTTc|c7@I?K9-B{7srUFa~>P_D!p_&(}J9-!t#<#@B2BCs9#Rg&{t-;=116}fy} zz(Ca4&A^Jd#qkrT{ytWwywC(QmQ7JNFamYw%NwrP(bm?P&Vyx^Wre678{-?Op868Y;1jHk#b@&r z#E#em*I+1qfjVBm9LwsBOEEvD&EbRrBpy-F2=mNkDPddWnATxzg^lN#6HdkH$i+Xpo%cVZx(!8&*o)voXYGbcKr&NCU?<3SGzHGGQAvF1Xvj$^O^`F*IjSUUE= zZi~#M+KRQve}|fExnDD@q$#!_KL)Ge9xQ-YaRmN^1+n{L(?gy}66&IjsAX~s+hTzw zmKBEmP+hqh_v1Cx1ub66-7{2w;&SrE_)(qwE>!syCW?H#f=PzOSDNFmT}4~ke~jT& zSwE1dNe0DgxZvB8_>Sw0JolYfeuD@QgM zn{DJNNB$(%#15OxP))&}je@hCc zQ=khui9z@b!?5yxv);#{@((Z+8@*#z#Z;_HJ_T#xE$oej515`Djhe)VaUM21$cHU{ zguO7#lWgwvO;p3;hs?`nBWkh~dY6w09O?K2)+OKnuz5!;!TRL)qlPRU8)EqsGXw*% zH~H7lg*UMy`lXt;uBQtLjp1z67+*k*S>7XN5{008WFP7TS5Q4sO%6JFbxM_1o@+=3oHGeaR_!LzX2QI zeJqC+PntUlLru16*bFbBE;!Ho=0f^l6!~Qsto8qdgzm8X2WGN#LS6Y})B%!_E43bC zb8PgXnLHD*8u=}#JGzMK>gT8->HU!ziq}v*@Db`bPca-@pQ1-OzqNxzJxs#{EPmR& zK9`~HVQZ}LiMg;Syg>dW>Uc}f(*H3e zJ|$5F8-HpZ5Rur8{8`i_4E)SoNDQ_i{{eQ#?B~oQwm(iGzaE)iR=|1l1igr@$QS3V z191?J#Z**}RJut2-yt#dqIo&Azhriph`Pd~j(=ba@^vnow^tmtBtI7;@eJ;>>6Op9 z2>kX7bH@|EGIQ!Y>IRE_ZQh2V7)-wR*Yv*<9tuj}E-ZtmaTKQGBJ6U-veZ-Gpza{- zs#zU}QA1Jw8?*kqVngyXup_3T=7#^bX0q190p!P^_rg6S0w}2Zon^T(6vyB$T!=-k znboinXK-iFF_7{Z*I8P)4)bHJ8)j%iP&YCk8GdUkY7&q6!A#zr*n<3R?1i4nH_cd0 zMUG{i!EBsp`;WZy$-jMzB}u+Zn)&=5l5RdeXQAf8ZpVjMgM5PwGn8(8g?u7*z^u2K z)EuW9t|gzE=^a9k)q@G}w;CeDX1)HC_f>DX@0w4lXQ-|$dC!dbC@f2U7k0c}i~Dhd)_;!&rs5Y&qGH}d%UXe@e=!XYpmyx_t2xje zTt$A?Z)WU^{cb+HTRV=y3DmDZb@fY+%;am2P07cjhU6XW#QCkCNd#e&Kg?u|Ks8)~ zI>1>B#h3mxcitJ>l8?oDco?hUU2KlU9-BvaU(|k!Fbpp^NtK+&4pA)9q$z^iu0e+ z|HVmcpr8__Vnw`xbucRvqbD{(wOfoj;0~;em$48&L5+2uXXft&)lm7G_ycyqa>_q9 ztEmSzB7g8X{jbDb3ff@>%l3W)Mx%@TVQh?7P_w;&ZF{@4FRG{3VST)aftc0L_AWmc z>V}%3?lc0c;uLI%J5WP@-9sWP%OZOg+xs<}$KM=iAl9bA1k?j$CpN&Vs5{TcPSdd= z>OhBa0Nz0LU=!A`_8*3MaRtWUX4L-qv)kSq^fV%&9r`SP(s^3tNF@@hobN+{IF8y<~gmN@>)MgkcCy!fxzt9VOA8f?PRm?=!u(<7(8C z?FxotCccava+y0_fV%Kos4jjfx9MURD&GmqV1!dX8;_FTkJWK>9-9XifNgzAzBabNYz57ox5cjH z7hoRFZ(SuZ2{W(&4lQK5?iGwDzYNvI{)KJtGHr^QT>Vf(dI0@%?avZ9`b`w^`o%_ZgTR+u_XCl zu_fjUu)PncZkUt&T+|#{jur4=fbH?F(<>Adqu>wJq{>&!^guZrPQE2hz^$ku2rO=f zCKSt)k3ikXeAI>Q#v%9vss~#HnhQIGYJVA<;-f%*HPu+xE@7@{5Nfi`MBV8v9EQK( z>)4~D?R{;g;j82um$JQodVLpN8G)IH9b$@}cEyZ;#E!&XjLKb>%~BgGI{Q-cPQ9s5!F%S!UK(I1ejT zFk`G4dTu>AQeq=u|BYL(o;xA6&P#~syleAeGS616GNpD*;%se|efHx9!2SO(Kjlh40~ z>6z-diu|kS!U8qTY_EfZ$&bV$cp7!#-(fy{?v&@NWm~7U{wtDDm)u1yr>?b)V^Cc< z-|=nKI=_q>iYKV&K+Za5_0&eS@8B4V^~kSu@@G*uc;6{6TbKSXKtUf8>bg;=9a7PS zKjT`=ThI1>aO^}an+I4LtJXIc)B$zkIL9Q%dBg=gSq2Y7(#v&2C0vClW0Q0 z7pNh~+tD1jHkKjZ4fO~ehkBWOfa>}Noy@WujXL3MJcY+`JI?HEhN^NGGlWA?PtJvy zj61L+=eGuQwY`5X-{F|Io9+Em$pqAxeT{mn{fb&PLEX(V?T#9fd8i9`AN2@-g00oA zhjA=w-5*3fKhjapjXXV>V=AaeLJmSz%t1}MJy;9BLOr{)^)eSw2UR~1wclLqiF+{; z?cTOE6-QuSyoH(@P5YR3OR+8aAN#QW)wRKW&FgXsCXz4P&-VWI`Yx)gr}Z~?d;m2m zGq4NR9bkL^7(EM1F(-b;OO)ptWLvZG8fv)>8*F?3KsgHw)2{Ro^EF&|sK<2CmZ9eB z^fu~_1BRJ75QZ9pk*H-f8+8E(uo^x?4NcW>^Pv)sT8^hMKjs{6mSYLj>$nz1;Yf_Z z%N`QlNwkbGlW7I&3AP7oW8p~iwc8eTA)8TON+(bU{2Dd(dEDmp+t%?F)P=1_)qjq< 
zuv{Zd*LO$VkY^2vktEJw2dpvD^uT1)YS@i>C!9y!vEL~3XswBw1LILo$Tv}Ad=T}% z&l+WxZ7I~cAB?)8H&B!IoY7-t8*Q$zC7z~YG7iKRV{D6gVy(kf?IvyJ0*i^X0s>v}?*8M;(dS6@NhNXU5GdJiLUE|!m%tiN3(^z5#gV0z$VY)Zc1 z1T%ZPpk6*ZP|yA})cd{TMDv8(g_`}pVg$CGWV&`YwkLlTHQUQgHqVhLRQXYCruFYX z#e7hNpt^7wF2^77Z5;oq`CZR7)wmJWBc-RAu04dB3%{VAph45kaVB9e@}Hr`-o<3s z8oVq@n_1~F9p*d!UaMW+Hdr=pXW3KJ}YgiqeNPYuq5*3|i4iJXQ zPs1U&AN45DJKqdtd(_aaL>>2Y)a0waz&y%FE}#oE>#tEDD=#!%6^9MUr=l+KA*#Ol zB69}|QOoN#YE_ha&2)7K)XQcgY7Sk;ZWy@O9B&Nj#im{MRjq=GPC~@oI*bPa`U_7EY$x0!36B|x@}Fv4{;Kdq!%3rs=X!JnwHZN190--g=%G-{0h#&%d|wOK7wP;=`e z9L-d-{v@F;9G+w**)r5BIEUJ?%o_77ABmcTcTqhNwAPGuH`EjEHPoG*LUr-WZ zNYq>LZPX2Xhnfq4>okd3e+eWsNzzfT&GPHbSdKz1s}raP$nU5V1-xnQq#G)~7PXu{ zck*R7n8`Q*H93!9Exe0*i{J}BjyXFGwVHwItVH7U#ka&&6ugG$+rXRMwe^R}J(d7H4m^pC@H6-`23KmZ_ zle9JJH>kC!Av%kLFzXT1^~13M`44b7et~*fH8^U1weu_@p(j~|W484XcQyb6$(KH1 zezR$e`N`i#9pEt*!-4OauiSXlstG)4?rbS#Y@p_bW?s5w#fGxM8FOH>c^bd1JDk5VY>uj*aE0}+<+7fF z5_k{;@B)^_44j5Jt{Uf|i@g6g=0Yl>o|vPt6RyC5cmoUJuc$eb?OQWv8lrljH|lvX zX z3^PaiqLyhax^O9K((XsCf}c>&kD%M;&c~pRe+)y=%G8sQ_1B7ox~3O)!fn_ce?g61 z=pC~v#-JX(b5LErAHB02bs;}tC7XYU_{rRHt-I#KV=#80eldQ7SFj!)xQAN*861mGtj&4QC*mUpJ1W;W<6iSP2{ZyCchcGkv)xTvFJndN9=cSF!}ee0G9m4 zTyPEaw4tCgiD8(Cn&rP^B`ol(nJkS^J<$VIKLs^JNvKt@4t1gLqk7^#YN+!3W~_-i zad&Kn5!f6z{pPIy8x-gP@d#^T{@+axv_u_f5^6QPfyMDWs_So~x-!or^A8NIP?M}P z>VhXYZa^LX3~D*wL_HCIf8;SA48{I1kJKfo1LXhHEVss}M{92^g_BT2vl^rE0#3lD zk4^bO)PaA(mYDa6StVUjJvS30a0BK>KhIz0J3JrGp8;!Y`rPWq)QKH07}~`N61Fu?IC7@8AdMsrTHr{vh!OYRrG;`KALD zw*9PrSPQkoLe%Q`61A+#`uTasFcMY274^2ej~be3S^O;Z+*Z_(oxv{ngOji6@2BU5 z$7)YP&*D?4XZB-MSLe>^=Y23$L3Q0Qbm27A*X;&WS7*)U=N;Q%)P=OhQMe3kjL{Fa z%gD7`lTbaT$8!Sn&!We?&vZ57scD@hj?vJ!F-5%pCBqlK0(^yfJtMTWB0AD02eE_n zQR-OH)-TA4v;2wa1fK)mkJ$ZKh|pGE2fs(dUc@YdVg6_9LOPx@ZNu;|UdKUBncnTq zoc8;iGPO-5y@T@Fq^CLcJd>=xluuOywko7AqGvP}mq>I~10086Qn`<`y74oD=d^db zO8MK=pCtFXQzjZw-igq5mY7Hz-!`3uo|ffaC{tZ7B82CU<idO`!=_OX*rOAKjR$nzJu2JG4tUqatDn|yVrWf{uXP?3S35GjNf+*#@eVoUPB zJ8heiXL(rl39U7)yZ?~p1!HX{o{+DC#|bvQq|BzLdlb=v_}n>I4?Je7y+42BCy&v8 zwrElX$!{QHo$|g|$w`l+K8|BFB;OUslke@+@vyLVYmiz~xZWuwX$>a^Q<J82k7 zKBselCU}LI=-ZL{JH%MZ`rs)dYZiX6%_Nr5E*AZ$*Y+m)@uV9Qbx6OgJmKyCffQaR zexczsLeE4!|ISmUZ7L154IzFboj|-v-5BymP#<+Soi@82Qz?6K>p;6v#5U>=Ic;+J z{d50%x|{zVPCG4k8hpeKFK$yPzd`VZ@ot|xl~c%{pnNZWi{BF?oi<;Sev5WtsO<>x z4(VFnJToYX#2Omkry?2OA{tP3oY+FT4dpZO#q9x^0+iJvj?nH0r?NHq_N24ntCW{T z-{$sX!4>t4B)`H}hCOMvfM`!_BLA~f*A^#G)`>$D#lOih^V@b4g8Z7m-!cCMdq%MWH^uhr~l5a(58$kI+Cw-iJ8}c7u zBxUca!3$d=W$#h%j}0hq>8)eJ&7iclJL_zK`CD{YZRAEc2ZSzk4D532B?f^p{iiA#F>rUlWx( z2jiC>t0U#wR*;^I9}yo=wjN(1UU%A7z-#2MqUTEzFK+L#LqlR0mA_LliS%w9PX0M* z-&Vz!z{^B`q6GEtUB+>eq}O63`!6Kz#$U;6%cHjdnn--0hjqb3ww$L6?~ z&~|`mP5NW!03Uiw=^>0zJ>}hr&z$zX@vhV63}!gx{QmOKwuQ0|L`{|Hf!EAwq!x2X zpCqo5oDlN<{Zi7+i5^btA+er{niL);U7GYN zqBH3b%5Ib1f_1SgCQ)7vwPhzpP(GTdN9-hY&_9Vc32ixvW9;*gx_pGT?08Nu{UE2{ z6&h%(=;W2|P5kC7$L1<@wqcI>bzsunh@tFTm?%K{eIke`Lhv^e?;qC7l3qgmGil8I ztAwuwgH@aZlqW{8Q*Gj=Q!WB1|BJH4sI4?<-?p4|s*`_>`sGxVdZAwJiuxMkQR<%R zYJYSJGf5YwFfQ>GzQzxb)3yN%5dO~mu1PwS@<0ru^;a*P;%~~<5P5xNq`xG+nf4b+ z`?eOOzaajgAdvbUq{}$XeH|#4V>Tlk__0AUQ}sCnB8u z1KO-}+MRdGsP=9t{Jc!rP3jBb1gxR2ac$oak0~7B%~64`lYfTQoHB9RIe0F7K>bfd z5z;P$t}`KrpV> z4s3Isy3aAIlQuP6oReQnL3LsZ(afpz-hVuqubl$%owGxG%s^BOVhaTl3*ev?lry@x*fCZT(Lc-zV`UkxBTs z;lM;aB8(VBj3ed{n}|ciC&V@4A(6W+-AmLX!iYh{Bw{hKi8xMNCGHV9nC=ybMnorK z7!mK`#{yy-kxE=69uodxTnJH*=te{nGl-SMPT~}CmH3^=!S{bjq88Db7)Hbs3yAf^ zJH$ES2jX`k2VVsxiAF@uPW%`~WXqmdIyA6oOCFeo>1_tU`wV{0wl#ZVm(akhH>fI< z*gZ60L|_i{*DczZ{|J;u%_z&~?5L5`j-4sf!L<3xA}A|PUMFZksNF&5cq-HUQrYJ@ z5`W@)o=~MCRpoZ73ci#W*t%TvFg_pk_EuF+r)n8xxty}5PT5zKwRFmSR}qjiaZ2mJ 
zvLQLmjpz_LoF-jR7qFYUQi%t`0-9eX<#K9$mzOn{`GcPJ%nMLyrh!!3f=*cp9*HlJ*4CX+yM)9eZA%1irN;My zXyMfSO_^>?o4(f6x?|$sZAyF1P@%XMJF+%6F2ElvQ%df}G}S6T`y_HNHfRZ;ATmx=yN- zJfbeEvzMl&jyTn+o0hmYtVHncBL7Yq--1e`OlwiCeMfR7{t{LwYeW7HP)3Wu9+_CS zeW6mHk+1Ev)Ev+r6$qWXeq#6bgl)euh)EKI|CE*b_fjql`5S~o3E1}QTE>_ zX#2$O9SRj6%`dt-xi;SzU!|;a;)%8aRkH9N*UZx9yTnk+G#uJm61wrFC6oW^aOo|3fsZSpCtKZPwuvHm4A(1lgGJT zu_Ig&vC#=*V_f4$#U?~Yx<-XhaJz=P-7&6+QSOK_?nu}0$*!2#iOFAXeLqj)*#j+; zI~`n~HL+sq>g3GSJANM5xbX4gX%!#q3LWo`=U}mMk?uH-8WSHL6*Jxy?utwp7abK5 z9`Cl=$HcqiV#1?c?zp(vxF)XH|9kHkSC*6JjFA|9hb1v&WA6 z`BjWea_9G}pLl9t{-j~~{rr>Xz873Lx#|~t^Ca%SIW$|3*oiKhEY0WlVRGId*Az*t z{%5YL-1)@VxG_;NBVA+NV`JkcyGC%DIJfumRT>!{A6_Fqe0a1wdD)-W^R#}sa+ltn zI#qT>ji*22UE|zwW255Z-H}aQQ8nB()B%xkvEy9f(a}zqa_=#buISkCNbN9+V_HeB zaJz8I#H{w=T%MpFu4wlJcXY6~*@*G6aU9GQ6zVIDjftMDqYihw64b-7F|G;ji1^sJ z@ikoSN4R3#?nqM_?uw2YA7As|>#gRR9Gg&?>rfk4{3wRmJ0{^VlP88xc15vsZ2T~n zIb$`{2~->u?T#54KPo7;S#>&tVGOEKqegIWu(xa6BSu6;M7d)kI6@3p<93aYcaN*4 z!EwKm5H%s3o#S2M@h;~Mk~$QytEX%zU_ZznJi32M} z4z|4KOS)OeE|Kz9aeGlVPf(%SZ;UK2lxsW{RdSCiwp z=?jvWZyihe1*Vi6X8W=3QYysTO|qx__^N%~zxspYZ)DC-&e*$+cDLV{bNj%t^bN12 zA6=QAa`5)CWtm&{WNe+0v2lC)zQq{}mZv1ou;0y-v}UP2HHH6fu}#*bb1Ut{lqIX| z!dbF3afK$$Tx17SicMd>Cu99W&YzaL`_B9&>D!j1&p2+S&zpN^_Rc#i*JkeD?`vBv z_Me7bLV|7|KbpSn0BwUa7Oc*kF*jq){LZQ*VyO$@^Yb>n|I#X z{4STvojot)Cbjo!;$l0J84=Dt*}?ZNSNuJk3z=?ga9S-UM`;SBCnmp)@LW0SsU zQ|5^iX~&MGomi#o;QyP!EXS0a_WIl`$MgfU?!37rIDN%p*9)uBDNQ?Z^7hJ|Tv+CT z&1uKxrllTDU;g@?`O9w2n3cX`R>p?)ch=9y+%c0jX-Ai8J<^^3uE+<+7n^zxkUsZt z=E-&Rk+~tWcTkJ}W$Ce}Id#Ue<#+b1$yl}E_R{$ja`@ZpQqxZ)ayq)#l@xp5u9$N6 zzMVfu(4Eak>8Z5TrLNkp^aby@d~3`_59y=}lGGi`7pGkP!+tq?{h&L?)@1J6ra9m; zlfX=)j8*S295r0G*X>JRxFmhw>^rMYrXStGt?Oy}^l!U(#s9gh*{Np6szc0aGqcsS bhQIPDHJ;f63#5FP-!CFpO1DyerR@I)%1|;3 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 97af3c7201..b990d96b29 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16875,7 +16875,7 @@ if (test_xts) { test(2133.1, colnames(DT), c("DATE", "VALUE")) test(2133.2, key(DT), "DATE") test(2133.3, as.data.table(xts, keep.rownames = "VALUE"), - error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. Rename 'VALUE' column in xts or use `keep.rownames` to change the index col name.") + error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") test(2133.4, as.data.table(xts, keep.rownames = character()), error = "keep.rownames must be length 1") test(2133.5, as.data.table(xts, keep.rownames = NA_character_), @@ -16979,7 +16979,7 @@ if (.Platform$OS.type=="windows") local({ test(2143, rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") }) # test back to English (the argument order is back to 1,c,2,1) -test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") +test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") # Attempting to join on character(0) shouldn't crash R A = data.table(A='a') diff --git a/po/R-data.table.pot b/po/R-data.table.pot index 9e93031c9d..bdb0d1e0f0 100644 --- a/po/R-data.table.pot +++ b/po/R-data.table.pot @@ -1,7 +1,7 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.9\n" -"POT-Creation-Date: 2019-12-31 13:02\n" +"POT-Creation-Date: 2020-07-17 14:38\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -304,6 +304,9 @@ msgstr "" msgid "not-join '!' 
prefix is present on i but nomatch is provided. Please remove nomatch." msgstr "" +msgid "Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number." +msgstr "" + msgid "is not found in calling scope" msgstr "" @@ -445,6 +448,12 @@ msgstr "" msgid "Some items of .SDcols are not column names:" msgstr "" +msgid "'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s" +msgstr "" + +msgid "New ansvars: %s" +msgstr "" + msgid "This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table." msgstr "" @@ -805,16 +814,7 @@ msgstr "" msgid "x is a list, 'cols' cannot be 0-length." msgstr "" -msgid "RHS of" -msgstr "" - -msgid "is length" -msgstr "" - -msgid "which is not 1 or nrow (" -msgstr "" - -msgid "). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %in% instead." +msgid "RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead." msgstr "" msgid "Internal error in .isFastSubsettable. Please report to data.table developers" @@ -1345,7 +1345,7 @@ msgstr "" msgid "Supplied both `by` and `by.x/by.y`. `by` argument will be ignored." msgstr "" -msgid "A non-empty vector of column names are required for `by.x` and `by.y`." +msgid "A non-empty vector of column names is required for `by.x` and `by.y`." msgstr "" msgid "Elements listed in `by.x` must be valid column names in x." @@ -1384,13 +1384,25 @@ msgstr "" msgid "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********" msgstr "" -msgid "**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode." +msgid "**********" +msgstr "" + +msgid "This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode." +msgstr "" + +msgid "sysname" +msgstr "" + +msgid "Darwin" +msgstr "" + +msgid "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux." msgstr "" -msgid "If this is a Mac, please ensure you are using R>=3.4.0 and have followed our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation." +msgid "This is" msgstr "" -msgid "This warning message should not occur on Windows or Linux. If it does, please file a GitHub issue.\n**********" +msgid ". This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue." msgstr "" msgid "The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. 
Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option." @@ -1423,6 +1435,9 @@ msgstr "" msgid "Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option." msgstr "" +msgid "Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option." +msgstr "" + msgid "Unexpected base R behaviour: list(x) has copied x" msgstr "" @@ -1507,9 +1522,6 @@ msgstr "" msgid "' exists but is invalid" msgstr "" -msgid "Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted." -msgstr "" - msgid "x is vector but 'by' is supplied" msgstr "" @@ -1609,9 +1621,6 @@ msgstr "" msgid "None of the datasets should contain a column named '.seqn'" msgstr "" -msgid "'target' and 'current' must both be data.tables" -msgstr "" - msgid "Internal error: ncol(current)==ncol(target) was checked above" msgstr "" @@ -1732,7 +1741,13 @@ msgstr "" msgid "not found: [" msgstr "" -msgid "Input xts object should not have 'index' column because it would result in duplicate column names. Rename 'index' column in xts or use `keep.rownames=FALSE` and add index manually as another column." +msgid "keep.rownames must be length 1" +msgstr "" + +msgid "keep.rownames must not be NA" +msgstr "" + +msgid "Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name." msgstr "" msgid "data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types" diff --git a/po/R-zh_CN.po b/po/R-zh_CN.po index 9411e50fd2..a73b8e4a1b 100644 --- a/po/R-zh_CN.po +++ b/po/R-zh_CN.po @@ -1,7 +1,7 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.5\n" -"POT-Creation-Date: 2019-12-31 13:02\n" +"POT-Creation-Date: 2020-07-17 14:38\n" "PO-Revision-Date: 2019-11-16 18:37+0800\n" "Last-Translator: Xianying Tan \n" "Language-Team: Mandarin\n" @@ -387,6 +387,17 @@ msgid "" msgstr "" "not-join '!' 前缀在 i 中存在,但是 nomatch 也被提供了。需要移除nomatch。" +msgid "" +"Operator := detected in i, the first argument inside DT[...], but is only " +"valid in the second argument, j. Most often, this happens when forgetting " +"the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). " +"Please double-check the syntax. Run traceback(), and debugger() to get a " +"line number." +msgstr "在 i, 即 DT[...] 中的第一个参数,中检测出操作符 := ,但该操作符仅在 j," +"即 DT[...] 中的第二个参数中使用才有效。通常,该错误发生在忘记" +"添加第一个逗号时 (如错误地将 [DT , new_var := 5] 写作 DT[newvar := 5])。" +"请再次检查语法是否正确。运行 trackback(),和 debugger() 来获取发生错误的行号。" + msgid "is not found in calling scope" msgstr "不存在调用环境里" @@ -594,6 +605,18 @@ msgstr ".SDcols 应为列数或是列名" msgid "Some items of .SDcols are not column names:" msgstr ".SDcols 中的部份项目不是列名:" +msgid "" +"'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a " +"single j=eval(macro) instead. Both will detect the columns used which is " +"important for efficiency.\n" +"Old ansvars: %s" +msgstr "在 j 中检测出 '(m)get'。ansvars 将被设为所以列。请使用 .SDcols 或" +"j=eval(macro) 来代替。二者均可检测出实际参与运算的列,这对提高运行效率非常重要。\n" +"旧的 ansvars:%s" + +msgid "New ansvars: %s" +msgstr "新的 ansvars: %s" + msgid "" "This j doesn't use .SD but .SDcols has been supplied. 
Ignoring .SDcols. See ?" "data.table." @@ -1091,21 +1114,12 @@ msgstr "x 是单个向量,非空的 'cols' 没有意义。" msgid "x is a list, 'cols' cannot be 0-length." msgstr "x 是一个列表(list),'cols' 长度不能为0。" -msgid "RHS of" -msgstr "右手侧(RHS)" - -msgid "is length" -msgstr "长度为" - -msgid "which is not 1 or nrow (" -msgstr "其非 1 或 总行数 nrow (" - msgid "" -"). For robustness, no recycling is allowed (other than of length 1 RHS). " -"Consider %in% instead." +"RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no " +"recycling is allowed (other than of length 1 RHS). Consider %%in%% instead." msgstr "" -")。考虑到程序的稳健性,只有在右侧元素长度为 1 的情况下,我们才会对之进行循" -"环。考虑改用 %in% 。" +"%s 的右手侧 (RHS) 长度为 %d, 其非 1 或 总行数 nrow (%d)。考虑到程序的稳健性," +"只有在右侧元素长度为 1 的情况下,我们才会对之进行循环。考虑改用 %%in%% 。" msgid "" "Internal error in .isFastSubsettable. Please report to data.table developers" @@ -1838,7 +1852,7 @@ msgstr "`by.x`和`by.y`必须是相同的长度。" msgid "Supplied both `by` and `by.x/by.y`. `by` argument will be ignored." msgstr "参数`by`和`by.x/by.y`都提供了值。参数`by`的值会被忽略。" -msgid "A non-empty vector of column names are required for `by.x` and `by.y`." +msgid "A non-empty vector of column names is required for `by.x` and `by.y`." msgstr "`by.x`和`by.y`必须是非空的列名。" msgid "Elements listed in `by.x` must be valid column names in x." @@ -1896,29 +1910,47 @@ msgstr "" "table::update.dev.pkg()\n" "**********" +msgid "**********" +msgstr "**********" + msgid "" -"**********\n" "This installation of data.table has not detected OpenMP support. It should " "still work but in single-threaded mode." msgstr "" -"**********\n" "data.table的安装未检测到OpenMP支持。在单线程模式下应该仍能运行" +msgid "sysname" +msgstr "sysname" + +msgid "Darwin" +msgstr "Darwin" + msgid "" -"If this is a Mac, please ensure you are using R>=3.4.0 and have followed our " -"Mac instructions here: https://github.com/Rdatatable/data.table/wiki/" -"Installation." +"This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage " +"with Apple and ask them for support. Check r-datatable.com for updates, and " +"our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/" +"Installation. After several years of many reports of installation problems " +"on Mac, it's time to gingerly point out that there have been no similar " +"problems on Windows or Linux." msgstr "" -"如果是Mac,请确保您使用的R版本>=3.4.0,同时遵循了我们Mac上的安装说明:" -"https://github.com/Rdatatable/data.table/wiki/Installation。" +"此设备为 Mac。请阅读 https://mac.r-project.org/openmp/。请" +"与 Apple 公司联系以获取支持。查看 r-datatable.com 以获取更新,并" +"参阅我们的 Mac 设备说明:https://github.com/Rdatatable/data.table/wiki/Installation" +"在 Mac 上出现相关安装问题的报告已数年之久," +"需要指出的是在 Windows 或 Linux 平台上一般不存在类似问题。" + +msgid "This is" +msgstr "这是" msgid "" -"This warning message should not occur on Windows or Linux. If it does, " -"please file a GitHub issue.\n" -"**********" +". This warning should not normally occur on Windows or Linux where OpenMP is " +"turned on by data.table's configure script by passing -fopenmp to the " +"compiler. If you see this warning on Windows or Linux, please file a GitHub " +"issue." 
msgstr "" -"在Windows或Linux上不应出现此警告消息。如果有,请提交给GitHub issue。\n" -"**********" +"。此警告一般不应出现在 Windows 或 Linux 平台中,因为" +"data.table 的 configure 脚本中已通过向编译器传递 -fopenmp 参数启用了 OpenMP。" +"如果你在 Windows 或 Linux 平台中发现此警告,请在 GitHub 中提交 issue。" msgid "" "The option 'datatable.nomatch' is being used and is not set to the default " @@ -1977,6 +2009,13 @@ msgstr "" "选项'datatable.old.bywithoutby'已经被移除,警告了2年。它现在被忽略。 请改用" "by = .EACHI,然后停止使用这个选项。" +msgid "" +"Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. " +"It is now ignored. Please use by=key(DT) instead and stop using this option." +msgstr "" +"选项'datatable.old.bywithoutby'已经被移除,警告了2年。它现在被忽略。 请改用" +"by = .EACHI,然后停止使用这个选项。" + msgid "Unexpected base R behaviour: list(x) has copied x" msgstr "意外的base R行为:list(x)已经复制了x" @@ -2089,13 +2128,6 @@ msgstr "内部错误:索引" msgid "' exists but is invalid" msgstr "存在但无效" -msgid "" -"Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so " -"you have o as well if not sorted." -msgstr "" -"请使用'if (length(o <- forderv(DT,by))) ...' , 以便在一步中拥有较好的效率,同" -"时如果你还未排序,你也获得了变量o" - msgid "x is vector but 'by' is supplied" msgstr "x是一个向量, 但是参数'by'被提供" @@ -2216,9 +2248,6 @@ msgstr "' 然而 y 中对应的项是:'" msgid "None of the datasets should contain a column named '.seqn'" msgstr "所有的数据集都不应该包含名为 '.seqn' 的列" -msgid "'target' and 'current' must both be data.tables" -msgstr "'target' 和 'current' 都必须是 data.table" - msgid "Internal error: ncol(current)==ncol(target) was checked above" msgstr "内部错误:ncol(current)==ncol(target) 之前已经检查" @@ -2363,13 +2392,19 @@ msgstr "Pattern" msgid "not found: [" msgstr "未找到: [" +msgid "keep.rownames must be length 1" +msgstr "keep.rownames 的长度必须为 1" + +msgid "keep.rownames must not be NA" +msgstr "keep.rownames 不可为 NA" + msgid "" -"Input xts object should not have 'index' column because it would result in " -"duplicate column names. Rename 'index' column in xts or use `keep." -"rownames=FALSE` and add index manually as another column." +"Input xts object should not have '%s' column because it would result in " +"duplicate column names. Rename '%s' column in xts or use `keep.rownames` to " +"change the index column name." msgstr "" -"输入的xts对象不能含有'index'列,因这会导致出现重复的列名。请尝试重新命名xts中" -"的'index'列或者使用`keep.rownames=FALSE`并手动添加index为另外的列" +"输入的xts对象不能含有'%s'列,因这会导致出现重复的列名。请尝试重新命名xts中" +"的'%s'列或者使用`keep.rownames`并手动添加index为另外的列" msgid "" "data.table must have a time based column in first position, use " diff --git a/po/data.table.pot b/po/data.table.pot index a826bab881..78a6a2beeb 100644 --- a/po/data.table.pot +++ b/po/data.table.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.9\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2019-12-30 01:24+0800\n" +"POT-Creation-Date: 2020-07-17 14:38+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -47,41 +47,41 @@ msgstr "" msgid "Internal error: .internal.selfref tag isn't NULL or a character vector" msgstr "" -#: assign.c:168 +#: assign.c:180 msgid "Internal error: length(names)>0 but =0 and not NA." msgstr "" -#: assign.c:239 fsort.c:109 +#: assign.c:251 fsort.c:109 msgid "verbose must be TRUE or FALSE" msgstr "" -#: assign.c:287 +#: assign.c:299 msgid "assign has been passed a NULL dt" msgstr "" -#: assign.c:288 +#: assign.c:300 msgid "dt passed to assign isn't type VECSXP" msgstr "" -#: assign.c:290 +#: assign.c:302 msgid "" ".SD is locked. Updating .SD by reference using := or set are reserved for " "future use. Use := in j directly. 
Or use copy(.SD) as a (slow) last resort, " "until shallow() is exported." msgstr "" -#: assign.c:298 +#: assign.c:310 msgid "Internal error: dt passed to Cassign is not a data.table or data.frame" msgstr "" -#: assign.c:302 +#: assign.c:314 msgid "dt passed to assign has no names" msgstr "" -#: assign.c:304 +#: assign.c:316 #, c-format msgid "Internal error in assign: length of names (%d) is not length of dt (%d)" msgstr "" -#: assign.c:306 +#: assign.c:318 msgid "" "data.table is NULL; malformed. A null data.table should be an empty list. " "typeof() should always return 'list' for data.table." msgstr "" -#: assign.c:315 +#: assign.c:327 #, c-format msgid "Assigning to all %d rows\n" msgstr "" -#: assign.c:320 +#: assign.c:332 msgid "" "Coerced i from numeric to integer. Please pass integer for efficiency; e.g., " "2L rather than 2" msgstr "" -#: assign.c:323 +#: assign.c:335 #, c-format msgid "" "i is type '%s'. Must be integer, or numeric is coerced with warning. If i is " @@ -179,68 +179,68 @@ msgid "" "loop if possible for efficiency." msgstr "" -#: assign.c:329 +#: assign.c:341 #, c-format msgid "i[%d] is %d which is out of range [1,nrow=%d]." msgstr "" -#: assign.c:332 +#: assign.c:344 #, c-format msgid "Assigning to %d row subset of %d rows\n" msgstr "" -#: assign.c:340 +#: assign.c:352 #, c-format msgid "Added %d new column%s initialized with all-NA\n" msgstr "" -#: assign.c:345 +#: assign.c:357 msgid "length(LHS)==0; no columns to delete or assign RHS to." msgstr "" -#: assign.c:359 +#: assign.c:371 msgid "" "set() on a data.frame is for changing existing columns, not adding new ones. " "Please use a data.table for that. data.table's are over-allocated and don't " "shallow copy." msgstr "" -#: assign.c:370 +#: assign.c:382 msgid "" "Coerced j from numeric to integer. Please pass integer for efficiency; e.g., " "2L rather than 2" msgstr "" -#: assign.c:373 +#: assign.c:385 #, c-format msgid "" "j is type '%s'. Must be integer, character, or numeric is coerced with " "warning." msgstr "" -#: assign.c:375 +#: assign.c:387 msgid "" "Can't assign to the same column twice in the same query (duplicates " "detected)." msgstr "" -#: assign.c:376 +#: assign.c:388 msgid "newcolnames is supplied but isn't a character vector" msgstr "" -#: assign.c:378 +#: assign.c:390 #, c-format msgid "RHS_list_of_columns == %s\n" msgstr "" -#: assign.c:383 +#: assign.c:395 #, c-format msgid "" "RHS_list_of_columns revised to true because RHS list has 1 item which is " "NULL, or whose length %d is either 1 or targetlen (%d). Please unwrap RHS.\n" msgstr "" -#: assign.c:388 +#: assign.c:400 #, c-format msgid "" "Supplied %d columns to be assigned an empty list (which may be an empty data." @@ -248,18 +248,18 @@ msgid "" "use NULL instead. To add multiple empty list columns, use list(list())." msgstr "" -#: assign.c:393 +#: assign.c:405 #, c-format msgid "Recycling single RHS list item across %d columns. Please unwrap RHS.\n" msgstr "" -#: assign.c:395 +#: assign.c:407 #, c-format msgid "" "Supplied %d columns to be assigned %d items. Please see NEWS for v1.12.2." msgstr "" -#: assign.c:403 +#: assign.c:415 #, c-format msgid "" "Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. " @@ -267,18 +267,18 @@ msgid "" "Please use a data.table for that." msgstr "" -#: assign.c:404 +#: assign.c:416 #, c-format msgid "" "Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. Use " "column names instead in j to add new columns." 
msgstr "" -#: assign.c:409 +#: assign.c:421 msgid "When deleting columns, i should not be provided" msgstr "" -#: assign.c:415 +#: assign.c:427 #, c-format msgid "" "RHS of assignment to existing column '%s' is zero length but not NULL. If " @@ -289,30 +289,30 @@ msgid "" "new column." msgstr "" -#: assign.c:420 +#: assign.c:432 #, c-format msgid "" "Internal error in assign.c: length(newcolnames)=%d, length(names)=%d, coln=%d" msgstr "" -#: assign.c:422 +#: assign.c:434 #, c-format msgid "Column '%s' does not exist to remove" msgstr "" -#: assign.c:428 +#: assign.c:440 #, c-format msgid "%d column matrix RHS of := will be treated as one vector" msgstr "" -#: assign.c:432 +#: assign.c:444 #, c-format msgid "" "Can't assign to column '%s' (type 'factor') a value of type '%s' (not " "character, factor, integer or numeric)" msgstr "" -#: assign.c:437 +#: assign.c:449 #, c-format msgid "" "Supplied %d items to be assigned to %d items of column '%s'. If you wish to " @@ -320,7 +320,7 @@ msgid "" "your code." msgstr "" -#: assign.c:447 +#: assign.c:459 msgid "" "This data.table has either been loaded from disk (e.g. using readRDS()/" "load()) or constructed manually (e.g. using structure()). Please run setDT() " @@ -328,14 +328,14 @@ msgid "" "assigning by reference to it." msgstr "" -#: assign.c:448 +#: assign.c:460 #, c-format msgid "" "Internal error: oldtncol(%d) < oldncol(%d). Please report to data.table " "issue tracker, including result of sessionInfo()." msgstr "" -#: assign.c:450 +#: assign.c:462 #, c-format msgid "" "truelength (%d) is greater than 10,000 items over-allocated (length = %d). " @@ -344,241 +344,234 @@ msgid "" "sessionInfo()." msgstr "" -#: assign.c:452 +#: assign.c:464 #, c-format msgid "" "Internal error: DT passed to assign has not been allocated enough column " "slots. l=%d, tl=%d, adding %d" msgstr "" -#: assign.c:454 +#: assign.c:466 msgid "" "It appears that at some earlier point, names of this data.table have been " "reassigned. Please ensure to use setnames() rather than names<- or " "colnames<-. Otherwise, please report to data.table issue tracker." msgstr "" -#: assign.c:458 +#: assign.c:470 #, c-format msgid "Internal error: selfrefnames is ok but tl names [%d] != tl [%d]" msgstr "" -#: assign.c:469 +#: assign.c:481 msgid "" "Internal error: earlier error 'When deleting columns, i should not be " "provided' did not happen." msgstr "" -#: assign.c:480 +#: assign.c:492 #, c-format msgid "" "RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d, but " "then is being plonked. length(values)==%d; length(cols)==%d)\n" msgstr "" -#: assign.c:485 +#: assign.c:497 #, c-format msgid "Direct plonk of unnamed RHS, no copy. NAMED==%d, MAYBE_SHARED==%d\n" msgstr "" -#: assign.c:554 +#: assign.c:566 #, c-format msgid "" "Dropping index '%s' as it doesn't have '__' at the beginning of its name. It " "was very likely created by v1.9.4 of data.table.\n" msgstr "" -#: assign.c:562 +#: assign.c:574 msgid "Internal error: index name ends with trailing __" msgstr "" -#: assign.c:567 +#: assign.c:579 msgid "Internal error: Couldn't allocate memory for s4." msgstr "" -#: assign.c:578 +#: assign.c:590 msgid "Internal error: Couldn't allocate memory for s5." 
msgstr "" -#: assign.c:599 assign.c:615 +#: assign.c:611 assign.c:627 #, c-format msgid "Dropping index '%s' due to an update on a key column\n" msgstr "" -#: assign.c:608 +#: assign.c:620 #, c-format msgid "Shortening index '%s' to '%s' due to an update on a key column\n" msgstr "" -#: assign.c:680 +#: assign.c:695 +#, c-format +msgid "" +"Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d" +msgstr "" + +#: assign.c:697 +#, c-format +msgid "Internal error memrecycle: start=%d len=%d length(target)=%d" +msgstr "" + +#: assign.c:700 #, c-format msgid "Internal error: recycle length error not caught earlier. slen=%d len=%d" msgstr "" -#: assign.c:684 +#: assign.c:704 msgid "Internal error: memrecycle has received NULL colname" msgstr "" -#: assign.c:710 +#: assign.c:730 #, c-format msgid "" "Cannot assign 'factor' to '%s'. Factors can only be assigned to factor, " "character or list columns." msgstr "" -#: assign.c:724 +#: assign.c:744 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %d is outside the " "level range [1,%d]" msgstr "" -#: assign.c:732 +#: assign.c:752 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %f is outside the " "level range [1,%d], or is not a whole number." msgstr "" -#: assign.c:738 +#: assign.c:758 #, c-format msgid "" "Cannot assign '%s' to 'factor'. Factor columns can be assigned factor, " "character, NA in any type, or level numbers." msgstr "" -#: assign.c:759 +#: assign.c:779 msgid "" "Internal error: levels of target are either not unique or have truelength<0" msgstr "" -#: assign.c:798 +#: assign.c:818 #, c-format msgid "Unable to allocate working memory of %d bytes to combine factor levels" msgstr "" -#: assign.c:805 +#: assign.c:825 msgid "Internal error: extra level check sum failed" msgstr "" -#: assign.c:824 +#: assign.c:844 #, c-format msgid "" "Coercing 'character' RHS to '%s' to match the type of the target column " "(column %d named '%s')." msgstr "" -#: assign.c:830 +#: assign.c:850 #, c-format msgid "" "Cannot coerce 'list' RHS to 'integer64' to match the type of the target " "column (column %d named '%s')." msgstr "" -#: assign.c:835 +#: assign.c:855 #, c-format msgid "" "Coercing 'list' RHS to '%s' to match the type of the target column (column " "%d named '%s')." msgstr "" -#: assign.c:841 +#: assign.c:861 #, c-format msgid "Zero-copy coerce when assigning '%s' to '%s' column %d named '%s'.\n" msgstr "" -#: assign.c:936 +#: assign.c:956 #, c-format msgid "type '%s' cannot be coerced to '%s'" msgstr "" -#: assign.c:1056 +#: assign.c:1076 msgid "" "To assign integer64 to a character column, please use as.character() for " "clarity." msgstr "" -#: assign.c:1068 +#: assign.c:1088 #, c-format msgid "Unsupported column type in assign.c:memrecycle '%s'" msgstr "" -#: assign.c:1115 +#: assign.c:1135 #, c-format msgid "Internal error: writeNA passed a vector of type '%s'" msgstr "" -#: assign.c:1146 +#: assign.c:1166 #, c-format msgid "" "Internal error: savetl_init checks failed (%d %d %p %p). please report to " "data.table issue tracker." msgstr "" -#: assign.c:1154 +#: assign.c:1174 #, c-format msgid "Failed to allocate initial %d items in savetl_init" msgstr "" -#: assign.c:1163 +#: assign.c:1183 #, c-format msgid "" "Internal error: reached maximum %d items for savetl. Please report to data." "table issue tracker." 
msgstr "" -#: assign.c:1170 +#: assign.c:1190 #, c-format msgid "Failed to realloc saveds to %d items in savetl" msgstr "" -#: assign.c:1176 +#: assign.c:1196 #, c-format msgid "Failed to realloc savedtl to %d items in savetl" msgstr "" -#: assign.c:1199 +#: assign.c:1219 msgid "x must be a character vector" msgstr "" -#: assign.c:1200 +#: assign.c:1220 msgid "'which' must be an integer vector" msgstr "" -#: assign.c:1201 +#: assign.c:1221 msgid "'new' must be a character vector" msgstr "" -#: assign.c:1202 +#: assign.c:1222 #, c-format msgid "'new' is length %d. Should be the same as length of 'which' (%d)" msgstr "" -#: assign.c:1205 +#: assign.c:1225 #, c-format msgid "" "Item %d of 'which' is %d which is outside range of the length %d character " "vector" msgstr "" -#: assign.c:1215 -msgid "dt passed to setcolorder has no names" -msgstr "" - -#: assign.c:1217 -#, c-format -msgid "Internal error: dt passed to setcolorder has %d columns but %d names" -msgstr "" - -#: assign.c:1224 -msgid "" -"Internal error: o passed to Csetcolorder contains an NA or out-of-bounds" -msgstr "" - -#: assign.c:1226 -msgid "Internal error: o passed to Csetcolorder contains a duplicate" -msgstr "" - #: between.c:12 #, c-format msgid "" @@ -668,121 +661,130 @@ msgstr "" msgid "Internal error: xcols is not integer vector" msgstr "" -#: bmerge.c:50 +#: bmerge.c:51 +msgid "Internal error: icols and xcols must be non-empty integer vectors." +msgstr "" + +#: bmerge.c:52 #, c-format msgid "Internal error: length(icols) [%d] > length(xcols) [%d]" msgstr "" -#: bmerge.c:57 +#: bmerge.c:59 #, c-format msgid "Internal error. icols[%d] is NA" msgstr "" -#: bmerge.c:58 +#: bmerge.c:60 #, c-format msgid "Internal error. xcols[%d] is NA" msgstr "" -#: bmerge.c:59 +#: bmerge.c:61 #, c-format msgid "icols[%d]=%d outside range [1,length(i)=%d]" msgstr "" -#: bmerge.c:60 +#: bmerge.c:62 #, c-format msgid "xcols[%d]=%d outside range [1,length(x)=%d]" msgstr "" -#: bmerge.c:63 +#: bmerge.c:65 #, c-format msgid "typeof x.%s (%s) != typeof i.%s (%s)" msgstr "" -#: bmerge.c:70 +#: bmerge.c:72 msgid "roll is character but not 'nearest'" msgstr "" -#: bmerge.c:71 +#: bmerge.c:73 msgid "roll='nearest' can't be applied to a character column, yet." msgstr "" -#: bmerge.c:74 +#: bmerge.c:76 msgid "Internal error: roll is not character or double" msgstr "" -#: bmerge.c:79 +#: bmerge.c:81 msgid "rollends must be a length 2 logical vector" msgstr "" -#: bmerge.c:89 uniqlist.c:270 +#: bmerge.c:91 uniqlist.c:271 msgid "" "Internal error: invalid value for 'mult'. please report to data.table issue " "tracker" msgstr "" -#: bmerge.c:93 +#: bmerge.c:95 msgid "" "Internal error: opArg is not an integer vector of length equal to length(on)" msgstr "" -#: bmerge.c:96 +#: bmerge.c:98 msgid "Internal error: nqgrpArg must be an integer vector" msgstr "" -#: bmerge.c:102 +#: bmerge.c:104 msgid "Intrnal error: nqmaxgrpArg is not a positive length-1 integer vector" msgstr "" -#: bmerge.c:111 +#: bmerge.c:113 msgid "Internal error in allocating memory for non-equi join" msgstr "" -#: bmerge.c:156 +#: bmerge.c:158 msgid "Internal error: xoArg is not an integer vector" msgstr "" -#: bmerge.c:271 bmerge.c:379 +#: bmerge.c:273 bmerge.c:381 #, c-format msgid "" "Internal error in bmerge_r for '%s' column. Unrecognized value op[col]=%d" msgstr "" -#: bmerge.c:303 +#: bmerge.c:305 #, c-format msgid "Only '==' operator is supported for columns of type %s." 
msgstr "" -#: bmerge.c:410 +#: bmerge.c:412 #, c-format msgid "Type '%s' not supported for joining/merging" msgstr "" -#: bmerge.c:468 +#: bmerge.c:470 msgid "Internal error: xlow!=xupp-1 || xlowxuppIn" msgstr "" -#: chmatch.c:4 -#, c-format -msgid "x is type '%s' (must be 'character' or NULL)" -msgstr "" - #: chmatch.c:5 #, c-format msgid "table is type '%s' (must be 'character' or NULL)" msgstr "" -#: chmatch.c:6 +#: chmatch.c:7 msgid "Internal error: either chin or chmatchdup should be true not both" msgstr "" -#: chmatch.c:44 +#: chmatch.c:12 +#, c-format +msgid "Internal error: length of SYMSXP is %d not 1" +msgstr "" + +#: chmatch.c:19 +#, c-format +msgid "x is type '%s' (must be 'character' or NULL)" +msgstr "" + +#: chmatch.c:66 #, c-format msgid "" "Internal error: CHARSXP '%s' has a negative truelength (%d). Please file an " "issue on the data.table tracker." msgstr "" -#: chmatch.c:73 +#: chmatch.c:95 #, c-format msgid "" "Failed to allocate % bytes working memory in chmatchdup: " @@ -858,107 +860,103 @@ msgstr "" msgid "Unsupported type: %s" msgstr "" -#: dogroups.c:14 +#: dogroups.c:15 msgid "Internal error: order not integer vector" msgstr "" -#: dogroups.c:15 +#: dogroups.c:16 msgid "Internal error: starts not integer" msgstr "" -#: dogroups.c:16 +#: dogroups.c:17 msgid "Internal error: lens not integer" msgstr "" -#: dogroups.c:18 +#: dogroups.c:19 msgid "Internal error: jiscols not NULL but o__ has length" msgstr "" -#: dogroups.c:19 +#: dogroups.c:20 msgid "Internal error: xjiscols not NULL but o__ has length" msgstr "" -#: dogroups.c:20 +#: dogroups.c:21 msgid "'env' should be an environment" msgstr "" -#: dogroups.c:39 +#: dogroups.c:40 #, c-format msgid "" "Internal error: unsupported size-0 type '%s' in column %d of 'by' should " "have been caught earlier" msgstr "" -#: dogroups.c:43 +#: dogroups.c:44 #, c-format msgid "!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]" msgstr "" -#: dogroups.c:62 +#: dogroups.c:63 msgid "row.names attribute of .SD not found" msgstr "" -#: dogroups.c:64 +#: dogroups.c:65 #, c-format msgid "" "row.names of .SD isn't integer length 2 with NA as first item; i.e., ." "set_row_names(). [%s %d %d]" msgstr "" -#: dogroups.c:69 +#: dogroups.c:70 msgid "length(names)!=length(SD)" msgstr "" -#: dogroups.c:73 +#: dogroups.c:74 #, c-format msgid "" "Internal error: size-0 type %d in .SD column %d should have been caught " "earlier" msgstr "" -#: dogroups.c:83 +#: dogroups.c:84 msgid "length(xknames)!=length(xSD)" msgstr "" -#: dogroups.c:87 +#: dogroups.c:88 #, c-format msgid "" "Internal error: type %d in .xSD column %d should have been caught by now" msgstr "" -#: dogroups.c:91 +#: dogroups.c:92 #, c-format msgid "length(iSD)[%d] != length(jiscols)[%d]" msgstr "" -#: dogroups.c:92 +#: dogroups.c:93 #, c-format msgid "length(xSD)[%d] != length(xjiscols)[%d]" msgstr "" -#: dogroups.c:155 dogroups.c:184 -msgid "Internal error. Type of column should have been checked by now" -msgstr "" - -#: dogroups.c:273 +#: dogroups.c:198 #, c-format msgid "j evaluates to type '%s'. Must evaluate to atomic vector or list." msgstr "" -#: dogroups.c:281 +#: dogroups.c:206 msgid "" "All items in j=list(...) should be atomic vectors or lists. If you are " "trying something like j=list(.SD,newcol=mean(colA)) then use := by group " "instead (much quicker), or cbind or merge afterwards." msgstr "" -#: dogroups.c:290 +#: dogroups.c:215 msgid "" "RHS of := is NULL during grouped assignment, but it's not possible to delete " "parts of a column." 
msgstr "" -#: dogroups.c:294 +#: dogroups.c:219 #, c-format msgid "" "Supplied %d items to be assigned to group %d of size %d in column '%s'. The " @@ -967,23 +965,23 @@ msgid "" "make this intent clear to readers of your code." msgstr "" -#: dogroups.c:305 +#: dogroups.c:230 msgid "" "Internal error: Trying to add new column by reference but tl is full; " "setalloccol should have run first at R level before getting to this point in " "dogroups" msgstr "" -#: dogroups.c:320 +#: dogroups.c:245 #, c-format msgid "Group %d column '%s': %s" msgstr "" -#: dogroups.c:327 +#: dogroups.c:252 msgid "j doesn't evaluate to the same number of columns for each group" msgstr "" -#: dogroups.c:361 +#: dogroups.c:286 #, c-format msgid "" "Column %d of j's result for the first group is NULL. We rely on the column " @@ -994,14 +992,14 @@ msgid "" "integer() or numeric()." msgstr "" -#: dogroups.c:364 +#: dogroups.c:289 msgid "" "j appears to be a named vector. The same names will likely be created over " "and over again for each group and slow things down. Try and pass a named " "list (which data.table optimizes) or an unnamed list() instead.\n" msgstr "" -#: dogroups.c:366 +#: dogroups.c:291 #, c-format msgid "" "Column %d of j is a named vector (each item down the rows is named, " @@ -1009,7 +1007,7 @@ msgid "" "over and over for each group). They are ignored anyway.\n" msgstr "" -#: dogroups.c:374 +#: dogroups.c:299 msgid "" "The result of j is a named list. It's very inefficient to create the same " "names over and over again for each group. When j=list(...), any names are " @@ -1018,17 +1016,17 @@ msgid "" "to :=). This message may be upgraded to warning in future.\n" msgstr "" -#: dogroups.c:386 +#: dogroups.c:311 #, c-format msgid "dogroups: growing from %d to %d rows\n" msgstr "" -#: dogroups.c:387 +#: dogroups.c:312 #, c-format msgid "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" msgstr "" -#: dogroups.c:420 +#: dogroups.c:330 #, c-format msgid "" "Item %d of j's result for group %d is zero length. This will be filled with " @@ -1037,14 +1035,14 @@ msgid "" "buffer." msgstr "" -#: dogroups.c:427 +#: dogroups.c:337 #, c-format msgid "" "Column %d of result for group %d is type '%s' but expecting type '%s'. " "Column types must be consistent for each group." msgstr "" -#: dogroups.c:429 +#: dogroups.c:339 #, c-format msgid "" "Supplied %d items for column %d of group %d which has %d rows. The RHS " @@ -1053,32 +1051,37 @@ msgid "" "make this intent clear to readers of your code." msgstr "" -#: dogroups.c:444 +#: dogroups.c:354 #, c-format msgid "Wrote less rows (%d) than allocated (%d).\n" msgstr "" -#: dogroups.c:454 +#: dogroups.c:364 #, c-format msgid "Internal error: block 0 [%d] and block 1 [%d] have both run" msgstr "" -#: dogroups.c:456 +#: dogroups.c:366 #, c-format msgid "" "\n" " %s took %.3fs for %d groups\n" msgstr "" -#: dogroups.c:458 +#: dogroups.c:368 #, c-format msgid " eval(j) took %.3fs for %d calls\n" msgstr "" -#: dogroups.c:482 +#: dogroups.c:392 msgid "growVector passed NULL" msgstr "" +#: dogroups.c:412 +#, c-format +msgid "Internal error: growVector doesn't support type '%s'" +msgstr "" + #: fastmean.c:39 msgid "narm should be TRUE or FALSE" msgstr "" @@ -1093,7 +1096,7 @@ msgstr "" msgid "Internal error: type '%s' not caught earlier in fastmean" msgstr "" -#: fcast.c:80 +#: fcast.c:78 #, c-format msgid "Unsupported column type in fcast val: '%s'" msgstr "" @@ -1102,62 +1105,70 @@ msgstr "" msgid "Argument 'test' must be logical." 
msgstr "" -#: fifelse.c:23 +#: fifelse.c:28 #, c-format msgid "" "'yes' is of type %s but 'no' is of type %s. Please make sure that both " "arguments have the same type." msgstr "" -#: fifelse.c:28 +#: fifelse.c:33 msgid "" "'yes' has different class than 'no'. Please make sure that both arguments " "have the same class." msgstr "" -#: fifelse.c:33 +#: fifelse.c:38 msgid "'yes' and 'no' are both type factor but their levels are different." msgstr "" -#: fifelse.c:38 +#: fifelse.c:43 #, c-format msgid "" "Length of 'yes' is % but must be 1 or length of 'test' (%)." msgstr "" -#: fifelse.c:40 +#: fifelse.c:45 #, c-format msgid "" "Length of 'no' is % but must be 1 or length of 'test' (%)." msgstr "" -#: fifelse.c:51 +#: fifelse.c:56 #, c-format msgid "Length of 'na' is % but must be 1" msgstr "" -#: fifelse.c:57 +#: fifelse.c:62 #, c-format msgid "" "'yes' is of type %s but 'na' is of type %s. Please make sure that both " "arguments have the same type." msgstr "" -#: fifelse.c:59 +#: fifelse.c:64 msgid "" "'yes' has different class than 'na'. Please make sure that both arguments " "have the same class." msgstr "" -#: fifelse.c:63 +#: fifelse.c:68 msgid "'yes' and 'na' are both type factor but their levels are different." msgstr "" -#: fifelse.c:133 +#: fifelse.c:138 #, c-format msgid "Type %s is not supported." msgstr "" +#: fifelse.c:152 +#, c-format +msgid "" +"Received %d inputs; please supply an even number of arguments in ..., " +"consisting of logical condition, resulting value pairs (in that order). Note " +"that the default argument must be named explicitly, e.g., default=0" +msgstr "" + #: fmelt.c:18 msgid "'x' must be an integer" msgstr "" @@ -1338,144 +1349,144 @@ msgstr "" msgid "names(data) is NULL. Please report to data.table-help" msgstr "" -#: forder.c:106 +#: forder.c:107 #, c-format msgid "Failed to realloc thread private group size buffer to %d*4bytes" msgstr "" -#: forder.c:120 +#: forder.c:121 #, c-format msgid "Failed to realloc group size result to %d*4bytes" msgstr "" -#: forder.c:263 +#: forder.c:264 #, c-format msgid "" "Logical error. counts[0]=%d in cradix but should have been decremented to 0. " "radix=%d" msgstr "" -#: forder.c:278 +#: forder.c:279 msgid "Failed to alloc cradix_counts" msgstr "" -#: forder.c:280 +#: forder.c:281 msgid "Failed to alloc cradix_tmp" msgstr "" -#: forder.c:291 +#: forder.c:292 #, c-format msgid "" "Internal error: ustr isn't empty when starting range_str: ustr_n=%d, " "ustr_alloc=%d" msgstr "" -#: forder.c:292 +#: forder.c:293 msgid "Internal error: ustr_maxlen isn't 0 when starting range_str" msgstr "" -#: forder.c:312 +#: forder.c:313 #, c-format msgid "Unable to realloc %d * %d bytes in range_str" msgstr "" -#: forder.c:330 +#: forder.c:331 msgid "Failed to alloc ustr3 when converting strings to UTF8" msgstr "" -#: forder.c:348 +#: forder.c:349 msgid "Failed to alloc tl when converting strings to UTF8" msgstr "" -#: forder.c:377 +#: forder.c:378 msgid "Must an integer or numeric vector length 1" msgstr "" -#: forder.c:378 +#: forder.c:379 msgid "Must be 2, 1 or 0" msgstr "" -#: forder.c:412 +#: forder.c:413 msgid "Unknown non-finite value; not NA, NaN, -Inf or +Inf" msgstr "" -#: forder.c:434 +#: forder.c:435 msgid "" "Internal error: input is not either a list of columns, or an atomic vector." 
msgstr "" -#: forder.c:436 +#: forder.c:437 msgid "" "Internal error: input is an atomic vector (not a list of columns) but by= is " "not NULL" msgstr "" -#: forder.c:438 +#: forder.c:439 msgid "" "Input is an atomic vector (not a list of columns) but order= is not a length " "1 integer" msgstr "" -#: forder.c:440 +#: forder.c:441 #, c-format msgid "forder.c received a vector type '%s' length %d\n" msgstr "" -#: forder.c:448 +#: forder.c:449 #, c-format msgid "forder.c received %d rows and %d columns\n" msgstr "" -#: forder.c:451 +#: forder.c:452 msgid "Internal error: DT is an empty list() of 0 columns" msgstr "" -#: forder.c:453 +#: forder.c:454 #, c-format msgid "" "Internal error: DT has %d columns but 'by' is either not integer or is " "length 0" msgstr "" -#: forder.c:455 +#: forder.c:456 #, c-format msgid "" "Either order= is not integer or its length (%d) is different to by='s length " "(%d)" msgstr "" -#: forder.c:461 +#: forder.c:462 #, c-format msgid "internal error: 'by' value %d out of range [1,%d]" msgstr "" -#: forder.c:463 +#: forder.c:464 #, c-format msgid "Column %d is length %d which differs from length of column 1 (%d)\n" msgstr "" -#: forder.c:467 +#: forder.c:468 msgid "retGrp must be TRUE or FALSE" msgstr "" -#: forder.c:470 +#: forder.c:471 msgid "sort must be TRUE or FALSE" msgstr "" -#: forder.c:473 +#: forder.c:474 msgid "At least one of retGrp= or sort= must be TRUE" msgstr "" -#: forder.c:475 +#: forder.c:476 msgid "na.last must be logical TRUE, FALSE or NA of length 1" msgstr "" -#: forder.c:519 +#: forder.c:520 #, c-format msgid "Item %d of order (ascending/descending) is %d. Must be +1 or -1." msgstr "" -#: forder.c:545 +#: forder.c:546 #, c-format msgid "" "\n" @@ -1484,124 +1495,129 @@ msgid "" "to save space and time.\n" msgstr "" -#: forder.c:561 +#: forder.c:562 #, c-format msgid "Column %d passed to [f]order is type '%s', not yet supported." msgstr "" -#: forder.c:714 +#: forder.c:715 msgid "Internal error: column not supported, not caught earlier" msgstr "" -#: forder.c:722 +#: forder.c:723 #, c-format msgid "nradix=%d\n" msgstr "" -#: forder.c:728 +#: forder.c:729 #, c-format msgid "" "Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d" msgstr "" -#: forder.c:733 +#: forder.c:734 msgid "Could not allocate (very tiny) group size thread buffers" msgstr "" -#: forder.c:794 +#: forder.c:795 #, c-format msgid "Timing block %2d%s = %8.3f %8d\n" msgstr "" -#: forder.c:797 +#: forder.c:798 #, c-format msgid "stat[%03d]==%20\n" msgstr "" -#: forder.c:1053 +#: forder.c:1054 #, c-format msgid "Failed to allocate parallel counts. my_n=%d, nBatch=%d" msgstr "" -#: forder.c:1162 +#: forder.c:1163 #, c-format msgid "Unable to allocate TMP for my_n=%d items in parallel batch counting" msgstr "" -#: forder.c:1269 -msgid "" -"is.sorted (R level) and fsorted (C level) only to be used on vectors. 
If " -"needed on a list/data.table, you'll need the order anyway if not sorted, so " -"use if (length(o<-forder(...))) for efficiency in one step, or equivalent at " -"C level" +#: forder.c:1270 +msgid "Internal error: issorted 'by' must be NULL or integer vector" +msgstr "" + +#: forder.c:1274 forder.c:1324 +#, c-format +msgid "issorted 'by' [%d] out of range [1,%d]" +msgstr "" + +#: forder.c:1279 +msgid "is.sorted does not work on list columns" msgstr "" -#: forder.c:1301 +#: forder.c:1311 forder.c:1341 forder.c:1375 #, c-format msgid "type '%s' is not yet supported" msgstr "" -#: forder.c:1310 +#: forder.c:1388 msgid "x must be either NULL or an integer vector" msgstr "" -#: forder.c:1312 +#: forder.c:1390 msgid "nrow must be integer vector length 1" msgstr "" -#: forder.c:1314 +#: forder.c:1392 #, c-format msgid "nrow==%d but must be >=0" msgstr "" -#: forder.c:1331 +#: forder.c:1409 msgid "x must be type 'double'" msgstr "" -#: frank.c:11 +#: frank.c:9 #, c-format msgid "Internal error. Argument 'x' to Cdt_na is type '%s' not 'list'" msgstr "" -#: frank.c:12 +#: frank.c:10 #, c-format msgid "Internal error. Argument 'cols' to Cdt_na is type '%s' not 'integer'" msgstr "" -#: frank.c:16 frank.c:146 subset.c:263 +#: frank.c:14 frank.c:155 subset.c:276 #, c-format msgid "Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]" msgstr "" -#: frank.c:26 frank.c:155 +#: frank.c:24 frank.c:164 #, c-format msgid "" "Column %d of input list x is length %d, inconsistent with first column of " "that item which is length %d." msgstr "" -#: frank.c:65 frank.c:202 transpose.c:88 +#: frank.c:63 frank.c:211 transpose.c:88 #, c-format msgid "Unsupported column type '%s'" msgstr "" -#: frank.c:83 +#: frank.c:82 msgid "" "Internal error: invalid ties.method for frankv(), should have been caught " "before. please report to data.table issue tracker" msgstr "" -#: frank.c:130 +#: frank.c:139 #, c-format msgid "Internal error: unknown ties value in frank: %d" msgstr "" -#: frank.c:141 +#: frank.c:150 #, c-format msgid "Internal error. Argument 'x' to CanyNA is type '%s' not 'list'" msgstr "" -#: frank.c:142 +#: frank.c:151 #, c-format msgid "Internal error. Argument 'cols' to CanyNA is type '%s' not 'integer'" msgstr "" @@ -1642,332 +1658,332 @@ msgstr "" msgid " File copy in RAM took %.3f seconds.\n" msgstr "" -#: fread.c:1093 +#: fread.c:1248 msgid "" "Previous fread() session was not cleaned up properly. Cleaned up ok at the " "beginning of this fread() call.\n" msgstr "" -#: fread.c:1096 +#: fread.c:1251 msgid "[01] Check arguments\n" msgstr "" -#: fread.c:1103 +#: fread.c:1258 #, c-format msgid " Using %d threads (omp_get_max_threads()=%d, nth=%d)\n" msgstr "" -#: fread.c:1111 +#: fread.c:1266 msgid "" "Internal error: NAstrings is itself NULL. When empty it should be pointer to " "NULL." msgstr "" -#: fread.c:1129 +#: fread.c:1284 #, c-format msgid "freadMain: NAstring <<%s>> has whitespace at the beginning or end" msgstr "" -#: fread.c:1134 +#: fread.c:1289 #, c-format msgid "" "freadMain: NAstring <<%s>> is recognized as type boolean, this is not " "permitted." 
msgstr "" -#: fread.c:1144 +#: fread.c:1300 msgid " No NAstrings provided.\n" msgstr "" -#: fread.c:1146 +#: fread.c:1302 msgid " NAstrings = [" msgstr "" -#: fread.c:1149 +#: fread.c:1305 msgid "]\n" msgstr "" -#: fread.c:1151 +#: fread.c:1307 msgid " One or more of the NAstrings looks like a number.\n" msgstr "" -#: fread.c:1153 +#: fread.c:1309 msgid " None of the NAstrings look like numbers.\n" msgstr "" -#: fread.c:1155 +#: fread.c:1311 #, c-format msgid " skip num lines = %\n" msgstr "" -#: fread.c:1156 +#: fread.c:1312 #, c-format msgid " skip to string = <<%s>>\n" msgstr "" -#: fread.c:1157 +#: fread.c:1313 #, c-format msgid " show progress = %d\n" msgstr "" -#: fread.c:1158 +#: fread.c:1314 #, c-format msgid " 0/1 column will be read as %s\n" msgstr "" -#: fread.c:1166 +#: fread.c:1322 #, c-format msgid "sep == quote ('%c') is not allowed" msgstr "" -#: fread.c:1167 +#: fread.c:1323 msgid "dec='' not allowed. Should be '.' or ','" msgstr "" -#: fread.c:1168 +#: fread.c:1324 #, c-format msgid "sep == dec ('%c') is not allowed" msgstr "" -#: fread.c:1169 +#: fread.c:1325 #, c-format msgid "quote == dec ('%c') is not allowed" msgstr "" -#: fread.c:1186 +#: fread.c:1342 msgid "[02] Opening the file\n" msgstr "" -#: fread.c:1189 +#: fread.c:1345 msgid "" " `input` argument is provided rather than a file name, interpreting as raw " "text to read\n" msgstr "" -#: fread.c:1193 +#: fread.c:1349 msgid "Internal error: last byte of character input isn't \\0" msgstr "" -#: fread.c:1196 +#: fread.c:1352 #, c-format msgid " Opening file %s\n" msgstr "" -#: fread.c:1200 +#: fread.c:1356 #, c-format msgid "file not found: %s" msgstr "" -#: fread.c:1204 +#: fread.c:1360 #, c-format msgid "Opened file ok but couldn't obtain its size: %s" msgstr "" -#: fread.c:1207 fread.c:1235 +#: fread.c:1363 fread.c:1391 #, c-format msgid "File is empty: %s" msgstr "" -#: fread.c:1208 fread.c:1236 +#: fread.c:1364 fread.c:1392 #, c-format msgid " File opened, size = %s.\n" msgstr "" -#: fread.c:1225 +#: fread.c:1381 #, c-format msgid "File not found: %s" msgstr "" -#: fread.c:1231 +#: fread.c:1387 #, c-format msgid "Unable to open file after %d attempts (error %d): %s" msgstr "" -#: fread.c:1233 +#: fread.c:1389 #, c-format msgid "GetFileSizeEx failed (returned 0) on file: %s" msgstr "" -#: fread.c:1238 +#: fread.c:1394 #, c-format msgid "This is Windows, CreateFileMapping returned error %d for file %s" msgstr "" -#: fread.c:1245 +#: fread.c:1401 #, c-format msgid "" "Opened %s file ok but could not memory map it. This is a %dbit process. %s." msgstr "" -#: fread.c:1246 +#: fread.c:1402 msgid "Please upgrade to 64bit" msgstr "" -#: fread.c:1246 +#: fread.c:1402 msgid "There is probably not enough contiguous virtual memory available" msgstr "" -#: fread.c:1249 +#: fread.c:1405 msgid " Memory mapped ok\n" msgstr "" -#: fread.c:1251 +#: fread.c:1407 msgid "" "Internal error: Neither `input` nor `filename` are given, nothing to read." msgstr "" -#: fread.c:1268 +#: fread.c:1424 msgid "[03] Detect and skip BOM\n" msgstr "" -#: fread.c:1272 +#: fread.c:1428 msgid "" " UTF-8 byte order mark EF BB BF found at the start of the file and " "skipped.\n" msgstr "" -#: fread.c:1277 +#: fread.c:1433 msgid "" "GB-18030 encoding detected, however fread() is unable to decode it. Some " "character fields may be garbled.\n" msgstr "" -#: fread.c:1280 +#: fread.c:1436 msgid "" "File is encoded in UTF-16, this encoding is not supported by fread(). Please " "recode the file to UTF-8." 
msgstr "" -#: fread.c:1285 +#: fread.c:1441 #, c-format msgid " Last byte(s) of input found to be %s and removed.\n" msgstr "" -#: fread.c:1288 +#: fread.c:1444 msgid "Input is empty or only contains BOM or terminal control characters" msgstr "" -#: fread.c:1295 +#: fread.c:1451 msgid "[04] Arrange mmap to be \\0 terminated\n" msgstr "" -#: fread.c:1302 +#: fread.c:1458 msgid "" " No \\n exists in the file at all, so single \\r (if any) will be taken as " "one line ending. This is unusual but will happen normally when there is no " "\\r either; e.g. a single line missing its end of line.\n" msgstr "" -#: fread.c:1303 +#: fread.c:1459 msgid "" " \\n has been found in the input and different lines can end with different " "line endings (e.g. mixed \\n and \\r\\n in one file). This is common and " "ideal.\n" msgstr "" -#: fread.c:1327 +#: fread.c:1483 #, c-format msgid "" " File ends abruptly with '%c'. Final end-of-line is missing. Using cow page " "to write 0 to the last byte.\n" msgstr "" -#: fread.c:1333 +#: fread.c:1489 msgid "" "This file is very unusual: it ends abruptly without a final newline, and " "also its size is a multiple of 4096 bytes. Please properly end the last row " "with a newline using for example 'echo >> file' to avoid this " msgstr "" -#: fread.c:1334 +#: fread.c:1490 #, c-format msgid " File ends abruptly with '%c'. Copying file in RAM. %s copy.\n" msgstr "" -#: fread.c:1368 +#: fread.c:1524 msgid "[05] Skipping initial rows if needed\n" msgstr "" -#: fread.c:1374 +#: fread.c:1530 #, c-format msgid "" "skip='%s' not found in input (it is case sensitive and literal; i.e., no " "patterns, wildcards or regex)" msgstr "" -#: fread.c:1380 +#: fread.c:1536 #, c-format msgid "" "Found skip='%s' on line %. Taking this to be header row or first row " "of data.\n" msgstr "" -#: fread.c:1393 +#: fread.c:1549 #, c-format msgid " Skipped to line % in the file" msgstr "" -#: fread.c:1394 +#: fread.c:1550 #, c-format msgid "skip=% but the input only has % line%s" msgstr "" -#: fread.c:1403 +#: fread.c:1559 msgid "" "Input is either empty, fully whitespace, or skip has been set after the last " "non-whitespace." msgstr "" -#: fread.c:1405 +#: fread.c:1561 #, c-format msgid " Moved forward to first non-blank line (%d)\n" msgstr "" -#: fread.c:1406 +#: fread.c:1562 #, c-format msgid " Positioned on line %d starting: <<%s>>\n" msgstr "" -#: fread.c:1424 +#: fread.c:1580 msgid "[06] Detect separator, quoting rule, and ncolumns\n" msgstr "" -#: fread.c:1428 +#: fread.c:1584 msgid " sep='\\n' passed in meaning read lines as single character column\n" msgstr "" -#: fread.c:1447 +#: fread.c:1603 msgid " Detecting sep automatically ...\n" msgstr "" -#: fread.c:1454 +#: fread.c:1610 #, c-format msgid " Using supplied sep '%s'\n" msgstr "" -#: fread.c:1488 +#: fread.c:1644 #, c-format msgid " with %d fields using quote rule %d\n" msgstr "" -#: fread.c:1538 +#: fread.c:1694 #, c-format msgid " with %d lines of %d fields using quote rule %d\n" msgstr "" -#: fread.c:1545 +#: fread.c:1701 msgid "" " No sep and quote rule found a block of 2x2 or greater. Single column " "input.\n" msgstr "" -#: fread.c:1561 +#: fread.c:1717 msgid "" "Single column input contains invalid quotes. Self healing only effective " "when ncol>1" msgstr "" -#: fread.c:1566 +#: fread.c:1722 #, c-format msgid "" "Found and resolved improper quoting in first %d rows. If the fields are not " @@ -1975,282 +1991,282 @@ msgid "" "\"\" to avoid this warning." 
msgstr "" -#: fread.c:1582 +#: fread.c:1738 #, c-format msgid "" "Internal error: ncol==%d line==%d after detecting sep, ncol and first line" msgstr "" -#: fread.c:1585 +#: fread.c:1741 #, c-format msgid "Internal error: first line has field count %d but expecting %d" msgstr "" -#: fread.c:1587 +#: fread.c:1743 #, c-format msgid "" " Detected %d columns on line %d. This line is either column names or first " "data row. Line starts as: <<%s>>\n" msgstr "" -#: fread.c:1589 +#: fread.c:1745 #, c-format msgid " Quote rule picked = %d\n" msgstr "" -#: fread.c:1590 +#: fread.c:1746 #, c-format msgid " fill=%s and the most number of columns found is %d\n" msgstr "" -#: fread.c:1596 +#: fread.c:1752 msgid "" "This file is very unusual: it's one single column, ends with 2 or more end-" "of-line (representing several NA at the end), and is a multiple of 4096, too." msgstr "" -#: fread.c:1597 +#: fread.c:1753 #, c-format msgid " Copying file in RAM. %s\n" msgstr "" -#: fread.c:1603 +#: fread.c:1759 msgid "" " 1-column file ends with 2 or more end-of-line. Restoring last eol using " "extra byte in cow page.\n" msgstr "" -#: fread.c:1622 +#: fread.c:1778 msgid "" "[07] Detect column types, good nrow estimate and whether first row is column " "names\n" msgstr "" -#: fread.c:1623 +#: fread.c:1779 #, c-format msgid " 'header' changed by user from 'auto' to %s\n" msgstr "" -#: fread.c:1627 +#: fread.c:1783 #, c-format msgid "Failed to allocate 2 x %d bytes for type and tmpType: %s" msgstr "" -#: fread.c:1648 +#: fread.c:1804 #, c-format msgid " Number of sampling jump points = %d because " msgstr "" -#: fread.c:1649 +#: fread.c:1805 #, c-format msgid "nrow limit (%) supplied\n" msgstr "" -#: fread.c:1650 +#: fread.c:1806 msgid "jump0size==0\n" msgstr "" -#: fread.c:1651 +#: fread.c:1807 #, c-format msgid "" "(% bytes from row 1 to eof) / (2 * % jump0size) == " "%\n" msgstr "" -#: fread.c:1689 +#: fread.c:1845 #, c-format msgid "" " A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. " "%s\n" msgstr "" -#: fread.c:1690 +#: fread.c:1846 msgid "few" msgstr "" -#: fread.c:1690 +#: fread.c:1846 msgid "many" msgstr "" -#: fread.c:1690 +#: fread.c:1846 msgid "" "Most likely this jump landed awkwardly so type bumps here will be skipped." msgstr "" -#: fread.c:1716 +#: fread.c:1872 #, c-format msgid " Type codes (jump %03d) : %s Quote rule %d\n" msgstr "" -#: fread.c:1729 +#: fread.c:1885 #, c-format msgid "" " 'header' determined to be true due to column %d containing a string on row " "1 and a lower type (%s) in the rest of the %d sample rows\n" msgstr "" -#: fread.c:1741 +#: fread.c:1897 msgid "" "Internal error: row before first data row has the same number of fields but " "we're not using it." msgstr "" -#: fread.c:1742 +#: fread.c:1898 msgid "" "Internal error: ch!=pos after counting fields in the line before the first " "data row." msgstr "" -#: fread.c:1743 +#: fread.c:1899 #, c-format msgid "" "Types in 1st data row match types in 2nd data row but previous row has %d " "fields. Taking previous row as column names." msgstr "" -#: fread.c:1746 +#: fread.c:1902 #, c-format msgid "" "Detected %d column names but the data has %d columns (i.e. invalid file). " "Added %d extra default column name%s\n" msgstr "" -#: fread.c:1747 +#: fread.c:1903 msgid "" " for the first column which is guessed to be row names or an index. Use " "setnames() afterwards if this guess is not correct, or fix the file write " "command that created the file to create a valid file." 
msgstr "" -#: fread.c:1747 +#: fread.c:1903 msgid "s at the end." msgstr "" -#: fread.c:1749 +#: fread.c:1905 msgid "" "Internal error: fill=true but there is a previous row which should already " "have been filled." msgstr "" -#: fread.c:1750 +#: fread.c:1906 #, c-format msgid "" "Detected %d column names but the data has %d columns. Filling rows " "automatically. Set fill=TRUE explicitly to avoid this warning.\n" msgstr "" -#: fread.c:1754 +#: fread.c:1910 #, c-format msgid "Failed to realloc 2 x %d bytes for type and tmpType: %s" msgstr "" -#: fread.c:1774 +#: fread.c:1930 #, c-format msgid "" " 'header' determined to be %s because there are%s number fields in the " "first and only row\n" msgstr "" -#: fread.c:1774 +#: fread.c:1930 msgid " no" msgstr "" -#: fread.c:1777 +#: fread.c:1933 msgid "" " 'header' determined to be true because all columns are type string and a " "better guess is not possible\n" msgstr "" -#: fread.c:1779 +#: fread.c:1935 msgid "" " 'header' determined to be false because there are some number columns and " "those columns do not have a string field at the top of them\n" msgstr "" -#: fread.c:1795 +#: fread.c:1951 #, c-format msgid " Type codes (first row) : %s Quote rule %d\n" msgstr "" -#: fread.c:1804 +#: fread.c:1960 #, c-format msgid "" " All rows were sampled since file is small so we know nrow=% " "exactly\n" msgstr "" -#: fread.c:1816 fread.c:1823 +#: fread.c:1972 fread.c:1979 msgid " =====\n" msgstr "" -#: fread.c:1817 +#: fread.c:1973 #, c-format msgid "" " Sampled % rows (handled \\n inside quoted fields) at %d jump " "points\n" msgstr "" -#: fread.c:1818 +#: fread.c:1974 #, c-format msgid "" " Bytes from first data row on line %d to the end of last row: %\n" msgstr "" -#: fread.c:1819 +#: fread.c:1975 #, c-format msgid " Line length: mean=%.2f sd=%.2f min=%d max=%d\n" msgstr "" -#: fread.c:1820 +#: fread.c:1976 #, c-format msgid " Estimated number of rows: % / %.2f = %\n" msgstr "" -#: fread.c:1821 +#: fread.c:1977 #, c-format msgid "" " Initial alloc = % rows (% + %d%%) using bytes/" "max(mean-2*sd,min) clamped between [1.1*estn, 2.0*estn]\n" msgstr "" -#: fread.c:1825 +#: fread.c:1981 #, c-format msgid "Internal error: sampleLines(%) > allocnrow(%)" msgstr "" -#: fread.c:1829 +#: fread.c:1985 #, c-format msgid " Alloc limited to lower nrows=% passed in.\n" msgstr "" -#: fread.c:1841 +#: fread.c:1997 msgid "[08] Assign column names\n" msgstr "" -#: fread.c:1849 +#: fread.c:2005 #, c-format msgid "Unable to allocate %d*%d bytes for column name pointers: %s" msgstr "" -#: fread.c:1871 +#: fread.c:2027 #, c-format msgid "Internal error: reading colnames ending on '%c'" msgstr "" -#: fread.c:1889 +#: fread.c:2045 msgid "[09] Apply user overrides on column types\n" msgstr "" -#: fread.c:1893 +#: fread.c:2049 msgid " Cancelled by user: userOverride() returned false." msgstr "" -#: fread.c:1903 +#: fread.c:2059 #, c-format msgid "Failed to allocate %d bytes for size array: %s" msgstr "" -#: fread.c:1910 +#: fread.c:2066 #, c-format msgid "" "Attempt to override column %d <<%.*s>> of inherent type '%s' down to '%s' " @@ -2258,103 +2274,103 @@ msgid "" "was intended, please coerce to the lower type afterwards." 
msgstr "" -#: fread.c:1924 +#: fread.c:2080 #, c-format msgid " After %d type and %d drop user overrides : %s\n" msgstr "" -#: fread.c:1932 +#: fread.c:2088 msgid "[10] Allocate memory for the datatable\n" msgstr "" -#: fread.c:1933 +#: fread.c:2089 #, c-format msgid " Allocating %d column slots (%d - %d dropped) with % rows\n" msgstr "" -#: fread.c:1987 +#: fread.c:2143 #, c-format msgid "Buffer size % is too large\n" msgstr "" -#: fread.c:1990 +#: fread.c:2146 msgid "[11] Read the data\n" msgstr "" -#: fread.c:1993 +#: fread.c:2149 #, c-format msgid " jumps=[%d..%d), chunk_size=%, total_size=%\n" msgstr "" -#: fread.c:2005 +#: fread.c:2161 #, c-format msgid "Internal error: Master thread is not thread 0 but thread %d.\n" msgstr "" -#: fread.c:2213 +#: fread.c:2369 #, c-format msgid "" "Column %d (\"%.*s\") bumped from '%s' to '%s' due to <<%.*s>> on row " "%\n" msgstr "" -#: fread.c:2262 +#: fread.c:2418 #, c-format msgid "" "Internal error: invalid head position. jump=%d, headPos=%p, thisJumpStart=" "%p, sof=%p" msgstr "" -#: fread.c:2335 +#: fread.c:2491 #, c-format msgid "" " Too few rows allocated. Allocating additional % rows (now nrows=" "%) and continue reading from jump %d\n" msgstr "" -#: fread.c:2342 +#: fread.c:2498 #, c-format msgid " Restarting team from jump %d. nSwept==%d quoteRule==%d\n" msgstr "" -#: fread.c:2362 +#: fread.c:2518 #, c-format msgid " %d out-of-sample type bumps: %s\n" msgstr "" -#: fread.c:2398 +#: fread.c:2554 #, c-format msgid "" "Read % rows x %d columns from %s file in %02d:%06.3f wall clock " "time\n" msgstr "" -#: fread.c:2405 +#: fread.c:2561 msgid "[12] Finalizing the datatable\n" msgstr "" -#: fread.c:2406 +#: fread.c:2562 msgid " Type counts:\n" msgstr "" -#: fread.c:2408 +#: fread.c:2564 #, c-format msgid "%10d : %-9s '%c'\n" msgstr "" -#: fread.c:2424 +#: fread.c:2580 #, c-format msgid "Discarded single-line footer: <<%s>>" msgstr "" -#: fread.c:2429 +#: fread.c:2585 #, c-format msgid "" "Stopped early on line %. Expected %d fields but found %d. Consider " "fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>" msgstr "" -#: fread.c:2435 +#: fread.c:2591 #, c-format msgid "" "Found and resolved improper quoting out-of-sample. First healed line " @@ -2362,218 +2378,213 @@ msgid "" "not appear within any field), try quote=\"\" to avoid this warning." 
msgstr "" -#: fread.c:2439 +#: fread.c:2595 msgid "=============================\n" msgstr "" -#: fread.c:2441 +#: fread.c:2597 #, c-format msgid "%8.3fs (%3.0f%%) Memory map %.3fGB file\n" msgstr "" -#: fread.c:2442 +#: fread.c:2598 #, c-format msgid "%8.3fs (%3.0f%%) sep=" msgstr "" -#: fread.c:2444 +#: fread.c:2600 #, c-format msgid " ncol=%d and header detection\n" msgstr "" -#: fread.c:2445 +#: fread.c:2601 #, c-format msgid "%8.3fs (%3.0f%%) Column type detection using % sample rows\n" msgstr "" -#: fread.c:2447 +#: fread.c:2603 #, c-format msgid "" "%8.3fs (%3.0f%%) Allocation of % rows x %d cols (%.3fGB) of which " "% (%3.0f%%) rows used\n" msgstr "" -#: fread.c:2451 +#: fread.c:2607 #, c-format msgid "" "%8.3fs (%3.0f%%) Reading %d chunks (%d swept) of %.3fMB (each chunk %d rows) " "using %d threads\n" msgstr "" -#: fread.c:2453 +#: fread.c:2609 #, c-format msgid "" " + %8.3fs (%3.0f%%) Parse to row-major thread buffers (grown %d times)\n" msgstr "" -#: fread.c:2454 +#: fread.c:2610 #, c-format msgid " + %8.3fs (%3.0f%%) Transpose\n" msgstr "" -#: fread.c:2455 +#: fread.c:2611 #, c-format msgid " + %8.3fs (%3.0f%%) Waiting\n" msgstr "" -#: fread.c:2456 +#: fread.c:2612 #, c-format msgid "" "%8.3fs (%3.0f%%) Rereading %d columns due to out-of-sample type exceptions\n" msgstr "" -#: fread.c:2458 +#: fread.c:2614 #, c-format msgid "%8.3fs Total\n" msgstr "" -#: freadR.c:84 +#: freadR.c:85 msgid "" "Internal error: freadR input not a single character string: a filename or " "the data itself. Should have been caught at R level." msgstr "" -#: freadR.c:92 +#: freadR.c:93 msgid "" "Input contains a \\n or is \")\". Taking this to be text input (not a " "filename)\n" msgstr "" -#: freadR.c:95 +#: freadR.c:96 msgid "Input contains no \\n. Taking this to be a filename to open\n" msgstr "" -#: freadR.c:101 +#: freadR.c:102 msgid "" "Internal error: freadR sep not a single character. R level catches this." msgstr "" -#: freadR.c:105 +#: freadR.c:106 msgid "" "Internal error: freadR dec not a single character. R level catches this." msgstr "" -#: freadR.c:112 +#: freadR.c:113 msgid "quote= must be a single character, blank \"\", or FALSE" msgstr "" -#: freadR.c:137 +#: freadR.c:143 msgid "Internal error: skip not integer or string in freadR.c" msgstr "" -#: freadR.c:140 +#: freadR.c:146 #, c-format msgid "Internal error: NAstringsArg is type '%s'. R level catches this" msgstr "" -#: freadR.c:153 +#: freadR.c:159 #, c-format msgid "nThread(%d)<1" msgstr "" -#: freadR.c:160 +#: freadR.c:166 msgid "'integer64' must be a single character string" msgstr "" -#: freadR.c:168 +#: freadR.c:174 #, c-format msgid "" "Invalid value integer64='%s'. Must be 'integer64', 'character', 'double' or " "'numeric'" msgstr "" -#: freadR.c:176 +#: freadR.c:182 msgid "Use either select= or drop= but not both." msgstr "" -#: freadR.c:179 +#: freadR.c:185 msgid "" "select= is type list for specifying types in select=, but colClasses= has " "been provided as well. Please remove colClasses=." msgstr "" -#: freadR.c:181 +#: freadR.c:187 msgid "" "select= is type list but has no names; expecting list(type1=cols1, " "type2=cols2, ...)" msgstr "" -#: freadR.c:188 +#: freadR.c:194 msgid "" "select= is a named vector specifying the columns to select and their types, " "but colClasses= has been provided as well. Please remove colClasses=." 
msgstr "" -#: freadR.c:196 freadR.c:346 +#: freadR.c:202 freadR.c:368 msgid "colClasses is type list but has no names" msgstr "" -#: freadR.c:206 +#: freadR.c:212 #, c-format msgid "encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'" msgstr "" -#: freadR.c:229 +#: freadR.c:235 #, c-format msgid "Column name '%s' (%s) not found" msgstr "" -#: freadR.c:231 +#: freadR.c:237 #, c-format msgid "%s is NA" msgstr "" -#: freadR.c:233 +#: freadR.c:239 #, c-format msgid "%s is %d which is out of range [1,ncol=%d]" msgstr "" -#: freadR.c:247 +#: freadR.c:253 msgid "Internal error: typeSize[CT_BOOL8_N] != 1" msgstr "" -#: freadR.c:248 +#: freadR.c:254 msgid "Internal error: typeSize[CT_STRING] != 1" msgstr "" -#: freadR.c:282 +#: freadR.c:288 #, c-format msgid "" "Column name '%s' not found in column name header (case sensitive), skipping." msgstr "" -#: freadR.c:292 +#: freadR.c:298 #, c-format msgid "" "Column number %d (select[%d]) is negative but should be in the range [1,ncol=" "%d]. Consider drop= for column exclusion." msgstr "" -#: freadR.c:293 +#: freadR.c:299 #, c-format msgid "" "select = 0 (select[%d]) has no meaning. All values of select should be in " "the range [1,ncol=%d]." msgstr "" -#: freadR.c:294 +#: freadR.c:300 #, c-format msgid "" "Column number %d (select[%d]) is too large for this table, which only has %d " "columns." msgstr "" -#: freadR.c:295 +#: freadR.c:301 #, c-format msgid "Column number %d ('%s') has been selected twice by select=" msgstr "" -#: freadR.c:313 -msgid "" -"colClasses='NULL' is not permitted; i.e. to drop all columns and load nothing" -msgstr "" - -#: freadR.c:318 +#: freadR.c:324 #, c-format msgid "" "colClasses= is an unnamed vector of types, length %d, but there are %d " @@ -2582,54 +2593,54 @@ msgid "" "colClasses=. Please see examples in ?fread." msgstr "" -#: freadR.c:329 +#: freadR.c:344 msgid "Internal error: selectInts is NULL but selectColClasses is true" msgstr "" -#: freadR.c:330 +#: freadR.c:346 msgid "" "Internal error: length(selectSxp)!=length(colClassesSxp) but " "selectColClasses is true" msgstr "" -#: freadR.c:344 +#: freadR.c:366 #, c-format msgid "colClasses is type '%s' but should be list or character" msgstr "" -#: freadR.c:368 +#: freadR.c:390 #, c-format msgid "Column name '%s' (colClasses[[%d]][%d]) not found" msgstr "" -#: freadR.c:370 +#: freadR.c:392 #, c-format msgid "colClasses[[%d]][%d] is NA" msgstr "" -#: freadR.c:374 +#: freadR.c:396 #, c-format msgid "" "Column %d ('%s') appears more than once in colClasses. The second time is " "colClasses[[%d]][%d]." msgstr "" -#: freadR.c:381 +#: freadR.c:408 #, c-format msgid "Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]" msgstr "" -#: freadR.c:583 +#: freadR.c:624 #, c-format msgid "Field size is 1 but the field is of type %d\n" msgstr "" -#: freadR.c:592 +#: freadR.c:633 #, c-format msgid "Internal error: unexpected field of size %d\n" msgstr "" -#: freadR.c:660 +#: freadR.c:701 #, c-format msgid "%s" msgstr "" @@ -2747,7 +2758,7 @@ msgid "n must be integer vector or list of integer vectors" msgstr "" #: frollR.c:104 gsumm.c:342 gsumm.c:577 gsumm.c:686 gsumm.c:805 gsumm.c:950 -#: gsumm.c:1261 gsumm.c:1402 uniqlist.c:350 +#: gsumm.c:1261 gsumm.c:1402 uniqlist.c:351 msgid "na.rm must be TRUE or FALSE" msgstr "" @@ -2798,7 +2809,7 @@ msgid "" "caught before. please report to data.table issue tracker." 
msgstr "" -#: frollR.c:155 frollR.c:279 nafill.c:152 shift.c:21 +#: frollR.c:155 frollR.c:279 nafill.c:162 shift.c:21 msgid "fill must be a vector of length 1" msgstr "" @@ -3068,15 +3079,16 @@ msgstr "" #: fwriteR.c:98 #, c-format msgid "" -"Row %d of list column is type '%s' - not yet implemented. fwrite() can write " -"list columns containing items which are atomic vectors of type logical, " -"integer, integer64, double, complex and character." +"Row % of list column is type '%s' - not yet implemented. fwrite() " +"can write list columns containing items which are atomic vectors of type " +"logical, integer, integer64, double, complex and character." msgstr "" #: fwriteR.c:103 #, c-format msgid "" -"Internal error: row %d of list column has no max length method implemented" +"Internal error: row % of list column has no max length method " +"implemented" msgstr "" #: fwriteR.c:170 @@ -3088,30 +3100,31 @@ msgstr "" msgid "fwrite was passed an empty list of no columns. Nothing to write." msgstr "" -#: fwriteR.c:234 +#: fwriteR.c:232 #, c-format -msgid "Column %d's length (%d) is not the same as column 1's length (%d)" +msgid "" +"Column %d's length (%d) is not the same as column 1's length (%)" msgstr "" -#: fwriteR.c:237 +#: fwriteR.c:236 #, c-format msgid "Column %d's type is '%s' - not yet implemented in fwrite." msgstr "" -#: fwriteR.c:262 +#: fwriteR.c:261 msgid "" "No list columns are present. Setting sep2='' otherwise quote='auto' would " "quote fields containing sep2.\n" msgstr "" -#: fwriteR.c:266 +#: fwriteR.c:265 #, c-format msgid "" "If quote='auto', fields will be quoted if the field contains either sep " "('%c') or sep2 ('%c') because column %d is a list column.\n" msgstr "" -#: fwriteR.c:270 +#: fwriteR.c:269 #, c-format msgid "" "sep ('%c'), sep2 ('%c') and dec ('%c') must all be different. Column %d is a " @@ -3516,156 +3529,156 @@ msgstr "" msgid "Final step, fetching indices in overlaps ... done in %8.3f seconds\n" msgstr "" -#: init.c:233 +#: init.c:239 #, c-format msgid "" "Pointers are %d bytes, greater than 8. We have not tested on any " "architecture greater than 64bit yet." 
msgstr "" -#: init.c:247 +#: init.c:253 #, c-format msgid "Checking NA_INTEGER [%d] == INT_MIN [%d] %s" msgstr "" -#: init.c:248 +#: init.c:254 #, c-format msgid "Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s" msgstr "" -#: init.c:249 +#: init.c:255 #, c-format msgid "Checking sizeof(int) [%d] is 4 %s" msgstr "" -#: init.c:250 +#: init.c:256 #, c-format msgid "Checking sizeof(double) [%d] is 8 %s" msgstr "" -#: init.c:252 +#: init.c:258 #, c-format msgid "Checking sizeof(long long) [%d] is 8 %s" msgstr "" -#: init.c:253 +#: init.c:259 #, c-format msgid "Checking sizeof(pointer) [%d] is 4 or 8 %s" msgstr "" -#: init.c:254 +#: init.c:260 #, c-format msgid "Checking sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s" msgstr "" -#: init.c:255 +#: init.c:261 #, c-format msgid "Checking sizeof(uint64_t) [%d] is 8 %s" msgstr "" -#: init.c:256 +#: init.c:262 #, c-format msgid "Checking sizeof(int64_t) [%d] is 8 %s" msgstr "" -#: init.c:257 +#: init.c:263 #, c-format msgid "Checking sizeof(signed char) [%d] is 1 %s" msgstr "" -#: init.c:258 +#: init.c:264 #, c-format msgid "Checking sizeof(int8_t) [%d] is 1 %s" msgstr "" -#: init.c:259 +#: init.c:265 #, c-format msgid "Checking sizeof(uint8_t) [%d] is 1 %s" msgstr "" -#: init.c:260 +#: init.c:266 #, c-format msgid "Checking sizeof(int16_t) [%d] is 2 %s" msgstr "" -#: init.c:261 +#: init.c:267 #, c-format msgid "Checking sizeof(uint16_t) [%d] is 2 %s" msgstr "" -#: init.c:264 +#: init.c:270 #, c-format msgid "Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s" msgstr "" -#: init.c:265 +#: init.c:271 #, c-format msgid "Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s" msgstr "" -#: init.c:272 +#: init.c:278 #, c-format msgid "Checking memset(&i,0,sizeof(int)); i == (int)0 %s" msgstr "" -#: init.c:275 +#: init.c:281 #, c-format msgid "Checking memset(&ui, 0, sizeof(unsigned int)); ui == (unsigned int)0 %s" msgstr "" -#: init.c:278 +#: init.c:284 #, c-format msgid "Checking memset(&d, 0, sizeof(double)); d == (double)0.0 %s" msgstr "" -#: init.c:281 +#: init.c:287 #, c-format msgid "Checking memset(&ld, 0, sizeof(long double)); ld == (long double)0.0 %s" msgstr "" -#: init.c:284 +#: init.c:290 msgid "The ascii character '/' is not just before '0'" msgstr "" -#: init.c:285 +#: init.c:291 msgid "The C expression (uint_fast8_t)('/'-'0')<10 is true. Should be false." msgstr "" -#: init.c:286 +#: init.c:292 msgid "The ascii character ':' is not just after '9'" msgstr "" -#: init.c:287 +#: init.c:293 msgid "The C expression (uint_fast8_t)('9'-':')<10 is true. Should be false." msgstr "" -#: init.c:292 +#: init.c:298 #, c-format msgid "Conversion of NA_INT64 via double failed %!=%" msgstr "" -#: init.c:296 +#: init.c:302 msgid "NA_INT64_D (negative -0.0) is not == 0.0." msgstr "" -#: init.c:297 +#: init.c:303 msgid "NA_INT64_D (negative -0.0) is not ==-0.0." msgstr "" -#: init.c:298 +#: init.c:304 msgid "ISNAN(NA_INT64_D) is TRUE but should not be" msgstr "" -#: init.c:299 +#: init.c:305 msgid "isnan(NA_INT64_D) is TRUE but should not be" msgstr "" -#: init.c:328 +#: init.c:337 #, c-format msgid "PRINTNAME(install(\"integer64\")) has returned %s not %s" msgstr "" -#: init.c:397 +#: init.c:408 msgid ".Last.value in namespace is not a length 1 integer" msgstr "" @@ -3679,111 +3692,111 @@ msgstr "" msgid "'x' argument must be numeric type, or list/data.table of numeric types" msgstr "" -#: nafill.c:149 nafill.c:180 +#: nafill.c:159 nafill.c:190 msgid "" "Internal error: invalid type argument in nafillR function, should have been " "caught before. 
Please report to data.table issue tracker." msgstr "" -#: nafill.c:196 +#: nafill.c:206 #, c-format msgid "%s: parallel processing of %d column(s) took %.3fs\n" msgstr "" -#: openmp-utils.c:22 +#: openmp-utils.c:23 #, c-format msgid "" -"Ignoring invalid %s==\")%s\". Not an integer >= 1. Please remove any " +"Ignoring invalid %s==\"%s\". Not an integer >= 1. Please remove any " "characters that are not a digit [0-9]. See ?data.table::setDTthreads." msgstr "" -#: openmp-utils.c:40 +#: openmp-utils.c:44 #, c-format msgid "" "Ignoring invalid R_DATATABLE_NUM_PROCS_PERCENT==%d. If used it must be an " "integer between 2 and 100. Default is 50. See ?setDTtheads." msgstr "" -#: openmp-utils.c:67 +#: openmp-utils.c:78 msgid "'verbose' must be TRUE or FALSE" msgstr "" -#: openmp-utils.c:70 +#: openmp-utils.c:81 msgid "" "This installation of data.table has not been compiled with OpenMP support.\n" msgstr "" -#: openmp-utils.c:75 +#: openmp-utils.c:86 #, c-format msgid " omp_get_num_procs() %d\n" msgstr "" -#: openmp-utils.c:76 +#: openmp-utils.c:87 #, c-format msgid " R_DATATABLE_NUM_PROCS_PERCENT %s\n" msgstr "" -#: openmp-utils.c:77 +#: openmp-utils.c:88 #, c-format msgid " R_DATATABLE_NUM_THREADS %s\n" msgstr "" -#: openmp-utils.c:78 +#: openmp-utils.c:89 +#, c-format +msgid " R_DATATABLE_THROTTLE %s\n" +msgstr "" + +#: openmp-utils.c:90 #, c-format msgid " omp_get_thread_limit() %d\n" msgstr "" -#: openmp-utils.c:79 +#: openmp-utils.c:91 #, c-format msgid " omp_get_max_threads() %d\n" msgstr "" -#: openmp-utils.c:80 +#: openmp-utils.c:92 #, c-format msgid " OMP_THREAD_LIMIT %s\n" msgstr "" -#: openmp-utils.c:81 +#: openmp-utils.c:93 #, c-format msgid " OMP_NUM_THREADS %s\n" msgstr "" -#: openmp-utils.c:82 +#: openmp-utils.c:94 #, c-format msgid " RestoreAfterFork %s\n" msgstr "" -#: openmp-utils.c:83 +#: openmp-utils.c:95 #, c-format -msgid " data.table is using %d threads. See ?setDTthreads.\n" +msgid "" +" data.table is using %d threads with throttle==%d. See ?setDTthreads.\n" msgstr "" -#: openmp-utils.c:91 +#: openmp-utils.c:103 msgid "" "restore_after_fork= must be TRUE, FALSE, or NULL (default). " "getDTthreads(verbose=TRUE) reports the current setting.\n" msgstr "" -#: openmp-utils.c:105 -#, c-format -msgid "" -"threads= must be either NULL (default) or a single number. It has length %d" -msgstr "" - -#: openmp-utils.c:107 -msgid "threads= must be either NULL (default) or type integer/numeric" +#: openmp-utils.c:109 +msgid "'throttle' must be a single number, non-NA, and >=1" msgstr "" -#: openmp-utils.c:109 +#: openmp-utils.c:123 msgid "" -"threads= must be either NULL or a single integer >= 0. See ?setDTthreads." +"threads= must be either NULL or a single number >= 0. See ?setDTthreads." msgstr "" -#: openmp-utils.c:114 +#: openmp-utils.c:127 msgid "Internal error: percent= must be TRUE or FALSE at C level" msgstr "" -#: openmp-utils.c:117 +#: openmp-utils.c:130 #, c-format msgid "" "Internal error: threads==%d should be between 2 and 100 (percent=TRUE at C " @@ -4015,15 +4028,20 @@ msgstr "" msgid "nrow(x)[%d]!=length(order)[%d]" msgstr "" -#: reorder.c:48 +#: reorder.c:51 #, c-format -msgid "order is not a permutation of 1:nrow[%d]" +msgid "" +"Item %d of order (%d) is either NA, out of range [1,%d], or is duplicated. 
" +"The new order must be a strict permutation of 1:n" +msgstr "" + +#: reorder.c:105 +msgid "dt passed to setcolorder has no names" msgstr "" -#: reorder.c:57 +#: reorder.c:107 #, c-format -msgid "" -"Unable to allocate %d * %d bytes of working memory for reordering data.table" +msgid "Internal error: dt passed to setcolorder has %d columns but %d names" msgstr "" #: shift.c:17 @@ -4057,98 +4075,98 @@ msgstr "" msgid "Internal error: subsetVectorRaw length(ans)==%d n=%d" msgstr "" -#: subset.c:88 +#: subset.c:101 #, c-format msgid "" "Internal error: column type '%s' not supported by data.table subset. All " "known types are supported so please report as bug." msgstr "" -#: subset.c:97 subset.c:121 +#: subset.c:110 subset.c:134 #, c-format msgid "Internal error. 'idx' is type '%s' not 'integer'" msgstr "" -#: subset.c:122 +#: subset.c:135 #, c-format msgid "" "Internal error. 'maxArg' is type '%s' and length %d, should be an integer " "singleton" msgstr "" -#: subset.c:123 +#: subset.c:136 msgid "Internal error: allowOverMax must be TRUE/FALSE" msgstr "" -#: subset.c:125 +#: subset.c:138 #, c-format msgid "Internal error. max is %d, must be >= 0." msgstr "" -#: subset.c:149 +#: subset.c:162 #, c-format msgid "i[%d] is %d which is out of range [1,nrow=%d]" msgstr "" -#: subset.c:161 +#: subset.c:174 #, c-format msgid "" "Item %d of i is %d and item %d is %d. Cannot mix positives and negatives." msgstr "" -#: subset.c:171 +#: subset.c:184 #, c-format msgid "Item %d of i is %d and item %d is NA. Cannot mix negatives and NA." msgstr "" -#: subset.c:207 +#: subset.c:220 #, c-format msgid "" "Item %d of i is %d but there are only %d rows. Ignoring this and %d more " "like it out of %d." msgstr "" -#: subset.c:209 +#: subset.c:222 #, c-format msgid "" "Item %d of i is %d which removes that item but that has occurred before. " "Ignoring this dup and %d other dups." msgstr "" -#: subset.c:223 +#: subset.c:236 #, c-format msgid "Column %d is NULL; malformed data.table." msgstr "" -#: subset.c:226 +#: subset.c:239 #, c-format msgid "Column %d ['%s'] is a data.frame or data.table; malformed data.table." msgstr "" -#: subset.c:231 +#: subset.c:244 #, c-format msgid "" "Column %d ['%s'] is length %d but column 1 is length %d; malformed data." "table." msgstr "" -#: subset.c:247 +#: subset.c:260 #, c-format msgid "Internal error. Argument 'x' to CsubsetDT is type '%s' not 'list'" msgstr "" -#: subset.c:260 +#: subset.c:273 #, c-format msgid "Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'" msgstr "" -#: subset.c:337 +#: subset.c:350 msgid "" "Internal error: NULL can not be subset. It is invalid for a data.table to " "contain a NULL column." 
msgstr "" -#: subset.c:339 +#: subset.c:352 msgid "" "Internal error: CsubsetVector is internal-use-only but has received " "negatives, zeros or out-of-range" @@ -4198,118 +4216,118 @@ msgstr "" msgid "Internal error: uniqlist has been passed length(order)==%d but nrow==%d" msgstr "" -#: uniqlist.c:96 uniqlist.c:127 uniqlist.c:208 uniqlist.c:245 uniqlist.c:318 +#: uniqlist.c:96 uniqlist.c:128 uniqlist.c:209 uniqlist.c:246 uniqlist.c:319 #, c-format msgid "Type '%s' not supported" msgstr "" -#: uniqlist.c:148 +#: uniqlist.c:149 msgid "Input argument 'x' to 'uniqlengths' must be an integer vector" msgstr "" -#: uniqlist.c:149 +#: uniqlist.c:150 msgid "" "Input argument 'n' to 'uniqlengths' must be an integer vector of length 1" msgstr "" -#: uniqlist.c:167 +#: uniqlist.c:168 msgid "cols must be an integer vector with length >= 1" msgstr "" -#: uniqlist.c:171 +#: uniqlist.c:172 #, c-format msgid "Item %d of cols is %d which is outside range of l [1,length(l)=%d]" msgstr "" -#: uniqlist.c:174 +#: uniqlist.c:175 #, c-format msgid "" "All elements to input list must be of same length. Element [%d] has length " "% != length of first element = %." msgstr "" -#: uniqlist.c:255 +#: uniqlist.c:256 msgid "Internal error: nestedid was not passed a list length 1 or more" msgstr "" -#: uniqlist.c:262 +#: uniqlist.c:263 #, c-format msgid "Internal error: nrows[%d]>0 but ngrps==0" msgstr "" -#: uniqlist.c:264 +#: uniqlist.c:265 msgid "cols must be an integer vector of positive length" msgstr "" -#: uniqlist.c:349 +#: uniqlist.c:350 msgid "x is not a logical vector" msgstr "" -#: utils.c:73 +#: utils.c:80 #, c-format msgid "Unsupported type '%s' passed to allNA()" msgstr "" -#: utils.c:92 +#: utils.c:99 msgid "'x' argument must be data.table compatible" msgstr "" -#: utils.c:94 +#: utils.c:101 msgid "'check_dups' argument must be TRUE or FALSE" msgstr "" -#: utils.c:110 +#: utils.c:117 msgid "" "argument specifying columns is type 'double' and one or more items in it are " "not whole integers" msgstr "" -#: utils.c:116 +#: utils.c:123 #, c-format msgid "argument specifying columns specify non existing column(s): cols[%d]=%d" msgstr "" -#: utils.c:121 +#: utils.c:128 msgid "'x' argument data.table has no names" msgstr "" -#: utils.c:126 +#: utils.c:133 #, c-format msgid "" "argument specifying columns specify non existing column(s): cols[%d]='%s'" msgstr "" -#: utils.c:129 +#: utils.c:136 msgid "argument specifying columns must be character or numeric" msgstr "" -#: utils.c:132 +#: utils.c:139 msgid "argument specifying columns specify duplicated column(s)" msgstr "" -#: utils.c:138 +#: utils.c:145 #, c-format msgid "%s: fill argument must be length 1" msgstr "" -#: utils.c:171 +#: utils.c:178 #, c-format msgid "%s: fill argument must be numeric" msgstr "" -#: utils.c:273 +#: utils.c:280 #, c-format msgid "Internal error: unsupported type '%s' passed to copyAsPlain()" msgstr "" -#: utils.c:277 +#: utils.c:284 #, c-format msgid "" "Internal error: type '%s' passed to copyAsPlain() but it seems " "copyMostAttrib() retains ALTREP attributes" msgstr "" -#: utils.c:312 +#: utils.c:319 #, c-format msgid "Found and copied %d column%s with a shared memory address\n" msgstr "" diff --git a/po/zh_CN.po b/po/zh_CN.po index 6a95727f07..3965a017e4 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -2,7 +2,7 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.5\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2019-12-30 01:24+0800\n" +"POT-Creation-Date: 2020-07-17 14:38+0800\n" "PO-Revision-Date: 2019-11-18 
00:26-04\n" "Last-Translator: Yuhang Chen \n" "Language-Team: Mandarin\n" @@ -44,19 +44,19 @@ msgstr "内部错误: .internal.selfref ptr不为NULL或R_NilValue" msgid "Internal error: .internal.selfref tag isn't NULL or a character vector" msgstr "内部错误: .internal.selfref ptr不为NULL或字符向量" -#: assign.c:168 +#: assign.c:180 msgid "Internal error: length(names)>0 but =0 and not NA." msgstr "getOption('datatable.alloc')值为%d, 其必须大于等于零且不能为NA" -#: assign.c:239 fsort.c:109 +#: assign.c:251 fsort.c:109 msgid "verbose must be TRUE or FALSE" msgstr "verbose参数必须为TRUE或FALSE" -#: assign.c:287 +#: assign.c:299 msgid "assign has been passed a NULL dt" msgstr "赋值已经被传递给一个空的(NULL)dt" -#: assign.c:288 +#: assign.c:300 msgid "dt passed to assign isn't type VECSXP" msgstr "传递给赋值操作的dt不是VECSXP类型" -#: assign.c:290 +#: assign.c:302 msgid "" ".SD is locked. Updating .SD by reference using := or set are reserved for " "future use. Use := in j directly. Or use copy(.SD) as a (slow) last resort, " @@ -151,20 +151,20 @@ msgstr "" ".SD被锁定。 使用':='更新.SD操作保留将来使用对'j'直接使用':=', 或可以使用" "copy(.SD), 直到导出shallow()" -#: assign.c:298 +#: assign.c:310 msgid "Internal error: dt passed to Cassign is not a data.table or data.frame" msgstr "内部错误: 传递给赋值操作的dt不是data.table或data.frame类型" -#: assign.c:302 +#: assign.c:314 msgid "dt passed to assign has no names" msgstr "传递给赋值操作的dt没有命名" -#: assign.c:304 +#: assign.c:316 #, c-format msgid "Internal error in assign: length of names (%d) is not length of dt (%d)" msgstr "赋值的内部错误: names的长度(%d)与dt的长度(%d)不匹配" -#: assign.c:306 +#: assign.c:318 msgid "" "data.table is NULL; malformed. A null data.table should be an empty list. " "typeof() should always return 'list' for data.table." @@ -172,18 +172,18 @@ msgstr "" "data.table为空, 格式错误,一个null的data.table应该为空的列表list即对data." "table使用typeof()函数应该返回'list'类型" -#: assign.c:315 +#: assign.c:327 #, c-format msgid "Assigning to all %d rows\n" msgstr "为所有的%d行赋值\n" -#: assign.c:320 +#: assign.c:332 msgid "" "Coerced i from numeric to integer. Please pass integer for efficiency; e.g., " "2L rather than 2" msgstr "将i由数值型强制转换为整数型。请直接传入整数以提高效率,如传入2L而非2" -#: assign.c:323 +#: assign.c:335 #, c-format msgid "" "i is type '%s'. Must be integer, or numeric is coerced with warning. If i is " @@ -194,26 +194,26 @@ msgstr "" "整型并发出警告)。如果 i 为一个用于筛选的逻辑(logical)向量,请直接将它传给 " "which(),且如果可能的话将 which() 放置于循环之外以保持高效。" -#: assign.c:329 +#: assign.c:341 #, c-format msgid "i[%d] is %d which is out of range [1,nrow=%d]." msgstr "i[%d] 为 %d 且超出了范围 [1,nrow=%d]。" -#: assign.c:332 +#: assign.c:344 #, c-format msgid "Assigning to %d row subset of %d rows\n" msgstr "正在为 %d 行(总数为 %d 行)进行赋值\n" -#: assign.c:340 +#: assign.c:352 #, c-format msgid "Added %d new column%s initialized with all-NA\n" msgstr "添加了 %d 个新列 %s 并全部初始化为 NA\n" -#: assign.c:345 +#: assign.c:357 msgid "length(LHS)==0; no columns to delete or assign RHS to." msgstr "左手侧长度为0(length(LHS)==0);没有列可供删除或赋值给右手侧(RHS)。" -#: assign.c:359 +#: assign.c:371 msgid "" "set() on a data.frame is for changing existing columns, not adding new ones. " "Please use a data.table for that. data.table's are over-allocated and don't " @@ -223,7 +223,7 @@ msgstr "" "table 来添加新列。data.table 的操作是超额分配的(over-allocated)并且不进行浅" "拷贝(shallow copy)。" -#: assign.c:370 +#: assign.c:382 msgid "" "Coerced j from numeric to integer. Please pass integer for efficiency; e.g., " "2L rather than 2" @@ -231,7 +231,7 @@ msgstr "" "将 j 从数值(numeric)型自动转换为整(integer)型。为了保持高效请直接传入整" "型,如2L 而非 2" -#: assign.c:373 +#: assign.c:385 #, c-format msgid "" "j is type '%s'. 
Must be integer, character, or numeric is coerced with " @@ -240,22 +240,22 @@ msgstr "" "j 为 '%s' 型。j 必须为整(integer)型、字符(character)型,或数值(numeric)" "型(将被自动转换成整型并发出警告)。" -#: assign.c:375 +#: assign.c:387 msgid "" "Can't assign to the same column twice in the same query (duplicates " "detected)." msgstr "在一次查询中无法对同一列赋值两次(检测出重复项)。" -#: assign.c:376 +#: assign.c:388 msgid "newcolnames is supplied but isn't a character vector" msgstr "指定了 newcolnames 但其并非一字符串向量" -#: assign.c:378 +#: assign.c:390 #, c-format msgid "RHS_list_of_columns == %s\n" msgstr "RHS_list_of_columns == %s\n" -#: assign.c:383 +#: assign.c:395 #, c-format msgid "" "RHS_list_of_columns revised to true because RHS list has 1 item which is " @@ -264,7 +264,7 @@ msgstr "" "RHS_list_of_columns 改为真(True),因为右手侧列表(RHS list)有一子项为空值" "(NULL)或长度 %d 为 1 或 targetlen(%d)。请拆开右手侧。\n" -#: assign.c:388 +#: assign.c:400 #, c-format msgid "" "Supplied %d columns to be assigned an empty list (which may be an empty data." @@ -275,19 +275,19 @@ msgstr "" "后两者也是列表的一种)。删除多个列时请使用空值(NULL)。添加多个空列表列" "(list columns)时,请使用 list(list())。" -#: assign.c:393 +#: assign.c:405 #, c-format msgid "Recycling single RHS list item across %d columns. Please unwrap RHS.\n" msgstr "" "回收重用(Recycling)单个右手侧(RHS)列表子项于 %d 列。请拆开右手侧。\n" -#: assign.c:395 +#: assign.c:407 #, c-format msgid "" "Supplied %d columns to be assigned %d items. Please see NEWS for v1.12.2." msgstr "试图将 %2$d 项赋值给 %1$d 列。请阅读 v1.12.2 的更新信息(NEWS)。" -#: assign.c:403 +#: assign.c:415 #, c-format msgid "" "Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. " @@ -297,7 +297,7 @@ msgstr "" "j 中的列编号里第 %d 项是 %d,超出了有效范围 [1,ncol=%d]。数据框(data.frame)" "的 set() 是用于修改现有列,而非添加新列。请使用 data.table 来添加新列。" -#: assign.c:404 +#: assign.c:416 #, c-format msgid "" "Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. Use " @@ -306,11 +306,11 @@ msgstr "" "j 中的列编号里第 %d 项是 %d,超出了有效范围 [1,ncol=%d]。请在 j 中使用列名来" "添加新列。" -#: assign.c:409 +#: assign.c:421 msgid "When deleting columns, i should not be provided" msgstr "当删除列时,不应指定 i" -#: assign.c:415 +#: assign.c:427 #, c-format msgid "" "RHS of assignment to existing column '%s' is zero length but not NULL. If " @@ -326,23 +326,23 @@ msgstr "" "一个与该列原数据等长的向量,如 vector('list',nrow(DT)),即,用新数据替换" "(plonk)重新生成该列。" -#: assign.c:420 +#: assign.c:432 #, c-format msgid "" "Internal error in assign.c: length(newcolnames)=%d, length(names)=%d, coln=%d" msgstr "assign.c 内部错误:length(newcolnames)=%d, length(names)=%d, coln=%d" -#: assign.c:422 +#: assign.c:434 #, c-format msgid "Column '%s' does not exist to remove" msgstr "要删除的列 '%s' 不存在" -#: assign.c:428 +#: assign.c:440 #, c-format msgid "%d column matrix RHS of := will be treated as one vector" msgstr "':=' 右手侧(RHS)%d 列矩阵将被视为一维向量" -#: assign.c:432 +#: assign.c:444 #, c-format msgid "" "Can't assign to column '%s' (type 'factor') a value of type '%s' (not " @@ -351,7 +351,7 @@ msgstr "" "无法给因子(factor)类型列 '%s' 赋类型为 '%s' 的值(不是字符(character)、因" "子(factor)、整数(integer)或数值(numeric)类中的一种)" -#: assign.c:437 +#: assign.c:449 #, c-format msgid "" "Supplied %d items to be assigned to %d items of column '%s'. If you wish to " @@ -361,7 +361,7 @@ msgstr "" "试图将 %d 项赋值给 %d 项(列 '%s')。如果想'回收重用'('recycle')右手侧,请" "使用 rep() 以将该意图清晰地表述给阅读代码的人。" -#: assign.c:447 +#: assign.c:459 msgid "" "This data.table has either been loaded from disk (e.g. using readRDS()/" "load()) or constructed manually (e.g. using structure()). 
Please run setDT() " @@ -372,7 +372,7 @@ msgstr "" "structure() )。在通过引用的方式进行赋值前,请先运行 setDT() 或 setalloccol() " "来为增加的列预先分配空间" -#: assign.c:448 +#: assign.c:460 #, c-format msgid "" "Internal error: oldtncol(%d) < oldncol(%d). Please report to data.table " @@ -381,7 +381,7 @@ msgstr "" "内部错误: oldtncol(%d) < oldncol(%d)。 请将此问题汇报给 data.table 问题追踪" "器,包括 sessionInfo() 的信息。" -#: assign.c:450 +#: assign.c:462 #, c-format msgid "" "truelength (%d) is greater than 10,000 items over-allocated (length = %d). " @@ -393,7 +393,7 @@ msgstr "" "truelength。如果你没有将 datatable.alloccol 设置为非常大的数值,请将此问题汇" "报给 data.table 问题追踪器,包含 sessionInfo() 的信息" -#: assign.c:452 +#: assign.c:464 #, c-format msgid "" "Internal error: DT passed to assign has not been allocated enough column " @@ -401,7 +401,7 @@ msgid "" msgstr "" "内部错误: 传递出去赋值的 DT 没有被分配足够的列槽。 l=%d, tl=%d, 增加 %d" -#: assign.c:454 +#: assign.c:466 msgid "" "It appears that at some earlier point, names of this data.table have been " "reassigned. Please ensure to use setnames() rather than names<- or " @@ -411,18 +411,18 @@ msgstr "" "names<- 或 colnames<- 进行赋值。如果该办法无效,请将此问题汇报给 data.table " "问题追踪器,包含 sessionInfo() 的信息" -#: assign.c:458 +#: assign.c:470 #, c-format msgid "Internal error: selfrefnames is ok but tl names [%d] != tl [%d]" msgstr "内部错误: selfrefnames 正确,但 tl 的名称 [%d] != tl [%d]" -#: assign.c:469 +#: assign.c:481 msgid "" "Internal error: earlier error 'When deleting columns, i should not be " "provided' did not happen." msgstr "内部错误: 前期的错误 '当删除列的时候,不应该提供参数 i ' 没有发生" -#: assign.c:480 +#: assign.c:492 #, c-format msgid "" "RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d, but " @@ -431,12 +431,12 @@ msgstr "" "因为 NAMED==%d MAYBE_SHARED==%d, 所以条目 %d 的 RHS 已经被复制,但是接下来又" "要被替换了。length(values)==%d; length(cols)==%d)\n" -#: assign.c:485 +#: assign.c:497 #, c-format msgid "Direct plonk of unnamed RHS, no copy. NAMED==%d, MAYBE_SHARED==%d\n" msgstr "直接替换没有名字的 RHS,并没有复制。 NAMED==%d, MAYBE_SHARED==%d\n" -#: assign.c:554 +#: assign.c:566 #, c-format msgid "" "Dropping index '%s' as it doesn't have '__' at the beginning of its name. It " @@ -445,38 +445,50 @@ msgstr "" "丢掉索引 '%s' 因为它的名字前面没有 '__' 。这个很可能是 data.table v1.9.4 创建" "的\n" -#: assign.c:562 +#: assign.c:574 msgid "Internal error: index name ends with trailing __" msgstr "内部错误: 索引名称以 __ 结尾" -#: assign.c:567 +#: assign.c:579 msgid "Internal error: Couldn't allocate memory for s4." msgstr "内部错误: 不能给 s4 分配内存" -#: assign.c:578 +#: assign.c:590 msgid "Internal error: Couldn't allocate memory for s5." msgstr "内部错误: 不能给 s5 分配内存" -#: assign.c:599 assign.c:615 +#: assign.c:611 assign.c:627 #, c-format msgid "Dropping index '%s' due to an update on a key column\n" msgstr " 因为一个主列的更新,丢掉索引 '%s'\n" -#: assign.c:608 +#: assign.c:620 #, c-format msgid "Shortening index '%s' to '%s' due to an update on a key column\n" msgstr "因为一个主列的更新,缩短索引 '%s' 到 '%s'\n" -#: assign.c:680 +#: assign.c:695 +#, c-format +msgid "" +"Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d" +msgstr "" +"memrecycle 内部错误:sourceStart=%d sourceLen=%d length(source)=%d" + +#: assign.c:697 +#, c-format +msgid "Internal error memrecycle: start=%d len=%d length(target)=%d" +msgstr "memrecycle 内部错误:start=%d len=%d length(target)=%d" + +#: assign.c:700 #, c-format msgid "Internal error: recycle length error not caught earlier. 
slen=%d len=%d" msgstr "内部错误: 早期未被发现的循环长度错误 slen=%d len=%d" -#: assign.c:684 +#: assign.c:704 msgid "Internal error: memrecycle has received NULL colname" msgstr "内部错误: memrecycle 接受到的列名为 NULL " -#: assign.c:710 +#: assign.c:730 #, c-format msgid "" "Cannot assign 'factor' to '%s'. Factors can only be assigned to factor, " @@ -484,14 +496,14 @@ msgid "" msgstr "" "不能将 'factor' 赋值为 '%s' 。因子类型只能赋值为因子,字符或者列表其中的列" -#: assign.c:724 +#: assign.c:744 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %d is outside the " "level range [1,%d]" msgstr "将列 %d 名称为 '%s' 赋值为因子。但是 %d 在层次范围[1,%d]之外" -#: assign.c:732 +#: assign.c:752 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %f is outside the " @@ -500,7 +512,7 @@ msgstr "" "将列 %d 名称为 '%s' 赋值为因子。但是 %f 在层次范围[1,%d]之外,或者不是一个完" "整的数字" -#: assign.c:738 +#: assign.c:758 #, c-format msgid "" "Cannot assign '%s' to 'factor'. Factor columns can be assigned factor, " @@ -508,28 +520,28 @@ msgid "" msgstr "" "不能将 'factor' 赋值为 '%s' 。 因子列可被赋值为因子,字符 ,NA 或者 层次数值" -#: assign.c:759 +#: assign.c:779 msgid "" "Internal error: levels of target are either not unique or have truelength<0" msgstr "内部错误: 目标的层次不是唯一或者长度<0" -#: assign.c:798 +#: assign.c:818 #, c-format msgid "Unable to allocate working memory of %d bytes to combine factor levels" msgstr "不能分配 %d 字节的工作内存来组合因子层次" -#: assign.c:805 +#: assign.c:825 msgid "Internal error: extra level check sum failed" msgstr "内部错误: 额外的层次校验和失败" -#: assign.c:824 +#: assign.c:844 #, c-format msgid "" "Coercing 'character' RHS to '%s' to match the type of the target column " "(column %d named '%s')." msgstr "将'character' RHS 强制转换成 '%s' 来匹配目标列的类型(列 %d 名称 '%s')" -#: assign.c:830 +#: assign.c:850 #, c-format msgid "" "Cannot coerce 'list' RHS to 'integer64' to match the type of the target " @@ -537,40 +549,40 @@ msgid "" msgstr "" "不能将'list' RHS 强制转换成 'integer64' 来匹配目标列的类型(列 %d 名称 '%s')" -#: assign.c:835 +#: assign.c:855 #, c-format msgid "" "Coercing 'list' RHS to '%s' to match the type of the target column (column " "%d named '%s')." msgstr "将'list' RHS 强制转换成 '%s' 来匹配目标列的类型(列 %d 名称 '%s')" -#: assign.c:841 +#: assign.c:861 #, c-format msgid "Zero-copy coerce when assigning '%s' to '%s' column %d named '%s'.\n" msgstr "当 '%s' 赋值成 '%s' 列 %d 名称 '%s',进行Zero-copy强制转换。\n" -#: assign.c:936 +#: assign.c:956 #, c-format msgid "type '%s' cannot be coerced to '%s'" msgstr "类型 '%s' 不能强制转换成 '%s'" -#: assign.c:1056 +#: assign.c:1076 msgid "" "To assign integer64 to a character column, please use as.character() for " "clarity." msgstr "请使用 as.character() 把 integer64 类型的数值赋值给字符列" -#: assign.c:1068 +#: assign.c:1088 #, c-format msgid "Unsupported column type in assign.c:memrecycle '%s'" msgstr "assign.c:memrecycle '%s' 里有不支持的列的类型" -#: assign.c:1115 +#: assign.c:1135 #, c-format msgid "Internal error: writeNA passed a vector of type '%s'" msgstr "内部错误:writeNA 函数读取到了一个类型是'%s'的向量" -#: assign.c:1146 +#: assign.c:1166 #, c-format msgid "" "Internal error: savetl_init checks failed (%d %d %p %p). please report to " @@ -579,12 +591,12 @@ msgstr "" "内部错误:savetl_init的校验失败 (%d %d %p %p),请将此问题汇报给data.table 问" "题追踪器。" -#: assign.c:1154 +#: assign.c:1174 #, c-format msgid "Failed to allocate initial %d items in savetl_init" msgstr "不能为 savetl_init 最开始的 %d 个项分配空间" -#: assign.c:1163 +#: assign.c:1183 #, c-format msgid "" "Internal error: reached maximum %d items for savetl. Please report to data." 
@@ -593,58 +605,40 @@ msgstr "" "内部错误:已经达到了 savetl 能处理的子项上限 %d。请将此问题汇报给data.table问" "题追踪器。" -#: assign.c:1170 +#: assign.c:1190 #, c-format msgid "Failed to realloc saveds to %d items in savetl" msgstr "不能给 savetl 里的 %d 个项重新分配 saveds" -#: assign.c:1176 +#: assign.c:1196 #, c-format msgid "Failed to realloc savedtl to %d items in savetl" msgstr "不能给savetl里的 %d 个项提供 savetl" -#: assign.c:1199 +#: assign.c:1219 msgid "x must be a character vector" msgstr "x 必须是一个字符向量" -#: assign.c:1200 +#: assign.c:1220 msgid "'which' must be an integer vector" msgstr "'which' 必须是一个整数向量" -#: assign.c:1201 +#: assign.c:1221 msgid "'new' must be a character vector" msgstr "'new' 必须是一个字符向量" -#: assign.c:1202 +#: assign.c:1222 #, c-format msgid "'new' is length %d. Should be the same as length of 'which' (%d)" msgstr "'new' 的长度是 %d。 它的长度必须和'which' (%d)的长度一致。" -#: assign.c:1205 +#: assign.c:1225 #, c-format msgid "" "Item %d of 'which' is %d which is outside range of the length %d character " "vector" msgstr "'which' 的 %d 项是 %d,这超出了 %d 字符的长度范围" -#: assign.c:1215 -msgid "dt passed to setcolorder has no names" -msgstr "setcolorder读取到的dt并没有名字" - -#: assign.c:1217 -#, c-format -msgid "Internal error: dt passed to setcolorder has %d columns but %d names" -msgstr "内部错误: setcolorder读取到的dt有 %d 列但是有 %d 个名字。" - -#: assign.c:1224 -msgid "" -"Internal error: o passed to Csetcolorder contains an NA or out-of-bounds" -msgstr "内部错误: Csetcolorder读取到的o有一个NA(缺失值)或者是下标出界" - -#: assign.c:1226 -msgid "Internal error: o passed to Csetcolorder contains a duplicate" -msgstr "内部错误: Csetcolorder读取到的o含有一个重复值" - #: between.c:12 #, c-format msgid "" @@ -738,114 +732,123 @@ msgstr "内部错误: icols 不是一个整数向量" msgid "Internal error: xcols is not integer vector" msgstr "内部错误: xcols 不是一个整数向量" -#: bmerge.c:50 +#: bmerge.c:51 +msgid "Internal error: icols and xcols must be non-empty integer vectors." +msgstr "内部错误: icols 不是一个整数向量" + +#: bmerge.c:52 #, c-format msgid "Internal error: length(icols) [%d] > length(xcols) [%d]" msgstr "内部错误: icols[%1$d] 的长度大于 xcols[%2$d] 的长度" -#: bmerge.c:57 +#: bmerge.c:59 #, c-format msgid "Internal error. icols[%d] is NA" msgstr "内部错误: icols[%d] 是 NA, 缺失值" -#: bmerge.c:58 +#: bmerge.c:60 #, c-format msgid "Internal error. xcols[%d] is NA" msgstr "内部错误: xcols[%d] 是 NA, 缺失值" -#: bmerge.c:59 +#: bmerge.c:61 #, c-format msgid "icols[%d]=%d outside range [1,length(i)=%d]" msgstr "icols[%1$d]=%2$d 造成了空间溢出,当前范围是[1,length(i)=%3$d]" -#: bmerge.c:60 +#: bmerge.c:62 #, c-format msgid "xcols[%d]=%d outside range [1,length(x)=%d]" msgstr "xcols[%1$d]=%2$d 造成了空间溢出,当前范围是[1,length(i)=%3$d]" -#: bmerge.c:63 +#: bmerge.c:65 #, c-format msgid "typeof x.%s (%s) != typeof i.%s (%s)" msgstr "x.%1$s (%2$s) 的数据类型和 i.%3$s (%4$s) 的数据类型并不一致" -#: bmerge.c:70 +#: bmerge.c:72 msgid "roll is character but not 'nearest'" msgstr "roll 是字符但并不是最近的" -#: bmerge.c:71 +#: bmerge.c:73 msgid "roll='nearest' can't be applied to a character column, yet." msgstr "roll='最近的'的功能当前并不能被使用在字符列。" -#: bmerge.c:74 +#: bmerge.c:76 msgid "Internal error: roll is not character or double" msgstr "内部错误: roll 不是字符或者是浮点" -#: bmerge.c:79 +#: bmerge.c:81 msgid "rollends must be a length 2 logical vector" msgstr "rollends 必须是一个长度为2的逻辑向量" -#: bmerge.c:89 uniqlist.c:270 +#: bmerge.c:91 uniqlist.c:271 msgid "" "Internal error: invalid value for 'mult'. 
please report to data.table issue " "tracker" msgstr "内部错误: 'mult' 是无效值。 请将此问题汇报给 data.table 问题追踪器。" -#: bmerge.c:93 +#: bmerge.c:95 msgid "" "Internal error: opArg is not an integer vector of length equal to length(on)" msgstr "内部错误: opArg 不是一个长度为 on 的整数向量" -#: bmerge.c:96 +#: bmerge.c:98 msgid "Internal error: nqgrpArg must be an integer vector" msgstr "内部错误:nqgrpArg 必须为一个整数向量" -#: bmerge.c:102 +#: bmerge.c:104 msgid "Intrnal error: nqmaxgrpArg is not a positive length-1 integer vector" msgstr "内部错误:nqmaxgrpArg不是长度为1的正整型向量" -#: bmerge.c:111 +#: bmerge.c:113 msgid "Internal error in allocating memory for non-equi join" msgstr "不等值联结分配内存出现内部错误" -#: bmerge.c:156 +#: bmerge.c:158 msgid "Internal error: xoArg is not an integer vector" msgstr "内部错误:xoArg不是整型向量" -#: bmerge.c:271 bmerge.c:379 +#: bmerge.c:273 bmerge.c:381 #, c-format msgid "" "Internal error in bmerge_r for '%s' column. Unrecognized value op[col]=%d" msgstr "bmerge_r 针对 '%s' 列的操作出现内部错误。无法识别值 op[col]=%d" -#: bmerge.c:303 +#: bmerge.c:305 #, c-format msgid "Only '==' operator is supported for columns of type %s." msgstr "%s 类型的列仅支持 '==' 操作符。" -#: bmerge.c:410 +#: bmerge.c:412 #, c-format msgid "Type '%s' not supported for joining/merging" msgstr "'%s' 类型不支持联结/归并" -#: bmerge.c:468 +#: bmerge.c:470 msgid "Internal error: xlow!=xupp-1 || xlowxuppIn" msgstr "内部错误:xlow!=xupp-1 或 xlowxuppIn" -#: chmatch.c:4 -#, c-format -msgid "x is type '%s' (must be 'character' or NULL)" -msgstr "x 类型为 '%s' (必须为'character'或 NULL)" - #: chmatch.c:5 #, c-format msgid "table is type '%s' (must be 'character' or NULL)" msgstr "table 类型为 '%s' (必须为 'character' 或 NULL)" -#: chmatch.c:6 +#: chmatch.c:7 msgid "Internal error: either chin or chmatchdup should be true not both" msgstr "内部错误:chin 和 chmatchdup 不能同时为真" -#: chmatch.c:44 +#: chmatch.c:12 +#, c-format +msgid "Internal error: length of SYMSXP is %d not 1" +msgstr "内部错误:SYMSXP的长度为 %d 而非 1" + +#: chmatch.c:19 +#, c-format +msgid "x is type '%s' (must be 'character' or NULL)" +msgstr "x 类型为 '%s' (必须为'character'或 NULL)" + +#: chmatch.c:66 #, c-format msgid "" "Internal error: CHARSXP '%s' has a negative truelength (%d). 
Please file an " @@ -854,7 +857,7 @@ msgstr "" "内部错误:CHARSXP '%s' 的 truelength (%d) 为负。请将此问题汇报给 data.table " "问题追踪器。" -#: chmatch.c:73 +#: chmatch.c:95 #, c-format msgid "" "Failed to allocate % bytes working memory in chmatchdup: " @@ -936,31 +939,31 @@ msgstr "coalesce 复制了第一项 (inplace=FALSE)\n" msgid "Unsupported type: %s" msgstr "不支持的类型:%s" -#: dogroups.c:14 +#: dogroups.c:15 msgid "Internal error: order not integer vector" msgstr "内部错误:order 不是整型向量" -#: dogroups.c:15 +#: dogroups.c:16 msgid "Internal error: starts not integer" msgstr "内部错误:starts 不是整型" -#: dogroups.c:16 +#: dogroups.c:17 msgid "Internal error: lens not integer" msgstr "内部错误:lens 不是整型" -#: dogroups.c:18 +#: dogroups.c:19 msgid "Internal error: jiscols not NULL but o__ has length" msgstr "内部错误:jiscols 非 NULL,但 o__ 长度不为0" -#: dogroups.c:19 +#: dogroups.c:20 msgid "Internal error: xjiscols not NULL but o__ has length" msgstr "内部错误:jiscols 非 NULL,但 o__ 长度不为0" -#: dogroups.c:20 +#: dogroups.c:21 msgid "'env' should be an environment" msgstr "'env' 应该是一个环境" -#: dogroups.c:39 +#: dogroups.c:40 #, c-format msgid "" "Internal error: unsupported size-0 type '%s' in column %d of 'by' should " @@ -968,16 +971,16 @@ msgid "" msgstr "" "内部错误:未能被提前捕获到 'by' 中第 %2$d 列不支持类型 '%1$s' 且size-0 的问题" -#: dogroups.c:43 +#: dogroups.c:44 #, c-format msgid "!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]" msgstr "!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]" -#: dogroups.c:62 +#: dogroups.c:63 msgid "row.names attribute of .SD not found" msgstr ".SD 的行名属性不存在" -#: dogroups.c:64 +#: dogroups.c:65 #, c-format msgid "" "row.names of .SD isn't integer length 2 with NA as first item; i.e., ." @@ -986,47 +989,43 @@ msgstr "" ".SD 的行名不是长度为2且首个元素为 NA 的整型;例如:set_row_names(). [%s %d " "%d]" -#: dogroups.c:69 +#: dogroups.c:70 msgid "length(names)!=length(SD)" msgstr "length(names)!=length(SD)" -#: dogroups.c:73 +#: dogroups.c:74 #, c-format msgid "" "Internal error: size-0 type %d in .SD column %d should have been caught " "earlier" msgstr "内部错误:未能提前捕获到 .SD 中第 %2$d 列类型 %1$d size-0 的问题" -#: dogroups.c:83 +#: dogroups.c:84 msgid "length(xknames)!=length(xSD)" msgstr "length(xknames)!=length(xSD)" -#: dogroups.c:87 +#: dogroups.c:88 #, c-format msgid "" "Internal error: type %d in .xSD column %d should have been caught by now" msgstr "内部错误:当前未能捕获到 .xSD 中第 %2$d 列类型 %1$d 的问题" -#: dogroups.c:91 +#: dogroups.c:92 #, c-format msgid "length(iSD)[%d] != length(jiscols)[%d]" msgstr "length(iSD)[%d] != length(jiscols)[%d]" -#: dogroups.c:92 +#: dogroups.c:93 #, c-format msgid "length(xSD)[%d] != length(xjiscols)[%d]" msgstr "length(xSD)[%d] != length(xjiscols)[%d]" -#: dogroups.c:155 dogroups.c:184 -msgid "Internal error. Type of column should have been checked by now" -msgstr "内部错误:至此列的类型应已经被检查完成" - -#: dogroups.c:273 +#: dogroups.c:198 #, c-format msgid "j evaluates to type '%s'. Must evaluate to atomic vector or list." msgstr "j的运算结果为'%s'类型。其运算结果必须为原子向量或列表。" -#: dogroups.c:281 +#: dogroups.c:206 msgid "" "All items in j=list(...) should be atomic vectors or lists. If you are " "trying something like j=list(.SD,newcol=mean(colA)) then use := by group " @@ -1036,13 +1035,13 @@ msgstr "" "newcol=mean(colA)) 之类的操作请使用 := by group 代替(更快速),或事后使用 " "cbind()、merge()" -#: dogroups.c:290 +#: dogroups.c:215 msgid "" "RHS of := is NULL during grouped assignment, but it's not possible to delete " "parts of a column." 
msgstr "用 := 分组时 RHS 为 NULL但無法刪除部分列" -#: dogroups.c:294 +#: dogroups.c:219 #, c-format msgid "" "Supplied %d items to be assigned to group %d of size %d in column '%s'. The " @@ -1054,7 +1053,7 @@ msgstr "" "须是 1(可以是单个值) 或完全符合 LHS 的长度如果您想回收(recycle) RHS,请使用 " "rep() 向你的代码读者明确表达你的意图" -#: dogroups.c:305 +#: dogroups.c:230 msgid "" "Internal error: Trying to add new column by reference but tl is full; " "setalloccol should have run first at R level before getting to this point in " @@ -1063,16 +1062,16 @@ msgstr "" "内部错误 : 尝试依照引用增加新列但 tl 已满在进入 dogroups 之前,setalloccol 应" "该先在 R 运行" -#: dogroups.c:320 +#: dogroups.c:245 #, c-format msgid "Group %d column '%s': %s" msgstr "列 '%2$s' 第 %1$d 组 : %3$s" -#: dogroups.c:327 +#: dogroups.c:252 msgid "j doesn't evaluate to the same number of columns for each group" msgstr "j 估算出的每组的列数不同" -#: dogroups.c:361 +#: dogroups.c:286 #, c-format msgid "" "Column %d of j's result for the first group is NULL. We rely on the column " @@ -1086,7 +1085,7 @@ msgstr "" "(需要一致性)空 (NULL) 列可以出现在后面的组(适当的以 NA 取代并回收)但不能是第 " "1 组请输入空向量代替,例如 integer() 或 numeric()" -#: dogroups.c:364 +#: dogroups.c:289 msgid "" "j appears to be a named vector. The same names will likely be created over " "and over again for each group and slow things down. Try and pass a named " @@ -1095,7 +1094,7 @@ msgstr "" "j 是名称向量,这可能使相同的名称不停重复创建导致速度变慢请尝试输入名称列表(较" "适合 data.table)或是非名称列表代替\n" -#: dogroups.c:366 +#: dogroups.c:291 #, c-format msgid "" "Column %d of j is a named vector (each item down the rows is named, " @@ -1105,7 +1104,7 @@ msgstr "" "j 的第 %d 列是名称向量(整行的项都是名称)为了效率请移除这些名称(避免在每组重复" "创建这些名称)总之他们被忽略了\n" -#: dogroups.c:374 +#: dogroups.c:299 msgid "" "The result of j is a named list. It's very inefficient to create the same " "names over and over again for each group. When j=list(...), any names are " @@ -1117,17 +1116,17 @@ msgstr "" "j=list(...) 时侦测到的所有名称会被移出,待分组完成后再放回来可以使用 " "j=transform() 避免这种加速此讯息可能会在未来升级为警告\n" -#: dogroups.c:386 +#: dogroups.c:311 #, c-format msgid "dogroups: growing from %d to %d rows\n" msgstr "dogroups: 从 %d 列增加至 %d 列\n" -#: dogroups.c:387 +#: dogroups.c:312 #, c-format msgid "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" msgstr "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" -#: dogroups.c:420 +#: dogroups.c:330 #, c-format msgid "" "Item %d of j's result for group %d is zero length. This will be filled with " @@ -1138,7 +1137,7 @@ msgstr "" "j 的结果第 %d 项在第 %d 组中为零长度(zero length)将使用 %d 个 NA 填入以符合结" "果中最长列的长度后面的分组也有相同问题,但只回报第一组以避免过多警告" -#: dogroups.c:427 +#: dogroups.c:337 #, c-format msgid "" "Column %d of result for group %d is type '%s' but expecting type '%s'. " @@ -1147,7 +1146,7 @@ msgstr "" "结果的第 %d 列在第 %d 组中是 '%s' 类别而非预期的 '%s' 类别所有组的列类别必须" "一致" -#: dogroups.c:429 +#: dogroups.c:339 #, c-format msgid "" "Supplied %d items for column %d of group %d which has %d rows. 
The RHS " @@ -1159,17 +1158,17 @@ msgstr "" "單個值) 或與 LHS 長度完全匹配如果您想回收(recycle) RHS,请使用 rep() 向你的代" "码读者明确表达你的意图" -#: dogroups.c:444 +#: dogroups.c:354 #, c-format msgid "Wrote less rows (%d) than allocated (%d).\n" msgstr "写入的行 (%d) 少于分配的 (%d)\n" -#: dogroups.c:454 +#: dogroups.c:364 #, c-format msgid "Internal error: block 0 [%d] and block 1 [%d] have both run" msgstr "内部错误 : 区块 0 [%d] 与区块 1 [%d] 都运行了" -#: dogroups.c:456 +#: dogroups.c:366 #, c-format msgid "" "\n" @@ -1178,15 +1177,20 @@ msgstr "" "\n" " %s 花了 %.3fs 在 %d 个组\n" -#: dogroups.c:458 +#: dogroups.c:368 #, c-format msgid " eval(j) took %.3fs for %d calls\n" msgstr " eval(j)取%.3fs给 %d 调用\n" -#: dogroups.c:482 +#: dogroups.c:392 msgid "growVector passed NULL" msgstr "growVector通过NULL" +#: dogroups.c:412 +#, c-format +msgid "Internal error: growVector doesn't support type '%s'" +msgstr "内部错误:growVector 不支持 '%s' 类型" + #: fastmean.c:39 msgid "narm should be TRUE or FALSE" msgstr "narm必须是TRUE或FALSE" @@ -1201,7 +1205,7 @@ msgstr "传递给 fastmean 的是 %s 类型,而不是数值或逻辑类型" msgid "Internal error: type '%s' not caught earlier in fastmean" msgstr "内部错误:先前fastmean没有侦测到类型 '%s' " -#: fcast.c:80 +#: fcast.c:78 #, c-format msgid "Unsupported column type in fcast val: '%s'" msgstr "fcast val不支持的列类型:'%s'" @@ -1210,62 +1214,73 @@ msgstr "fcast val不支持的列类型:'%s'" msgid "Argument 'test' must be logical." msgstr "参数'test'必须是逻辑类型。" -#: fifelse.c:23 +#: fifelse.c:28 #, c-format msgid "" "'yes' is of type %s but 'no' is of type %s. Please make sure that both " "arguments have the same type." msgstr "'yes'是%s类型,但'no'是%s类型。请确认两个参数是同一类型。" -#: fifelse.c:28 +#: fifelse.c:33 msgid "" "'yes' has different class than 'no'. Please make sure that both arguments " "have the same class." msgstr "'yes'的类型与'no'不同。请确认两个参数是同一类型。" -#: fifelse.c:33 +#: fifelse.c:38 msgid "'yes' and 'no' are both type factor but their levels are different." msgstr "'yes'和'no'都是因子类型但他们的因子水平不同。" -#: fifelse.c:38 +#: fifelse.c:43 #, c-format msgid "" "Length of 'yes' is % but must be 1 or length of 'test' (%)." msgstr "'yes'长度是%但长度必须是1或者等于'test'的长度 (%)。" -#: fifelse.c:40 +#: fifelse.c:45 #, c-format msgid "" "Length of 'no' is % but must be 1 or length of 'test' (%)." msgstr "'no'长度是%但长度必须是1或者等于'test'的长度 (%)。" -#: fifelse.c:51 +#: fifelse.c:56 #, c-format msgid "Length of 'na' is % but must be 1" msgstr "'na'长度是%但必须是长度必须是1" -#: fifelse.c:57 +#: fifelse.c:62 #, c-format msgid "" "'yes' is of type %s but 'na' is of type %s. Please make sure that both " "arguments have the same type." msgstr "'yes'是%s类型,但'na'是%s类型。请确认两个参数是同一类型。" -#: fifelse.c:59 +#: fifelse.c:64 msgid "" "'yes' has different class than 'na'. Please make sure that both arguments " "have the same class." msgstr "'yes'的类型与'na'不同。请确认两个参数是同一类型。" -#: fifelse.c:63 +#: fifelse.c:68 msgid "'yes' and 'na' are both type factor but their levels are different." msgstr "'yes'和'na'都是因子类型但他们的因子水平不同" -#: fifelse.c:133 +#: fifelse.c:138 #, c-format msgid "Type %s is not supported." msgstr "不支持类型 %s" +#: fifelse.c:152 +#, c-format +msgid "" +"Received %d inputs; please supply an even number of arguments in ..., " +"consisting of logical condition, resulting value pairs (in that order). Note " +"that the default argument must be named explicitly, e.g., default=0" +msgstr "" +"接收到 %d 个输入。请向 ... 中提供偶数个参数。" +"每一参数需包含逻辑条件判断,以及对应顺序的结果值对。请注意" +"默认参数须明确给出名字,如 default=0" + #: fmelt.c:18 msgid "'x' must be an integer" msgstr "'x'必须是整数" @@ -1457,73 +1472,73 @@ msgstr "ncol(data)为0,返回原 data.table" msgid "names(data) is NULL. 
Please report to data.table-help" msgstr "names(data)为NULL,请向 data.table-help 报告" -#: forder.c:106 +#: forder.c:107 #, c-format msgid "Failed to realloc thread private group size buffer to %d*4bytes" msgstr "无法将线程私有的组大小缓冲区重新分配为%d*4字节" -#: forder.c:120 +#: forder.c:121 #, c-format msgid "Failed to realloc group size result to %d*4bytes" msgstr "分配%d*4字节内存时失败。" -#: forder.c:263 +#: forder.c:264 #, c-format msgid "" "Logical error. counts[0]=%d in cradix but should have been decremented to 0. " "radix=%d" msgstr "逻辑错误:在 cradix 中的 counts[0] 应该为0,而不是%dradix=%d" -#: forder.c:278 +#: forder.c:279 msgid "Failed to alloc cradix_counts" msgstr "分配 cradix_counts 失败" -#: forder.c:280 +#: forder.c:281 msgid "Failed to alloc cradix_tmp" msgstr "分配 cradix_tmp 失败" -#: forder.c:291 +#: forder.c:292 #, c-format msgid "" "Internal error: ustr isn't empty when starting range_str: ustr_n=%d, " "ustr_alloc=%d" msgstr "内部错误:开始运行 range_str 时,ustr 未清空:ustr_n=%d,ustr_alloc=%d" -#: forder.c:292 +#: forder.c:293 msgid "Internal error: ustr_maxlen isn't 0 when starting range_str" msgstr "内部错误:开始 range_str 时,ustr_maxlen 不是0" -#: forder.c:312 +#: forder.c:313 #, c-format msgid "Unable to realloc %d * %d bytes in range_str" msgstr "在 range_str 中,无法重新分配%d * %d字节" -#: forder.c:330 +#: forder.c:331 msgid "Failed to alloc ustr3 when converting strings to UTF8" msgstr "将字符串转换为 UTF8 格式时,无法分配ustr3" -#: forder.c:348 +#: forder.c:349 msgid "Failed to alloc tl when converting strings to UTF8" msgstr "将字符串转换为 UTF8 格式时,无法分配 tl" -#: forder.c:377 +#: forder.c:378 msgid "Must an integer or numeric vector length 1" msgstr "必须是长度为1的整数或数字向量" -#: forder.c:378 +#: forder.c:379 msgid "Must be 2, 1 or 0" msgstr "必须是2、1或者0" -#: forder.c:412 +#: forder.c:413 msgid "Unknown non-finite value; not NA, NaN, -Inf or +Inf" msgstr "未知的取值范围,不属于 NA, NaN, -Inf 或 +Inf" -#: forder.c:434 +#: forder.c:435 msgid "" "Internal error: input is not either a list of columns, or an atomic vector." 
msgstr "内部错误:输入值既不是列表中的一列,也不是原子向量" -#: forder.c:436 +#: forder.c:437 msgid "" "Internal error: input is an atomic vector (not a list of columns) but by= is " "not NULL" @@ -1531,73 +1546,73 @@ msgstr "" "内部错误:输入值是一个原子向量(而不是列表中的一列),但是'by' 的参数是列表而不" "是NULL" -#: forder.c:438 +#: forder.c:439 msgid "" "Input is an atomic vector (not a list of columns) but order= is not a length " "1 integer" msgstr "" "输入值是一个原子向量(而不是列表中的一列),但参数 order不是长度为1的整数" -#: forder.c:440 +#: forder.c:441 #, c-format msgid "forder.c received a vector type '%s' length %d\n" msgstr "forder.c 接收到一个类型为'%s'长度为%d的向量\n" -#: forder.c:448 +#: forder.c:449 #, c-format msgid "forder.c received %d rows and %d columns\n" msgstr "forder.c 接收到%d行和%d列\n" -#: forder.c:451 +#: forder.c:452 msgid "Internal error: DT is an empty list() of 0 columns" msgstr "内部错误:DT 是一个0列的空 list" -#: forder.c:453 +#: forder.c:454 #, c-format msgid "" "Internal error: DT has %d columns but 'by' is either not integer or is " "length 0" msgstr "内部错误:DT 内部有%d列,但参数 'by' 不是整数或长度为0" -#: forder.c:455 +#: forder.c:456 #, c-format msgid "" "Either order= is not integer or its length (%d) is different to by='s length " "(%d)" msgstr "参数 order 不是整数,或者它的长度(%d)与参数 'by' 指定的长度(%d)不同" -#: forder.c:461 +#: forder.c:462 #, c-format msgid "internal error: 'by' value %d out of range [1,%d]" msgstr "内部错误:参数 'by' 的值%d超出[1,%d]的范围" -#: forder.c:463 +#: forder.c:464 #, c-format msgid "Column %d is length %d which differs from length of column 1 (%d)\n" msgstr "列%d的长度是%d,与第1列的长度(%d)不同\n" -#: forder.c:467 +#: forder.c:468 msgid "retGrp must be TRUE or FALSE" msgstr "retGrp 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: forder.c:470 +#: forder.c:471 msgid "sort must be TRUE or FALSE" msgstr " sort 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: forder.c:473 +#: forder.c:474 msgid "At least one of retGrp= or sort= must be TRUE" msgstr "retGrp 和sort 的参数中,至少一个必须是 TRUE" -#: forder.c:475 +#: forder.c:476 msgid "na.last must be logical TRUE, FALSE or NA of length 1" msgstr "na.last 的参数必须是逻辑值 TRUE, FALSE 或 NA " -#: forder.c:519 +#: forder.c:520 #, c-format msgid "Item %d of order (ascending/descending) is %d. Must be +1 or -1." msgstr "排序(ascending/descending)选项%d是%d,必须是+1 or -1" -#: forder.c:545 +#: forder.c:546 #, c-format msgid "" "\n" @@ -1609,111 +1624,113 @@ msgstr "" "***传递给 forder 的%d列是一个没有小数的8字节 double 类型的日期数据,请考虑使" "用4字节的整数日期(例如IDate)以节省空间和时间\n" -#: forder.c:561 +#: forder.c:562 #, c-format msgid "Column %d passed to [f]order is type '%s', not yet supported." msgstr "传递给 [f]order 的第%d列为 '%s'类型,目前尚不支持。" -#: forder.c:714 +#: forder.c:715 msgid "Internal error: column not supported, not caught earlier" msgstr "内部错误:列有不支持类型,未被前置识别" -#: forder.c:722 +#: forder.c:723 #, c-format msgid "nradix=%d\n" msgstr "nradix=%d\n" -#: forder.c:728 +#: forder.c:729 #, c-format msgid "" "Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d" msgstr "分配TMP或UGRP失败或缓存行不一致: nth=%d" -#: forder.c:733 +#: forder.c:734 msgid "Could not allocate (very tiny) group size thread buffers" msgstr "无法分配(极小)块组大小的线程缓冲区" -#: forder.c:794 +#: forder.c:795 #, c-format msgid "Timing block %2d%s = %8.3f %8d\n" msgstr "定时块 %2d%s = %8.3f %8d\n" -#: forder.c:797 +#: forder.c:798 #, c-format msgid "stat[%03d]==%20\n" msgstr "stat[%03d]==%20\n" -#: forder.c:1053 +#: forder.c:1054 #, c-format msgid "Failed to allocate parallel counts. 
my_n=%d, nBatch=%d" msgstr "分配并行计算失败,my_n=%d, nBatch=%d" -#: forder.c:1162 +#: forder.c:1163 #, c-format msgid "Unable to allocate TMP for my_n=%d items in parallel batch counting" msgstr "无法分配TMP给并行批处理计算的 my_n=%d 项" -#: forder.c:1269 -msgid "" -"is.sorted (R level) and fsorted (C level) only to be used on vectors. If " -"needed on a list/data.table, you'll need the order anyway if not sorted, so " -"use if (length(o<-forder(...))) for efficiency in one step, or equivalent at " -"C level" -msgstr "" -"is.sorted (R层面)和 fsorted (C 层面)使用对象仅为向量。如果需要用于list或data." -"table,需要对其进行排序如果(length(o<-forder(...))),使用提高效率,或相当于" -"在 " +#: forder.c:1270 +msgid "Internal error: issorted 'by' must be NULL or integer vector" +msgstr "内部错误:issorted 参数 'by' 须为 NULL 或一个整数向量" -#: forder.c:1301 +#: forder.c:1274 forder.c:1324 +#, c-format +msgid "issorted 'by' [%d] out of range [1,%d]" +msgstr "issorted 参数 'by' 的值%d超出[1,%d]的范围" + +#: forder.c:1279 +msgid "is.sorted does not work on list columns" +msgstr "is.sorted 不支持列表(list)列" + +#: forder.c:1311 forder.c:1341 forder.c:1375 #, c-format msgid "type '%s' is not yet supported" msgstr "类型 '%s' 目前不支持" -#: forder.c:1310 +#: forder.c:1388 msgid "x must be either NULL or an integer vector" msgstr "x 必须为空值或整型向量" -#: forder.c:1312 +#: forder.c:1390 msgid "nrow must be integer vector length 1" msgstr "nrow 必须为长度为1的整型向量" -#: forder.c:1314 +#: forder.c:1392 #, c-format msgid "nrow==%d but must be >=0" msgstr "nrow==%d 但是必须 >=0" -#: forder.c:1331 +#: forder.c:1409 msgid "x must be type 'double'" msgstr "x 必须为浮点数类型" -#: frank.c:11 +#: frank.c:9 #, c-format msgid "Internal error. Argument 'x' to Cdt_na is type '%s' not 'list'" msgstr "内部错误:参数 'x' 关于 Cdt_na 是 '%s' 类型而不是 'list' 类型" -#: frank.c:12 +#: frank.c:10 #, c-format msgid "Internal error. Argument 'cols' to Cdt_na is type '%s' not 'integer'" msgstr "内部错误:参数 'cols' 关于 Cdt_na 是 '%s' 类型而不是 'integer' 类型" -#: frank.c:16 frank.c:146 subset.c:263 +#: frank.c:14 frank.c:155 subset.c:276 #, c-format msgid "Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]" msgstr "'cols' 的 %d 项为 %d ,超出1的范围 [1,ncol(x)=%d]" -#: frank.c:26 frank.c:155 +#: frank.c:24 frank.c:164 #, c-format msgid "" "Column %d of input list x is length %d, inconsistent with first column of " "that item which is length %d." msgstr "输入列表x的列 %d 长度为 %d,不同于第一列的该项长度为 %d" -#: frank.c:65 frank.c:202 transpose.c:88 +#: frank.c:63 frank.c:211 transpose.c:88 #, c-format msgid "Unsupported column type '%s'" msgstr "不支持的列类型 '%s'" -#: frank.c:83 +#: frank.c:82 msgid "" "Internal error: invalid ties.method for frankv(), should have been caught " "before. please report to data.table issue tracker" @@ -1721,17 +1738,17 @@ msgstr "" "内部错误:对于 frankv()的无效值ties.method,应在之前被捕获。请报告给 data." "table issue tracker" -#: frank.c:130 +#: frank.c:139 #, c-format msgid "Internal error: unknown ties value in frank: %d" msgstr "内部错误:frank中有未知的ties值 %d" -#: frank.c:141 +#: frank.c:150 #, c-format msgid "Internal error. Argument 'x' to CanyNA is type '%s' not 'list'" msgstr "内部错误:参数 'x' 关于 CanyNA 是 '%s' 类型而不是'list'类型" -#: frank.c:142 +#: frank.c:151 #, c-format msgid "Internal error. Argument 'cols' to CanyNA is type '%s' not 'integer'" msgstr "内部错误:参数 'cols' 关于 CanyNA 是 '%s' 类型而不是'integer'类型" @@ -1772,217 +1789,217 @@ msgstr "可避免的 %.3f 秒。 %s 复制用时\n" msgid " File copy in RAM took %.3f seconds.\n" msgstr "内存上的文件复制耗时 %.3f 秒\n" -#: fread.c:1093 +#: fread.c:1248 msgid "" "Previous fread() session was not cleaned up properly. 
Cleaned up ok at the " "beginning of this fread() call.\n" msgstr "之前的会话fread()未正确清理。在当前 fread() 会话开始前清理好\n" -#: fread.c:1096 +#: fread.c:1251 msgid "[01] Check arguments\n" msgstr "[01] 参数检查\n" -#: fread.c:1103 +#: fread.c:1258 #, c-format msgid " Using %d threads (omp_get_max_threads()=%d, nth=%d)\n" msgstr "使用 %d 线程 (omp_get_max_threads()=%d, nth=%d)\n" -#: fread.c:1111 +#: fread.c:1266 msgid "" "Internal error: NAstrings is itself NULL. When empty it should be pointer to " "NULL." msgstr "内部错误:NAstrings 自身为空值。当清空该项会指向NULL空值" -#: fread.c:1129 +#: fread.c:1284 #, c-format msgid "freadMain: NAstring <<%s>> has whitespace at the beginning or end" msgstr "freadMain: NAstring <<%s>> 在开始或者结束处有空白" -#: fread.c:1134 +#: fread.c:1289 #, c-format msgid "" "freadMain: NAstring <<%s>> is recognized as type boolean, this is not " "permitted." msgstr "freadMain: NAstring <<%s>> 被识别为布尔型,这是不允许" -#: fread.c:1144 +#: fread.c:1300 msgid " No NAstrings provided.\n" msgstr "未提供 NAstrings \n" -#: fread.c:1146 +#: fread.c:1302 msgid " NAstrings = [" msgstr " NAstrings = [" -#: fread.c:1149 +#: fread.c:1305 msgid "]\n" msgstr "]\n" -#: fread.c:1151 +#: fread.c:1307 msgid " One or more of the NAstrings looks like a number.\n" msgstr "一个或多个 NAstrings 类似数值\n" -#: fread.c:1153 +#: fread.c:1309 msgid " None of the NAstrings look like numbers.\n" msgstr "没有 NAstrings 为数值\n" -#: fread.c:1155 +#: fread.c:1311 #, c-format msgid " skip num lines = %\n" msgstr "跳过行数为 %\n" -#: fread.c:1156 +#: fread.c:1312 #, c-format msgid " skip to string = <<%s>>\n" msgstr "跳转至 string = <<%s>>\n" -#: fread.c:1157 +#: fread.c:1313 #, c-format msgid " show progress = %d\n" msgstr "显示进程 %d\n" -#: fread.c:1158 +#: fread.c:1314 #, c-format msgid " 0/1 column will be read as %s\n" msgstr " 0/1 列被读取为 %s\n" -#: fread.c:1166 +#: fread.c:1322 #, c-format msgid "sep == quote ('%c') is not allowed" msgstr "sep == quote ('%c') 不被允许" -#: fread.c:1167 +#: fread.c:1323 msgid "dec='' not allowed. Should be '.' or ','" msgstr "dec='' 不允许,应该为 '.' 
或者 ','" -#: fread.c:1168 +#: fread.c:1324 #, c-format msgid "sep == dec ('%c') is not allowed" msgstr "sep == dec ('%c') 不允许" -#: fread.c:1169 +#: fread.c:1325 #, c-format msgid "quote == dec ('%c') is not allowed" msgstr "quote == dec ('%c') 不允许" -#: fread.c:1186 +#: fread.c:1342 msgid "[02] Opening the file\n" msgstr "[02] 打开文件\n" -#: fread.c:1189 +#: fread.c:1345 msgid "" " `input` argument is provided rather than a file name, interpreting as raw " "text to read\n" msgstr "提供 `input` 参数而非文件名,理解为原始的文本读取\n" -#: fread.c:1193 +#: fread.c:1349 msgid "Internal error: last byte of character input isn't \\0" msgstr "内部错误:字符输入的最后一个字节不是 \\0" -#: fread.c:1196 +#: fread.c:1352 #, c-format msgid " Opening file %s\n" msgstr "打开文件 %s\n" -#: fread.c:1200 +#: fread.c:1356 #, c-format msgid "file not found: %s" msgstr "文件未找到: %s" -#: fread.c:1204 +#: fread.c:1360 #, c-format msgid "Opened file ok but couldn't obtain its size: %s" msgstr "文件能够打开但无法获知其大小:%s" -#: fread.c:1207 fread.c:1235 +#: fread.c:1363 fread.c:1391 #, c-format msgid "File is empty: %s" msgstr "文件是空的:%s" -#: fread.c:1208 fread.c:1236 +#: fread.c:1364 fread.c:1392 #, c-format msgid " File opened, size = %s.\n" msgstr "文件已打开,大小为 %s.\n" -#: fread.c:1225 +#: fread.c:1381 #, c-format msgid "File not found: %s" msgstr "文件没有找到:%s" -#: fread.c:1231 +#: fread.c:1387 #, c-format msgid "Unable to open file after %d attempts (error %d): %s" msgstr "经过 %d 次尝试后仍无法打开文件(错误 %d):%s" -#: fread.c:1233 +#: fread.c:1389 #, c-format msgid "GetFileSizeEx failed (returned 0) on file: %s" msgstr "GetFileSizeEx 未能成功执行(返回值为0)于文件:%s" -#: fread.c:1238 +#: fread.c:1394 #, c-format msgid "This is Windows, CreateFileMapping returned error %d for file %s" msgstr "现在在Windows下,CreateFileMapping 返回错误 %d 于文件 %s" -#: fread.c:1245 +#: fread.c:1401 #, c-format msgid "" "Opened %s file ok but could not memory map it. This is a %dbit process. %s." msgstr "能够打开文件 %s 但不能创建内存映射。这是一个 %d 位进程。 %s." -#: fread.c:1246 +#: fread.c:1402 msgid "Please upgrade to 64bit" msgstr "请升级到64位" -#: fread.c:1246 +#: fread.c:1402 msgid "There is probably not enough contiguous virtual memory available" msgstr "多半没有足够的连续虚拟内存" -#: fread.c:1249 +#: fread.c:1405 msgid " Memory mapped ok\n" msgstr " 内存映射正常\n" -#: fread.c:1251 +#: fread.c:1407 msgid "" "Internal error: Neither `input` nor `filename` are given, nothing to read." msgstr "" "内部错误:既没有`input`(输入)也没有`filename`(文件名),没有什么可供读入。" -#: fread.c:1268 +#: fread.c:1424 msgid "[03] Detect and skip BOM\n" msgstr "[03] 检测并跳过字节顺序标记(BOM)\n" -#: fread.c:1272 +#: fread.c:1428 msgid "" " UTF-8 byte order mark EF BB BF found at the start of the file and " "skipped.\n" msgstr "在文件头发现了UTF-8 字节顺序标记(BOM)EF BB BF 并已跳过。\n" -#: fread.c:1277 +#: fread.c:1433 msgid "" "GB-18030 encoding detected, however fread() is unable to decode it. Some " "character fields may be garbled.\n" msgstr "检测到GB-18030 编码,但fread() 未能解码。某些 字符字段可能有乱码。\n" -#: fread.c:1280 +#: fread.c:1436 msgid "" "File is encoded in UTF-16, this encoding is not supported by fread(). Please " "recode the file to UTF-8." 
msgstr "文件编码是UTF-16,fread()不支持此编码。请 将文件转换为UTF-8。" -#: fread.c:1285 +#: fread.c:1441 #, c-format msgid " Last byte(s) of input found to be %s and removed.\n" msgstr " 发现输入的最后字节是 %s 并已去除。\n" -#: fread.c:1288 +#: fread.c:1444 msgid "Input is empty or only contains BOM or terminal control characters" msgstr "输入是空的或只有字节顺序标记(BOM)或终端控制字符" -#: fread.c:1295 +#: fread.c:1451 msgid "[04] Arrange mmap to be \\0 terminated\n" msgstr "[04] 设定mmap为 \\0 终止\n" -#: fread.c:1302 +#: fread.c:1458 msgid "" " No \\n exists in the file at all, so single \\r (if any) will be taken as " "one line ending. This is unusual but will happen normally when there is no " @@ -1991,7 +2008,7 @@ msgstr "" " 文件中完全没有换行符\\n,所以单个 \\r(如果有的话)将被当成一行的结束。这不" "太常见但如果没有\\r 的话属于正常;例如单个行没有行尾结束符。\n" -#: fread.c:1303 +#: fread.c:1459 msgid "" " \\n has been found in the input and different lines can end with different " "line endings (e.g. mixed \\n and \\r\\n in one file). This is common and " @@ -2000,7 +2017,7 @@ msgstr "" " 输入中有\\n 并且不同行可以有不同的 行尾结束符(如在一个文件中混合使用 \\n " "和\\r\\n)。这很常见也是理想情况。\n" -#: fread.c:1327 +#: fread.c:1483 #, c-format msgid "" " File ends abruptly with '%c'. Final end-of-line is missing. Using cow page " @@ -2009,7 +2026,7 @@ msgstr "" " 文件突然中止于 '%c'。没有最后一个行尾结束符。正使用写时复制页(cow, copy-" "on-write)写入 0 到最后一个字节。\n" -#: fread.c:1333 +#: fread.c:1489 msgid "" "This file is very unusual: it ends abruptly without a final newline, and " "also its size is a multiple of 4096 bytes. Please properly end the last row " @@ -2018,16 +2035,16 @@ msgstr "" "这个文件非常不正常:它突然中止而没有最后的换行,并且其大小是4096 字节的整数" "倍。请用一个换行(例如 'echo >> file')来恰当地结束最后一行以避免此错误" -#: fread.c:1334 +#: fread.c:1490 #, c-format msgid " File ends abruptly with '%c'. Copying file in RAM. %s copy.\n" msgstr " 文件突然中止于 '%c'。正在从内存中复制文件。%s 复制。\n" -#: fread.c:1368 +#: fread.c:1524 msgid "[05] Skipping initial rows if needed\n" msgstr "[05] 如需要的话跳过起始行\n" -#: fread.c:1374 +#: fread.c:1530 #, c-format msgid "" "skip='%s' not found in input (it is case sensitive and literal; i.e., no " @@ -2036,79 +2053,79 @@ msgstr "" "在输入中没有发现 skip='%s' (这里大小写敏感并需要是字面形式,也就是说不能使用" "模式,适配符或正则表达式)" -#: fread.c:1380 +#: fread.c:1536 #, c-format msgid "" "Found skip='%s' on line %. Taking this to be header row or first row " "of data.\n" msgstr "在行 %2$ 发现了 skip='%1$s'。将此当做表头或数据的第一行。\n" -#: fread.c:1393 +#: fread.c:1549 #, c-format msgid " Skipped to line % in the file" msgstr " 跳到文件的第 % 行" -#: fread.c:1394 +#: fread.c:1550 #, c-format msgid "skip=% but the input only has % line%s" msgstr "skip=% 但输入只有 % 行 %s" -#: fread.c:1403 +#: fread.c:1559 msgid "" "Input is either empty, fully whitespace, or skip has been set after the last " "non-whitespace." 
msgstr "输入是空,或全部为空白,或跳过设置是在最后一个非空白字符之后。" -#: fread.c:1405 +#: fread.c:1561 #, c-format msgid " Moved forward to first non-blank line (%d)\n" msgstr " 前移到第一个非空行 (%d)\n" -#: fread.c:1406 +#: fread.c:1562 #, c-format msgid " Positioned on line %d starting: <<%s>>\n" msgstr " 定位到行 %d 开始于: <<%s>>\n" -#: fread.c:1424 +#: fread.c:1580 msgid "[06] Detect separator, quoting rule, and ncolumns\n" msgstr "[06] 检测分隔符,引用规则,以及列数\n" -#: fread.c:1428 +#: fread.c:1584 msgid " sep='\\n' passed in meaning read lines as single character column\n" msgstr " sep='\\n' 设定意味着将把所有行读作一个字符列\n" -#: fread.c:1447 +#: fread.c:1603 msgid " Detecting sep automatically ...\n" msgstr " 自动检测分隔符中 ...\n" -#: fread.c:1454 +#: fread.c:1610 #, c-format msgid " Using supplied sep '%s'\n" msgstr " 使用提供的分隔符 '%s'\n" -#: fread.c:1488 +#: fread.c:1644 #, c-format msgid " with %d fields using quote rule %d\n" msgstr " 对 %d 个字段使用引用规则 %d\n" -#: fread.c:1538 +#: fread.c:1694 #, c-format msgid " with %d lines of %d fields using quote rule %d\n" msgstr " 对 %d 行的 %d 字段使用引用规则 %d\n" -#: fread.c:1545 +#: fread.c:1701 msgid "" " No sep and quote rule found a block of 2x2 or greater. Single column " "input.\n" msgstr " 没有分隔符并且引用规则发现了一个大于或等于2x2的区块。输入是单列。\n" -#: fread.c:1561 +#: fread.c:1717 msgid "" "Single column input contains invalid quotes. Self healing only effective " "when ncol>1" msgstr "单列输入包含了不合法的引用。自我修正只有在列数大于1(ncol>1)时才有效" -#: fread.c:1566 +#: fread.c:1722 #, c-format msgid "" "Found and resolved improper quoting in first %d rows. If the fields are not " @@ -2118,35 +2135,35 @@ msgstr "" "在前 %d 行中发现并修正了不合适的引号用法。如果字段没有加引号(例如字段间隔符" "没有在任何字段内出现),可以尝试使用 quote=\"\" 来避免此警告。" -#: fread.c:1582 +#: fread.c:1738 #, c-format msgid "" "Internal error: ncol==%d line==%d after detecting sep, ncol and first line" msgstr "内部错误:检测分隔符,列数和首行后,ncol==%d line==%d" -#: fread.c:1585 +#: fread.c:1741 #, c-format msgid "Internal error: first line has field count %d but expecting %d" msgstr "内部错误:首行有%d个字段,但应该有%d个" -#: fread.c:1587 +#: fread.c:1743 #, c-format msgid "" " Detected %d columns on line %d. This line is either column names or first " "data row. Line starts as: <<%s>>\n" msgstr "检测到第%2$d行有%1$d列。该行为列名或数据集首行。该行以<<%3$s>>开始\n" -#: fread.c:1589 +#: fread.c:1745 #, c-format msgid " Quote rule picked = %d\n" msgstr "标点符号规则 = %d\n" -#: fread.c:1590 +#: fread.c:1746 #, c-format msgid " fill=%s and the most number of columns found is %d\n" msgstr "fill=%s 且找到的最大列数为 %d\n" -#: fread.c:1596 +#: fread.c:1752 msgid "" "This file is very unusual: it's one single column, ends with 2 or more end-" "of-line (representing several NA at the end), and is a multiple of 4096, too." @@ -2154,12 +2171,12 @@ msgstr "" "该文件极为特殊,仅有一列数据,在结尾处包含多个行结束标记(表示多个空值),且" "长度为4096的整数倍。" -#: fread.c:1597 +#: fread.c:1753 #, c-format msgid " Copying file in RAM. %s\n" msgstr "正在将文件拷贝到RAM。%s\n" -#: fread.c:1603 +#: fread.c:1759 msgid "" " 1-column file ends with 2 or more end-of-line. 
Restoring last eol using " "extra byte in cow page.\n" @@ -2167,37 +2184,37 @@ msgstr "" "该文件包含一列数据,存在多个行结束标记(表示多个空值)。正在使用写时复制页" "(cow, copy-on-write)额外的字节恢复最后一个标记.\n" -#: fread.c:1622 +#: fread.c:1778 msgid "" "[07] Detect column types, good nrow estimate and whether first row is column " "names\n" msgstr "[07] 检测列类型,估计行数以及首行是否为列名\n" -#: fread.c:1623 +#: fread.c:1779 #, c-format msgid " 'header' changed by user from 'auto' to %s\n" msgstr " 用户已将'header'(列名)从 'auto' 改为 %s\n" -#: fread.c:1627 +#: fread.c:1783 #, c-format msgid "Failed to allocate 2 x %d bytes for type and tmpType: %s" msgstr "为 %2$s 类型分配 2 x %1$d bytes失败" -#: fread.c:1648 +#: fread.c:1804 #, c-format msgid " Number of sampling jump points = %d because " msgstr "采样跳点数 = %d 因为" -#: fread.c:1649 +#: fread.c:1805 #, c-format msgid "nrow limit (%) supplied\n" msgstr "指定了nrow 的最大值 (%) \n" -#: fread.c:1650 +#: fread.c:1806 msgid "jump0size==0\n" msgstr "jump0size==0\n" -#: fread.c:1651 +#: fread.c:1807 #, c-format msgid "" "(% bytes from row 1 to eof) / (2 * % jump0size) == " @@ -2205,32 +2222,32 @@ msgid "" msgstr "" "(从首行到结束共 % bytes) / (2 * % jump0size) == %\n" -#: fread.c:1689 +#: fread.c:1845 #, c-format msgid "" " A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. " "%s\n" msgstr "第%5$d个跳点所找到的第%4$d行,该行字段过于%1$s(%2$d/%3$d). %6$s\n" -#: fread.c:1690 +#: fread.c:1846 msgid "few" msgstr "少" -#: fread.c:1690 +#: fread.c:1846 msgid "many" msgstr "多" -#: fread.c:1690 +#: fread.c:1846 msgid "" "Most likely this jump landed awkwardly so type bumps here will be skipped." msgstr "很有可能这一跳点的位置并不合适,因此此处的类型转换将被跳过。" -#: fread.c:1716 +#: fread.c:1872 #, c-format msgid " Type codes (jump %03d) : %s Quote rule %d\n" msgstr " 类型码(跳点 %03d) : %s 引用规则 %d\n" -#: fread.c:1729 +#: fread.c:1885 #, c-format msgid "" " 'header' determined to be true due to column %d containing a string on row " @@ -2239,19 +2256,19 @@ msgstr "" " 'header' 参数设为真,原因是第%1$d列首行包含字符串,并且在样本中的另外%3$d行" "包含有较底层的数据类型(%2$s)\n" -#: fread.c:1741 +#: fread.c:1897 msgid "" "Internal error: row before first data row has the same number of fields but " "we're not using it." msgstr "内部错误:数据首行的前一行包含相同数量的字段但不会用到该行。" -#: fread.c:1742 +#: fread.c:1898 msgid "" "Internal error: ch!=pos after counting fields in the line before the first " "data row." msgstr "内部错误:对数据首行前一行的字段计数后,ch不等于pos" -#: fread.c:1743 +#: fread.c:1899 #, c-format msgid "" "Types in 1st data row match types in 2nd data row but previous row has %d " @@ -2260,7 +2277,7 @@ msgstr "" "数据第一行的类型与第二行相匹配,但是之前的行有 %d 个字段。故将第一行数据的前" "一行作为列名" -#: fread.c:1746 +#: fread.c:1902 #, c-format msgid "" "Detected %d column names but the data has %d columns (i.e. invalid file). " @@ -2268,7 +2285,7 @@ msgid "" msgstr "" "检测到 %d 个列名,然而数据共有 %d 列(文件不合法)。添加了 %d 个额外列名%s\n" -#: fread.c:1747 +#: fread.c:1903 msgid "" " for the first column which is guessed to be row names or an index. Use " "setnames() afterwards if this guess is not correct, or fix the file write " @@ -2277,17 +2294,17 @@ msgstr "" "作为第一列,并被用于猜测行名或索引。若上述猜测不正确,可在后续使用setnames()" "进行修改,或修复用于生成该文件的文件写入命令以生成有效的文件。" -#: fread.c:1747 +#: fread.c:1903 msgid "s at the end." msgstr "到结尾处" -#: fread.c:1749 +#: fread.c:1905 msgid "" "Internal error: fill=true but there is a previous row which should already " "have been filled." msgstr "内部错误:参数fill=true,但是在此之前有一行应当已经被填充。" -#: fread.c:1750 +#: fread.c:1906 #, c-format msgid "" "Detected %d column names but the data has %d columns. 
Filling rows " @@ -2296,74 +2313,74 @@ msgstr "" "检测到%d个列名,但数据共有%d列。已经自动填充。设置参数fill=TRUE以屏蔽此警" "告。\n" -#: fread.c:1754 +#: fread.c:1910 #, c-format msgid "Failed to realloc 2 x %d bytes for type and tmpType: %s" msgstr "为 %2$s 类型重新分配 2 x %1$d bytes失败" -#: fread.c:1774 +#: fread.c:1930 #, c-format msgid "" " 'header' determined to be %s because there are%s number fields in the " "first and only row\n" msgstr " 参数'header' 被设置为%s, 因为唯一的一行包含 %s 个字段\n" -#: fread.c:1774 +#: fread.c:1930 msgid " no" msgstr "0" -#: fread.c:1777 +#: fread.c:1933 msgid "" " 'header' determined to be true because all columns are type string and a " "better guess is not possible\n" msgstr "参数 'header' 被设置为true,因为所有列类型均为字符串\n" -#: fread.c:1779 +#: fread.c:1935 msgid "" " 'header' determined to be false because there are some number columns and " "those columns do not have a string field at the top of them\n" msgstr "参数 'header' 被设置为false,因为部分字段的首行不为字符串\n" -#: fread.c:1795 +#: fread.c:1951 #, c-format msgid " Type codes (first row) : %s Quote rule %d\n" msgstr " 类型码(第一行) : %s 引用规则 %d\n" -#: fread.c:1804 +#: fread.c:1960 #, c-format msgid "" " All rows were sampled since file is small so we know nrow=% " "exactly\n" msgstr " 文件太小,全部行均被采样到,所以 nrow=%\n" -#: fread.c:1816 fread.c:1823 +#: fread.c:1972 fread.c:1979 msgid " =====\n" msgstr " =====\n" -#: fread.c:1817 +#: fread.c:1973 #, c-format msgid "" " Sampled % rows (handled \\n inside quoted fields) at %d jump " "points\n" msgstr " 已使用了 %2$d个跳点抽样 %1$ 行(处理了字段间的分隔符\\n)\n" -#: fread.c:1818 +#: fread.c:1974 #, c-format msgid "" " Bytes from first data row on line %d to the end of last row: %\n" msgstr " 从第一个数据行(%d)到最后一行的字节: %\n" -#: fread.c:1819 +#: fread.c:1975 #, c-format msgid " Line length: mean=%.2f sd=%.2f min=%d max=%d\n" msgstr "文件每行长度的统计量:均值=%.2f,标准差=%.2f,最小值=%d ,最大值=%d\n" -#: fread.c:1820 +#: fread.c:1976 #, c-format msgid " Estimated number of rows: % / %.2f = %\n" msgstr "估计数据共有 % / %.2f = % 行\n" -#: fread.c:1821 +#: fread.c:1977 #, c-format msgid "" " Initial alloc = % rows (% + %d%%) using bytes/" @@ -2372,44 +2389,44 @@ msgstr "" "为 % 行 (% + %d%%)分配初始内存,大小为字节数/max(mean-2*sd," "min),并确保该数值落于区间[1.1*estn, 2.0*estn]中\n" -#: fread.c:1825 +#: fread.c:1981 #, c-format msgid "Internal error: sampleLines(%) > allocnrow(%)" msgstr "内部错误:sampleLines(%) > allocnrow(%)" -#: fread.c:1829 +#: fread.c:1985 #, c-format msgid " Alloc limited to lower nrows=% passed in.\n" msgstr " 分配被限制在输入的更小的 nrows=% 值上。\n" -#: fread.c:1841 +#: fread.c:1997 msgid "[08] Assign column names\n" msgstr "[08] 指定列名\n" -#: fread.c:1849 +#: fread.c:2005 #, c-format msgid "Unable to allocate %d*%d bytes for column name pointers: %s" msgstr "无法分配 %d*%d 字节给列名指针: %s" -#: fread.c:1871 +#: fread.c:2027 #, c-format msgid "Internal error: reading colnames ending on '%c'" msgstr "内部错误:读取列名终止于 '%c'" -#: fread.c:1889 +#: fread.c:2045 msgid "[09] Apply user overrides on column types\n" msgstr "[09] 使用用户指定的列类型\n" -#: fread.c:1893 +#: fread.c:2049 msgid " Cancelled by user: userOverride() returned false." 
msgstr " 用户已取消:userOverride() 返回 false。" -#: fread.c:1903 +#: fread.c:2059 #, c-format msgid "Failed to allocate %d bytes for size array: %s" msgstr "无法分配 %d 字节给 size 数组:%s" -#: fread.c:1910 +#: fread.c:2066 #, c-format msgid "" "Attempt to override column %d <<%.*s>> of inherent type '%s' down to '%s' " @@ -2419,40 +2436,40 @@ msgstr "" "试图覆盖第 %d 列 <<%.*s>>,将内部类型 '%s' 降级为 '%s' 的操作被忽略。只支持将" "列类型升为更高阶的类型。如果确定此操作,请完成之后再转换类型。" -#: fread.c:1924 +#: fread.c:2080 #, c-format msgid " After %d type and %d drop user overrides : %s\n" msgstr " 经过 %d 类型和 %d 丢弃用户覆盖:%s\n" -#: fread.c:1932 +#: fread.c:2088 msgid "[10] Allocate memory for the datatable\n" msgstr "[10] 分配内存给 datatable\n" -#: fread.c:1933 +#: fread.c:2089 #, c-format msgid " Allocating %d column slots (%d - %d dropped) with % rows\n" msgstr " 正在分配 %d 列位置(%d - %d 已丢弃),% 行\n" -#: fread.c:1987 +#: fread.c:2143 #, c-format msgid "Buffer size % is too large\n" msgstr "缓冲长度 % 过大\n" -#: fread.c:1990 +#: fread.c:2146 msgid "[11] Read the data\n" msgstr "[11] 读取数据\n" -#: fread.c:1993 +#: fread.c:2149 #, c-format msgid " jumps=[%d..%d), chunk_size=%, total_size=%\n" msgstr " jumps=[%d..%d),chunk_size=%,total_size=%\n" -#: fread.c:2005 +#: fread.c:2161 #, c-format msgid "Internal error: Master thread is not thread 0 but thread %d.\n" msgstr "内部错误:主线程并非线程0而是线程%d\n" -#: fread.c:2213 +#: fread.c:2369 #, c-format msgid "" "Column %d (\"%.*s\") bumped from '%s' to '%s' due to <<%.*s>> on row " @@ -2461,14 +2478,14 @@ msgstr "" "第 %d 列(\"%.*s\") 发生了从 '%s' 到 '%s' 的类型转换,由于 <<%.*s>> 出现在第 " "% 行\n" -#: fread.c:2262 +#: fread.c:2418 #, c-format msgid "" "Internal error: invalid head position. jump=%d, headPos=%p, thisJumpStart=" "%p, sof=%p" msgstr "内部错误:head 位置无效。jump=%d, headPos=%p, thisJumpStart=%p, sof=%p" -#: fread.c:2335 +#: fread.c:2491 #, c-format msgid "" " Too few rows allocated. Allocating additional % rows (now nrows=" @@ -2477,42 +2494,42 @@ msgstr "" " 分配的行数太少。正在分配额外的 % 行(当前 nrows=%),并从跳" "跃 %d 继续读取\n" -#: fread.c:2342 +#: fread.c:2498 #, c-format msgid " Restarting team from jump %d. nSwept==%d quoteRule==%d\n" msgstr " 从跳跃 %d 重启组。nSwept==%d quoteRule==%d\n" -#: fread.c:2362 +#: fread.c:2518 #, c-format msgid " %d out-of-sample type bumps: %s\n" msgstr " %d 样本外类型变更:%s\n" -#: fread.c:2398 +#: fread.c:2554 #, c-format msgid "" "Read % rows x %d columns from %s file in %02d:%06.3f wall clock " "time\n" msgstr "读取 % 行 x %d 列,从 %s 文件(时钟时间 %02d:%06.3f)\n" -#: fread.c:2405 +#: fread.c:2561 msgid "[12] Finalizing the datatable\n" msgstr "[12] 最后定型 datatable\n" -#: fread.c:2406 +#: fread.c:2562 msgid " Type counts:\n" msgstr " 类型数量:\n" -#: fread.c:2408 +#: fread.c:2564 #, c-format msgid "%10d : %-9s '%c'\n" msgstr "%10d : %-9s '%c'\n" -#: fread.c:2424 +#: fread.c:2580 #, c-format msgid "Discarded single-line footer: <<%s>>" msgstr "丢弃末尾行:<<%s>>" -#: fread.c:2429 +#: fread.c:2585 #, c-format msgid "" "Stopped early on line %. Expected %d fields but found %d. Consider " @@ -2521,7 +2538,7 @@ msgstr "" "在第 % 行提前终止。预期有 %d 个字段但只找到 %d 个。可以考虑设置 " "fill=TRUE 和 comment.char=。 首个丢弃的非空行:<<%s>>" -#: fread.c:2435 +#: fread.c:2591 #, c-format msgid "" "Found and resolved improper quoting out-of-sample. 
First healed line " @@ -2532,31 +2549,31 @@ msgstr "" "不在引号内(例如:字段间隔符没有在任何一个字段中出现),尝试用 quote=\"\" 来" "避免该警告。" -#: fread.c:2439 +#: fread.c:2595 msgid "=============================\n" msgstr "=============================\n" -#: fread.c:2441 +#: fread.c:2597 #, c-format msgid "%8.3fs (%3.0f%%) Memory map %.3fGB file\n" msgstr "%8.3fs (%3.0f%%) 内存映射 %.3fGB 文件\n" -#: fread.c:2442 +#: fread.c:2598 #, c-format msgid "%8.3fs (%3.0f%%) sep=" msgstr "%8.3fs (%3.0f%%) sep=" -#: fread.c:2444 +#: fread.c:2600 #, c-format msgid " ncol=%d and header detection\n" msgstr " ncol=%d 和表头检测\n" -#: fread.c:2445 +#: fread.c:2601 #, c-format msgid "%8.3fs (%3.0f%%) Column type detection using % sample rows\n" msgstr "%8.3fs (%3.0f%%) 列类型检测基于 % 个样本行\n" -#: fread.c:2447 +#: fread.c:2603 #, c-format msgid "" "%8.3fs (%3.0f%%) Allocation of % rows x %d cols (%.3fGB) of which " @@ -2565,7 +2582,7 @@ msgstr "" "%8.3fs (%3.0f%%) % 行 x %d 列 (%.3fGB) 的分配中已使用 % " "(%3.0f%%) 行\n" -#: fread.c:2451 +#: fread.c:2607 #, c-format msgid "" "%8.3fs (%3.0f%%) Reading %d chunks (%d swept) of %.3fMB (each chunk %d rows) " @@ -2574,34 +2591,34 @@ msgstr "" "%8.3fs (%3.0f%%) 正在读取 %d 个块 (%d 已扫描) of %.3fMB (每个块 %d 行) 使用 " "%d 个线程\n" -#: fread.c:2453 +#: fread.c:2609 #, c-format msgid "" " + %8.3fs (%3.0f%%) Parse to row-major thread buffers (grown %d times)\n" msgstr " + %8.3fs (%3.0f%%) 解析到行处理线程的缓冲区(已增长 %d 次)\n" -#: fread.c:2454 +#: fread.c:2610 #, c-format msgid " + %8.3fs (%3.0f%%) Transpose\n" msgstr " + %8.3fs (%3.0f%%) 转置\n" -#: fread.c:2455 +#: fread.c:2611 #, c-format msgid " + %8.3fs (%3.0f%%) Waiting\n" msgstr " + %8.3fs (%3.0f%%) 正在等待\n" -#: fread.c:2456 +#: fread.c:2612 #, c-format msgid "" "%8.3fs (%3.0f%%) Rereading %d columns due to out-of-sample type exceptions\n" msgstr "%8.3fs (%3.0f%%) 正在重读 %d 列,由于样本外类型异常\n" -#: fread.c:2458 +#: fread.c:2614 #, c-format msgid "%8.3fs Total\n" msgstr "%8.3fs 总计\n" -#: freadR.c:84 +#: freadR.c:85 msgid "" "Internal error: freadR input not a single character string: a filename or " "the data itself. Should have been caught at R level." @@ -2609,49 +2626,49 @@ msgstr "" "内部错误:freadR 输入的不是单个字符串:文件名或者数据文本。该错误本应在 R 中" "被捕获。" -#: freadR.c:92 +#: freadR.c:93 msgid "" "Input contains a \\n or is \")\". Taking this to be text input (not a " "filename)\n" msgstr "输入中包含 \\n 或者是 \")\"。输入将被当做数据文本(而非文件名)\n" -#: freadR.c:95 +#: freadR.c:96 msgid "Input contains no \\n. Taking this to be a filename to open\n" msgstr "输入中不包含 \\n。输入将被当做文件名打开。\n" -#: freadR.c:101 +#: freadR.c:102 msgid "" "Internal error: freadR sep not a single character. R level catches this." msgstr "内部错误:freadR sep 不是单个字符。R 中应该捕获此错误。" -#: freadR.c:105 +#: freadR.c:106 msgid "" "Internal error: freadR dec not a single character. R level catches this." msgstr "内部错误:freadR dec 不是单个字符。R 中应该捕获此错误。" -#: freadR.c:112 +#: freadR.c:113 msgid "quote= must be a single character, blank \"\", or FALSE" msgstr "quote= 必须是单个字符,空白 \"\",或者 FALSE" -#: freadR.c:137 +#: freadR.c:143 msgid "Internal error: skip not integer or string in freadR.c" msgstr "内部错误:freadR.c 中 skip 非整数或字符串" -#: freadR.c:140 +#: freadR.c:146 #, c-format msgid "Internal error: NAstringsArg is type '%s'. R level catches this" msgstr "内部错误:NAstringsArg是'%s'数据类型.R中能够捕获这个信息" -#: freadR.c:153 +#: freadR.c:159 #, c-format msgid "nThread(%d)<1" msgstr "nThread(%1$d)<1(线程数(%1$d)小于1)" -#: freadR.c:160 +#: freadR.c:166 msgid "'integer64' must be a single character string" msgstr "'64整数型'必须是单个字符串" -#: freadR.c:168 +#: freadR.c:174 #, c-format msgid "" "Invalid value integer64='%s'. 
Must be 'integer64', 'character', 'double' or " @@ -2660,11 +2677,11 @@ msgstr "" "64位整数型有效值='%s'.必须是'64位整数型','字符串','双精度浮点型'或者'数值" "型'" -#: freadR.c:176 +#: freadR.c:182 msgid "Use either select= or drop= but not both." msgstr "select=和drop=不可同时使用" -#: freadR.c:179 +#: freadR.c:185 msgid "" "select= is type list for specifying types in select=, but colClasses= has " "been provided as well. Please remove colClasses=." @@ -2672,7 +2689,7 @@ msgstr "" "select=是用于在select=中指定类型的类型列表,但是还提供了colClasses=。请删除" "colClasses=。" -#: freadR.c:181 +#: freadR.c:187 msgid "" "select= is type list but has no names; expecting list(type1=cols1, " "type2=cols2, ...)" @@ -2680,7 +2697,7 @@ msgstr "" "select =是类型列表,但没有名称; 期望列表(type1 = cols1,type2 = " "cols2,...)" -#: freadR.c:188 +#: freadR.c:194 msgid "" "select= is a named vector specifying the columns to select and their types, " "but colClasses= has been provided as well. Please remove colClasses=." @@ -2688,45 +2705,45 @@ msgstr "" "select =是一个命名向量,用于指定要选择的列及其类型,但是还提供了colClasses " "=。 请删除colClasses =。" -#: freadR.c:196 freadR.c:346 +#: freadR.c:202 freadR.c:368 msgid "colClasses is type list but has no names" msgstr "colClasses是类型列表,但没有名称" -#: freadR.c:206 +#: freadR.c:212 #, c-format msgid "encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'" msgstr "encoding ='%s'无效。 必须为'未知','Latin-1'或'UTF-8'" -#: freadR.c:229 +#: freadR.c:235 #, c-format msgid "Column name '%s' (%s) not found" msgstr "找不到列名'%s'(%s)" -#: freadR.c:231 +#: freadR.c:237 #, c-format msgid "%s is NA" msgstr "%s是缺失值" -#: freadR.c:233 +#: freadR.c:239 #, c-format msgid "%s is %d which is out of range [1,ncol=%d]" msgstr "%s是%d,超出范围[1,ncol =%d]" -#: freadR.c:247 +#: freadR.c:253 msgid "Internal error: typeSize[CT_BOOL8_N] != 1" msgstr "内部错误:类型大小[CT_BOOL8_N]不等于1" -#: freadR.c:248 +#: freadR.c:254 msgid "Internal error: typeSize[CT_STRING] != 1" msgstr "内部错误:类型大小[CT_STRING]不等于1" -#: freadR.c:282 +#: freadR.c:288 #, c-format msgid "" "Column name '%s' not found in column name header (case sensitive), skipping." msgstr "在列名标题中找不到列名'%s'(区分大小写),正在跳过。" -#: freadR.c:292 +#: freadR.c:298 #, c-format msgid "" "Column number %d (select[%d]) is negative but should be in the range [1,ncol=" @@ -2734,7 +2751,7 @@ msgid "" msgstr "" "列号%d(select [%d])为负,但应在[1,ncol =%d]范围内。考虑drop=用于排除列。" -#: freadR.c:293 +#: freadR.c:299 #, c-format msgid "" "select = 0 (select[%d]) has no meaning. All values of select should be in " @@ -2742,24 +2759,19 @@ msgid "" msgstr "" "select=0(select[%d])没有意义。select的所有值都应在[1,ncol=%d]范围内。" -#: freadR.c:294 +#: freadR.c:300 #, c-format msgid "" "Column number %d (select[%d]) is too large for this table, which only has %d " "columns." msgstr "对于此表(仅包含%d列,)列号%d(select [%d])太大。" -#: freadR.c:295 +#: freadR.c:301 #, c-format msgid "Column number %d ('%s') has been selected twice by select=" msgstr "列号%d('%s')已由select =选择两次" -#: freadR.c:313 -msgid "" -"colClasses='NULL' is not permitted; i.e. to drop all columns and load nothing" -msgstr "colClasses ='NULL'是不允许的; 即删除所有列而不加载任何内容" - -#: freadR.c:318 +#: freadR.c:324 #, c-format msgid "" "colClasses= is an unnamed vector of types, length %d, but there are %d " @@ -2771,11 +2783,11 @@ msgstr "" "定类型,可以使用命名向量,列表格式或使用select=而不是colClasses=。请参阅'?" 
"fread'中的示例。" -#: freadR.c:329 +#: freadR.c:344 msgid "Internal error: selectInts is NULL but selectColClasses is true" msgstr "内部错误:selectInts为NULL,但selectColClasses为true" -#: freadR.c:330 +#: freadR.c:346 msgid "" "Internal error: length(selectSxp)!=length(colClassesSxp) but " "selectColClasses is true" @@ -2783,22 +2795,22 @@ msgstr "" "内部错误:length(select xp)!=length(colClasses xp),但select ColClasses" "为true" -#: freadR.c:344 +#: freadR.c:366 #, c-format msgid "colClasses is type '%s' but should be list or character" msgstr "colClasses是类型'%s',但应该是列表或字符" -#: freadR.c:368 +#: freadR.c:390 #, c-format msgid "Column name '%s' (colClasses[[%d]][%d]) not found" msgstr "找不到列名'%s'(colClasses[[%d]][%d])" -#: freadR.c:370 +#: freadR.c:392 #, c-format msgid "colClasses[[%d]][%d] is NA" msgstr "colClasses[[%d]][%d]是NA" -#: freadR.c:374 +#: freadR.c:396 #, c-format msgid "" "Column %d ('%s') appears more than once in colClasses. The second time is " @@ -2806,22 +2818,22 @@ msgid "" msgstr "" "Column %d ('%s')在colClasses中出现了多次。第二次是colClasses[[%d]][%d]." -#: freadR.c:381 +#: freadR.c:408 #, c-format msgid "Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]" msgstr "列号%d(colClasses[[%d]][%d])超出范围[1,ncol=%d]" -#: freadR.c:583 +#: freadR.c:624 #, c-format msgid "Field size is 1 but the field is of type %d\n" msgstr "字段大小为1,但字段类型为%d \n" -#: freadR.c:592 +#: freadR.c:633 #, c-format msgid "Internal error: unexpected field of size %d\n" msgstr "内部错误:大小为%d 的意外字段\n" -#: freadR.c:660 +#: freadR.c:701 #, c-format msgid "%s" msgstr "%s" @@ -2951,7 +2963,7 @@ msgid "n must be integer vector or list of integer vectors" msgstr "n 必须是整数向量 或者由整数向量组成的列表" #: frollR.c:104 gsumm.c:342 gsumm.c:577 gsumm.c:686 gsumm.c:805 gsumm.c:950 -#: gsumm.c:1261 gsumm.c:1402 uniqlist.c:350 +#: gsumm.c:1261 gsumm.c:1402 uniqlist.c:351 msgid "na.rm must be TRUE or FALSE" msgstr "na.rm 必须是 TRUE 或者 FALSE" @@ -3010,7 +3022,7 @@ msgstr "" "内部错误: 在 rolling 函数中无效的 fun 参数, 理应在更早阶段排除请向data.table " "issue tracker报告" -#: frollR.c:155 frollR.c:279 nafill.c:152 shift.c:21 +#: frollR.c:155 frollR.c:279 nafill.c:162 shift.c:21 msgid "fill must be a vector of length 1" msgstr "fill 必须是长度为1的向量" @@ -3299,18 +3311,19 @@ msgstr "内部错误:getMaxListItemLen应该已经预先抓取了这个" #: fwriteR.c:98 #, c-format msgid "" -"Row %d of list column is type '%s' - not yet implemented. fwrite() can write " -"list columns containing items which are atomic vectors of type logical, " -"integer, integer64, double, complex and character." +"Row % of list column is type '%s' - not yet implemented. fwrite() " +"can write list columns containing items which are atomic vectors of type " +"logical, integer, integer64, double, complex and character." msgstr "" -"列表页行%d的类型是'%s' - 尚未实施. fwrite()可以写入包含逻辑类型原子向量项目的" -"列表页,整数,整数64,双精度,复数和字符" +"列表页行%的类型是'%s' - 尚未实施. fwrite()可以写入包含逻辑类型原子向" +"量项目的列表页,整数,整数64,双精度,复数和字符" #: fwriteR.c:103 #, c-format msgid "" -"Internal error: row %d of list column has no max length method implemented" -msgstr "内部错误:列表页的%d行没有实现最大长度方法" +"Internal error: row % of list column has no max length method " +"implemented" +msgstr "内部错误:列表页的%行没有实现最大长度方法" #: fwriteR.c:170 msgid "" @@ -3321,17 +3334,18 @@ msgstr "fwrite必须传递一个类型为列表的对象;比如data.frame, dat msgid "fwrite was passed an empty list of no columns. Nothing to write." msgstr "fwrite传递了一个没有列的空列表. 
没有对象可以写入" -#: fwriteR.c:234 +#: fwriteR.c:232 #, c-format -msgid "Column %d's length (%d) is not the same as column 1's length (%d)" -msgstr "列%d的长度(%d)和列1的长度(%d)不一致" +msgid "" +"Column %d's length (%d) is not the same as column 1's length (%)" +msgstr "列%d的长度(%d)和列1的长度(%)不一致" -#: fwriteR.c:237 +#: fwriteR.c:236 #, c-format msgid "Column %d's type is '%s' - not yet implemented in fwrite." msgstr "列%d的类型是'%s' - 尚未在fwrite中实施" -#: fwriteR.c:262 +#: fwriteR.c:261 msgid "" "No list columns are present. Setting sep2='' otherwise quote='auto' would " "quote fields containing sep2.\n" @@ -3339,7 +3353,7 @@ msgstr "" "当前没有列表页. 设置sep2=''否则quote='auto'会自动为所有包含sep2的字段加上引" "号.\n" -#: fwriteR.c:266 +#: fwriteR.c:265 #, c-format msgid "" "If quote='auto', fields will be quoted if the field contains either sep " @@ -3349,7 +3363,7 @@ msgstr "" "that host lists),所有包含sep('%1$c') 或 sep2 ('%2$c')的字段将会被自动加上引" "号。\n" -#: fwriteR.c:270 +#: fwriteR.c:269 #, c-format msgid "" "sep ('%c'), sep2 ('%c') and dec ('%c') must all be different. Column %d is a " @@ -3641,20 +3655,26 @@ msgstr "" msgid "" "Internal error, gtail is only implemented for n=1. This should have been " "caught before. please report to data.table issue tracker." -msgstr "内部错误:gtail仅能应用于n=1的情况。此错误理应已被处理。请在 data.table 的 GitHub中提交报告。" +msgstr "" +"内部错误:gtail仅能应用于n=1的情况。此错误理应已被处理。请在 data.table 的 " +"GitHub中提交报告。" #: gsumm.c:1166 msgid "" "Internal error, ghead is only implemented for n=1. This should have been " "caught before. please report to data.table issue tracker." -msgstr "内部错误:ghead仅能应用于n=1的情况。此错误理应已被处理。请在 data.table 的 GitHub中提交报告。" +msgstr "" +"内部错误:ghead仅能应用于n=1的情况。此错误理应已被处理。请在 data.table 的 " +"GitHub中提交报告。" #: gsumm.c:1172 msgid "" "Internal error, `g[` (gnthvalue) is only implemented single value subsets " "with positive index, e.g., .SD[2]. This should have been caught before. " "please report to data.table issue tracker." -msgstr "内部错误:`g[` (gnthvalue) 仅能用于采用单个正数索引求取子集,如 .SD[2]。此错误理应已被处理。请在 data.table 的 GitHub中提交报告。" +msgstr "" +"内部错误:`g[` (gnthvalue) 仅能用于采用单个正数索引求取子集,如 .SD[2]。此错" +"误理应已被处理。请在 data.table 的 GitHub中提交报告。" #: gsumm.c:1250 #, c-format @@ -3662,7 +3682,9 @@ msgid "" "Type '%s' not supported by GForce subset `[` (gnthvalue). Either add the " "prefix utils::head(.) or turn off GForce optimization using " "options(datatable.optimize=1)" -msgstr "GForce取子集运算符`[` (gnthvalue)尚不支持'%s'类型。。请添加前缀stats::var(.),或使用options(datatable.optimize=1) 关闭 GForce优化" +msgstr "" +"GForce取子集运算符`[` (gnthvalue)尚不支持'%s'类型。。请添加前缀stats::" +"var(.),或使用options(datatable.optimize=1) 关闭 GForce优化" #: gsumm.c:1262 msgid "" @@ -3672,7 +3694,11 @@ msgid "" "using options(datatable.optimize=1). Alternatively, if you only need the " "diagonal elements, 'DT[,lapply(.SD,var),by=,.SDcols=]' is the optimized way " "to do this." -msgstr "GForce var/sd 仅能应用于列,而非.SD或其他。若要求取某一列表,如.SD,所有元素的全协方差矩阵,请添加前缀stats::var(.SD)(或stats::sd(.SD)),或使用options(datatable.optimize=1) 关闭 GForce优化。另外,若仅需获得对角线元素,最佳的方式是使用'DT[,lapply(.SD,var),by=,.SDcols=]'。" +msgstr "" +"GForce var/sd 仅能应用于列,而非.SD或其他。若要求取某一列表,如.SD,所有元素" +"的全协方差矩阵,请添加前缀stats::var(.SD)(或stats::sd(.SD)),或使用" +"options(datatable.optimize=1) 关闭 GForce优化。另外,若仅需获得对角线元素,最" +"佳的方式是使用'DT[,lapply(.SD,var),by=,.SDcols=]'。" #: gsumm.c:1263 msgid "var/sd is not meaningful for factors." @@ -3683,7 +3709,9 @@ msgstr "无法对因子类型使用 var/sd。" msgid "" "Type '%s' not supported by GForce var (gvar). Either add the prefix stats::" "var(.) 
or turn off GForce optimization using options(datatable.optimize=1)" -msgstr "GForce var (gvar) 尚不支持 '%s'类型。请添加前缀stats::var(.),或使用options(datatable.optimize=1) 关闭 GForce优化" +msgstr "" +"GForce var (gvar) 尚不支持 '%s'类型。请添加前缀stats::var(.),或使用" +"options(datatable.optimize=1) 关闭 GForce优化" #: gsumm.c:1384 #, c-format @@ -3800,156 +3828,156 @@ msgstr "内部错误:在重叠中出现未知的mult:%d" msgid "Final step, fetching indices in overlaps ... done in %8.3f seconds\n" msgstr "重叠的最后一步:获取索引...在%8.3f秒内完成\n" -#: init.c:233 +#: init.c:239 #, c-format msgid "" "Pointers are %d bytes, greater than 8. We have not tested on any " "architecture greater than 64bit yet." msgstr "指针是%d个字节,大于8。我们尚未在大于64位的任何体系结构上进行测试。" -#: init.c:247 +#: init.c:253 #, c-format msgid "Checking NA_INTEGER [%d] == INT_MIN [%d] %s" msgstr "检查NA_INTEGER [%d] == INT_MIN [%d] %s" -#: init.c:248 +#: init.c:254 #, c-format msgid "Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s" msgstr "检查Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s" -#: init.c:249 +#: init.c:255 #, c-format msgid "Checking sizeof(int) [%d] is 4 %s" msgstr "检查sizeof(int)[%d]是否为4 %s" -#: init.c:250 +#: init.c:256 #, c-format msgid "Checking sizeof(double) [%d] is 8 %s" msgstr "检查 sizeof(double) [%d]是否为8 %s" -#: init.c:252 +#: init.c:258 #, c-format msgid "Checking sizeof(long long) [%d] is 8 %s" msgstr "检查sizeof(long long) [%d]是否为8 %s" -#: init.c:253 +#: init.c:259 #, c-format msgid "Checking sizeof(pointer) [%d] is 4 or 8 %s" msgstr "检查sizeof(pointer) [%d]是否为4 或者 8 %s" -#: init.c:254 +#: init.c:260 #, c-format msgid "Checking sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s" msgstr "检查sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s" -#: init.c:255 +#: init.c:261 #, c-format msgid "Checking sizeof(uint64_t) [%d] is 8 %s" msgstr "检查 sizeof(uint64_t) [%d]是否为8 %s" -#: init.c:256 +#: init.c:262 #, c-format msgid "Checking sizeof(int64_t) [%d] is 8 %s" msgstr "检查sizeof(int64_t) [%d]是否为8 %s" -#: init.c:257 +#: init.c:263 #, c-format msgid "Checking sizeof(signed char) [%d] is 1 %s" msgstr "检查sizeof(signed char) [%d]是否为1 %s" -#: init.c:258 +#: init.c:264 #, c-format msgid "Checking sizeof(int8_t) [%d] is 1 %s" msgstr "检查sizeof(int8_t) [%d]是否为1 %s" -#: init.c:259 +#: init.c:265 #, c-format msgid "Checking sizeof(uint8_t) [%d] is 1 %s" msgstr "检查sizeof(uint8_t) [%d]是否为1 %s" -#: init.c:260 +#: init.c:266 #, c-format msgid "Checking sizeof(int16_t) [%d] is 2 %s" msgstr "检查sizeof(int16_t) [%d]是否为2 %s" -#: init.c:261 +#: init.c:267 #, c-format msgid "Checking sizeof(uint16_t) [%d] is 2 %s" msgstr "检查sizeof(uint16_t) [%d]是否为2 %s" -#: init.c:264 +#: init.c:270 #, c-format msgid "Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s" msgstr "检查LENGTH(allocVector(INTSXP,2)) [%d]是否为2 %s" -#: init.c:265 +#: init.c:271 #, c-format msgid "Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s" msgstr "检查TRUELENGTH(allocVector(INTSXP,2)) [%d]是否为0 %s" -#: init.c:272 +#: init.c:278 #, c-format msgid "Checking memset(&i,0,sizeof(int)); i == (int)0 %s" msgstr "检查memset(&i,0,sizeof(int)); i == (int)0 %s" -#: init.c:275 +#: init.c:281 #, c-format msgid "Checking memset(&ui, 0, sizeof(unsigned int)); ui == (unsigned int)0 %s" msgstr "检查memset(&ui, 0, sizeof(unsigned int)); ui == (unsigned int)0 %s" -#: init.c:278 +#: init.c:284 #, c-format msgid "Checking memset(&d, 0, sizeof(double)); d == (double)0.0 %s" msgstr "检查memset(&d, 0, sizeof(double)); d == (double)0.0 %s" -#: init.c:281 +#: init.c:287 #, c-format msgid "Checking memset(&ld, 0, sizeof(long double)); ld == (long double)0.0 %s" msgstr "检查memset(&ld, 0, sizeof(long 
double)); ld == (long double)0.0 %s" -#: init.c:284 +#: init.c:290 msgid "The ascii character '/' is not just before '0'" msgstr "ASCII 字符 '/' 后一个字符并非字符 '0'" -#: init.c:285 +#: init.c:291 msgid "The C expression (uint_fast8_t)('/'-'0')<10 is true. Should be false." msgstr "C表达式 (uint_fast8_t)('/'-'0') <10 为 true. 应该是 false." -#: init.c:286 +#: init.c:292 msgid "The ascii character ':' is not just after '9'" msgstr "ascii字符':'不是在'9'后" -#: init.c:287 +#: init.c:293 msgid "The C expression (uint_fast8_t)('9'-':')<10 is true. Should be false." msgstr "C表达式(uint_fast8_t)('9'-':') < 10 为 true. 应该是 false." -#: init.c:292 +#: init.c:298 #, c-format msgid "Conversion of NA_INT64 via double failed %!=%" msgstr "double类型转化为NA_INT64失败,%!=%" -#: init.c:296 +#: init.c:302 msgid "NA_INT64_D (negative -0.0) is not == 0.0." msgstr "NA_INT64_D (negative -0.0) 不是 == 0.0." -#: init.c:297 +#: init.c:303 msgid "NA_INT64_D (negative -0.0) is not ==-0.0." msgstr "NA_INT64_D (negative -0.0) 不是 ==-0.0." -#: init.c:298 +#: init.c:304 msgid "ISNAN(NA_INT64_D) is TRUE but should not be" msgstr "ISNAN(NA_INT64_D) 不应该是TRUE" -#: init.c:299 +#: init.c:305 msgid "isnan(NA_INT64_D) is TRUE but should not be" msgstr "isnan(NA_INT64_D) 不应该是 TRUE" -#: init.c:328 +#: init.c:337 #, c-format msgid "PRINTNAME(install(\"integer64\")) has returned %s not %s" msgstr "PRINTNAME(install(\"integer64\")) 返回了 %s , 而不是 %s" -#: init.c:397 +#: init.c:408 msgid ".Last.value in namespace is not a length 1 integer" msgstr "命名空间中,.Last.value 不是一个长度为 1 的整型" @@ -3963,7 +3991,7 @@ msgstr "参数'x'是一个原子型矢量,原位的更新只为list 或 data.t msgid "'x' argument must be numeric type, or list/data.table of numeric types" msgstr "参数'x'必须是数字类型,或者是数字类型的list/data.table" -#: nafill.c:149 nafill.c:180 +#: nafill.c:159 nafill.c:190 msgid "" "Internal error: invalid type argument in nafillR function, should have been " "caught before. Please report to data.table issue tracker." @@ -3971,21 +3999,21 @@ msgstr "" "内部错误:函数 nafillR 中有无效类型的参数, 该错误理应已被捕获,请向data.table" "的issue通道报告" -#: nafill.c:196 +#: nafill.c:206 #, c-format msgid "%s: parallel processing of %d column(s) took %.3fs\n" msgstr "%s : 并行处理 %d 列, 用时 %.3fs\n" -#: openmp-utils.c:22 +#: openmp-utils.c:23 #, c-format msgid "" -"Ignoring invalid %s==\")%s\". Not an integer >= 1. Please remove any " +"Ignoring invalid %s==\"%s\". Not an integer >= 1. Please remove any " "characters that are not a digit [0-9]. See ?data.table::setDTthreads." msgstr "" -"忽略无效的 %s==\")%s\". 不是一个 >= 1 的整型. 请去除任何不是[0-9]数字的字" +"忽略无效的 %s==\"%s\". 不是一个 >= 1 的整型. 请去除任何不是[0-9]数字的字" "符。 查看?data.table::setDTthreads." -#: openmp-utils.c:40 +#: openmp-utils.c:44 #, c-format msgid "" "Ignoring invalid R_DATATABLE_NUM_PROCS_PERCENT==%d. If used it must be an " @@ -3994,61 +4022,67 @@ msgstr "" "忽略无效的R_DATATABLE_NUM_PROCS_PERCENT==%d. 如需使用,它必须是一个2-100的整" "型,默认值为50查看?setDTtheads." 
-#: openmp-utils.c:67 +#: openmp-utils.c:78 msgid "'verbose' must be TRUE or FALSE" msgstr "'verbose'必须是TRUE或者FALSE" -#: openmp-utils.c:70 +#: openmp-utils.c:81 msgid "" "This installation of data.table has not been compiled with OpenMP support.\n" msgstr "安装的data.table并不是获得OpenMP支持的编译\n" -#: openmp-utils.c:75 +#: openmp-utils.c:86 #, c-format msgid " omp_get_num_procs() %d\n" msgstr " omp_get_num_procs() %d\n" -#: openmp-utils.c:76 +#: openmp-utils.c:87 #, c-format msgid " R_DATATABLE_NUM_PROCS_PERCENT %s\n" msgstr " R_DATATABLE_NUM_PROCS_PERCENT %s\n" -#: openmp-utils.c:77 +#: openmp-utils.c:88 #, c-format msgid " R_DATATABLE_NUM_THREADS %s\n" msgstr " R_DATATABLE_NUM_THREADS %s\n" -#: openmp-utils.c:78 +#: openmp-utils.c:89 +#, c-format +msgid " R_DATATABLE_THROTTLE %s\n" +msgstr " R_DATATABLE_THROTTLE %s\n" + +#: openmp-utils.c:90 #, c-format msgid " omp_get_thread_limit() %d\n" msgstr " omp_get_thread_limit() %d\n" -#: openmp-utils.c:79 +#: openmp-utils.c:91 #, c-format msgid " omp_get_max_threads() %d\n" msgstr " omp_get_max_threads() %d\n" -#: openmp-utils.c:80 +#: openmp-utils.c:92 #, c-format msgid " OMP_THREAD_LIMIT %s\n" msgstr " OMP_THREAD_LIMIT %s\n" -#: openmp-utils.c:81 +#: openmp-utils.c:93 #, c-format msgid " OMP_NUM_THREADS %s\n" msgstr " OMP_NUM_THREADS %s\n" -#: openmp-utils.c:82 +#: openmp-utils.c:94 #, c-format msgid " RestoreAfterFork %s\n" msgstr " RestoreAfterFork %s\n" -#: openmp-utils.c:83 +#: openmp-utils.c:95 #, c-format -msgid " data.table is using %d threads. See ?setDTthreads.\n" -msgstr " data.table 正在使用 %d 线程. 查看 ?setDTthreads.\n" +msgid "" +" data.table is using %d threads with throttle==%d. See ?setDTthreads.\n" +msgstr " data.table 正在使用 %d 线程, throttle==%d. 查看 ?setDTthreads.\n" -#: openmp-utils.c:91 +#: openmp-utils.c:103 msgid "" "restore_after_fork= must be TRUE, FALSE, or NULL (default). " "getDTthreads(verbose=TRUE) reports the current setting.\n" @@ -4056,26 +4090,20 @@ msgstr "" "restore_after_fork= 必须是 TRUE, FALSE, 或者 NULL (默认值). " "getDTthreads(verbose=TRUE) 来查看当前设置.\n" -#: openmp-utils.c:105 -#, c-format -msgid "" -"threads= must be either NULL (default) or a single number. It has length %d" -msgstr "threads= 必须是 NULL (默认值) 或者一个数字. 目前它长度为 %d" - -#: openmp-utils.c:107 -msgid "threads= must be either NULL (default) or type integer/numeric" -msgstr "threads= 必须是 NULL (默认值) 或者数字/整型类型" - #: openmp-utils.c:109 +msgid "'throttle' must be a single number, non-NA, and >=1" +msgstr "'throttle' 须为单个非 NA 且 >= 1 的数值" + +#: openmp-utils.c:123 msgid "" -"threads= must be either NULL or a single integer >= 0. See ?setDTthreads." -msgstr "threads= 必须是 NULL 或者一个>=0 的整型。 查看 ?setDTthreads." +"threads= must be either NULL or a single number >= 0. See ?setDTthreads." +msgstr "threads= 必须是 NULL 或者一个>=0 的数值。 查看 ?setDTthreads." 
-#: openmp-utils.c:114 +#: openmp-utils.c:127 msgid "Internal error: percent= must be TRUE or FALSE at C level" msgstr "内部错误: 在 C 中,percent= 必须是TRUE or FALSE " -#: openmp-utils.c:117 +#: openmp-utils.c:130 #, c-format msgid "" "Internal error: threads==%d should be between 2 and 100 (percent=TRUE at C " @@ -4296,7 +4324,8 @@ msgstr "" msgid "" "Failed to allocate working memory for %d factor levels of result column %d " "when reading item %d of item %d" -msgstr "当读取第%4$d项的第%3$d个子项时,无法为第%2$d列的%1$d个因素水平分配工作内存" +msgstr "" +"当读取第%4$d项的第%3$d个子项时,无法为第%2$d列的%1$d个因素水平分配工作内存" #: rbindlist.c:523 #, c-format @@ -4341,16 +4370,22 @@ msgstr "排序必须是整数向量" msgid "nrow(x)[%d]!=length(order)[%d]" msgstr "nrow(x)[%d] 不等于 length(order)[%d]" -#: reorder.c:48 +#: reorder.c:51 #, c-format -msgid "order is not a permutation of 1:nrow[%d]" -msgstr "顺序与 1 到 nrow[%d] 的排列不同" +msgid "" +"Item %d of order (%d) is either NA, out of range [1,%d], or is duplicated. " +"The new order must be a strict permutation of 1:n" +msgstr "排序(%2$d)的 %1$d 项为 NA,超出范围 [1,%3$d],或与其他项重复。" +"新的排序必须为 1:n 的排列" + +#: reorder.c:105 +msgid "dt passed to setcolorder has no names" +msgstr "setcolorder读取到的dt并没有名字" -#: reorder.c:57 +#: reorder.c:107 #, c-format -msgid "" -"Unable to allocate %d * %d bytes of working memory for reordering data.table" -msgstr "在工作内存中无法分配 %d * %d 个字节对 data.table 重新排序" +msgid "Internal error: dt passed to setcolorder has %d columns but %d names" +msgstr "内部错误: setcolorder读取到的dt有 %d 列但是有 %d 个名字。" #: shift.c:17 #, c-format @@ -4386,7 +4421,7 @@ msgstr "不支持 '%s' 类型" msgid "Internal error: subsetVectorRaw length(ans)==%d n=%d" msgstr "内部错误: subsetVectorRaw ans length(ans)==%d n=%d" -#: subset.c:88 +#: subset.c:101 #, c-format msgid "" "Internal error: column type '%s' not supported by data.table subset. All " @@ -4395,44 +4430,44 @@ msgstr "" "内部错误:data.table 子集不支持列类型 '%s' 。已知所有类型均被支持,因此请提交" "此BUG。" -#: subset.c:97 subset.c:121 +#: subset.c:110 subset.c:134 #, c-format msgid "Internal error. 'idx' is type '%s' not 'integer'" msgstr "内部错误:'idx' 是 '%s' 类型,而非 '整数'" -#: subset.c:122 +#: subset.c:135 #, c-format msgid "" "Internal error. 'maxArg' is type '%s' and length %d, should be an integer " "singleton" msgstr "内部错误:'maxArg' 是 '%s' 类型,长度为 %d ,应该是单一整数" -#: subset.c:123 +#: subset.c:136 msgid "Internal error: allowOverMax must be TRUE/FALSE" msgstr "内部错误:allowOverMax 必须是 TRUE 或 FALSE" -#: subset.c:125 +#: subset.c:138 #, c-format msgid "Internal error. max is %d, must be >= 0." msgstr "内部错误。最大值是 %d ,且必须 >= 0。" -#: subset.c:149 +#: subset.c:162 #, c-format msgid "i[%d] is %d which is out of range [1,nrow=%d]" msgstr "i[%d] 是 %d ,超出 [1,nrow=%d] 的范围" -#: subset.c:161 +#: subset.c:174 #, c-format msgid "" "Item %d of i is %d and item %d is %d. Cannot mix positives and negatives." msgstr "i 的第 %d 项是 %d ,第 %d 项是 %d 。正负不能混用。" -#: subset.c:171 +#: subset.c:184 #, c-format msgid "Item %d of i is %d and item %d is NA. Cannot mix negatives and NA." msgstr "i 的第 %d 项是 %d ,第 %d 项是 NA 。负值和 NA 不能混用。" -#: subset.c:207 +#: subset.c:220 #, c-format msgid "" "Item %d of i is %d but there are only %d rows. Ignoring this and %d more " @@ -4440,7 +4475,7 @@ msgid "" msgstr "" "i 的第 %d 项是 %d ,但只有 %d 行。忽略这项以及其他相似的 %d 项(共 %d 项)。" -#: subset.c:209 +#: subset.c:222 #, c-format msgid "" "Item %d of i is %d which removes that item but that has occurred before. " @@ -4449,40 +4484,40 @@ msgstr "" "i 的第 %d 项是 %d ,它删除了这项但此操作之前发生过。忽略该重复以及其他 %d 个" "重复。" -#: subset.c:223 +#: subset.c:236 #, c-format msgid "Column %d is NULL; malformed data.table." 
msgstr "%d 列为空(NULL);data.table 格式错误。" -#: subset.c:226 +#: subset.c:239 #, c-format msgid "Column %d ['%s'] is a data.frame or data.table; malformed data.table." msgstr "%d ['%s'] 列是 data.frame 或 data.table; data.table 格式错误。" -#: subset.c:231 +#: subset.c:244 #, c-format msgid "" "Column %d ['%s'] is length %d but column 1 is length %d; malformed data." "table." msgstr "%d ['%s'] 长度为 %d ,而列 1 的长度为 %d ;data.table 格式错误。" -#: subset.c:247 +#: subset.c:260 #, c-format msgid "Internal error. Argument 'x' to CsubsetDT is type '%s' not 'list'" msgstr "内部错误:CsubsetDT 的参数 'x' 是 '%s' 类型而非列表" -#: subset.c:260 +#: subset.c:273 #, c-format msgid "Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'" msgstr "内部错误:CsubsetDT 的参数 'cols' 是 '%s' 类型而非整数" -#: subset.c:337 +#: subset.c:350 msgid "" "Internal error: NULL can not be subset. It is invalid for a data.table to " "contain a NULL column." msgstr "内部错误:空集(NULL)不能作为子集。data.table 包含空列是无效的。" -#: subset.c:339 +#: subset.c:352 msgid "" "Internal error: CsubsetVector is internal-use-only but has received " "negatives, zeros or out-of-range" @@ -4533,30 +4568,30 @@ msgstr "内部错误:uniqlist 已经传递长度为 0 的序列" msgid "Internal error: uniqlist has been passed length(order)==%d but nrow==%d" msgstr "内部错误:uniqlist 已经传递长度为 %d 的序列,而行数是 %d" -#: uniqlist.c:96 uniqlist.c:127 uniqlist.c:208 uniqlist.c:245 uniqlist.c:318 +#: uniqlist.c:96 uniqlist.c:128 uniqlist.c:209 uniqlist.c:246 uniqlist.c:319 #, c-format msgid "Type '%s' not supported" msgstr "类型 '%s' 不被支持" -#: uniqlist.c:148 +#: uniqlist.c:149 msgid "Input argument 'x' to 'uniqlengths' must be an integer vector" msgstr "输入到 'uniqlengths' 的参数 'x' 必须是整数向量" -#: uniqlist.c:149 +#: uniqlist.c:150 msgid "" "Input argument 'n' to 'uniqlengths' must be an integer vector of length 1" msgstr "输入到 'uniqlengths' 的参数 'n' 必须是长度为 1 的整数向量" -#: uniqlist.c:167 +#: uniqlist.c:168 msgid "cols must be an integer vector with length >= 1" msgstr "cols必须是一个长度大于等于1的整数向量" -#: uniqlist.c:171 +#: uniqlist.c:172 #, c-format msgid "Item %d of cols is %d which is outside range of l [1,length(l)=%d]" msgstr "列的%d项是%d,它超出l的所在区间[1,length(l)=%d]" -#: uniqlist.c:174 +#: uniqlist.c:175 #, c-format msgid "" "All elements to input list must be of same length. 
Element [%d] has length " @@ -4565,81 +4600,81 @@ msgstr "" "列表的所有元素必须是同样的长度。元素[%d]的长度%不等于第一个元素的长" "度%" -#: uniqlist.c:255 +#: uniqlist.c:256 msgid "Internal error: nestedid was not passed a list length 1 or more" msgstr "内部错误:nestedid并不是一个长度大于或者等于1的列表" -#: uniqlist.c:262 +#: uniqlist.c:263 #, c-format msgid "Internal error: nrows[%d]>0 but ngrps==0" msgstr "内部错误:nrows[%d]>0但是but ngrps==0" -#: uniqlist.c:264 +#: uniqlist.c:265 msgid "cols must be an integer vector of positive length" msgstr "cols必须是一个长度大于零的整数向量" -#: uniqlist.c:349 +#: uniqlist.c:350 msgid "x is not a logical vector" msgstr "x不是一个逻辑向量" -#: utils.c:73 +#: utils.c:80 #, c-format msgid "Unsupported type '%s' passed to allNA()" msgstr "allNA() 不支持'%s'类型" -#: utils.c:92 +#: utils.c:99 msgid "'x' argument must be data.table compatible" msgstr "'x' 必须为data.table支持的类型" -#: utils.c:94 +#: utils.c:101 msgid "'check_dups' argument must be TRUE or FALSE" msgstr "参数'check_dups'必须为TRUE或者是FALSE" -#: utils.c:110 +#: utils.c:117 msgid "" "argument specifying columns is type 'double' and one or more items in it are " "not whole integers" msgstr "指定列的参数是一个双精度类型而其中至少有一个元素不是整数" -#: utils.c:116 +#: utils.c:123 #, c-format msgid "argument specifying columns specify non existing column(s): cols[%d]=%d" msgstr "指定列的参数指定了不存在的列: cols[%d]=%d" -#: utils.c:121 +#: utils.c:128 msgid "'x' argument data.table has no names" msgstr "data.table的参数x并没有名字" -#: utils.c:126 +#: utils.c:133 #, c-format msgid "" "argument specifying columns specify non existing column(s): cols[%d]='%s'" msgstr "指定列的参数指定了不存在的列: cols[%d]='%s'" -#: utils.c:129 +#: utils.c:136 msgid "argument specifying columns must be character or numeric" msgstr "指定列的参数必须是字符或者是数值" -#: utils.c:132 +#: utils.c:139 msgid "argument specifying columns specify duplicated column(s)" msgstr "指定列的参数指定了重复的列" -#: utils.c:138 +#: utils.c:145 #, c-format msgid "%s: fill argument must be length 1" msgstr "%s:fill参数的长度必须为1" -#: utils.c:171 +#: utils.c:178 #, c-format msgid "%s: fill argument must be numeric" msgstr "%s:fill参数必须为数值类型" -#: utils.c:273 +#: utils.c:280 #, c-format msgid "Internal error: unsupported type '%s' passed to copyAsPlain()" msgstr "内部错误:copyAsPlain()不支持类型为'%s'的参数" -#: utils.c:277 +#: utils.c:284 #, c-format msgid "" "Internal error: type '%s' passed to copyAsPlain() but it seems " @@ -4647,7 +4682,7 @@ msgid "" msgstr "" "内部错误:copyAsPlain()中参数为'%s'类型,但copyMostAttrib() 保留了ALTREP属性" -#: utils.c:312 +#: utils.c:319 #, c-format msgid "Found and copied %d column%s with a shared memory address\n" msgstr "发现并拷贝了具有相同的内存地址的%d列%s\n" From bd198f9e5798419eff242521653b17ccf3719f86 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 17 Jul 2020 12:37:58 -0600 Subject: [PATCH 055/588] correct milestone number link in NEWS heading --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 9c1d3cbc43..3737ff26f9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.12.9](https://github.com/Rdatatable/data.table/milestone/19) (in development) +# data.table [v1.12.9](https://github.com/Rdatatable/data.table/milestone/17) (in development) ## POTENTIALLY BREAKING CHANGES From bdc6da98b6dc1276accd906d1ec0f0cd921bbfc4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 21 Jul 2020 05:35:01 +0800 Subject: [PATCH 056/588] fix regression in dcast (#4619) --- R/fcast.R | 11 ++++++----- inst/tests/tests.Rraw | 4 ++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff 
--git a/R/fcast.R b/R/fcast.R index 91613960e8..db7a4b94b5 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -39,12 +39,13 @@ check_formula = function(formula, varnames, valnames) { deparse_formula = function(expr, varnames, allvars) { lvars = lapply(expr, function(this) { - if (this %iscall% '+') { - unlist(deparse_formula(as.list(this)[-1L], varnames, allvars)) - } else if (is.name(this) && this==quote(`...`)) { + if (!is.language(this)) return(NULL) + if (this %iscall% '+') return(unlist(deparse_formula(this[-1L], varnames, allvars))) + if (is.name(this) && this == quote(`...`)) { subvars = setdiff(varnames, allvars) - lapply(subvars, as.name) - } else this + return(lapply(subvars, as.name)) + } + this }) lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else x) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b990d96b29..774911872f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17098,3 +17098,7 @@ if (TZnotUTC) { ans, output=ans_print) } options(old) + +# 1 is treated as . in dcast formula, #4615 +DT = data.table(a = c("s", "x"), survmean = 1:2) +test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2L, key='.')) From 1cb18e384728c46eb620edbadf930d677279e304 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 21 Jul 2020 06:41:48 +0800 Subject: [PATCH 057/588] fix copy not to recurse infinitely on some objects (#4620) --- R/data.table.R | 5 ++++- inst/tests/tests.Rraw | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index e95420b6e0..ccb7f36e60 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2261,7 +2261,7 @@ is.na.data.table = function (x) { Ops.data.table = function(e1, e2 = NULL) { ans = NextMethod() - if (cedta() && is.data.frame(ans)) ans = as.data.table(ans) + if (cedta() && is.data.frame(ans)) ans = as.data.table(ans) else if (is.matrix(ans)) colnames(ans) = copy(colnames(ans)) ans } @@ -2358,7 +2358,10 @@ copy = function(x) { .Call(C_unlock, y) setalloccol(y) } else if (is.list(y)) { + oldClass = class(y) + setattr(y, 'class', NULL) # otherwise [[.person method (which returns itself) results in infinite recursion, #4620 y[] = lapply(y, reallocate) + if (!identical(oldClass, 'list')) setattr(y, 'class', oldClass) } y } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 774911872f..adaea569c6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17102,3 +17102,8 @@ options(old) # 1 is treated as . in dcast formula, #4615 DT = data.table(a = c("s", "x"), survmean = 1:2) test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2L, key='.')) + +# list object with [[ method that returns itself (e.g. person) lead to infinite loop in copy(), #4620 +y = person(given='Joel', family='Mossong') +test(2152, copy(y), y) + From e266c0f2073dc539d886c22324d3a0c2a65eaf0c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 21 Jul 2020 07:11:40 +0800 Subject: [PATCH 058/588] fix eval environment for [[ GForce is.atomic test (#4622) --- NEWS.md | 2 +- R/data.table.R | 4 ++-- inst/tests/tests.Rraw | 7 +++++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3737ff26f9..80ca7dd6ed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -99,7 +99,7 @@ unit = "s") 4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). 
Thanks also to @mllg for detecting a problem with the initial fix here during the dev release [#4173](https://github.com/Rdatatable/data.table/issues/4173). -5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). +5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks @hongyuanjia and @ColeMiller1 for helping debug an issue in dev with the original fix before release, [#4612](https://github.com/Rdatatable/data.table/issues/4612). 6. `all.equal(DT, y)` no longer errors when `y` is not a data.table, [#4042](https://github.com/Rdatatable/data.table/issues/4042). Thanks to @d-sci for reporting and the PR. diff --git a/R/data.table.R b/R/data.table.R index ccb7f36e60..99afcfb271 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1539,10 +1539,10 @@ replace_dot_alias = function(e) { jvnames = sdvars } } else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with - # g[[ only applies to atomic input, for now, was causing #4159 + # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 subopt = length(jsub) == 3L && (jsub[[1L]] == "[" || - (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), envir = x))) && + (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index adaea569c6..dbc73b33fa 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8088,6 +8088,13 @@ test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) +# When the object being [[ is in parent.frame(), not x, +# need eval to have enclos=parent.frame(), #4612 +DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) +DT0 = copy(DT) +fun = function (DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] +fun(DT) +test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) From 8a0f2241cd3c4a3034f802a2688e5be4db72fccf Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 20 Jul 2020 21:05:08 -0600 Subject: [PATCH 059/588] CRAN_Release basic section finds; #4606 --- .appveyor.yml | 2 +- .dev/CRAN_Release.cmd | 2 +- .travis.yml | 2 +- man/fread.Rd | 2 +- src/bmerge.c | 2 +- vignettes/datatable-intro.Rmd | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 5d1e2c7149..edd916d992 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -32,7 +32,7 @@ environment: - R_VERSION: release # the single Windows.zip binary (both 32bit/64bit) that users following dev version of installation instructions should click -# - R_VERSION: devel # When off it's to speed up dev cycle; R-devel is still checked but by GLCI on a roughly hourly cycle. + - R_VERSION: devel # When off it's to speed up dev cycle; R-devel is still checked but by GLCI on a roughly hourly cycle. CRAN_Release.cmd has a reminder to turn back on. 
before_build: - cmd: ECHO no Revision metadata added to DESCRIPTION diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 7b1e0a3a94..a82d8ce9d3 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -208,7 +208,7 @@ grep asCharacter *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAtt cd .. R -cc(test=TRUE, clean=TRUE, CC="gcc-8") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html +cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html saf = options()$stringsAsFactors options(stringsAsFactors=!saf) # check tests (that might be run by user) are insensitive to option, #2718 test.data.table() diff --git a/.travis.yml b/.travis.yml index f27b73b8f6..a6b9cc6c84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ r: os: - linux -# - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. + - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. before_install: - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install llvm && diff --git a/man/fread.Rd b/man/fread.Rd index 2dea746d84..37a4d06b9e 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -64,7 +64,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="" \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base:tempfile]{base::tempdir}}. } - \item{tz}{ Relevant to datetime values which have no Z or UTC-offset at the end, i.e. \emph{unmarked} datetime, as written by \code{\link[utils]{write.csv}}. The default \code{tz=""} means interpet unmarked datetime in the timezone of the R session, for consistency with R's \code{as.POSIXct()} and backwards compatibility. Set \code{tz="UTC"} to read unmarked datetime in UTC. Note that \code{fwrite()} by default writes datetime in UTC including the final Z (i.e. UTC-marked datetime) and \code{fwrite}'s output will be read by \code{fread} consistently and quickly without needing to use \code{tz=} or \code{colClasses=}. If the TZ environment variable is set to \code{"UTC"} (or \code{""} on non-Windows where unset vs `""` is significant) then R's timezone is already UTC, the default \code{tz=""} means UTC, and unmarked datetime will be read as UTC. The TZ environment variable being unset, however, means local time, in both C and R, and is quite different from the TZ environment variable being set to \code{""} on non-Windows which means UTC not local. You can use \code{Sys.setenv(TZ="UTC")}, and \code{Sys.unsetenv("TZ")}, too, and \code{fread} will use the latest value. } + \item{tz}{ Relevant to datetime values which have no Z or UTC-offset at the end, i.e. 
\emph{unmarked} datetime, as written by \code{\link[utils:write.table]{utils::write.csv}}. The default \code{tz=""} means interpet unmarked datetime in the timezone of the R session, for consistency with R's \code{as.POSIXct()} and backwards compatibility. Set \code{tz="UTC"} to read unmarked datetime in UTC. Note that \code{fwrite()} by default writes datetime in UTC including the final Z (i.e. UTC-marked datetime) and \code{fwrite}'s output will be read by \code{fread} consistently and quickly without needing to use \code{tz=} or \code{colClasses=}. If the TZ environment variable is set to \code{"UTC"} (or \code{""} on non-Windows where unset vs `""` is significant) then R's timezone is already UTC, the default \code{tz=""} means UTC, and unmarked datetime will be read as UTC. The TZ environment variable being unset, however, means local time, in both C and R, and is quite different from the TZ environment variable being set to \code{""} on non-Windows which means UTC not local. You can use \code{Sys.setenv(TZ="UTC")}, and \code{Sys.unsetenv("TZ")}, too, and \code{fread} will use the latest value. } } \details{ diff --git a/src/bmerge.c b/src/bmerge.c index 4c13f14b95..c6ae3e48ee 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -48,7 +48,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE if (!isInteger(icolsArg)) error(_("Internal error: icols is not integer vector")); // # nocov if (!isInteger(xcolsArg)) error(_("Internal error: xcols is not integer vector")); // # nocov if ((LENGTH(icolsArg) == 0 || LENGTH(xcolsArg) == 0) && LENGTH(i) > 0) // We let through LENGTH(i) == 0 for tests 2126.* - error(_("Internal error: icols and xcols must be non-empty integer vectors.")); + error(_("Internal error: icols and xcols must be non-empty integer vectors.")); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) error(_("Internal error: length(icols) [%d] > length(xcols) [%d]"), LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov icols = INTEGER(icolsArg); xcols = INTEGER(xcolsArg); diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 75ebd5bd14..85da6703a1 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -38,7 +38,7 @@ Briefly, if you are interested in reducing *programming* and *compute* time trem ## Data {#data} -In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the [Bureau of Transporation Statistics](http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236) for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. +In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the [Bureau of Transporation Statistics](http://www.transtats.bts.gov) for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. 
We can use `data.table`'s fast-and-friendly file reader `fread` to load `flights` directly as follows: From cbd99d4ab9f3ac18a40195c5dbda86740d48cc58 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 21 Jul 2020 01:10:34 -0600 Subject: [PATCH 060/588] pass clang-ubsan, all 4 in fread.c (#4624) --- src/fread.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fread.c b/src/fread.c index e9ae0288be..6976080822 100644 --- a/src/fread.c +++ b/src/fread.c @@ -989,7 +989,7 @@ static void parse_iso8601_date_core(const char **pch, int32_t *target) return; fail: - *target = NA_FLOAT64; + *target = NA_INT32; } static void parse_iso8601_date(FieldParseContext *ctx) { @@ -2267,10 +2267,10 @@ int freadMain(freadMainArgs _args) { // DTPRINT(_("Field %d: '%.10s' as type %d (tch=%p)\n"), j+1, tch, type[j], tch); fieldStart = tch; int8_t thisType = type[j]; // fetch shared type once. Cannot read half-written byte is one reason type's type is single byte to avoid atomic read here. - int8_t thisSize = size[j]; fun[abs(thisType)](&fctx); if (*tch!=sep) break; - ((char **) targets)[thisSize] += thisSize; + int8_t thisSize = size[j]; + if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0 tch++; j++; } @@ -2283,7 +2283,7 @@ int freadMain(freadMainArgs _args) { } else if (eol(&tch) && j Date: Tue, 21 Jul 2020 12:27:07 -0600 Subject: [PATCH 061/588] OSX Travis (#4623) * first steps in fixing issue by following advice in the output * Remove fortran symlink in Travis build (#4623) (#4627) Co-authored-by: Mark Klik --- .travis.yml | 101 +++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 53 deletions(-) diff --git a/.travis.yml b/.travis.yml index a6b9cc6c84..b219af006f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,53 +1,48 @@ -language: r -dist: trusty -sudo: required -cache: packages # to rebuild cache see tweet thread ending here https://twitter.com/jimhester_/status/1115718589804421121 -warnings_are_errors: true - -branches: - only: - - "master" - -r: - - release - -os: - - linux - - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. 
- -before_install: - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install llvm && - export PATH="/usr/local/opt/llvm/bin:$PATH" && - export LDFLAGS="-L/usr/local/opt/llvm/lib" && - export CFLAGS="-I/usr/local/opt/llvm/include"; fi - -r_packages: - - drat # used in .ci/deploy.sh to publish tar.gz to github.io/Rdatatable/data.table - - covr - -before_script: - - echo "Revision:" $TRAVIS_COMMIT >> ./DESCRIPTION - -after_success: - - test $TRAVIS_OS_NAME == "linux" && - travis_wait Rscript -e 'library(covr); codecov()' - - test $TRAVIS_OS_NAME == "linux" && - test $TRAVIS_REPO_SLUG == "Rdatatable/data.table" && - test $TRAVIS_PULL_REQUEST == "false" && - test $TRAVIS_BRANCH == "master" && - bash .ci/deploy.sh - -notifications: - email: - on_success: change - on_failure: change - -env: - global: - - PKG_CFLAGS="-O3 -Wall -pedantic" - - _R_CHECK_NO_STOP_ON_TEST_ERROR_=true - - _R_CHECK_CRAN_INCOMING_REMOTE_=false - # Block truncation of any error messages in R CMD check - - _R_CHECK_TESTS_NLINES_=0 - # drat using @jangorecki token - - secure: "CxDW++rsQApQWos+h1z/F76odysyD6AtXJrDwlCHlgqXeKJNRATR4wZDDR18SK+85jUqjoqOvpyrq+5kKuyg6AnA/zduaX2uYE5mcntEUiyzlG/jJUKbcJqt22nyAvFXP3VS60T2u4H6IIhVmr7dArdxLkv8W+pJvf2Tg6kx8Ws=" +language: r +dist: bionic +cache: packages # to rebuild cache see tweet thread ending here https://twitter.com/jimhester_/status/1115718589804421121 +warnings_are_errors: true + +r: + - release + +os: + - linux + - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. + +brew_packages: + - llvm + +r_packages: + - drat # used in .ci/deploy.sh to publish tar.gz to github.io/Rdatatable/data.table + - covr + +before_install: + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then rm "/usr/local/bin/gfortran"; fi + +before_script: + - echo "Revision:" $TRAVIS_COMMIT >> ./DESCRIPTION + +after_success: + - test $TRAVIS_OS_NAME == "linux" && + travis_wait Rscript -e 'library(covr); codecov()' + - test $TRAVIS_OS_NAME == "linux" && + test $TRAVIS_REPO_SLUG == "Rdatatable/data.table" && + test $TRAVIS_PULL_REQUEST == "false" && + test $TRAVIS_BRANCH == "master" && + bash .ci/deploy.sh + +notifications: + email: + on_success: change + on_failure: change + +env: + global: + - PKG_CFLAGS="-O3 -Wall -pedantic" + - _R_CHECK_NO_STOP_ON_TEST_ERROR_=true + - _R_CHECK_CRAN_INCOMING_REMOTE_=false + # Block truncation of any error messages in R CMD check + - _R_CHECK_TESTS_NLINES_=0 + # drat using @jangorecki token + - secure: "CxDW++rsQApQWos+h1z/F76odysyD6AtXJrDwlCHlgqXeKJNRATR4wZDDR18SK+85jUqjoqOvpyrq+5kKuyg6AnA/zduaX2uYE5mcntEUiyzlG/jJUKbcJqt22nyAvFXP3VS60T2u4H6IIhVmr7dArdxLkv8W+pJvf2Tg6kx8Ws=" From 55b9561d2e0d20767de06a59382391c0927bb1b2 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Jul 2020 00:03:44 -0600 Subject: [PATCH 062/588] to pass rchk (#4629) --- inst/tests/tests.Rraw | 3 +- src/chmatch.c | 17 ++++-- src/fifelse.c | 126 +++++++++++++++++++++--------------------- src/fmelt.c | 30 ++++++---- src/shift.c | 91 +++++++++++++++++------------- 5 files changed, 146 insertions(+), 121 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dbc73b33fa..eee8ea569d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16871,7 +16871,8 @@ s1 = class2132(x=20191231) s2 = class2132(x=20191230) test(2132.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are 
not supported.") -test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see https://github.com/Rdatatable/data.table/issues/4131.") +test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") +test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") rm(s1, s2, class2132) if (test_xts) { diff --git a/src/chmatch.c b/src/chmatch.c index c0f3397d6a..f80e7dd2c7 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -20,9 +20,10 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } } // allocations up front before savetl starts in case allocs fail - SEXP ans = PROTECT(allocVector(chin?LGLSXP:INTSXP, xlen)); + int nprotect=0; + SEXP ans = PROTECT(allocVector(chin?LGLSXP:INTSXP, xlen)); nprotect++; if (xlen==0) { // no need to look at table when x is empty (including null) - UNPROTECT(1); + UNPROTECT(nprotect); return ans; } int *ansd = INTEGER(ans); @@ -30,14 +31,18 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch if (tablelen==0) { const int val=(chin?0:nomatch), n=xlen; for (int i=0; i0) { if (xlength(cons) != len0) { error("Argument #%d has a different length than argument #1. " "Please make sure all logical conditions have the same length.", @@ -232,18 +228,21 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { UNPROTECT(2); } } - len1 = xlength(outs); - if (len1 != len0 && len1 != 1) { + int64_t len1 = xlength(outs); + if (len1!=len0 && len1!=1) { error("Length of output value #%d must either be 1 or length of logical condition.", i*2+2); } int64_t amask = len1>1 ? INT64_MAX : 0; + const int *restrict pcons = LOGICAL(cons); + const bool imask = i==0; + int64_t l=0; // how many this case didn't satisfy; i.e. left for next case switch(TYPEOF(outs)) { case LGLSXP: { const int *restrict pouts = LOGICAL(outs); int *restrict pans = LOGICAL(ans); const int pna = nonna ? 
LOGICAL(na)[0] : NA_LOGICAL; for (int64_t j=0; j -SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) { - - size_t size; - int protecti=0; - SEXP x, tmp=R_NilValue, elem, ans, thisfill; - unsigned long long *dthisfill; +SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) +{ + int nprotect=0; enum {LAG, LEAD/*, SHIFT, CYCLIC*/} stype = LAG; // currently SHIFT maps to LAG and CYCLIC is unimplemented (see comments in #1708) if (!xlength(obj)) return(obj); // NULL, list() + SEXP x; if (isVectorAtomic(obj)) { - x = PROTECT(allocVector(VECSXP, 1)); protecti++; + x = PROTECT(allocVector(VECSXP, 1)); nprotect++; SET_VECTOR_ELT(x, 0, obj); } else { if (!isNewList(obj)) @@ -32,17 +30,19 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) { const int *kd = INTEGER(k); for (int i=0; i= 0) || (stype == LEAD && kd[j] < 0)) { - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { - for (int m=0; m Date: Wed, 22 Jul 2020 14:30:30 -0600 Subject: [PATCH 063/588] reworked fmelt.c:concat to survive gctorture (#4631) --- inst/tests/tests.Rraw | 17 +++++++----- src/fmelt.c | 60 ++++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index eee8ea569d..3081f79dde 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3056,7 +3056,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) ans <- data.table(a=c(1, 2), b=c(2, 3), variable=factor('c'), value=c(3, 4))) test(1035.152, melt(x, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") test(1035.153, melt(x, measure.vars=3L, verbose=TRUE), ans, - output="'id.vars' is missing. Assigning all.*Assigned 'id.vars' are") + output="'id.vars' is missing. Assigning all.*Assigned 'id.vars' are [[]a, b[]]") test(1035.16, melt(x, id.vars="a", measure.vars="d"), error="One or more values") test(1035.17, melt(x, id.vars="d", measure.vars="a"), error="One or more values") @@ -3065,10 +3065,11 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) foo = function(input, by, var) { melt(input, id.vars = by, measure.vars=var) } - test(1035.18, foo(DT, by="x"), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)), warning="are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'") + test(1035.18, foo(DT, by="x"), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)), + warning="'measure.vars' [[]y, v[]] are not all of the same type.*molten data value column will be of type 'double'.*'double'") test(1035.19, foo(DT), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)), - warning=c("id.vars and measure.vars are internally guessed when both are 'NULL'", - "are not all of the same type. 
By order of hierarchy")) + warning=c("id.vars and measure.vars are internally guessed.*this case are columns [[]x[]]", + "'measure.vars' [[]y, v[]] are not all of the same type.*'double'.*'double'")) # Fix for #1055; was test 1495 DT <- data.table(A = 1:2, B = 3:4, D = 5:6, D = 7:8) test(1035.20, melt(DT, id.vars=1:2), data.table(A=1:2, B=3:4, @@ -3102,7 +3103,8 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) R.utils::decompressFile(testDir("melt_1754.R.gz"), tt<-tempfile(), remove=FALSE, FUN=gzfile, ext=NULL) source(tt, local=TRUE) # creates DT test(1036.01, dim(DT), INT(1,327)) - test(1036.02, dim(ans<-melt(DT, 1:2)), INT(325,4), warning="All measure variables not of type 'character' will be coerced") + test(1036.02, dim(ans<-melt(DT, 1:2)), INT(325,4), + warning="'measure.vars' [[]Geography, Estimate; SEX AND AGE - Total population, Margin of Error; SEX AND AGE - Total population, Percent; SEX AND AGE - Total population, [.][.][.][]] are not all of the same type.*the molten data value column will be of type 'character'.*not of type 'character' will be coerced too") test(1036.03, length(levels(ans$variable)), 317L) test(1036.04, levels(ans$variable)[c(1,2,316,317)], tt <- c("Geography", @@ -3112,7 +3114,8 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1036.05, range(as.integer(ans$variable)), INT(1,317)) test(1036.06, as.vector(table(table(as.integer(ans$variable)))), INT(309,8)) test(1036.07, sapply(ans, class), c(Id="character",Id2="integer",variable="factor",value="character")) - test(1036.08, dim(ans<-melt(DT, 1:2, variable.factor=FALSE)), INT(325,4), warning="All measure variables not of type 'character' will be coerced") + test(1036.08, dim(ans<-melt(DT, 1:2, variable.factor=FALSE)), INT(325,4), + warning="'measure.vars' [[]Geography, Estimate;.*[.][.][.][]].*'character'.*'character'") test(1036.09, sapply(ans, class), c(Id="character",Id2="integer",variable="character",value="character")) test(1036.10, ans$variable[c(1,2,324,325)], tt) @@ -3147,7 +3150,7 @@ Jun,34.5,23.7,19.3,14.9,1.1,87.5,87.5,0,13.8,13.8,0,250.1 Jul,36.1,26.6,22.3,17.9,7.8,106.2,106.2,0,12.3,12.3,0,271.6 Aug,35.6,24.8,20.8,16.7,6.1,100.6,100.6,0,13.4,13.4,0,230.7 Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") - test(1037.301, print(melt(DT, id.vars="month", verbose=TRUE)), output="'measure.vars' is missing.*Assigned.*are.*Record high.*1:.*Jan.*Record high.*12.8.*108:.*Sep.*sunshine hours.*174.1") + test(1037.301, print(melt(DT, id.vars="month", verbose=TRUE)), output="'measure.vars' is missing.*Assigned 'measure.vars' are [[]Record high, Average high, Daily mean, Average low, ...[]].*1:.*Jan.*Record high.*12.8.*108:.*Sep.*sunshine hours.*174.1") # coverage of reworked fmelt.c:getvarcols, #1754; was test 1574 # missing id satisfies data->lvalues!=1 at C level to test those branches diff --git a/src/fmelt.c b/src/fmelt.c index fe96d54ac0..76228cac67 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -61,32 +61,40 @@ SEXP whichwrapper(SEXP x, SEXP val) { return which(x, LOGICAL(val)[0]); } -// hack by calling paste using eval. could change this to strcat, but not sure about buffer size for large data.tables... Any ideas Matthew? 
-SEXP concat(SEXP vec, SEXP idx) { - - SEXP s, t, v; - int nidx=length(idx); - - if (TYPEOF(vec) != STRSXP) error(_("concat: 'vec must be a character vector")); +static const char *concat(SEXP vec, SEXP idx) { + if (!isString(vec)) error(_("concat: 'vec must be a character vector")); if (!isInteger(idx) || length(idx) < 0) error(_("concat: 'idx' must be an integer vector of length >= 0")); + + static char ans[1024]; // so only one call to concat() per calling warning/error + int nidx=length(idx), nvec=length(vec); + ans[0]='\0'; + if (nidx==0) return ans; const int *iidx = INTEGER(idx); - for (int i=0; i length(vec)) - error(_("Internal error in concat: 'idx' must take values between 0 and length(vec); 0 <= idx <= %d"), length(vec)); // # nocov + for (int i=0; invec) + error(_("Internal error in concat: 'idx' must take values between 1 and length(vec); 1 <= idx <= %d"), nvec); // # nocov } - PROTECT(v = allocVector(STRSXP, nidx > 5 ? 5 : nidx)); - for (int i=0; i4) nidx=4; // first 4 following by ... if there are more than 4 + int remaining=1018; // leaving space for ", ...\0" at the end of the 1024, potentially + char *pos=ans; + int i=0; + for (; iremaining) break; + strncpy(pos, CHAR(this), len); + pos+=len; + remaining-=len; + *pos++ = ','; + *pos++ = ' '; } - if (nidx > 5) SET_STRING_ELT(v, 4, mkChar("...")); - PROTECT(t = s = allocList(3)); - SET_TYPEOF(t, LANGSXP); - SETCAR(t, install("paste")); t = CDR(t); - SETCAR(t, v); t = CDR(t); - SETCAR(t, mkString(", ")); - SET_TAG(t, install("collapse")); - UNPROTECT(2); // v, (t,s) - return(eval(s, R_GlobalEnv)); + if (length(vec)>4 || ilvalues; ++i) { SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); if (!data->isidentical[i]) - warning(_("'measure.vars' [%s] are not all of the same type. By order of hierarchy, the molten data value column will be of type '%s'. All measure variables not of type '%s' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion.\n"), CHAR(STRING_ELT(concat(dtnames, thisvaluecols), 0)), type2char(data->maxtype[i]), type2char(data->maxtype[i])); + warning(_("'measure.vars' [%s] are not all of the same type. By order of hierarchy, the molten data value column will be of type '%s'. All measure variables not of type '%s' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion.\n"), concat(dtnames, thisvaluecols), type2char(data->maxtype[i]), type2char(data->maxtype[i])); if (data->maxtype[i] == VECSXP && data->narm) { if (verbose) Rprintf(_("The molten data value type is a list at item %d. 'na.rm=TRUE' is ignored.\n"), i+1); data->narm = FALSE; From 8eadb9850b446cf590a2a55127788791a8b4e2f1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Jul 2020 15:52:47 -0600 Subject: [PATCH 064/588] news item review and tidy --- NEWS.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 80ca7dd6ed..8191e04675 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,16 +7,18 @@ ## POTENTIALLY BREAKING CHANGES 1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. 
Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then by read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. - Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour should you need it: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. + + Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. ## NEW FEATURES 1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). -2. The C function `CsubsetDT` is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organisation of our C interface will be changed in next release. +2. `CsubsetDT` C function is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. 
This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organization of our C interface will be changed in future. -3. `print` method for `data.table`s gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. +3. `print` method for `data.table` gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. 4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion. @@ -73,7 +75,7 @@ unit = "s") 7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `patterns=` can still be used for filtering based on the column names. -8. Compiler support for OpenMP is now detected during installation, which allows data.table to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. This was typically a problem just after release to CRAN in the few days before macOS binaries (which do support OpenMP) are made available by CRAN. +8. 
Compiler support for OpenMP is now detected during installation, which allows `data.table` to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. 9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. @@ -137,7 +139,7 @@ unit = "s") 1. `as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). -2. `set2key`, `set2keyv`, and `key2` have been removed, as they have been warning since v1.9.8 (Nov 2016) and halting with helpful message since v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' and quickly superceded by `setindex` and `indices`. +2. `set2key`, `set2keyv`, and `key2` have been removed, as they have been warning since v1.9.8 (Nov 2016) and halting with helpful message since v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' and quickly superseded by `setindex` and `indices`. 3. `data.table` now supports messaging in simplified Chinese (locale `zh_CN`). This was the result of a monumental collaboration to translate `data.table`'s roughly 1400 warnings, errors, and verbose messages (about 16,000 words/100,000 characters) over the course of two months from volunteer translators in at least 4 time zones, most of whom are first-time `data.table` contributors and many of whom are first-time OSS contributors! @@ -149,7 +151,7 @@ unit = "s") We will evaluate the feasibility (in terms of maintenance difficulty and CRAN package size limits) of offering support for other languages in later releases. -4. `fifelse` and `fcase` notify users that S4 objects (except `nanotime`) are not supported [#4135](https://github.com/Rdatatable/data.table/issues/4135). Thanks to @torema-ed for bringing it to our attention and Morgan Jacob for the PR. +4. `fifelse` and `fcase` now notify users that S4 objects (except `nanotime`) are not supported [#4135](https://github.com/Rdatatable/data.table/issues/4135). Thanks to @torema-ed for bringing it to our attention and Morgan Jacob for the PR. 5. `frank(..., ties.method="random", na.last=NA)` now returns the same random ordering that `base::rank` does, [#4243](https://github.com/Rdatatable/data.table/pull/4243). @@ -158,7 +160,10 @@ unit = "s") ```R > DT = data.table(A=1:2) > DT[B:=3] - Error: Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number. + Error: Operator := detected in i, the first argument inside DT[...], but is only valid in + the second argument, j. Most often, this happens when forgetting the first comma + (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the + syntax. Run traceback(), and debugger() to get a line number. > DT[,B:=3] > DT A B @@ -169,11 +174,11 @@ unit = "s") 7. 
Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363). -8. Change of `c.POSIXct` method planned for R 4.1.0 impacted `foverlaps` function that could raise `'origin' must be supplied` error. Fix for planned change has been provided in [#4428](https://github.com/Rdatatable/data.table/pull/4428). +8. Changes upstream in R have been accomodated; e.g. `c.POSIXct` now raises `'origin' must be supplied` which impacted `foverlaps`, [#4428](https://github.com/Rdatatable/data.table/pull/4428). 9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. -10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528). +10. `data.table` packages binaries built by R 3.* should only be installed in R 3.*, and similarly `data.table` package binaries built by R 4.* should only be installed in R 4.*. Otherwise, `package ‘data.table’ was built under R version...` warning will occur and should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds, see news item in v1.12.6 below in this file. To continue to support both R 3.* and R 4.*, though, `data.table`'s NAMESPACE file contains a conditional on the R major version and this is what gives rise to the requirement that the major version (3 or 4) used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). 11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. From d5081e367e7ce2095862429139a3493ec772bf1d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Jul 2020 16:09:02 -0600 Subject: [PATCH 065/588] news tweak --- NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8191e04675..ed898e78db 100644 --- a/NEWS.md +++ b/NEWS.md @@ -162,9 +162,9 @@ unit = "s") > DT[B:=3] Error: Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma - (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the + (e.g. DT[newvar:=5] instead of DT[, new_var:=5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number. - > DT[,B:=3] + > DT[, B:=3] > DT A B @@ -178,7 +178,7 @@ unit = "s") 9. 
`data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. -10. `data.table` packages binaries built by R 3.* should only be installed in R 3.*, and similarly `data.table` package binaries built by R 4.* should only be installed in R 4.*. Otherwise, `package ‘data.table’ was built under R version...` warning will occur and should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds, see news item in v1.12.6 below in this file. To continue to support both R 3.* and R 4.*, though, `data.table`'s NAMESPACE file contains a conditional on the R major version and this is what gives rise to the requirement that the major version (3 or 4) used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). +10. `data.table` packages binaries built by R version `3.*.*` should only be installed in R `3.*.*`, and similarly `data.table` package binaries built by R `4.*.*` should only be installed in R `4.*.*`. Otherwise, `package ‘data.table’ was built under R version...` warning will occur and this should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R `4.0.0` which enabled us to remove workarounds, see news item in v1.12.6 below in this file. To continue to support both R `3.*.*` and R `4.*.*`, though, `data.table`'s NAMESPACE file contains a conditional on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). 11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. From aa608710cf0ec03c3c6cc3d7c03c96f2034ac856 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Jul 2020 16:14:53 -0600 Subject: [PATCH 066/588] news tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ed898e78db..b79bb8475e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -178,7 +178,7 @@ unit = "s") 9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. -10. `data.table` packages binaries built by R version `3.*.*` should only be installed in R `3.*.*`, and similarly `data.table` package binaries built by R `4.*.*` should only be installed in R `4.*.*`. Otherwise, `package ‘data.table’ was built under R version...` warning will occur and this should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R `4.0.0` which enabled us to remove workarounds, see news item in v1.12.6 below in this file. 
To continue to support both R `3.*.*` and R `4.*.*`, though, `data.table`'s NAMESPACE file contains a conditional on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). +10. `data.table` packages binaries built by R version 3 (R3) should only be installed in R3, and similarly `data.table` package binaries built by R4 should only be installed in R4. Otherwise, `package ‘data.table’ was built under R version...` warning will occur which should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds, see news item in v1.12.6 below in this file. To continue to support both R3 and R4, `data.table`'s NAMESPACE file contains a condition on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). 11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. From a8ec94484d2cc375d8295a94bacc5353576c238a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 24 Jul 2020 11:04:08 -0600 Subject: [PATCH 067/588] 1.13.0 on CRAN. Bump to 1.13.1 --- .dev/CRAN_Release.cmd | 41 +++++++++++++++++++++-------------------- DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 11 ++++++++++- src/init.c | 2 +- 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index a82d8ce9d3..82a7aeb384 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -215,15 +215,15 @@ test.data.table() install.packages("xml2") # to check the 150 URLs in NEWS.md under --as-cran below q("no") R CMD build . -R CMD check data.table_1.12.9.tar.gz --as-cran -R CMD INSTALL data.table_1.12.9.tar.gz --html +R CMD check data.table_1.13.1.tar.gz --as-cran +R CMD INSTALL data.table_1.13.1.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.12.9.tar.gz +R CMD check data.table_1.13.1.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -262,7 +262,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.12.9.tar.gz +R310 CMD INSTALL ./data.table_1.13.1.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -274,7 +274,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . 
-R CMD INSTALL data.table_1.12.9.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.13.1.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -282,7 +282,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.12.9.tar.gz +R CMD check data.table_1.13.1.tar.gz ##################################################### @@ -332,8 +332,8 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.12.9.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.12.9.tar.gz +Rdevel-strict-gcc CMD INSTALL data.table_1.13.1.tar.gz +Rdevel-strict-clang CMD INSTALL data.table_1.13.1.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here Rdevel-strict-gcc Rdevel-strict-clang # repeat below with clang and gcc @@ -374,7 +374,7 @@ cd R-devel make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O0 -g line active, for info on source lines with any problems -Rdevel CMD INSTALL data.table_1.12.9.tar.gz +Rdevel CMD INSTALL data.table_1.13.1.tar.gz Rdevel -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite" # gctorture(TRUE) # very slow, many days # gctorture2(step=100) @@ -412,7 +412,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.12.9.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.13.1.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -565,7 +565,7 @@ du -k inst/tests # 1.5MB before bzip2 inst/tests/*.Rraw # compress *.Rraw just for release to CRAN; do not commit compressed *.Rraw to git du -k inst/tests # 0.75MB after R CMD build . -R CMD check data.table_1.12.8.tar.gz --as-cran +R CMD check data.table_1.13.0.tar.gz --as-cran # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -573,30 +573,31 @@ Resubmit to winbuilder (R-release, R-devel and R-oldrelease) Submit to CRAN. Message template : ------------------------------------------------------------ Hello, -779 CRAN revdeps checked. No status changes. -All R-devel issues resolved. -New gcc10 warnings resolved. -Solaris is not resolved but this release will write more output upon that error so I can trace the problem. +870 CRAN revdeps checked. +The following 3 are impacted and we have communicated with their maintainers: + expss nc memochange +All known issues resolved including clang-UBSAN additional issue. +Solaris is not resolved but this release will write more output upon that error so I can continue to trace that problem. Many thanks! Best, Matt ------------------------------------------------------------ DO NOT commit or push to GitHub. Leave 4 files (.dev/CRAN_Release.cmd, DESCRIPTION, NEWS and init.c) edited and not committed. Include these in a single and final bump commit below. DO NOT even use a PR. 
Because PRs build binaries and we don't want any binary versions of even release numbers available from anywhere other than CRAN. -Leave milestone open with a 'final checks' issue open. Keep updating status there. +Leave milestone open with a 'release checks' issue open. Keep updating status there. ** If on EC2, shutdown instance. Otherwise get charged for potentially many days/weeks idle time with no alerts ** If it's evening, SLEEP. It can take a few days for CRAN's checks to run. If any issues arise, backport locally. Resubmit the same even version to CRAN. CRAN's first check is automatic and usually received within an hour. WAIT FOR THAT EMAIL. When CRAN's email contains "Pretest results OK pending a manual inspection" (or similar), or if not and it is known why not and ok, then bump dev. ###### Bump dev -0. Close milestone to prevent new issues being tagged with it. The final 'release checks' issue can be left open in a closed milestone. +0. Close milestone to prevent new issues being tagged with it. Update its name to the even release. The final 'release checks' issue can be left open in a closed milestone. 1. Check that 'git status' shows 4 files in modified and uncommitted state: DESCRIPTION, NEWS.md, init.c and this .dev/CRAN_Release.cmd 2. Bump version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.12.7 to 1.12.9, and 1.12.6 to 1.12.8 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.12.9 to 1.13.1, and 1.12.8 to 1.13.0 (e.g. in step 8 and 9 below) 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.12.8 on CRAN. Bump to 1.12.9" -9. Take sha from step 8 and run `git tag 1.12.8 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.12.8` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.13.0 on CRAN. Bump to 1.13.1" +9. 
Take sha from step 8 and run `git tag 1.13.0 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.13.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/DESCRIPTION b/DESCRIPTION index 945b0accca..f8c578f359 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.12.9 +Version: 1.13.1 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/Makefile b/Makefile index 634c823d9a..d10a43fc9d 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.12.9.tar.gz + $(RM) data.table_1.13.1.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.12.9.tar.gz + $(R) CMD INSTALL data.table_1.13.1.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.12.9.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.1.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index b79bb8475e..2616e353ec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,16 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.12.9](https://github.com/Rdatatable/data.table/milestone/17) (in development) +# data.table [v1.13.1](https://github.com/Rdatatable/data.table/milestone/19) (in development) + +## NEW FEATURES + +## BUG FIXES + +## NOTES + + +# data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) ## POTENTIALLY BREAKING CHANGES diff --git a/src/init.c b/src/init.c index 4e7c5ec313..cc51f9f2da 100644 --- a/src/init.c +++ b/src/init.c @@ -412,6 +412,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.12.9"))); + return(ScalarString(mkChar("1.13.1"))); } From 4fd649ea54c582618015c6c66f15d8f65d3ca89c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 24 Jul 2020 11:55:24 -0600 Subject: [PATCH 068/588] CRAN_Release.cmd: add reminder to add ?closed=1 to the milestone line --- .dev/CRAN_Release.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 82a7aeb384..e77f4d5eec 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -558,7 +558,7 @@ ls -1 *.tar.gz | grep -E 'Chicago|dada2|flowWorkspace|LymphoSeq' | TZ='UTC' para Bump version to even release number in 3 places : 1) DESCRIPTION - 2) NEWS (without 'on CRAN date' text as that's not yet known) + 2) NEWS; add ?closed=1 to the milestone link, don't add date yet as that published-on-CRAN date isn't yet known 3) dllVersion() at the end of init.c DO NOT push to GitHub. Prevents even a slim possibility of user getting premature version. Even release numbers must have been obtained from CRAN and only CRAN. There were too many support problems in the past before this procedure was brought in. 
du -k inst/tests # 1.5MB before From db618444a4d16b91cccf35d80fa3f306739dd571 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 26 Jul 2020 15:24:04 +0800 Subject: [PATCH 069/588] small typo (#4632) --- src/fmelt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 76228cac67..22a4ac1fc5 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -62,9 +62,9 @@ SEXP whichwrapper(SEXP x, SEXP val) { } static const char *concat(SEXP vec, SEXP idx) { - if (!isString(vec)) error(_("concat: 'vec must be a character vector")); + if (!isString(vec)) error(_("concat: 'vec' must be a character vector")); if (!isInteger(idx) || length(idx) < 0) error(_("concat: 'idx' must be an integer vector of length >= 0")); - + static char ans[1024]; // so only one call to concat() per calling warning/error int nidx=length(idx), nvec=length(vec); ans[0]='\0'; @@ -81,7 +81,7 @@ static const char *concat(SEXP vec, SEXP idx) { for (; iremaining) break; + if (len>remaining) break; strncpy(pos, CHAR(this), len); pos+=len; remaining-=len; @@ -104,13 +104,13 @@ SEXP measurelist(SEXP measure, SEXP dtnames) { for (int i=0; i Date: Mon, 3 Aug 2020 18:43:15 -0400 Subject: [PATCH 070/588] Fix bit::copy for unit tests (#4656) --- NEWS.md | 17 +++++++++++++++++ inst/tests/nafill.Rraw | 6 ++++++ inst/tests/tests.Rraw | 5 +++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2616e353ec..653ab87b6d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,23 @@ ## NOTES +1. `bit64` v4.0.2 released on 30th July broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` merely suggests `bit64` and does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be changed to include suggests in reverse dependency testing. + + The first break was because `all.equal` did not work in previous versions of `bit64`; e.g., + +```R +require(bit64) +all.equal(as.integer64(3), as.integer64(4)) +TRUE # < v4.0.0 +FALSE # >= v4.0.0 +``` + + We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a very brief "new feature" in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests on `integer64` and `nanotime` started to fail upon `bit64`'s update. Fortunately, the `fcase` results were correct but the tests were comparing to an incorrect result which was incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. + + The second break caused by `bit64` was the addition of a `copy` function. Since `data.table::copy` is long standing we hope that `bit64` can rename its new `copy` function. Otherwise, users of `data.table` may need to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. + + Thanks to Cole Miller for the PR to accomodate `bit64`'s update. 
+ # data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index 99a404b4d9..f22a66f702 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -8,6 +8,12 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { INT = data.table:::INT colnamesInt = data.table:::colnamesInt coerceFill = data.table:::coerceFill + # masked by which package? + # ================================= + copy = data.table::copy # bit64; copy is used in this file, so this line is needed + setattr = data.table::setattr # bit ; setattr does not appear in this file, so not needed. Here in case that changes. + # use of copy and setattr within data.table's own code is not masked by other packages + # we only have to do this in test files because, like a user would, these test files run like a user } sugg = c( diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3081f79dde..a7430c8e10 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -77,6 +77,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { melt = data.table::melt # reshape2 last = data.table::last # xts first = data.table::first # xts, S4Vectors + copy = data.table::copy # bit64 v4; bit64 offered to rename though so this is just in case bit64 unoffers } # Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN @@ -16755,11 +16756,11 @@ test(2127.24, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-1 test(2127.25, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=123), error="Resulting value has different class than 'default'. Please make sure that both arguments have the same class.") if(test_bit64) { i=as.integer64(1:12)+3e9 - test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:12L]+100)) + test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:11L]+100, as.integer64(NA))) } if(test_nanotime) { n=nanotime(1:12) - test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:12L]+100)) + test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:11L]+100, as.integer64(NA))) } test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec)) test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec) From 863e6611440d98b772f98eb50dd846ac7c553b2d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 3 Aug 2020 16:51:28 -0600 Subject: [PATCH 071/588] news formatting only (code section indentation); follow up to #4656 --- NEWS.md | 104 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/NEWS.md b/NEWS.md index 653ab87b6d..601c8a7e1c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,12 +14,12 @@ The first break was because `all.equal` did not work in previous versions of `bit64`; e.g., -```R -require(bit64) -all.equal(as.integer64(3), as.integer64(4)) -TRUE # < v4.0.0 -FALSE # >= v4.0.0 -``` + ```R + require(bit64) + all.equal(as.integer64(3), as.integer64(4)) + TRUE # < v4.0.0 + FALSE # >= v4.0.0 + ``` We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a very brief "new feature" in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. 
They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests on `integer64` and `nanotime` started to fail upon `bit64`'s update. Fortunately, the `fcase` results were correct but the tests were comparing to an incorrect result which was incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. @@ -52,52 +52,52 @@ FALSE # >= v4.0.0 6. New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details. -```R -# Lazy evaluation -x = 1:10 -data.table::fcase( - x < 5L, 1L, - x >= 5L, 3L, - x == 5L, stop("provided value is an unexpected one!") -) -# [1] 1 1 1 1 3 3 3 3 3 3 - -dplyr::case_when( - x < 5L ~ 1L, - x >= 5L ~ 3L, - x == 5L ~ stop("provided value is an unexpected one!") -) -# Error in eval_tidy(pair$rhs, env = default_env) : -# provided value is an unexpected one! - -# Benchmark -x = sample(1:100, 3e7, replace = TRUE) # 114 MB -microbenchmark::microbenchmark( -dplyr::case_when( - x < 10L ~ 0L, - x < 20L ~ 10L, - x < 30L ~ 20L, - x < 40L ~ 30L, - x < 50L ~ 40L, - x < 60L ~ 50L, - x > 60L ~ 60L -), -data.table::fcase( - x < 10L, 0L, - x < 20L, 10L, - x < 30L, 20L, - x < 40L, 30L, - x < 50L, 40L, - x < 60L, 50L, - x > 60L, 60L -), -times = 5L, -unit = "s") -# Unit: seconds -# expr min lq mean median uq max neval -# dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 -# data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 -``` + ```R + # Lazy evaluation + x = 1:10 + data.table::fcase( + x < 5L, 1L, + x >= 5L, 3L, + x == 5L, stop("provided value is an unexpected one!") + ) + # [1] 1 1 1 1 3 3 3 3 3 3 + + dplyr::case_when( + x < 5L ~ 1L, + x >= 5L ~ 3L, + x == 5L ~ stop("provided value is an unexpected one!") + ) + # Error in eval_tidy(pair$rhs, env = default_env) : + # provided value is an unexpected one! + + # Benchmark + x = sample(1:100, 3e7, replace = TRUE) # 114 MB + microbenchmark::microbenchmark( + dplyr::case_when( + x < 10L ~ 0L, + x < 20L ~ 10L, + x < 30L ~ 20L, + x < 40L ~ 30L, + x < 50L ~ 40L, + x < 60L ~ 50L, + x > 60L ~ 60L + ), + data.table::fcase( + x < 10L, 0L, + x < 20L, 10L, + x < 30L, 20L, + x < 40L, 30L, + x < 50L, 40L, + x < 60L, 50L, + x > 60L, 60L + ), + times = 5L, + unit = "s") + # Unit: seconds + # expr min lq mean median uq max neval + # dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 + # data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 + ``` 7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `patterns=` can still be used for filtering based on the column names. 
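The `.SDcols=` predicate feature described in news item 7 above is easiest to see with a tiny table. A minimal sketch follows; the table, column names, and values are invented purely for illustration and are not part of the patch:

```R
library(data.table)
DT = data.table(id = 1:4, grp = c("a","a","b","b"), x = rnorm(4), y = 4:1)

# .SDcols accepts a function used to select the columns of .SD: keep numeric columns only
DT[, lapply(.SD, mean), .SDcols = is.numeric]

# a negated predicate keeps the non-character columns (a la Negate())
DT[, names(.SD), .SDcols = !is.character]

# name-based selection via patterns() still works alongside the new predicate form
DT[, .SD, .SDcols = patterns("^[xy]$")]
```

As the item notes, any function (even an ad hoc one) returning scalar `TRUE`/`FALSE` per column can be supplied, so a predicate such as `function(col) is.numeric(col) && !anyNA(col)` would be handled the same way.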
From 8665aa0b412cd943cae787874092dede469936f7 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Mon, 3 Aug 2020 20:04:12 -0400 Subject: [PATCH 072/588] Include .NGRP alias In Special Symbols (#4657) --- NEWS.md | 4 +++- man/special-symbols.Rd | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 601c8a7e1c..039674b907 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,7 +25,9 @@ The second break caused by `bit64` was the addition of a `copy` function. Since `data.table::copy` is long standing we hope that `bit64` can rename its new `copy` function. Otherwise, users of `data.table` may need to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. - Thanks to Cole Miller for the PR to accomodate `bit64`'s update. + Thanks to Cole Miller for the PR to accomodate `bit64`'s update. + +2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. # data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index a22042af1a..30cfedc5fa 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -7,6 +7,7 @@ \alias{.BY} \alias{.N} \alias{.EACHI} +\alias{.NGRP} \title{ Special symbols } \description{ \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes and examples here and in \code{\link{data.table}}. From 39c06aa9e679a8291474f4076dcee067693c4a40 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 4 Aug 2020 08:06:58 +0800 Subject: [PATCH 073/588] fix misleading NEWS item (#4648) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 039674b907..ee51abed4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -101,7 +101,7 @@ # data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 ``` -7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `patterns=` can still be used for filtering based on the column names. +7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `.SDcols=patterns(...)` can still be used for filtering based on the column names. 8. Compiler support for OpenMP is now detected during installation, which allows `data.table` to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. 
A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. From f4e89a5ae8ab8b0bc95a2c2f7f44aed4684f40f1 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 3 Aug 2020 19:15:47 -0500 Subject: [PATCH 074/588] Add header to declare C level API (#4645) --- inst/include/datatableAPI.h | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 inst/include/datatableAPI.h diff --git a/inst/include/datatableAPI.h b/inst/include/datatableAPI.h new file mode 100644 index 0000000000..9e8bb48f31 --- /dev/null +++ b/inst/include/datatableAPI.h @@ -0,0 +1,45 @@ + +/* This header file provides the interface used by other packages, + and should be included once per package. */ + +#ifndef _R_data_table_API_h_ +#define _R_data_table_API_h_ + +/* number of R header files (possibly listing too many) */ +#include + +#ifdef HAVE_VISIBILITY_ATTRIBUTE + # define attribute_hidden __attribute__ ((visibility ("hidden"))) +#else + # define attribute_hidden +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* provided the interface for the function exported in + ../src/init.c via R_RegisterCCallable() */ + +SEXP attribute_hidden DT_subsetDT(SEXP x, SEXP rows, SEXP cols) { + static SEXP(*fun)(SEXP, SEXP, SEXP) = + (SEXP(*)(SEXP,SEXP,SEXP)) R_GetCCallable("data.table", "CsubsetDT"); + return fun(x,rows,cols); +} + +/* permit opt-in to redefine shorter identifiers */ +#if defined(DATATABLE_REMAP_API) + #define subsetDT DT_subsetDT +#endif + +#ifdef __cplusplus +} + +/* add a namespace for C++ use */ +namespace dt { + inline SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { return DT_subsetDT(x, rows, cols); } +} + +#endif /* __cplusplus */ + +#endif /* _R_data_table_API_h_ */ From d0942ad1d00308a9a07f06f51aa42b35cbee42ab Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 3 Aug 2020 18:21:30 -0600 Subject: [PATCH 075/588] Added Dirk to contributors list in DESCRIPTION; #4645 --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index f8c578f359..3fb8269de6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -59,7 +59,8 @@ Authors@R: c( person("Cole","Miller", role="ctb"), person("Jens Peder","Meldgaard", role="ctb"), person("Vaclav","Tlapak", role="ctb"), - person("Kevin","Ushey", role="ctb")) + person("Kevin","Ushey", role="ctb"), + person("Dirk","Eddelbuettel", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, xts, nanotime, zoo, yaml, knitr, rmarkdown From dfd91ca0caa47fca735d3d9f80c87fd8ac271bf7 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Mon, 3 Aug 2020 21:27:48 -0400 Subject: [PATCH 076/588] Update locale in test.data.table (#4630) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 9 +++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index ee51abed4d..f5f83a6c7b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ ## BUG FIXES +1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. + ## NOTES 1. `bit64` v4.0.2 released on 30th July broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. 
testing of the packages which use `bit64`) did not include `data.table` because `data.table` merely suggests `bit64` and does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be changed to include suggests in reverse dependency testing. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a7430c8e10..f71cb37ba4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16970,12 +16970,8 @@ if (.Platform$OS.type=="windows") local({ LC_NUMERIC = "C", LC_TIME = "Chinese (Simplified)_China.936" ) - for (i in seq_along(x)) { - lc = names(x)[[i]] - old = Sys.getlocale(lc) - Sys.setlocale(lc, x[[i]]) - on.exit(Sys.setlocale(lc, old), add = TRUE) - } + x_old = Map(Sys.getlocale, names(x)) + invisible(Map(Sys.setlocale, names(x), x)) old = Sys.getenv('LANGUAGE') Sys.setenv('LANGUAGE' = 'zh_CN') on.exit({ @@ -16983,6 +16979,7 @@ if (.Platform$OS.type=="windows") local({ Sys.setenv('LANGUAGE' = old) else Sys.unsetenv('LANGUAGE') + invisible(Map(Sys.setlocale, names(x_old), x_old)) }, add = TRUE) # triggered segfault here in #4402, Windows-only under translation. # test that the argument order changes correctly (the 'item 2' moves to the beginning of the message) From 42d88500e01ddbc576a8904b341161d31832c4ea Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 4 Aug 2020 10:36:32 -0600 Subject: [PATCH 077/588] news item tweak, and GLCI rel-cran now 2 NOTE (both size) up from 1 NOTE --- .gitlab-ci.yml | 4 ++-- NEWS.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dfda08355b..a1dadf97ce 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -170,7 +170,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual variables: _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 - _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "1 NOTE" + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "2 NOTE" before_script: - *install-deps - *cp-src @@ -184,7 +184,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (size of tarball) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 2 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 2 NOTE"), " (size of tarball) but ", shQuote(l)) else q("no")' test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin diff --git a/NEWS.md b/NEWS.md index f5f83a6c7b..7f1c22379f 100644 --- a/NEWS.md +++ 
b/NEWS.md @@ -23,7 +23,7 @@ FALSE # >= v4.0.0 ``` - We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a very brief "new feature" in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests on `integer64` and `nanotime` started to fail upon `bit64`'s update. Fortunately, the `fcase` results were correct but the tests were comparing to an incorrect result which was incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. + We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a very brief "new feature" in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests started to fail upon `bit64`'s update. Fortunately, the `fcase` results were correct but the tests were comparing to an incorrect result. These tests were incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. Note also that `all.equal` always returned TRUE for any `nanotime` input, since `nanotime`'s underlying type is `bit64`. The second break caused by `bit64` was the addition of a `copy` function. Since `data.table::copy` is long standing we hope that `bit64` can rename its new `copy` function. Otherwise, users of `data.table` may need to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. 
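The `bit64` interactions covered in the note above (the new `integer64` method for `all.equal` and the new `copy` function) can be reproduced in an ordinary session. The sketch below assumes `bit64` >= 4.0.0 is installed and attached after `data.table`; exact results depend on the installed `bit64` version:

```R
library(data.table)
library(bit64)   # attached after data.table, so bit64's new copy() masks data.table::copy()

# bit64 4.0.0 added an integer64 method for all.equal; per the note, older
# versions returned TRUE here regardless of the values being compared
isTRUE(all.equal(as.integer64(3), as.integer64(4)))

# qualify copy() explicitly so data.table's deep copy is used despite the masking
DT  = data.table(x = 1:3)
DT2 = data.table::copy(DT)
DT2[, x := x * 10L]
DT$x    # still 1 2 3; the original table was not modified by reference
```

This mirrors the workaround added to the test files above (`copy = data.table::copy`), which run like a user session where both packages are attached.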
From 80d83d11a320954a42d3728efee6848236e72e64 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 4 Aug 2020 12:08:51 -0600 Subject: [PATCH 078/588] GLCI rel-cran 'NOTEs' not 'NOTE' --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a1dadf97ce..0a0ed13a84 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -170,7 +170,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual variables: _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 - _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "2 NOTE" + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "2 NOTEs" before_script: - *install-deps - *cp-src @@ -184,7 +184,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 2 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 2 NOTE"), " (size of tarball) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 2 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 2 NOTEs"), " (size of tarball) but ", shQuote(l)) else q("no")' test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin From 3715f726e57390b38c8c6368b99c1b2c4fff78ae Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 4 Aug 2020 15:18:51 -0600 Subject: [PATCH 079/588] removed the exit 0 and added the -- for FreeBSD (#4662) --- NEWS.md | 2 ++ configure | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7f1c22379f..6074ee33eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ 1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. +2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. + ## NOTES 1. `bit64` v4.0.2 released on 30th July broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` merely suggests `bit64` and does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. 
In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be changed to include suggests in reverse dependency testing. diff --git a/configure b/configure index a0746dee00..b80d9f1dd9 100755 --- a/configure +++ b/configure @@ -31,9 +31,9 @@ else msg=1 else lib=`pkg-config --libs zlib` - expr "$lib" : ".*-lz$" >/dev/null + expr -- "$lib" : ".*-lz$" >/dev/null # -- for FreeBSD, #4652 if [ $? -ne 0 ]; then - expr "$lib" : ".*-lz " >/dev/null + expr -- "$lib" : ".*-lz " >/dev/null # would use \b in one expr but MacOS does not support \b if [ $? -ne 0 ]; then echo "*** pkg-config is installed and 'pkg-config --exists zlib' succeeds but" @@ -45,12 +45,13 @@ else fi if [ $msg -ne 0 ]; then - echo "*** Compilation will now be attempted and if it works you can ignore this message. However," - echo "*** if compilation fails, try 'locate zlib.h zconf.h' and ensure the zlib development library" - echo "*** is installed :" + echo "*** Compilation will now be attempted and if it works you can ignore this message. In" + echo "*** particular, this should be the case on Mac where zlib is built in." + echo "*** However, if compilation fails, try 'locate zlib.h zconf.h' and ensure the zlib" + echo "*** development library is installed :" echo "*** deb: zlib1g-dev (Debian, Ubuntu, ...)" echo "*** rpm: zlib-devel (Fedora, EPEL, ...)" - echo "*** brew: zlib (OSX)" + echo "*** There is a zlib in brew for OSX but the built in zlib should work." echo "*** Note that zlib is required to compile R itself so you may find the advice in the R-admin" echo "*** guide helpful regarding zlib. On Debian/Ubuntu, zlib1g-dev is a dependency of r-base as" echo "*** shown by 'apt-cache showsrc r-base | grep ^Build-Depends | grep zlib', and therefore" @@ -59,12 +60,11 @@ if [ $msg -ne 0 ]; then echo "*** 1) 'pkg-config --exists zlib' succeeds (i.e. \$? -eq 0)" echo "*** 2) 'pkg-config --libs zlib' contains -lz" echo "*** Compilation will now be attempted ..." - exit 0 +else + version=`pkg-config --modversion zlib` + echo "zlib ${version} is available ok" fi -version=`pkg-config --modversion zlib` -echo "zlib ${version} is available ok" - # Test if we have a OPENMP compatible compiler # Aside: ${SHLIB_OPENMP_CFLAGS} does not appear to be defined at this point according to Matt's testing on # Linux, and R CMD config SHLIB_OPENMP_CFLAGS also returns 'no information for variable'. That's not From 4f761622606e7253e6da1829ef546296aa83126c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 4 Aug 2020 18:31:32 -0600 Subject: [PATCH 080/588] Solaris const nth and added grep to CRAN_Release.cmd (#4663) --- .dev/CRAN_Release.cmd | 4 +++- src/subset.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index e77f4d5eec..968a308788 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -103,7 +103,9 @@ grep omp_set_nested ./src/*.c grep --exclude="./src/openmp-utils.c" omp_get_max_threads ./src/* # Ensure all #pragama omp parallel directives include a num_threads() clause -grep "pragma omp parallel" ./src/*.c | grep -v getDTthreads +grep -i "pragma.*omp parallel" ./src/*.c | grep -v getDTthreads +# for each num_threads(nth) above, ensure for Solaris that the variable is not declared const, #4638 +grep -i "const.*int.*nth" ./src/*.c # Update documented list of places where openMP parallelism is used: c.f. 
?openmp grep -Elr "[pP]ragma.*omp" src | sort diff --git a/src/subset.c b/src/subset.c index f9c66e2df8..0eb1b2a72d 100644 --- a/src/subset.c +++ b/src/subset.c @@ -11,7 +11,7 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA) // negatives, zeros and out-of-bounds have already been dealt with in convertNegAndZero so we can rely // here on idx in range [1,length(ans)]. - const int nth = getDTthreads(n, /*throttle=*/true); + int nth = getDTthreads(n, /*throttle=*/true); // not const for Solaris, #4638 // For small n such as 2,3,4 etc we had hoped OpenMP would be sensible inside it and not create a team // with each thread doing just one item. Otherwise, call overhead would be too high for highly iterated // calls on very small subsets. Timings were tested in #3175. However, the overhead does seem to add up From 5144c5e14526a955a76b019058e05a53e8ad1ccc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 7 Aug 2020 00:26:38 -0600 Subject: [PATCH 081/588] valgrind (#4666) --- .dev/.bash_aliases | 1 + .dev/CRAN_Release.cmd | 30 +++---- .dev/valgrind.supp | 24 ++++++ .travis.yml | 2 +- R/test.data.table.R | 8 +- inst/tests/froll.Rraw | 21 +++-- inst/tests/tests.Rraw | 185 ++++++++++++++++++++++-------------------- src/fread.c | 4 +- src/fwrite.c | 8 +- src/fwriteR.c | 2 +- 10 files changed, 166 insertions(+), 119 deletions(-) create mode 100644 .dev/valgrind.supp diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index d9e2b6a387..42388a7a4e 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -11,6 +11,7 @@ alias gdm='git difftool master &> /dev/null' alias Rdevel='~/build/R-devel/bin/R --vanilla' alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' +alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false' diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 968a308788..e6e16209f6 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -368,29 +368,29 @@ print(Sys.time()); started.at<-proc.time(); try(test.data.table()); print(Sys.ti ############################################### cd ~/build -rm -rf R-devel # easiest way to remove ASAN from compiled packages in R-devel/library - # to avoid "ASan runtime does not come first in initial library list" error; no need for LD_PRELOAD -tar xvf R-devel.tar.gz -cd R-devel -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --with-valgrind-instrumentation=1 CC="gcc" CFLAGS="-O0 -g -Wall -pedantic" LIBS="-lpthread" +mkdir R-devel-valgrind # separate build to avoid differences in installed packages, and + # to avoid "ASan runtime does not come first in initial library list" error; no need for LD_PRELOAD +tar xvf R-devel.tar.gz -C R-devel-valgrind --strip-components 1 +cd R-devel-valgrind +./configure --without-recommended-packages --with-valgrind-instrumentation=2 --with-system-valgrind-headers CC="gcc" CFLAGS="-O2 -g -Wall -pedantic" make cd ~/GitHub/data.table -vi ~/.R/Makevars # make the -O0 -g line active, for info on source lines with any problems -Rdevel CMD INSTALL data.table_1.13.1.tar.gz -Rdevel -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite" +vi 
~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems +Rdevel-valgrind CMD INSTALL data.table_1.13.1.tar.gz +R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" +# the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. +# including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks # gctorture(TRUE) # very slow, many days # gctorture2(step=100) -print(Sys.time()); require(data.table); print(Sys.time()); started.at<-proc.time(); try(test.data.table()); print(Sys.time()); print(timetaken(started.at)) -# 3m require; 62m test +print(Sys.time()); require(data.table); print(Sys.time()); started.at<-proc.time(); try(test.data.table(script="*.Rraw")); print(Sys.time()); print(timetaken(started.at)) +# 3m require; 62m test # level 1 -O0 +# 1m require; 33m test # level 2 -O2 +q() # valgrind output printed after q() -# Investigated and ignore : -# Tests 648 and 1262 (see their comments) have single precision issues under valgrind that don't occur on CRAN, even Solaris. -# Old comment from gsumm.c ... // long double usage here used to result in test 648 failing when run under valgrind - // http://valgrind.org/docs/manual/manual-core.html#manual-core.limits" +# Precision issues under valgrind are now avoided using test_longdouble in tests.Rraw, and exact_NaN in froll.Rraw # Ignore all "set address range perms" warnings : # http://stackoverflow.com/questions/13558067/what-does-this-valgrind-warning-mean-warning-set-address-range-perms # Ignore heap summaries around test 1705 and 1707/1708 due to the fork() test opening/closing, I guess. -# Tests 1729.4, 1729.8, 1729.11, 1729.13 again have precision issues under valgrind only. # Leaks for tests 1738.5, 1739.3 but no data.table .c lines are flagged, rather libcairo.so # and libfontconfig.so via GEMetricInfo and GEStrWidth in libR.so diff --git a/.dev/valgrind.supp b/.dev/valgrind.supp new file mode 100644 index 0000000000..2d9eb0bb7b --- /dev/null +++ b/.dev/valgrind.supp @@ -0,0 +1,24 @@ +{ + + Memcheck:Leak + ... + obj:*/libfontconfig.so.* + ... +} + +{ + + Memcheck:Leak + ... + obj:*libpango*.so.* + ... +} + +{ + + Memcheck:Leak + ... + obj:*libgobject*.so.* + ... +} + diff --git a/.travis.yml b/.travis.yml index b219af006f..8455e3dc88 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ r: os: - linux - - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. + # - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. 
brew_packages: - llvm diff --git a/R/test.data.table.R b/R/test.data.table.R index 9c895c69a4..c4b6cfaf6d 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -23,8 +23,11 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F scripts = dir(fulldir, "*.Rraw.*") scripts = scripts[!grepl("bench|other", scripts)] scripts = gsub("[.]bz2$","",scripts) - for (fn in scripts) {test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress); cat("\n");} - return(invisible()) + return(sapply(scripts, function(fn) { + err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) + cat("\n"); + identical(err, TRUE) + })) # nocov end } @@ -137,6 +140,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F cat("\n", date(), # so we can tell exactly when these tests ran on CRAN to double-check the result is up to date " endian==", .Platform$endian, ", sizeof(long double)==", .Machine$sizeof.longdouble, + ", longdouble.digits==", .Machine$longdouble.digits, # 64 normally, 53 for example under valgrind where some high accuracy tests need turning off, #4639 ", sizeof(pointer)==", .Machine$sizeof.pointer, ", TZ==", if (is.na(tz)) "unset" else paste0("'",tz,"'"), ", Sys.timezone()=='", suppressWarnings(Sys.timezone()), "'", diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index 62c16801ca..84143e587c 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -9,6 +9,13 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { froll = data.table:::froll } +exact_NaN = isTRUE(capabilities()["long.double"]) && identical(as.integer(.Machine$longdouble.digits), 64L) +if (!exact_NaN) { + cat("\n**** Skipping 7 NaN/NA algo='exact' tests because .Machine$longdouble.digits==", .Machine$longdouble.digits, " (!=64); e.g. 
under valgrind\n\n", sep="") + # for Matt when he runs valgrind it is 53, but 64 when running regular R + # froll.c uses long double and appears to require full long double accuracy in the algo='exact' +} + ## rolling features #### atomic vectors input and single window returns atomic vectors @@ -192,7 +199,7 @@ expected = list( c(rep(NA_real_,4), seq(1.5,2,0.25), rep(NA_real_, 1)) ) test(6000.040, ans1, expected) -test(6000.041, ans2, expected) +if (exact_NaN) test(6000.041, ans2, expected) ans1 = frollmean(d, 3, align="right", na.rm=TRUE) ans2 = frollmean(d, 3, align="right", algo="exact", na.rm=TRUE) expected = list( @@ -208,7 +215,7 @@ expected = list( c(rep(NA_real_,3), seq(1.5,2,0.25), rep(NA_real_, 2)) ) test(6000.044, ans1, expected) -test(6000.045, ans2, expected) +if (exact_NaN) test(6000.045, ans2, expected) ans1 = frollmean(d, 3, align="center", na.rm=TRUE) # x even, n odd ans2 = frollmean(d, 3, align="center", algo="exact", na.rm=TRUE) expected = list( @@ -224,7 +231,7 @@ expected = list( c(rep(NA_real_,3), 1.625, 1.875, rep(NA_real_, 3)) ) test(6000.048, ans1, expected) -test(6000.049, ans2, expected) +if (exact_NaN) test(6000.049, ans2, expected) ans1 = frollmean(d, 4, align="center", na.rm=TRUE) # x even, n even ans2 = frollmean(d, 4, align="center", algo="exact", na.rm=TRUE) expected = list( @@ -241,7 +248,7 @@ expected = list( c(rep(NA_real_,3), 1.5, 1.75, 2, rep(NA_real_, 3)) ) test(6000.052, ans1, expected) -test(6000.053, ans2, expected) +if (exact_NaN) test(6000.053, ans2, expected) ans1 = frollmean(de, 3, align="center", na.rm=TRUE) # x odd, n odd ans2 = frollmean(de, 3, align="center", algo="exact", na.rm=TRUE) expected = list( @@ -257,7 +264,7 @@ expected = list( c(rep(NA_real_, 3), 1.625, 1.875, rep(NA_real_,4)) ) test(6000.056, ans1, expected) -test(6000.057, ans2, expected) +if (exact_NaN) test(6000.057, ans2, expected) ans1 = frollmean(de, 4, align="center", na.rm=TRUE) # x odd, n even ans2 = frollmean(de, 4, align="center", algo="exact", na.rm=TRUE) expected = list( @@ -273,7 +280,7 @@ expected = list( c(rep(NA_real_, 2), 1.5, 1.75, 2, rep(NA_real_,3)) ) test(6000.060, ans1, expected) -test(6000.061, ans2, expected) +if (exact_NaN) test(6000.061, ans2, expected) ans1 = frollmean(d, 3, align="left", na.rm=TRUE) ans2 = frollmean(d, 3, align="left", algo="exact", na.rm=TRUE) expected = list( @@ -289,7 +296,7 @@ ans1 = frollmean(d, 2:3) ans2 = frollmean(d, 2:3, algo="exact") expected = list(c(NA, NA, NA, 1.75, NA, NA), rep(NA_real_, 6), c(NA, 0.875, 1.125, NA, NA, NA), c(NA, NA, 1, NA, NA, NA)) test(6000.064, ans1, expected) -test(6000.065, ans2, expected) +if (exact_NaN) test(6000.065, ans2, expected) ans1 = frollmean(d, 2:3, na.rm=TRUE) ans2 = frollmean(d, 2:3, algo="exact", na.rm=TRUE) expected = list(c(NA, 0.5, 1.5, 1.75, 2, 3), c(NA, NA, 1, 1.75, 1.75, 2.5), c(NA, 0.875, 1.125, 1.25, NaN, NaN), c(NA, NA, 1, 1.125, 1.25, NaN)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f71cb37ba4..c966a5efc6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -95,6 +95,12 @@ for (s in sugg) { if (!loaded) cat("\n**** Suggested package",s,"is not installed. Tests using it will be skipped.\n\n") } +test_longdouble = isTRUE(capabilities()["long.double"]) && identical(as.integer(.Machine$longdouble.digits), 64L) +if (!test_longdouble) { + cat("\n**** Full long double accuracy is not available. Tests using this will be skipped.\n\n") + # e.g. 
under valgrind, longdouble.digits==53; causing these to fail: 1262, 1729.04, 1729.08, 1729.09, 1729.11, 1729.13, 1830.7; #4639 +} + ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") @@ -1864,10 +1870,13 @@ basemean = base::mean # to isolate time of `::` itself ans3 = DT[,list(basemean(x),basemean(y)),by=list(grp1,grp2)] test(646, ans1, ans2) test(647, ans1, ans3) -# this'll error with `valgrind` because of the 'long double' usage in gsumm.c (although I wonder if we need long double precision). -# http://valgrind.org/docs/manual/manual-core.html#manual-core.limits -# http://comments.gmane.org/gmane.comp.debugging.valgrind/10340 -test(648, any(is.na(ans1$V1)) && !any(is.nan(ans1$V1))) +if (test_longdouble) { + test(648, any(is.na(ans1$V1)) && !any(is.nan(ans1$V1))) + # used to error with `valgrind` because of the 'long double' usage in gsumm.c (although I wonder if we need long double precision). + # it doesn't seem to error under valgrind anymore so the test_longdouble may be removable + # http://valgrind.org/docs/manual/manual-core.html#manual-core.limits + # http://comments.gmane.org/gmane.comp.debugging.valgrind/10340 +} ans1 = DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)] ans2 = DT[,list(mean.default(x,na.rm=TRUE),mean.default(y,na.rm=TRUE)),by=list(grp1,grp2)] test(651, ans1, ans2) @@ -3851,8 +3860,8 @@ DF <- as.data.frame(DT) test(1146.2, {set(DF, i=NULL, j=1L, value=seq_len(nrow(DF)));setattr(DF,"reference",NULL);DF}, data.frame(Time=1:nrow(BOD), demand=BOD$demand)) test(1146.3, set(DF, i=NULL, j="bla", value=seq_len(nrow(DF))), error="set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that.") -if (.Machine$sizeof.longdouble == 16) { - # To not run on CRAN's solaris-sparc 32bit where sizeof.longdouble==0 +if (test_longdouble) { + # e.g. not on CRAN's solaris-sparc 32bit, and not under valgrind which uses 53 instead of 64 longdouble.digits old = getNumericRounding() @@ -4601,10 +4610,12 @@ test(1259, DT[,.N,by=upc], data.table(upc=c(360734147771, 360734147770), N=3L)) test(1260, DT[,.N,by=upc][order(upc)], data.table(upc=c(360734147770, 360734147771), N=3L)) test(1261, getNumericRounding(), 1L) # the limit of double precision (16 s.f.) ... -if (.Machine$sizeof.longdouble==16) - test(1262, length(unique(c(1.2345678901234560, 1.2345678901234561, 1.2345678901234562, 1.2345678901234563))), 2L) - # 2 not 4 is double precision limit which base::unique() relies on in this test - # valgrind will also return (3) instead of (2) here.. due to floating point precision limitation. changing the last two values to 1.2345678901234563 and 1.2345678901234564 returns 2. +if (test_longdouble) { + test(1262, length(unique(c(1.2345678901234560, 1.2345678901234561, 1.2345678901234562, 1.2345678901234563))), 2L) + # 2 not 4 is double precision limit which base::unique() relies on in this test + # valgrind will also return (3) instead of (2) here due to floating point precision limitation. + # changing the last two values to 1.2345678901234563 and 1.2345678901234564 returns 2. 
+} DT = data.table(id=c(1.234567890123450, 1.234567890123451, 1.234567890123452, 1.234567890123453)) # one less digit is limit test(1263, length(unique(DT$id)), 4L) test(1264, DT[,.N,by=id]$N, 4L) # 1 byte rounding isn't enough @@ -10424,71 +10435,73 @@ test(1728.11, DT[order(x,na.last=FALSE)], DT) test(1728.12, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong # fwrite wrong and crash on 9.9999999999999982236431605, #1847 -options(datatable.verbose = FALSE) -test(1729.01, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))), - output="V1,V2\n1,10") -test(1729.02, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))), - output="V2,V1\n10,1") -DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9, - 0.000000000000000000000999999999999999999999999, - 99999999999999999999999999999.999999)) -ans = "V1\n9999999999.99\n9.9e-16\n9e-22\n0.9\n9\n9.1\n99.9\n1e-21\n1e+29" -test(1729.03, fwrite(DT), output=ans) -test(1729.04, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans) - -# same decimal/scientific rule (shortest format) as write.csv -DT = data.table(V1=c(-00000.00006, -123456789.123456789, - seq.int(-1000,1000,17), - seq(-1000,1000,pi*87), - -1.2345678912345 * 10^(c((-30):30)), - +1.2345678912345 * 10^(c((-30):30)), - -1.2345 * 10^((-20):20), - +1.2345 * 10^((-20):20), - -1.7 * 10^((-20):20), - +1.7 * 10^((-20):20), - -7 * 10^((-20):20), - +7 * 10^((-20):20), - 0, NA, NaN, Inf, -Inf, - 5.123456789e-290, -5.123456789e-290, - 5.123456789e-307, -5.123456789e-307, - 5.123456789e+307, -5.123456789e+307)) -test(1729.05, nrow(DT), 507L) - -options(datatable.verbose = FALSE) # capture.output() exact tests must not be polluted with verbosity -x = capture.output(fwrite(DT,na="NA"))[-1] # -1 to remove the column name V1 -y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))[-1] -# One mismatch that seems to be accuracy in base R's write.csv -# tmp = cbind(row=1:length(x), `fwrite`=x, `write.csv`=y) -# tmp[x!=y,] -# row fwrite write.csv -# 177 "-1234567891234500000" "-1234567891234499840" -# 238 "1234567891234500000" "1234567891234499840" -# looking in surrounding rows for the first one shows the switch point : -# tmp[175:179,] -# row fwrite write.csv -# 175 "-12345678912345000" "-12345678912345000" # ok -# 176 "-123456789123450000" "-123456789123450000" # ok -# 177 "-1234567891234500000" "-1234567891234499840" # e+18 last before switch to scientific -# 178 "-1.2345678912345e+19" "-1.2345678912345e+19" # ok -# 179 "-1.2345678912345e+20" "-1.2345678912345e+20" # ok -test(1729.06, x[c(177,238)], c("-1234567891234500000","1234567891234500000")) -x = x[-c(177,238)] -y = y[-c(177,238)] -test(1729.07, length(x), 505L) -test(1729.08, x, y) -if (!identical(x,y)) print(data.table(row=1:length(x), `fwrite`=x, `write.csv`=y)[x!=y]) - -DT = data.table(c(5.123456789e+300, -5.123456789e+300, - 1e-305,1e+305, 1.2e-305,1.2e+305, 1.23e-305,1.23e+305)) -ans = c("V1","5.123456789e+300","-5.123456789e+300", - "1e-305","1e+305","1.2e-305","1.2e+305","1.23e-305","1.23e+305") -# explicitly check against ans rather than just comparing fwrite to write.csv so that : -# i) we can easily see intended results right here in future without needing to run -# ii) we don't get a false pass if fwrite and write.csv agree but are both wrong because of -# a problem with the test mechanism itself or something else strange or unexpected -# Exactly the same binary representation on both linux and windows (so any 
differences in -# output are not because the value itself is stored differently) : -if (isTRUE(LD<-capabilities()["long.double"])) { #3258 +if (test_longdouble) { #3258 + + old = options(datatable.verbose=FALSE) # capture.output() exact tests must not be polluted with verbosity + + test(1729.01, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))), + output="V1,V2\n1,10") + test(1729.02, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))), + output="V2,V1\n10,1") + DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9, + 0.000000000000000000000999999999999999999999999, + 99999999999999999999999999999.999999)) + ans = "V1\n9999999999.99\n9.9e-16\n9e-22\n0.9\n9\n9.1\n99.9\n1e-21\n1e+29" + test(1729.03, fwrite(DT), output=ans) + test(1729.04, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans) + + # same decimal/scientific rule (shortest format) as write.csv + DT = data.table(V1=c(-00000.00006, -123456789.123456789, + seq.int(-1000,1000,17), + seq(-1000,1000,pi*87), + -1.2345678912345 * 10^(c((-30):30)), + +1.2345678912345 * 10^(c((-30):30)), + -1.2345 * 10^((-20):20), + +1.2345 * 10^((-20):20), + -1.7 * 10^((-20):20), + +1.7 * 10^((-20):20), + -7 * 10^((-20):20), + +7 * 10^((-20):20), + 0, NA, NaN, Inf, -Inf, + 5.123456789e-290, -5.123456789e-290, + 5.123456789e-307, -5.123456789e-307, + 5.123456789e+307, -5.123456789e+307)) + test(1729.05, nrow(DT), 507L) + + x = capture.output(fwrite(DT,na="NA"))[-1] # -1 to remove the column name V1 + y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))[-1] + # One mismatch that seems to be accuracy in base R's write.csv + # tmp = cbind(row=1:length(x), `fwrite`=x, `write.csv`=y) + # tmp[x!=y,] + # row fwrite write.csv + # 177 "-1234567891234500000" "-1234567891234499840" + # 238 "1234567891234500000" "1234567891234499840" + # looking in surrounding rows for the first one shows the switch point : + # tmp[175:179,] + # row fwrite write.csv + # 175 "-12345678912345000" "-12345678912345000" # ok + # 176 "-123456789123450000" "-123456789123450000" # ok + # 177 "-1234567891234500000" "-1234567891234499840" # e+18 last before switch to scientific + # 178 "-1.2345678912345e+19" "-1.2345678912345e+19" # ok + # 179 "-1.2345678912345e+20" "-1.2345678912345e+20" # ok + test(1729.06, x[c(177,238)], c("-1234567891234500000","1234567891234500000")) + x = x[-c(177,238)] + y = y[-c(177,238)] + test(1729.07, length(x), 505L) + test(1729.08, x, y) + if (!identical(x,y)) print(data.table(row=1:length(x), `fwrite`=x, `write.csv`=y)[x!=y]) + + DT = data.table(c(5.123456789e+300, -5.123456789e+300, + 1e-305,1e+305, 1.2e-305,1.2e+305, 1.23e-305,1.23e+305)) + ans = c("V1","5.123456789e+300","-5.123456789e+300", + "1e-305","1e+305","1.2e-305","1.2e+305","1.23e-305","1.23e+305") + # explicitly check against ans rather than just comparing fwrite to write.csv so that : + # i) we can easily see intended results right here in future without needing to run + # ii) we don't get a false pass if fwrite and write.csv agree but are both wrong because of + # a problem with the test mechanism itself or something else strange or unexpected + # Exactly the same binary representation on both linux and windows (so any differences in + # output are not because the value itself is stored differently) : + test(1729.09, binary(DT[[1]]), c("0 11111100101 111010011010000100010111101110000100 11110100 00000100", "1 11111100101 111010011010000100010111101110000100 11110100 
00000100", @@ -10498,16 +10511,16 @@ if (isTRUE(LD<-capabilities()["long.double"])) { #3258 "0 11111110100 010111011111100101001110101100000011 01101011 10101100", "0 00000001010 000101000110010100110011101010000110 00111110 01010001", "0 11111110100 011001101011100100100011110110110000 01001110 01011101")) -} else { - cat('Skipped test 1729.9 due to capabilities()["long.double"] ==', LD, '\n') + test(1729.10, fwrite(DT,na=""), output=ans) + test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans) + DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")])) + # double.eps double.neg.eps double.xmin double.xmax + # 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 + test(1729.12, typeof(DT[[1L]]), "double") + test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + + options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on } -test(1729.10, fwrite(DT,na=""), output=ans) -test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans) -DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")])) -# double.eps double.neg.eps double.xmin double.xmax -# 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 -test(1729.12, typeof(DT[[1L]]), "double") -test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) if (test_bit64) { test(1730.1, typeof(-2147483647L), "integer") @@ -11820,15 +11833,13 @@ test(1830.5, identical( test(1830.6, identical( fread("E\n0e0\n#DIV/0!\n#VALUE!\n#NULL!\n#NAME?\n#NUM!\n#REF!\n#N/A\n1e0\n"), data.table(E=c(0, NaN, NaN, NA, NA, NA, NA, NA, 1)))) -if (isTRUE(LD<-capabilities()["long.double"])) { #3258 +if (test_longdouble) { #3258 test(1830.7, identical( fread("F\n1.1\n+1.333333333333333\n5.9e300\n45609E11\n-00890.e-003\n"), data.table(F=c(1.1, 1.333333333333333, 5.9e300, 45609e11, -890e-3)))) test(1830.8, identical( fread("G\n0.000000000000000000000000000000000000000000000000000000000000449548\n"), data.table(G=c(4.49548e-61)))) -} else { - cat('Skipped tests 1830.7 and 1830.8 due to capabilities()["long.double"] ==', LD, '\n'); } # Test that integers just above 128 or 256 characters in length parse as strings, not as integers/floats diff --git a/src/fread.c b/src/fread.c index 6976080822..5b108cdcc5 100644 --- a/src/fread.c +++ b/src/fread.c @@ -64,7 +64,7 @@ static void *mmp_copy = NULL; static size_t fileSize; static int8_t *type = NULL, *tmpType = NULL, *size = NULL; static lenOff *colNames = NULL; -static freadMainArgs args; // global for use by DTPRINT +static freadMainArgs args = {0}; // global for use by DTPRINT; static implies ={0} but include the ={0} anyway just in case for valgrind #4639 const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; @@ -954,7 +954,7 @@ static void parse_iso8601_date_core(const char **pch, int32_t *target) { const char *ch = *pch; - int32_t year, month, day; + int32_t year=0, month=0, day=0; str_to_i32_core(&ch, &year); diff --git a/src/fwrite.c b/src/fwrite.c index cf58c2581b..5c5e9eb579 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -585,7 +585,7 @@ void print_z_stream(const z_stream *s) // temporary tracing function for #4099 { const unsigned char *byte = (unsigned char *)s; for (int i=0; i Date: Fri, 7 Aug 2020 
21:02:58 -0600 Subject: [PATCH 082/588] news item update re bit::copy rename --- NEWS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6074ee33eb..62b0415c0a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,20 +14,20 @@ ## NOTES -1. `bit64` v4.0.2 released on 30th July broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` merely suggests `bit64` and does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be changed to include suggests in reverse dependency testing. +1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` suggests `bit64` but does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be improved to include suggests in reverse dependency testing. The first break was because `all.equal` did not work in previous versions of `bit64`; e.g., ```R require(bit64) all.equal(as.integer64(3), as.integer64(4)) - TRUE # < v4.0.0 - FALSE # >= v4.0.0 + TRUE # < v4.0.0; incorrect + FALSE # >= v4.0.0; correct because 3!=4 ``` - We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a very brief "new feature" in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests started to fail upon `bit64`'s update. Fortunately, the `fcase` results were correct but the tests were comparing to an incorrect result. These tests were incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. Note also that `all.equal` always returned TRUE for any `nanotime` input, since `nanotime`'s underlying type is `bit64`. + We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a brief new-feature in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests started to fail upon `bit64`'s update. The `fcase` results were correct but the tests were comparing to an incorrect result. These tests were incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. Note also that `all.equal` always returned TRUE for any `nanotime` input, since `nanotime`'s underlying type is `bit64`. - The second break caused by `bit64` was the addition of a `copy` function. Since `data.table::copy` is long standing we hope that `bit64` can rename its new `copy` function. 
Otherwise, users of `data.table` may need to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. + The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. Thanks to Cole Miller for the PR to accomodate `bit64`'s update. From 93967418ee0e40370ce9d4c4b63bf599e4ef1ecd Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Thu, 24 Sep 2020 12:29:00 -0400 Subject: [PATCH 083/588] [[ by group performance (#4655) --- NEWS.md | 2 + inst/tests/tests.Rraw | 11 +++++ src/assign.c | 24 ---------- src/dogroups.c | 100 +++++++++++++++++++++++++++++++++++++----- 4 files changed, 101 insertions(+), 36 deletions(-) diff --git a/NEWS.md b/NEWS.md index 62b0415c0a..308d427250 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ 2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. +3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. + ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` suggests `bit64` but does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be improved to include suggests in reverse dependency testing. 
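For illustration, a minimal R sketch of the two grouped list-column patterns this patch concerns — the element-extraction query named in the NEWS item above, and the group-column-wrapped-in-a-list case exercised by the new tests below. The table and column names here are invented for the example, not taken from the patch:

```R
library(data.table)
dt = data.table(id=c(1L,1L,2L), listCol=list(1:2, 3:4, 5:6), y=1:3)

# element of a list column by group: the query shape whose v1.13.0 performance
# regression this patch fixes
dt[, listCol[[1L]], by=id]

# a group column placed as-is inside a list column result: each group must keep
# its own values (1,2 for id==1 and 3 for id==2), which is what the per-group
# copy added in dogroups.c protects
dt[, .(list(y)), by=id]
```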
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c966a5efc6..058c7db3b2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17127,3 +17127,14 @@ test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2 y = person(given='Joel', family='Mossong') test(2152, copy(y), y) +# .N and .GRP special statics copied correctly when placed as a vector in a list column; part of PR#4655 +# see comments in anySpecialStatic() at the top of dogroups.c +# .SD, .I and .BY are covered by previous tests +DT = data.table(x=c(1L,2L,2L), y=1:3) +test(2153.1, DT[, .(list(.N)), by=x], data.table(x=1:2, V1=as.list(1:2))) +test(2153.2, DT[, .(list(.GRP)), by=x], data.table(x=1:2, V1=as.list(1:2))) +test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L))) +test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP +test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result + data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) + diff --git a/src/assign.c b/src/assign.c index 88fc260655..579af7d3e5 100644 --- a/src/assign.c +++ b/src/assign.c @@ -670,13 +670,6 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) return(dt); // needed for `*tmp*` mechanism (when := isn't used), and to return the new object after a := for compound syntax. } -static bool anyNamed(SEXP x) { - if (MAYBE_REFERENCED(x)) return true; - if (isNewList(x)) for (int i=0; i #include +static bool anySpecialStatic(SEXP x) { + // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd + // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups + // creates .SD for the largest group once up front, overwriting the contents for each group. Their + // value changes across group but not their memory address. (.NGRP is also special static but its value + // is constant across groups so that's excluded here.) + // This works well, other than a relatively rare case when two conditions are both true : + // 1) the j expression returns a group column as-is without doing any aggregation + // 2) that result is placed in a list column result + // The list column result can then incorrectly contain the result for the last group repeated for all + // groups because the list column ends up holding a pointer to these special static vectors. + // See test 2153, and to illustrate here, consider a simplified test 1341 + // > DT + // x y + // + // 1: 1 1 + // 2: 2 2 + // 3: 1 3 + // 4: 2 4 + // > DT[, .(list(y)), by=x] + // x V1 + // + // 1: 1 2,4 # should be 1,3 + // 2: 2 2,4 + // + // This has been fixed for a decade but the solution has changed over time. + // + // We don't wish to inspect the j expression for these cases because there are so many; e.g. user defined functions. + // A special symbol does not need to appear in j for the problem to occur. Using a member of .SD is enough as the example above illustrates. + // Using R's own reference counting could invoke too many unnecessary copies because these specials are routinely referenced. + // Hence we mark these specials (SD, BY, I) here in dogroups and if j's value is being assigned to a list column, we check to + // see if any specials are present and copy them if so. + // This keeps the special logic in one place in one file here. 
Previously this copy was done by memrecycle in assign.c but then + // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. + // Currently the marker is negative truelength. These specials are protected by us here and before we release them + // we restore the true truelength for when R starts to use vector truelength. + if (isVectorAtomic(x)) + return TRUELENGTH(x)<0; + if (isNewList(x)) for (int i=0; i maxGrpSize) maxGrpSize = ilens[i]; } defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++; + SET_TRUELENGTH(I, -maxGrpSize); // marker for anySpecialStatic(); see its comments R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" @@ -69,23 +118,25 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX SEXP names = PROTECT(getAttrib(SDall, R_NamesSymbol)); nprotect++; if (length(names) != length(SDall)) error(_("length(names)!=length(SD)")); SEXP *nameSyms = (SEXP *)R_alloc(length(names), sizeof(SEXP)); + for(int i=0; i1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986 error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn); } + bool copied = false; + if (isNewList(target) && anySpecialStatic(source)) { // see comments in anySpecialStatic() + source = PROTECT(duplicate(source)); + copied = true; + } memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, ""); + if (copied) UNPROTECT(1); } } ansloc += maxn; @@ -358,8 +422,20 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } } else ans = R_NilValue; // Now reset length of .SD columns and .I to length of largest group, otherwise leak if the last group is smaller (often is). 
- for (int j=0; j0; From a8932b2ff8d8d8435fe48aa12cfb934115178c83 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 24 Sep 2020 13:46:48 -0600 Subject: [PATCH 084/588] avoid 'unable to verify current time' R CMD check note --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0a0ed13a84..04bba9c71d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,6 +2,7 @@ variables: CRAN_MIRROR: "https://cloud.r-project.org" _R_CHECK_FORCE_SUGGESTS_: "false" _R_CHECK_NO_STOP_ON_TEST_ERROR_: "true" + _R_CHECK_SYSTEM_CLOCK_: "false" ## https://stackoverflow.com/questions/63613301/r-cmd-check-note-unable-to-verify-current-time stages: - dependencies From e1a2f5d12dba2b3301ce3fad7f6f32d370fc807a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 24 Sep 2020 14:59:05 -0600 Subject: [PATCH 085/588] GLCI TZ UTC to avoid apparent timedatectl problems under Docker with R 3.4 --- .gitlab-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 04bba9c71d..8df02e580d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,6 +3,9 @@ variables: _R_CHECK_FORCE_SUGGESTS_: "false" _R_CHECK_NO_STOP_ON_TEST_ERROR_: "true" _R_CHECK_SYSTEM_CLOCK_: "false" ## https://stackoverflow.com/questions/63613301/r-cmd-check-note-unable-to-verify-current-time + TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. + ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under + ## a non-UTC timezone, although, that's what we do routinely in dev. stages: - dependencies From e21a4f32d4637b3bd065170cd02d83d4a4dc8ff0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 24 Sep 2020 23:05:31 -0600 Subject: [PATCH 086/588] turn on showProgress temporarily to see where GLCI R3.4 and R3.1 segfault is; #4090 --- tests/main.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/main.R b/tests/main.R index 8a56142ee1..a72af446bf 100644 --- a/tests/main.R +++ b/tests/main.R @@ -1,6 +1,11 @@ require(data.table) -test.data.table() # runs the main test suite of 5,000+ tests in /inst/tests/tests.Rraw +# test.data.table() # runs the main test suite of 5,000+ tests in /inst/tests/tests.Rraw + +# turning on showProgress temporarily to find where segfault is that GLCI shows with R 3.4 and R 3.1 +# showProgress is default interactive() so it's off for CRAN but also off for GLCI +# strict R-devel passes locally as does R 3.1 locally, so now running with strict torture locally but that's still running +test.data.table(showProgress=TRUE) # Turn off verbose repeat to save time (particularly Travis, but also CRAN) : # test.data.table(verbose=TRUE) From f3470bce8d2097bb6b1e3a92b5e93169d97e1cad Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 25 Sep 2020 01:02:30 -0600 Subject: [PATCH 087/588] attempt to fix GLCI segfault in R<3.5 by using length not LENGTH on NULL; #4655 --- src/dogroups.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dogroups.c b/src/dogroups.c index 9b7b4cbc9a..cbf10e09e4 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -39,9 +39,12 @@ static bool anySpecialStatic(SEXP x) { // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. // Currently the marker is negative truelength. 
These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. + const int n = length(x); + if (n==0) + return false; if (isVectorAtomic(x)) return TRUELENGTH(x)<0; - if (isNewList(x)) for (int i=0; i Date: Fri, 25 Sep 2020 08:37:30 -0600 Subject: [PATCH 088/588] showProgress back to default interactive() now that segfault on R<3.5 shown by GLCI is fixed --- src/dogroups.c | 4 +++- tests/main.R | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index cbf10e09e4..4c243104a5 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -40,11 +40,13 @@ static bool anySpecialStatic(SEXP x) { // Currently the marker is negative truelength. These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. const int n = length(x); + // use length() not LENGTH() because LENGTH() on NULL is segfault in R<3.5 where we still define USE_RINTERNALS + // (see data.table.h), and isNewList() is true for NULL if (n==0) return false; if (isVectorAtomic(x)) return TRUELENGTH(x)<0; - if (isNewList(x)) for (int i=0; i Date: Fri, 25 Sep 2020 23:03:43 +0200 Subject: [PATCH 089/588] new OS dep required by pkgdown (#4718) --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8df02e580d..2adbf6d894 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -301,7 +301,7 @@ integration: ## merging all artifacts to produce single R repository, documentat R_DEVEL_BIN_VERSION: "4.1" script: ## pkgdown installs pkgs from "." so run at start to have clean root dir - - apt-get update -qq && apt-get install -y libxml2-dev + - apt-get update -qq && apt-get install -y libxml2-dev libfontconfig1-dev ## fontconfig1 for #4717 - mkdir -p /tmp/pkgdown/library - R_LIBS_USER=/tmp/pkgdown/library Rscript -e 'install.packages("pkgdown", repos=Sys.getenv("CRAN_MIRROR"), quiet=TRUE); pkgdown::build_site(override=list(destination="./pkgdown"))' ## html manual, vignettes, repos, cran_web, cran_checks From 1cc5c0cf7ef5305c5e1aa302c96a90392013e6b8 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 25 Sep 2020 16:52:20 -0500 Subject: [PATCH 090/588] add inline to function definition (#4671) --- inst/include/datatableAPI.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/include/datatableAPI.h b/inst/include/datatableAPI.h index 9e8bb48f31..44f52018f4 100644 --- a/inst/include/datatableAPI.h +++ b/inst/include/datatableAPI.h @@ -21,7 +21,7 @@ extern "C" { /* provided the interface for the function exported in ../src/init.c via R_RegisterCCallable() */ -SEXP attribute_hidden DT_subsetDT(SEXP x, SEXP rows, SEXP cols) { +inline SEXP attribute_hidden DT_subsetDT(SEXP x, SEXP rows, SEXP cols) { static SEXP(*fun)(SEXP, SEXP, SEXP) = (SEXP(*)(SEXP,SEXP,SEXP)) R_GetCCallable("data.table", "CsubsetDT"); return fun(x,rows,cols); From 172ad6982446fac7542033dc89e5361acbc60d24 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 26 Sep 2020 16:18:42 +0200 Subject: [PATCH 091/588] even more OS deps, use another docker image instead (#4722) --- .gitlab-ci.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2adbf6d894..9a5b4845f8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -278,7 +278,7 @@ test-dev-win: ## R-devel on Windows integration: ## merging all artifacts to produce single R 
repository, documentation and website stage: integration - image: registry.gitlab.com/jangorecki/dockerfiles/r-builder + image: registry.gitlab.com/jangorecki/dockerfiles/r-pkgdown tags: - linux only: @@ -300,10 +300,7 @@ integration: ## merging all artifacts to produce single R repository, documentat R_BIN_VERSION: "4.0" R_DEVEL_BIN_VERSION: "4.1" script: - ## pkgdown installs pkgs from "." so run at start to have clean root dir - - apt-get update -qq && apt-get install -y libxml2-dev libfontconfig1-dev ## fontconfig1 for #4717 - - mkdir -p /tmp/pkgdown/library - - R_LIBS_USER=/tmp/pkgdown/library Rscript -e 'install.packages("pkgdown", repos=Sys.getenv("CRAN_MIRROR"), quiet=TRUE); pkgdown::build_site(override=list(destination="./pkgdown"))' + - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' ## html manual, vignettes, repos, cran_web, cran_checks - echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile ## list of available test-* jobs dynamically based on bus/test-* directories From e22e6596f9cde5cc5ffcb35ecdec8e881bdfb86d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 26 Sep 2020 09:15:25 -0700 Subject: [PATCH 092/588] Make tests 168,2043 work for any LC_TIME (#4719) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 308d427250..b5c743994c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -35,6 +35,8 @@ 2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. +3. `test.data.table()` failed in non-English locales such as `LC_TIME=fr_FR.UTF-8` due to `Jan` vs `janv.` in tests 168 and 2042, [#3450](https://github.com/Rdatatable/data.table/issues/3450). Thanks to @shrektan for reporting, and @tdhock for making the tests locale-aware. + # data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 058c7db3b2..46b6716cee 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -471,10 +471,13 @@ test(167.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) try(graphics.off(),silent=TRUE) # IDateTime conversion methods that ggplot2 uses (it calls as.data.frame method) -datetimes = c("2011 NOV18 09:29:16", "2011 NOV18 10:42:40", "2011 NOV18 23:47:12", - "2011 NOV19 01:06:01", "2011 NOV19 11:35:34", "2011 NOV19 11:51:09") +# Since %b is e.g. "nov." in LC_TIME=fr_FR.UTF-8 locale, we need to +# have the target/y value in these tests depend on the locale as well, #3450. 
+NOV = format(strptime("2000-11-01", "%Y-%m-%d"), "%b") +x = c("09:29:16","10:42:40","23:47:12","01:06:01","11:35:34","11:51:09") +datetimes = paste0("2011 ", NOV, c(18,18,18,19,19,19), " ", x) DT = IDateTime(strptime(datetimes,"%Y %b%d %H:%M:%S")) -test(168.1, DT[,as.data.frame(itime)], data.frame(V1=as.ITime(x<-c("09:29:16","10:42:40","23:47:12","01:06:01","11:35:34","11:51:09")))) +test(168.1, DT[,as.data.frame(itime)], data.frame(V1=as.ITime(x))) test(168.2, as.character(DT[,as.POSIXct(itime,tz="UTC")]), paste(Sys.Date(), x)) test(168.3, as.character(DT[,as.POSIXct(idate,tz="UTC")]), c("2011-11-18","2011-11-18","2011-11-18","2011-11-19","2011-11-19","2011-11-19")) @@ -15065,10 +15068,13 @@ test(2041.2, DT[, median(time), by=g], DT[c(2,5),.(g=g, V1=time)]) # 'invalid trim argument' with optimization level 1; #1876 test(2042.1, DT[ , as.character(mean(date)), by=g, verbose=TRUE ], data.table(g=c("a","b"), V1=c("2018-01-04","2018-01-21")), - output=msg<-"GForce is on, left j unchanged.*Old mean optimization is on, left j unchanged") -test(2042.2, DT[ , format(mean(date),"%b-%Y")], "Jan-2018") + output=msg<-"GForce is on, left j unchanged.*Old mean optimization is on, left j unchanged") +# Since %b is e.g. "janv." in LC_TIME=fr_FR.UTF-8 locale, we need to +# have the target/y value in these tests depend on the locale as well, #3450. +Jan.2018 = format(strptime("2018-01-01", "%Y-%m-%d"), "%b-%Y") +test(2042.2, DT[ , format(mean(date),"%b-%Y")], Jan.2018) test(2042.3, DT[ , format(mean(date),"%b-%Y"), by=g, verbose=TRUE ], # just this case generated the error - data.table(g=c("a","b"), V1=c("Jan-2018","Jan-2018")), output=msg) + data.table(g=c("a","b"), V1=c(Jan.2018, Jan.2018)), output=msg) # gforce wrongly applied to external variable; #875 DT = data.table(x=INT(1,1,1,2,2), y=1:5) From 5267157d6311f3bbc3c4cfcb01542160e64645aa Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 26 Sep 2020 10:28:13 -0600 Subject: [PATCH 093/588] CRAN_Release.cmd only: added LC_TIME=fr_FR.UTF-8 follow up to #4719 --- .dev/CRAN_Release.cmd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index e6e16209f6..2464afde99 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -234,6 +234,12 @@ require(data.table) test.data.table() q("no") +# passes under non-English LC_TIME, #2350 +LC_TIME=fr_FR.UTF-8 R +require(data.table) +test.data.table() +q("no") + R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) require(data.table) From 49806a76e04a1e7d4a9cb6e4784c5344d1a39dad Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 28 Sep 2020 15:49:51 -0600 Subject: [PATCH 094/588] segfault in fread warning message fixed (#4724) --- NEWS.md | 3 +++ inst/tests/tests.Rraw | 6 ++++++ src/fread.c | 5 +++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index b5c743994c..05d2adcb35 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,9 @@ 3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. +4. 
`fread("1.2\n", colClasses='integer')` would segfault when creating the warning message due to no column names in the output, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. + + ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` suggests `bit64` but does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be improved to include suggests in reverse dependency testing. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 46b6716cee..e4dbcb480e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17144,3 +17144,9 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't chan test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) +# warning message segfault when no column names present, #4644 +test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0), + warning="Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.*please") +test(2154.2, fread("A\n0.0\n", colClasses="integer"), data.table(A=0.0), + warning="Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.*please") + diff --git a/src/fread.c b/src/fread.c index 5b108cdcc5..5b9bac3f03 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2064,8 +2064,9 @@ int freadMain(freadMainArgs _args) { if (type[j]==CT_DROP) { size[j]=0; ndrop++; continue; } if (type[j]> of inherent type '%s' down to '%s' ignored. Only overrides to a higher type are currently supported. If this was intended, please coerce to the lower type afterwards."), - j+1, colNames[j].len, colNamesAnchor+colNames[j].off, typeName[tmpType[j]], typeName[type[j]]); + DTWARN(_("Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' ignored. Only overrides to a higher type are currently supported. If this was intended, please coerce to the lower type afterwards."), + j+1, colNames?" <<":"", colNames?(colNames[j].len):0, colNames?(colNamesAnchor+colNames[j].off):"", colNames?">>":"", // #4644 + typeName[tmpType[j]], typeName[type[j]]); } type[j] = tmpType[j]; // TODO: apply overrides to lower type afterwards and warn about the loss of accuracy then (if any); e.g. 
"4.0" would be fine to coerce to integer with no warning since From ecddad8ddeef8e210a0761343ca20c8d1472fedc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 28 Sep 2020 16:31:57 -0600 Subject: [PATCH 095/588] shortened the news item about bit64 --- NEWS.md | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/NEWS.md b/NEWS.md index 05d2adcb35..19ac03ccef 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,25 +16,11 @@ 4. `fread("1.2\n", colClasses='integer')` would segfault when creating the warning message due to no column names in the output, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. - ## NOTES -1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, broke `data.table`'s tests. It seems that reverse dependency testing of `bit64` (i.e. testing of the packages which use `bit64`) did not include `data.table` because `data.table` suggests `bit64` but does not depend on it. Like other packages on our `Suggest` list, we test `data.table` works with `bit64` in our tests. In testing of our own reverse dependencies (packages which use `data.table`) we do include packages which suggest `data.table`, although it appears it is not CRAN policy to do so. We have requested that CRAN policy be improved to include suggests in reverse dependency testing. - - The first break was because `all.equal` did not work in previous versions of `bit64`; e.g., - - ```R - require(bit64) - all.equal(as.integer64(3), as.integer64(4)) - TRUE # < v4.0.0; incorrect - FALSE # >= v4.0.0; correct because 3!=4 - ``` - - We feel the need to explain this in detail here because the addition of the `integer64` method for `all.equal` appears as a brief new-feature in `bit64`'s NEWS. We like `bit64` a lot and we know users of `data.table` also use `bit64`. They may be impacted in the same way; e.g., equality tests previously passing when they should not have passed. In our case, two `fcase` tests started to fail upon `bit64`'s update. The `fcase` results were correct but the tests were comparing to an incorrect result. These tests were incorrectly passing due to `all.equal` always returning TRUE for any `integer64` input. Note also that `all.equal` always returned TRUE for any `nanotime` input, since `nanotime`'s underlying type is `bit64`. - - The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. - - Thanks to Cole Miller for the PR to accomodate `bit64`'s update. +1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. 
The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accomodate `bit64`'s update. + The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. + We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. 2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. From db5afd26562c60bbd25eb160bcbae957474f253a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 28 Sep 2020 16:34:27 -0600 Subject: [PATCH 096/588] paragraph newlines in news item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 19ac03ccef..81a3b67eac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,7 +19,9 @@ ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accomodate `bit64`'s update. + The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. + We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. 2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. 
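To make the first break concrete, a minimal sketch of the `all.equal` behaviour change described in the note above (the expected outcomes are those documented in the longer wording this commit shortens; assumes `bit64` is installed):

```R
library(bit64)
all.equal(as.integer64(3), as.integer64(4))
# bit64 <  4.0.0 : TRUE  (incorrect; there was no integer64 method, so the comparison always "passed")
# bit64 >= 4.0.0 : not TRUE (the values differ), so tests that compared to a wrong result now fail
```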
From ba2d7bb170bb0aa120192b0daa65605bc4b038c8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 3 Oct 2020 01:48:33 -0600 Subject: [PATCH 097/588] retain PKG_* user-supplied env values (#4735) --- .dev/CRAN_Release.cmd | 6 ++++++ NEWS.md | 4 ++++ configure | 5 ++++- src/Makevars.in | 9 +++++++-- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 2464afde99..66b7b12e67 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -240,6 +240,12 @@ require(data.table) test.data.table() q("no") +# User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 +# Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.1.tar.gz +# Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.1.tar.gz + R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) require(data.table) diff --git a/NEWS.md b/NEWS.md index 81a3b67eac..b82342925d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,10 @@ 3. `test.data.table()` failed in non-English locales such as `LC_TIME=fr_FR.UTF-8` due to `Jan` vs `janv.` in tests 168 and 2042, [#3450](https://github.com/Rdatatable/data.table/issues/3450). Thanks to @shrektan for reporting, and @tdhock for making the tests locale-aware. +4. User-supplied `PKG_LIBS` and `PKG_CFLAGS` are now retained and the suggestion in https://mac.r-project.org/openmp/; i.e., + `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_.tar.gz` +has a better chance of working on Mac. + # data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) diff --git a/configure b/configure index b80d9f1dd9..3d41d87b6e 100755 --- a/configure +++ b/configure @@ -85,7 +85,7 @@ EOF if [ "$R_NO_OPENMP" = "1" ]; then # Compilation failed -- try forcing -fopenmp instead. - "${CC}" "${CFLAGS}" -fopenmp test-omp.c || R_NO_OPENMP=1 + ${CC} ${CFLAGS} -fopenmp test-omp.c || R_NO_OPENMP=1 fi # Clean up. @@ -103,5 +103,8 @@ else echo "OpenMP supported" sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars fi +# retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too. +sed -i "s|@PKG_CFLAGS@|$PKG_CFLAGS|" src/Makevars +sed -i "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars exit 0 diff --git a/src/Makevars.in b/src/Makevars.in index 491b0afa0d..76218cb65a 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -1,5 +1,10 @@ -PKG_CFLAGS = @openmp_cflags@ -PKG_LIBS = @openmp_cflags@ -lz +PKG_CFLAGS = @PKG_CFLAGS@ @openmp_cflags@ +PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ -lz +# See WRE $1.2.1.1. But retain user supplied PKG_* too, #4664. +# WRE states ($1.6) that += isn't portable and that we aren't allowed to use it. +# Otherwise we could use the much simpler PKG_LIBS += @openmp_cflags@ -lz. +# Can't do PKG_LIBS = $(PKG_LIBS)... either because that's a 'recursive variable reference' error in make +# Hence the onerous @...@ substitution. Is it still appropriate in 2020 that we can't use +=? 
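As a follow-up to the retained `PKG_*` flags above, one way to sanity-check from R that an OpenMP-enabled install actually took effect; a hedged sketch only, `getDTthreads()` is data.table's own thread helper but its verbose output may differ slightly between versions:

```R
# Sketch: after installing with user-supplied flags as in CRAN_Release.cmd above, e.g.
#   PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.1.tar.gz
# confirm OpenMP is in use; a non-OpenMP (single-threaded) build reports 1 thread here.
library(data.table)
getDTthreads(verbose=TRUE)
```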
all: $(SHLIB) if [ "$(SHLIB)" != "datatable$(SHLIB_EXT)" ]; then mv $(SHLIB) datatable$(SHLIB_EXT); fi From 041c9c1b93b3e636d8ccd7fc8fabc859c4364dae Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Oct 2020 21:30:44 -0600 Subject: [PATCH 098/588] added test for POUMM ASAN error with 1.13.0 (#4755) --- NEWS.md | 2 +- inst/tests/tests.Rraw | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index b82342925d..d5c53fa9d6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,7 +12,7 @@ 2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. -3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. +3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks. Thanks to Venelin Mitov for assistance in tracing the memory fault. 4. `fread("1.2\n", colClasses='integer')` would segfault when creating the warning message due to no column names in the output, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. 
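To illustrate the fixed behaviour of item 4 at the R level (a sketch; the warning text is the one asserted by tests 2154.1/2154.2 below):

```R
library(data.table)
# no header, so no column names at the C level: previously this segfaulted, now it warns and keeps float64
fread("1.2\n", colClasses="integer")
# Warning: Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored. ...
# with a header, the column name is still included in the warning, as before
fread("A\n1.2\n", colClasses="integer")
# Warning: Attempt to override column 1 <<A>> of inherent type 'float64' down to 'int32' ignored. ...
```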
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e4dbcb480e..dc49a604a5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17150,3 +17150,11 @@ test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0), test(2154.2, fread("A\n0.0\n", colClasses="integer"), data.table(A=0.0), warning="Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.*please") +# asan heap-use-after-free on list columns with attributes on each item, #4746 +DT = data.table(A=INT(1,1,2,3,3,4,5,5,6,7), + B=lapply(1:10, function(x) structure(rnorm(90), foo=c(42,12,36)))) +for (i in 0:4) test(2155+i/10, + { gctorture2(step=20); ans=DT[, .(attr(B[[1L]],"foo")[1L]), by=A]; gctorture2(step=0); gc(); ans }, + data.table(A=1:7, V1=42) +) + From 381623099a9961671ce614ecaf210cb457fe37f5 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Tue, 13 Oct 2020 11:54:54 +0800 Subject: [PATCH 099/588] Fix building from source on Mac OSX (#4743) --- configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 3d41d87b6e..c0745e527e 100755 --- a/configure +++ b/configure @@ -104,7 +104,7 @@ else sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars fi # retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too. -sed -i "s|@PKG_CFLAGS@|$PKG_CFLAGS|" src/Makevars -sed -i "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars +sed -e "s|@PKG_CFLAGS@|$PKG_CFLAGS|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars +sed -e "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars exit 0 From fbaa98515bf180a417ea14863e8c7a23f6084df7 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Oct 2020 23:54:12 -0600 Subject: [PATCH 100/588] added ALTREP to _selfrefok (#4756) --- NEWS.md | 2 ++ src/assign.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index d5c53fa9d6..1f2dbf88a4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,8 @@ 4. `fread("1.2\n", colClasses='integer')` would segfault when creating the warning message due to no column names in the output, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. +5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. + ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accomodate `bit64`'s update. 
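The `assign.c` change that follows adds `!ALTREP()` guards before `SET_TRUELENGTH()`; below is a sketch of the user-level trigger from item 5, taken from the news item (assumes `dplyr` is installed; the comments reflect our reading of the C change rather than wording from the patch):

```R
# Reproducible example from item 5 (#4734): with 64 columns R can hand back ALTREP-wrapped
# vectors (length >= 64 is enough for R to create a wrapper, as noted in test 2156 below),
# and _selfrefok must not call SET_TRUELENGTH on those.
library(data.table)
library(dplyr)
DT = setDT(as.list(1:64))   # 64 columns named V1..V64
mutate(DT, V1 = 11)         # previously: Error "can't set ALTREP truelength"; works after this fix
```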
diff --git a/src/assign.c b/src/assign.c index 579af7d3e5..5a944b4993 100644 --- a/src/assign.c +++ b/src/assign.c @@ -125,14 +125,14 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) { tag = R_ExternalPtrTag(v); if (!(isNull(tag) || isString(tag))) error(_("Internal error: .internal.selfref tag isn't NULL or a character vector")); // # nocov names = getAttrib(x, R_NamesSymbol); - if (names != tag && isString(names)) + if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734 SET_TRUELENGTH(names, LENGTH(names)); // R copied this vector not data.table; it's not actually over-allocated. It looks over-allocated // because R copies the original vector's tl over despite allocating length. prot = R_ExternalPtrProtected(v); if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")). return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r - if (x != R_ExternalPtrAddr(prot)) + if (x!=R_ExternalPtrAddr(prot) && !ALTREP(x)) SET_TRUELENGTH(x, LENGTH(x)); // R copied this vector not data.table, it's not actually over-allocated return checkNames ? names==tag : x==R_ExternalPtrAddr(prot); } From 72bcdea8df042a4c6ddf65ea1394e8788fab4810 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 13 Oct 2020 16:23:00 -0600 Subject: [PATCH 101/588] reinstate README on CRAN (#4757) --- .Rbuildignore | 1 - README.md | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index a910621f52..ad51ae2da7 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,7 +20,6 @@ ^Makefile$ ^NEWS\.0\.md$ -^README\.md$ ^_pkgdown\.yml$ ^src/Makevars$ diff --git a/README.md b/README.md index fe85fd8164..c360b8c0f8 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ * fast and friendly delimited **file reader**: **[`?fread`](https://rdatatable.gitlab.io/data.table/reference/fread.html)**, see also [convenience features for _small_ data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread) * fast and feature rich delimited **file writer**: **[`?fwrite`](https://rdatatable.gitlab.io/data.table/reference/fwrite.html)** * low-level **parallelism**: many common operations are internally parallelized to use multiple CPU threads -* fast and scalable **aggregations**; e.g. 100GB in RAM (see [benchmarks](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping) on up to **two billion rows**) +* fast and scalable aggregations; e.g. 100GB in RAM (see [benchmarks](https://h2oai.github.io/db-benchmark) on up to **two billion rows**) * fast and feature rich joins: **ordered joins** (e.g. rolling forwards, backwards, nearest and limited staleness), **[overlapping range joins](https://github.com/Rdatatable/data.table/wiki/talks/EARL2014_OverlapRangeJoin_Arun.pdf)** (similar to `IRanges::findOverlaps`), **[non-equi joins](https://github.com/Rdatatable/data.table/wiki/talks/ArunSrinivasanUseR2016.pdf)** (i.e. 
joins using operators `>, >=, <, <=`), **aggregate on join** (`by=.EACHI`), **update on join** * fast add/update/delete columns **by reference** by group using no copies at all * fast and feature rich **reshaping** data: **[`?dcast`](https://rdatatable.gitlab.io/data.table/reference/dcast.data.table.html)** (_pivot/wider/spread_) and **[`?melt`](https://rdatatable.gitlab.io/data.table/reference/melt.data.table.html)** (_unpivot/longer/gather_) From 4155bf307058cf1d33b33fd0f206a40b770ebf61 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 13 Oct 2020 18:56:40 -0600 Subject: [PATCH 102/588] .dev-only: rerun.cran and rerun.bioc added to revdepr.R:run() --- .dev/CRAN_Release.cmd | 4 ++-- .dev/revdep.R | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 66b7b12e67..600011ad46 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -360,8 +360,8 @@ test.data.table(script="*.Rraw") # 7 mins (vs 1min normally) under UBSAN, ASAN a # without the fix in PR#3515, the --disable-long-double lumped into this build does now work and correctly reproduces the noLD problem # If any problems, edit ~/.R/Makevars and activate "CFLAGS=-O0 -g" to trace. Rerun 'Rdevel-strict CMD INSTALL' and rerun tests. for (i in 1:10) if (!test.data.table()) break # try several runs maybe even 100; e.g a few tests generate data with a non-fixed random seed -# gctorture(TRUE) # very slow, many days -gctorture2(step=100) # [12-18hrs] under ASAN, UBSAN and --strict-barrier +# gctorture(TRUE) # very slow, many days maybe weeks +gctorture2(step=100) # [12-18hrs (TODO: UPDATE, checkbox in #4637)] under ASAN, UBSAN and --strict-barrier print(Sys.time()); started.at<-proc.time(); try(test.data.table()); print(Sys.time()); print(timetaken(started.at)) ## In case want to ever try again with 32bit on 64bit Ubuntu for tracing any 32bit-only problems diff --git a/.dev/revdep.R b/.dev/revdep.R index 772486558e..6e6792b9b7 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -144,7 +144,7 @@ run = function(pkgs=NULL) { if (length(pkgs)==1) pkgs = strsplit(pkgs, split="[, ]")[[1]] if (anyDuplicated(pkgs)) stop("pkgs contains dups") if (!length(pkgs)) { - opts = c("not.started","cran.fail","bioc.fail","both.fail","rerun.all") + opts = c("not.started","cran.fail","bioc.fail","both.fail","rerun.cran","rerun.bioc","rerun.all") cat(paste0(1:length(opts),": ",opts) , sep="\n") w = suppressWarnings(as.integer(readline("Enter option: "))) if (is.na(w) || !w %in% seq_along(opts)) stop(w," is invalid") @@ -158,6 +158,10 @@ run = function(pkgs=NULL) { cat("Proceed? 
(ctrl-c or enter)\n") scan(quiet=TRUE) system(cmd) + } else if (which=="rerun.cran") { + pkgs = deps[ !grepl("bioconductor", avail[deps,"Repository"]) ] + } else if (which=="rerun.bioc") { + pkgs = deps[ grepl("bioconductor", avail[deps,"Repository"]) ] } else { pkgs = NULL if (which=="not.started") pkgs = deps[!file.exists(paste0("./",deps,".Rcheck"))] # those that haven't run From 75e7dd262dd63b6a4d84888670d7d005887fcf77 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 14 Oct 2020 00:35:11 -0600 Subject: [PATCH 103/588] .dev-only: clean revdeplib of packages built with older R-devel x.y --- .dev/revdep.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 6e6792b9b7..135d354300 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -64,7 +64,17 @@ cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), update.packages(repos=BiocManager::repositories(), checkBuilt=TRUE) # double-check all dependencies are latest too cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") cat("Installed packages built using:\n") -drop(table(installed.packages()[,"Built"])) # ensure all built with this major release of R +x = installed.packages() +drop(table(x[,"Built"])) # manually inspect to ensure all built with this x.y release of R +if (FALSE) { # if not, run this manually replacing "4.0.0" appropriately + for (p in rownames(x)[x[,"Built"]=="4.0.0"]) { + install.packages(p, repos=BiocManager::repositories()) + } + # warnings may suggest many of them were removed from CRAN, so remove the remaining from revdeplib to be clean + x = installed.packages() + remove.packages(rownames(x)[x[,"Built"]=="4.0.0"]) + drop(table(installed.packages()[,"Built"])) # check again to make sure all built in current R-devel x.y version +} # Remove the tar.gz no longer needed : for (p in deps) { From e7018c91e2c5331cdaf9ba5bd0340399540c8a00 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 15 Oct 2020 22:12:34 -0600 Subject: [PATCH 104/588] copy altrep refs to specials too (#4759) --- inst/tests/tests.Rraw | 13 ++++++++++ src/dogroups.c | 6 ++--- src/utils.c | 56 ++++++++++++++++++++++--------------------- 3 files changed, 45 insertions(+), 30 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dc49a604a5..78a391e108 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17158,3 +17158,16 @@ for (i in 0:4) test(2155+i/10, data.table(A=1:7, V1=42) ) +# dogroups.c eval(j) could create list columns containing altrep references to the specials, #4759 +# thanks to revdep testing of 1.13.2 where package tstools revealed this via ts() creating ALTREP, #4758 +# the attr(value,"class")<-"newclass" lines mimics a line at the end of stats::ts(). When the +# length(value)>=64, R creates an ALTREP REF wrapper. Which dogroups.c now catches. +# Hence this test needs to be at least 128 rows, 2 groups of 64 each. +DT = data.table(series=c("ts1","ts2"), value=rnorm(128)) +test(2156.1, DT[,list(list({attr(value,"class")<-"newclass";value})),by=series]$V1[[1L]][1L], + DT[1,value]) +test(2156.2, truelength(DT[,list(list(value)),by=series]$V1[[1L]])>=0L) # not -64 carried over by duplicate() of the .SD column +# cover NULL case in copyAsPlain by putting a NULL alongside a dogroups .SD column. The 'if(.GRP==1L)' is just for fun. 
+test(2156.3, sapply(DT[, list(if (.GRP==1L) list(value,NULL) else list(NULL,value)), by=series]$V1, length), + INT(64,0,0,64)) + diff --git a/src/dogroups.c b/src/dogroups.c index 4c243104a5..6f88c321b6 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -45,7 +45,7 @@ static bool anySpecialStatic(SEXP x) { if (n==0) return false; if (isVectorAtomic(x)) - return TRUELENGTH(x)<0; + return TRUELENGTH(x)<0 || ALTREP(x); if (isNewList(x)) for (int i=0; i make.levels -> rbindlist -> [ -> [.data.table - // Perhaps related to row names and the copyMostAttrib() below is not quite sufficient - - size_t n = XLENGTH(x); - SEXP ans = PROTECT(allocVector(TYPEOF(x), XLENGTH(x))); - switch (TYPEOF(ans)) { + + if (isNull(x)) { + // deal with up front because isNewList(R_NilValue) is true + return R_NilValue; + } + if (!isVectorAtomic(x) && !isNewList(x)) { + // e.g. defer to R the CLOSXP in test 173.3 where a list column item is the function 'mean' + return duplicate(x); + } + const int64_t n = XLENGTH(x); + SEXP ans = PROTECT(allocVector(TYPEOF(x), n)); + switch (TYPEOF(x)) { case RAWSXP: - memcpy(RAW(ans), RAW(x), n*sizeof(Rbyte)); // # nocov; add coverage when ALTREP is turned on for all types - break; // # nocov + memcpy(RAW(ans), RAW(x), n*sizeof(Rbyte)); + break; case LGLSXP: - memcpy(LOGICAL(ans), LOGICAL(x), n*sizeof(Rboolean)); // # nocov - break; // # nocov + memcpy(LOGICAL(ans), LOGICAL(x), n*sizeof(Rboolean)); + break; case INTSXP: memcpy(INTEGER(ans), INTEGER(x), n*sizeof(int)); // covered by 10:1 after test 178 break; @@ -266,22 +267,23 @@ SEXP copyAsPlain(SEXP x) { memcpy(REAL(ans), REAL(x), n*sizeof(double)); // covered by as.Date("2013-01-01")+seq(1,1000,by=10) after test 1075 break; case CPLXSXP: - memcpy(COMPLEX(ans), COMPLEX(x), n*sizeof(Rcomplex)); // # nocov - break; // # nocov + memcpy(COMPLEX(ans), COMPLEX(x), n*sizeof(Rcomplex)); + break; case STRSXP: { const SEXP *xp=STRING_PTR(x); // covered by as.character(as.hexmode(1:500)) after test 642 - for (R_xlen_t i=0; i Date: Fri, 16 Oct 2020 02:49:36 -0600 Subject: [PATCH 105/588] copySharedColumns revised to fix CornerstoneR (#4760) --- inst/tests/tests.Rraw | 7 +++++++ src/dogroups.c | 2 +- src/utils.c | 31 ++++++++++++++++++++----------- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 78a391e108..c981f3e739 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17171,3 +17171,10 @@ test(2156.2, truelength(DT[,list(list(value)),by=series]$V1[[1L]])>=0L) # not - test(2156.3, sapply(DT[, list(if (.GRP==1L) list(value,NULL) else list(NULL,value)), by=series]$V1, length), INT(64,0,0,64)) +# CornerstoneR usage revealed copySharedColumns needed work afer PR#4655 +# this example fails reliably under Rdevel-strict ASAN before the fix in PR#4760 +set.seed(123) +DT = data.table(A=rnorm(100), B=rep(c("a","b"),c(47,53)), C=rnorm(20), D=1:20) +test(2157, DT[, head(setorderv(.SD, "A")), by=B]$D, + INT(18,6,3,8,9,6,12,17,18,5,20,4)) + diff --git a/src/dogroups.c b/src/dogroups.c index 6f88c321b6..15962e697c 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -45,7 +45,7 @@ static bool anySpecialStatic(SEXP x) { if (n==0) return false; if (isVectorAtomic(x)) - return TRUELENGTH(x)<0 || ALTREP(x); + return ALTREP(x) || TRUELENGTH(x)<0; if (isNewList(x)) for (int i=0; i1?"s":""); // GetVerbose() (slightly expensive call of all options) called here only when needed From 7b10d4ccf94752f5260e0b7ec21d05662da5f1a4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Oct 
2020 02:55:45 -0600 Subject: [PATCH 106/588] .dev-only: update runtime to 74hrs in CRAN_Release.cmd, checkbox in #4637 --- .dev/CRAN_Release.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 600011ad46..0e970df497 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -361,7 +361,7 @@ test.data.table(script="*.Rraw") # 7 mins (vs 1min normally) under UBSAN, ASAN a # If any problems, edit ~/.R/Makevars and activate "CFLAGS=-O0 -g" to trace. Rerun 'Rdevel-strict CMD INSTALL' and rerun tests. for (i in 1:10) if (!test.data.table()) break # try several runs maybe even 100; e.g a few tests generate data with a non-fixed random seed # gctorture(TRUE) # very slow, many days maybe weeks -gctorture2(step=100) # [12-18hrs (TODO: UPDATE, checkbox in #4637)] under ASAN, UBSAN and --strict-barrier +gctorture2(step=100) # 74 hours under ASAN, UBSAN and --strict-barrier print(Sys.time()); started.at<-proc.time(); try(test.data.table()); print(Sys.time()); print(timetaken(started.at)) ## In case want to ever try again with 32bit on 64bit Ubuntu for tracing any 32bit-only problems From f7b306c6e7b44df8552bf72578abdce99bd41ba4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Oct 2020 12:41:58 -0600 Subject: [PATCH 107/588] .dev-only: added R_DEFAULT_INTERNET_TIMEOUT=300 to revdepsh --- .dev/.bash_aliases | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 42388a7a4e..effcc7d9ea 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -5,7 +5,7 @@ # git config --global difftool.prompt false alias gd='git difftool &> /dev/null' alias gdm='git difftool master &> /dev/null' -# If meld has scrolling issues, turn off GTK animation (which I don't need anyway): +# If meld has scrolling issues, turn off GTK animation which I don't need: # https://gitlab.gnome.org/GNOME/meld/-/issues/479#note_866040 alias Rdevel='~/build/R-devel/bin/R --vanilla' @@ -14,7 +14,8 @@ alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' -alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false' + +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false && export R_DEFAULT_INTERNET_TIMEOUT=300' alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R' export R_PROFILE_USER='~/.Rprofile' From bca83e0f695a3ddd403f5448df41e13a7e737002 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Oct 2020 13:21:38 -0600 Subject: [PATCH 108/588] http: -> https: in .md, .Rd and .Rmd, to reduce win-builder output notes --- NEWS.0.md | 182 ++++++++++---------- NEWS.md | 6 +- README.md | 4 +- man/IDateTime.Rd | 4 +- man/address.Rd | 2 +- man/assign.Rd | 2 +- man/data.table.Rd | 4 +- man/foverlaps.Rd | 2 +- man/fread.Rd | 38 ++-- man/fwrite.Rd | 4 +- man/groupingsets.Rd | 4 +- man/merge.Rd | 2 +- man/rleid.Rd | 2 +- man/setDT.Rd | 2 +- man/setNumericRounding.Rd | 6 +- man/setkey.Rd | 2 +- man/setorder.Rd | 2 +- vignettes/datatable-faq.Rmd | 6 +- vignettes/datatable-intro.Rmd | 2 +- vignettes/datatable-reference-semantics.Rmd | 2 +- vignettes/datatable-sd-usage.Rmd | 4 +- 21 files 
changed, 141 insertions(+), 141 deletions(-) diff --git a/NEWS.0.md b/NEWS.0.md index dee284d37f..67f4f73af1 100644 --- a/NEWS.0.md +++ b/NEWS.0.md @@ -15,14 +15,14 @@ 1. `fwrite()` - parallel .csv writer: * Thanks to Otto Seiskari for the initial pull request [#580](https://github.com/Rdatatable/data.table/issues/580) that provided C code, R wrapper, manual page and extensive tests. - * From there Matt parallelized and specialized C functions for writing integer/numeric exactly matching `write.csv` between 2.225074e-308 and 1.797693e+308 to 15 significant figures, dates (between 0000-03-01 and 9999-12-31), times down to microseconds in POSIXct, automatic quoting, `bit64::integer64`, `row.names` and `sep2` for `list` columns where each cell can itself be a vector. See [this blog post](http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/) for implementation details and benchmarks. + * From there Matt parallelized and specialized C functions for writing integer/numeric exactly matching `write.csv` between 2.225074e-308 and 1.797693e+308 to 15 significant figures, dates (between 0000-03-01 and 9999-12-31), times down to microseconds in POSIXct, automatic quoting, `bit64::integer64`, `row.names` and `sep2` for `list` columns where each cell can itself be a vector. See [this blog post](https://blog.h2o.ai/2016/04/fast-csv-writing-for-r/) for implementation details and benchmarks. * Accepts any `list` of same length vectors; e.g. `data.frame` and `data.table`. * Caught in development before release to CRAN: thanks to Francesco Grossetti for [#1725](https://github.com/Rdatatable/data.table/issues/1725) (NA handling), Torsten Betz for [#1847](https://github.com/Rdatatable/data.table/issues/1847) (rounding of 9.999999999999998) and @ambils for [#1903](https://github.com/Rdatatable/data.table/issues/1903) (> 1 million columns). * `fwrite` status was tracked here: [#1664](https://github.com/Rdatatable/data.table/issues/1664) 2. `fread()`: * gains `quote` argument. `quote = ""` disables quoting altogether which reads each field *as is*, [#1367](https://github.com/Rdatatable/data.table/issues/1367). Thanks @manimal. - * With [#1462](https://github.com/Rdatatable/data.table/issues/1462) fix, quotes are handled slightly better. Thanks @Pascal for [posting on SO](http://stackoverflow.com/q/34144314/559784). + * With [#1462](https://github.com/Rdatatable/data.table/issues/1462) fix, quotes are handled slightly better. Thanks @Pascal for [posting on SO](https://stackoverflow.com/q/34144314/559784). * gains `blank.lines.skip` argument that continues reading by skipping empty lines. Default is `FALSE` for backwards compatibility, [#530](https://github.com/Rdatatable/data.table/issues/530). Thanks @DirkJonker. Also closes [#1575](https://github.com/Rdatatable/data.table/issues/1575). * gains `fill` argument with default `FALSE` for backwards compatibility. Closes [#536](https://github.com/Rdatatable/data.table/issues/536). Also, `fill=TRUE` prioritises maximum cols instead of longest run with identical columns when `fill=TRUE` which allows handle missing columns slightly more robustly, [#1573](https://github.com/Rdatatable/data.table/issues/1573). * gains `key` argument, [#590](https://github.com/Rdatatable/data.table/issues/590). @@ -53,7 +53,7 @@ * `var`, `sd` and `prod` are all GForce optimised for speed and memory. Partly addresses [#523](https://github.com/Rdatatable/data.table/issues/523). See that post for benchmarks. 8. 
Reshaping: - * `dcast.data.table` now allows `drop = c(FALSE, TRUE)` and `drop = c(TRUE, FALSE)`. The former only fills all missing combinations of formula LHS, where as the latter fills only all missing combinations of formula RHS. Thanks to Ananda Mahto for [this SO post](http://stackoverflow.com/q/34830908/559784) and to Jaap for filing [#1512](https://github.com/Rdatatable/data.table/issues/1512). + * `dcast.data.table` now allows `drop = c(FALSE, TRUE)` and `drop = c(TRUE, FALSE)`. The former only fills all missing combinations of formula LHS, where as the latter fills only all missing combinations of formula RHS. Thanks to Ananda Mahto for [this SO post](https://stackoverflow.com/q/34830908/559784) and to Jaap for filing [#1512](https://github.com/Rdatatable/data.table/issues/1512). * `melt.data.table` finds variables provided to `patterns()` when called from within user defined functions, [#1749](https://github.com/Rdatatable/data.table/issues/1749). Thanks to @kendonB for the report. 9. We can now refer to the columns that are not mentioned in `.SD` / `.SDcols` in `j` as well. For example, `DT[, .(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:v3]` works as expected, [#495](https://github.com/Rdatatable/data.table/issues/495). Thanks to @MattWeller for report and to others for linking various SO posts to be updated. Also closes [#484](https://github.com/Rdatatable/data.table/issues/484). @@ -74,7 +74,7 @@ 17. `rleid()` gains `prefix` argument, similar to `rowid()`. - 18. `shift()` understands and operates on list-of-list inputs as well, [#1595](https://github.com/Rdatatable/data.table/issues/1595). Thanks to @enfascination and to @chris for [asking on SO](http://stackoverflow.com/q/38900293/559784). + 18. `shift()` understands and operates on list-of-list inputs as well, [#1595](https://github.com/Rdatatable/data.table/issues/1595). Thanks to @enfascination and to @chris for [asking on SO](https://stackoverflow.com/q/38900293/559784). 19. `uniqueN` gains `na.rm` argument, [#1455](https://github.com/Rdatatable/data.table/issues/1455). @@ -137,7 +137,7 @@ 17. `uniqueN()` now handles NULL properly, [#1429](https://github.com/Rdatatable/data.table/issues/1429). Thanks @JanGorecki. - 18. GForce `min` and `max` functions handle `NaN` correctly, [#1461](https://github.com/Rdatatable/data.table/issues/1461). Thanks to @LyssBucks for [asking on SO](http://stackoverflow.com/q/34081848/559784). + 18. GForce `min` and `max` functions handle `NaN` correctly, [#1461](https://github.com/Rdatatable/data.table/issues/1461). Thanks to @LyssBucks for [asking on SO](https://stackoverflow.com/q/34081848/559784). 19. Warnings on unable to detect column types from middle/last 5 lines are now moved to messages when `verbose=TRUE`. Closes [#1124](https://github.com/Rdatatable/data.table/issues/1124). @@ -163,7 +163,7 @@ 30. `rbindlist` (and `rbind`) works as expected when `fill = TRUE` and the first element of input list doesn't have columns present in other elements of the list, [#1549](https://github.com/Rdatatable/data.table/issues/1549). Thanks to @alexkowa. - 31. `DT[, .(col), with=FALSE]` now returns a meaningful error message, [#1440](https://github.com/Rdatatable/data.table/issues/1440). Thanks to @VasilyA for [posting on SO](http://stackoverflow.com/q/33851742/559784). + 31. `DT[, .(col), with=FALSE]` now returns a meaningful error message, [#1440](https://github.com/Rdatatable/data.table/issues/1440). Thanks to @VasilyA for [posting on SO](https://stackoverflow.com/q/33851742/559784). 32. 
Fixed a segault in `forder` when elements of input list are not of same length, [#1531](https://github.com/Rdatatable/data.table/issues/1531). Thanks to @MichaelChirico. @@ -201,7 +201,7 @@ 49. UTF8 BOM header is excluded properly in `fread()`, [#1087](https://github.com/Rdatatable/data.table/issues/1087) and [#1465](https://github.com/Rdatatable/data.table/issues/1465). Thanks to @nigmastar and @MichaelChirico. - 50. Joins using `on=` retains (and discards) keys properly, [#1268](https://github.com/Rdatatable/data.table/issues/1268). Thanks @DouglasClark for [this SO post](http://stackoverflow.com/q/29918595/559784) that helped discover the issue. + 50. Joins using `on=` retains (and discards) keys properly, [#1268](https://github.com/Rdatatable/data.table/issues/1268). Thanks @DouglasClark for [this SO post](https://stackoverflow.com/q/29918595/559784) that helped discover the issue. 51. Secondary keys are properly removed when those columns get updated, [#1479](https://github.com/Rdatatable/data.table/issues/1479). Thanks @fabiangehring for the report, and also @ChristK for the MRE. @@ -245,7 +245,7 @@ 70. Retaining / removing keys is handled better when join is performed on non-key columns using `on` argument, [#1766](https://github.com/Rdatatable/data.table/issues/1766), [#1704](https://github.com/Rdatatable/data.table/issues/1704) and [#1823](https://github.com/Rdatatable/data.table/issues/1823). Thanks @mllg, @DavidArenburg and @mllg. - 71. `rbind` for data.tables now coerces non-list inputs to data.tables first before calling `rbindlist` so that binding list of data.tables and matrices work as expected to be consistent with base's rbind, [#1626](https://github.com/Rdatatable/data.table/issues/1626). Thanks @ems for reporting [here](http://stackoverflow.com/q/34426957/559784) on SO. + 71. `rbind` for data.tables now coerces non-list inputs to data.tables first before calling `rbindlist` so that binding list of data.tables and matrices work as expected to be consistent with base's rbind, [#1626](https://github.com/Rdatatable/data.table/issues/1626). Thanks @ems for reporting [here](https://stackoverflow.com/q/34426957/559784) on SO. 72. Subassigning a factor column with `NA` works as expected. Also, the warning message on coercion is suppressed when RHS is singleton NA, [#1740](https://github.com/Rdatatable/data.table/issues/1740). Thanks @Zus. @@ -357,7 +357,7 @@ 1. `fread` * passes `showProgress=FALSE` through to `download.file()` (as `quiet=TRUE`). Thanks to a pull request from Karl Broman and Richard Scriven for filing the issue, [#741](https://github.com/Rdatatable/data.table/issues/741). * accepts `dec=','` (and other non-'.' decimal separators), [#917](https://github.com/Rdatatable/data.table/issues/917). A new paragraph has been added to `?fread`. On Windows this should just-work. On Unix it may just-work but if not you will need to read the paragraph for an extra step. In case it somehow breaks `dec='.'`, this new feature can be turned off with `options(datatable.fread.dec.experiment=FALSE)`. - * Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784). + * Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. 
Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](https://stackoverflow.com/q/31350209/559784). * gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027). * gains `encoding` argument. Acceptable values are "unknown", "UTF-8" and "Latin-1" with default value of "unknown". Closes [#563](https://github.com/Rdatatable/data.table/issues/563). Thanks to @BenMarwick for the original report and to the many requests from others, and Q on SO. * gains `col.names` argument, and is similar to `base::read.table()`. Closes [#768](https://github.com/Rdatatable/data.table/issues/768). Thanks to @dardesta for filing the FR. @@ -393,7 +393,7 @@ 13. `dcast` can now: * cast multiple `value.var` columns simultaneously. Closes [#739](https://github.com/Rdatatable/data.table/issues/739). * accept multiple functions under `fun.aggregate`. Closes [#716](https://github.com/Rdatatable/data.table/issues/716). - * supports optional column prefixes as mentioned under [this SO post](http://stackoverflow.com/q/26225206/559784). Closes [#862](https://github.com/Rdatatable/data.table/issues/862). Thanks to @JohnAndrews. + * supports optional column prefixes as mentioned under [this SO post](https://stackoverflow.com/q/26225206/559784). Closes [#862](https://github.com/Rdatatable/data.table/issues/862). Thanks to @JohnAndrews. * works with undefined variables directly in formula. Closes [#1037](https://github.com/Rdatatable/data.table/issues/1037). Thanks to @DavidArenburg for the MRE. * Naming conventions on multiple columns changed according to [#1153](https://github.com/Rdatatable/data.table/issues/1153). Thanks to @MichaelChirico for the FR. * also has a `sep` argument with default `_` for backwards compatibility. [#1210](https://github.com/Rdatatable/data.table/issues/1210). Thanks to @dbetebenner for the FR. @@ -465,7 +465,7 @@ * Works fine when RHS is of `list` type - quite unusual operation but could happen. Closes [#961](https://github.com/Rdatatable/data.table/issues/961). Thanks to @Gsee for the minimal report. * Auto indexing errored in some cases when LHS and RHS were not of same type. This is fixed now. Closes [#957](https://github.com/Rdatatable/data.table/issues/957). Thanks to @GSee for the minimal report. * `DT[x == 2.5]` where `x` is integer type resulted in `val` being coerced to integer (for binary search) and therefore returned incorrect result. This is now identified using the function `isReallyReal()` and if so, auto indexing is turned off. Closes [#1050](https://github.com/Rdatatable/data.table/issues/1050). - * Auto indexing errored during `DT[x %in% val]` when `val` has some values not present in `x`. Closes [#1072](https://github.com/Rdatatable/data.table/issues/1072). Thanks to @CarlosCinelli for asking on [StackOverflow](http://stackoverflow.com/q/28932742/559784). + * Auto indexing errored during `DT[x %in% val]` when `val` has some values not present in `x`. Closes [#1072](https://github.com/Rdatatable/data.table/issues/1072). Thanks to @CarlosCinelli for asking on [StackOverflow](https://stackoverflow.com/q/28932742/559784). 7. `as.data.table.list` with list input having 0-length items, e.g. `x = list(a=integer(0), b=3:4)`. 
`as.data.table(x)` recycles item `a` with `NA`s to fit the length of the longer column `b` (length=2), as before now, but with an additional warning message that the item has been recycled with `NA`. Closes [#847](https://github.com/Rdatatable/data.table/issues/847). Thanks to @tvinodr for the report. This was a regression from 1.9.2. @@ -477,7 +477,7 @@ In both these cases (and during a `not-join` which was already fixed in [1.9.4](https://github.com/Rdatatable/data.table/blob/master/README.md#bug-fixes-1)), `allow.cartesian` can be safely ignored. - 10. `names<-.data.table` works as intended on data.table unaware packages with Rv3.1.0+. Closes [#476](https://github.com/Rdatatable/data.table/issues/476) and [#825](https://github.com/Rdatatable/data.table/issues/825). Thanks to ezbentley for reporting [here](http://stackoverflow.com/q/23256177/559784) on SO and to @narrenfrei. + 10. `names<-.data.table` works as intended on data.table unaware packages with Rv3.1.0+. Closes [#476](https://github.com/Rdatatable/data.table/issues/476) and [#825](https://github.com/Rdatatable/data.table/issues/825). Thanks to ezbentley for reporting [here](https://stackoverflow.com/q/23256177/559784) on SO and to @narrenfrei. 11. `.EACHI` is now an exported symbol (just like `.SD`,`.N`,`.I`,`.GRP` and `.BY` already were) so that packages using `data.table` and `.EACHI` pass `R CMD check` with no NOTE that this symbol is undefined. Thanks to Matt Bannert for highlighting. @@ -487,7 +487,7 @@ 14. `format.ITime` now handles negative values properly. Closes [#811](https://github.com/Rdatatable/data.table/issues/811). Thanks to @StefanFritsch for the report along with the fix! - 15. Compatibility with big endian machines (e.g., SPARC and PowerPC) is restored. Most Windows, Linux and Mac systems are little endian; type `.Platform$endian` to confirm. Thanks to Gerhard Nachtmann for reporting and the [QEMU project](http://qemu.org/) for their PowerPC emulator. + 15. Compatibility with big endian machines (e.g., SPARC and PowerPC) is restored. Most Windows, Linux and Mac systems are little endian; type `.Platform$endian` to confirm. Thanks to Gerhard Nachtmann for reporting and the [QEMU project](https://qemu.org/) for their PowerPC emulator. 16. `DT[, LHS := RHS]` with RHS is of the form `eval(parse(text = foo[1]))` referring to columns in `DT` is now handled properly. Closes [#880](https://github.com/Rdatatable/data.table/issues/880). Thanks to tyner. @@ -497,13 +497,13 @@ 19. Updating `.SD` by reference using `set` also errors appropriately now; similar to `:=`. Closes [#927](https://github.com/Rdatatable/data.table/issues/927). Thanks to @jrowen for the minimal example. - 20. `X[Y, .N]` returned the same result as `X[Y, .N, nomatch=0L]`) when `Y` contained rows that has no matches in `X`. Fixed now. Closes [#963](https://github.com/Rdatatable/data.table/issues/963). Thanks to [this SO post](http://stackoverflow.com/q/27004002/559784) from @Alex which helped discover the bug. + 20. `X[Y, .N]` returned the same result as `X[Y, .N, nomatch=0L]`) when `Y` contained rows that has no matches in `X`. Fixed now. Closes [#963](https://github.com/Rdatatable/data.table/issues/963). Thanks to [this SO post](https://stackoverflow.com/q/27004002/559784) from @Alex which helped discover the bug. 21. `data.table::dcast` handles levels in factor columns properly when `drop = FALSE`. Closes [#893](https://github.com/Rdatatable/data.table/issues/893). Thanks to @matthieugomez for the great minimal example. 22. 
`[.data.table` subsets complex and raw type objects again. Thanks to @richierocks for the nice minimal example. Closes [#982](https://github.com/Rdatatable/data.table/issues/982). - 23. Fixed a bug in the internal optimisation of `j-expression` with more than one `lapply(.SD, function(..) ..)` as illustrated [here on SO](http://stackoverflow.com/a/27495844/559784). Closes #985. Thanks to @jadaliha for the report and to @BrodieG for the debugging on SO. + 23. Fixed a bug in the internal optimisation of `j-expression` with more than one `lapply(.SD, function(..) ..)` as illustrated [here on SO](https://stackoverflow.com/a/27495844/559784). Closes #985. Thanks to @jadaliha for the report and to @BrodieG for the debugging on SO. 24. `mget` fetches columns from the default environment `.SD` when called from within the frame of `DT`. That is, `DT[, mget(cols)]`, `DT[, lapply(mget(cols), sum), by=.]` etc.. work as intended. Thanks to @Roland for filing this issue. Closes [#994](https://github.com/Rdatatable/data.table/issues/994). @@ -537,7 +537,7 @@ 39. `setattr` now returns an error when trying to set `data.table` and/or `data.frame` as class to a *non-list* type object (ex: `matrix`). Closes [#832](https://github.com/Rdatatable/data.table/issues/832). Thanks to @Rick for the minimal example. - 40. data.table(table) works as expected. Closes [#1043](https://github.com/Rdatatable/data.table/issues/1043). Thanks to @rnso for the [SO post](http://stackoverflow.com/q/28499359/559784). + 40. data.table(table) works as expected. Closes [#1043](https://github.com/Rdatatable/data.table/issues/1043). Thanks to @rnso for the [SO post](https://stackoverflow.com/q/28499359/559784). 41. Joins and binary search based subsets of the form `x[i]` where `x`'s key column is integer and `i` a logical column threw an error before. This is now fixed by converting the logical column to integer type and then performing the join, so that it works as expected. @@ -551,14 +551,14 @@ 46. `DT[rows, newcol := NULL]` resulted in a segfault on the next assignment by reference. Closes [#1082](https://github.com/Rdatatable/data.table/issues/1082). Thanks to @stevenbagley for the MRE. - 47. `as.matrix(DT)` handles cases where `DT` contains both numeric and logical columns correctly (doesn't coerce to character columns anymore). Closes [#1083](https://github.com/Rdatatable/data.table/issues/1083). Thanks to @bramvisser for the [SO post](http://stackoverflow.com/questions/29068328/correlation-between-numeric-and-logical-variable-gives-intended-error). + 47. `as.matrix(DT)` handles cases where `DT` contains both numeric and logical columns correctly (doesn't coerce to character columns anymore). Closes [#1083](https://github.com/Rdatatable/data.table/issues/1083). Thanks to @bramvisser for the [SO post](https://stackoverflow.com/questions/29068328/correlation-between-numeric-and-logical-variable-gives-intended-error). 48. Coercion is handled properly on subsets/joins on `integer64` key columns. Closes [#1108](https://github.com/Rdatatable/data.table/issues/1108). Thanks to @vspinu. 49. `setDT()` and `as.data.table()` both strip *all classes* preceding *data.table*/*data.frame*, to be consistent with base R. Closes [#1078](https://github.com/Rdatatable/data.table/issues/1078) and [#1128](https://github.com/Rdatatable/data.table/issues/1128). Thanks to Jan and @helix123 for the reports. 50. `setattr(x, 'levels', value)` handles duplicate levels in `value` - appropriately. 
Thanks to Jeffrey Horner for pointing it out [here](http://jeffreyhorner.tumblr.com/post/118297392563/tidyr-challenge-help-me-do-my-job). Closes [#1142](https://github.com/Rdatatable/data.table/issues/1142). + appropriately. Thanks to Jeffrey Horner for pointing it out [here](https://jeffreyhorner.tumblr.com/post/118297392563/tidyr-challenge-help-me-do-my-job). Closes [#1142](https://github.com/Rdatatable/data.table/issues/1142). 51. `x[J(vals), .N, nomatch=0L]` also included no matches in result, [#1074](https://github.com/Rdatatable/data.table/issues/1074). And `x[J(...), col := val, nomatch=0L]` returned a warning with incorrect results when join resulted in no matches as well, even though `nomatch=0L` should have no effect in `:=`, [#1092](https://github.com/Rdatatable/data.table/issues/1092). Both issues are fixed now. Thanks to @riabusan and @cguill95 for #1092. @@ -658,15 +658,15 @@ ``` where `top` is a non-join column in `Y`; i.e., join inherited column. Thanks to many, especially eddi, Sadao Milberg and Gabor Grothendieck for extended discussions. Closes [#538](https://github.com/Rdatatable/data.table/issues/538). -2. Accordingly, `X[Y, j]` now does what `X[Y][, j]` did. To return the old behaviour: `options(datatable.old.bywithoutby=TRUE)`. This is a temporary option to aid migration and will be removed in future. See [this](http://stackoverflow.com/questions/16093289/data-table-join-and-j-expression-unexpected-behavior) and [this](http://stackoverflow.com/a/16222108/403310) post for discussions and motivation. +2. Accordingly, `X[Y, j]` now does what `X[Y][, j]` did. To return the old behaviour: `options(datatable.old.bywithoutby=TRUE)`. This is a temporary option to aid migration and will be removed in future. See [this](https://stackoverflow.com/questions/16093289/data-table-join-and-j-expression-unexpected-behavior) and [this](https://stackoverflow.com/a/16222108/403310) post for discussions and motivation. 3. `Overlap joins` ([#528](https://github.com/Rdatatable/data.table/issues/528)) is now here, finally!! Except for `type="equal"` and `maxgap` and `minoverlap` arguments, everything else is implemented. Check out `?foverlaps` and the examples there on its usage. This is a major feature addition to `data.table`. 4. `DT[column==value]` and `DT[column %in% values]` are now optimized to use `DT`'s key when `key(DT)[1]=="column"`, otherwise a secondary key (a.k.a. _index_) is automatically added so the next `DT[column==value]` is much faster. No code changes are needed; existing code should automatically benefit. Secondary keys can be added manually using `set2key()` and existence checked using `key2()`. These optimizations and function names/arguments are experimental and may be turned off with `options(datatable.auto.index=FALSE)`. 5. `fread()`: - * accepts line breaks inside quoted fields. Thanks to Clayton Stanley for highlighting [here](http://stackoverflow.com/questions/21006661/fread-and-a-quoted-multi-line-column-value). - * accepts trailing backslash in quoted fields. Thanks to user2970844 for highlighting [here](http://stackoverflow.com/questions/24375832/fread-and-column-with-a-trailing-backslash). + * accepts line breaks inside quoted fields. Thanks to Clayton Stanley for highlighting [here](https://stackoverflow.com/questions/21006661/fread-and-a-quoted-multi-line-column-value). + * accepts trailing backslash in quoted fields. Thanks to user2970844 for highlighting [here](https://stackoverflow.com/questions/24375832/fread-and-column-with-a-trailing-backslash). 
* Blank and `"NA"` values in logical columns (`T`,`True`,`TRUE`) no longer cause them to be read as character, [#567](https://github.com/Rdatatable/data.table/issues/567). Thanks to Adam November for reporting. * URLs now work on Windows. R's `download.file()` converts `\r\n` to `\r\r\n` on Windows. Now avoided by downloading in binary mode. Thanks to Steve Miller and Dean MacGregor for reporting, [#492](https://github.com/Rdatatable/data.table/issues/492). * Fixed segfault in sparse data files when bumping to character, [#796](https://github.com/Rdatatable/data.table/issues/796) and [#722](https://github.com/Rdatatable/data.table/issues/722). Thanks to Adam Kennedy and Richard Cotton for the detailed reproducible reports. @@ -693,7 +693,7 @@ * And incredibly fast ;). * Documentation updated in much detail. Closes [#333](https://github.com/Rdatatable/data.table/issues/333). - 8. `bit64::integer64` now works in grouping and joins, [#342](https://github.com/Rdatatable/data.table/issues/342). Thanks to James Sams for highlighting UPCs and Clayton Stanley for [this SO post](http://stackoverflow.com/questions/22273321/large-integers-in-data-table-grouping-results-different-in-1-9-2-compared-to-1). `fread()` has been detecting and reading `integer64` for a while. + 8. `bit64::integer64` now works in grouping and joins, [#342](https://github.com/Rdatatable/data.table/issues/342). Thanks to James Sams for highlighting UPCs and Clayton Stanley for [this SO post](https://stackoverflow.com/questions/22273321/large-integers-in-data-table-grouping-results-different-in-1-9-2-compared-to-1). `fread()` has been detecting and reading `integer64` for a while. 9. `setNumericRounding()` may be used to reduce to 1 byte or 0 byte rounding when joining to or grouping columns of type 'numeric', [#342](https://github.com/Rdatatable/data.table/issues/342). See example in `?setNumericRounding` and NEWS item below for v1.9.2. `getNumericRounding()` returns the current setting. @@ -773,7 +773,7 @@ 29. `setorder()` and `setorderv()` gain `na.last = TRUE/FALSE`. Closes [#706](https://github.com/Rdatatable/data.table/issues/706). - 30. `.N` is now available in `i`, [FR#724](https://github.com/Rdatatable/data.table/issues/724). Thanks to newbie indirectly [here](http://stackoverflow.com/a/24649115/403310) and Farrel directly [here](http://stackoverflow.com/questions/24685421/how-do-you-extract-a-few-random-rows-from-a-data-table-on-the-fly). + 30. `.N` is now available in `i`, [FR#724](https://github.com/Rdatatable/data.table/issues/724). Thanks to newbie indirectly [here](https://stackoverflow.com/a/24649115/403310) and Farrel directly [here](https://stackoverflow.com/questions/24685421/how-do-you-extract-a-few-random-rows-from-a-data-table-on-the-fly). 31. `by=.EACHI` is now implemented for *not-joins* as well. Closes [#604](https://github.com/Rdatatable/data.table/issues/604). Thanks to Garrett See for filing the FR. As an example: ```R @@ -791,7 +791,7 @@ DT[.(1), list(b,...)] # correct result again (joining just to a not b but using b) ``` - 2. `setkey` works again when a non-key column is type list (e.g. each cell can itself be a vector), [#54](https://github.com/Rdatatable/data.table/issues/54). Test added. Thanks to James Sams, Michael Nelson and Musx [for the reproducible examples](http://stackoverflow.com/questions/22186798/r-data-table-1-9-2-issue-on-setkey). + 2. `setkey` works again when a non-key column is type list (e.g. 
each cell can itself be a vector), [#54](https://github.com/Rdatatable/data.table/issues/54). Test added. Thanks to James Sams, Michael Nelson and Musx [for the reproducible examples](https://stackoverflow.com/questions/22186798/r-data-table-1-9-2-issue-on-setkey). 3. The warning "internal TRUE value has been modified" with recently released R 3.1 when grouping a table containing a logical column *and* where all groups are just 1 row is now fixed and tests added. Thanks to James Sams for the reproducible example. The warning is issued by R and we have asked if it can be upgraded to error (UPDATE: change now made for R 3.1.1 thanks to Luke Tierney). @@ -799,19 +799,19 @@ 5. `unique()` now returns a null data.table, [#44](https://github.com/Rdatatable/data.table/issues/44). Thanks to agstudy for reporting. - 6. `data.table()` converted POSIXlt to POSIXct, consistent with `base:::data.frame()`, but now also provides a helpful warning instead of coercing silently, [#59](https://github.com/Rdatatable/data.table/issues/59). Thanks to Brodie Gaslam, Patrick and Ragy Isaac for reporting [here](http://stackoverflow.com/questions/21487614/error-creating-r-data-table-with-date-time-posixlt) and [here](http://stackoverflow.com/questions/21320215/converting-from-data-frame-to-data-table-i-get-an-error-with-head). + 6. `data.table()` converted POSIXlt to POSIXct, consistent with `base:::data.frame()`, but now also provides a helpful warning instead of coercing silently, [#59](https://github.com/Rdatatable/data.table/issues/59). Thanks to Brodie Gaslam, Patrick and Ragy Isaac for reporting [here](https://stackoverflow.com/questions/21487614/error-creating-r-data-table-with-date-time-posixlt) and [here](https://stackoverflow.com/questions/21320215/converting-from-data-frame-to-data-table-i-get-an-error-with-head). 7. If another class inherits from data.table; e.g. `class(DT) == c("UserClass","data.table","data.frame")` then `DT[...]` now retains `UserClass` in the result. Thanks to Daniel Krizian for reporting, [#64](https://github.com/Rdatatable/data.table/issues/44). Test added. - 8. An error `object '' not found` could occur in some circumstances, particularly after a previous error. [Reported on SO](http://stackoverflow.com/questions/22128047/how-to-avoid-weird-umlaute-error-when-using-data-table) with non-ASCII characters in a column name, a red herring we hope since non-ASCII characters are fully supported in data.table including in column names. Fix implemented and tests added. + 8. An error `object '' not found` could occur in some circumstances, particularly after a previous error. [Reported on SO](https://stackoverflow.com/questions/22128047/how-to-avoid-weird-umlaute-error-when-using-data-table) with non-ASCII characters in a column name, a red herring we hope since non-ASCII characters are fully supported in data.table including in column names. Fix implemented and tests added. 9. Column order was reversed in some cases by `as.data.table.table()`, [#43](https://github.com/Rdatatable/data.table/issues/43). Test added. Thanks to Benjamin Barnes for reporting. 10. `DT[, !"missingcol", with=FALSE]` now returns `DT` (rather than a NULL data.table) with warning that "missingcol" is not present. - 11. `DT[,y := y * eval(parse(text="1*2"))]` resulted in error unless `eval()` was wrapped with paranthesis. That is, `DT[,y := y * (eval(parse(text="1*2")))]`, **#5423**. 
Thanks to Wet Feet for reporting and to Simon O'Hanlon for identifying the issue [here on SO](http://stackoverflow.com/questions/22375404/unable-to-use-evalparse-in-data-table-function/22375557#22375557). + 11. `DT[,y := y * eval(parse(text="1*2"))]` resulted in error unless `eval()` was wrapped with parentheses. That is, `DT[,y := y * (eval(parse(text="1*2")))]`, **#5423**. Thanks to Wet Feet for reporting and to Simon O'Hanlon for identifying the issue [here on SO](https://stackoverflow.com/questions/22375404/unable-to-use-evalparse-in-data-table-function/22375557#22375557). - 12. Using `by` columns with attributes (ex: factor, Date) in `j` did not retain the attributes, also in case of `:=`. This was partially a regression from an earlier fix ([#155](https://github.com/Rdatatable/data.table/issues/155)) due to recent changes for R3.1.0. Now fixed and clearer tests added. Thanks to Christophe Dervieux for reporting and to Adam B for reporting [here on SO](http://stackoverflow.com/questions/22536586/by-seems-to-not-retain-attribute-of-date-type-columns-in-data-table-possibl). Closes [#36](https://github.com/Rdatatable/data.table/issues/36). + 12. Using `by` columns with attributes (ex: factor, Date) in `j` did not retain the attributes, also in case of `:=`. This was partially a regression from an earlier fix ([#155](https://github.com/Rdatatable/data.table/issues/155)) due to recent changes for R3.1.0. Now fixed and clearer tests added. Thanks to Christophe Dervieux for reporting and to Adam B for reporting [here on SO](https://stackoverflow.com/questions/22536586/by-seems-to-not-retain-attribute-of-date-type-columns-in-data-table-possibl). Closes [#36](https://github.com/Rdatatable/data.table/issues/36). 13. `.BY` special variable did not retain names of the grouping columns which resulted in not being able to access `.BY$grpcol` in `j`. Ex: `DT[, .BY$x, by=x]`. This is now fixed. Closes **#5415**. Thanks to Stephane Vernede for the bug report. @@ -825,7 +825,7 @@ 18. `merge(x, y, all=TRUE)` error when `x` is empty data.table is now fixed. Closes [#24](https://github.com/Rdatatable/data.table/issues/24). Thanks to Garrett See for filing the report. - 19. Implementing #5249 closes bug [#26](https://github.com/Rdatatable/data.table/issues/26), a case where rbind gave error when binding with empty data.tables. Thanks to Roger for [reporting on SO](http://stackoverflow.com/q/23216033/559784). + 19. Implementing #5249 closes bug [#26](https://github.com/Rdatatable/data.table/issues/26), a case where rbind gave error when binding with empty data.tables. Thanks to Roger for [reporting on SO](https://stackoverflow.com/q/23216033/559784). 20. Fixed a segfault during grouping with assignment by reference, ex: `DT[, LHS := RHS, by=.]`, where length(RHS) > group size (.N). Closes [#25](https://github.com/Rdatatable/data.table/issues/25). Thanks to Zachary Long for reporting on datatable-help mailing list. @@ -841,11 +841,11 @@ 25. FR # 2551 implemented lenience in warning messages when columns are coerced with `DT[, LHS := RHS]`, when `length(RHS)==1`. But this was very lenient; e.g., `DT[, a := "bla"]`, where `a` is a logical column should get a warning. This is now fixed such that only very obvious cases coerce silently; e.g., `DT[, a := 1]` where `a` is `integer`. Closes [#35](https://github.com/Rdatatable/data.table/issues/35). Thanks to Michele Carriero and John Laing for reporting. - 26. 
`dcast.data.table` provides better error message when `fun.aggregate` is specified but it returns length != 1. Closes [#693](https://github.com/Rdatatable/data.table/issues/693). Thanks to Trevor Alexander for reporting [here on SO](http://stackoverflow.com/questions/24152733/undocumented-error-in-dcast-data-table). + 26. `dcast.data.table` provides a better error message when `fun.aggregate` is specified but it returns length != 1. Closes [#693](https://github.com/Rdatatable/data.table/issues/693). Thanks to Trevor Alexander for reporting [here on SO](https://stackoverflow.com/questions/24152733/undocumented-error-in-dcast-data-table). 27. `dcast.data.table` tries to preserve attributes wherever possible, except when `value.var` is a `factor` (or ordered factor). For `factor` types, the casted columns will be coerced to type `character` thereby losing the `levels` attribute. Closes [#688](https://github.com/Rdatatable/data.table/issues/688). Thanks to juancentro for reporting. - 28. `melt` now returns friendly error when `meaure.vars` are not in data instead of segfault. Closes [#699](https://github.com/Rdatatable/data.table/issues/688). Thanks to vsalmendra for [this post on SO](http://stackoverflow.com/q/24326797/559784) and the subsequent bug report. + 28. `melt` now returns a friendly error when `measure.vars` are not in the data instead of a segfault. Closes [#699](https://github.com/Rdatatable/data.table/issues/688). Thanks to vsalmendra for [this post on SO](https://stackoverflow.com/q/24326797/559784) and the subsequent bug report. 29. `DT[, list(m1 = eval(expr1), m2=eval(expr2)), by=val]` where `expr1` and `expr2` are constructed using `parse(text=.)` now works instead of resulting in error. Closes [#472](https://github.com/Rdatatable/data.table/issues/472). Thanks to Benjamin Barnes for reporting with a nice reproducible example. @@ -855,17 +855,17 @@ 32. `DT[, list(list(.)), by=.]` and `DT[, col := list(list(.)), by=.]` now return correct results in R >= 3.1.0. The bug was due to a welcome change in R 3.1.0 where `list(.)` no longer copies. Closes [#481](https://github.com/Rdatatable/data.table/issues/481). Also thanks to KrishnaPG for filing [#728](https://github.com/Rdatatable/data.table/issues/728). - 33. `dcast.data.table` handles `fun.aggregate` argument properly when called from within a function that accepts `fun.aggregate` argument and passes to `dcast.data.table()`. Closes [#713](https://github.com/Rdatatable/data.table/issues/713). Thanks to mathematicalcoffee for reporting [here](http://stackoverflow.com/q/24542976/559784) on SO. + 33. `dcast.data.table` handles `fun.aggregate` argument properly when called from within a function that accepts `fun.aggregate` argument and passes to `dcast.data.table()`. Closes [#713](https://github.com/Rdatatable/data.table/issues/713). Thanks to mathematicalcoffee for reporting [here](https://stackoverflow.com/q/24542976/559784) on SO. 34. `dcast.data.table` now returns a friendly error when fun.aggregate value for missing combinations is 0-length, and 'fill' argument is not provided. Closes [#715](https://github.com/Rdatatable/data.table/issues/715) 35. `rbind/rbindlist` binds in the same order of occurrence also when binding tables with duplicate names along with 'fill=TRUE' (previously, it grouped all duplicate columns together). This was the underlying reason for [#725](https://github.com/Rdatatable/data.table/issues/715). Thanks to Stefan Fritsch for the report with a nice reproducible example and discussion. - 36. 
`setDT` now provides a friendly error when attempted to change a variable to data.table by reference whose binding is locked (usually when the variable is within a package, ex: CO2). Closes [#475](https://github.com/Rdatatable/data.table/issues/475). Thanks to David Arenburg for filing the report [here](http://stackoverflow.com/questions/23361080/error-in-setdt-from-data-table-package) on SO. + 36. `setDT` now provides a friendly error when attempted to change a variable to data.table by reference whose binding is locked (usually when the variable is within a package, ex: CO2). Closes [#475](https://github.com/Rdatatable/data.table/issues/475). Thanks to David Arenburg for filing the report [here](https://stackoverflow.com/questions/23361080/error-in-setdt-from-data-table-package) on SO. 37. `X[!Y]` where `X` and `Y` are both data.tables ignores 'allow.cartesian' argument, and rightly so because a not-join (or anti-join) cannot exceed nrow(x). Thanks to @fedyakov for spotting this. Closes [#698](https://github.com/Rdatatable/data.table/issues/698). - 38. `as.data.table.matrix` does not convert strings to factors by default. `data.table` likes and prefers using character vectors to factors. Closes [#745](https://github.com/Rdatatable/data.table/issues/698). Thanks to @fpinter for reporting the issue on the github issue tracker and to vijay for reporting [here](http://stackoverflow.com/questions/17691050/data-table-still-converts-strings-to-factors) on SO. + 38. `as.data.table.matrix` does not convert strings to factors by default. `data.table` likes and prefers using character vectors to factors. Closes [#745](https://github.com/Rdatatable/data.table/issues/698). Thanks to @fpinter for reporting the issue on the github issue tracker and to vijay for reporting [here](https://stackoverflow.com/questions/17691050/data-table-still-converts-strings-to-factors) on SO. 39. Joins of the form `x[y[z]]` resulted in duplicate names when all `x`, `y` and `z` had the same column names as non-key columns. This is now fixed. Closes [#471](https://github.com/Rdatatable/data.table/issues/471). Thanks to Christian Sigg for the nice reproducible example. @@ -900,7 +900,7 @@ 3. `?duplicated.data.table` explained that `by=NULL` or `by=FALSE` would use all columns, however `by=FALSE` resulted in error. `by=FALSE` is removed from help and `duplicated` returns an error when `by=TRUE/FALSE` now. Closes [#38](https://github.com/Rdatatable/data.table/issues/38). - 4. More info about distinguishing small numbers from 0.0 in v1.9.2+ is [here](http://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2). + 4. More info about distinguishing small numbers from 0.0 in v1.9.2+ is [here](https://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2). 5. `?dcast.data.table` now explains how the names are generated for the columns that are being casted. Closes **#5676**. @@ -910,9 +910,9 @@ `?setorder` (with alias `?order` and `?forder`). Closes [#478](https://github.com/Rdatatable/data.table/issues/478) and also [#704](https://github.com/Rdatatable/data.table/issues/704). Thanks to Christian Wolf for the report. - 8. Added tests (1351.1 and 1351.2) to catch any future regressions on particular case of binary search based subset reported [here](http://stackoverflow.com/q/24729001/559784) on SO. Thanks to Scott for the post. The regression was contained to v1.9.2 AFAICT. 
Closes [#734](https://github.com/Rdatatable/data.table/issues/704). + 8. Added tests (1351.1 and 1351.2) to catch any future regressions on particular case of binary search based subset reported [here](https://stackoverflow.com/q/24729001/559784) on SO. Thanks to Scott for the post. The regression was contained to v1.9.2 AFAICT. Closes [#734](https://github.com/Rdatatable/data.table/issues/704). - 9. Added an `.onUnload` method to unload `data.table`'s shared object properly. Since the name of the shared object is 'datatable.so' and not 'data.table.so', 'detach' fails to unload correctly. This was the reason for the issue reported [here](http://stackoverflow.com/questions/23498804/load-detach-re-load-anomaly) on SO. Closes [#474](https://github.com/Rdatatable/data.table/issues/474). Thanks to Matthew Plourde for reporting. + 9. Added an `.onUnload` method to unload `data.table`'s shared object properly. Since the name of the shared object is 'datatable.so' and not 'data.table.so', 'detach' fails to unload correctly. This was the reason for the issue reported [here](https://stackoverflow.com/questions/23498804/load-detach-re-load-anomaly) on SO. Closes [#474](https://github.com/Rdatatable/data.table/issues/474). Thanks to Matthew Plourde for reporting. 10. Updated `BugReports` link in DESCRIPTION. Thanks to @chrsigg for reporting. Closes [#754](https://github.com/Rdatatable/data.table/issues/754). @@ -922,7 +922,7 @@ 13. Clarified `.I` in `?data.table`. Closes [#510](https://github.com/Rdatatable/data.table/issues/510). Thanks to Gabor for reporting. - 14. Moved `?copy` to its own help page, and documented that `dt_names <- copy(names(DT))` is necessary for `dt_names` to be not modified by reference as a result of updating `DT` by reference (e.g. adding a new column by reference). Closes [#512](https://github.com/Rdatatable/data.table/issues/512). Thanks to Zach for [this SO question](http://stackoverflow.com/q/15913417/559784) and user1971988 for [this SO question](http://stackoverflow.com/q/18662715/559784). + 14. Moved `?copy` to its own help page, and documented that `dt_names <- copy(names(DT))` is necessary for `dt_names` to be not modified by reference as a result of updating `DT` by reference (e.g. adding a new column by reference). Closes [#512](https://github.com/Rdatatable/data.table/issues/512). Thanks to Zach for [this SO question](https://stackoverflow.com/q/15913417/559784) and user1971988 for [this SO question](https://stackoverflow.com/q/18662715/559784). 15. `address(x)` doesn't increment `NAM()` value when `x` is a vector. Using the object as argument to a non-primitive function is sufficient to increment its reference. Closes #824. Thanks to @tarakc02 for the [question on twitter](https://twitter.com/tarakc02/status/513796515026837504) and hint from Hadley. @@ -947,7 +947,7 @@ > Reminder: bmerge allows the rolling join feature: forwards, backwards, limited and nearest. - 3. Sorting (`setkey` and ad-hoc `by=`) is faster and scales better on randomly ordered data and now also adapts to almost sorted data. The remaining comparison sorts have been removed. We use a combination of counting sort and forwards radix (MSD) for all types including double, character and integers with range>100,000; forwards not backwards through columns. This was inspired by [Terdiman](http://codercorner.com/RadixSortRevisited.htm) and [Herf's](http://stereopsis.com/radix.html) (LSD) radix approach for floating point : + 3. 
Sorting (`setkey` and ad-hoc `by=`) is faster and scales better on randomly ordered data and now also adapts to almost sorted data. The remaining comparison sorts have been removed. We use a combination of counting sort and forwards radix (MSD) for all types including double, character and integers with range>100,000; forwards not backwards through columns. This was inspired by [Terdiman](https://codercorner.com/RadixSortRevisited.htm) and [Herf's](https://stereopsis.com/radix.html) (LSD) radix approach for floating point : 4. `unique` and `duplicated` methods for `data.table` are significantly faster especially for type numeric (i.e. double), and type integer where range > 100,000 or contains negatives. @@ -978,7 +978,7 @@ 14. fread now understand system commands; e.g., `fread("grep blah file.txt")`. - 15. `as.data.table` method for `table()` implemented, #4848. Thanks to Frank Pinter for suggesting [here on SO](http://stackoverflow.com/questions/18390947/data-table-of-table-is-very-different-from-data-frame-of-table). + 15. `as.data.table` method for `table()` implemented, #4848. Thanks to Frank Pinter for suggesting [here on SO](https://stackoverflow.com/questions/18390947/data-table-of-table-is-very-different-from-data-frame-of-table). 16. `as.data.table` methods added for integer, numeric, character, logical, factor, ordered and Date. @@ -990,7 +990,7 @@ set(DT, i=3:5, j="newCol", 5L) # same ``` - 19. eval will now be evaluated anywhere in a `j`-expression as long as it has just one argument, #4677. Will still need to use `.SD` as environment in complex cases. Also fixes bug [here on SO](http://stackoverflow.com/a/19054962/817778). + 19. eval will now be evaluated anywhere in a `j`-expression as long as it has just one argument, #4677. Will still need to use `.SD` as environment in complex cases. Also fixes bug [here on SO](https://stackoverflow.com/a/19054962/817778). 20. `!` at the head of the expression will no longer trigger a not-join if the expression is logical, #4650. Thanks to Arunkumar Srinivasan for reporting. @@ -1006,7 +1006,7 @@ 26. `rbind` now relies exclusively on `rbindlist` to bind `data.tables` together. This makes rbind'ing factors faster, #2115. - 27. `DT[, as.factor('x'), with=FALSE]` where `x` is a column in `DT` is now equivalent to `DT[, "x", with=FALSE]` instead of ending up with an error, #4867. Thanks to tresbot for reporting [here on SO](http://stackoverflow.com/questions/18525976/converting-multiple-data-table-columns-to-factors-in-r). + 27. `DT[, as.factor('x'), with=FALSE]` where `x` is a column in `DT` is now equivalent to `DT[, "x", with=FALSE]` instead of ending up with an error, #4867. Thanks to tresbot for reporting [here on SO](https://stackoverflow.com/questions/18525976/converting-multiple-data-table-columns-to-factors-in-r). 28. `format.data.table` now understands 'formula' and displays embedded formulas as expected, FR #2591. @@ -1015,7 +1015,7 @@ DT[, { `:=`(...)}] # now works DT[, {`:=`(...)}, by=(...)] # now works ``` - Thanks to Alex for reporting [here on SO](http://stackoverflow.com/questions/14541959/expression-syntax-for-data-table-in-r). + Thanks to Alex for reporting [here on SO](https://stackoverflow.com/questions/14541959/expression-syntax-for-data-table-in-r). 30. `x[J(2), a]`, where `a` is the key column sees `a` in `j`, #2693 and FAQ 2.8. Also, `x[J(2)]` automatically names the columns from `i` using the key columns of `x`. 
In cases where the key columns of `x` and `i` are identical, i's columns can be referred to by using `i.name`; e.g., `x[J(2), i.a]`. Thanks to mnel and Gabor for the discussion on datatable-help. @@ -1044,9 +1044,9 @@ 36. `X[Y, col:=value]` when no match exists in the join is now caught early and X is simply returned. Also a message when `datatable.verbose` is TRUE is provided. In addition, if `col` is an existing column, since no update actually takes place, the key is now retained. Thanks to Frank Erickson for suggesting, #4996. - 37. New function `setDT()` takes a `list` (named and/or unnamed) or `data.frame` and changes its type by reference to `data.table`, *without any copy*. It also has a logical argument `giveNames` which is used for a list inputs. See `?setDT` examples for more. Based on [this FR on SO](http://stackoverflow.com/questions/20345022/convert-a-data-frame-to-a-data-table-without-copy/20346697#20346697). + 37. New function `setDT()` takes a `list` (named and/or unnamed) or `data.frame` and changes its type by reference to `data.table`, *without any copy*. It also has a logical argument `giveNames` which is used for a list inputs. See `?setDT` examples for more. Based on [this FR on SO](https://stackoverflow.com/questions/20345022/convert-a-data-frame-to-a-data-table-without-copy/20346697#20346697). - 38. `setnames(DT,"oldname","newname")` no longer complains about any duplicated column names in `DT` so long as oldname is unique and unambiguous. Thanks to Wet Feet for highlighting [here on SO](http://stackoverflow.com/questions/20942905/ignore-safety-check-when-using-setnames). + 38. `setnames(DT,"oldname","newname")` no longer complains about any duplicated column names in `DT` so long as oldname is unique and unambiguous. Thanks to Wet Feet for highlighting [here on SO](https://stackoverflow.com/questions/20942905/ignore-safety-check-when-using-setnames). 39. `last(x)` where `length(x)=0` now returns 'x' instead of an error, #5152. Thanks to Garrett See for reporting. @@ -1069,18 +1069,18 @@ ## BUG FIXES 1. Long outstanding (usually small) memory leak in grouping fixed, #2648. When the last group is smaller than the largest group, the difference in those sizes was not being released. Also evident in non-trivial aggregations where each group returns a different number of rows. Most users run a grouping - query once and will never have noticed these, but anyone looping calls to grouping (such as when running in parallel, or benchmarking) may have suffered. Tests added. Thanks to many including vc273 and Y T for reporting [here](http://stackoverflow.com/questions/20349159/memory-leak-in-data-table-grouped-assignment-by-reference) and [here](http://stackoverflow.com/questions/15651515/slow-memory-leak-in-data-table-when-returning-named-lists-in-j-trying-to-reshap) on SO. + query once and will never have noticed these, but anyone looping calls to grouping (such as when running in parallel, or benchmarking) may have suffered. Tests added. Thanks to many including vc273 and Y T for reporting [here](https://stackoverflow.com/questions/20349159/memory-leak-in-data-table-grouped-assignment-by-reference) and [here](https://stackoverflow.com/questions/15651515/slow-memory-leak-in-data-table-when-returning-named-lists-in-j-trying-to-reshap) on SO. - 2. In long running computations where data.table is called many times repetitively the following error could sometimes occur, #2647: *"Internal error: .internal.selfref prot is not itself an extptr"*. Now fixed. 
Thanks to theEricStone, StevieP and JasonB for (difficult) reproducible examples [here](http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r). + 2. In long running computations where data.table is called many times repetitively the following error could sometimes occur, #2647: *"Internal error: .internal.selfref prot is not itself an extptr"*. Now fixed. Thanks to theEricStone, StevieP and JasonB for (difficult) reproducible examples [here](https://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r). 3. If `fread` returns a data error (such as no closing quote on a quoted field) it now closes the file first rather than holding a lock open, a Windows only problem. - Thanks to nigmastar for reporting [here](http://stackoverflow.com/questions/18597123/fread-data-table-locks-files) and Carl Witthoft for the hint. Tests added. + Thanks to nigmastar for reporting [here](https://stackoverflow.com/questions/18597123/fread-data-table-locks-files) and Carl Witthoft for the hint. Tests added. 4. `DT[0,col:=value]` is now a helpful error rather than crash, #2754. Thanks to Ricardo Saporta for reporting. `DT[NA,col:=value]`'s error message has also been improved. Tests added. - 5. Assigning to the same column twice in the same query is now an error rather than a crash in some circumstances; e.g., `DT[,c("B","B"):=NULL]` (delete by reference the same column twice). Thanks to Ricardo (#2751) and matt_k (#2791) for reporting [here](http://stackoverflow.com/questions/16638484/remove-multiple-columns-from-data-table). Tests added. + 5. Assigning to the same column twice in the same query is now an error rather than a crash in some circumstances; e.g., `DT[,c("B","B"):=NULL]` (delete by reference the same column twice). Thanks to Ricardo (#2751) and matt_k (#2791) for reporting [here](https://stackoverflow.com/questions/16638484/remove-multiple-columns-from-data-table). Tests added. - 6. Crash and/or incorrect aggregate results with negative indexing in `i` is fixed, with a warning when the `abs(negative index) > nrow(DT)`, #2697. Thanks to Eduard Antonyan (eddi) for reporting [here](http://stackoverflow.com/questions/16046696/data-table-bug-causing-a-segfault-in-r). Tests added. + 6. Crash and/or incorrect aggregate results with negative indexing in `i` is fixed, with a warning when the `abs(negative index) > nrow(DT)`, #2697. Thanks to Eduard Antonyan (eddi) for reporting [here](https://stackoverflow.com/questions/16046696/data-table-bug-causing-a-segfault-in-r). Tests added. 7. `head()` and `tail()` handle negative `n` values correctly now, #2375. Thanks to Garrett See for reporting. Also it results in an error when `length(n) != 1`. Tests added. @@ -1108,7 +1108,7 @@ 17. Cartesian Join (`allow.cartesian = TRUE`) when both `x` and `i` are keyed and `length(key(x)) > length(key(i))` set resulting key incorrectly. This is now fixed, #2677. Tests added. Thanks to Shir Levkowitz for reporting. - 18. `:=` (assignment by reference) loses POSIXct or ITime attribute *while grouping* is now fixed, #2531. Tests added. Thanks to stat quant for reporting [here](http://stackoverflow.com/questions/14604820/why-does-this-posixct-or-itime-loses-its-format-attribute) and to Paul Murray for reporting [here](http://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table) on SO. + 18. `:=` (assignment by reference) loses POSIXct or ITime attribute *while grouping* is now fixed, #2531. 
Tests added. Thanks to stat quant for reporting [here](https://stackoverflow.com/questions/14604820/why-does-this-posixct-or-itime-loses-its-format-attribute) and to Paul Murray for reporting [here](https://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table) on SO. 19. `chmatch()` didn't always match non-ascii characters, #2538 and #4818. chmatch is used internally so `DT[is.na(päs), päs := 99L]` now works. Thanks to Benjamin Barnes and Stefan Fritsch for reporting. Tests added. @@ -1116,7 +1116,7 @@ 21. A special case of not-join and logical TRUE, `DT[!TRUE]`, gave an error whereas it should be identical to `DT[FALSE]`. Now fixed and tests added. Thanks once again to Ricardo Saporta for filing #4930. - 22. `X[Y,roll=-Inf,rollends=FALSE]` didn't roll the middle correctly if `Y` was keyed. It was ok if `Y` was unkeyed or rollends left as the default [c(TRUE,FALSE) when roll < 0]. Thanks to user338714 for reporting [here](http://stackoverflow.com/questions/18984179/roll-data-table-with-rollends). Tests added. + 22. `X[Y,roll=-Inf,rollends=FALSE]` didn't roll the middle correctly if `Y` was keyed. It was ok if `Y` was unkeyed or rollends left as the default [c(TRUE,FALSE) when roll < 0]. Thanks to user338714 for reporting [here](https://stackoverflow.com/questions/18984179/roll-data-table-with-rollends). Tests added. 23. Key is now retained after an order-preserving subset, #295. @@ -1124,15 +1124,15 @@ 25. Fixed bug #4927. Unusual column names in normal quotes, ex: `by=".Col"`, now works as expected in `by`. Thanks to Ricardo Saporta for reporting. - 26. `setkey` resulted in error when column names contained ",". This is now fixed. Thanks to Corone for reporting [here](http://stackoverflow.com/a/19166273/817778) on SO. + 26. `setkey` resulted in error when column names contained ",". This is now fixed. Thanks to Corone for reporting [here](https://stackoverflow.com/a/19166273/817778) on SO. 27. `rbind` when at least one argument was a data.table, but not the first, returned the rbind'd data.table with key. This is now fixed, #4995. Thanks to Frank Erickson for reporting. - 28. That `.SD` doesn't retain column's class is now fixed (#2530). Thanks to Corone for reporting [here](http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by). + 28. That `.SD` doesn't retain column's class is now fixed (#2530). Thanks to Corone for reporting [here](https://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by). 29. `eval(quote())` returned error when the quoted expression is a not-join, #4994. This is now fixed. Tests added. - 30. `DT[, lapply(.SD, function(), by=]` did not see columns of DT when optimisation is "on". This is now fixed, #2381. Tests added. Thanks to David F for reporting [here](http://stackoverflow.com/questions/13441868/data-table-and-stratified-means) on SO. + 30. `DT[, lapply(.SD, function(), by=]` did not see columns of DT when optimisation is "on". This is now fixed, #2381. Tests added. Thanks to David F for reporting [here](https://stackoverflow.com/questions/13441868/data-table-and-stratified-means) on SO. 31. #4959 - rbind'ing empty data.tables now works @@ -1140,7 +1140,7 @@ 33. Fixed bug #5007, `j` did not see variables declared within a local (function) environment properly. Now, `DT[, lapply(.SD, function(x) fun_const), by=x]` where "fun_const" is a local variable within a function works as expected. 
Thanks to Ricardo Saporta for catching this and providing a very nice reproducible example. - 34. Fixing #5007 also fixes #4957, where `.N` was not visible during `lapply(.SD, function(x) ...)` in `j`. Thanks to juba for noticing it [here](http://stackoverflow.com/questions/19094771/replace-values-in-each-column-based-on-conditions-according-to-groups-by-rows) on SO. + 34. Fixing #5007 also fixes #4957, where `.N` was not visible during `lapply(.SD, function(x) ...)` in `j`. Thanks to juba for noticing it [here](https://stackoverflow.com/questions/19094771/replace-values-in-each-column-based-on-conditions-according-to-groups-by-rows) on SO. 35. Fixed another case where function expressions were not constructed properly in `j`, while fixing #5007. `DT[, lapply(.SD, function(x) my_const), by=x]` now works as expected instead of ending up in an error. @@ -1175,7 +1175,7 @@ 48. Fixed a rare segfault that occurred on >250m rows (integer overflow during memory allocation); closes #5305. Thanks to Guenter J. Hitsch for reporting. - 49. `rbindlist` with at least one factor column along with the presence of at least one empty data.table resulted in segfault (or in linux/mac reported an error related to hash tables). This is now fixed, #5355. Thanks to Trevor Alexander for [reporting on SO](http://stackoverflow.com/questions/21591433/merging-really-not-that-large-data-tables-immediately-results-in-r-being-killed) (and mnel for filing the bug report): + 49. `rbindlist` with at least one factor column along with the presence of at least one empty data.table resulted in segfault (or in linux/mac reported an error related to hash tables). This is now fixed, #5355. Thanks to Trevor Alexander for [reporting on SO](https://stackoverflow.com/questions/21591433/merging-really-not-that-large-data-tables-immediately-results-in-r-being-killed) (and mnel for filing the bug report): 50. `CJ()` now orders character vectors in a locale consistent with `setkey`, #5375. Typically this affected whether upper case letters were ordered before lower case letters; they were by `setkey()` but not by `CJ()`. This difference started in v1.8.10 with the change "CJ() is 90% faster...", see NEWS below. Test added and avenues for differences closed off and nailed down, with no loss in performance. Many thanks to Malcolm Hawkes for reporting. @@ -1198,7 +1198,7 @@ 7. Gsee for reporting that `set()` and `:=` could no longer add columns by reference to an object that inherits from data.table; e.g., `class = c("myclass", data.table", "data.frame"))`, #5115. - 8. Clayton Stanley for reporting #5307 [here on SO](http://stackoverflow.com/questions/21437546/data-table-1-8-11-and-aggregation-issues). Aggregating logical types could give wrong results. + 8. Clayton Stanley for reporting #5307 [here on SO](https://stackoverflow.com/questions/21437546/data-table-1-8-11-and-aggregation-issues). Aggregating logical types could give wrong results. 9. New and very welcome ASAN and UBSAN checks on CRAN detected : * integer64 overflow in test 899 reading integers longer than apx 18 digits @@ -1244,14 +1244,14 @@ * "+" and "-" are now read as character rather than integer 0. Thanks to Alvaro Gonzalez and Roby Joehanes for reporting, #4814. - http://stackoverflow.com/questions/15388714/reading-strand-column-with-fread-data-table-package + https://stackoverflow.com/questions/15388714/reading-strand-column-with-fread-data-table-package * % progress console meter has been removed. 
The output was inconvenient in batch mode, log files and reports which don't handle \r. It was too difficult to detect where fread is being called from, plus, removing it speeds up fread a little by saving code inside the C for loop (which is why it wasn't made optional instead). Use your operating system's system monitor to confirm fread is progressing. Thanks to Baptiste for highlighting : - http://stackoverflow.com/questions/15370993/strange-output-from-fread-when-called-from-knitr + https://stackoverflow.com/questions/15370993/strange-output-from-fread-when-called-from-knitr * colClasses has been added. Same character vector format as read.csv (may be named or unnamed), but additionally may be type list. Type list enables setting ranges of columns by numeric position. @@ -1276,12 +1276,12 @@ such as a footer (the first line of which will be included in the warning message). * Now reads files that are open in Excel without having to close them first, #2661. And up to 5 attempts - are made every 250ms on Windows as recommended here : http://support.microsoft.com/kb/316609. + are made every 250ms on Windows as recommended here : https://support.microsoft.com/kb/316609. * "nan%" observed in output of fread(...,verbose=TRUE) timings are now 0% when fread takes 0.000 seconds. * An unintended 50,000 column limit in fread has been removed. Thanks to mpmorley for reporting. Test added. - http://stackoverflow.com/questions/18449997/fread-protection-stack-overflow-error + https://stackoverflow.com/questions/18449997/fread-protection-stack-overflow-error * unique() and duplicated() methods gain 'by' to allow testing for uniqueness using any subset of columns, not just the keyed columns (if keyed) or all columns (if not). By default by=key(dt) for backwards @@ -1298,13 +1298,13 @@ * New function address() returns the address in RAM of its argument. Sometimes useful in determining whether a value has been copied or not by R, programmatically. - http://stackoverflow.com/a/10913296/403310 + https://stackoverflow.com/a/10913296/403310 ## BUG FIXES * merge no longer returns spurious NA row(s) when y is empty and all.y=TRUE (or all=TRUE), #2633. Thanks to Vinicius Almendra for reporting. Test added. - http://stackoverflow.com/questions/15566250/merge-data-table-with-all-true-introduces-na-row-is-this-correct + https://stackoverflow.com/questions/15566250/merge-data-table-with-all-true-introduces-na-row-is-this-correct * rbind'ing data.tables containing duplicate, "" or NA column names now works, #2726 & #2384. Thanks to Garrett See and Arun Srinivasan for reporting. This also affected the printing of data.tables @@ -1322,11 +1322,11 @@ * Deleting a (0-length) factor column using :=NULL on an empty data.table now works, #4809. Thanks to Frank Pinter for reporting. Test added. - http://stackoverflow.com/questions/18089587/error-deleting-factor-column-in-empty-data-table + https://stackoverflow.com/questions/18089587/error-deleting-factor-column-in-empty-data-table * Writing FUN= in DT[,lapply(.SD,FUN=...),] now works, #4893. Thanks to Jan Wijffels for reporting and Arun for suggesting and testing a fix. Committed and test added. - http://stackoverflow.com/questions/18314757/why-cant-i-used-fun-in-lapply-when-grouping-by-using-data-table + https://stackoverflow.com/questions/18314757/why-cant-i-used-fun-in-lapply-when-grouping-by-using-data-table * The slowness of transform() on data.table has been fixed, #2599. But, please use :=. 
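For example, a minimal illustrative sketch of the recommended `:=` idiom versus `transform()` (the table and column names below are hypothetical, not taken from the NEWS item):

```R
library(data.table)
DT <- data.table(x = 1:5)

# transform() builds and returns a modified copy of the whole table
DT2 <- transform(DT, y = x * 2)

# := adds the column to DT by reference, without copying the table
DT[, y := x * 2]
```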
@@ -1335,7 +1335,7 @@ * mean() in j has been optimized since v1.8.2 (see NEWS below) but wasn't respecting na.rm=TRUE (the default). Many thanks to Colin Fang for reporting. Test added. - http://stackoverflow.com/questions/18571774/data-table-auto-remove-na-in-by-for-mean-function + https://stackoverflow.com/questions/18571774/data-table-auto-remove-na-in-by-for-mean-function USER VISIBLE CHANGES @@ -1352,11 +1352,11 @@ USER VISIBLE CHANGES * data.table(NULL) now prints "Null data.table (0 rows and 0 cols)" and FAQ 2.5 has been improved. Thanks to: - http://stackoverflow.com/questions/15317536/is-null-does-not-work-on-null-data-table-in-r-possible-bug + https://stackoverflow.com/questions/15317536/is-null-does-not-work-on-null-data-table-in-r-possible-bug * The braces {} have been removed from rollends's default, to solve a trace() problem. Thanks to Josh O'Brien's investigation : - http://stackoverflow.com/questions/15931801/why-does-trace-edit-true-not-work-when-data-table + https://stackoverflow.com/questions/15931801/why-does-trace-edit-true-not-work-when-data-table ## NOTES @@ -1365,7 +1365,7 @@ USER VISIBLE CHANGES * The default for datatable.alloccol has changed from max(100L, 2L*ncol(DT)) to max(100L, ncol(DT)+64L). And a pointer to ?truelength has been added to an error message as suggested and thanks to Roland : - http://stackoverflow.com/questions/15436356/potential-problems-from-over-allocating-truelength-more-than-1000-times + https://stackoverflow.com/questions/15436356/potential-problems-from-over-allocating-truelength-more-than-1000-times * For packages wishing to use data.table optionally (e.g. according to user of that package) and therefore not wishing to Depend on data.table (which is the normal determination of data.table-awareness via .Depends), @@ -1408,7 +1408,7 @@ USER VISIBLE CHANGES for when more than max(nrow(X),nrow(Y)) rows would be returned. The error message is verbose and includes advice. Thanks to a question by Nick Clark, help from user1935457 and a detailed reproducible crash report from JR. - http://stackoverflow.com/questions/14231737/greatest-n-per-group-reference-with-intervals-in-r-or-sql + https://stackoverflow.com/questions/14231737/greatest-n-per-group-reference-with-intervals-in-r-or-sql If the new option affects existing code you can set : options(datatable.allow.cartesian=TRUE) to restore the previous behaviour until you have time to address. @@ -1447,7 +1447,7 @@ USER VISIBLE CHANGES which should have been ISNA(x). Support for double in keyed joins is a relatively recent addition to data.table, but embarrassing all the same. Fixed and tests added. Many thanks to statquant for the thorough and reproducible report : - http://stackoverflow.com/questions/14076065/data-table-inner-outer-join-to-merge-with-na + https://stackoverflow.com/questions/14076065/data-table-inner-outer-join-to-merge-with-na * setnames() of all column names (such as setnames(DT,toupper(names(DT)))) failed on a keyed table where columns 1:length(key) were not the key. Fixed and test added. @@ -1465,7 +1465,7 @@ USER VISIBLE CHANGES to aid tracing root causes like this in future. Tests added. Many thanks to statquant for the reproducible example revealed by his interesting solution and to user1935457 for the assistance : - http://stackoverflow.com/a/14359701/403310 + https://stackoverflow.com/a/14359701/403310 * merge(...,all.y=TRUE) was 'setcolorder' error if a y column name included a space and there were rows in y not in x, #2555. 
The non syntactically valid column names @@ -1477,7 +1477,7 @@ USER VISIBLE CHANGES > DT # now prints DT ok > DT # used to have to type DT a second time to see it Many thanks to Charles, Joris Meys, and, Spacedman whose solution is now used - by data.table internally (http://stackoverflow.com/a/13606880/403310). + by data.table internally (https://stackoverflow.com/a/13606880/403310). ## NOTES @@ -1492,7 +1492,7 @@ USER VISIBLE CHANGES Please use data.table() directly instead of J(), outside DT[...]. * ?merge.data.table and FAQ 1.12 have been improved (#2457), and FAQ 2.24 added. - Thanks to dnlbrky for highlighting : http://stackoverflow.com/a/14164411/403310. + Thanks to dnlbrky for highlighting : https://stackoverflow.com/a/14164411/403310. * There are now 943 raw tests, as reported by test.data.table(). @@ -1578,12 +1578,12 @@ USER VISIBLE CHANGES colname = "newcol" DT[,colname:=f(),by=grp,with=FALSE] Thanks to Alex Chernyakov : - http://stackoverflow.com/questions/11745169/dynamic-column-names-in-data-table-r - http://stackoverflow.com/questions/11680579/assign-multiple-columns-using-in-data-table-by-group + https://stackoverflow.com/questions/11745169/dynamic-column-names-in-data-table-r + https://stackoverflow.com/questions/11680579/assign-multiple-columns-using-in-data-table-by-group * .GRP is a new symbol available to j. Value 1 for the first group, 2 for the 2nd, etc. Thanks to Josh O'Brien for the suggestion : - http://stackoverflow.com/questions/13018696/data-table-key-indices-or-group-counter + https://stackoverflow.com/questions/13018696/data-table-key-indices-or-group-counter * .I is a new symbol available to j. An integer vector length .N. It contains the group's row locations in DT. This implements FR#1962. @@ -1639,7 +1639,7 @@ USER VISIBLE CHANGES more than one row in x. Possibly in other similar circumstances too. The workaround was to set mult="first" which is no longer required. Test added. Thanks to a question and report from Alex Chernyakov : - http://stackoverflow.com/questions/12042779/time-of-data-table-join + https://stackoverflow.com/questions/12042779/time-of-data-table-join * Indexing columns of data.table with a logical vector and `with=FALSE` now works as expected, fixing #1797. Thanks to Mani Narayanan for reporting. Test added. @@ -1702,7 +1702,7 @@ USER VISIBLE CHANGES data.table:::cedta.override by using assignInNamespace(). Thanks to Zach Waite and Yihui Xie for investigating and providing reproducible examples : - http://stackoverflow.com/questions/13106018/data-table-error-when-used-through-knitr-gwidgetswww + https://stackoverflow.com/questions/13106018/data-table-error-when-used-through-knitr-gwidgetswww * Optimization of lapply when FUN is a character function name now works, #2212. DT[,lapply(.SD, "+", 1), by=id] # no longer an error @@ -1722,7 +1722,7 @@ USER VISIBLE CHANGES * A matrix RHS of := is now treated as vector, with warning if it has more than 1 column, #2333. Thanks to Alex Chernyakov for highlighting. Tests added. DT[,b:=scale(a)] # now works rather than creating an invalid column of type matrix - http://stackoverflow.com/questions/13076509/why-error-from-na-omit-after-running-scale-in-r-in-data-table + https://stackoverflow.com/questions/13076509/why-error-from-na-omit-after-running-scale-in-r-in-data-table * last() is now S3 generic for compatibility with xts::last, #2312. 
Strictly speaking, for speed, last(x) deals with vector, list and data.table inputs directly before falling back to @@ -1730,7 +1730,7 @@ USER VISIBLE CHANGES * DT[,lapply(.SD,sum)] in the case of no grouping now returns a data.table for consistency, rather than list, #2263. Thanks to Justin and mnel for highlighting. Existing test changed. - http://stackoverflow.com/a/12290443/403310 + https://stackoverflow.com/a/12290443/403310 * L[[2L]][,newcol:=] now works, where L is a list of data.table objects, #2204. Thanks to Melanie Bacou for reporting. Tests added. A warning is issued when the first column is added if L was created with @@ -1766,7 +1766,7 @@ USER VISIBLE CHANGES * DT[,LHS:=RHS,...] no longer prints DT. This implements #2128 "Try again to get DT[i,j:=value] to return invisibly". Thanks to discussion here : - http://stackoverflow.com/questions/11359553/how-to-suppress-output-when-using-in-r-data-table + https://stackoverflow.com/questions/11359553/how-to-suppress-output-when-using-in-r-data-table FAQs 2.21 and 2.22 have been updated. * DT[] now returns DT rather than an error that either i or j must be supplied. @@ -1781,11 +1781,11 @@ USER VISIBLE CHANGES changing it, #2282. This can be turned off using options(datatable.warnredundantby=FALSE) in case it occurs after upgrading, until those lines can be modified. Thanks to Ben Barnes for highlighting : - http://stackoverflow.com/a/12474211/403310 + https://stackoverflow.com/a/12474211/403310 * Description of how join columns are determined in X[Y] syntax has been further clarified in ?data.table. Thanks to Alex : - http://stackoverflow.com/questions/12920803/merge-data-table-when-the-number-of-key-columns-are-different + https://stackoverflow.com/questions/12920803/merge-data-table-when-the-number-of-key-columns-are-different * ?transform and example(transform) has been fixed and embelished, #2316. Thanks to Garrett See's suggestion. @@ -1881,7 +1881,7 @@ USER VISIBLE CHANGES * sapply(DT,class) gets a significant speed boost by avoiding a call to unclass() in as.list.data.table() called by lapply(DT,...), which copied the entire object. Thanks to a question by user1393348 on Stack Overflow, implementing #2000. - http://stackoverflow.com/questions/10584993/r-loop-over-columns-in-data-table + https://stackoverflow.com/questions/10584993/r-loop-over-columns-in-data-table * The J() alias is now deprecated outside DT[...], but will still work inside DT[...], as in DT[J(...)]. @@ -1953,7 +1953,7 @@ USER VISIBLE CHANGES * When grouping by i, if the first row of i had no match, .N was 1 rather than 0. Fixed and tests added. Thanks to a question by user1165199 on Stack Overflow : - http://stackoverflow.com/questions/10721517/count-number-of-times-data-is-in-another-dataframe-in-r + https://stackoverflow.com/questions/10721517/count-number-of-times-data-is-in-another-dataframe-in-r * All object attributes are now retained by grouping; e.g., tzone of POSIXct is no longer lost, fixing #1704. Test added. Thanks to Karl Ove Hufthammer for reporting. @@ -1971,11 +1971,11 @@ USER VISIBLE CHANGES * merge() with common names, and, all.y=TRUE (or all=TRUE) no longer returns an error, #2011. Tests added. Thanks to a question by Ina on Stack Overflow : - http://stackoverflow.com/questions/10618837/joining-two-partial-data-tables-keeping-all-x-and-all-y + https://stackoverflow.com/questions/10618837/joining-two-partial-data-tables-keeping-all-x-and-all-y * Removing or setting datatable.alloccol to NULL is no longer a memory leak, #2014. Tests added. 
Thanks to a question by Vanja on Stack Overflow : - http://stackoverflow.com/questions/10628371/r-importing-data-table-package-namespace-unexplainable-jump-in-memory-consumpt + https://stackoverflow.com/questions/10628371/r-importing-data-table-package-namespace-unexplainable-jump-in-memory-consumpt * DT[,2:=someval,with=FALSE] now changes column 2 even if column 1 has the same (duplicate) name, #2025. Thanks to Sean Creighton for reporting. Tests added. @@ -2116,12 +2116,12 @@ (author of Python package Pandas). Matching 1 million strings of which 600,000 are unique is now reduced from 16s to 0.5s, for example. Background here : - http://stackoverflow.com/questions/8991709/why-are-pandas-merges-in-python-faster-than-data-table-merges-in-r + https://stackoverflow.com/questions/8991709/why-are-pandas-merges-in-python-faster-than-data-table-merges-in-r * rbind.data.table() gains a use.names argument, by default TRUE. Set to FALSE to combine columns in order rather than by name. Thanks to a question by Zach on Stack Overflow : - http://stackoverflow.com/questions/9315258/aggregating-sub-totals-and-grand-totals-with-data-table + https://stackoverflow.com/questions/9315258/aggregating-sub-totals-and-grand-totals-with-data-table * New argument 'keyby'. An ad hoc by just as 'by' but with an additional setkey() on the by columns of the result, for convenience. Not to be confused with a diff --git a/NEWS.md b/NEWS.md index 1f2dbf88a4..28cbcc872f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -269,7 +269,7 @@ has a better chance of working on Mac. * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR. * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR. * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR. - * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (http://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. + * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. 
Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/leeper/rio) for the inspiration and @MichaelChirico for implementing. * `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses` it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. For example: ```R @@ -678,7 +678,7 @@ has a better chance of working on Mac. 4. `rbind` and `rbindlist` now retain the position of duplicate column names rather than grouping them together [#3373](https://github.com/Rdatatable/data.table/issues/3373), fill length 0 columns (including NULL) with NA with warning [#1871](https://github.com/Rdatatable/data.table/issues/1871), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524). Thanks to Kun Ren for the requests which arose when parsing JSON. -5. `rbindlist`'s `use.names=` default has changed from `FALSE` to `"check"`. This emits a message if the column names of each item are not identical and then proceeds as if `use.names=FALSE` for backwards compatibility; i.e., bind by column position not by column name. The `rbind` method for `data.table` already sets `use.names=TRUE` so this change affects `rbindlist` only and not `rbind.data.table`. To stack differently named columns together silently (the previous default behavior of `rbindlist`), it is now necessary to specify `use.names=FALSE` for clarity to readers of your code. Thanks to Clayton Stanley who first raised the issue [here](http://lists.r-forge.r-project.org/pipermail/datatable-help/2014-April/002480.html). To aid pinpointing the calls to `rbindlist` that need attention, the message can be turned to error using `options(datatable.rbindlist.check="error")`. This option also accepts `"warning"`, `"message"` and `"none"`. In this release the message is suppressed for default column names (`"V[0-9]+"`); the next release will emit the message for those too. In 6 months the default will be upgraded from message to warning. There are two slightly different messages. They are helpful, include context and point to this news item : +5. `rbindlist`'s `use.names=` default has changed from `FALSE` to `"check"`. This emits a message if the column names of each item are not identical and then proceeds as if `use.names=FALSE` for backwards compatibility; i.e., bind by column position not by column name. The `rbind` method for `data.table` already sets `use.names=TRUE` so this change affects `rbindlist` only and not `rbind.data.table`. To stack differently named columns together silently (the previous default behavior of `rbindlist`), it is now necessary to specify `use.names=FALSE` for clarity to readers of your code. Thanks to Clayton Stanley who first raised the issue [here](https://lists.r-forge.r-project.org/pipermail/datatable-help/2014-April/002480.html). To aid pinpointing the calls to `rbindlist` that need attention, the message can be turned to error using `options(datatable.rbindlist.check="error")`. This option also accepts `"warning"`, `"message"` and `"none"`. In this release the message is suppressed for default column names (`"V[0-9]+"`); the next release will emit the message for those too. In 6 months the default will be upgraded from message to warning. There are two slightly different messages. 
They are helpful, include context and point to this news item : ``` Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with @@ -866,7 +866,7 @@ has a better chance of working on Mac. ## NEW FEATURES -1. `fread()` can now read `.gz` and `.bz2` files directly: `fread("file.csv.gz")`, [#717](https://github.com/Rdatatable/data.table/issues/717) [#3058](https://github.com/Rdatatable/data.table/issues/3058). It uses `R.utils::decompressFile` to decompress to a `tempfile()` which is then read by `fread()` in the usual way. For greater speed on large-RAM servers, it is recommended to use ramdisk for temporary files by setting `TMPDIR` to `/dev/shm` before starting R; see `?tempdir`. The decompressed temporary file is removed as soon as `fread` completes even if there is an error reading the file. Reading a remote compressed file in one step will be supported in the next version; e.g. `fread("http://domain.org/file.csv.bz2")`. +1. `fread()` can now read `.gz` and `.bz2` files directly: `fread("file.csv.gz")`, [#717](https://github.com/Rdatatable/data.table/issues/717) [#3058](https://github.com/Rdatatable/data.table/issues/3058). It uses `R.utils::decompressFile` to decompress to a `tempfile()` which is then read by `fread()` in the usual way. For greater speed on large-RAM servers, it is recommended to use ramdisk for temporary files by setting `TMPDIR` to `/dev/shm` before starting R; see `?tempdir`. The decompressed temporary file is removed as soon as `fread` completes even if there is an error reading the file. Reading a remote compressed file in one step will be supported in the next version; e.g. `fread("https://domain.org/file.csv.bz2")`. ## BUG FIXES diff --git a/README.md b/README.md index c360b8c0f8..73bc371927 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) -[![depsy](http://depsy.org/api/package/cran/data.table/badge.svg)](http://depsy.org/package/r/data.table) +[![depsy](https://depsy.org/api/package/cran/data.table/badge.svg)](https://depsy.org/package/r/data.table) [![CRAN usage](https://jangorecki.gitlab.io/rdeps/data.table/CRAN_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![BioC usage](https://jangorecki.gitlab.io/rdeps/data.table/BioC_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![indirect usage](https://jangorecki.gitlab.io/rdeps/data.table/indirect_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) @@ -82,7 +82,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species] ## Community -`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](http://www.r-pkg.org/starred) R package on GitHub. If you need help, the `data.table` community is active on [StackOverflow](http://stackoverflow.com/questions/tagged/data.table). +`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://www.r-pkg.org/starred) R package on GitHub. 
If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). ### Stay up-to-date diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index 2e5989449e..03e464c360 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -180,9 +180,9 @@ See 'Details' in \code{\link{round}} for more information. G. Grothendieck and T. Petzoldt, ``Date and Time Classes in R,'' R News, vol. 4, no. 1, June 2004. - H. Wickham, http://gist.github.com/10238. + H. Wickham, https://gist.github.com/10238. - ISO 8601, http://www.iso.org/iso/home/standards/iso8601.htm + ISO 8601, https://www.iso.org/iso/home/standards/iso8601.htm } \author{ Tom Short, t.short@ieee.org } diff --git a/man/address.Rd b/man/address.Rd index 222e0993f2..258c0241f2 100644 --- a/man/address.Rd +++ b/man/address.Rd @@ -17,7 +17,7 @@ Sometimes useful in determining whether a value has been copied or not, programm A character vector length 1. } \references{ -\url{http://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) +\url{https://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) } \keyword{ data } diff --git a/man/assign.Rd b/man/assign.Rd index 4f2609c726..5cfc42b9a9 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -57,7 +57,7 @@ All of the following result in a friendly error (by design) : DT[, {col1 := 1L; col2 := 2L}] # Use the functional form, `:=`(), instead (see above). } -For additional resources, please read \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}. Also have a look at StackOverflow's \href{http://stackoverflow.com/search?q=\%5Bdata.table\%5D+reference}{data.table tag}. +For additional resources, please read \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}. Also have a look at StackOverflow's \href{https://stackoverflow.com/search?q=\%5Bdata.table\%5D+reference}{data.table tag}. \code{:=} in \code{j} can be combined with all types of \code{i} (such as binary search), and all types of \code{by}. This a one reason why \code{:=} has been implemented in \code{j}. Please see \href{../doc/datatable-reference-semantics}{\code{vignette("datatable-reference-semantics")}} and also \code{FAQ 2.16} for analogies to SQL. diff --git a/man/data.table.Rd b/man/data.table.Rd index 8c8e0d5375..59b6aae1e1 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -106,7 +106,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{or of the form \code{startcol:endcol}: e.g., \code{DT[, sum(a), by=x:z]}} } - \emph{Advanced:} When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}), \code{DT[i, j, by=.EACHI]} evaluates \code{j} for the groups in `DT` that each row in \code{i} joins to. That is, you can join (in \code{i}) and aggregate (in \code{j}) simultaneously. We call this \emph{grouping by each i}. See \href{http://stackoverflow.com/a/27004566/559784}{this StackOverflow answer} for a more detailed explanation until we \href{https://github.com/Rdatatable/data.table/issues/944}{roll out vignettes}. + \emph{Advanced:} When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}), \code{DT[i, j, by=.EACHI]} evaluates \code{j} for the groups in `DT` that each row in \code{i} joins to. That is, you can join (in \code{i}) and aggregate (in \code{j}) simultaneously. We call this \emph{grouping by each i}. 
See \href{https://stackoverflow.com/a/27004566/559784}{this StackOverflow answer} for a more detailed explanation until we \href{https://github.com/Rdatatable/data.table/issues/944}{roll out vignettes}. \emph{Advanced:} In the \code{X[Y, j]} form of grouping, the \code{j} expression sees variables in \code{X} first, then \code{Y}. We call this \emph{join inherited scope}. If the variable is not in \code{X} or \code{Y} then the calling frame is searched, its calling frame, and so on in the usual way up to and including the global environment.} @@ -221,7 +221,7 @@ See the \code{see also} section for the several other \emph{methods} that are av } \references{ \url{https://github.com/Rdatatable/data.table/wiki} (\code{data.table} homepage)\cr -\url{http://en.wikipedia.org/wiki/Binary_search} +\url{https://en.wikipedia.org/wiki/Binary_search} } \note{ If \code{keep.rownames} or \code{check.names} are supplied they must be written in full because \R does not allow partial argument names after `\code{\dots}`. For example, \code{data.table(DF, keep=TRUE)} will create a column called \code{"keep"} containing \code{TRUE} and this is correct behaviour; \code{data.table(DF, keep.rownames=TRUE)} was intended. diff --git a/man/foverlaps.Rd b/man/foverlaps.Rd index 0174209a84..e90d251338 100644 --- a/man/foverlaps.Rd +++ b/man/foverlaps.Rd @@ -155,7 +155,7 @@ foverlaps(x, y, by.x=c("seq", "start", "end"), } \seealso{ \code{\link{data.table}}, -\url{http://www.bioconductor.org/packages/release/bioc/html/IRanges.html}, +\url{https://www.bioconductor.org/packages/release/bioc/html/IRanges.html}, \code{\link{setNumericRounding}} } \keyword{ data } diff --git a/man/fread.Rd b/man/fread.Rd index 37a4d06b9e..f432e70c30 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -127,21 +127,21 @@ When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \c \references{ Background :\cr \url{https://cran.r-project.org/doc/manuals/R-data.html}\cr -\url{http://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r}\cr -\url{http://www.biostat.jhsph.edu/~rpeng/docs/R-large-tables.html}\cr -\url{http://www.cerebralmastication.com/2009/11/loading-big-data-into-r/}\cr -\url{http://stackoverflow.com/questions/9061736/faster-than-scan-with-rcpp}\cr -\url{http://stackoverflow.com/questions/415515/how-can-i-read-and-manipulate-csv-file-data-in-c}\cr -\url{http://stackoverflow.com/questions/9352887/strategies-for-reading-in-csv-files-in-pieces}\cr -\url{http://stackoverflow.com/questions/11782084/reading-in-large-text-files-in-r}\cr -\url{http://stackoverflow.com/questions/45972/mmap-vs-reading-blocks}\cr -\url{http://stackoverflow.com/questions/258091/when-should-i-use-mmap-for-file-access}\cr -\url{http://stackoverflow.com/a/9818473/403310}\cr -\url{http://stackoverflow.com/questions/9608950/reading-huge-files-using-memory-mapped-files} - -finagler = "to get or achieve by guile or manipulation" \url{http://dictionary.reference.com/browse/finagler} - -On YAML, see \url{http://yaml.org/}; on csvy, see \url{http://csvy.org/}. 
+\url{https://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r}\cr +\url{https://www.biostat.jhsph.edu/~rpeng/docs/R-large-tables.html}\cr +\url{https://www.cerebralmastication.com/2009/11/loading-big-data-into-r/}\cr +\url{https://stackoverflow.com/questions/9061736/faster-than-scan-with-rcpp}\cr +\url{https://stackoverflow.com/questions/415515/how-can-i-read-and-manipulate-csv-file-data-in-c}\cr +\url{https://stackoverflow.com/questions/9352887/strategies-for-reading-in-csv-files-in-pieces}\cr +\url{https://stackoverflow.com/questions/11782084/reading-in-large-text-files-in-r}\cr +\url{https://stackoverflow.com/questions/45972/mmap-vs-reading-blocks}\cr +\url{https://stackoverflow.com/questions/258091/when-should-i-use-mmap-for-file-access}\cr +\url{https://stackoverflow.com/a/9818473/403310}\cr +\url{https://stackoverflow.com/questions/9608950/reading-huge-files-using-memory-mapped-files} + +finagler = "to get or achieve by guile or manipulation" \url{https://dictionary.reference.com/browse/finagler} + +On YAML, see \url{https://yaml.org/}; on csvy, see \url{https://csvy.org/}. } \seealso{ \code{\link[utils:read.table]{read.csv}}, \code{\link[base:connections]{url}}, \code{\link[base:locales]{Sys.setlocale}}, \code{\link{setDTthreads}}, \code{\link{fwrite}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}} @@ -273,9 +273,9 @@ all(mapply(all.equal, DF, DT)) # Real data example (Airline data) -# http://stat-computing.org/dataexpo/2009/the-data.html +# https://stat-computing.org/dataexpo/2009/the-data.html -download.file("http://stat-computing.org/dataexpo/2009/2008.csv.bz2", +download.file("https://stat-computing.org/dataexpo/2009/2008.csv.bz2", destfile="2008.csv.bz2") # 109MB (compressed) @@ -303,10 +303,10 @@ table(sapply(DT,class)) # Reads URLs directly : -fread("http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat") +fread("https://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat") # Decompresses .gz and .bz2 automatically : -fread("http://stat-computing.org/dataexpo/2009/1987.csv.bz2") +fread("https://stat-computing.org/dataexpo/2009/1987.csv.bz2") } } diff --git a/man/fwrite.Rd b/man/fwrite.Rd index c785c74f41..6f7682b98b 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -61,7 +61,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{verbose}{Be chatty and report timings?} } \details{ -\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. +\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. 
Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. To save space, \code{fwrite} prefers to write wide numeric values in scientific notation -- e.g. \code{10000000000} takes up much more space than \code{1e+10}. Most file readers (e.g. \code{\link{fread}}) understand scientific notation, so there's no fidelity loss. Like in base R, users can control this by specifying the \code{scipen} argument, which follows the same rules as \code{\link[base]{options}('scipen')}. \code{fwrite} will see how much space a value will take to write in scientific vs. decimal notation, and will only write in scientific notation if the latter is more than \code{scipen} characters wider. For \code{10000000000}, then, \code{1e+10} will be written whenever \code{scipen<6}. @@ -88,7 +88,7 @@ The following fields will be written to the header of the file and surrounded by \code{\link{setDTthreads}}, \code{\link{fread}}, \code{\link[utils:write.table]{write.csv}}, \code{\link[utils:write.table]{write.table}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}} } \references{ - \url{http://howardhinnant.github.io/date_algorithms.html}\cr + \url{https://howardhinnant.github.io/date_algorithms.html}\cr \url{https://en.wikipedia.org/wiki/Decimal_mark} } \examples{ diff --git a/man/groupingsets.Rd b/man/groupingsets.Rd index d897a9984c..6ae02779c1 100644 --- a/man/groupingsets.Rd +++ b/man/groupingsets.Rd @@ -36,8 +36,8 @@ groupingsets(x, \dots) \seealso{ \code{\link{data.table}}, \code{\link{rbindlist}} } \references{ -\url{http://www.postgresql.org/docs/9.5/static/queries-table-expressions.html#QUERIES-GROUPING-SETS} -\url{http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE} +\url{https://www.postgresql.org/docs/9.5/static/queries-table-expressions.html#QUERIES-GROUPING-SETS} +\url{https://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE} } \examples{ n = 24L diff --git a/man/merge.Rd b/man/merge.Rd index 65f1f14948..fe0a03f7a0 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -73,7 +73,7 @@ comparison of \code{merge} and \code{x[y, \dots]}. If any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}, then this \code{data.table} method will add the \code{suffixes} to those column names. As of R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has -been proposed (see r-devel thread \href{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html}{here}) +been proposed (see r-devel thread \href{https://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html}{here}) which is looking likely to be accepted for a future version of R. 
} diff --git a/man/rleid.Rd b/man/rleid.Rd index bc21637b1c..837d4c4ea4 100644 --- a/man/rleid.Rd +++ b/man/rleid.Rd @@ -36,6 +36,6 @@ DT[, sum(value), by=.(grp, rleid(grp, prefix="grp"))] } \seealso{ - \code{\link{data.table}}, \code{\link{rowid}}, \url{http://stackoverflow.com/q/21421047/559784} + \code{\link{data.table}}, \code{\link{rowid}}, \url{https://stackoverflow.com/q/21421047/559784} } \keyword{ data } diff --git a/man/setDT.Rd b/man/setDT.Rd index aa2c1b775a..c00ba0f46a 100644 --- a/man/setDT.Rd +++ b/man/setDT.Rd @@ -4,7 +4,7 @@ \description{ In \code{data.table} parlance, all \code{set*} functions change their input \emph{by reference}. That is, no copy is made at all, other than temporary working memory, which is as large as one column. The only other \code{data.table} operator that modifies input by reference is \code{\link{:=}}. Check out the \code{See Also} section below for other \code{set*} functions \code{data.table} provides. - \code{setDT} converts lists (both named and unnamed) and data.frames to data.tables \emph{by reference}. This feature was requested on \href{http://stackoverflow.com/questions/20345022/convert-a-data-frame-to-a-data-table-without-copy}{Stackoverflow}. + \code{setDT} converts lists (both named and unnamed) and data.frames to data.tables \emph{by reference}. This feature was requested on \href{https://stackoverflow.com/questions/20345022/convert-a-data-frame-to-a-data-table-without-copy}{Stackoverflow}. } \usage{ diff --git a/man/setNumericRounding.Rd b/man/setNumericRounding.Rd index 9b397e1a27..87ce2256b5 100644 --- a/man/setNumericRounding.Rd +++ b/man/setNumericRounding.Rd @@ -37,9 +37,9 @@ precision). } \seealso{ \code{\link{datatable-optimize}}\cr -\url{http://en.wikipedia.org/wiki/Double-precision_floating-point_format}\cr -\url{http://en.wikipedia.org/wiki/Floating_point}\cr -\url{http://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html} +\url{https://en.wikipedia.org/wiki/Double-precision_floating-point_format}\cr +\url{https://en.wikipedia.org/wiki/Floating_point}\cr +\url{https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html} } \examples{ DT = data.table(a=seq(0,1,by=0.2),b=1:2, key="a") diff --git a/man/setkey.Rd b/man/setkey.Rd index daf10c83ad..387b62114d 100644 --- a/man/setkey.Rd +++ b/man/setkey.Rd @@ -93,7 +93,7 @@ reference. \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{http://stereopsis.com/radix.html}\cr + \url{https://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr \url{https://cran.r-project.org/package=bit64}\cr \url{https://github.com/Rdatatable/data.table/wiki/Presentations} diff --git a/man/setorder.Rd b/man/setorder.Rd index 6e7b598427..267e5b5ca0 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -105,7 +105,7 @@ If you require a copy, take a copy first (using \code{DT2 = copy(DT)}).
See \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{http://stereopsis.com/radix.html}\cr + \url{https://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr \url{https://medium.com/basecs/getting-to-the-root-of-sorting-with-radix-sort-f8e9240d4224} } diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index 84254f7790..e0cd81b343 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -82,13 +82,13 @@ This runs the `j` expression on the set of rows where the `i` expression is true As [highlighted above](#j-num), `j` in `[.data.table` is fundamentally different from `j` in `[.data.frame`. Even if something as simple as `DF[ , 1]` was changed in base R to return a data.frame rather than a vector, that would break existing code in many 1000's of CRAN packages and user code. As soon as we took the step to create a new class that inherited from data.frame, we had the opportunity to change a few things and we did. We want data.table to be slightly different and to work this way for more complicated syntax to work. There are other differences, too (see [below](#SmallerDiffs) ). -Furthermore, data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table can be passed to any package that only accepts `data.frame` and that package can use `[.data.frame` syntax on the data.table. See [this answer](http://stackoverflow.com/a/10529888/403310) for how that is achieved. +Furthermore, data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table can be passed to any package that only accepts `data.frame` and that package can use `[.data.frame` syntax on the data.table. See [this answer](https://stackoverflow.com/a/10529888/403310) for how that is achieved. We _have_ proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0 : > `unique()` and `match()` are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c. -A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](http://r.789695.n4.nabble.com/suggestion-how-to-use-memcpy-in-duplicate-c-td2019184.html). +A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://r.789695.n4.nabble.com/suggestion-how-to-use-memcpy-in-duplicate-c-td2019184.html). A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 : @@ -600,5 +600,5 @@ Sure. You're more likely to get a faster answer from the Issues page or Stack Ov ## I have created a package that uses data.table. How do I ensure my package is data.table-aware so that inheritance from `data.frame` works? -Please see [this answer](http://stackoverflow.com/a/10529888/403310). +Please see [this answer](https://stackoverflow.com/a/10529888/403310). 
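The `setDT.Rd` and `datatable-faq.Rmd` hunks above make three related claims: `set*` functions modify their input by reference, `j` is evaluated only on the rows where `i` is true, and a data.table still inherits from `data.frame`. A minimal sketch of all three, using toy columns invented purely for illustration:

```R
library(data.table)
DF = data.frame(ID = c("b","a","b"), x = 1:3)   # toy data for illustration only
setDT(DF)                    # converted to a data.table in place, by reference, as ?setDT describes
DF[ID == "b", sum(x)]        # j (sum(x)) runs only on the rows where i (ID == "b") is TRUE, returning 4L
inherits(DF, "data.frame")   # TRUE: a data.table is also a data.frame, so data.frame-only code still accepts it
```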
diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 85da6703a1..ddbb59024d 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -38,7 +38,7 @@ Briefly, if you are interested in reducing *programming* and *compute* time trem ## Data {#data} -In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the [Bureau of Transporation Statistics](http://www.transtats.bts.gov) for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. +In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the [Bureau of Transporation Statistics](https://www.transtats.bts.gov) for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. We can use `data.table`'s fast-and-friendly file reader `fread` to load `flights` directly as follows: diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index a89538fba2..4747a76fd2 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -67,7 +67,7 @@ DF$c <- 18:13 # (1) -- replace entire column DF$c[DF$ID == "b"] <- 15:13 # (2) -- subassign in column 'c' ``` -both (1) and (2) resulted in deep copy of the entire data.frame in versions of `R` versions `< 3.1`. [It copied more than once](http://stackoverflow.com/q/23898969/559784). To improve performance by avoiding these redundant copies, *data.table* utilised the [available but unused `:=` operator in R](http://stackoverflow.com/q/7033106/559784). +both (1) and (2) resulted in deep copy of the entire data.frame in versions of `R` versions `< 3.1`. [It copied more than once](https://stackoverflow.com/q/23898969/559784). To improve performance by avoiding these redundant copies, *data.table* utilised the [available but unused `:=` operator in R](https://stackoverflow.com/q/7033106/559784). Great performance improvements were made in `R v3.1` as a result of which only a *shallow* copy is made for (1) and not *deep* copy. However, for (2) still, the entire column is *deep* copied even in `R v3.1+`. This means the more columns one subassigns to in the *same query*, the more *deep* copies R does. diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 8f23c58554..3506bc785e 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -34,7 +34,7 @@ The simpler usage of `.SD` is for column subsetting (i.e., when `.SDcols` is spe ## Loading and Previewing Lahman Data -To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](http://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. 
+To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. ```{r download_lahman} load('Teams.RData') @@ -46,7 +46,7 @@ setDT(Pitching) Pitching ``` -Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](http://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. +Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. # `.SD` on Ungrouped Data From 441818c096882469fa3a16fd22cda9d7dfe707df Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Oct 2020 16:23:05 -0600 Subject: [PATCH 109/588] more URL fixes to pass --as-cran on win-builder --- NEWS.0.md | 2 +- README.md | 6 +++--- man/fread.Rd | 2 +- man/setkey.Rd | 2 +- man/setorder.Rd | 2 +- vignettes/datatable-sd-usage.Rmd | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/NEWS.0.md b/NEWS.0.md index 67f4f73af1..44db687e05 100644 --- a/NEWS.0.md +++ b/NEWS.0.md @@ -947,7 +947,7 @@ > Reminder: bmerge allows the rolling join feature: forwards, backwards, limited and nearest. - 3. Sorting (`setkey` and ad-hoc `by=`) is faster and scales better on randomly ordered data and now also adapts to almost sorted data. The remaining comparison sorts have been removed. We use a combination of counting sort and forwards radix (MSD) for all types including double, character and integers with range>100,000; forwards not backwards through columns. This was inspired by [Terdiman](https://codercorner.com/RadixSortRevisited.htm) and [Herf's](https://stereopsis.com/radix.html) (LSD) radix approach for floating point : + 3. Sorting (`setkey` and ad-hoc `by=`) is faster and scales better on randomly ordered data and now also adapts to almost sorted data. The remaining comparison sorts have been removed. We use a combination of counting sort and forwards radix (MSD) for all types including double, character and integers with range>100,000; forwards not backwards through columns. This was inspired by [Terdiman](https://codercorner.com/RadixSortRevisited.htm) and [Herf's](http://stereopsis.com/radix.html) (LSD) radix approach for floating point : 4. `unique` and `duplicated` methods for `data.table` are significantly faster especially for type numeric (i.e. double), and type integer where range > 100,000 or contains negatives. 
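The `datatable-reference-semantics.Rmd` hunk above contrasts base R's sub-assignment, which deep-copies the column, with data.table's `:=` operator. A minimal sketch of the `:=` form of (1) and (2), mirroring the `DF` example in that hunk (the values are illustrative):

```R
library(data.table)
DT = data.table(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18)  # illustrative values
DT[, c := 18:13]             # (1) replace the entire column 'c' by reference; no copy of DT is made
DT[ID == "b", c := 15:13]    # (2) sub-assign in column 'c' by reference, only where ID == "b"
```

Neither statement copies `DT` or the affected column, which is the point of the vignette's comparison.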
diff --git a/README.md b/README.md index 73bc371927..e58941a848 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) -[![depsy](https://depsy.org/api/package/cran/data.table/badge.svg)](https://depsy.org/package/r/data.table) +[![depsy](http://depsy.org/api/package/cran/data.table/badge.svg)](http://depsy.org/package/r/data.table) [![CRAN usage](https://jangorecki.gitlab.io/rdeps/data.table/CRAN_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![BioC usage](https://jangorecki.gitlab.io/rdeps/data.table/BioC_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![indirect usage](https://jangorecki.gitlab.io/rdeps/data.table/indirect_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) @@ -30,7 +30,7 @@ * fast and friendly delimited **file reader**: **[`?fread`](https://rdatatable.gitlab.io/data.table/reference/fread.html)**, see also [convenience features for _small_ data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread) * fast and feature rich delimited **file writer**: **[`?fwrite`](https://rdatatable.gitlab.io/data.table/reference/fwrite.html)** * low-level **parallelism**: many common operations are internally parallelized to use multiple CPU threads -* fast and scalable aggregations; e.g. 100GB in RAM (see [benchmarks](https://h2oai.github.io/db-benchmark) on up to **two billion rows**) +* fast and scalable aggregations; e.g. 100GB in RAM (see [benchmarks](https://h2oai.github.io/db-benchmark/) on up to **two billion rows**) * fast and feature rich joins: **ordered joins** (e.g. rolling forwards, backwards, nearest and limited staleness), **[overlapping range joins](https://github.com/Rdatatable/data.table/wiki/talks/EARL2014_OverlapRangeJoin_Arun.pdf)** (similar to `IRanges::findOverlaps`), **[non-equi joins](https://github.com/Rdatatable/data.table/wiki/talks/ArunSrinivasanUseR2016.pdf)** (i.e. 
joins using operators `>, >=, <, <=`), **aggregate on join** (`by=.EACHI`), **update on join** * fast add/update/delete columns **by reference** by group using no copies at all * fast and feature rich **reshaping** data: **[`?dcast`](https://rdatatable.gitlab.io/data.table/reference/dcast.data.table.html)** (_pivot/wider/spread_) and **[`?melt`](https://rdatatable.gitlab.io/data.table/reference/melt.data.table.html)** (_unpivot/longer/gather_) @@ -72,7 +72,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species] ### Getting started -* [Introduction to data.table](https://cloud.r-project.org/web/packages/data.table/vignettes/datatable-intro.html) vignette +* [Introduction to data.table](https://cran.r-project.org/package=data.table/vignettes/datatable-intro.html) vignette * [Getting started](https://github.com/Rdatatable/data.table/wiki/Getting-started) wiki page * [Examples](https://rdatatable.gitlab.io/data.table/reference/data.table.html#examples) produced by `example(data.table)` diff --git a/man/fread.Rd b/man/fread.Rd index f432e70c30..48eb9625bb 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -128,7 +128,7 @@ When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \c Background :\cr \url{https://cran.r-project.org/doc/manuals/R-data.html}\cr \url{https://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r}\cr -\url{https://www.biostat.jhsph.edu/~rpeng/docs/R-large-tables.html}\cr +\url{http://www.biostat.jhsph.edu/~rpeng/docs/R-large-tables.html}\cr \url{https://www.cerebralmastication.com/2009/11/loading-big-data-into-r/}\cr \url{https://stackoverflow.com/questions/9061736/faster-than-scan-with-rcpp}\cr \url{https://stackoverflow.com/questions/415515/how-can-i-read-and-manipulate-csv-file-data-in-c}\cr diff --git a/man/setkey.Rd b/man/setkey.Rd index 387b62114d..daf10c83ad 100644 --- a/man/setkey.Rd +++ b/man/setkey.Rd @@ -93,7 +93,7 @@ reference. \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{https://stereopsis.com/radix.html}\cr + \url{http://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr \url{https://cran.r-project.org/package=bit64}\cr \url{https://github.com/Rdatatable/data.table/wiki/Presentations} diff --git a/man/setorder.Rd b/man/setorder.Rd index 267e5b5ca0..6e7b598427 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -105,7 +105,7 @@ If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). See \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{https://stereopsis.com/radix.html}\cr + \url{http://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr \url{https://medium.com/basecs/getting-to-the-root-of-sorting-with-radix-sort-f8e9240d4224} } diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 3506bc785e..8f23c58554 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -34,7 +34,7 @@ The simpler usage of `.SD` is for column subsetting (i.e., when `.SDcols` is spe ## Loading and Previewing Lahman Data -To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://www.seanlahman.com/baseball-archive/statistics/). 
In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. +To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](http://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. ```{r download_lahman} load('Teams.RData') @@ -46,7 +46,7 @@ setDT(Pitching) Pitching ``` -Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. +Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](http://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. # `.SD` on Ungrouped Data From 4e687cd662fa10f79938ad551eb3e179dbf46e51 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Oct 2020 20:17:24 -0600 Subject: [PATCH 110/588] list columns in .SD copied too (#4763) --- NEWS.md | 4 ++-- inst/tests/tests.Rraw | 8 ++++++++ src/dogroups.c | 10 +++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 28cbcc872f..753c6aae19 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,9 +12,9 @@ 2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. -3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks. Thanks to Venelin Mitov for assistance in tracing the memory fault. +3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). 
Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks, and thanks to Venelin Mitov for assistance in tracing the memory fault. Thanks also to Hongyuan Jia and @ben-schwen for assistance in debugging the fix in dev to pass reverse dependency testing which highlighted, before release, that package `eplusr` would fail. Its good usage has been added to `data.table`'s test suite. -4. `fread("1.2\n", colClasses='integer')` would segfault when creating the warning message due to no column names in the output, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. +4. `fread("1.2\n", colClasses='integer')` (note no columns names in the data) would segfault when creating a warning message, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present however, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. 5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c981f3e739..f5f7136641 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17178,3 +17178,11 @@ DT = data.table(A=rnorm(100), B=rep(c("a","b"),c(47,53)), C=rnorm(20), D=1:20) test(2157, DT[, head(setorderv(.SD, "A")), by=B]$D, INT(18,6,3,8,9,6,12,17,18,5,20,4)) +# .SD list column itself needs copy, #4761 +DT = data.table(value=as.list(1:2), index=1:2) +test(2158.1, DT[, .(value = list(value)), index], + data.table(index=1:2, value=list( list(1L), list(2L) ))) +DT = data.table(value=as.list(1:6), index=rep(1:2, each=3)) +test(2158.2, DT[, by="index", list(value=list(value))], + data.table(index=1:2, value=list(as.list(1:3), as.list(4:6)))) + diff --git a/src/dogroups.c b/src/dogroups.c index 15962e697c..6ef4cb9815 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -46,9 +46,13 @@ static bool anySpecialStatic(SEXP x) { return false; if (isVectorAtomic(x)) return ALTREP(x) || TRUELENGTH(x)<0; - if (isNewList(x)) for (int i=0; i Date: Sat, 17 Oct 2020 00:01:58 -0600 Subject: [PATCH 111/588] spelling in NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 753c6aae19..810dd02961 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,7 +20,7 @@ ## NOTES -1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. 
Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accomodate `bit64`'s update. +1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. From 63632e6f55f1f5289c689edab37f6a69d2df25cf Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 19 Oct 2020 21:59:12 -0600 Subject: [PATCH 112/588] 1.13.2 on CRAN. Bump to 1.13.3 --- .dev/CRAN_Release.cmd | 41 ++++++++++++++++++++--------------------- DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 9 ++++++++- src/init.c | 2 +- 5 files changed, 33 insertions(+), 27 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 0e970df497..625c745d8e 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -217,15 +217,15 @@ test.data.table() install.packages("xml2") # to check the 150 URLs in NEWS.md under --as-cran below q("no") R CMD build . -R CMD check data.table_1.13.1.tar.gz --as-cran -R CMD INSTALL data.table_1.13.1.tar.gz --html +R CMD check data.table_1.13.3.tar.gz --as-cran +R CMD INSTALL data.table_1.13.3.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.13.1.tar.gz +R CMD check data.table_1.13.3.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -242,9 +242,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. 
-PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.1.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.3.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.1.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.3.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -276,7 +276,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.13.1.tar.gz +R310 CMD INSTALL ./data.table_1.13.3.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -288,7 +288,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.13.1.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.13.3.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -296,7 +296,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.13.1.tar.gz +R CMD check data.table_1.13.3.tar.gz ##################################################### @@ -346,8 +346,8 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.13.1.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.13.1.tar.gz +Rdevel-strict-gcc CMD INSTALL data.table_1.13.3.tar.gz +Rdevel-strict-clang CMD INSTALL data.table_1.13.3.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here Rdevel-strict-gcc Rdevel-strict-clang # repeat below with clang and gcc @@ -388,7 +388,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.13.1.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.13.3.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -426,7 +426,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.13.1.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.13.3.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -579,7 +579,7 @@ du -k inst/tests # 1.5MB before bzip2 inst/tests/*.Rraw # compress *.Rraw just for release to CRAN; do not commit compressed *.Rraw to git du -k inst/tests # 0.75MB after R CMD build . 
-R CMD check data.table_1.13.0.tar.gz --as-cran +R CMD check data.table_1.13.2.tar.gz --as-cran # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -587,11 +587,10 @@ Resubmit to winbuilder (R-release, R-devel and R-oldrelease) Submit to CRAN. Message template : ------------------------------------------------------------ Hello, -870 CRAN revdeps checked. -The following 3 are impacted and we have communicated with their maintainers: - expss nc memochange -All known issues resolved including clang-UBSAN additional issue. -Solaris is not resolved but this release will write more output upon that error so I can continue to trace that problem. +921 CRAN revdeps checked. None are impacted. +valgrind 'additional check' fixed. +Solaris not yet resolved. +POUMM's gcc-ASAN error resolved by this data.table update. Many thanks! Best, Matt ------------------------------------------------------------ @@ -610,8 +609,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.12.9 to 1.13.1, and 1.12.8 to 1.13.0 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.1 to 1.13.3, and 1.13.0 to 1.13.2 (e.g. in step 8 and 9 below) 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.13.0 on CRAN. Bump to 1.13.1" -9. Take sha from step 8 and run `git tag 1.13.0 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.13.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.13.2 on CRAN. Bump to 1.13.3" +9. 
Take sha from step 8 and run `git tag 1.13.2 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.13.2` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/DESCRIPTION b/DESCRIPTION index 3fb8269de6..06b7f33c3d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.13.1 +Version: 1.13.3 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/Makefile b/Makefile index d10a43fc9d..46061efeeb 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.13.1.tar.gz + $(RM) data.table_1.13.3.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.13.1.tar.gz + $(R) CMD INSTALL data.table_1.13.3.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.1.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.3.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index 810dd02961..5e3fb88de6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,12 +2,19 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.1](https://github.com/Rdatatable/data.table/milestone/19) (in development) +# data.table [v1.13.3](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES ## BUG FIXES +## NOTES + + +# data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) + +## BUG FIXES + 1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. 2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. 
diff --git a/src/init.c b/src/init.c index cc51f9f2da..1247c585b6 100644 --- a/src/init.c +++ b/src/init.c @@ -412,6 +412,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.13.1"))); + return(ScalarString(mkChar("1.13.3"))); } From 72124cfe0d327e158a09a385df60c53d731e0e38 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 31 Oct 2020 02:34:45 +0200 Subject: [PATCH 113/588] update unit test err msg, closes #4769 (#4770) --- NEWS.md | 4 +++- inst/tests/tests.Rraw | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5e3fb88de6..f7c46a17b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.3](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.13.3](https://github.com/Rdatatable/data.table/milestone/21) (in development) ## NEW FEATURES @@ -10,6 +10,8 @@ ## NOTES +1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). + # data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f5f7136641..e3753ebd2d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4195,7 +4195,7 @@ setNumericRounding(old_rounding) DT = data.table(id=INT(1,2,1), val1=3:1, val2=3:1, val3=list(2:3,4:6,7:10)) # 5380 test(1199.1, DT[, sum(.SD), by=id, .SDcols=2:3], data.table(id=1:2, V1=INT(8,4))) #875 made the .SD case work -test(1199.2, DT[, sum(.SD), by=id], error="only defined on a data frame with all numeric variables") +test(1199.2, DT[, sum(.SD), by=id], error="data.*frame.*numeric") # this is R's error message so use flexible string pattern to insulate from minor changes in R, #4769 test(1199.3, DT[, sum(val3), by=id], error="Type 'list' not supported by GForce sum [(]gsum[)]. Either.*or turn off") # Selection of columns, copy column to maintain the same as R <= 3.0.2, in Rdevel, for now From 7319ead97e8ab667ee7f70bf067e8b00ecc8e9bf Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 31 Oct 2020 10:16:44 -0600 Subject: [PATCH 114/588] Removed Depsy badge due to it not working on CRAN's display of README, although it was fine on GitHUb. Perhaps because it is http and not https. Moved it to be a link in community section which sits better there. 
--- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index e58941a848..01e7b4aa15 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) -[![depsy](http://depsy.org/api/package/cran/data.table/badge.svg)](http://depsy.org/package/r/data.table) [![CRAN usage](https://jangorecki.gitlab.io/rdeps/data.table/CRAN_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![BioC usage](https://jangorecki.gitlab.io/rdeps/data.table/BioC_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![indirect usage](https://jangorecki.gitlab.io/rdeps/data.table/indirect_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) @@ -82,7 +81,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species] ## Community -`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://www.r-pkg.org/starred) R package on GitHub. If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). +`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://www.r-pkg.org/starred) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). 
### Stay up-to-date From 5f9d075e67d4be29f134d6b030b41f593072d75b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 31 Oct 2020 12:32:55 -0400 Subject: [PATCH 115/588] new translations for upcoming release (#4765) --- .dev/CRAN_Release.cmd | 26 +- inst/po/en@quot/LC_MESSAGES/R-data.table.mo | Bin 91028 -> 91028 bytes inst/po/en@quot/LC_MESSAGES/data.table.mo | Bin 139702 -> 143608 bytes inst/po/zh_CN/LC_MESSAGES/data.table.mo | Bin 138861 -> 142657 bytes po/R-data.table.pot | 4 +- po/data.table.pot | 748 ++++++++++-------- po/zh_CN.po | 819 ++++++++++++-------- src/assign.c | 2 +- src/fifelse.c | 42 +- src/forder.c | 4 +- src/fwrite.c | 22 +- src/nafill.c | 2 +- src/snprintf.c | 39 +- 13 files changed, 983 insertions(+), 725 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 625c745d8e..902d66efd1 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -19,40 +19,28 @@ for MSG in error warning DTWARN DTPRINT Rprintf STOP Error; do for SRC_FILE in src/*.c; # no inplace -i in default mac sed - do sed -E "s/$MSG[(]("[^"]*")/$MSG(_(\1)/g" $SRC_FILE > out; + do sed -E "s/$MSG[(](\"[^\"]*\")/$MSG(_(\1)/g" $SRC_FILE > out; mv out $SRC_FILE; done done ## checking for other lines calling these that didn't get _()-wrapped for MSG in error warning DTWARN DTPRINT Rprintf STOP Error; - do grep -Er "\b$MSG[(]" src --include=*.c | grep -v _ | grep -Ev "(?://|[*]).*$MSG[(]" + do grep -Er "\b$MSG[(]" src --include=*.c | grep -v _ | grep -Ev "(?:\s*//|[*]).*$MSG[(]" +done ## similar, but a bit more manual to check snprintf usage ## look for char array that haven't been covered yet -grep -Er '"[^"]+"' src --include=*.c | grep -Fv '_("' | grep -v "#include" | grep -v '//.*".*"' +grep -Er '"[^"]+"' src --include=*.c | grep -Fv '_("' | \ + grep -Ev '#include|//.*".*"|strcmp|COERCE_ERROR|install\("|\{"' ## look for lines starting with a char array (likely continued from prev line & can be combined) grep -Er '^\s*"' src/*.c -## Now extract these messages with xgettext -cd src -xgettext --keyword=_ -o data.table.pot *.c -cd .. - ## (b) Update R template file: src/R-data.table.pot -## much easier, once the update_pkg_po bug is fixed -R --no-save -## a bug fix in R still hadn't made the 2019-12-12 release, -## so run the following to source the corrected function manually -STEM='https://raw.githubusercontent.com/wch/r-source/trunk/src/library/tools/R' -source(file.path(STEM, 'utils.R')) -source(file.path(STEM, 'xgettext.R')) -source(file.path(STEM, 'translations.R')) -## shouldn't be any errors from this... 
-update_pkg_po('.') -q() +## NB: this relies on R >= 4.0 to remove a bug in update_pkg_po +Rscript -e "tools::update_pkg_po('.')" # 2) Open a PR with the new templates & contact the translators # * zh_CN: diff --git a/inst/po/en@quot/LC_MESSAGES/R-data.table.mo b/inst/po/en@quot/LC_MESSAGES/R-data.table.mo index 3f5d477edd3f3e64e839d2e3a02cf8e9d889874a..95fcad832b98620b8aeff0f526f36f8b0a2704f6 100644 GIT binary patch delta 34 qcmbPooOQ}^)(!8DGaKs}PX2gYg4@tQ*U((S(8$WbbhF5bhS>o0feh;a delta 34 qcmbPooOQ}^)(!8DGaKnyPX2gYg4@7c*U((S(8S8vVzbDJhS>o1s|^4E diff --git a/inst/po/en@quot/LC_MESSAGES/data.table.mo b/inst/po/en@quot/LC_MESSAGES/data.table.mo index c89d3bf1dec90c2ef526210be91a9d37b71b8955..5bc184735c5016b975a7ca1891418f616a8dc13b 100644 GIT binary patch delta 22374 zcmeI3cX(CRwzt;`y-2U2?Es+@2p~uey@W2HNHK(DhlEs8Xrdbg0TodRRX{)x0TGlI zMNttDj)Dadkt!$xih|_;D$0F-YtCroobS8ObH984xX<%V&&WH*m~*W)%NTR6y*Fpa zp29cI77m;)lYfEhI8N1kj?)yDs_Zy1`5otKxZ?z2PwC<~VQ>!Y+7*RTwXfsonrg5H ztN}N}2-pi=fj-!}pW{@A!(dG~2{wky;Bfd}KPTYKMyb=^apK`-I15%9U~aGyj>g^% z3&Yw29fu}PW2nwP2%E!AFc_YL$?$im2PWRlz2RKg0d9w5;m=U*qXq>Wrv=L7L5|Y| zZiZ#y8K|EA1WUq@!H&bkIN|Uh>*yIJHh&J zFx34X3ZUqM#g@A)&%xIC3*Bq#J3)0Y4r=5Jp*pr7GTY8)uq6x~YDP8!F2SAv?}MfJ z(*q_#&CFJKCkz}#Q3r|-GYq%92eRbO1F$@N3kJhaVJN%`!(jy$azE?|wbrFZI1ZuY zG=aNdbJ!mK01L`LlFq=EupG~KdZN_BF%sSnXTsj_Jk*23aq5OWpn|P8ybC5mE!7$r z1Yd;>;OkJa@-?giuRvDODHg?a!zifw({LHjcP^l;#W7=)<1By;qiF*@-d+8@ zsST?o5Sg$u)Ko5l-Qhd10xUxAbb&t0Sjg-;>nsZ;Iu2uS;w;~X`LUZOIZiWpC)@(# zVMCYYNoM{vvK}dBO5)%?>;*6b4o@}t`XS81Zo_(1gD=BA@B^r+tTN7wG!iP^5@A!g z61IXLL(M>ubdyExVJ+ThhxET`I&S9trHl`PK;A&VIz6Z50d<74}GMQ{X z@C>{Y&gG)|a4*!1eh(R}Q+B-BFQ!5+bv}fIkJBYEf$2h73#-CACYqi$fLiM$i;$ z ztOxr}B>>@gcn^FH63|YmY36X52wj5b9rywMy$_qE%AU@4j9p}g@Pz*C((I^vyO=KOE{tE8q}V;ZZsTlVC;oAiN7M zfW)iw0iRNmR3F~S^PP1ln$pv-8f-epY_ zX&x}o@+8!K>#Q(1sM)Iy@uQlues)t&ld!IKu z+$=Z<`&HN#7GG;P5O%{}2o)12Evvl1O9=KbSOW%@q0~cp!#d8xRO|xlOblef?%3O5 z61)kU!i4pP&%jvh_hEn7bb|@TsW1+EE7XXCUo=x52Ni^CU^;^l@e(H%c5A4Se`;A^lRZ?RZWs?W)vv?B@H*7U`)oFc)Z9Uz}k z^uTMd4ea!a;|zdPpe{HBH3J2=n5pdxqp;^eP34cU5p1#5#7s2Qb#KB3@H*@WYi=_g zih=d8pMyPlzVklHL$KU-6ARf;>2wmdfW953V?$tP?5AKu_z7$ZZ^0q3X^xrdneY+p z!*C`X`YL+^JOc;A*w@Sqyaoemc;`-YD!mQ|VK;f*aVEm)mN%h-GmFS<4!6J%_%*x} z7T#rMpaX1=od{dO^-xQ87OI1#-()aw4D1bOzsdY-ghy~_35vXBf^INuhCLr9!TnHE z*5Yjw)gz(OXEp2ttL--RV_|FTY^WtT1fyWVcT7j(U=Qqt);|4Cz%(eahh3106c_;y zTb9{tqWW&w6903sCj1<#<2T{mFnphxp&76z_HL*S6n)p6rd^;0kPHHTIK2*bxU_)5ops~ZDvLgyS5&pD= z$6yL9_n{fkbQp`h36f<2r|d^2=whKd@+#~Gi+yZP#euLh_G&m8ehmq1C;G72YAb(Y z>SN$={A*z{3_4`zVm-LS3lKmSqF(~eMUKLwV7Z@?+=L)a1a`ONhE z3Ao86@8L4I>T~nj-Qk3Z>XA^w0zL|RoHN<+F`ViW9B58Bj|X`L!9)Q&3B~ z5o$?p!3>yu!36Kw3(Ws`92GB`ESL*BV80I|U{MxsFpPw~;0nl~olCGF9Xjw0(~te( zw|sKK9(UQi*H8P-ywpAq)uBU{WxqGUdmn6!fA;qQE<)LXLu*y<3TeTOGT;*I6IadJ zXZ=8msvRW0oNYhyksZ$d$-J_K{A{K?0(QjzI1GUY;XwF3R7`dJr}-Q)2x{h&11Qx| z9)Y^y8aN-ezs6<*55s3+)^+1Acf-6suZQ#SclyP&I|6mxgkQ~livMOl%svklBW-V* z*ZDD)bD?HFupLEH-H72SN|Rt)xCpj`pTJC5>JKF=9fkE^fm;jKK z%|Io`^-j+=um$!s*a+@`9dyetQJUeX$xFR190Gg7B~bozus$pwVD&(I8Xt@LSz<&kyf~|_Ujt@?Qt>JS}QGXI@AT^7cnTvyWVm||e;YTn~ z1?5{5ji^vDGtw}qpy>v+p^SrC^YxHHI)|ZdaICoNePa3vYNH9g!}U(dAyDn-z=z;= zYquz21~wAjhkt4bH{k6^$8l&X?<9%C;c!?2J^@qUGN>smR?3XDIaK?>mXl#k?3Iw9 za`r$iK~QNEOI2ZWY(LBgm%uV`d1*dXv_yFshobcYRL~SD<9Z*(szU`&JE#%HLB+~K zH~=1lT7vpzT`%u5EVn`JjQ@nuuwXgYJBH)oFzmHZ18@W7&6L-Ln(_|T9t&$=PqY42 za3}U5=`o0Odf516` z;;mIR(}Ai`4Z6boFdAxYy}W_J{DGhE1_a22dcwo zyn6EQI}~L>ts3Tn2&k#gg6hZu*b#1lE#ObED6Ho*G1DB@#vTHd*OOr-xCAQ5wm{9q zJ~$eF3AOch58*jH-`Rzt5uJh>saw-@q&B=Cdl=Ldz6RByuv(^lZ>X&|6>6h-5vn6c zp@Qx=sF9AW?K-1iB77PigoEMzq2&JqC|gmKk4@^hjt@Qx)sYQwEj$F{U}jyD_a~t0 
zufe9Uay^q~k?hLW$L>%HUkf=b6ZQ0cV? z&V=O}nYCUDHI=(z2s{H7&2D28LrtL`5D9h2EQU(MeNZ!Z6gGxGK`nVmSinqO`!Lsg zi;ais*-KF8^I@oL_y%fCD>N}P&;-U{N5U2^?}HFHUp6(da~^KSE)Z_^hwadZeG4i^ ze9g=ZMg&k4rK4dKTny*H8!!nz(%f|(hd)9ee2Awhh*rSDFz8O3hEPi}94h+fKyA@) z!(?~@YRP)FG-;g#J7Wj3QMAP#g}R_yD--3Tpmw|kP^Z>DsHyuLmV|}wGE-g=9>flZ z{oo%^GuF4YS(-^uGrG?55L6cZ2+59sQ@4%TV%tFN1F=vSJY>1r@&Ig&|2u2fZEHH% z2WsR~pgQ&vR1EBbilt)h%#?SAOR%G%g8etxL+AgH_GW69LoLPIP#w5tS*3%qJ3<}1 zepnv91T|xOpf;k@P+M*O2-o{Xq$Si^{{i#CY8_o?Hw=O8;g_(W@;{`LxnVt64!Z-? zUOfmZ|EIv-@C4L@>vlFbY!7Q-cY=4pXsD%n1_r@RQ2W9wP_c3nD(cU}LD20&{y&T| z1f@7U23NxKa08s#)l6AMms_IW1;q$nNYzQ z)YH_r>`DH&#c>}F4Eg`8te;!I5U_r^B1r+o7U< zOK;Q8*T-y7U7(hB4%CayD}BiSE-069)PjxsnzbAVHL?kCC|n8E!Czn`tk=)1eFoIh z%(XlYwNuvVZ}LlGY2&iTLj}$GyUhdh4|2U6!t?=dqQ2{q#Zs2K>nj-oaB3F>&P zd#_ou{;&@AIH(|847DWJbNxe2x-N#=7q-A7@C&#JZokiDQ|vI4-p@dFXfK=yFG2Q; zfHQiy3658wM%H+Q>!icQPY9>y=1~A_!GlDRv85wFBfXe%|a3DMgN5bmShErf7 z_Hn3~=swz<881P-GnS5Vox7F)!%>#PO>hB>jCGx7;1^Iq_<-N+)ob8A*cad!*dfmB zbZcP&V&`Xg8vn0jOl*7}Zw{^g39k2xNIaZQyC!4JYx&kh^1l-oG)po&QYO>`w?JjX z38<;N1qZ-8lg;b(OsK5c2lXaXB*pc<@t6urVIPG$Jug57-z}I5?@D#Oujki69X<`y z$bU_FN}4(6vteuO;BjV32SY{oTBxmdFI04&f$CV9bQ23bp&m2=YHDAGYWFMb02^nR zm)9hyGhriCtbLggU=2|EWtw1G1$CHw4i#iYv&@Fm8b)9zz(#OAEDeuBt?flv2KFEC zdf$?bhMJ+}P}y-7c84V=n2qQj*amw=0A(}Eai|_WHjyA?js6MeV+T(rsdf!RPTV@a1j1L3a#}fK6tYwVMMqq7Pv-tni3gl8I2)Z-PDGO>1|ZY5en`?)wE~UkNx> zXPGVZ0jLdTKkNxR&33&nARdEn$PaZ`&3M$X&||Lmi^yY8XF~bMP5W+eIQGL(L3`S= z$s9A#sW1}%VUOfrwI@u^;$aOcc0gspQET6ViryM?O$w6;K~s62d0!t1wE?{e)#FRnZjo*Fizuj( z|7O{6zIo7as3n^VHPz?gU>LH%WW#tk8hbkoC~Yb&M1cdLF8mnkf%O)d3o@Xlatl=Z ze2dKtw1Zu-r^6U{0Cs|nmY6I_fr^=>P}kjn8c_H%uG0~Yc!vB}elEwMDL)Cd_63%j z9jYIceF$o1Dl9WM8V0q2WLqAEiuTIO%~EuOL$Jp~P4z+e2(0w1*=L@GGq6KfkS3a% z4J*tcbPKBC11rsO>#lO0d$F_N6nMxo@;Nht{ZOZ8;nn6)X$Q42-4C@-%!hi{+YNPC z-GW-OR%=WL#{^J(ICeqx^fYV%8$558APPocZ-Cm-zJYq7sJ7P3*ke#>yA^tGOt24j z#tWwYZKxgf64VmZT4!G6MnH8u@CJ(V^eR-(?pkje#6xdQp*qlLgJBZvkG&CUE4>M8 z!saiUj^6_n%#Xu%@HEsCR@rFmSg3DQRza3L;JlBbsrnhJry(zyhQpycwiqf#zJLmz z8kON~=0=x_rr1!k64iol^QL=Cxf@5HpSIh`r zgi4=3pr)?P787I{P)oD|Y7ajGbyk$z>N+pM4p3Qf8R~)I+g$G#k)=@g8@t_n;#vzE zDDBGcFdIfks5E;XYU8*DHIVNsI7K4 z)M?giuX#t@4aZ@(*k|h3Ld8VIcg+&@fEv&gsO)+bsy_dI6TB_nGr8?9X6PI<(_s*IAFf z`!EB+PW;5Y%1u9FUTR-}I-WnY3{?BnOhqD8`aB2KvtzIyj66zKaHAz~6}EfKto>^E zJN8%bGkETE*ZTou=W+AOHt2+z@^P>u^}ApQ`~wbzzLWOze>zHK9FIV)`3k5F<#o6W zo`#Fz1Eue#9j%t=AS|B=`}8ySV)55*e^ou;isSvmj1@< ze9fRXq_I#ly973bTVXkP2G)l^!+>_a>fgHFKf|35l^)eDn=P_0)C@cc`@@~EJS_U1 z`AXFX^?+8e4jc$W;Y6sz>RG6b>|>~Qq2HVPMZmh)W4|Z=%c9J~Q5~*@L2#e755TXm zFF?h_!7Juax&pP64Z3PL2WqDFK}Gi+KbRRA4qIbSfl9+2Q11~pp=Pe*kL16iKJ!PD z#~Yx6a39o&&cX_?@K0t6Ye21SZ>Tk&1q--*;DFi(cKmFX=3}Ut%=b@oMl^?7;uQEW zTncrDR1aJ;BO3(o!!aIeL;476DjQxmHyj8{U{8lBa30hYx;M;7>qE7Vv`mAVv4v1+ z{2J5}6#m5=$2Fm5AdrZX4`l@`16M-@-wvo~{SGQmU|pvLcJ%DX{D?FehnhB^yYSpQq_ zP3&)>4zUe?n5aJnbzPxbtPB5Z4i&^Lh%8+n1-0Y>*hu-m9YxW44k~TRyM|q%rZ@xY zhD)Jtv>zUUKSJgA{-7W)xC-S9@}j*jyaWF#sC{BHEDh`B5Ar_Dwt+fDhr*UT-}w}! zDlA?g$P2pqP-`^+dT9bxKM&@Io1xZr8`Q`@gSB9Rf9OOlDCs-77JPd);U~RY>Dyhg;W!?H^PwJ8qo^5C1XLQvKy~afsP=EeOjxU!>Cj@Rfoz9b zl2cGiRjs(`NKY7zoeZ@!AHq@aRPjKN_mfD6JA%Ap^&|KI6`>`9ypPeVphi-tr0GZ_ zxE6a591DMi%6oq)Gv!mEHm223Z@nksM=-Q>koSwoHKCMy?OWI! 
z|6chA#0)r#P=;IQvv>-jPld7eqFl7KZS=AchyFHl(^ghQ=LN{=%yk>#Gsq9<90%UF z9J%*cjPESID{vz6lJY;dyhlYrYmA4l(m>?{{3_35KZ<;3Oy^1K{{>$sgj3vU4hut8 z%_+~es%W2#e2%XvJd3nJRJ1<@6MxPeQMI&WGDJabK+P zJ><_)4t)z&eHaHf!81rbM5T~{(=$&;6P`cgi)Qr_%EaHg1_SDIB-%RTZvZ|!gukhRI$ECu}d(hcI-dS1e~=@ zy;BkAuU4-_qciBQBdzQW*1&tLu4dW#L)nY;Kz7TZ`17>+341WoT@{oLwiDXGRw@48 zr12)}42OFW_7(nR4DWQv1JqBq_5Xx_;L9yp*st4%y^pTV<}LUI%+FxnqV6=dcD-zP z5%R9$l~16Ka%VObDs>QTHp>5j_;QdCbd{pn85J{=KyeNdm3iJ-P-BIk#wvA%&H>5N2IKH{oKLiHeqhU29 z8ObdJP-Y;J)}eQd`qWpp7rudgC;CRH*Mb7bWLt;o?9m@vmgDD>q%+dqe=PpKwrv%C zEjogVVb*aNzLKXSHE3KwQAIg!{SmZTU|i1cdD}l4w$W)p0c@~wEsYr!~kO=hAupW(iLY0=5dMC*(hfrqHHq_qqS=to0Iw5@x^lHtYrP!y`M$d0;8wcUkhgg*l z@wGRqvjcwz>|54ukNzvN9=kGi$I(@~8aR!xXQ4ky-9C5%-d@VnhEEx0{0mUZBYz-! zsAz~x=Zbc)I=`@)*BD8h8;xb zyzvj9!3(zLKJ;_wpU|KoT#ZEAi}iZ1bd9n%ZUf)I-VLiz_o&+2wy4fY^ly-RsejsD z6Ar&ZKEl5YiPa6~;CLP8Ey46|DvlrV|3ZB$^zP`7p{sPZlpc;$qWu)>J819KN0pRI z*xw>esQ=K`Rkv-u_`gA;=7`UW4_5!NBA56so_}0;8iESreKjb<5Mc{Q9 z23y%{_rXiFeH-6V=vx0W8kPS*QVwz&g7hZM$F zk2ddE{{c7_I|+Fj`4~G5S%t0=W7z{9!9Pzs@M4rh)Y`M*)I4>nzk=;Oe>283j1c5I zZW0d{Abrq7smp}lVXHihyy$6v{kfO=Rn$L%>u2;a$PDaa$UC;#0n35-CZZ2SdhrdF zQ;;?9g>xkrmBd*Zy9IV`sfxaiI`91+3_-t!ZzlTVwow=CB=n2aje*;(zmC1nYnI)t zKNpFAn}58sxIup!??7He?+r`Y*6(7EMy6r^fd3}?r-;fVw(+>OrG6^Y9#c%K&6~XBDpcjI%q=J|TZ*t65Vi=8(o=7Y*5t)OmM7ALZ zkdw#_q4}U$(vTU*Vq_a~06B?VMe_F~ z*pL=TPhIfh(D^g~NIq$$!B8QP~d7obc< z<|FHnH;`k^4(#5iTDcg%6p|t z`3m#U@11bJ_~^68}0BU12nM3*nOdoNeXp4>IK z`f;i>LRA&E)ak3sx>U{H)HArlIF2!&t;+4-v-s|`zCyP7aeS&*$(^igRkMqB3$9SR zn)!Y*ZwJO!%Wl&xIB=LMhU#^oBh-T{@t&xcU^UmvvV1G>5!0S;hV>QzC+F9yfqVFZbPN9E`BXfyIkK5Uv`Pe;6Oyk zUtgo<&TnUYntzqtFjH89CA)8rih=#obPr1ILVt(v_N)l5^RHRa5MS;)N$ym2!dH!ZXkn6E3ntAy$ z@e%QV&x-9$ysY?=y4$m&J+GU$XT{@9l@(jd_3l|SP%AXn7ayJBi}uAP#Krm3{VAEg zB!9}7%y?gBd~}L0I(7(b=5+e=R`!`ZaZ!sNZkfPe4{6ru-#;WZD>E%C(>Fdk zDa(KR{bQ1%GcsSvMJg(pfE{;h}%1Tc0rDP?K_NQm~GE;rA{v>~<-9PLwFkipa#<5vx8NLZDoi8TdACu^h_4%XI zlM=GS-zvAY-rxAqnjp+}0vbBoHjV(Zt!G%}*yp1_rh8 zc_HRY9X-|`lbPYG=bsqkPh)kWS)+txe|;|kv|brmX=$nHnf}-?=f8ASZt!RL{l56j z%(RR)O`0?w!#rh;4vR@mZgO{QbSD3ePVzUA7G~6hj101(NsI92;bDuWz2nZ^y7XM} z@*_L@8fc+2;|ZBqhNbk*XzQyR8rz0(*8MY`z3Pa#3F!$$9ZT&?8SGus$h)R*XK#wh zp3$k~erlSR;$hiOUau7Y_b1T7v~+()hX23)VOyWPKEFhE$jz#YIuvv_1pIMH(R4K- zMOy;7;g3#cL}?nV_qV7+IC@P?B-0_PCNPb*b>j}1shQD9QN(5Y_Mx$DeZ!)<_v+Vc zP)}Knu=@AT)drCOo%%*~>(^Bt?wpXEGy0;t#|??ijGB;;5|xpXmY$H3$!6mx@_9~N zbV8C}Q=Ic+VRuJ~LMc%R8BriSVY4!&~{no3{xMZ``y+&f2=}kG1~)R?cq!-?^3N zJlWgr`qy5bbF!a1IPCwlnP1G8(`cG|Jm`OXOV94N=EF+|v zqU7H<`zF7*-~3lM`?#D3iv)FXv!A+Nu99!`Bn}t7&`wBAPt>bVvOhUBeNxU_MT06A S|Gzd_M3d$@MJonvEczcv7qP|w delta 19344 zcmbu`cYGJc{_pYKB=k-~FA2R#2)&okdkr8pgixgk2}t#eAiWzfbV0h(TL5X&tAI#V zns_J&REiW8;lAEGli%TT?(eU=k287BXJ)od+1&)s%Uj8}A58AMo-tsK>o`tQf5!>K zE?FI?PJrXgB=93%9_~2BaVz$!Nro+`k>hBeOc;z=a08ad26z`kuw-M$$&D>BFAhW< zYc_Vk1C1S@Gm%X0CLA1B;RHj!l7vO@6xKqn$w}YJ zaf)FihT;^=kGrflP{$95aGZ=-5sP4B)bWP+$Y_U|)@|0SSdsEKHm=y(TwrGmqkKB* z!uBDZ=A6NDn5&Jsvk06=JOJBainfkZ52H~%^EH-5-*;qm0RitDi&&dtEy`ms6DFX# z>@*g@dsq_Fa`dBE12xvk+dB?Z$ti?;uozaw`KSt({ za}9OkA|1^EtD`1cEv$gi7>Ene4>w{_+=7}b*D(j)MMlB#@5HgOEvo(^F68{q4Kgb! 
z7|Ge^V7^Gk6L<3CL(JC2bope|g?@v1@E02=jWS(Y0#)7;M_??L#z&|w&)?N?Dq}s= zTp5dHIKQ)*%rLx*8q=s~)Aa*UlWQBQYi}Vp<2c>Sv-_CSW9SCT6I*a4;%9i*+z;xaol*7*Bi{XJIFnz~}fo7RK=-9j7L4z&7{< znN~ih!6@_6*n}>V;&1$p2LF6)#_AM9l$E&VXvaB%9dIn|@_xeOlX&GA^MJ`U);uvg zpoVA;2IDSNk6y*bnEF#DIJU!DTK}`j=s=gTH8vZ^a~)S;7Tk^%@H=GQILXE{A@L*B z9sPy*vFZf#fa!`Fs#~ZY%`(wU#@?8ncmkHfFR?7=cP^08WJop1oTw5uAdW#b+=JEd z5mv_%lg+b!0J4&tmDmz9Ofhq*8-@`tL>>1uYIVFq?f>3X681%31~T)=^uVt%BW9dt zx~KrwBJP64aXHq-@3B5+n{L{5#SO%ZP#4r>2FsQq+JQ5PukfQd@dr!{mCyamxHFFa z_fc?lwmCt2W{k#qB1Yj-EQbEfEOmJaq*tBI$Y?l~<~k17?CeMNSd)2XF2rLK;tQx% zm2bY8bdlJRcspw70vFK#x}!D=%n28v#_kR30A&}NWjGdff|J%vi_C!|u|4Grun|5- z7KT%EvFV}RScN#r7ml-A<)|BZV;$pL;y4VSlVqtGqnW6&+mEgAG1kU9Um7Q49pW>n zIgx3ZF#^Mh<1iR6U}1b_`dV!?pffuQ$sf z9y?NT2(@FL4dxEoV-4bQs0%rQg)zxSGsdC#G4V$>evY+>t8X$dt+7~%cn8+Rr&tWj zZRX-QztfeB9z1c_2=C)KEW3rrCVqpTV$H3lM~>H5B? zIgo&*@CEk8qTiSvi^n>|`*9d1-A(^DCNr9~p)Ng*74bP%z_2}Lta@V!;^SBk|Fm)V zUQ<61qbWa!x`49#jDxWm@ir`nN%ouXfhwpAkKFGwlWsNzRVcWN^)dgq<^uX*Z{n3$ z1k)cd7g`C`W6`J!o{8H31nL5l9W-;L85Sj;j%vRXbt89BJ(}5f$UJcRqPqM`)SX{N z9Vqayc`$Xrs>FLxSN=B+#;_yaYt&hZEs4_}HLE8I)ou}LvVCvk*I0?T^fB{p@bxFN zNDZ+zRz7a7Y&3pHd=V?-iW830A8(=VsQF3roEU-4iEm;nEP2XY$T+M+d>LC{?$c)W ze~810w;=P;=Y(>W1{B=EnizJ*{BjtB{fN(?dZg-ERt65i;@IV!X+Irxf#s`iwADM+NcXz?ud8$ihHbco>V0#BfZEciCRW?^qk%8Ba#!ZU=p*}wQjhWIh&BX;kakI@3C^E9!> z-PQP0aD@VmY4UsgkpRnKLtKqLInYy_N8I7xW(=R%QF}5J?kD43%e`EZ!kvUI67W@@+W0K#^0rR7}Y!7~kEgw0~Vtj@wpZ(Z; z^%i;JIG+)Ji)t6~)YQK~UD(t=%mwZ z%#gIg!o)LClW`}i{x)jA;Fsp*6N!0CF70dci%VJ54RpX(I33mQXVmdtp)RP<8}s*qW|*5e z8vU4m&KO%T4sTMi9P{Eh$Mu%cYOFw%9w#qh@(e)Qzk| z_1t4Div|2$Z#j0r9Gu@7LPmG=8R|}VVlh06_Qy4-17uBZ?yLrChbZd=tVX;J z)zuGCb0%pDbEnPFpLi^0#0jVin~$25=TLLwX$qddxyU3>>3Wl-AnHzo#| zT^*Rp_11GI>tfWC>;^{RBdm%oQ=2=TiMsFysGiM`#`J7qR2-3p-#&uL^syCFa2Ig` z7RNzpUGD+41T`dQkfrSWjhbAKc+6@49O=zaR7Q1mKMcjst%t3Dpn5i626MbdJ~A5n zkMIXvjMXtFqv?Sy*o^oYrolRauJ>)%3NsL&LiONvjKx=|t{xcVddqY*>iCDz8!FTV zB+X>%eW7FmC}@P5_03Ut9)tOCDXI$(+xiEn6Q|8=#ySKwt0PcdIvI6>rKk%&ZtE{$ z4&pRfO#33pbHwLFlBq$(NGyYUFeN@gO_qN!KL%%Yy>(j~vl2(6Cf7(*56r->xDJP4 z`fO%sCZKL)73v00qAvJ1?5y>lKfCF=v8X58OVkdTa=6~3wIXV)2cs@z32L$(M%`hd zoUYRa%i=8j93!xBF4y}^ABF9R|HKe%mD^lMEUx7IPCS_&SS{GB>lLUg-H%~-8@0TG zLR{x8cEWoYn#Wvdmb|8CDq|DM+oLXYHP*q?SQ7*DnYqyhwW_9}Z#wV33MJ>M>sGeJb#c>a6tRJ9yE?Gg>`?f5Fy0DKh3ob0k z`q#4BN-@x_n9%He)yfHxMsCJqH>UFah!`v3-e=7+Tcz z{ys1r_34$PnCnczi5P-G@A=H6Df^!5{ZV)>YPl^%b^T-1vpa8bGc;|mAMrrc32&g5 zmtUyq;v%RgXkXML`bX5HevW#u6)WL-f3)s^>Zt)fGU>=HLUr*v_2)87o>lV@1k+V{O46)D`}Wy7P>s&4txM zjd?57gJmhI%P-?Re1Yw77UNbAAE0`sbXhYLZBQ35)w;=g5ozyp{w9-&2DQqWE^CLH zjIpSf(|kONr%)%JUEaK0w&1tK38-iOunMk|R59v!doc)qKs}Q0V`t1*(e=I!2Vy3z z|2<^|c^e(u1b&4% ztC$`;ikh5JRbA&p&hLCnMqOU2nz_t2^ zbEQ}fGdJ4fFya}gxt1c_e3)cIpT@is86996_QH**uFqD}b@)1S8lh%={aU8ocC1f) z1vR#LYnu<1dRU9thxzazYAElbh9FHH*Lfeyp)PPn9oBzCGJ7e|a!FCwj7@&)ho~px zZfm}JW*twqzQ6$LyVZBSZ@}KTnRp?x#GPgh%$ym~(425S9;W^{mckj0eC7@gG%{m; z7qy&18k??cjrs(eg?bJgN4?$tK@CxXCT9KDLrt#!sIESpSl-ku!~UrCJ`peAc3h8B zea*}=sodPG+kvPqU4TPzKQ_gREzFot#1DuATDs049DsWC{*1c7H>f9S=~m{BBT+Xn z9W^(OpoZ2j!Yo%`8Jii2dUPH_UBF+cI}C1Z#xUI46IDMA)f1ahtKur^!IPwo>5&rF zHmEr;3R~bZ?1ayZKBrY%*Zb{t6Kbq;y>H$HpQ4`i_pl|FYUg@?Kpcm2i1V~}o%xu6 znp`bAm`CskY(u;kdt#=J<^eVelQ1Vv;&sZ;b#nO&3hQrUXY)Z(Fw*t@fuTB%rD4)8 z=JR=Sl=&1((bYU?8lg@&2{p?%qk8TlHpkS_=Igi@P|5eHzp?&d+_!e6C8&1-uUY85R&&F<}}3%ie6Zo$3G zd77enb^`j;@En&K!d+b+})6!^eQrlF_@)<)EUZ=s$e>HC=dYv6mt zqcH=nLk;ad%!md1y51k5D)wdltBYbO(DK-Ub@2}B0aLu6X*dKo5N|?VP*i_Cw`8)<8q(!cEeGq zx$>Jec%19~5vw`!*7P~s$Y>I#7;h#?71W(BL%r93M@_!s6U=fNja`TjVNonF(OhVA zTtK`4H5toKGWJ9rKOQwiCr~|`XR(sz@s0*2o>e?%)G4`8j 
z9z=~$@d?zb37lmP)D~+J&$OOLP3p{_nXa#idho=cdiFSKa%YOO-~S89d`dyy+2%Kw zRXBq96{_LTIj;8)46m^*@u1IL?@y{Hto7o}4IDzfgp$lP?~aP7+1>&5JeY+V(!Hpc z(yO_we~nd{dFBecVF>Xq)Rq2-dL0*bs*irl<=j@`bTCHX~kxdSd>G zc`x$st~$vY9X?0!TIVb-N~+4{)zqhK-W^_luhbEV}_T^5PD;;E6|1vXI z8lhIfWK{cYs0;rYHRc(Yn<40h>hi^?JHLoJjxXH`*ZUWawy4?qEvn1?SL*f1pWd-2 zu1DQT=2d3-^hO-Flf2-^L z`@-gJ=0ck#n8~~Xz4iZyj9!yDc9=O(+}aj%Q2q%9<08}}br0%g6}r=W1Mb0r#9_Nk z{gZg|TT%7SZq~nMZ5WyM*b~*|2QUj3+GAdm)v*up3Y>%)_L^0( z0C@;Hf1^G;Chj+{<7JqhxWc!ta}gV$mSxBRb3;8)lX=Vm*1w*3`zTPCzQzcwd(e#8 z0=!QAFHFga4jpoxHNYBCKF|#a(p&mpVQ9X4Bo8tS& zeavnSwCcF)ETtgbNi&9<@E_tQcnKe!a=pLfA3JS6wW7W=UFpLb)StvWnEH%)dxoOs z#%Rojaj2nOhk8C7$A$RNM`kXWNoV_)vj ze!~#V`;&QM)xuEX5vZ>G3iZG_ghBWSb)iYFnFm%0oJhO^HOoVZMaehX%D|gczusIeY9)W?l40B@w`r$Zui^8o5*U4-hYi>TS0`?l$kzF3iX8frNmMSTaP`Ni~H3)Czhhgy!` zpeEf#)D1ns%$V(8rU%2&r?KrwMq@q~leoMCPzN}A#|+KSsICmUYo6`3QFrXaF}Mmf z_C@cR=SU=~=XRln_9mvqjQ7nWJ`ZaDnfF=$A!K$?pdGGQ-=NlKfq$E>jzzsTr=#Y` zCG^Ks56m)4hni%0QIoPAYHp0cTsRXoN7keEzl`CS`k~LfT$(;K51cQs1{L>FZ>yre znkU%+>`puywc{P^iiLkO7d{2mwVP0r^O%kQKs~xs{cg&O;a=jlsCP_e-y<`-Yom6I zL5Kf+f!=I=Yz}Y|HMS2>@A*7W%%p6JS{hfrM|{@na3H54_u=Ac%^Zqx-_wDqqsfH=ns zGwVZ8cU}wg>HR;DjK+SBZSW21#Me<{^$2z6`CpnYZjU;_K-2}#L)EWB&Gu_H{u6T& zm-y2RX=BtRorWp#0OryA|1_EW6#R}_w?TiIzXy~?J;NKLdLR-t=A&>j-b9UUpTEsZ zXbP$)HlQx(BC37rSLWXr2B0qJ7t{^7|1cDs-zh>yW8E8dA+s?GccShv$7}N&PZ69& zJQ*V}=Nt2xJ`goIe?{%zipP#FWGJr0FYp7b>-u@idK2a)K7~Gg!~IT1%d4=TpZ5<8 z1MwblO@BXc5)}(DJG8|Xln+Op=oHq$$5<1~C-L*<#&FaHZo=`HG^w9A#M4kcxdrp! zxukyP-~XRdpvjXrnVtAZM<$*2#BeW>&ADqy+Neq{K}hFnM)wwjX)^cz<+K(9SmclFFgPxlvc0mhuH}9pE$a zJ*ewp%NpQ1+J)G-3UxzB^GUr)w zL+RWmEho?8-1}Ehvd%+N9OaEk->ZPF81;LJwS7d2A|0gc3}r9KGYU>aVm6;M!8TNs z&lEbTtOco?N#Yse@Q!h=l8>jI=76^GcHaur?XmgaO_ldElP!B>%SPGbR<&_7^Us-O z3s>955%@i2mr0sD+D6+pN&HPb*~Z=^F-6{&Zb^F#eJ|{><;iW^DEx`~NZaOPVt=0v z)K)JBZNHN8P`Mb7BF`mn3*x7~Tz)X2_wzhIhf*J5+x*McrLg*O!T%;7%Xx0nrV=Tu zZL22U_`js`Gtzx~F@-VM=4aDb525b(n=M<9FG&6z{0enTsJlTtmsEtLZ7uFH+5h=j zma=YdoxC95AKn8bD;2Y8mIJepv{fhZns@4ua*!qxx3+aR@vs`%Ef^=-Iv$!%9h+Bm zC-SfGJ^YBYpLAOlY~N}9w`GTGq)#a5>J^(G8k+i)AI2%T&(`tkcOFuvEeR>Fmoq

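For orientation while reading the `.ci/ci.R` hunks below, a simplified sketch of the new `r.ver()` helper, which maps GitLab CI job names onto R version labels (the input validation of the real helper is omitted here):

  r.ver = function(job) {
    v = strsplit(job, "-", fixed=TRUE)[[1L]][2L]   # second word of the job name
    switch(v,
      rel = "r-release", dev = "r-devel", old = "r-oldrel",
      paste0("r-", paste(strsplit(v, "")[[1L]], collapse=".")))  # e.g. "350" -> "r-3.5.0"
  }
  r.ver("test-rel-lin")        # "r-release"
  r.ver("test-350-cran-lin")   # "r-3.5.0"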
-    sprintf("<h2>%s</h2>", dcf[,"Title"]),
+    sprintf("<h2>%s: %s</h2>", pkg, dcf[,"Title"]),
     sprintf("<p>%s</p>
", dcf[,"Description"]), sprintf("", pkg), tbl, @@ -117,7 +165,48 @@ doc.copy <- function(repodir="bus/integration/cran"){ c(ans1, ans2) } -plat <- function(x) if (grepl("^.*win", x)) "Windows" else if (grepl("^.*osx", x)) "Mac OS X" else "Linux" +plat <- function(x) if (grepl("^.*win", x)) "Windows" else if (grepl("^.*mac", x)) "macOS" else "Linux" + +r.ver <- function(x) { + tmp = strsplit(x, "-", fixed=TRUE)[[1L]] + if (length(tmp) < 2L) stop("test job names must be test-[r.version]-...") + v = tmp[2L] + if (identical(v, "rel")) "r-release" + else if (identical(v, "dev")) "r-devel" + else if (identical(v, "old")) "r-oldrel" + else { + if (grepl("\\D", v)) stop("second word in test job name must be rel/dev/old or numbers of R version") + paste0("r-", paste(strsplit(v, "")[[1L]], collapse=".")) + } +} + +# this for now is constant but when we move to independent pipelines (commit, daily, weekly) those values can be different +pkg.version <- function(job, pkg) { + dcf = read.dcf(file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "DESCRIPTION")) + dcf[,"Version"] +} +pkg.revision <- function(job, pkg) { + dcf = read.dcf(file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "DESCRIPTION")) + if ("Revision" %in% colnames(dcf)) { + proj.url = Sys.getenv("CI_PROJECT_URL", "") + if (!nzchar(proj.url)) { + warning("pkg.revision was designed to be run on GLCI where CI_PROJECT_URL var is set, links to commits will not be produced for checks table") + substr(dcf[,"Revision"], 1, 7) + } else { + sprintf("%s", file.path(proj.url, "-", "commit", dcf[,"Revision"]), substr(dcf[,"Revision"], 1, 7)) + } + } else "" +} +pkg.flags <- function(job, pkg) { + cc = file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "cc") ## data.table style cc file + if (file.exists(cc)) { + d = readLines(cc) + w.cflags = substr(d, 1, 7)=="CFLAGS=" + if (sum(w.cflags)==1L) + return(sub("CFLAGS=", "", d[w.cflags], fixed=TRUE)) + } + "" +} check.copy <- function(job, repodir="bus/integration/cran"){ dir.create(job.checks<-file.path(repodir, "web", "checks", pkg<-"data.table", job), recursive=TRUE); @@ -146,6 +235,39 @@ check.copy <- function(job, repodir="bus/integration/cran"){ setNames(file.exists(file.path(job.checks, c(inst.check, routs))), c(inst.check, routs)) } +check.flavors <- function(jobs, repodir="bus/integration/cran") { + th = "" + tbl = sprintf( + "", + sub("test-", "", jobs, fixed=TRUE), + sapply(jobs, r.ver), + sapply(jobs, plat), + "", # "x86_64" + "", # "Debian GNU/Linux testing" + "", # "2x 8-core Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz" + "" # "GCC 10.2.0 (Debian 10.2.0-13)" + ) + file = file.path(repodir, "web/checks", "check_flavors.html") + writeLines(c( + "", + "", + "Package Check Flavors", + "", + "", + "", + "", + "

Package Check Flavors

", + sprintf("

Last updated on %s.

", format(Sys.time(), usetz=TRUE)), + "
<th>Flavor</th><th>R Version</th><th>OS Type</th><th>CPU Type</th><th>OS Info</th><th>CPU Info</th><th>Compilers</th>
<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>
", + "",th,"", + tbl, + "
", + "", + "" + ), file) + setNames(file.exists(file), file) +} + check.index <- function(pkg, jobs, repodir="bus/integration/cran") { status = function(x) if (grepl("^.*ERROR", x)) "ERROR" else if (grepl("^.*WARNING", x)) "WARNING" else if (grepl("^.*NOTE", x)) "NOTE" else if (grepl("^.*OK", x)) "OK" else NA_character_ test.files = function(job, files, trim.name=FALSE, trim.exts=0L, pkg="data.table") { @@ -186,30 +308,36 @@ check.index <- function(pkg, jobs, repodir="bus/integration/cran") { } memouts }) - tbl = sprintf("%s%sout%s%s%s", - sub("test-", "", jobs, fixed=TRUE), - sapply(jobs, plat), - pkg, jobs, - pkg, jobs, sapply(sapply(jobs, check.test, pkg="data.table"), status), - mapply(test.files, jobs, routs, trim.exts=2L), # 1st fail, 2nd Rout, keep just: tests_x64/main - mapply(test.files, jobs, memouts, trim.name=TRUE)) + th = "FlavorVersionRevisionInstallStatusFlagsRout.failMemtest" + tbl = sprintf( + "%s%s%sout%s%s%s%s", + sub("test-", "", jobs, fixed=TRUE), + sapply(jobs, pkg.version, pkg), + sapply(jobs, pkg.revision, pkg), + pkg, jobs, ## install + pkg, jobs, sapply(sapply(jobs, check.test, pkg="data.table"), status), ## check + sapply(jobs, pkg.flags, pkg), + mapply(test.files, jobs, routs, trim.exts=2L), # 1st fail, 2nd Rout, keep just: tests_x64/main + mapply(test.files, jobs, memouts, trim.name=TRUE) + ) file = file.path(repodir, "web/checks", sprintf("check_results_%s.html", pkg)) - writeLines(c("", - "", - sprintf("Package Check Results for Package %s", pkg), - "", - "", - "", - "", - sprintf("

Package Check Results for Package %s

", pkg, pkg), - sprintf("

Last updated on %s.

", format(Sys.time(), usetz=TRUE)), - sprintf("", pkg), - "", - tbl, - "
<tr><th>Test job</th><th>OS type</th><th>Install</th><th>Check</th><th>Rout.fail</th><th>Memtest</th></tr>
", - "", - ""), - file) + writeLines(c( + "", + "", + sprintf("Package Check Results for Package %s", pkg), + "", + "", + "", + "", + sprintf("

Package Check Results for Package %s

", pkg, pkg), + sprintf("

Last updated on %s.

", format(Sys.time(), usetz=TRUE)), + sprintf("", pkg), + "",th,"", + tbl, + "
", + "", + "" + ), file) setNames(file.exists(file), file) } diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a5b4845f8..02b1471f52 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. + R_REL_VERSION: "4.0" + R_DEVEL_VERSION: "4.1" + R_OLDREL_VERSION: "3.6" stages: - dependencies @@ -16,12 +19,12 @@ stages: .artifacts-template: &artifacts artifacts: - expire_in: 4 weeks + expire_in: 2 weeks when: always paths: - bus -mirror-packages: ## mirror all recursive dependencies, source and win.binary, of data.table suggests and integration suggests from inst/tests/tests-DESCRIPTION +mirror-packages: ## mirror all recursive dependencies, source and win.binary of data.table suggests from inst/tests/tests-DESCRIPTION stage: dependencies tags: - linux @@ -29,15 +32,26 @@ mirror-packages: ## mirror all recursive dependencies, source and win.binary, of cache: paths: - bus/$CI_BUILD_NAME/cran - variables: - R_BIN_VERSION: "4.0" - R_DEVEL_BIN_VERSION: "4.1" script: - echo 'source(".ci/ci.R")' >> .Rprofile - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - - Rscript -e 'mirror.packages(dcf.dependencies(c("DESCRIPTION","inst/tests/tests-DESCRIPTION"), "all"), repos=c(Sys.getenv("CRAN_MIRROR"), dcf.repos("inst/tests/tests-DESCRIPTION")), repodir="bus/mirror-packages/cran")' + - Rscript -e 'mirror.packages(dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran")' - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works - - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_BIN_VERSION","R_DEVEL_BIN_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' + - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' + <<: *artifacts + +mirror-other-packages: ## mirror integration suggests from inst/tests/tests-DESCRIPTION + stage: dependencies + tags: + - linux + image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev + cache: + paths: + - bus/$CI_BUILD_NAME/cran + script: + - echo 'source(".ci/ci.R")' >> .Rprofile + - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib + - Rscript -e 'mirror.packages(dcf.dependencies("inst/tests/tests-DESCRIPTION", "all"), repos=c(Sys.getenv("CRAN_MIRROR"), dcf.repos("inst/tests/tests-DESCRIPTION")), repodir="bus/mirror-other-packages/cran")' <<: *artifacts build: ## build data.table sources as tar.gz archive @@ -45,8 +59,7 @@ build: ## build data.table sources as tar.gz archive tags: - linux image: registry.gitlab.com/jangorecki/dockerfiles/r-builder - dependencies: - - mirror-packages + needs: ["mirror-packages"] before_script: - Rscript -e 'install.packages("knitr", repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus @@ -80,20 +93,23 @@ build: ## build data.table sources as tar.gz archive - rm.exe $(ls.exe -1t 
data.table_*.tar.gz | head.exe -n 1) .test-mv-bin-win: &mv-bin-win - - mkdir.exe -p cran/bin/windows/contrib/$R_BIN_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_BIN_VERSION + - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-win.exe https://cloud.r-project.org/bin/windows/base/old/4.0.0/R-4.0.0-win.exe; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait -.test-install-r-dev-win: &install-r-dev-win + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.3/R-4.0.3-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait +.test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait +.test-install-r-oldrel-win: &install-r-oldrel-win + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/3.6.3/R-3.6.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + .test-install-rtools-win: &install-rtools-win - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait +.test-install-rtools35-win: &install-rtools35-win + - curl.exe -s -o ../Rtools35.exe https://cloud.r-project.org/bin/windows/Rtools/Rtools35.exe; Start-Process -FilePath ..\Rtools35.exe -ArgumentList "/VERYSILENT /DIR=C:\Rtools" -NoNewWindow -Wait .test-template: &test stage: test - dependencies: - - mirror-packages - - build + needs: ["mirror-packages","build"] <<: *artifacts .test-lin-template: &test-lin @@ -122,14 +138,15 @@ build: ## build data.table sources as tar.gz archive - windows - shared-windows -.test-osx-template: &test-osx - <<: *test - tags: - - macosx +#.test-mac-template: &test-mac +# <<: *test +# tags: +# - macosx test-rel-lin: ## most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common -Wunused-result <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-builder + needs: ["mirror-packages","mirror-other-packages","build"] variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" @@ -138,7 +155,7 @@ test-rel-lin: ## most comprehensive tests, force all suggests, also integration OPENBLAS_MAIN_FREE: "1" TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "TRUE" before_script: - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies(c("DESCRIPTION","inst/tests/tests-DESCRIPTION"), which="all"), quiet=TRUE)' + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies(c("DESCRIPTION","inst/tests/tests-DESCRIPTION"), which="all"), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' - *cp-src - rm -r bus - mkdir -p ~/.R @@ -225,7 +242,7 @@ test-350-cran-lin: ## R-3.5.0 on Linux, first R altrep version test-rel-win: ## R-release on Windows, test and build binaries <<: *test-win variables: - R_BIN_VERSION: "4.0" + R_VERSION: "$R_REL_VERSION" before_script: - *install-r-rel-win - *install-rtools-win @@ -244,9 +261,9 @@ test-rel-win: ## R-release on Windows, 
test and build binaries test-dev-win: ## R-devel on Windows <<: *test-win variables: - R_BIN_VERSION: "4.1" + R_VERSION: "$R_DEVEL_VERSION" before_script: - - *install-r-dev-win + - *install-r-devel-win - *install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - *install-deps-win @@ -260,9 +277,29 @@ test-dev-win: ## R-devel on Windows - *rm-src-win - *mv-bin-win -#test-rel-osx: ## R-release on MacOS, no macosx runner yet +test-old-win: ## R-oldrel on Windows + <<: *test-win + variables: + R_VERSION: "$R_OLDREL_VERSION" + before_script: + - *install-r-oldrel-win + - *install-rtools35-win + - $ENV:PATH = "C:\R\bin;C:\Rtools\bin;$ENV:PATH" + - *install-deps-win + - *cp-src-win + - rm.exe -r bus + script: + - *mv-src-win + - cd bus/$CI_BUILD_NAME + - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - *rm-src-win + - *mv-bin-win + +#test-rel-mac: ## R-release on MacOS, no macosx runner yet +# <<: *test-mac # variables: -# R_BIN_VERSION: "4.0" +# R_VERSION: "$R_REL_VERSION" # before_script: # - *install-deps # - *cp-src @@ -272,9 +309,10 @@ test-dev-win: ## R-devel on Windows # - cd bus/$CI_BUILD_NAME # - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) # - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) -# - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION -# - mv $(ls -1t data.table_*.tgz | head -n 1) cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION +# - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_VERSION +# - mv $(ls -1t data.table_*.tgz | head -n 1) cran/bin/macosx/el-capitan/contrib/$R_VERSION # - *rm-src +# - *mv-bin-mac integration: ## merging all artifacts to produce single R repository, documentation and website stage: integration @@ -283,22 +321,7 @@ integration: ## merging all artifacts to produce single R repository, documentat - linux only: - master - dependencies: - - mirror-packages - - build - - test-rel-lin - - test-rel-cran-lin - - test-dev-cran-lin - - test-rel-vanilla-lin - - test-310-cran-lin - - test-344-cran-lin - - test-350-cran-lin - - test-rel-win - - test-dev-win - #- test-rel-osx - variables: - R_BIN_VERSION: "4.0" - R_DEVEL_BIN_VERSION: "4.1" + needs: ["mirror-packages","build","test-rel-lin","test-rel-cran-lin","test-dev-cran-lin","test-rel-vanilla-lin","test-310-cran-lin","test-344-cran-lin","test-350-cran-lin","test-rel-win","test-dev-win","test-old-win"] script: - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' ## html manual, vignettes, repos, cran_web, cran_checks @@ -309,10 +332,12 @@ integration: ## merging all artifacts to produce single R repository, documentat - mkdir -p bus/$CI_BUILD_NAME ## delete any existing non-dev version of data.table - rm -f bus/mirror-packages/cran/src/contrib/data.table_*.tar.gz - - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_BIN_VERSION/data.table_*.zip - - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEVEL_BIN_VERSION/data.table_*.zip - #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_BIN_VERSION/data.table_*.tgz - #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEVEL_BIN_VERSION/data.table_*.tgz + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_REL_VERSION/data.table_*.zip + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEVEL_VERSION/data.table_*.zip + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_OLDREL_VERSION/data.table_*.zip + #- rm 
-f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_REL_VERSION/data.table_*.tgz + #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEVEL_VERSION/data.table_*.tgz + #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_OLDREL_VERSION/data.table_*.tgz ## merge mirror-packages and R devel packages - mv bus/mirror-packages/cran bus/$CI_BUILD_NAME/ ## publish package sources @@ -320,17 +345,21 @@ integration: ## merging all artifacts to produce single R repository, documentat - mv $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="source"), type="source", fields="Revision", addFiles=TRUE)' ## publish binaries - - Rscript -e 'move.bin("test-rel-win", Sys.getenv("R_BIN_VERSION"), os.type="windows")' - - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_BIN_VERSION"), os.type="windows", silent=TRUE)' - - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_BIN_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEVEL_BIN_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'move.bin("test-rel-osx", Sys.getenv("R_BIN_VERSION"), os.type="macosx")' - #- Rscript -e 'move.bin("test-dev-osx", Sys.getenv("R_DEVEL_BIN_VERSION"), os.type="macosx", silent=TRUE)' - #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_BIN_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_DEVEL_BIN_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + - Rscript -e 'move.bin("test-rel-win", Sys.getenv("R_REL_VERSION"), os.type="windows")' + - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_VERSION"), os.type="windows")' + - Rscript -e 'move.bin("test-old-win", Sys.getenv("R_OLDREL_VERSION"), os.type="windows")' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_REL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEVEL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_OLDREL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'move.bin("test-rel-mac", Sys.getenv("R_REL_VERSION"), os.type="macosx")' + #- Rscript -e 'move.bin("test-dev-mac", Sys.getenv("R_DEVEL_VERSION"), os.type="macosx")' + #- Rscript -e 'move.bin("test-old-mac", Sys.getenv("R_OLDREL_VERSION"), os.type="macosx")' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_REL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_DEVEL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", 
type="mac.binary.el-capitan", ver=Sys.getenv("R_OLDREL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' ## install all pkgs to render html and double check successful installation of all devel packages - - mkdir -p /tmp/opencran/library /tmp/opencran/doc/html - - Rscript -e 'install.packages("data.table", dependencies=TRUE, lib="/tmp/opencran/library", repos=file.path("file:",normalizePath("bus/integration/cran")), INSTALL_opts="--html", quiet=TRUE)' + - mkdir -p /tmp/opencran/library /tmp/opencran/doc/html ## reset R_LIBS_USER to re-install all with html because pkgdown image has pre installed curl knitr + - R_LIBS_USER="" Rscript -e 'install.packages("data.table", dependencies=TRUE, lib="/tmp/opencran/library", repos=file.path("file:",normalizePath("bus/integration/cran")), INSTALL_opts="--html", quiet=TRUE)' - Rscript -e 'packageVersion("data.table", lib.loc="/tmp/opencran/library")' ## CRAN style web/CRAN_web.css - wget -q -P bus/integration/cran/web https://cran.r-project.org/web/CRAN_web.css @@ -349,6 +378,8 @@ integration: ## merging all artifacts to produce single R repository, documentat - Rscript -e 'pdf.copy("data.table", "test-rel-lin")' ## web/checks/check_results_$pkg.html - Rscript -e 'check.index("data.table", names(test.jobs))' + ## web/checks/check_flavors.html + - Rscript -e 'check.flavors(names(test.jobs))' ## pkgdown merge - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ @@ -366,8 +397,10 @@ integration: ## merging all artifacts to produce single R repository, documentat image: docker services: - docker:dind - dependencies: - - build + needs: + - job: build + - job: integration + artifacts: false before_script: - sed "s/SRC_IMAGE_NAME/$SRC_IMAGE_NAME/" < .ci/Dockerfile.in > Dockerfile - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY @@ -420,13 +453,12 @@ pages: ## publish R repository, test jobs summaries, html documentation of all p only: - master image: ubuntu - dependencies: - - integration + needs: ["integration"] script: - mkdir -p public - cp -r bus/integration/cran/* public - cat public/src/contrib/PACKAGES artifacts: ## publish only when no failure - expire_in: 4 weeks + expire_in: 2 weeks paths: - public diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 55718e23b4..1bd91286f9 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -10,7 +10,7 @@ if (!"package:data.table" %in% search()) stop("data.table should be already atta test = data.table:::test INT = data.table:::INT -pkgs = c("ggplot2", "hexbin", "plyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "plm", "parallel") +pkgs = c("ggplot2", "hexbin", "plyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "parallel") if (any(duplicated(pkgs))) stop("Packages defined to be loaded for integration tests in 'inst/tests/other.Rraw' contains duplicates.") is.require = function(pkg) suppressWarnings(suppressMessages(isTRUE(require(pkg, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE)))) @@ -155,15 +155,6 @@ if (loaded[["knitr"]]) { test(11, kable(DT), output="x.*y.*1.*2") } -# for plm package -if (loaded[["plm"]]) { - set.seed(45L) - 
x = data.table(V1=c(1L,2L), V2=LETTERS[1:3], V3=round(rnorm(4),4), V4=1:12) - px = pdata.frame(x, index=c("V2", "V4"), drop.index=FALSE, row.names=TRUE) - test(12.1, class(as.data.table(px)), class(x)) - test(12.2, class(setDT(px)), class(x)) -} - if (loaded[["parallel"]]) { #1745 and #1727 if (.Platform$OS.type=="windows") { @@ -200,7 +191,3 @@ test(14.1, !inherits(res, 'error')) res = tryCatch(example('CJ', package='data.table', local=TRUE)) test(14.2, !inherits(res, 'error')) - -################################### -# Add new tests above this line # -################################### diff --git a/inst/tests/tests-DESCRIPTION b/inst/tests/tests-DESCRIPTION index edfadceb0b..35e3411ad0 100644 --- a/inst/tests/tests-DESCRIPTION +++ b/inst/tests/tests-DESCRIPTION @@ -4,4 +4,4 @@ Type: Backend Title: List of data.table dependencies used in integration tests Authors@R: c(person("data.table team", role = c("aut", "cre", "cph"), email="mattjdowle@gmail.com")) Description: Standalone R DESCRIPTION file which defines R dependencies for integration tests of data.table package. Integration tests are not part of main testing workflow. They are performed only when TEST_DATA_TABLE_WITH_OTHER_PACKAGES environment variable is set to true. This allows us to run those integration tests in our CI pipeline and not impose dependency chains on the user. -Suggests: ggplot2 (>= 0.9.0), reshape, hexbin, fastmatch, nlme, gdata, caret, plm, rmarkdown, parallel +Suggests: ggplot2 (>= 0.9.0), reshape, hexbin, fastmatch, nlme, gdata, caret, rmarkdown, parallel From 71e543375a3eff62d37dee85536a19c3fd896f38 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 3 Nov 2020 00:13:45 -0500 Subject: [PATCH 118/588] signature of memrecycle synced (#4791) --- src/data.table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.table.h b/src/data.table.h index c6e4bbf61e..d045d50f44 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -166,7 +166,7 @@ SEXP dt_na(SEXP x, SEXP cols); // assign.c SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); -const char *memrecycle(const SEXP target, const SEXP where, const int r, const int len, SEXP source, const int sourceStart, const int sourceLen, const int coln, const char *colname); +const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, From 68f6aca5414c63603e7b324d01e75d8dd2a4be91 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 3 Nov 2020 07:20:52 +0200 Subject: [PATCH 119/588] do not export as.Date.IDate anymore (#4782) --- .gitlab-ci.yml | 2 ++ DESCRIPTION | 2 +- NAMESPACE | 10 +--------- NEWS.md | 2 ++ 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 02b1471f52..8e53b950e1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -214,6 +214,8 @@ test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-d variables: _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" + _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" ## detects S3 method lookup found on search path #4777 + _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" before_script: - *install-deps - *cp-src diff --git a/DESCRIPTION b/DESCRIPTION index 06b7f33c3d..913b9e74a9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -63,7 +63,7 @@ Authors@R: c( 
person("Dirk","Eddelbuettel", role="ctb")) Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64, curl, R.utils, xts, nanotime, zoo, yaml, knitr, rmarkdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE diff --git a/NAMESPACE b/NAMESPACE index c2c095a1d8..57271aa04d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -151,8 +151,7 @@ S3method("+", IDate) S3method("-", IDate) S3method(as.character, ITime) S3method(as.data.frame, ITime) -S3method(as.Date, IDate) # note that zoo::as.Date masks base::as.Date. Both generic. -export(as.Date.IDate) # workaround for zoo bug, see #1500. Removing this export causes CI pipeline to fail on others.Rraw test 6, but I can't reproduce locally. +S3method(as.Date, IDate) # note that base::as.Date is masked by zoo::as.Date, #1500 #4777 S3method(as.IDate, Date) S3method(as.IDate, POSIXct) S3method(as.IDate, default) @@ -187,10 +186,3 @@ S3method(unique, ITime) S3method('[<-', IDate) S3method(edit, data.table) -# duplist -# getdots -# NCOL -# NROW -# which.first -# which.last - diff --git a/NEWS.md b/NEWS.md index a2318dda88..826573d7b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ 1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). +2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. + # data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) From 70b6b1368fd0aa88eb3356de1de32fb70e21a908 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 3 Nov 2020 01:57:09 -0700 Subject: [PATCH 120/588] add fcase tests (#4796) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/NEWS.md b/NEWS.md index 826573d7b2..afde983b46 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,8 @@ 2. 
`as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. +3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example wich has also been added to the test suite. + # data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e68b818aa8..9d207f8866 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17212,3 +17212,26 @@ test(2159.14, typeof(as.matrix(DT)), "character") test(2159.15, typeof(as.matrix(DT[0L])), "character") test(2159.16, min(DT[0L]), error="only.*numeric") +# fcase tests from dev 1.12.9 fixed before 1.13.0 was released, #4378 #4401 +# Matt tested that the smaller 100 size still fails in 1.12.9 under gctorture2(step=100) +set.seed(123) +x = structure(rnorm(100L), class='abc') +test(2160.1, fcase(x <= -100, structure(x*1.0, class='abc'), + x <= -10, structure(x*1.0, class='abc'), + x <= 0, structure(x*1.0, class='abc'), + x <= 100, structure(x*1.0, class='abc'), + x <= 1000, structure(x*1.0, class='abc'), + x >= 1000, structure(x*1.0, class='abc')), + structure(x, class='abc')) +x = data.table(rnorm(100L), rnorm(100L), rnorm(100L)) +test(2160.2, x[, v0 := fcase( + V1 > 0 & V2 <= 1 & V3 > 1, V2 * 100L, + V1 > 1 & V2 <= 0 & V3 > 0, V3 * 100L, + V1 > -1 & V2 <= 2 & V3 > 1, V1 * 100L, + V1 > 1 & V2 <= 0 & V3 > 2, 300, + V1 > 0 & V2 <= 1 & V3 > 1, 100, + V1 > -1 & V2 <= 0 & V3 > -1, V1 * 100L, + default = 0 +)][c(1,3,74,96,100), round(v0,1)], c(0, -24.7, 82.5, 6.7, 0)) +rm(x) + From 70a598d699757e0b227975e101e7398721797790 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 12 Nov 2020 23:09:06 -0700 Subject: [PATCH 121/588] fsort segfault with clang-11 openmp (#4808) --- .dev/CRAN_Release.cmd | 2 +- NEWS.md | 2 + configure | 5 +- inst/tests/tests.Rraw | 2 +- src/Makevars.in | 2 + src/fsort.c | 190 +++++++++++++++++++++++------------------- src/init.c | 6 +- src/myomp.h | 9 ++ src/openmp-utils.c | 2 + 9 files changed, 126 insertions(+), 94 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 902d66efd1..ce73aa84fe 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -308,7 +308,7 @@ make # use latest available `apt-cache search gcc-` or `clang-` cd ~/build/R-devel-strict-clang -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-10 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero 
-fno-omit-frame-pointer" +./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make cd ~/build/R-devel-strict-gcc diff --git a/NEWS.md b/NEWS.md index afde983b46..94e1fee81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ 1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. +2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, and if so `fsort` will check that and work fine. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. + ## NOTES 1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). diff --git a/configure b/configure index c0745e527e..e29156d61a 100755 --- a/configure +++ b/configure @@ -85,7 +85,11 @@ EOF if [ "$R_NO_OPENMP" = "1" ]; then # Compilation failed -- try forcing -fopenmp instead. + # TODO: doesn't R_NO_OPENMP need to be set to 0 before next line? 
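A quick way to see which OpenMP level a given data.table build reports, as described in the NEWS entry above (the exact lines depend on the compiler used to build the package; values shown are illustrative):

  library(data.table)
  getDTthreads(verbose=TRUE)
  # among other details this prints e.g. "OpenMP version (_OPENMP) 201511" (i.e. OpenMP 4.5);
  # the line is absent when the build has no OpenMP support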
${CC} ${CFLAGS} -fopenmp test-omp.c || R_NO_OPENMP=1 + # TODO: and then nothing seems to be done with this outcome +else + echo "R CMD SHLIB supports OpenMP without any extra hint" fi # Clean up. @@ -100,7 +104,6 @@ if [ "$R_NO_OPENMP" = "1" ]; then echo "*** Continuing installation without OpenMP support..." sed -e "s|@openmp_cflags@||" src/Makevars.in > src/Makevars else - echo "OpenMP supported" sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars fi # retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9d207f8866..4dd2809f7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12512,7 +12512,7 @@ x <- as.integer(x) test(1888.5, fsort(x), base::sort(x, na.last = FALSE), warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread") x = runif(1e6) -test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 5 MSB counts") +test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") rm(x) diff --git a/src/Makevars.in b/src/Makevars.in index 76218cb65a..7750c1e8ac 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -7,5 +7,7 @@ PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ -lz # Hence the onerous @...@ substitution. Is it still appropriate in 2020 that we can't use +=? all: $(SHLIB) + @echo PKG_CFLAGS = $(PKG_CFLAGS) + @echo PKG_LIBS = $(PKG_LIBS) if [ "$(SHLIB)" != "datatable$(SHLIB_EXT)" ]; then mv $(SHLIB) datatable$(SHLIB_EXT); fi if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id datatable$(SHLIB_EXT) datatable$(SHLIB_EXT); fi diff --git a/src/fsort.c b/src/fsort.c index 00c7e5c10b..748c2ad85d 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -2,43 +2,39 @@ #define INSERT_THRESH 200 // TODO: expose via api and test -static void dinsert(double *x, int n) { // TODO: if and when twiddled, double => ull +static void dinsert(double *x, const int n) { // TODO: if and when twiddled, double => ull if (n<2) return; - for (int i=1; i=0 && xtmp=0 && xtmp> fromBit & mask]++; + for (uint64_t i=0; i> fromBit & mask]++; tmp++; } - int last = (*(unsigned long long *)--tmp - minULL) >> fromBit & mask; + int last = (*(uint64_t *)--tmp - minULL) >> fromBit & mask; if (counts[last] == n) { // Single value for these bits here. All counted in one bucket which must be the bucket for the last item. counts[last] = 0; // clear ready for reuse. All other counts must be zero already so save time by not setting to 0. @@ -47,9 +43,9 @@ static void dradix_r( // single-threaded recursive worker return; } - R_xlen_t cumSum=0; - for (R_xlen_t i=0; cumSum> fromBit & mask; + for (uint64_t i=0; i> fromBit & mask; working[ counts[thisx]++ ] = *tmp; tmp++; } @@ -71,14 +67,14 @@ static void dradix_r( // single-threaded recursive worker // Also this way, we don't need to know how big thisCounts is and therefore no possibility of getting that wrong. // wasteful thisCounts[i]=0 even when already 0 is better than a branch. We are highly recursive at this point // so avoiding memset() is known to be worth it. 
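For orientation while reading the fsort.c changes that follow, this is the user-facing behaviour exercised by the updated test above (a sketch; the verbose timing breakdown varies by machine):

  library(data.table)
  set.seed(1)
  x = runif(1e6)
  y = fsort(x, verbose=TRUE)   # parallel radix sort; verbose output includes the "Top 20 MSB counts" table
  !base::is.unsorted(y)        # TRUE: ascending order, matching base::sort(x)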
- for (int i=0; counts[i]0 if the element a goes after the element b // doesn't master if stable or not - R_xlen_t x = qsort_data[*(int *)a]; - R_xlen_t y = qsort_data[*(int *)b]; + uint64_t x = qsort_data[*(int *)a]; + uint64_t y = qsort_data[*(int *)b]; // return x-y; would like this, but this is long and the cast to int return may not preserve sign // We have long vectors in mind (1e10(74GB), 1e11(740GB)) where extreme skew may feasibly mean the largest count // is greater than 2^32. The first split is (currently) 16 bits so should be very rare but to be safe keep 64bit counts. @@ -132,12 +128,12 @@ SEXP fsort(SEXP x, SEXP verboseArg) { double mins[nBatch], maxs[nBatch]; const double *restrict xp = REAL(x); #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nBatch, false)) - for (int batch=0; batchmyMax) myMax=*d; @@ -148,7 +144,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { } t[2] = wallclock(); double min=mins[0], max=maxs[0]; - for (int i=1; imax) max=maxs[i]; @@ -158,10 +154,9 @@ SEXP fsort(SEXP x, SEXP verboseArg) { // TODO: -0ULL should allow negatives // avoid twiddle function call as expensive in recent tests (0.34 vs 2.7) // possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep - u.d = max; - unsigned long long maxULL = u.ull; - u.d = min; - minULL = u.ull; // set static global for use by dradix_r + + uint64_t maxULL = *(uint64_t *)&max; + minULL = *(uint64_t *)&min; // set static global for use by dradix_r int maxBit = floor(log(maxULL-minULL) / log(2)); // 0 is the least significant bit int MSBNbits = maxBit > 15 ? 16 : maxBit+1; // how many bits make up the MSB @@ -169,33 +164,32 @@ SEXP fsort(SEXP x, SEXP verboseArg) { size_t MSBsize = 1LL< 65,536) if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n"), maxBit, MSBNbits, shift, MSBsize); - R_xlen_t *counts = calloc(nBatch*MSBsize, sizeof(R_xlen_t)); - if (counts==NULL) error(_("Unable to allocate working memory")); + uint64_t *counts = (uint64_t *)R_alloc(nBatch*MSBsize, sizeof(uint64_t)); + memset(counts, 0, nBatch*MSBsize*sizeof(uint64_t)); // provided MSBsize>=9, each batch is a multiple of at least one 4k page, so no page overlap - // TODO: change all calloc, malloc and free to Calloc and Free to be robust to error() and catch ooms. if (verbose) Rprintf(_("counts is %dMB (%d pages per nBatch=%d, batchSize=%"PRIu64", lastBatchSize=%"PRIu64")\n"), - (int)(nBatch*MSBsize*sizeof(R_xlen_t)/(1024*1024)), - (int)(nBatch*MSBsize*sizeof(R_xlen_t)/(4*1024*nBatch)), + (int)(nBatch*MSBsize*sizeof(uint64_t)/(1024*1024)), + (int)(nBatch*MSBsize*sizeof(uint64_t)/(4*1024*nBatch)), nBatch, (uint64_t)batchSize, (uint64_t)lastBatchSize); t[3] = wallclock(); #pragma omp parallel for num_threads(nth) - for (int batch=0; batch> shift]++; tmp++; } } // cumulate columnwise; parallel histogram; small so no need to parallelize - R_xlen_t rollSum=0; - for (int msb=0; msb> shift]++ ] = *source; // This assignment to ans is not random access as it may seem, but cache efficient by // design since target pages are written to contiguously. MSBsize * 4k < cache. @@ -226,13 +220,13 @@ SEXP fsort(SEXP x, SEXP verboseArg) { int fromBit = toBit>7 ? 
toBit-7 : 0; // sort bins by size, largest first to minimise last-man-home - R_xlen_t *msbCounts = counts + (nBatch-1)*MSBsize; + uint64_t *msbCounts = counts + (nBatch-1)*MSBsize; // msbCounts currently contains the ending position of each MSB (the starting location of the next) even across empty if (msbCounts[MSBsize-1] != xlength(x)) error(_("Internal error: counts[nBatch-1][MSBsize-1] != length(x)")); // # nocov - R_xlen_t *msbFrom = malloc(MSBsize*sizeof(R_xlen_t)); - int *order = malloc(MSBsize*sizeof(int)); - R_xlen_t cumSum = 0; - for (int i=0; i0 && msbCounts[order[MSBsize-1]] < 2) MSBsize--; @@ -252,63 +246,83 @@ SEXP fsort(SEXP x, SEXP verboseArg) { Rprintf(_("%d by excluding 0 and 1 counts\n"), MSBsize); } + bool failed=false, alloc_fail=false, non_monotonic=false; // shared bools only ever assigned true; no need for atomic or critical assign t[6] = wallclock(); #pragma omp parallel num_threads(getDTthreads(MSBsize, false)) { - R_xlen_t *counts = calloc((toBit/8 + 1)*256, sizeof(R_xlen_t)); - // each thread has its own (small) stack of counts + // each thread has its own small stack of counts // don't use VLAs here: perhaps too big for stack yes but more that VLAs apparently fail with schedule(dynamic) - - double *working=NULL; - // the working memory (for the largest groups) is allocated the first time the thread is assigned to - // an iteration. - - #pragma omp for schedule(dynamic,1) - // All we assume here is that a thread can never be assigned to an earlier iteration; i.e. threads 0:(nth-1) - // get iterations 0:(nth-1) possibly out of order, then first-come-first-served in order after that. - // If a thread deals with an msb lower than the first one it dealt with, then its *working will be too small. - for (int msb=0; msb 65,536) that the largest MSB should be // relatively small anyway (n/65,536 if uniformly distributed). - // For msb>=nth, that thread's *working will already be big - // enough because the smallest *working (for thread nth-1) is big enough for all iterations following. + // For msb>=nth, that thread's *myworking will already be big enough because + // the smallest *myworking (for thread nth-1) is big enough for all iterations following. // Progressively, less and less of the working will be needed by the thread (just the first thisN will be - // used) and the unused pages will simply not be cached. - // TODO: Calloc isn't thread-safe. But this deep malloc should be ok here as no possible error() points - // before free. Just need to add the check and exit thread safely somehow. + // used) and the unused lines will simply not be cached. if (thisN <= INSERT_THRESH) { dinsert(ans+from, thisN); } else { - dradix_r(ans+from, working, thisN, fromBit, toBit, counts); + dradix_r(ans+from, myworking, thisN, fromBit, toBit, mycounts); } } - free(counts); - free(working); + free(mycounts); + free(myworking); } - free(msbFrom); - free(order); + if (non_monotonic) + error("OpenMP %d did not assign threads to iterations monotonically. Please search Stack Overflow for this message.", MY_OPENMP); // # nocov; #4786 in v1.13.4 + if (alloc_fail) + error(_("Unable to allocate working memory")); // # nocov } t[7] = wallclock(); - free(counts); - + // TODO: parallel sweep to check sorted using <= on original input. Feasible that twiddling messed up. // After a few years of heavy use remove this check for speed, and move into unit tests. // It's a perfectly contiguous and cache efficient parallel scan so should be relatively negligible. 
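Two things in the parallel region above are worth spelling out: each thread sizes its working buffer from the first (largest) bin it receives, which is only safe if the dynamic schedule hands out iterations monotonically (hence the OpenMP 4.5 monotonic:dynamic modifier and the non_monotonic check), and no error() is raised inside the parallel region -- shared flags are set and inspected only after it ends. A condensed sketch of that pattern follows, with illustrative names, a fallback macro mirroring myomp.h, and the assumption that groups are ordered largest-first as fsort orders its MSB bins.

#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>

#if defined(_OPENMP) && _OPENMP >= 201511
  #define monotonic_dynamic monotonic:dynamic   // OpenMP 4.5+: monotonic hand-out is guaranteed
#else
  #define monotonic_dynamic dynamic             // older compilers: usually monotonic in practice, but not guaranteed
#endif

static int sort_groups(int nGroups, const uint64_t *groupSize)  // groupSize assumed sorted largest-first
{
  bool alloc_fail=false, non_monotonic=false;   // only ever assigned true, so plain stores suffice
  #pragma omp parallel
  {
    double *myworking = NULL;                   // allocated lazily on the first iteration this thread is given
    uint64_t mymax = 0;
    #pragma omp for schedule(monotonic_dynamic, 1)
    for (int g=0; g<nGroups; ++g) {
      if (alloc_fail || non_monotonic) continue;      // cannot break out of an omp for
      if (myworking==NULL) {
        mymax = groupSize[g];                         // the largest group this thread should ever see
        myworking = malloc(mymax*sizeof(double));
        if (myworking==NULL) { alloc_fail=true; continue; }
      } else if (groupSize[g] > mymax) {
        non_monotonic = true;                         // a later iteration was bigger than our first one
        continue;
      }
      // ... radix sort group g using myworking ...
    }
    free(myworking);
  }
  if (non_monotonic) return -1;  // in the package this becomes error(...), safely outside the parallel region
  if (alloc_fail)    return -2;
  return 0;
}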
double tot = t[7]-t[0]; - if (verbose) for (int i=1; i<=7; i++) { + if (verbose) for (int i=1; i<=7; ++i) { Rprintf(_("%d: %.3f (%4.1f%%)\n"), i, t[i]-t[i-1], 100.*(t[i]-t[i-1])/tot); } - UNPROTECT(nprotect); return(ansVec); } diff --git a/src/init.c b/src/init.c index 1247c585b6..6a5377ca2a 100644 --- a/src/init.c +++ b/src/init.c @@ -393,11 +393,11 @@ SEXP hasOpenMP() { // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is // why RPRINTF is capitalized in this comment to avoid that grep. - // TODO: perhaps .Platform or .Machine in R itself could contain whether OpenMP is available. + // .Platform or .Machine in R itself does not contain whether OpenMP is available because compiler and flags are per-package. #ifdef _OPENMP - return ScalarLogical(TRUE); + return ScalarInteger(_OPENMP); // return the version; e.g. 201511 (i.e. 4.5) #else - return ScalarLogical(FALSE); + return ScalarInteger(0); // 0 rather than NA so that if() can be used on the result #endif } // # nocov end diff --git a/src/myomp.h b/src/myomp.h index 58a5703f00..57d8b58734 100644 --- a/src/myomp.h +++ b/src/myomp.h @@ -1,5 +1,13 @@ #ifdef _OPENMP #include + #if _OPENMP >= 201511 + #define monotonic_dynamic monotonic:dynamic // #4786 + #else + #define monotonic_dynamic dynamic + #endif + #define MY_OPENMP _OPENMP + // for use in error messages (e.g. fsort.c; #4786) to save an #ifdef each time + // initially chose OMP_VERSION but figured OpenMP might define that in future, so picked MY_ prefix #else // for machines with compilers void of openmp support #define omp_get_num_threads() 1 @@ -9,5 +17,6 @@ #define omp_get_num_procs() 1 #define omp_set_nested(a) // empty statement to remove the call #define omp_get_wtime() 0 + #define MY_OPENMP 0 #endif diff --git a/src/openmp-utils.c b/src/openmp-utils.c index 51393f3b7c..b65a661eaf 100644 --- a/src/openmp-utils.c +++ b/src/openmp-utils.c @@ -79,6 +79,8 @@ SEXP getDTthreads_R(SEXP verbose) { if (LOGICAL(verbose)[0]) { #ifndef _OPENMP Rprintf(_("This installation of data.table has not been compiled with OpenMP support.\n")); + #else + Rprintf(_(" OpenMP version (_OPENMP) %d\n"), _OPENMP); // user can use Google to map 201511 to 4.5; it's odd that OpenMP API does not provide 4.5 #endif // this output is captured, paste0(collapse="; ")'d, and placed at the end of test.data.table() for display in the last 13 lines of CRAN check logs // it is also printed at the start of test.data.table() so that we can trace any Killed events on CRAN before the end is reached From 8480b6ad812366306c7d70f799408d715d3b5ced Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 13 Nov 2020 00:59:10 -0700 Subject: [PATCH 122/588] follow up to #4808: avoid type pun dereference --- src/fsort.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/fsort.c b/src/fsort.c index 748c2ad85d..5c1cf946e8 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -155,8 +155,11 @@ SEXP fsort(SEXP x, SEXP verboseArg) { // avoid twiddle function call as expensive in recent tests (0.34 vs 2.7) // possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep - uint64_t maxULL = *(uint64_t *)&max; - minULL = *(uint64_t *)&min; // set static global for use by dradix_r + union {double d; uint64_t u64;} u; + u.d = max; + uint64_t maxULL = u.u64; + u.d = min; + minULL = u.u64; // set static global for use by dradix_r int maxBit = 
floor(log(maxULL-minULL) / log(2)); // 0 is the least significant bit int MSBNbits = maxBit > 15 ? 16 : maxBit+1; // how many bits make up the MSB From db1c77b1293cc46d88a3f688a770811bf43acb92 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 1 Dec 2020 03:13:38 -0700 Subject: [PATCH 123/588] Missing parenthesis in fread.Rd. Thanks to Mark Daniel Ward for reporting via email. --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index 48eb9625bb..6419247617 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -31,7 +31,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="" \item{input}{ A single character string. The value is inspected and deferred to either \code{file=} (if no \\n present), \code{text=} (if at least one \\n is present) or \code{cmd=} (if no \\n is present, at least one space is present, and it isn't a file name). Exactly one of \code{input=}, \code{file=}, \code{text=}, or \code{cmd=} should be used in the same call. } \item{file}{ File name in working directory, path to file (passed through \code{\link[base]{path.expand}} for convenience), or a URL starting http://, file://, etc. Compressed files with extension \file{.gz} and \file{.bz2} are supported if the \code{R.utils} package is installed. } \item{text}{ The input data itself as a character vector of one or more lines, for example as returned by \code{readLines()}. } - \item{cmd}{ A shell command that pre-processes the file; e.g. \code{fread(cmd=paste("grep",word,"filename")}. See Details. } + \item{cmd}{ A shell command that pre-processes the file; e.g. \code{fread(cmd=paste("grep",word,"filename"))}. See Details. } \item{sep}{ The separator between columns. Defaults to the character in the set \code{[,\\t |;:]} that separates the sample of rows into the most number of lines with the same number of fields. Use \code{NULL} or \code{""} to specify no separator; i.e. each line a single character column like \code{base::readLines} does.} \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;}], other than \code{sep}, that exists inside each field outside quoted regions in the sample. NB: \code{sep2} is not yet implemented. } \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. \code{nrows=0} returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. 
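The follow-up above titled "avoid type pun dereference" swaps *(uint64_t *)&max for a union read. The distinction it draws is sketched below with illustrative function names: dereferencing a pointer cast to an unrelated type breaks C's strict-aliasing rules, whereas reading back through a union member (or memcpy) is the well-defined way to reinterpret a double's bit pattern; modern compilers reduce either well-defined form to a single move.

#include <stdint.h>
#include <string.h>

uint64_t bits_via_cast(const double *p) {
  return *(const uint64_t *)p;            // type-punned dereference: undefined behaviour under strict aliasing
}

uint64_t bits_via_union(double d) {
  union { double d; uint64_t u64; } u;    // the form the follow-up patch uses
  u.d = d;
  return u.u64;
}

uint64_t bits_via_memcpy(double d) {
  uint64_t u;
  memcpy(&u, &d, sizeof(u));              // equally well-defined; optimised to the same code
  return u;
}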
} From 958b69310361848480fd2258c5035545901b2038 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 1 Dec 2020 09:20:21 -0700 Subject: [PATCH 124/588] More zlib/fwrite tracing for Solaris (#4826) --- R/test.data.table.R | 1 + src/fwrite.c | 52 ++++++++++++++++++++++++++++++++++----------- src/init.c | 2 ++ src/utils.c | 7 ++++++ 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index c4b6cfaf6d..92de111915 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -147,6 +147,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ", Sys.getlocale()=='", Sys.getlocale(), "'", ", l10n_info()=='", paste0(names(l10n_info()), "=", l10n_info(), collapse="; "), "'", ", getDTthreads()=='", paste0(gsub("[ ][ ]+","==",gsub("^[ ]+","",capture.output(invisible(getDTthreads(verbose=TRUE))))), collapse="; "), "'", + ", ", .Call(Cdt_zlib_version), "\n", sep="") if (inherits(err,"try-error")) { diff --git a/src/fwrite.c b/src/fwrite.c index 7a79e6ad8c..dc18cf4b50 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -563,16 +563,50 @@ int init_stream(z_stream *stream) { return err; // # nocov } +void print_z_stream(const z_stream *s) // temporary tracing function for #4099 +{ + const char *byte = (char *)s; + DTPRINT("sizeof(z_stream)==%d: ", sizeof(z_stream)); + for (int i=0; istate->status which zlib:deflateStateCheck checks, #4826 + // this structure is not exposed, so we'll get to it via memory offsets using the trace output we put in to show on CRAN's Solaris output + const char *pos = (char *)&s->msg + sizeof(char *); // the field after *msg (exposed) is internal_state *state (not exposed) + byte = *(char **)pos; // byte now at start of internal_state pointed to by s->state + char *strm = *(char **)byte; // first 8 bytes (or 4 on 32bit) is strm labeled 'pointer back to this zlib stream' + DTPRINT("state: "); + for (int i=0; i<(sizeof(char *) + sizeof(int)); ++i) { + DTPRINT("%02x ", *(unsigned char *)byte++); + } + int status = *(int *)(byte-sizeof(int)); + DTPRINT("strm==%p state->strm==%p state->status==%d", s, strm, status); // two pointer values should be the same + DTPRINT(" zalloc==%p zfree==%p", s->zalloc, s->zfree); // checked to be !=0 by deflate.c:deflateStateCheck + DTPRINT(" (s->strm==strm)==%d", (char *)s==strm); // mimics the s->strm==strm check in deflate.c:deflateStateCheck + DTPRINT(" s->next_out==%p s->avail_in=%d s->next_in=%p", s->next_out, s->avail_in, s->next_in); // top of deflate.c:deflate() after the call to deflateStateCheck + DTPRINT(" deflates()'s checks (excluding status) would %s here", + (s->zalloc==(alloc_func)0 || s->zfree==(free_func)0 || s==Z_NULL || (char *)s!=strm || + s->next_out==Z_NULL || (s->avail_in!=0 && s->next_in==Z_NULL)) ? 
+ "return -2" : "be ok"); + DTPRINT("\n"); +} + int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* source, size_t sourceLen) { stream->next_out = dest; stream->avail_out = *destLen; stream->next_in = (Bytef *)source; // don't use z_const anywhere; #3939 stream->avail_in = sourceLen; - if (verbose) DTPRINT(_("deflate input stream: %p %d %p %d\n"), stream->next_out, (int)(stream->avail_out), stream->next_in, (int)(stream->avail_in)); - + if (verbose) { + DTPRINT(_("deflate input stream: %p %d %p %d z_stream: "), stream->next_out, (int)(stream->avail_out), stream->next_in, (int)(stream->avail_in)); + print_z_stream(stream); + } int err = deflate(stream, Z_FINISH); - if (verbose) DTPRINT(_("deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, Z_STREAM_END==%d\n"), err, (int)(stream->total_out), Z_FINISH, Z_OK, Z_STREAM_END); + if (verbose) { + DTPRINT(_("deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, Z_STREAM_END==%d z_stream: "), err, (int)(stream->total_out), Z_FINISH, Z_OK, Z_STREAM_END); + print_z_stream(stream); + } if (err == Z_OK) { // with Z_FINISH, deflate must return Z_STREAM_END if correct, otherwise it's an error and we shouldn't return Z_OK (0) err = -9; // # nocov @@ -581,15 +615,6 @@ int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* sour return err == Z_STREAM_END ? Z_OK : err; } -void print_z_stream(const z_stream *s) // temporary tracing function for #4099 -{ - const unsigned char *byte = (unsigned char *)s; - for (int i=0; i +SEXP dt_zlib_version() { + char out[51]; + snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); + return ScalarString(mkChar(out)); +} + From 15e643655ce77c426f10a9240bc1966ac1e429e1 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 1 Dec 2020 12:24:33 -0500 Subject: [PATCH 125/588] runlock doesnt try to unlock functions (#4815) --- NEWS.md | 2 ++ R/data.table.R | 2 +- inst/tests/tests.Rraw | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 94e1fee81c..f6e0e91fa0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ 2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, and if so `fsort` will check that and work fine. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. 
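For context on what compressbuff() above wraps (and why a bare Z_OK from deflate() is demoted to an error): with Z_FINISH and an output buffer sized by deflateBound(), a complete one-shot compression must end in Z_STREAM_END, so Z_OK means the output buffer ran out. A standalone sketch of that contract, with illustrative names and the gzip wrapper assumed as in fwrite's .gz output:

#include <zlib.h>
#include <string.h>

// One-shot gzip compression of src into dest; size dest with deflateBound() beforehand.
// Returns Z_OK on success, otherwise a zlib error code; Z_OK from deflate() itself is
// converted to a failure because with Z_FINISH success is signalled by Z_STREAM_END.
int compress_once(const void *src, size_t srcLen, void *dest, size_t *destLen)
{
  z_stream s;
  memset(&s, 0, sizeof(s));               // zalloc/zfree/opaque left NULL => zlib defaults
  int err = deflateInit2(&s, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                         15 + 16 /* gzip wrapper */, 8, Z_DEFAULT_STRATEGY);
  if (err != Z_OK) return err;
  s.next_in  = (Bytef *)src;   s.avail_in  = (uInt)srcLen;
  s.next_out = (Bytef *)dest;  s.avail_out = (uInt)*destLen;
  err = deflate(&s, Z_FINISH);
  *destLen = s.total_out;
  deflateEnd(&s);
  return err==Z_STREAM_END ? Z_OK : (err==Z_OK ? Z_BUF_ERROR : err);
}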
https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. +3. Columns containing functions that don't inherit the class `'function'` would fail to group, [#4814](https://github.com/Rdatatable/data.table/issues/4814). Thanks @mb706 for reporting, @ecoRoland2 for helping investigate, and @Coorsaa for a follow-up example involving environments. + ## NOTES 1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). diff --git a/R/data.table.R b/R/data.table.R index d513891b93..2b010db77a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1774,7 +1774,7 @@ replace_dot_alias = function(e) { # TODO: is there an efficient way to get around this MAX_DEPTH limit? MAX_DEPTH = 5L runlock = function(x, current_depth = 1L) { - if (is.recursive(x) && current_depth <= MAX_DEPTH) { + if (is.list(x) && current_depth <= MAX_DEPTH) { # is.list() used to be is.recursive(), #4814 if (inherits(x, 'data.table')) .Call(C_unlock, x) else return(lapply(x, runlock, current_depth = current_depth + 1L)) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4dd2809f7d..a54962965e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17235,3 +17235,13 @@ test(2160.2, x[, v0 := fcase( )][c(1,3,74,96,100), round(v0,1)], c(0, -24.7, 82.5, 6.7, 0)) rm(x) +# runlock failed for "masked" functions (function storage but !inherits('function')), #4814 +f <- function(x) x +class(f) <- "fn" +dt <- data.table(id=1, f) +test(2161.1, dt[, .(f), by=id], dt) +e = environment() +class(e) = "foo" +dt = data.table(id=1, funs=list(e)) +test(2161.2, dt[, .(funs), by=id], dt) + From 36d4e9ba5f344f70a3672a8e14a8a79cbc0ef3a7 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 1 Dec 2020 10:31:29 -0700 Subject: [PATCH 126/588] spelling in NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index f6e0e91fa0..0f91bd4af0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,7 +20,7 @@ 2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. -3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). 
It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example wich has also been added to the test suite. +3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example which has also been added to the test suite. # data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) From 25ee361c9b6e57f9bc8508efca1c43ca5aa4570f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 2 Dec 2020 17:00:00 -0700 Subject: [PATCH 127/588] .dev-only revdep.R: R-release refresh, check on CFLAGS=-O only, and no longer testing bioc revdeps --- .dev/.bash_aliases | 7 ++- .dev/revdep.R | 115 ++++++++++++++++++++++++++++----------------- 2 files changed, 77 insertions(+), 45 deletions(-) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index effcc7d9ea..01c3de23ad 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -16,7 +16,12 @@ alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false && export R_DEFAULT_INTERNET_TIMEOUT=300' -alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R' +alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' +# use ~/build/R-devel/bin/R at the end of revdepr to use R-devel instead of R-release. +# If so, doing a `rm -rf *` in revdeplib first to rebuild everything is easiest way to avoid potential problems later. A full rebuild is a good idea periodically anyway. Packages in +# revdeplib may have been compiled many months ago, but the .so libraries they link to may have been updated in the meantime, or multiple packages may use the same .so libary, or +# switches inside the package's code may behave differently when R-devel is used instead of R-release, etc. I use R-release for revdepr, unless R-devel contains significant changes +# that we really need to test revdeps under. export R_PROFILE_USER='~/.Rprofile' # there's a .Rprofile in ~/GitHub/data.table/ so Matt sets R_PROFILE_USER here to always use ~/.Rprofile diff --git a/.dev/revdep.R b/.dev/revdep.R index 135d354300..558b579683 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -1,25 +1,44 @@ -# Run by package maintainer via these entries in ~/.bash_aliases : -# alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false' -# alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R' -# revdep = reverse first-order dependency; i.e. 
the CRAN and Bioconductor packages which directly use data.table (765 at the time of writing) +# Run by package maintainer via aliases revdepsh and revdepr in .dev/.bash_aliases. See +# that file for comments. +# revdep = reverse first-order dependency; i.e. the CRAN and Bioconductor packages which directly use data.table # Check that env variables have been set correctly: # export R_LIBS_SITE=none # export R_LIBS=~/build/revdeplib/ # export _R_CHECK_FORCE_SUGGESTS_=false stopifnot(identical(length(.libPaths()), 2L)) # revdeplib (writeable by me) and the pre-installed recommended R library (sudo writeable) -stopifnot(identical(file.info(.libPaths())[,"uname"], rep(as.vector(Sys.info()["user"]), 2))) # 2nd one is root when using default R rather than Rdevel +tt = file.info(.libPaths())[,"uname"] +stopifnot(identical(length(tt), 2L)) +stopifnot(tt[1L]==Sys.info()["user"]) +stopifnot(tt[2L] %in% c("root",Sys.info()["user"])) # root when using default R-release, user when using R-devel stopifnot(identical(.libPaths()[1], getwd())) stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"false")) + +cflags = system("grep \"^[^#]*CFLAGS\" ~/.R/Makevars", intern=TRUE) +cat("~/.R/Makevars contains", cflags, "\n") +if (!grepl("^CFLAGS=-O[0-3]$", cflags)) { + stop("Some packages have failed to install in the past (e.g. processx and RGtk2) when CFLAGS contains -pedandic, -Wall, and similar. ", + "So for revdepr keep CFLAGS simple; i.e. -O[0-3] only.") +} + options(repos = c("CRAN"=c("http://cloud.r-project.org"))) -R = "~/build/R-devel/bin/R" # alias doesn't work from system() # The alias sets R_PROFILE_USER so that this script runs on R starting up, leaving prompt running. # But if we don't unset it now, anything else from now on that does something like system("R CMD INSTALL") (e.g. update.packages() # and BiocManager::install()) will call this script again recursively. Sys.unsetenv("R_PROFILE_USER") -system(paste0(R," -e \"utils::update.packages('",.libPaths()[2],"', ask=FALSE, checkBuilt=TRUE)\"")) +if (is.null(utils::old.packages(.libPaths()[2]))) { + cat("All", length(dir(.libPaths()[2])), "recommended packages supplied with R in", .libPaths()[2], "are the latest version\n") +} else { + cat("Some recommended packages supplied with R need to be updated ...\n") + if (tt[2L]=="root") { + system(paste0("sudo R -e \"utils::update.packages('",.libPaths()[2],"', ask=TRUE, checkBuilt=TRUE)\"")) + } else { + system(paste0("~/build/R-devel/bin/R -e \"utils::update.packages('",.libPaths()[2],"', ask=TRUE, checkBuilt=TRUE)\"")) + # the Rdevel bash alias doesn't work from system() + } +} require(utils) # only base is loaded when R_PROFILE_USER runs update.packages(ask=FALSE, checkBuilt=TRUE) @@ -29,14 +48,20 @@ update.packages(ask=FALSE, checkBuilt=TRUE) # Follow: https://bioconductor.org/install # Ensure no library() call in .Rprofile, such as library(bit64) -require(BiocManager) -BiocManager::install(ask=FALSE, version="devel", checkBuilt=TRUE) -BiocManager::valid() +# As from October 2020, Matt no longer checks Bioconductor revdeps. After many years of trying, and repeated +# emails to Bioconductor maintainers, there were still too many issues not fixed for too long. The packages +# are big in size and have many warnings which make it hard to find the true problems. The way the Bioc +# devel and release repositories are set up require more work and confuses communication. That doesn't need +# to be done in the better and simpler way that CRAN is setup. 
+# require(BiocManager) +# BiocManager::install(ask=FALSE, version="devel", checkBuilt=TRUE) +# BiocManager::valid() +# avail = available.packages(repos=BiocManager::repositories()) # includes CRAN at the end from getOption("repos"). And ensure latest Bioc version is in repo path here. -avail = available.packages(repos=BiocManager::repositories()) # includes CRAN at the end from getOption("repos"). And ensure latest Bioc version is in repo path here. +avail = available.packages() # uses getOption("repos") which was set above deps = tools::package_dependencies("data.table", db=avail, which="all", reverse=TRUE, recursive=FALSE)[[1]] -exclude = c("TCGAbiolinks") # too long (>30mins): https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/240 -deps = deps[-match(exclude, deps)] +# exclude = c("TCGAbiolinks") # too long (>30mins): https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/240 +# deps = deps[-match(exclude, deps)] table(avail[deps,"Repository"]) old = 0 new = 0 @@ -48,7 +73,7 @@ for (p in deps) { packageVersion(p) != avail[p,"Version"]) { system(paste0("rm -rf ", p, ".Rcheck")) # Remove last check (of previous version) to move its status() to not yet run - install.packages(p, repos=BiocManager::repositories(), dependencies=TRUE) # again, bioc repos includes CRAN here + install.packages(p, dependencies=TRUE) # repos=BiocManager::repositories() used to be here which includes CRAN too # To install its dependencies. The package itsef is installed superfluously here because the tar.gz will be passed to R CMD check. # If we did download.packages() first and then passed that tar.gz to install.packages(), repos= is set to NULL when installing from # local file, so dependencies=TRUE wouldn't know where to get the dependencies. Hence usig install.packages first with repos= set. 
@@ -61,14 +86,14 @@ for (p in deps) { } } cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), "\n") -update.packages(repos=BiocManager::repositories(), checkBuilt=TRUE) # double-check all dependencies are latest too +update.packages(checkBuilt=TRUE) # double-check all dependencies are latest too; again repos=BiocManager::repositories() used to be here cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") cat("Installed packages built using:\n") x = installed.packages() drop(table(x[,"Built"])) # manually inspect to ensure all built with this x.y release of R if (FALSE) { # if not, run this manually replacing "4.0.0" appropriately for (p in rownames(x)[x[,"Built"]=="4.0.0"]) { - install.packages(p, repos=BiocManager::repositories()) + install.packages(p) # repos=BiocManager::repositories() } # warnings may suggest many of them were removed from CRAN, so remove the remaining from revdeplib to be clean x = installed.packages() @@ -88,38 +113,15 @@ for (p in deps) { all = system("ls *.tar.gz", intern=TRUE) all = sapply(strsplit(all, split="_"),'[',1) for (i in all[!all %in% deps]) { - cat("Removing",i,"because it", if (!i %in% rownames(avail)) "has been removed from CRAN/Bioconductor\n" else "no longer uses data.table\n") + cat("Removing",i,"because it", if (!i %in% rownames(avail)) "has been removed from CRAN\n" else "no longer uses data.table\n") system(paste0("rm ",i,"_*.tar.gz")) } } num_tar.gz = as.integer(system("ls *.tar.gz | wc -l", intern=TRUE)) if (length(deps) != num_tar.gz) stop("num_tar.gz==",num_tar.gz," but length(deps)==",length(deps)) -status = function(which="both") { - if (which=="both") { - cat("Installed data.table to be tested against:", - as.character(packageVersion("data.table")), - format(as.POSIXct(packageDescription("data.table")$Packaged, tz="UTC"), tz=""), # local time - "\n") - cat("CRAN:\n"); status("cran") - cat("BIOC:\n"); status("bioc") - cat("TOTAL :", length(deps), "\n\n") - cat("Oldest 00check.log (to check no old stale ones somehow missed):\n") - system("find . 
-name '00check.log' | xargs ls -lt | tail -1") - cat("\n") - tt = length(system('ps -aux | grep "parallel.*R.* CMD check"', intern=TRUE))>2L - cat("parallel R CMD check is ", if(tt)"" else "not ", "running\n",sep="") - if (file.exists("/tmp/started.flag")) { - # system("ls -lrt /tmp/*.flag") - tt = as.POSIXct(file.info(c("/tmp/started.flag","/tmp/finished.flag"))$ctime) - if (is.na(tt[2])) { tt[2] = Sys.time(); cat("Has been running for "); } - else cat("Ran for "); - cat(round(diff(as.numeric(tt))/60, 1), "mins\n") - } - return(invisible()) - } - if (which=="cran") deps = deps[-grep("bioc",avail[deps,"Repository"])] - if (which=="bioc") deps = deps[grep("bioc",avail[deps,"Repository"])] +status0 = function(bioc=FALSE) { + deps = deps[grep("bioc", avail[deps,"Repository"], invert=!bioc)] x = unlist(sapply(deps, function(x) { fn = paste0("./",x,".Rcheck/00check.log") if (file.exists(fn)) { @@ -145,7 +147,32 @@ status = function(which="both") { if (length(ns)) paste0("NOT STARTED : ",paste(sort(names(x)[head(ns,20)]),collapse=" "), if(length(ns)>20)paste(" +",length(ns)-20,"more"), "\n"), "\n" ) - assign(paste0(".fail.",which), c(sort(names(x)[e]), sort(names(x)[w])), envir=.GlobalEnv) + assign(if (bioc) ".fail.bioc" else ".fail.cran", c(sort(names(x)[e]), sort(names(x)[w])), envir=.GlobalEnv) + invisible() +} + +status = function(bioc=FALSE) { + cat("Installed data.table to be tested against:", + as.character(packageVersion("data.table")), + format(as.POSIXct(packageDescription("data.table")$Packaged, tz="UTC"), tz=""), # local time + "\nCRAN:\n") + status0() + if (bioc) { + cat("BIOC:\n"); status0(bioc=TRUE) + cat("TOTAL :", length(deps), "\n\n") + } + cat("Oldest 00check.log (to check no old stale ones somehow missed):\n") + system("find . -name '00check.log' | xargs ls -lt | tail -1") + cat("\n") + tt = length(system('ps -aux | grep "parallel.*R.* CMD check"', intern=TRUE))>2L + cat("parallel R CMD check is ", if(tt)"" else "not ", "running\n",sep="") + if (file.exists("/tmp/started.flag")) { + # system("ls -lrt /tmp/*.flag") + tt = as.POSIXct(file.info(c("/tmp/started.flag","/tmp/finished.flag"))$ctime) + if (is.na(tt[2])) { tt[2] = Sys.time(); cat("Has been running for "); } + else cat("Ran for "); + cat(round(diff(as.numeric(tt))/60, 1), "mins\n") + } invisible() } @@ -175,7 +202,7 @@ run = function(pkgs=NULL) { } else { pkgs = NULL if (which=="not.started") pkgs = deps[!file.exists(paste0("./",deps,".Rcheck"))] # those that haven't run - if (which %in% c("cran.fail","both.fail")) pkgs = union(pkgs, .fail.cran) # .fail.* were written to .GlobalEnv by status() + if (which %in% c("cran.fail","both.fail")) pkgs = union(pkgs, .fail.cran) # .fail.* were written to .GlobalEnv by status0() if (which %in% c("bioc.fail","both.fail")) pkgs = union(pkgs, .fail.bioc) } } From bd2fa143121584506578a2262eeb9d57382cdc0a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 3 Dec 2020 13:16:44 -0700 Subject: [PATCH 128/588] .dev-only: revdep.R refinement --- .dev/CRAN_Release.cmd | 3 ++- .dev/revdep.R | 48 ++++++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index ce73aa84fe..3c77280456 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -524,6 +524,7 @@ sudo apt-get -y install libzmq3-dev # for rzmq sudo apt-get -y install libimage-exiftool-perl # for camtrapR sudo apt-get -y install parallel # for revdepr.R sudo apt-get -y install pandoc-citeproc # for basecallQC +sudo apt-get -y install 
libquantlib0-dev # for RQuantLib sudo R CMD javareconf # ENDIF @@ -567,7 +568,7 @@ du -k inst/tests # 1.5MB before bzip2 inst/tests/*.Rraw # compress *.Rraw just for release to CRAN; do not commit compressed *.Rraw to git du -k inst/tests # 0.75MB after R CMD build . -R CMD check data.table_1.13.2.tar.gz --as-cran +R CMD check data.table_1.13.4.tar.gz --as-cran # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # diff --git a/.dev/revdep.R b/.dev/revdep.R index 558b579683..1de4b0480c 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -10,6 +10,13 @@ stopifnot(identical(length(.libPaths()), 2L)) # revdeplib (writeable by me) tt = file.info(.libPaths())[,"uname"] stopifnot(identical(length(tt), 2L)) stopifnot(tt[1L]==Sys.info()["user"]) +if (grepl("devel", .libPaths()[2L])) { + stopifnot(tt[2L]==Sys.info()["user"]) + R = "~/build/R-devel/bin/R" # would use Rdevel alias but the bash alias doesn't work from system() +} else { + stopifnot(tt[2L]=="root") + R = "R" # R-release +} stopifnot(tt[2L] %in% c("root",Sys.info()["user"])) # root when using default R-release, user when using R-devel stopifnot(identical(.libPaths()[1], getwd())) stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"false")) @@ -22,6 +29,8 @@ if (!grepl("^CFLAGS=-O[0-3]$", cflags)) { } options(repos = c("CRAN"=c("http://cloud.r-project.org"))) +options(warn=2) # stop on any warnings. Otherwise difficult to trace knock-on effects can build up. Keep improving this script so that no warnings occur. +options(timeout=3600) # to download BSgenome.Hsapiens.UCSC.hg19 (677GB) which is suggested (so needed by R CMD check) by CRAN package CNVScope which imports data.table # The alias sets R_PROFILE_USER so that this script runs on R starting up, leaving prompt running. # But if we don't unset it now, anything else from now on that does something like system("R CMD INSTALL") (e.g. update.packages() @@ -32,12 +41,8 @@ if (is.null(utils::old.packages(.libPaths()[2]))) { cat("All", length(dir(.libPaths()[2])), "recommended packages supplied with R in", .libPaths()[2], "are the latest version\n") } else { cat("Some recommended packages supplied with R need to be updated ...\n") - if (tt[2L]=="root") { - system(paste0("sudo R -e \"utils::update.packages('",.libPaths()[2],"', ask=TRUE, checkBuilt=TRUE)\"")) - } else { - system(paste0("~/build/R-devel/bin/R -e \"utils::update.packages('",.libPaths()[2],"', ask=TRUE, checkBuilt=TRUE)\"")) - # the Rdevel bash alias doesn't work from system() - } + system(paste0(if(R=="R")"sudo ", R, " -e \"utils::update.packages('",.libPaths()[2],"', ask=TRUE, checkBuilt=TRUE)\"")) + # old.packages was called first, to avoid entering password for sudo if, as is most often the case, all recommended packages are already to date } require(utils) # only base is loaded when R_PROFILE_USER runs @@ -58,11 +63,11 @@ update.packages(ask=FALSE, checkBuilt=TRUE) # BiocManager::valid() # avail = available.packages(repos=BiocManager::repositories()) # includes CRAN at the end from getOption("repos"). And ensure latest Bioc version is in repo path here. 
-avail = available.packages() # uses getOption("repos") which was set above +avail = available.packages() # uses getOption("repos") which was set above to CRAN deps = tools::package_dependencies("data.table", db=avail, which="all", reverse=TRUE, recursive=FALSE)[[1]] # exclude = c("TCGAbiolinks") # too long (>30mins): https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/240 # deps = deps[-match(exclude, deps)] -table(avail[deps,"Repository"]) +table(avail[deps,"Repository"], dnn=NULL) old = 0 new = 0 if (basename(.libPaths()[1]) != "revdeplib") stop("Must start R with exports as above") @@ -70,10 +75,11 @@ for (p in deps) { fn = paste0(p, "_", avail[p,"Version"], ".tar.gz") if (!file.exists(fn) || identical(tryCatch(packageVersion(p), error=function(e)FALSE), FALSE) || - packageVersion(p) != avail[p,"Version"]) { + packageVersion(p) != avail[p,"Version"]) { + cat("\n**** Installing revdep:", p, "\n") system(paste0("rm -rf ", p, ".Rcheck")) # Remove last check (of previous version) to move its status() to not yet run - install.packages(p, dependencies=TRUE) # repos=BiocManager::repositories() used to be here which includes CRAN too + install.packages(p, dependencies=TRUE, repos=BiocManager::repositories()) # some CRAN packages import Bioc packages; e.g. wilson imports DESeq2 # To install its dependencies. The package itsef is installed superfluously here because the tar.gz will be passed to R CMD check. # If we did download.packages() first and then passed that tar.gz to install.packages(), repos= is set to NULL when installing from # local file, so dependencies=TRUE wouldn't know where to get the dependencies. Hence usig install.packages first with repos= set. @@ -86,19 +92,19 @@ for (p in deps) { } } cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), "\n") -update.packages(checkBuilt=TRUE) # double-check all dependencies are latest too; again repos=BiocManager::repositories() used to be here +update.packages(checkBuilt=TRUE, repos=BiocManager::repositories()) # include bioc for CRAN packages which import Bioc packages; e.g. 
wilson imports DESeq2 cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") -cat("Installed packages built using:\n") +cat("Previously installed packages were built using:\n") x = installed.packages() -drop(table(x[,"Built"])) # manually inspect to ensure all built with this x.y release of R +table(x[,"Built"], dnn=NULL) # manually inspect to ensure all built with this x.y release of R if (FALSE) { # if not, run this manually replacing "4.0.0" appropriately for (p in rownames(x)[x[,"Built"]=="4.0.0"]) { - install.packages(p) # repos=BiocManager::repositories() + install.packages(p, repos=BiocManager::repositories()) } # warnings may suggest many of them were removed from CRAN, so remove the remaining from revdeplib to be clean x = installed.packages() remove.packages(rownames(x)[x[,"Built"]=="4.0.0"]) - drop(table(installed.packages()[,"Built"])) # check again to make sure all built in current R-devel x.y version + table(installed.packages()[,"Built"], dnn=NULL) # check again to make sure all built in current R-devel x.y version } # Remove the tar.gz no longer needed : @@ -138,11 +144,11 @@ status0 = function(bioc=FALSE) { ok = setdiff( grep("OK",x), c(e,w,n) ) r = grep("RUNNING",x) ns = grep("NOT STARTED", x) - cat(" ERROR :",sprintf("%3d",length(e)),":",paste(sort(names(x)[e])),"\n", - "WARNING :",sprintf("%3d",length(w)),":",paste(sort(names(x)[w])),"\n", - "NOTE :",sprintf("%3d",length(n)),"\n", #":",paste(sort(names(x)[n])),"\n", - "OK :",sprintf("%3d",length(ok)),"\n", - "TOTAL :",length(e)+length(w)+length(n)+length(ok),"/",length(deps),"\n", + cat(" ERROR :",sprintf("%4d",length(e)),":",paste(sort(names(x)[e])),"\n", + "WARNING :",sprintf("%4d",length(w)),":",paste(sort(names(x)[w])),"\n", + "NOTE :",sprintf("%4d",length(n)),"\n", #":",paste(sort(names(x)[n])),"\n", + "OK :",sprintf("%4d",length(ok)),"\n", + "TOTAL :",sprintf("%4d",length(e)+length(w)+length(n)+length(ok)),"/",length(deps),"\n", if (length(r)) paste0("RUNNING : ",paste(sort(names(x)[r]),collapse=" "),"\n"), if (length(ns)) paste0("NOT STARTED : ",paste(sort(names(x)[head(ns,20)]),collapse=" "), if(length(ns)>20)paste(" +",length(ns)-20,"more"), "\n"), "\n" @@ -155,7 +161,7 @@ status = function(bioc=FALSE) { cat("Installed data.table to be tested against:", as.character(packageVersion("data.table")), format(as.POSIXct(packageDescription("data.table")$Packaged, tz="UTC"), tz=""), # local time - "\nCRAN:\n") + "\n\nCRAN:\n") status0() if (bioc) { cat("BIOC:\n"); status0(bioc=TRUE) From fe7fa68b261ec046e8a9d32f97f3e8249de6df5b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 3 Dec 2020 17:57:48 -0700 Subject: [PATCH 129/588] .dev-only: revdep.R refinement --- .dev/.bash_aliases | 2 +- .dev/revdep.R | 30 ++++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 01c3de23ad..450a765a4f 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -15,7 +15,7 @@ alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' -alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false && export R_DEFAULT_INTERNET_TIMEOUT=300' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true && export 
R_DEFAULT_INTERNET_TIMEOUT=3600' alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' # use ~/build/R-devel/bin/R at the end of revdepr to use R-devel instead of R-release. # If so, doing a `rm -rf *` in revdeplib first to rebuild everything is easiest way to avoid potential problems later. A full rebuild is a good idea periodically anyway. Packages in diff --git a/.dev/revdep.R b/.dev/revdep.R index 1de4b0480c..b799f57450 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -2,11 +2,17 @@ # that file for comments. # revdep = reverse first-order dependency; i.e. the CRAN and Bioconductor packages which directly use data.table +Sys.unsetenv("R_PROFILE_USER") +# The alias sets R_PROFILE_USER so that this script runs on R starting up, and leaves the R prompt running. +# But if we don't unset it now, anything else from now on that does something like system("R CMD INSTALL"), e.g. update.packages() +# and BiocManager::install(), will call this script again recursively. + # Check that env variables have been set correctly: # export R_LIBS_SITE=none # export R_LIBS=~/build/revdeplib/ # export _R_CHECK_FORCE_SUGGESTS_=false -stopifnot(identical(length(.libPaths()), 2L)) # revdeplib (writeable by me) and the pre-installed recommended R library (sudo writeable) +stopifnot(identical(length(.libPaths()), 2L)) # revdeplib writeable by me, and the pre-installed recommended R library (sudo writeable) +stopifnot(identical(.libPaths()[1L], getwd())) tt = file.info(.libPaths())[,"uname"] stopifnot(identical(length(tt), 2L)) stopifnot(tt[1L]==Sys.info()["user"]) @@ -17,9 +23,12 @@ if (grepl("devel", .libPaths()[2L])) { stopifnot(tt[2L]=="root") R = "R" # R-release } -stopifnot(tt[2L] %in% c("root",Sys.info()["user"])) # root when using default R-release, user when using R-devel -stopifnot(identical(.libPaths()[1], getwd())) -stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"false")) + +stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"true")) +# _R_CHECK_FORCE_SUGGESTS_=true explicitly in .dev/.bash_aliases +# All suggests should be installed for revdep checking. This avoids problems for some packages for which the attempt to run +# run R CMD check without all suggests can fail due to changed behaviour when some of the suggests aren't available; +# e.g. https://github.com/reimandlab/ActivePathways/issues/14 cflags = system("grep \"^[^#]*CFLAGS\" ~/.R/Makevars", intern=TRUE) cat("~/.R/Makevars contains", cflags, "\n") @@ -30,12 +39,9 @@ if (!grepl("^CFLAGS=-O[0-3]$", cflags)) { options(repos = c("CRAN"=c("http://cloud.r-project.org"))) options(warn=2) # stop on any warnings. Otherwise difficult to trace knock-on effects can build up. Keep improving this script so that no warnings occur. -options(timeout=3600) # to download BSgenome.Hsapiens.UCSC.hg19 (677GB) which is suggested (so needed by R CMD check) by CRAN package CNVScope which imports data.table - -# The alias sets R_PROFILE_USER so that this script runs on R starting up, leaving prompt running. -# But if we don't unset it now, anything else from now on that does something like system("R CMD INSTALL") (e.g. update.packages() -# and BiocManager::install()) will call this script again recursively. -Sys.unsetenv("R_PROFILE_USER") +cat("options()$timeout==", options()$timeout," set by R_DEFAULT_INTERNET_TIMEOUT in .dev/.bash_aliases revdepsh\n",sep="") +# R's default is 60. 
Before Dec 2020, we used 300 but that wasn't enough to download Bioc package BSgenome.Hsapiens.UCSC.hg19 (677GB) which is +# suggested by CRAN package CNVScope which imports data.table. From Dec 2020 we use 3600. if (is.null(utils::old.packages(.libPaths()[2]))) { cat("All", length(dir(.libPaths()[2])), "recommended packages supplied with R in", .libPaths()[2], "are the latest version\n") @@ -243,8 +249,8 @@ log = function(bioc=FALSE, fnam="~/fail.log") { require(BiocManager) # to ensure Bioc version is included in attached packages sessionInfo. It includes the minor version this way; e.g. 1.30.4 cat(capture.output(sessionInfo()), "\n", file=fnam, sep="\n") for (i in x) { - system(paste0("ls | grep '",i,".*tar.gz' >> ",fnam)) - if (i %in% .fail.bioc) { + system(paste0("ls | grep '",i,"_.*tar.gz' >> ",fnam)) + if (bioc && i %in% .fail.bioc) { # for Bioconductor only, now include the git commit and date. Although Bioc dev check status online may show OK : # https://bioconductor.org/checkResults/devel/bioc-LATEST/ # the Bioc package maintainer has to remember to bump the version number otherwise Bioc will not propogate it, From f4515eac8cb3f5659d6e13768cf6ecb45635d3fd Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 4 Dec 2020 23:20:34 -0700 Subject: [PATCH 130/588] .dev-only: stricter treatment of missing suggests --- .dev/CRAN_Release.cmd | 1 + .dev/revdep.R | 64 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 3c77280456..2e61568546 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -525,6 +525,7 @@ sudo apt-get -y install libimage-exiftool-perl # for camtrapR sudo apt-get -y install parallel # for revdepr.R sudo apt-get -y install pandoc-citeproc # for basecallQC sudo apt-get -y install libquantlib0-dev # for RQuantLib +sudo apt-get -y install cargo # for gifski, a suggest of nasoi sudo R CMD javareconf # ENDIF diff --git a/.dev/revdep.R b/.dev/revdep.R index b799f57450..be6e2a2e37 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -10,7 +10,7 @@ Sys.unsetenv("R_PROFILE_USER") # Check that env variables have been set correctly: # export R_LIBS_SITE=none # export R_LIBS=~/build/revdeplib/ -# export _R_CHECK_FORCE_SUGGESTS_=false +# export _R_CHECK_FORCE_SUGGESTS_=true stopifnot(identical(length(.libPaths()), 2L)) # revdeplib writeable by me, and the pre-installed recommended R library (sudo writeable) stopifnot(identical(.libPaths()[1L], getwd())) tt = file.info(.libPaths())[,"uname"] @@ -31,14 +31,19 @@ stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"true")) # e.g. https://github.com/reimandlab/ActivePathways/issues/14 cflags = system("grep \"^[^#]*CFLAGS\" ~/.R/Makevars", intern=TRUE) -cat("~/.R/Makevars contains", cflags, "\n") +cat("~/.R/Makevars contains", cflags, "ok\n") if (!grepl("^CFLAGS=-O[0-3]$", cflags)) { stop("Some packages have failed to install in the past (e.g. processx and RGtk2) when CFLAGS contains -pedandic, -Wall, and similar. ", "So for revdepr keep CFLAGS simple; i.e. -O[0-3] only.") } options(repos = c("CRAN"=c("http://cloud.r-project.org"))) -options(warn=2) # stop on any warnings. Otherwise difficult to trace knock-on effects can build up. Keep improving this script so that no warnings occur. +options(repos = BiocManager::repositories()) +# Some CRAN packages import Bioc packages; e.g. wilson imports DESeq2. So we need to install DESeq2 from Bioc. 
+# BiocManager::repositories() includes CRAN in its result (it appends to getOption("repos"). Using the Bioc function +# ensures the latest Bioc version is in the repo path here (their repos have the version number in the path). + +options(warn=1) # warning at the time so we can more easily see what's going on package by package when we scroll through output cat("options()$timeout==", options()$timeout," set by R_DEFAULT_INTERNET_TIMEOUT in .dev/.bash_aliases revdepsh\n",sep="") # R's default is 60. Before Dec 2020, we used 300 but that wasn't enough to download Bioc package BSgenome.Hsapiens.UCSC.hg19 (677GB) which is # suggested by CRAN package CNVScope which imports data.table. From Dec 2020 we use 3600. @@ -67,10 +72,16 @@ update.packages(ask=FALSE, checkBuilt=TRUE) # require(BiocManager) # BiocManager::install(ask=FALSE, version="devel", checkBuilt=TRUE) # BiocManager::valid() -# avail = available.packages(repos=BiocManager::repositories()) # includes CRAN at the end from getOption("repos"). And ensure latest Bioc version is in repo path here. -avail = available.packages() # uses getOption("repos") which was set above to CRAN -deps = tools::package_dependencies("data.table", db=avail, which="all", reverse=TRUE, recursive=FALSE)[[1]] +avail = available.packages() # includes CRAN and Bioc, from getOption("repos") set above + +avail = avail[-match("cplexAPI",rownames(avail)),] +# cplexAPI is suggested by revdeps ivmte and prioritizr. I haven't succeeded to install IBM ILOG CPLEX which requires a license, +# so consider cplexAPI not available when resolving missing suggests at the end of status(). + +deps = tools::package_dependencies("data.table", + db = available.packages(repos=getOption("repos")["CRAN"]), # just CRAN revdeps though (not Bioc) from October 2020 + which="all", reverse=TRUE, recursive=FALSE)[[1]] # exclude = c("TCGAbiolinks") # too long (>30mins): https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/240 # deps = deps[-match(exclude, deps)] table(avail[deps,"Repository"], dnn=NULL) @@ -85,7 +96,7 @@ for (p in deps) { cat("\n**** Installing revdep:", p, "\n") system(paste0("rm -rf ", p, ".Rcheck")) # Remove last check (of previous version) to move its status() to not yet run - install.packages(p, dependencies=TRUE, repos=BiocManager::repositories()) # some CRAN packages import Bioc packages; e.g. wilson imports DESeq2 + install.packages(p, dependencies=TRUE) # To install its dependencies. The package itsef is installed superfluously here because the tar.gz will be passed to R CMD check. # If we did download.packages() first and then passed that tar.gz to install.packages(), repos= is set to NULL when installing from # local file, so dependencies=TRUE wouldn't know where to get the dependencies. Hence usig install.packages first with repos= set. @@ -98,14 +109,14 @@ for (p in deps) { } } cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), "\n") -update.packages(checkBuilt=TRUE, repos=BiocManager::repositories()) # include bioc for CRAN packages which import Bioc packages; e.g. 
wilson imports DESeq2 +update.packages(checkBuilt=TRUE) cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") cat("Previously installed packages were built using:\n") x = installed.packages() table(x[,"Built"], dnn=NULL) # manually inspect to ensure all built with this x.y release of R if (FALSE) { # if not, run this manually replacing "4.0.0" appropriately for (p in rownames(x)[x[,"Built"]=="4.0.0"]) { - install.packages(p, repos=BiocManager::repositories()) + install.packages(p) } # warnings may suggest many of them were removed from CRAN, so remove the remaining from revdeplib to be clean x = installed.packages() @@ -185,10 +196,40 @@ status = function(bioc=FALSE) { else cat("Ran for "); cat(round(diff(as.numeric(tt))/60, 1), "mins\n") } + + # Now deal with Suggests that are not available. Could have been removed from CRAN/Bioc, or are not installing for some reason like system library not installed. + tt = system("find . -name '00check.log' -exec grep -zl 'ERROR.Packages* suggested but not available' {} \\;", intern=TRUE) + if (length(tt)) { + tt = sort(substring(tt, 3L, nchar(tt)-nchar(".Rcheck/00check.log"))) + cat("\n", length(tt), " packages with unavailable suggests. The missing suggests might have been removed from CRAN/Bioc, or they might be failing to install.\n", sep="") + cat(paste(tt,collapse=", "),"\n") + installed = installed.packages() + all_sugg_unavail = c() + for (pkg in tt) { + sugg = strsplit(gsub("\n","",avail[pkg,"Suggests"]), split=",")[[1L]] + sugg = gsub("^ ","",sugg) + sugg = gsub(" [(].+[)]","",sugg) + miss = sugg[!sugg %in% rownames(installed)] + if (!length(miss)) stop("00check.log for ",pkg," states that some of its suggests are not installed, but they all appear to be.") + cat("\n", pkg, " is missing ",paste(miss,collapse=","), sep="") + if (any(tt <- miss %in% rownames(avail))) { + cat("; some are available, installing ...\n") + install.packages(miss[which(tt)]) # careful not to ask for unavailable packages here, to avoid the warnings we already know they aren't available + } else { + cat("; all unavailable on CRAN/Bioc\n") + all_sugg_unavail = c(all_sugg_unavail, pkg) + } + } + if (length(all_sugg_unavail)) { + cat('\nPackages for which all their missing suggests are not available, try:\n', + ' run("',paste(all_sugg_unavail,collapse=" "),'", R_CHECK_FORCE_SUGGESTS=FALSE)\n', sep="") + } + # Otherwise, inspect manually each result in fail.log written by log() + } invisible() } -run = function(pkgs=NULL) { +run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE) { cat("Installed data.table to be tested against:",as.character(packageVersion("data.table")),"\n") if (length(pkgs)==1) pkgs = strsplit(pkgs, split="[, ]")[[1]] if (anyDuplicated(pkgs)) stop("pkgs contains dups") @@ -229,7 +270,8 @@ run = function(pkgs=NULL) { cat("Proceed? 
(ctrl-c or enter)\n") scan(quiet=TRUE) if (!identical(pkgs,"_ALL_")) for (i in pkgs) system(paste0("rm -rf ./",i,".Rcheck")) - cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 parallel --max-procs 50% ",R," CMD check") + SUGG = paste0("_R_CHECK_FORCE_SUGGESTS_=",tolower(R_CHECK_FORCE_SUGGESTS)) + cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 ",SUGG," parallel --max-procs 50% ",R," CMD check") # TZ='UTC' because some packages have failed locally for me but not on CRAN or for their maintainer, due to sensitivity of tests to timezone if (as.integer(system("ps -e | grep perfbar | wc -l", intern=TRUE)) < 1) system("perfbar",wait=FALSE) system("touch /tmp/started.flag ; rm -f /tmp/finished.flag") From 616dfc5093b0f7eea6aafbc1ecc744a541088b12 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 5 Dec 2020 00:01:29 -0700 Subject: [PATCH 131/588] .dev-only: removed superfluous line in status() output --- .dev/revdep.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index be6e2a2e37..86c74b9212 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -201,8 +201,6 @@ status = function(bioc=FALSE) { tt = system("find . -name '00check.log' -exec grep -zl 'ERROR.Packages* suggested but not available' {} \\;", intern=TRUE) if (length(tt)) { tt = sort(substring(tt, 3L, nchar(tt)-nchar(".Rcheck/00check.log"))) - cat("\n", length(tt), " packages with unavailable suggests. The missing suggests might have been removed from CRAN/Bioc, or they might be failing to install.\n", sep="") - cat(paste(tt,collapse=", "),"\n") installed = installed.packages() all_sugg_unavail = c() for (pkg in tt) { From 06dd15d79dc9ae3f8111ac1d120bb9babe3ff69d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 5 Dec 2020 01:26:12 -0700 Subject: [PATCH 132/588] urls (#4833) --- DESCRIPTION | 2 +- NEWS.md | 2 +- README.md | 2 +- man/fread.Rd | 3 +-- man/fwrite.Rd | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 913b9e74a9..4340f016aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,7 @@ Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (> SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE -URL: http://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table +URL: https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table BugReports: https://github.com/Rdatatable/data.table/issues VignetteBuilder: knitr ByteCompile: TRUE diff --git a/NEWS.md b/NEWS.md index 0f91bd4af0..d9201bd9b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -862,7 +862,7 @@ has a better chance of working on Mac. 12. `DT[..., .SDcols=integer()]` failed with `.SDcols is numeric but has both +ve and -ve indices`, [#1789](https://github.com/Rdatatable/data.table/issues/1789) and [#3185](https://github.com/Rdatatable/data.table/issues/3185). It now functions as `.SDcols=character()` has done and creates an empty `.SD`. Thanks to Gabor Grothendieck and Hugh Parsonage for reporting. A related issue with empty `.SDcols` was fixed in development before release thanks to Kun Ren's testing, [#3211](https://github.com/Rdatatable/data.table/issues/3211). -13. 
Multithreaded stability should be much improved with R 3.5+. Many thanks to Luke Tierney for pinpointing a memory issue with package `constellation` caused by `data.table` and his advice, [#3165](https://github.com/Rdatatable/data.table/issues/3165). Luke also added an extra check to R-devel when compiled with `--enable-strict-barrier`. The test suite is run through latest daily R-devel after every commit as usual, but now with `--enable-strict-barrier` on too via GitLab Pipelines ("Extra" badge at the top of the data.table [homepage](http://r-datatable.com)) thanks to Jan Gorecki. +13. Multithreaded stability should be much improved with R 3.5+. Many thanks to Luke Tierney for pinpointing a memory issue with package `constellation` caused by `data.table` and his advice, [#3165](https://github.com/Rdatatable/data.table/issues/3165). Luke also added an extra check to R-devel when compiled with `--enable-strict-barrier`. The test suite is run through latest daily R-devel after every commit as usual, but now with `--enable-strict-barrier` on too via GitLab CI ("Extra" badge on the `data.table` homepage) thanks to Jan Gorecki. 14. Fixed an edge-case bug of platform-dependent output of `strtoi("", base = 2L)` on which `groupingsets` had relied, [#3267](https://github.com/Rdatatable/data.table/issues/3267). diff --git a/README.md b/README.md index 01e7b4aa15..da7a902b24 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# data.table +# data.table [![CRAN status](https://cranchecks.info/badges/flavor/release/data.table)](https://cran.r-project.org/web/checks/check_results_data.table.html) diff --git a/man/fread.Rd b/man/fread.Rd index 6419247617..fda9d166c8 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -128,8 +128,7 @@ When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \c Background :\cr \url{https://cran.r-project.org/doc/manuals/R-data.html}\cr \url{https://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r}\cr -\url{http://www.biostat.jhsph.edu/~rpeng/docs/R-large-tables.html}\cr -\url{https://www.cerebralmastication.com/2009/11/loading-big-data-into-r/}\cr +\url{https://cerebralmastication.com/2009/11/loading-big-data-into-r/)}\cr \url{https://stackoverflow.com/questions/9061736/faster-than-scan-with-rcpp}\cr \url{https://stackoverflow.com/questions/415515/how-can-i-read-and-manipulate-csv-file-data-in-c}\cr \url{https://stackoverflow.com/questions/9352887/strategies-for-reading-in-csv-files-in-pieces}\cr diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 6f7682b98b..f784b6bc3b 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -61,7 +61,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{verbose}{Be chatty and report timings?} } \details{ -\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. 
+\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://www.h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. To save space, \code{fwrite} prefers to write wide numeric values in scientific notation -- e.g. \code{10000000000} takes up much more space than \code{1e+10}. Most file readers (e.g. \code{\link{fread}}) understand scientific notation, so there's no fidelity loss. Like in base R, users can control this by specifying the \code{scipen} argument, which follows the same rules as \code{\link[base]{options}('scipen')}. \code{fwrite} will see how much space a value will take to write in scientific vs. decimal notation, and will only write in scientific notation if the latter is more than \code{scipen} characters wider. For \code{10000000000}, then, \code{1e+10} will be written whenever \code{scipen<6}. From 25132daa4255589e49ecea831319fbb700163242 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 5 Dec 2020 01:49:01 -0700 Subject: [PATCH 133/588] .dev-only: revdep.R:status() tweak --- .dev/revdep.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 86c74b9212..bacf0cf53a 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -208,8 +208,12 @@ status = function(bioc=FALSE) { sugg = gsub("^ ","",sugg) sugg = gsub(" [(].+[)]","",sugg) miss = sugg[!sugg %in% rownames(installed)] - if (!length(miss)) stop("00check.log for ",pkg," states that some of its suggests are not installed, but they all appear to be.") - cat("\n", pkg, " is missing ",paste(miss,collapse=","), sep="") + cat("\n",pkg,sep="") + if (!length(miss)) { + cat(" 00check.log states that some of its suggests are not installed, but they all appear to be. Inspect and rerun.\n") + next + } + cat(" is missing",paste(miss,collapse=",")) if (any(tt <- miss %in% rownames(avail))) { cat("; some are available, installing ...\n") install.packages(miss[which(tt)]) # careful not to ask for unavailable packages here, to avoid the warnings we already know they aren't available From ba4c2e951edb14709601b2790c14563cb70090d6 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 5 Dec 2020 03:37:37 -0700 Subject: [PATCH 134/588] cater for latest change in R-devel today which breaks test 2022 (#4835) --- .dev/CRAN_Release.cmd | 2 +- R/test.data.table.R | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 2e61568546..b560633218 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -301,7 +301,7 @@ tar xvf R-devel.tar.gz mv R-devel R-devel-strict-clang tar xvf R-devel.tar.gz -cd R-devel # used for revdep testing: .dev/revdep.R. +cd R-devel # may be used for revdep testing: .dev/revdep.R. 
# important to change directory name before building not after because the path is baked into the build, iiuc ./configure CFLAGS="-O2 -Wall -pedantic" make diff --git a/R/test.data.table.R b/R/test.data.table.R index 92de111915..1cd27ed190 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -425,7 +425,8 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no setattr(xc,"index",NULL) # too onerous to create test RHS with the correct index as well, just check result setattr(yc,"index",NULL) if (identical(xc,yc) && identical(key(x),key(y))) return(invisible(TRUE)) # check key on original x and y because := above might have cleared it on xc or yc - if (isTRUE(all.equal.result<-all.equal(xc,yc)) && identical(key(x),key(y)) && + if (isTRUE(all.equal.result<-all.equal(xc,yc,check.environments=FALSE)) && identical(key(x),key(y)) && + # ^^ to pass tests 2022.[1-4] in R-devel from 5 Dec 2020, #4835 identical(vapply_1c(xc,typeof), vapply_1c(yc,typeof))) return(invisible(TRUE)) } } From 7f441778121b5ec56a302e88c5352c80806bdd2f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 5 Dec 2020 05:03:11 -0700 Subject: [PATCH 135/588] bracket in url, follow-up to #4833 --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index fda9d166c8..cd14c0fbd8 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -128,7 +128,7 @@ When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \c Background :\cr \url{https://cran.r-project.org/doc/manuals/R-data.html}\cr \url{https://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r}\cr -\url{https://cerebralmastication.com/2009/11/loading-big-data-into-r/)}\cr +\url{https://cerebralmastication.com/2009/11/loading-big-data-into-r/}\cr \url{https://stackoverflow.com/questions/9061736/faster-than-scan-with-rcpp}\cr \url{https://stackoverflow.com/questions/415515/how-can-i-read-and-manipulate-csv-file-data-in-c}\cr \url{https://stackoverflow.com/questions/9352887/strategies-for-reading-in-csv-files-in-pieces}\cr From 4be2b9ddb616974df83b32caa5948770d392c5e8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 6 Dec 2020 21:19:03 -0700 Subject: [PATCH 136/588] .dev-only: added GITHUB_PAT to solve too-many-requests, #4832. Thanks to Dirk. --- .dev/CRAN_Release.cmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index b560633218..1bd7fb6a39 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -205,6 +205,9 @@ test.data.table() install.packages("xml2") # to check the 150 URLs in NEWS.md under --as-cran below q("no") R CMD build . +export GITHUB_PAT="f1c.. github personal access token ..7ad" +# avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. +# Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. 
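# One possible refinement (a sketch only; the ~/.github_pat path is hypothetical, not part of the procedure above):
# keep the token in a file outside the repo and read it at export time, so the literal token never needs to be
# pasted into this script or the shell history:
#   export GITHUB_PAT="$(cat ~/.github_pat)"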
R CMD check data.table_1.13.3.tar.gz --as-cran R CMD INSTALL data.table_1.13.3.tar.gz --html From 0a16162d6ea03282a2268fc9b5ba1068623e890b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 6 Dec 2020 21:31:07 -0700 Subject: [PATCH 137/588] .dev-only: increase R's internet timeout always not just in revdep testing --- .dev/.bash_aliases | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 450a765a4f..504df41504 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -15,7 +15,7 @@ alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' -alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true && export R_DEFAULT_INTERNET_TIMEOUT=3600' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' # use ~/build/R-devel/bin/R at the end of revdepr to use R-devel instead of R-release. # If so, doing a `rm -rf *` in revdeplib first to rebuild everything is easiest way to avoid potential problems later. A full rebuild is a good idea periodically anyway. Packages in @@ -28,3 +28,6 @@ export R_PROFILE_USER='~/.Rprofile' # even when starting R in ~/GitHub/data.table # Matt's ~/.Rprofile as a link to ~/GitHub/data.table/.dev/.Rprofile +export R_DEFAULT_INTERNET_TIMEOUT=3600 +# increase from R's default 60, always not just in revdep testing, to help --as-cran + From b6a85bc3d5e56e362a7bf49dfe83f3f0e0c5768c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 7 Dec 2020 19:25:35 -0700 Subject: [PATCH 138/588] y values independent of base::order on data.frame (#4839) --- inst/tests/tests.Rraw | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a54962965e..d26f7ff509 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5154,10 +5154,11 @@ test(1318.2, DT[, eval(meanExpr), by = aa], DT[, mean(bb, na.rm=TRUE), by=aa]) test(1318.3, DT[, list(mySum = eval(sumExpr), myMean = eval(meanExpr)), by = aa], DT[, list(mySum=sum(bb, na.rm=TRUE), myMean=mean(bb, na.rm=TRUE)), by=aa]) # get DT[order(.)] to make sense. In v1.12.4 these tests were changed to not be 100% consistent with base in -# cases where the base R behaviour doesn't make sense, #696 +# cases where the base R behaviour doesn't make sense, #696. In v1.13.4, more y values here were made +# independent of base R's order on data.frame when that was made an error in R-devel, #4838. 
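# For the DT defined just below, b = 8:5 is strictly decreasing and c = letters[4:1] is reverse-alphabetical,
# so ordering by either column simply reverses the rows; hence DT[4:1] is the expected result in tests 1319.1-1319.4.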
DT <- data.table(a = 1:4, b = 8:5, c=letters[4:1]) -test(1319.1, DT[order(DT[, "b", with=FALSE])], DT[base::order(DT[, "b", with=FALSE])]) -test(1319.2, DT[order(DT[, "c", with=FALSE])], DT[base::order(DT[, "c", with=FALSE])]) +test(1319.1, DT[order(DT[, "b", with=FALSE])], DT[4:1]) # DT[base::order(DT[, "b", with=FALSE])]) +test(1319.2, DT[order(DT[, "c", with=FALSE])], DT[4:1]) # DT[base::order(DT[, "c", with=FALSE])]) test(1319.3, DT[order(DT[, c("b","c"), with=FALSE])], DT[4:1]) # DT[base::order(DT[, c("b","c"), with=FALSE])]) test(1319.4, DT[order(DT[, c("c","b"), with=FALSE])], DT[4:1]) # DT[base::order(DT[, c("c","b"), with=FALSE])]) test(1319.5, DT[order(DT[, "b", with=FALSE], DT[, "a", with=FALSE])], error="Column 1 passed to [f]order is type 'list', not yet supported") From 5b04b0ef9691f71fcb9f13658c0f7b5b30944a1f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 8 Dec 2020 16:26:58 -0700 Subject: [PATCH 139/588] 1.13.4 on CRAN. Bump to 1.13.5 --- .dev/CRAN_Release.cmd | 30 +++++++++++++++--------------- DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 9 +++++++-- src/init.c | 2 +- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 1bd7fb6a39..2221dff449 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -208,15 +208,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. -R CMD check data.table_1.13.3.tar.gz --as-cran -R CMD INSTALL data.table_1.13.3.tar.gz --html +R CMD check data.table_1.13.5.tar.gz --as-cran +R CMD INSTALL data.table_1.13.5.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.13.3.tar.gz +R CMD check data.table_1.13.5.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -233,9 +233,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. -PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.3.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.5.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.3.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.5.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -267,7 +267,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.13.3.tar.gz +R310 CMD INSTALL ./data.table_1.13.5.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -279,7 +279,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . 
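# Optional extra check (a suggestion, not in the original checklist): with the no-OpenMP build installed below,
# data.table should run single-threaded, e.g.
#   R -q -e 'library(data.table); getDTthreads(verbose=TRUE)'   # should show only 1 thread available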
-R CMD INSTALL data.table_1.13.3.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.13.5.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -287,7 +287,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.13.3.tar.gz +R CMD check data.table_1.13.5.tar.gz ##################################################### @@ -337,8 +337,8 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.13.3.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.13.3.tar.gz +Rdevel-strict-gcc CMD INSTALL data.table_1.13.5.tar.gz +Rdevel-strict-clang CMD INSTALL data.table_1.13.5.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here Rdevel-strict-gcc Rdevel-strict-clang # repeat below with clang and gcc @@ -379,7 +379,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.13.3.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.13.5.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -417,7 +417,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.13.3.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.13.5.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -602,8 +602,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.1 to 1.13.3, and 1.13.0 to 1.13.2 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.3 to 1.13.5, and 1.13.2 to 1.13.4 (e.g. in step 8 and 9 below) 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.13.2 on CRAN. Bump to 1.13.3" -9. Take sha from step 8 and run `git tag 1.13.2 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.13.2` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.13.4 on CRAN. Bump to 1.13.5" +9. 
Take sha from step 8 and run `git tag 1.13.4 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.13.4` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/DESCRIPTION b/DESCRIPTION index 4340f016aa..44894189d7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.13.3 +Version: 1.13.5 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/Makefile b/Makefile index 46061efeeb..dd6e899967 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.13.3.tar.gz + $(RM) data.table_1.13.5.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.13.3.tar.gz + $(R) CMD INSTALL data.table_1.13.5.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.3.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.5.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index d9201bd9b3..b7b78b68ad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,9 +2,14 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.3](https://github.com/Rdatatable/data.table/milestone/21) (in development) +# data.table [v1.13.5](https://github.com/Rdatatable/data.table/milestone/22) (in development) -## NEW FEATURES +## BUG FIXES + +## NOTES + + +# data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) ## BUG FIXES diff --git a/src/init.c b/src/init.c index 84933013e7..962d45c83a 100644 --- a/src/init.c +++ b/src/init.c @@ -414,6 +414,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.13.3"))); + return(ScalarString(mkChar("1.13.5"))); } From 57020e2ed61d140ccb0c0604f2a322b6c8abae72 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 9 Dec 2020 01:37:40 +0200 Subject: [PATCH 140/588] follow up #4835 another change in R-devel (#4840) --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 1cd27ed190..67e4ce6d85 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -425,7 +425,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no setattr(xc,"index",NULL) # too onerous to create test RHS with the correct index as well, just check result setattr(yc,"index",NULL) if (identical(xc,yc) && identical(key(x),key(y))) return(invisible(TRUE)) # check key on original x and y because := above might have cleared it on xc or yc - if (isTRUE(all.equal.result<-all.equal(xc,yc,check.environments=FALSE)) && identical(key(x),key(y)) && + if (isTRUE(all.equal.result<-all.equal(xc,yc,check.environment=FALSE)) && identical(key(x),key(y)) && # ^^ to pass tests 2022.[1-4] in R-devel from 5 Dec 2020, #4835 identical(vapply_1c(xc,typeof), vapply_1c(yc,typeof))) return(invisible(TRUE)) } From 01943ab424239dd328a3cf0cadc843eeb3b2d58b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 9 Dec 2020 00:03:38 -0700 Subject: [PATCH 141/588] NEWS-only: refined news item about monotonic:dynamic --- NEWS.md | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index b7b78b68ad..472cc52dca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,7 +15,7 @@ 1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. -2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the unmodified dynamic schedule. Although never guaranteed by the standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). In all cases, `fsort` now checks monotonic allocation and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report and we managed to reproduce it. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, and if so `fsort` will check that and work fine. If not, the compiler might accept `-fopenmp-version=45`, otherwise you will need to upgrade compiler. https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. +2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the dynamic schedule. Although never guaranteed by the OpenMP standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, so `fsort` now checks that threads are receiving iterations monotonically and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. OpenMP 4.5 may be enabled in some compilers using `-fopenmp-version=45`. Otherwise, if you need to upgrade compiler, https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. 3. 
Columns containing functions that don't inherit the class `'function'` would fail to group, [#4814](https://github.com/Rdatatable/data.table/issues/4814). Thanks @mb706 for reporting, @ecoRoland2 for helping investigate, and @Coorsaa for a follow-up example involving environments. From d405d724968fc30d6d5459ddab1d45d43fde1e29 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Thu, 10 Dec 2020 14:10:09 +0800 Subject: [PATCH 142/588] should use the long int 2 to avoid integer overflow (#4297) --- NEWS.md | 2 ++ src/gsumm.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 472cc52dca..bc5a49b639 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## BUG FIXES +1. `gforce()` now allocates the correct amount of memory for the data.table with more than 1e9 rows, [#4295](https://github.com/Rdatatable/data.table/issues/4295) and [#4818](https://github.com/Rdatatable/data.table/issues/4818). Before the fixing, data.table could throw an error "Failed to allocate counts or TMP when assigning g in gforce", due to an integer overflow when `malloc()` memories. Thanks to @renkun-ken and @jangorecki for reporting and @shrektan for fixing. + ## NOTES diff --git a/src/gsumm.c b/src/gsumm.c index 372ae59440..ed34e76207 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -112,7 +112,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { int highSize = ((nrow-1)>>shift) + 1; //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d shift=%d nBatch=%d\n"), highSize, nb, shift, nBatch); int *counts = calloc(nBatch*highSize, sizeof(int)); // TODO: cache-line align and make highSize a multiple of 64 - int *TMP = malloc(nrow*2*sizeof(int)); + int *TMP = malloc(nrow*2l*sizeof(int)); // must multiple the long int otherwise overflow may happen, #4295 if (!counts || !TMP ) error(_("Internal error: Failed to allocate counts or TMP when assigning g in gforce")); #pragma omp parallel for num_threads(getDTthreads(nBatch, false)) // schedule(dynamic,1) for (int b=0; b Date: Mon, 14 Dec 2020 22:12:17 -0700 Subject: [PATCH 143/588] avoid struct move on Solaris (#4845) --- NEWS.md | 3 +++ src/fwrite.c | 26 +++++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index bc5a49b639..d6423ce87a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,9 @@ 1. `gforce()` now allocates the correct amount of memory for the data.table with more than 1e9 rows, [#4295](https://github.com/Rdatatable/data.table/issues/4295) and [#4818](https://github.com/Rdatatable/data.table/issues/4818). Before the fixing, data.table could throw an error "Failed to allocate counts or TMP when assigning g in gforce", due to an integer overflow when `malloc()` memories. Thanks to @renkun-ken and @jangorecki for reporting and @shrektan for fixing. +2. `fwrite()`'s mutithreaded `gzip` compression should now work on Solaris, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. 
If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. Other than less elegant internal code needing a comment explaining why it is done that way in this case. + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. For example, any optimizations, such as aligning the set of structures to cache line boundaries, could be performed at the start of the parallel region, not after the parallel for. If anyone finding this news items knows more, please let us know. + ## NOTES diff --git a/src/fwrite.c b/src/fwrite.c index dc18cf4b50..8044ec6ef2 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -553,6 +553,7 @@ void writeCategString(const void *col, int64_t row, char **pch) } int init_stream(z_stream *stream) { + memset(stream, 0, sizeof(z_stream)); // shouldn't be needed, done as part of #4099 to be sure stream->next_in = Z_NULL; stream->zalloc = Z_NULL; stream->zfree = Z_NULL; @@ -857,6 +858,12 @@ void fwriteMain(fwriteMainArgs args) int failed_write = 0; // same. 
could use +ve and -ve in the same code but separate it out to trace Solaris problem, #3931 if (nth>1) verbose=false; // printing isn't thread safe (there's a temporary print in compressbuff for tracing solaris; #4099) + + z_stream thread_streams[nth]; + // VLA on stack should be fine for nth structs; in zlib v1.2.11 sizeof(struct)==112 on 64bit + // not declared inside the parallel region because solaris appears to move the struct in + // memory when the #pragma omp for is entered, which causes zlib's internal self reference + // pointer to mismatch, #4099 #pragma omp parallel num_threads(nth) { @@ -867,19 +874,20 @@ void fwriteMain(fwriteMainArgs args) void *myzBuff = NULL; size_t myzbuffUsed = 0; - z_stream mystream = {0}; + z_stream *mystream = &thread_streams[me]; if (args.is_gzip) { myzBuff = zbuffPool + me*zbuffSize; - if (init_stream(&mystream)) { // this should be thread safe according to zlib documentation + if (init_stream(mystream)) { // this should be thread safe according to zlib documentation failed = true; // # nocov my_failed_compress = -998; // # nocov } - if (verbose) {DTPRINT(_("z_stream for data (%d): "), 1); print_z_stream(&mystream);} + if (verbose) {DTPRINT(_("z_stream for data (%d): "), 1); print_z_stream(mystream);} } #pragma omp for ordered schedule(dynamic) for(int64_t start=0; startmsg!=NULL) strncpy(failed_msg, mystream->msg, 1000); // copy zlib's msg for safe use after deflateEnd just in case zlib allocated the message } // else another thread could have failed below while I was working or waiting above; their reason got here first // # nocov end @@ -976,7 +984,7 @@ void fwriteMain(fwriteMainArgs args) // all threads will call this free on their buffer, even if one or more threads had malloc // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. if (args.is_gzip) { - deflateEnd(&mystream); + deflateEnd(mystream); } } free(buffPool); From 3fd354a2be1ccbbf4a807fbd2de9a499f2476e89 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 14 Dec 2020 22:33:24 -0700 Subject: [PATCH 144/588] appveyor r-devel permanently off for PRs (more stable and faster dev cycle); GLCI tests with daily r-devel after merge --- .appveyor.yml | 2 +- .dev/CRAN_Release.cmd | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index edd916d992..7cabbb9062 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -32,7 +32,7 @@ environment: - R_VERSION: release # the single Windows.zip binary (both 32bit/64bit) that users following dev version of installation instructions should click - - R_VERSION: devel # When off it's to speed up dev cycle; R-devel is still checked but by GLCI on a roughly hourly cycle. CRAN_Release.cmd has a reminder to turn back on. +# - R_VERSION: devel # Never turn back on. GLCI after merge covers latest daily R-devel very well, so we shouldn't confuse and slow down PR dev cycle by measuring PRs against daily R-devel too. If a change in R-devel yesterday breaks the PR, it's very unlikely to be due to something in the PR. So we should accept the PR if it passes R-release and fix separately anything related to R-devel which we'll see from GLCI. 
before_build: - cmd: ECHO no Revision metadata added to DESCRIPTION diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 2221dff449..69611a8d21 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -247,8 +247,7 @@ gctorture2(step=50) system.time(test.data.table(script="*.Rraw")) # apx 8h = froll 3h + nafill 1m + main 5h # Upload to win-builder: release, dev & old-release -# Turn on Travis OSX; it's off in dev until it's added to GLCI (#3326) as it adds 17min after 11min Linux. -# Turn on r-devel in Appveyor; it may be off in dev for similar dev cycle speed reasons +# Turn on Travis OSX until it's added to GLCI (#3326). If it's off it's because as it adds 17min after 11min Linux. ############################################### From 2e7d28341d585d583b01d8d920eb3d072eeea9d5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 14 Dec 2020 23:42:11 -0700 Subject: [PATCH 145/588] NEWS-only: tidy --- NEWS.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index d6423ce87a..da9638df45 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,13 +6,15 @@ ## BUG FIXES -1. `gforce()` now allocates the correct amount of memory for the data.table with more than 1e9 rows, [#4295](https://github.com/Rdatatable/data.table/issues/4295) and [#4818](https://github.com/Rdatatable/data.table/issues/4818). Before the fixing, data.table could throw an error "Failed to allocate counts or TMP when assigning g in gforce", due to an integer overflow when `malloc()` memories. Thanks to @renkun-ken and @jangorecki for reporting and @shrektan for fixing. +1. Grouping could throw an error 'Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. -2. `fwrite()`'s mutithreaded `gzip` compression should now work on Solaris, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. Other than less elegant internal code needing a comment explaining why it is done that way in this case. - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. 
However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. For example, any optimizations, such as aligning the set of structures to cache line boundaries, could be performed at the start of the parallel region, not after the parallel for. If anyone finding this news items knows more, please let us know. +2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone finding this news items knows more, please let us know. ## NOTES +1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmissions runs against latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. 
+ # data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) From 18db0d374cf204c96e679ab64ab70aa5306c19e2 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 14 Dec 2020 23:45:40 -0700 Subject: [PATCH 146/588] NEWS-only: paragraph spacing --- NEWS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index da9638df45..0802cbde9a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,11 +9,12 @@ 1. Grouping could throw an error 'Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. 2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone finding this news items knows more, please let us know. + + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone finding this news items knows more, please let us know. ## NOTES -1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. 
Then each resubmissions runs against latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. +1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmissions reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. # data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) From 267315d4e585462c3dde3e5bbd7ac1f193ce4377 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 15 Dec 2020 00:01:39 -0700 Subject: [PATCH 147/588] NEWS-only: more tweaks --- NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0802cbde9a..f8f048aa69 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,15 +6,15 @@ ## BUG FIXES -1. Grouping could throw an error 'Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. +1. Grouping could throw an error `Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. 2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. 
If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone finding this news items knows more, please let us know. + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. ## NOTES -1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmissions reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. +1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmission reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. 
Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. # data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) From de4dc380ba3b5411f4584df77482efb61c9de156 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 15 Dec 2020 18:09:43 -0700 Subject: [PATCH 148/588] .dev-only: revdep autorun on startup --- .dev/CRAN_Release.cmd | 7 +++++-- .dev/revdep.R | 19 ++++++++++++------- DESCRIPTION | 2 +- NEWS.md | 2 +- src/init.c | 2 +- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 69611a8d21..7de22a24d4 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -305,7 +305,7 @@ tar xvf R-devel.tar.gz cd R-devel # may be used for revdep testing: .dev/revdep.R. # important to change directory name before building not after because the path is baked into the build, iiuc -./configure CFLAGS="-O2 -Wall -pedantic" +./configure CFLAGS="-O0 -Wall -pedantic" make # use latest available `apt-cache search gcc-` or `clang-` @@ -528,6 +528,7 @@ sudo apt-get -y install parallel # for revdepr.R sudo apt-get -y install pandoc-citeproc # for basecallQC sudo apt-get -y install libquantlib0-dev # for RQuantLib sudo apt-get -y install cargo # for gifski, a suggest of nasoi +sudo apt-get -y install libgit2-dev # for gert sudo R CMD javareconf # ENDIF @@ -571,7 +572,9 @@ du -k inst/tests # 1.5MB before bzip2 inst/tests/*.Rraw # compress *.Rraw just for release to CRAN; do not commit compressed *.Rraw to git du -k inst/tests # 0.75MB after R CMD build . -R CMD check data.table_1.13.4.tar.gz --as-cran +export GITHUB_PAT="f1c.. 
github personal access token ..7ad" +Rdevel -q -e "packageVersion('xml2')" # ensure installed +Rdevel CMD check data.table_1.13.6.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # diff --git a/.dev/revdep.R b/.dev/revdep.R index bacf0cf53a..62f0b9e5fa 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -175,7 +175,7 @@ status0 = function(bioc=FALSE) { } status = function(bioc=FALSE) { - cat("Installed data.table to be tested against:", + cat("\nInstalled data.table to be tested against:", as.character(packageVersion("data.table")), format(as.POSIXct(packageDescription("data.table")$Packaged, tz="UTC"), tz=""), # local time "\n\nCRAN:\n") @@ -231,14 +231,15 @@ status = function(bioc=FALSE) { invisible() } -run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE) { - cat("Installed data.table to be tested against:",as.character(packageVersion("data.table")),"\n") +run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { if (length(pkgs)==1) pkgs = strsplit(pkgs, split="[, ]")[[1]] if (anyDuplicated(pkgs)) stop("pkgs contains dups") if (!length(pkgs)) { opts = c("not.started","cran.fail","bioc.fail","both.fail","rerun.cran","rerun.bioc","rerun.all") - cat(paste0(1:length(opts),": ",opts) , sep="\n") - w = suppressWarnings(as.integer(readline("Enter option: "))) + w = if (is.null(choose)) { + cat(paste0(1:length(opts),": ",opts) , sep="\n") + suppressWarnings(as.integer(readline("Enter option: "))) + } else choose if (is.na(w) || !w %in% seq_along(opts)) stop(w," is invalid") which = opts[w] numtgz = as.integer(system("ls -1 *.tar.gz | wc -l", intern=TRUE)) @@ -269,8 +270,10 @@ run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE) { cat("Running",length(pkgs),"packages:", paste(pkgs), "\n") filter = paste0("| grep -E '", paste0(paste0(pkgs,"_"),collapse="|"), "' ") } - cat("Proceed? (ctrl-c or enter)\n") - scan(quiet=TRUE) + if (is.null(choose)) { + cat("Proceed? (ctrl-c or enter)\n") + scan(quiet=TRUE) + } if (!identical(pkgs,"_ALL_")) for (i in pkgs) system(paste0("rm -rf ./",i,".Rcheck")) SUGG = paste0("_R_CHECK_FORCE_SUGGESTS_=",tolower(R_CHECK_FORCE_SUGGESTS)) cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 ",SUGG," parallel --max-procs 50% ",R," CMD check") @@ -312,7 +315,9 @@ log = function(bioc=FALSE, fnam="~/fail.log") { } } +inst() status() +run(choose=1) # run not-started (i.e. updates to and new revdeps) automatically on revdep startup # Now R prompt is ready to fix any problems with CRAN or Bioconductor updates. 
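# For example (illustrative only; the package names are just placeholders), a typical follow-up at this prompt:
#   run("bit64 nanotime", R_CHECK_FORCE_SUGGESTS=FALSE)   # recheck a couple of revdeps without forcing suggests
#   status()                                              # summarise results so far
#   log()                                                 # write ~/fail.log for manual inspection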
# Then run run(), status() and log() as per section in CRAN_Release.cmd diff --git a/DESCRIPTION b/DESCRIPTION index 44894189d7..03ec48e58d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.13.5 +Version: 1.13.6 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/NEWS.md b/NEWS.md index f8f048aa69..cad7c616dc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.5](https://github.com/Rdatatable/data.table/milestone/22) (in development) +# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) ## BUG FIXES diff --git a/src/init.c b/src/init.c index 962d45c83a..bd8c58e607 100644 --- a/src/init.c +++ b/src/init.c @@ -414,6 +414,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.13.5"))); + return(ScalarString(mkChar("1.13.6"))); } From 87b8c15ccd04bb181922fc6d60bdcb7e2a5a31ad Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 15 Dec 2020 18:21:18 -0700 Subject: [PATCH 149/588] revert mistaken inclusion of bump in last commit; should have just been .dev/revdep.R. CRAN submission is still falsely stuck on revdep prt --- DESCRIPTION | 2 +- NEWS.md | 2 +- src/init.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 03ec48e58d..44894189d7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.13.6 +Version: 1.13.5 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/NEWS.md b/NEWS.md index cad7c616dc..f8f048aa69 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) +# data.table [v1.13.5](https://github.com/Rdatatable/data.table/milestone/22) (in development) ## BUG FIXES diff --git a/src/init.c b/src/init.c index bd8c58e607..962d45c83a 100644 --- a/src/init.c +++ b/src/init.c @@ -414,6 +414,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.13.6"))); + return(ScalarString(mkChar("1.13.5"))); } From 21eec50d7309adcaea1efbba973e31de10955bd4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 16 Dec 2020 12:25:42 -0700 Subject: [PATCH 150/588] 1.13.6 on CRAN. Bump to 1.13.7 --- .dev/CRAN_Release.cmd | 30 +++++++++++++++--------------- DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 9 ++++++++- src/init.c | 2 +- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 7de22a24d4..15c7ab0bcb 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -208,15 +208,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. 
-R CMD check data.table_1.13.5.tar.gz --as-cran -R CMD INSTALL data.table_1.13.5.tar.gz --html +R CMD check data.table_1.13.7.tar.gz --as-cran +R CMD INSTALL data.table_1.13.7.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.13.5.tar.gz +R CMD check data.table_1.13.7.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -233,9 +233,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. -PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.5.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.7.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.5.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.7.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -266,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.13.5.tar.gz +R310 CMD INSTALL ./data.table_1.13.7.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.13.5.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.13.7.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.13.5.tar.gz +R CMD check data.table_1.13.7.tar.gz ##################################################### @@ -336,8 +336,8 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.13.5.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.13.5.tar.gz +Rdevel-strict-gcc CMD INSTALL data.table_1.13.7.tar.gz +Rdevel-strict-clang CMD INSTALL data.table_1.13.7.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here Rdevel-strict-gcc Rdevel-strict-clang # repeat below with clang and gcc @@ -378,7 +378,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.13.5.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.13.7.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -416,7 +416,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . 
../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.13.5.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.13.7.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -604,8 +604,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.3 to 1.13.5, and 1.13.2 to 1.13.4 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.5 to 1.13.7, and 1.13.4 to 1.13.6 (e.g. in step 8 and 9 below) 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.13.4 on CRAN. Bump to 1.13.5" -9. Take sha from step 8 and run `git tag 1.13.4 34796cd1524828df9bf13a174265cb68a09fcd77` then `git push origin 1.13.4` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.13.6 on CRAN. Bump to 1.13.7" +9. Take sha from step 8 and run `git tag 1.13.6 96c..sha..d77` then `git push origin 1.13.6` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/DESCRIPTION b/DESCRIPTION index 44894189d7..3e878e84bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.13.5 +Version: 1.13.7 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/Makefile b/Makefile index dd6e899967..e1331064d4 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.13.5.tar.gz + $(RM) data.table_1.13.7.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.13.5.tar.gz + $(R) CMD INSTALL data.table_1.13.7.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.5.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.7.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index f8f048aa69..99a1d25dba 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,14 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.5](https://github.com/Rdatatable/data.table/milestone/22) (in development) +# data.table [v1.13.7](https://github.com/Rdatatable/data.table/milestone/20) (in development) + +## BUG FIXES + +## NOTES + + +# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (submitted to CRAN on 14 Dec 2020) ## BUG FIXES diff --git a/src/init.c b/src/init.c index 962d45c83a..6f3edec64c 100644 --- a/src/init.c +++ b/src/init.c @@ -414,6 +414,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the 
same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.13.5"))); + return(ScalarString(mkChar("1.13.7"))); } From 7aa22ee6b245b9308352acd66384373a99376c13 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 16 Dec 2020 21:32:34 +0200 Subject: [PATCH 151/588] extra print in test summary (#4850) --- R/test.data.table.R | 2 +- inst/tests/tests.Rraw | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 67e4ce6d85..c5da3e0bac 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -177,7 +177,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F cat("10 longest running tests took ", as.integer(tt<-DT[, sum(time)]), "s (", as.integer(100*tt/(ss<-timings[,sum(time)])), "% of ", as.integer(ss), "s)\n", sep="") print(DT, class=FALSE) - cat("All ",ntest," tests in ",names(fn)," completed ok in ",timetaken(env$started.at),"\n",sep="") + cat("All ",ntest," tests (last ",env$prevtest,") in ",names(fn)," completed ok in ",timetaken(env$started.at),"\n",sep="") ## this chunk requires to include new suggested deps: graphics, grDevices #memtest.plot = function(.inittime) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d26f7ff509..390fadd0f3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12899,7 +12899,7 @@ for (col in c('b', 'c')) { # # tests-S4.R (S4 Compatability) # -suppressWarnings(setClass("Data.Table", contains="data.table")) # suppress 'Created a package name, ‘2018-05-26 06:14:43.444’, when none found' +suppressWarnings(setClass("Data.Table", contains="data.table")) # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" suppressWarnings(setClass("S4Composition", representation(data="data.table"))) # data.table can be a parent class ids <- sample(letters[1:3], 10, replace=TRUE) From 05202fe13a9f6dbb2bec511fa0e17e24136616d1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 30 Dec 2020 10:41:48 -0700 Subject: [PATCH 152/588] NEWS-only: date of 1.13.6 on CRAN in heading --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 99a1d25dba..420cadc484 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,7 @@ ## NOTES -# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (submitted to CRAN on 14 Dec 2020) +# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) ## BUG FIXES From 33d67c4317d0d211379a71b02376412b802a736b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 30 Dec 2020 16:38:41 -0700 Subject: [PATCH 153/588] removed solaris tracing of zlib in fwrite (#4860) --- src/fwrite.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 8044ec6ef2..499d24bec9 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -564,50 +564,13 @@ int init_stream(z_stream *stream) { return err; // # nocov } -void print_z_stream(const z_stream *s) // temporary tracing function for #4099 -{ - const char *byte = (char *)s; - DTPRINT("sizeof(z_stream)==%d: ", sizeof(z_stream)); - for (int i=0; istate->status which zlib:deflateStateCheck checks, #4826 - // this structure is not exposed, so we'll get to it via memory offsets using the trace output we put in to show on CRAN's Solaris output - const char *pos = (char *)&s->msg + sizeof(char *); // the field after *msg (exposed) is internal_state *state (not exposed) - byte = *(char 
**)pos; // byte now at start of internal_state pointed to by s->state - char *strm = *(char **)byte; // first 8 bytes (or 4 on 32bit) is strm labeled 'pointer back to this zlib stream' - DTPRINT("state: "); - for (int i=0; i<(sizeof(char *) + sizeof(int)); ++i) { - DTPRINT("%02x ", *(unsigned char *)byte++); - } - int status = *(int *)(byte-sizeof(int)); - DTPRINT("strm==%p state->strm==%p state->status==%d", s, strm, status); // two pointer values should be the same - DTPRINT(" zalloc==%p zfree==%p", s->zalloc, s->zfree); // checked to be !=0 by deflate.c:deflateStateCheck - DTPRINT(" (s->strm==strm)==%d", (char *)s==strm); // mimics the s->strm==strm check in deflate.c:deflateStateCheck - DTPRINT(" s->next_out==%p s->avail_in=%d s->next_in=%p", s->next_out, s->avail_in, s->next_in); // top of deflate.c:deflate() after the call to deflateStateCheck - DTPRINT(" deflates()'s checks (excluding status) would %s here", - (s->zalloc==(alloc_func)0 || s->zfree==(free_func)0 || s==Z_NULL || (char *)s!=strm || - s->next_out==Z_NULL || (s->avail_in!=0 && s->next_in==Z_NULL)) ? - "return -2" : "be ok"); - DTPRINT("\n"); -} - int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* source, size_t sourceLen) { stream->next_out = dest; stream->avail_out = *destLen; stream->next_in = (Bytef *)source; // don't use z_const anywhere; #3939 stream->avail_in = sourceLen; - if (verbose) { - DTPRINT(_("deflate input stream: %p %d %p %d z_stream: "), stream->next_out, (int)(stream->avail_out), stream->next_in, (int)(stream->avail_in)); - print_z_stream(stream); - } int err = deflate(stream, Z_FINISH); - if (verbose) { - DTPRINT(_("deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, Z_STREAM_END==%d z_stream: "), err, (int)(stream->total_out), Z_FINISH, Z_OK, Z_STREAM_END); - print_z_stream(stream); - } if (err == Z_OK) { // with Z_FINISH, deflate must return Z_STREAM_END if correct, otherwise it's an error and we shouldn't return Z_OK (0) err = -9; // # nocov @@ -766,7 +729,6 @@ void fwriteMain(fwriteMainArgs args) free(buff); // # nocov STOP(_("Can't allocate gzip stream structure")); // # nocov } - if (verbose) {DTPRINT(_("z_stream for header (%d): "), 1); print_z_stream(&stream);} size_t zbuffSize = deflateBound(&stream, headerLen); char *zbuff = malloc(zbuffSize); if (!zbuff) { @@ -775,7 +737,6 @@ void fwriteMain(fwriteMainArgs args) } size_t zbuffUsed = zbuffSize; ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (size_t)(ch-buff)); - if (verbose) {DTPRINT(_("z_stream for header (%d): "), 2); print_z_stream(&stream);} if (ret1==Z_OK) ret2 = WRITE(f, zbuff, (int)zbuffUsed); deflateEnd(&stream); free(zbuff); @@ -881,13 +842,11 @@ void fwriteMain(fwriteMainArgs args) failed = true; // # nocov my_failed_compress = -998; // # nocov } - if (verbose) {DTPRINT(_("z_stream for data (%d): "), 1); print_z_stream(mystream);} } #pragma omp for ordered schedule(dynamic) for(int64_t start=0; start Date: Sun, 3 Jan 2021 14:10:27 -0700 Subject: [PATCH 154/588] .dev-only: added cran status lookup of failing revdeps --- .dev/revdep.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.dev/revdep.R b/.dev/revdep.R index 62f0b9e5fa..3f340a4ae2 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -231,6 +231,18 @@ status = function(bioc=FALSE) { invisible() } +cran = function() # reports CRAN status of the .cran.fail packages +{ + require(data.table) + p = proc.time() + db = setDT(tools::CRAN_check_results()) + cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), 
big.mark=","),"rows in",timetaken(p),"\n") + ans = db[Package %chin% .fail.cran, .N, keyby=.(Package, Status)] + stopifnot(all(ans$Status %chin% c("ERROR","WARN","NOTE","OK"))) + ans = dcast(ans, Package~Status, value.var="N", fill=0L) + ans[.fail.cran, .(Package,ERROR,WARN,"OK|NOTE"=OK+NOTE)] +} + run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { if (length(pkgs)==1) pkgs = strsplit(pkgs, split="[, ]")[[1]] if (anyDuplicated(pkgs)) stop("pkgs contains dups") From deac792a9081d879aba4c027399b8e794a1f57d9 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 3 Jan 2021 15:03:09 -0700 Subject: [PATCH 155/588] .dev-only: cran status compare R-release instead of counts --- .dev/revdep.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 3f340a4ae2..225cb67c1b 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -237,10 +237,12 @@ cran = function() # reports CRAN status of the .cran.fail packages p = proc.time() db = setDT(tools::CRAN_check_results()) cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") - ans = db[Package %chin% .fail.cran, .N, keyby=.(Package, Status)] - stopifnot(all(ans$Status %chin% c("ERROR","WARN","NOTE","OK"))) - ans = dcast(ans, Package~Status, value.var="N", fill=0L) - ans[.fail.cran, .(Package,ERROR,WARN,"OK|NOTE"=OK+NOTE)] + rel = unique(db$Flavor) + rel = sort(rel[grep("release",rel)]) + stopifnot(identical(rel, c("r-release-linux-x86_64", "r-release-macos-x86_64", "r-release-windows-ix86+x86_64"))) + cat("R-release is used for revdep checking so comparing to CRAN results for R-release\n") + ans = db[Package %chin% .fail.cran & Flavor %chin% rel, Status, keyby=.(Package, Flavor)] + dcast(ans, Package~Flavor, value.var="Status", fill="")[.fail.cran,] } run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { From 789dfe4df8b4bb149c5bad1c62722aa2fba4345f Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 4 Jan 2021 02:53:13 +0200 Subject: [PATCH 156/588] docker tags needs integration stage (#4854) --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8e53b950e1..2f760c2782 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -323,6 +323,7 @@ integration: ## merging all artifacts to produce single R repository, documentat - linux only: - master + - tags needs: ["mirror-packages","build","test-rel-lin","test-rel-cran-lin","test-dev-cran-lin","test-rel-vanilla-lin","test-310-cran-lin","test-344-cran-lin","test-350-cran-lin","test-rel-win","test-dev-win","test-old-win"] script: - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' From 354cf84dd3be62dbddd147f4e6ca8ad0b099f254 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Tue, 5 Jan 2021 04:47:37 +0800 Subject: [PATCH 157/588] fread() throws correct non-ASCII messages (#4751) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 12 ++++++++++++ src/fread.c | 2 +- src/fread.h | 5 +++++ src/freadR.c | 2 +- src/freadR.h | 1 + 6 files changed, 22 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 420cadc484..e0ea1222ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## BUG FIXES +1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. 
+ ## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 390fadd0f3..a6013a3a11 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17246,3 +17246,15 @@ class(e) = "foo" dt = data.table(id=1, funs=list(e)) test(2161.2, dt[, .(funs), by=id], dt) +# fread message display non-ASCII messages correctly, #4747 +x = "fa\u00e7ile"; Encoding(x) = "UTF-8" +# should only run this test if the native encoding can represent latin1 correctly +if (identical(x, enc2native(x))) { + txt = enc2utf8(sprintf("A,B\n%s,%s\n%s", x, x, x)) + txt2 = iconv(txt, "UTF-8", "latin1") + out = data.table(A = x, B = x) + test(2162.1, fread(text = txt, encoding = 'UTF-8'), out, + warning="Discarded single-line footer: <>") + test(2162.2, fread(text = txt2, encoding = 'Latin-1'), out, + warning="Discarded single-line footer: <>") +} diff --git a/src/fread.c b/src/fread.c index 5b9bac3f03..7b1ba6df03 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2575,7 +2575,7 @@ int freadMain(freadMainArgs _args) { if (ch==eof) { // whitespace at the end of the file is always skipped ok } else { - const char *skippedFooter = ch; + const char *skippedFooter = ENC2NATIVE(ch); // detect if it's a single line footer. Commonly the row count from SQL queries. while (ch +#include #include "po.h" #define FREAD_MAIN_ARGS_EXTRA_FIELDS \ From dae96526478203f7ec7d7669a0b5c87d48f145d8 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 5 Jan 2021 03:11:58 +0200 Subject: [PATCH 158/588] makes dependency on zlib optional (#4844) --- NEWS.md | 2 ++ configure | 17 ++++++++++++++--- src/Makevars.in | 5 +++-- src/fwrite.c | 30 +++++++++++++++++++++++++++--- src/utils.c | 6 ++++++ 5 files changed, 52 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index e0ea1222ea..df1b96436c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ ## NOTES +1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. + # data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) diff --git a/configure b/configure index e29156d61a..f2e98ec312 100755 --- a/configure +++ b/configure @@ -20,6 +20,7 @@ esac # and R-exts note 24 now suggests 'checkbashisms' as we proposed. msg=0 +NOZLIB=1 # if pkg-config is not available then zlib will be disabled for higher chance of compilation success pkg-config --version >/dev/null 2>&1 if [ $? -ne 0 ]; then echo "*** pkg-config is not installed." @@ -30,6 +31,7 @@ else echo "*** pkg-config is installed but 'pkg-config --exists zlib' did not return 0." msg=1 else + NOZLIB=0 lib=`pkg-config --libs zlib` expr -- "$lib" : ".*-lz$" >/dev/null # -- for FreeBSD, #4652 if [ $? -ne 0 ]; then @@ -46,9 +48,9 @@ fi if [ $msg -ne 0 ]; then echo "*** Compilation will now be attempted and if it works you can ignore this message. In" - echo "*** particular, this should be the case on Mac where zlib is built in." 
- echo "*** However, if compilation fails, try 'locate zlib.h zconf.h' and ensure the zlib" - echo "*** development library is installed :" + echo "*** particular, this should be the case on Mac where zlib is built in or pkg-config" + echo "*** is not installed. However, if compilation fails, try 'locate zlib.h zconf.h' and" + echo "*** ensure the zlib development library is installed :" echo "*** deb: zlib1g-dev (Debian, Ubuntu, ...)" echo "*** rpm: zlib-devel (Fedora, EPEL, ...)" echo "*** There is a zlib in brew for OSX but the built in zlib should work." @@ -109,5 +111,14 @@ fi # retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too. sed -e "s|@PKG_CFLAGS@|$PKG_CFLAGS|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars sed -e "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars +# optional dependency on zlib +if [ "$NOZLIB" = "1" ]; then + echo "*** Compilation without compression support in fwrite" + sed -e "s|@zlib_cflags@|-DNOZLIB|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars + sed -e "s|@zlib_libs@||" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars +else + sed -e "s|@zlib_cflags@||" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars + sed -e "s|@zlib_libs@|-lz|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars +fi exit 0 diff --git a/src/Makevars.in b/src/Makevars.in index 7750c1e8ac..b411786283 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -1,10 +1,11 @@ -PKG_CFLAGS = @PKG_CFLAGS@ @openmp_cflags@ -PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ -lz +PKG_CFLAGS = @PKG_CFLAGS@ @openmp_cflags@ @zlib_cflags@ +PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ @zlib_libs@ # See WRE $1.2.1.1. But retain user supplied PKG_* too, #4664. # WRE states ($1.6) that += isn't portable and that we aren't allowed to use it. # Otherwise we could use the much simpler PKG_LIBS += @openmp_cflags@ -lz. # Can't do PKG_LIBS = $(PKG_LIBS)... either because that's a 'recursive variable reference' error in make # Hence the onerous @...@ substitution. Is it still appropriate in 2020 that we can't use +=? +# Note that -lz is now escaped via @zlib_libs@ when zlib is not installed all: $(SHLIB) @echo PKG_CFLAGS = $(PKG_CFLAGS) diff --git a/src/fwrite.c b/src/fwrite.c index 499d24bec9..b85d513a6f 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -7,7 +7,9 @@ #include // isfinite, isnan #include // abs #include // strlen, strerror +#ifndef NOZLIB #include // for compression to .gz +#endif #ifdef WIN32 #include @@ -552,6 +554,7 @@ void writeCategString(const void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } +#ifndef NOZLIB int init_stream(z_stream *stream) { memset(stream, 0, sizeof(z_stream)); // shouldn't be needed, done as part of #4099 to be sure stream->next_in = Z_NULL; @@ -578,6 +581,7 @@ int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* sour *destLen = stream->total_out; return err == Z_STREAM_END ? Z_OK : err; } +#endif void fwriteMain(fwriteMainArgs args) { @@ -683,6 +687,10 @@ void fwriteMain(fwriteMainArgs args) // # nocov end } } +#ifdef NOZLIB + if (args.is_gzip) + STOP(_("Compression in fwrite uses zlib library. Its header files were not found at the time data.table was compiled. 
To enable fwrite compression, please reinstall data.table and study the output for further guidance.")); // # nocov +#endif int yamlLen = strlen(args.yaml); if (verbose) { @@ -724,6 +732,7 @@ void fwriteMain(fwriteMainArgs args) } else { int ret1=0, ret2=0; if (args.is_gzip) { +#ifndef NOZLIB z_stream stream = {0}; if(init_stream(&stream)) { free(buff); // # nocov @@ -740,6 +749,7 @@ void fwriteMain(fwriteMainArgs args) if (ret1==Z_OK) ret2 = WRITE(f, zbuff, (int)zbuffUsed); deflateEnd(&stream); free(zbuff); +#endif } else { ret2 = WRITE(f, buff, (int)(ch-buff)); } @@ -785,12 +795,14 @@ void fwriteMain(fwriteMainArgs args) // compute zbuffSize which is the same for each thread size_t zbuffSize = 0; if(args.is_gzip){ +#ifndef NOZLIB z_stream stream = {0}; if(init_stream(&stream)) STOP(_("Can't allocate gzip stream structure")); // # nocov zbuffSize = deflateBound(&stream, buffSize); if (verbose) DTPRINT("zbuffSize=%d returned from deflateBound\n", (int)zbuffSize); deflateEnd(&stream); +#endif } errno=0; @@ -804,6 +816,7 @@ void fwriteMain(fwriteMainArgs args) char *zbuffPool = NULL; if (args.is_gzip) { zbuffPool = malloc(nth*(size_t)zbuffSize); +#ifndef NOZLIB if (!zbuffPool) { // # nocov start free(buffPool); @@ -811,20 +824,21 @@ void fwriteMain(fwriteMainArgs args) (size_t)zbuffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } +#endif } bool failed = false; // naked (unprotected by atomic) write to bool ok because only ever write true in this special paradigm int failed_compress = 0; // the first thread to fail writes their reason here when they first get to ordered section - char failed_msg[1001] = ""; // to hold zlib's msg; copied out of zlib in ordered section just in case the msg is allocated within zlib int failed_write = 0; // same. could use +ve and -ve in the same code but separate it out to trace Solaris problem, #3931 - if (nth>1) verbose=false; // printing isn't thread safe (there's a temporary print in compressbuff for tracing solaris; #4099) - +#ifndef NOZLIB z_stream thread_streams[nth]; // VLA on stack should be fine for nth structs; in zlib v1.2.11 sizeof(struct)==112 on 64bit // not declared inside the parallel region because solaris appears to move the struct in // memory when the #pragma omp for is entered, which causes zlib's internal self reference // pointer to mismatch, #4099 + char failed_msg[1001] = ""; // to hold zlib's msg; copied out of zlib in ordered section just in case the msg is allocated within zlib +#endif #pragma omp parallel num_threads(nth) { @@ -835,6 +849,7 @@ void fwriteMain(fwriteMainArgs args) void *myzBuff = NULL; size_t myzbuffUsed = 0; +#ifndef NOZLIB z_stream *mystream = &thread_streams[me]; if (args.is_gzip) { myzBuff = zbuffPool + me*zbuffSize; @@ -843,6 +858,7 @@ void fwriteMain(fwriteMainArgs args) my_failed_compress = -998; // # nocov } } +#endif #pragma omp for ordered schedule(dynamic) for(int64_t start=0; startmsg!=NULL) strncpy(failed_msg, mystream->msg, 1000); // copy zlib's msg for safe use after deflateEnd just in case zlib allocated the message +#endif } // else another thread could have failed below while I was working or waiting above; their reason got here first // # nocov end @@ -941,7 +961,9 @@ void fwriteMain(fwriteMainArgs args) // all threads will call this free on their buffer, even if one or more threads had malloc // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. 
if (args.is_gzip) { +#ifndef NOZLIB deflateEnd(mystream); +#endif } } free(buffPool); @@ -967,11 +989,13 @@ void fwriteMain(fwriteMainArgs args) // from the original error. if (failed) { // # nocov start +#ifndef NOZLIB if (failed_compress) STOP(_("zlib %s (zlib.h %s) deflate() returned error %d with z_stream->msg==\"%s\" Z_FINISH=%d Z_BLOCK=%d. %s"), zlibVersion(), ZLIB_VERSION, failed_compress, failed_msg, Z_FINISH, Z_BLOCK, verbose ? _("Please include the full output above and below this message in your data.table bug report.") : _("Please retry fwrite() with verbose=TRUE and include the full output with your data.table bug report.")); +#endif if (failed_write) STOP("%s: '%s'", strerror(failed_write), args.filename); // # nocov end diff --git a/src/utils.c b/src/utils.c index 8a3598f575..b1cb3b1aed 100644 --- a/src/utils.c +++ b/src/utils.c @@ -374,10 +374,16 @@ SEXP coerceUtf8IfNeeded(SEXP x) { return(ans); } +#ifndef NOZLIB #include +#endif SEXP dt_zlib_version() { char out[51]; +#ifndef NOZLIB snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); +#else + snprintf(out, 50, "zlib header files were not found when data.table was compiled"); +#endif return ScalarString(mkChar(out)); } From f395b9e2e735111caec8b94a51d896f8ae1ef23c Mon Sep 17 00:00:00 2001 From: ben-schwen <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 5 Jan 2021 04:58:56 +0100 Subject: [PATCH 159/588] fintersect order (#4725) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/setops.R | 9 +++++---- inst/tests/tests.Rraw | 3 +++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e878e84bd..00ec8c3bb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -60,7 +60,8 @@ Authors@R: c( person("Jens Peder","Meldgaard", role="ctb"), person("Vaclav","Tlapak", role="ctb"), person("Kevin","Ushey", role="ctb"), - person("Dirk","Eddelbuettel", role="ctb")) + person("Dirk","Eddelbuettel", role="ctb"), + person("Ben","Schwen", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown diff --git a/NEWS.md b/NEWS.md index df1b96436c..3c62f3c261 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ 1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. +2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. + ## NOTES 1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. 
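(Editor's illustration, hedged and not part of the patch; it mirrors test 2163 added further below and shows the ordering change described in the fintersect bullet above.)
fintersect(data.table(x=c("b","c","a")), data.table(x=c("a","c")))$x
# with this change: "c" "a"   -- order of the first argument, matching base::intersect
# previously:       "a" "c"   -- order of the second argument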
diff --git a/R/setops.R b/R/setops.R index 1ac949601b..b6dcd7b0b2 100644 --- a/R/setops.R +++ b/R/setops.R @@ -62,11 +62,12 @@ fintersect = function(x, y, all=FALSE) { if (all) { x = shallow(x)[, ".seqn" := rowidv(x)] y = shallow(y)[, ".seqn" := rowidv(y)] - jn.on = c(".seqn",setdiff(names(x),".seqn")) - x[y, .SD, .SDcols=setdiff(names(x),".seqn"), nomatch=NULL, on=jn.on] + jn.on = c(".seqn",setdiff(names(y),".seqn")) + # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) + y[x, .SD, .SDcols=setdiff(names(y),".seqn"), nomatch=NULL, on=jn.on] } else { - z = funique(y) # fixes #3034. When .. prefix in i= is implemented (TODO), this can be x[funique(..y), on=, multi=] - x[z, nomatch=NULL, on=names(x), mult="first"] + z = funique(x) # fixes #3034. When .. prefix in i= is implemented (TODO), this can be x[funique(..y), on=, multi=] + y[z, nomatch=NULL, on=names(y), mult="first"] } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a6013a3a11..2b44b3038f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17258,3 +17258,6 @@ if (identical(x, enc2native(x))) { test(2162.2, fread(text = txt2, encoding = 'Latin-1'), out, warning="Discarded single-line footer: <>") } + +# fintersect now preserves order of first argument like intersect, #4716 +test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$x, c("c", "a")) From 3fce74daa440f9b8ef82446036375abcdd6c43b1 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 9 Jan 2021 22:04:52 +0200 Subject: [PATCH 160/588] bmerge rework (#4566) --- src/bmerge.c | 479 +++++++++++++++++++---------------------------- src/data.table.h | 2 +- src/forder.c | 27 +-- src/uniqlist.c | 14 +- 4 files changed, 207 insertions(+), 315 deletions(-) diff --git a/src/bmerge.c b/src/bmerge.c index c6ae3e48ee..83c5b167d2 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -1,5 +1,4 @@ #include "data.table.h" -#include // the debugging machinery + breakpoint aidee /* Implements binary search (a.k.a. divide and conquer). @@ -10,11 +9,12 @@ Differences over standard binary search (e.g. bsearch in stdlib.h) : o list of vectors (key of many columns) of different types o ties (groups) o NA,NAN,-Inf,+Inf are distinct values and can be joined to - o type double is joined within tolerance (apx 11 s.f.) + o type double is joined within tolerance (apx 11 s.f.) according to setNumericRounding (default off) o join to prevailing value (roll join a.k.a locf), forwards or backwards o join to nearest o roll the beginning and end optionally o limit the roll distance to a user provided value + o non equi joins (no != yet) since 1.9.8 */ #define ENC_KNOWN(x) (LEVELS(x) & 12) @@ -26,9 +26,11 @@ Differences over standard binary search (e.g. 
bsearch in stdlib.h) : #define GE 4 #define GT 5 -static SEXP i, x, nqgrp; -static int ncol, *icols, *xcols, *o, *xo, *retFirst, *retLength, *retIndex, *allLen1, *allGrp1, *rollends, ilen, anslen; -static int *op, nqmaxgrp, scols; +static const SEXP *idtVec, *xdtVec; +static const int *icols, *xcols; +static SEXP nqgrp; +static int ncol, *o, *xo, *retFirst, *retLength, *retIndex, *allLen1, *allGrp1, *rollends, ilen, anslen; +static int *op, nqmaxgrp; static int ctr, nomatch; // populating matches for non-equi joins enum {ALL, FIRST, LAST} mult = ALL; static double roll, rollabs; @@ -37,40 +39,42 @@ static Rboolean rollToNearest=FALSE; void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisgrp, int lowmax, int uppmax); -SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { +SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { int xN, iN, protecti=0; ctr=0; // needed for non-equi join case SEXP retFirstArg, retLengthArg, retIndexArg, allLen1Arg, allGrp1Arg; retFirstArg = retLengthArg = retIndexArg = R_NilValue; // suppress gcc msg // iArg, xArg, icolsArg and xcolsArg - i = iArg; x = xArg; // set globals so bmerge_r can see them. + idtVec = SEXPPTR_RO(idt); // set globals so bmerge_r can see them. + xdtVec = SEXPPTR_RO(xdt); if (!isInteger(icolsArg)) error(_("Internal error: icols is not integer vector")); // # nocov if (!isInteger(xcolsArg)) error(_("Internal error: xcols is not integer vector")); // # nocov - if ((LENGTH(icolsArg) == 0 || LENGTH(xcolsArg) == 0) && LENGTH(i) > 0) // We let through LENGTH(i) == 0 for tests 2126.* + if ((LENGTH(icolsArg)==0 || LENGTH(xcolsArg)==0) && LENGTH(idt)>0) // We let through LENGTH(i) == 0 for tests 2126.* error(_("Internal error: icols and xcols must be non-empty integer vectors.")); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) error(_("Internal error: length(icols) [%d] > length(xcols) [%d]"), LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov icols = INTEGER(icolsArg); xcols = INTEGER(xcolsArg); - xN = LENGTH(x) ? LENGTH(VECTOR_ELT(x,0)) : 0; - iN = ilen = anslen = LENGTH(i) ? LENGTH(VECTOR_ELT(i,0)) : 0; + xN = LENGTH(xdt) ? LENGTH(VECTOR_ELT(xdt,0)) : 0; + iN = ilen = anslen = LENGTH(idt) ? 
LENGTH(VECTOR_ELT(idt,0)) : 0; ncol = LENGTH(icolsArg); // there may be more sorted columns in x than involved in the join for(int col=0; colLENGTH(i) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(i)); - if (xcols[col]>LENGTH(x) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(x)); - int it = TYPEOF(VECTOR_ELT(i, icols[col]-1)); - int xt = TYPEOF(VECTOR_ELT(x, xcols[col]-1)); - if (iN && it!=xt) error(_("typeof x.%s (%s) != typeof i.%s (%s)"), CHAR(STRING_ELT(getAttrib(x,R_NamesSymbol),xcols[col]-1)), type2char(xt), CHAR(STRING_ELT(getAttrib(i,R_NamesSymbol),icols[col]-1)), type2char(it)); + if (icols[col]>LENGTH(idt) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt)); + if (xcols[col]>LENGTH(xdt) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt)); + int it = TYPEOF(VECTOR_ELT(idt, icols[col]-1)); + int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1)); + if (iN && it!=xt) error(_("typeof x.%s (%s) != typeof i.%s (%s)"), CHAR(STRING_ELT(getAttrib(xdt,R_NamesSymbol),xcols[col]-1)), type2char(xt), CHAR(STRING_ELT(getAttrib(idt,R_NamesSymbol),icols[col]-1)), type2char(it)); + if (iN && it!=LGLSXP && it!=INTSXP && it!=REALSXP && it!=STRSXP) + error(_("Type '%s' not supported for joining/merging"), type2char(it)); } - // raise(SIGINT); // rollArg, rollendsArg roll = 0.0; rollToNearest = FALSE; if (isString(rollarg)) { if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'")); - if (ncol > 0 && TYPEOF(VECTOR_ELT(i, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); + if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later } else { if (!isReal(rollarg)) error(_("Internal error: roll is not character or double")); // # nocov @@ -91,13 +95,22 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE else error(_("Internal error: invalid value for 'mult'. please report to data.table issue tracker")); // # nocov // opArg - if (!isInteger(opArg) || length(opArg) != ncol) + if (!isInteger(opArg) || length(opArg)!=ncol) error(_("Internal error: opArg is not an integer vector of length equal to length(on)")); // # nocov op = INTEGER(opArg); + for (int i=0; iGT/*5*/) + error(_("Internal error in bmerge_r for x.'%s'. Unrecognized value op[col]=%d"), // # nocov + CHAR(STRING_ELT(getAttrib(xdt,R_NamesSymbol),xcols[i]-1)), op[i]); // # nocov + if (op[i]!=EQ && TYPEOF(xdtVec[xcols[i]-1])==STRSXP) + error(_("Only '==' operator is supported for columns of type character.")); // # nocov + } + if (!isInteger(nqgrpArg)) error(_("Internal error: nqgrpArg must be an integer vector")); // # nocov nqgrp = nqgrpArg; // set global for bmerge_r - scols = (!length(nqgrpArg)) ? 0 : -1; // starting col index, -1 is external group column for non-equi join case + const int scols = (!length(nqgrpArg)) ? 
0 : -1; // starting col index, -1 is external group column for non-equi join case // nqmaxgrpArg if (!isInteger(nqmaxgrpArg) || length(nqmaxgrpArg) != 1 || INTEGER(nqmaxgrpArg)[0] <= 0) @@ -146,7 +159,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE SEXP order = PROTECT(allocVector(INTSXP, length(icolsArg))); protecti++; for (int j=0; j0 and <=ncol-1 if this range of [xlow,xupp] and [ilow,iupp] match up to but not including that column // lowmax=1 if xlowIn is the lower bound of this group (needed for roll) // uppmax=1 if xuppIn is the upper bound of this group (needed for roll) // new: col starts with -1 for non-equi joins, which gathers rows from nested id group counter 'thisgrp' { - int xlow=xlowIn, xupp=xuppIn, ilow=ilowIn, iupp=iuppIn, j, k, ir, lir, tmp, tmplow, tmpupp; - ir = lir = ilow + (iupp-ilow)/2; // lir = logical i row. - if (o) ir = o[lir]-1; // ir = the actual i row if i were ordered + int xlow=xlowIn, xupp=xuppIn, ilow=ilowIn, iupp=iuppIn; + int lir = ilow + (iupp-ilow)/2; // lir = logical i row. + int ir = o ? o[lir]-1 : lir; // ir = the actual i row if i were ordered + const bool isDataCol = col>-1; // check once for non nq join grp id internal technical, non-data, field + const bool isRollCol = roll!=0.0 && col==ncol-1; // col==ncol-1 implies col>-1 SEXP ic, xc; - if (col>-1) { - ic = VECTOR_ELT(i,icols[col]-1); // ic = i column - xc = VECTOR_ELT(x,xcols[col]-1); // xc = x column - // it was checked in bmerge() that the types are equal + if (isDataCol) { + ic = idtVec[icols[col]-1]; // ic = i column + xc = xdtVec[xcols[col]-1]; // xc = x column + // it was checked in bmerge() above that TYPEOF(ic)==TYPEOF(xc) } else { ic = R_NilValue; xc = nqgrp; } - bool isInt64=false; + bool rollLow=false, rollUpp=false; + + #define DO(XVAL, CMP1, CMP2, TYPE, LOWDIST, UPPDIST, IVAL) \ + while (xlow < xupp-1) { \ + int mid = xlow + (xupp-xlow)/2; \ + XVAL; \ + if (CMP1) { /* relies on NA_INTEGER==INT_MIN, tested in init.c */ \ + xlow=mid; \ + } else if (CMP2) { /* TO DO: switch(sign(xval-ival)) ? */ \ + xupp=mid; \ + } else { \ + /* xval == ival including NA_INTEGER==NA_INTEGER \ + branch mid to find start and end of this group in this column \ + TO DO?: not if mult=first|last and colxlowIn) && (!uppmax || xupp0.0 && (!lowmax || xlow>xlowIn) && (xuppxlowIn || !lowmax || rollends[0])) \ + || ( roll>0.0 && xlow==xlowIn && lowmax && rollends[0]) ) \ + && ( isinf(rollabs) || ((UPPDIST)-(TYPE)rollabs <= (TYPE)1e-6) )) \ + rollUpp=true; \ + } \ + } \ + if (op[col] != EQ) { \ + /* never true for STRSXP checked up front */ \ + switch (op[col]) { \ + case LE : if (!ISNAT(ival)) xlow = xlowIn; break; \ + case LT : xupp = xlow + 1; if (!ISNAT(ival)) xlow = xlowIn; break; \ + case GE : if (!ISNAT(ival)) xupp = xuppIn; break; \ + case GT : xlow = xupp - 1; if (!ISNAT(ival)) xupp = xuppIn; break; \ + /* no other cases; checked up front to avoid handling error in parallel region */ \ + } \ + /* for LE/LT cases, ensure xlow excludes NA indices, != EQ is checked above already */ \ + if (op[col]<=3 && xlow-1) ? INTEGER(ic) : NULL; - const int *ixc = INTEGER(xc); - ival.i = (col>-1) ? iic[ir] : thisgrp; - while(xlow < xupp-1) { - int mid = xlow + (xupp-xlow)/2; // Same as (xlow+xupp)/2 but without risk of overflow - xval.i = ixc[XIND(mid)]; - if (xval.iival.i) { // TO DO: is *(&xlow, &xupp)[0|1]=mid more efficient than branch? 
- xupp=mid; - } else { - // xval.i == ival.i including NA_INTEGER==NA_INTEGER - // branch mid to find start and end of this group in this column - // TO DO?: not if mult=first|last and col-1 && op[col] != EQ) { - switch (op[col]) { - case LE : xlow = xlowIn; break; - case LT : xupp = xlow + 1; xlow = xlowIn; break; - case GE : if (ival.i != NA_INTEGER) xupp = xuppIn; break; - case GT : xlow = xupp - 1; if (ival.i != NA_INTEGER) xupp = xuppIn; break; - default : error(_("Internal error in bmerge_r for '%s' column. Unrecognized value op[col]=%d"), type2char(TYPEOF(xc)), op[col]); // #nocov - } - // for LE/LT cases, we need to ensure xlow excludes NA indices, != EQ is checked above already - if (op[col] <= 3 && xlow-1) { - while(tmplowival, int, ival-xcv[XIND(xlow)], xcv[XIND(xupp)]-ival, ival) + } break; case STRSXP : { - if (op[col] != EQ) error(_("Only '==' operator is supported for columns of type %s."), type2char(TYPEOF(xc))); - ival.s = ENC2UTF8(STRING_ELT(ic,ir)); - while(xlow < xupp-1) { - int mid = xlow + (xupp-xlow)/2; - xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid))); - tmp = StrCmp(xval.s, ival.s); // uses pointer equality first, NA_STRING are allowed and joined to, then uses strcmp on CHAR(). - if (tmp == 0) { // TO DO: deal with mixed encodings and locale optionally - tmplow = mid; - tmpupp = mid; - while(tmplowival.ull) { - xupp=mid; - } else { // xval.ull == ival.ull) - tmplow = mid; - tmpupp = mid; - while(tmplow-1 && op[col] != EQ) { - Rboolean isivalNA = !isInt64 ? ISNAN(dic[ir]) : (DtoLL(dic[ir]) == NA_INT64_LL); - switch (op[col]) { - case LE : if (!isivalNA) xlow = xlowIn; break; - case LT : xupp = xlow + 1; if (!isivalNA) xlow = xlowIn; break; - case GE : if (!isivalNA) xupp = xuppIn; break; - case GT : xlow = xupp - 1; if (!isivalNA) xupp = xuppIn; break; - default : error(_("Internal error in bmerge_r for '%s' column. 
Unrecognized value op[col]=%d"), type2char(TYPEOF(xc)), op[col]); // #nocov - } - // for LE/LT cases, we need to ensure xlow excludes NA indices, != EQ is checked above already - if (op[col] <= 3 && xlow-1) { - while(tmplow0, int, 0, 0, ival) + // NA_STRING are allowed and joined to; does not do ENC2UTF8 again inside StrCmp + // TO DO: deal with mixed encodings and locale optionally; could StrCmp non-ascii in a thread-safe non-alloc manner + } break; + case REALSXP : + if (INHERITS(xc, char_integer64)) { + const int64_t *icv = (const int64_t *)REAL(ic); + const int64_t *xcv = (const int64_t *)REAL(xc); + const int64_t ival = icv[ir]; + #undef ISNAT + #undef WRAP + #define ISNAT(x) ((x)==NA_INTEGER64) + #define WRAP(x) (x) + DO(const int64_t xval=xcv[XIND(mid)], xvalival, int64_t, ival-xcv[XIND(xlow)], xcv[XIND(xupp)]-ival, ival) + } else { + const double *icv = REAL(ic); + const double *xcv = REAL(xc); + const double ival = icv[ir]; + const uint64_t ivalt = dtwiddle(ival); // TO: remove dtwiddle by dealing with NA, NaN, -Inf, +Inf up front + #undef ISNAT + #undef WRAP + #define ISNAT(x) (ISNAN(x)) + #define WRAP(x) (dtwiddle(x)) + DO(const uint64_t xval=dtwiddle(xcv[XIND(mid)]), xvalivalt, double, icv[ir]-xcv[XIND(xlow)], xcv[XIND(xupp)]-icv[ir], ivalt) } - // ilow and iupp now surround the group in ic, too - } break; - default: - error(_("Type '%s' not supported for joining/merging"), type2char(TYPEOF(xc))); + // supported types were checked up front to avoid handling an error here in (future) parallel region } - if (xlow1) allLen1[0] = FALSE; if (nqmaxgrp == 1) { - for (j=ilow+1; jxuppIn) error(_("Internal error: xlow!=xupp-1 || xlowxuppIn")); // # nocov - if (rollToNearest) { // value of roll ignored currently when nearest - if ( (!lowmax || xlow>xlowIn) && (!uppmax || xupp0.0 && (!lowmax || xlow>xlowIn) && (xuppxlowIn || !lowmax || rollends[0])) - || (roll>0.0 && xlow==xlowIn && lowmax && rollends[0]) ) - && ( (TYPEOF(ic)==REALSXP && - (ival.d = REAL(ic)[ir], xval.d = REAL(xc)[XIND(xupp)], 1) && - (( !isInt64 && - (xval.d-ival.d-rollabs < 1e-6 || - xval.d-ival.d == rollabs /*#1007*/)) - || ( isInt64 && - (double)(xval.ll-ival.ll)-rollabs < 1e-6 ) )) - || (TYPEOF(ic)<=INTSXP && (double)(INTEGER(xc)[XIND(xupp)]-INTEGER(ic)[ir])-rollabs < 1e-6 ) - || (TYPEOF(ic)==STRSXP) )) { - retFirst[ir] = xupp+1; // == xlow+2 - retLength[ir] = 1; - } - } - if (iupp-ilow > 2 && retFirst[ir]!=NA_INTEGER) { - // >=2 equal values in the last column being rolling to the same point. - for (j=ilow+1; jilowIn && (xlow>xlowIn || ((roll!=0.0 || op[col] != EQ) && col==ncol-1))) + if (ilow>ilowIn && (xlow>xlowIn || isRollCol)) bmerge_r(xlowIn, xlow+1, ilowIn, ilow+1, col, 1, lowmax, uppmax && xlow+1==xuppIn); - if (iuppmax) max=tmp; else if (tmpy - return strcmp(CHAR(ENC2UTF8(x)), CHAR(ENC2UTF8(y))); // TODO: always calling ENC2UTF8 here could be expensive + return strcmp(CHAR(x), CHAR(y)); // bmerge calls ENC2UTF8 on x and y before passing here } -/* ENC2UTF8 handles encoding issues by converting all marked non-utf8 encodings alone to utf8 first. The function could be wrapped - in the first if-statement already instead of at the last stage, but this is to ensure that all-ascii cases are handled with maximum efficiency. - This seems to fix the issues as far as I've checked. Will revisit if necessary. - OLD COMMENT: can return 0 here for the same string in known and unknown encodings, good if the unknown string is in that encoding but not if not ordering is ascii only (C locale). 
- TO DO: revisit and allow user to change to strcoll, and take account of Encoding. see comments in bmerge(). 10k calls of strcmp = 0.37s, 10k calls of strcoll = 4.7s. See ?Comparison, ?Encoding, Scollate in R internals. - TO DO: check that all unknown encodings are ascii; i.e. no non-ascii unknowns are present, and that either Latin1 - or UTF-8 is used by user, not both. Then error if not. If ok, then can proceed with byte level. ascii is never marked known by R, but - non-ascii (i.e. knowable encoding) could be marked unknown. Does R API provide is_ascii? -*/ static void cradix_r(SEXP *xsub, int n, int radix) // xsub is a unique set of CHARSXP, to be ordered by reference @@ -395,13 +386,13 @@ int getNumericRounding_C() // for signed integers it's easy: flip sign bit to swap positives and negatives; the resulting unsigned is in the right order with INT_MIN ending up as 0 // for floating point finite you have to flip the other bits too if it was signed: http://stereopsis.com/radix.html -uint64_t dtwiddle(const void *p, int i) +uint64_t dtwiddle(double x) //const void *p, int i) { union { double d; uint64_t u64; } u; // local for thread safety - u.d = ((double *)p)[i]; + u.d = x; //((double *)p)[i]; if (R_FINITE(u.d)) { if (u.d==0) u.d=0; // changes -0.0 to 0.0, issue #743 u.u64 ^= (u.u64 & 0x8000000000000000) ? 0xffffffffffffffff : 0x8000000000000000; // always flip sign bit and if negative (sign bit was set) flip other bits too @@ -689,7 +680,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S elem = ISNA(xd[i]) ? naval : nanval; } } else { - elem = dtwiddle(xd, i); // TODO: could avoid twiddle() if all positive finite which could be known from range_d. + elem = dtwiddle(xd[i]); // TODO: could avoid twiddle() if all positive finite which could be known from range_d. // also R_FINITE is repeated within dtwiddle() currently, wastefully given the if() above } WRITE_KEY @@ -1289,7 +1280,7 @@ SEXP issorted(SEXP x, SEXP by) while (i=xd[i-1]) i++; } else { double *xd = REAL(x); - while (i=dtwiddle(xd,i-1)) i++; // TODO: change to loop over any NA or -Inf at the beginning and then proceed without dtwiddle() (but rounding) + while (i=dtwiddle(xd[i-1])) i++; // TODO: change to loop over any NA or -Inf at the beginning and then proceed without dtwiddle() (but rounding) } break; case STRSXP : { @@ -1355,7 +1346,7 @@ SEXP issorted(SEXP x, SEXP by) } break; case 1: { // regular double in REALSXP const double *p = (const double *)colp; - ok = dtwiddle(p,0)>dtwiddle(p,-1); // TODO: avoid dtwiddle by looping over any NA at the beginning, and remove NumericRounding. + ok = dtwiddle(p[0])>dtwiddle(p[-1]); // TODO: avoid dtwiddle by looping over any NA at the beginning, and remove NumericRounding. } break; case 2: { // integer64 in REALSXP const int64_t *p = (const int64_t *)colp; diff --git a/src/uniqlist.c b/src/uniqlist.c index e4cdfaa0e5..d79f7587e0 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -75,20 +75,22 @@ SEXP uniqlist(SEXP l, SEXP order) } } break; case REALSXP : { - const uint64_t *vd=(const uint64_t *)REAL(v); - uint64_t prev, elem; // grouping by integer64 makes sense (ids). 
grouping by float supported but a good use-case for that is harder to imagine if (getNumericRounding_C()==0 /*default*/ || inherits(v, "integer64")) { + const uint64_t *vd=(const uint64_t *)REAL(v); + uint64_t prev, elem; if (via_order) { COMPARE1_VIA_ORDER COMPARE2 } else { COMPARE1 COMPARE2 } } else { + const double *vd=(const double *)REAL(v); + double prev, elem; if (via_order) { - COMPARE1_VIA_ORDER && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 + COMPARE1_VIA_ORDER && dtwiddle(elem)!=dtwiddle(prev) COMPARE2 } else { - COMPARE1 && dtwiddle(&elem, 0)!=dtwiddle(&prev, 0) COMPARE2 + COMPARE1 && dtwiddle(elem)!=dtwiddle(prev) COMPARE2 } } } break; @@ -119,7 +121,7 @@ SEXP uniqlist(SEXP l, SEXP order) ulv = (unsigned long long *)REAL(v); b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup) if (!b && !i64[j]) { - b = dtwiddle(ulv, thisi) == dtwiddle(ulv, previ); + b = dtwiddle(REAL(v)[thisi]) == dtwiddle(REAL(v)[previ]); // could store LHS for use next time as RHS (to save calling dtwiddle twice). However: i) there could be multiple double columns so vector of RHS would need // to be stored, ii) many short-circuit early before the if (!b) anyway (negating benefit) and iii) we may not have needed LHS this time so logic would be complex. } @@ -313,7 +315,7 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul case REALSXP: { double *xd = REAL(v); b = i64[j] ? ((int64_t *)xd)[thisi] >= ((int64_t *)xd)[previ] : - dtwiddle(xd, thisi) >= dtwiddle(xd, previ); + dtwiddle(xd[thisi]) >= dtwiddle(xd[previ]); } break; default: error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov From d88cdd4db75daf7996226bfca775071728eb7fe5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 25 Jan 2021 13:23:18 -0700 Subject: [PATCH 161/588] https://r-datatable.com now works (#4881) --- DESCRIPTION | 2 +- NEWS.md | 4 +++- README.md | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 00ec8c3bb6..dc8f324342 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,7 +68,7 @@ Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (> SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE -URL: https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table +URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table BugReports: https://github.com/Rdatatable/data.table/issues VignetteBuilder: knitr ByteCompile: TRUE diff --git a/NEWS.md b/NEWS.md index 3c62f3c261..e79ab4a795 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ 1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. +2. 
`r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage being forwarded to being `https:` for many years. Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. + # data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) @@ -1441,7 +1443,7 @@ When `j` is a symbol (as in the quanteda and xgboost examples above) it will con 2. Just to state explicitly: data.table does not now depend on or require OpenMP. If you don't have it (as on CRAN's Mac it appears but not in general on Mac) then data.table should build, run and pass all tests just fine. -3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. +3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. 4. The FAQ vignette has been revised given the changes in v1.9.8. In particular, the very first FAQ. 
diff --git a/README.md b/README.md index da7a902b24..fcaa408b80 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# data.table +# data.table [![CRAN status](https://cranchecks.info/badges/flavor/release/data.table)](https://cran.r-project.org/web/checks/check_results_data.table.html) From d8f43cb984d9ed11cbadf0b758c8f29a0707fb47 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 2 Feb 2021 14:31:32 -0700 Subject: [PATCH 162/588] Arun's new email address --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index dc8f324342..c7820bbb34 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Version: 1.13.7 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), - person("Arun","Srinivasan", role="aut", email="arunkumar.sriniv@gmail.com"), + person("Arun","Srinivasan", role="aut", email="asrini@pm.me"), person("Jan","Gorecki", role="ctb"), person("Michael","Chirico", role="ctb"), person("Pasha","Stetsenko", role="ctb"), From 6924483890bfc9a08d481ce4d0a99660e7abb433 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 6 Feb 2021 03:27:14 -0700 Subject: [PATCH 163/588] fread tz= default changed from "" to "UTC" (#4894) --- NEWS.md | 10 ++++++++++ R/fread.R | 2 +- inst/tests/tests.Rraw | 18 ++++++++++-------- man/fread.Rd | 4 ++-- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index e79ab4a795..04204c299c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,16 @@ # data.table [v1.13.7](https://github.com/Rdatatable/data.table/milestone/20) (in development) +## POTENTIALLY BREAKING CHANGES + +1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved reading datetime. Before then datetime was read as character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. 2020-07-24T10:11:12.134Z where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz='UTC'` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended "In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.". + + At the `rstudio::global(2021)` conference, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow csv performance to data.table csv performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to data.table as his main point. Arrow was presented as 3 times faster than data.table. He talked at length about this result. This result is now being quoted in the community. However, no reproducible code was provided and we were not contacted in advance of the high profile talk in case we had any comments. Neal briefly mentioned New York Taxi data. 
That is a dataset known to us as containing unmarked datetime. We don't know if he set `tz='UTC'` or not. We could have suggested that if he had asked. We do know that setting `tz='UTC'` does speed up reading the New York Taxi dataset significantly. We don't know if the datetimes in the New York Taxi dataset really are in UTC, or local time, but we know it is common practice to read them as if they are UTC regardless. + + We are open source developers just trying to do our best. + + As an angry reaction to Neal's presentation, the default change from `tz=""` to `tz=UTC` is accelerated. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,004 CRAN packages directly using data.table are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behaviour. This migration option is temporary and will be removed in the near future. + ## BUG FIXES 1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. diff --git a/R/fread.R b/R/fread.R index 95e5c4a45a..0da96fe0e4 100644 --- a/R/fread.R +++ b/R/fread.R @@ -5,7 +5,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("d col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), -yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="") +yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") { if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.") input_has_vars = length(all.vars(substitute(input)))>0L # see news for v1.11.6 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2b44b3038f..c5910f5c81 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10845,10 +10845,12 @@ TZnotUTC = !identical(tt,"") && !is_utc(tt) if (TZnotUTC) { # from v1.13.0 these tests work when running under non-UTC because they compare to as.POSIXct which reads these unmarked datetime in local # the new tests 2150.* cover more cases - test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character")), data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae")) - test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE), + # from v1.14.0, the tz="" is needed + test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character"), tz=""), + data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae")) + test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE, tz=""), ans<-data.table(c=as.POSIXct("2015-06-01 11:00:00"), d="a", e=1.5, 
f="M", g=9L, h=FALSE)) - test(1743.27, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character=2), drop=c("a","b"), logical01=TRUE), + test(1743.27, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character=2), drop=c("a","b"), logical01=TRUE, tz=""), ans) } @@ -17062,7 +17064,7 @@ test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preservin fwrite(DT, tmp, dateTimeAs='write.csv') # as write.csv, writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker oldtz = Sys.getenv("TZ", unset=NA) Sys.unsetenv("TZ") -test(2150.021, sapply(fread(tmp), typeof), c(dates="integer", times="character")) # as before v1.13.0, datetime with missing timezone read as character +test(2150.021, sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character test(2150.022, fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC Sys.setenv(TZ="UTC") test(2150.023, fread(tmp), DT) # TZ environment variable is also recognized @@ -17072,7 +17074,7 @@ if (.Platform$OS.type!="windows") { # blank TZ env variable on non-Windows is recognized as UTC consistent with C and R; but R's tz= argument is the opposite and uses "" for local } Sys.unsetenv("TZ") -tt = fread(tmp, colClasses=list(POSIXct="times")) +tt = fread(tmp, colClasses=list(POSIXct="times"), tz="") # from v1.14.0 tz="" needed test(2150.025, attr(tt$times, "tzone"), "") # as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) # the times will be different though here because as.POSIXct read them as local time. if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) @@ -17098,7 +17100,7 @@ test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local data.table(a=as.POSIXct("2015-01-01"), b=as.POSIXct("2015-01-01"))) test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01"))) -test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2"), # no Z so as character as before v1.13.0 +test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2", tz=""), # no Z, tz="" needed for this test from v1.14.0 if (TZnotUTC) data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2)) else data.table(a=setattr(c(as.POSIXct("2015-01-01",tz="UTC"), as.POSIXct("2015-01-02 01:02:03",tz="UTC")),"tzone","UTC"), b=c(1.1, 1.2))) # some rows are date-only, some rows UTC-timestamp --> read the date-only in UTC too @@ -17112,9 +17114,9 @@ test(2150.16, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClass ans_print = capture.output(print(ans)) options(datatable.old.fread.datetime.character=NULL) if (TZnotUTC) { - test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), + test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""), ans, output=ans_print) - test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), + test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""), data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) } else { test(2150.19, 
fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), diff --git a/man/fread.Rd b/man/fread.Rd index cd14c0fbd8..703eb70d3e 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -24,7 +24,7 @@ data.table=getOption("datatable.fread.datatable", TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS keepLeadingZeros = getOption("datatable.keepLeadingZeros", FALSE), -yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="" +yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" ) } \arguments{ @@ -64,7 +64,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="" \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base:tempfile]{base::tempdir}}. } - \item{tz}{ Relevant to datetime values which have no Z or UTC-offset at the end, i.e. \emph{unmarked} datetime, as written by \code{\link[utils:write.table]{utils::write.csv}}. The default \code{tz=""} means interpet unmarked datetime in the timezone of the R session, for consistency with R's \code{as.POSIXct()} and backwards compatibility. Set \code{tz="UTC"} to read unmarked datetime in UTC. Note that \code{fwrite()} by default writes datetime in UTC including the final Z (i.e. UTC-marked datetime) and \code{fwrite}'s output will be read by \code{fread} consistently and quickly without needing to use \code{tz=} or \code{colClasses=}. If the TZ environment variable is set to \code{"UTC"} (or \code{""} on non-Windows where unset vs `""` is significant) then R's timezone is already UTC, the default \code{tz=""} means UTC, and unmarked datetime will be read as UTC. The TZ environment variable being unset, however, means local time, in both C and R, and is quite different from the TZ environment variable being set to \code{""} on non-Windows which means UTC not local. You can use \code{Sys.setenv(TZ="UTC")}, and \code{Sys.unsetenv("TZ")}, too, and \code{fread} will use the latest value. } + \item{tz}{ Relevant to datetime values which have no Z or UTC-offset at the end, i.e. \emph{unmarked} datetime, as written by \code{\link[utils:write.table]{utils::write.csv}}. The default \code{tz="UTC"} reads unmarked datetime as UTC POSIXct efficiently. \code{tz=""} reads unmarked datetime as type character (slowly) so that \code{as.POSIXct} can interpret (slowly) the character datetimes in local timezone; e.g. by using \code{"POSIXct"} in \code{colClasses=}. Note that \code{fwrite()} by default writes datetime in UTC including the final Z and therefore \code{fwrite}'s output will be read by \code{fread} consistently and quickly without needing to use \code{tz=} or \code{colClasses=}. 
If the \code{TZ} environment variable is set to \code{"UTC"} (or \code{""} on non-Windows where unset vs `""` is significant) then the R session's timezone is already UTC and \code{tz=""} will result in unmarked datetimes being read as UTC POSIXct. For more information, please see the news items from v1.13.0 and v1.14.0. } } \details{ From 97c96b2aacaacc0ae0e2df7b5ed85124da5eb524 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 6 Feb 2021 05:20:21 -0700 Subject: [PATCH 164/588] NEWS-only: community consultation link added to news item --- NEWS.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 04204c299c..1032017a00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,13 +6,15 @@ ## POTENTIALLY BREAKING CHANGES -1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved reading datetime. Before then datetime was read as character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. 2020-07-24T10:11:12.134Z where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz='UTC'` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended "In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.". +1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. 2020-07-24T10:11:12.134Z where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz='UTC'` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended "In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.". At the `rstudio::global(2021)` conference, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow csv performance to data.table csv performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to data.table as his main point. 
Arrow was presented as 3 times faster than data.table. He talked at length about this result. This result is now being quoted in the community. However, no reproducible code was provided and we were not contacted in advance of the high profile talk in case we had any comments. Neal briefly mentioned New York Taxi data. That is a dataset known to us as containing unmarked datetime. We don't know if he set `tz='UTC'` or not. We could have suggested that if he had asked. We do know that setting `tz='UTC'` does speed up reading the New York Taxi dataset significantly. We don't know if the datetimes in the New York Taxi dataset really are in UTC, or local time, but we know it is common practice to read them as if they are UTC regardless. - + We are open source developers just trying to do our best. - - As an angry reaction to Neal's presentation, the default change from `tz=""` to `tz=UTC` is accelerated. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,004 CRAN packages directly using data.table are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behaviour. This migration option is temporary and will be removed in the near future. + + As an angry reaction to Neal's presentation, the default change from `tz=""` to `tz=UTC` is accelerated. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,004 CRAN packages directly using data.table are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. + + The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. ## BUG FIXES From 61475136d81e45a90cfb0af191aed4f3f675ec1c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 20 Feb 2021 14:01:51 -0700 Subject: [PATCH 165/588] NEWS-only: update item about tz='UTC' --- NEWS.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1032017a00..d53fcf0a17 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,13 +6,11 @@ ## POTENTIALLY BREAKING CHANGES -1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. 2020-07-24T10:11:12.134Z where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz='UTC'` to turn on the new behavior and read unmarked datetime as UTC, quickly. 
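For example, a minimal sketch of the two settings on an unmarked datetime (assuming an R session whose TZ is not UTC; when the session is running in UTC, both calls parse to POSIXct):

```R
library(data.table)
# no trailing 'Z' on the datetime, i.e. unmarked
DT1 = fread("id,when\n1,2015-06-01 11:00:00", tz="UTC")  # 'when' parsed natively as POSIXct with tzone "UTC", fast
DT2 = fread("id,when\n1,2015-06-01 11:00:00", tz="")     # 'when' left as character, for as.POSIXct() in local time
sapply(DT1, typeof)  # when: "double"   (POSIXct)
sapply(DT2, typeof)  # when: "character"
```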
R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended "In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.". +1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz='UTC'` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended "In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.". - At the `rstudio::global(2021)` conference, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow csv performance to data.table csv performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to data.table as his main point. Arrow was presented as 3 times faster than data.table. He talked at length about this result. This result is now being quoted in the community. However, no reproducible code was provided and we were not contacted in advance of the high profile talk in case we had any comments. Neal briefly mentioned New York Taxi data. That is a dataset known to us as containing unmarked datetime. We don't know if he set `tz='UTC'` or not. We could have suggested that if he had asked. We do know that setting `tz='UTC'` does speed up reading the New York Taxi dataset significantly. We don't know if the datetimes in the New York Taxi dataset really are in UTC, or local time, but we know it is common practice to read them as if they are UTC regardless. + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than data.table. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). - We are open source developers just trying to do our best. - - As an angry reaction to Neal's presentation, the default change from `tz=""` to `tz=UTC` is accelerated. 
If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,004 CRAN packages directly using data.table are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. + `tz=`'s default is now changed from `""` to `UTC`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. From 0310dc18b200904d2d8c084caecd9ea885afce0c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 20 Feb 2021 14:09:40 -0700 Subject: [PATCH 166/588] NEWS-only: formatting --- NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index d53fcf0a17..77f7bf31f2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,11 +6,11 @@ ## POTENTIALLY BREAKING CHANGES -1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz='UTC'` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended "In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.". +1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. 
The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. - At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than data.table. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). - `tz=`'s default is now changed from `""` to `UTC`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. + `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. From 263b53e50241914a22f7ba6a139b52162c9d7927 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 20 Feb 2021 16:15:20 -0700 Subject: [PATCH 167/588] 1.14.0 on CRAN. 
Bump to 1.14.1 --- .dev/CRAN_Release.cmd | 37 ++++++++++++++++------------------- DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 15 +++++++++++--- src/init.c | 2 +- vignettes/datatable-intro.Rmd | 2 +- 6 files changed, 35 insertions(+), 29 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 15c7ab0bcb..448b15676e 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -208,15 +208,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. -R CMD check data.table_1.13.7.tar.gz --as-cran -R CMD INSTALL data.table_1.13.7.tar.gz --html +R CMD check data.table_1.14.1.tar.gz --as-cran +R CMD INSTALL data.table_1.14.1.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.13.7.tar.gz +R CMD check data.table_1.14.1.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -233,9 +233,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. -PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.13.7.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.1.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.13.7.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.1.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -266,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.13.7.tar.gz +R310 CMD INSTALL ./data.table_1.14.1.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.13.7.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.14.1.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.13.7.tar.gz +R CMD check data.table_1.14.1.tar.gz ##################################################### @@ -336,8 +336,8 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.13.7.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.13.7.tar.gz +Rdevel-strict-gcc CMD INSTALL data.table_1.14.1.tar.gz +Rdevel-strict-clang CMD INSTALL data.table_1.14.1.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. 
Rdevel was compiled with them so should be passed through to here Rdevel-strict-gcc Rdevel-strict-clang # repeat below with clang and gcc @@ -378,7 +378,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.13.7.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.14.1.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -416,7 +416,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.13.7.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.14.1.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -574,7 +574,7 @@ du -k inst/tests # 0.75MB after R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" Rdevel -q -e "packageVersion('xml2')" # ensure installed -Rdevel CMD check data.table_1.13.6.tar.gz --as-cran # use latest Rdevel as it may have extra checks +Rdevel CMD check data.table_1.14.0.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -582,10 +582,7 @@ Resubmit to winbuilder (R-release, R-devel and R-oldrelease) Submit to CRAN. Message template : ------------------------------------------------------------ Hello, -921 CRAN revdeps checked. None are impacted. -valgrind 'additional check' fixed. -Solaris not yet resolved. -POUMM's gcc-ASAN error resolved by this data.table update. +1,016 CRAN revdeps checked. None are impacted. Many thanks! Best, Matt ------------------------------------------------------------ @@ -604,8 +601,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.5 to 1.13.7, and 1.13.4 to 1.13.6 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.7 to 1.14.1, and 1.13.6 to 1.14.0 (e.g. in step 8 and 9 below) 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.13.6 on CRAN. Bump to 1.13.7" -9. Take sha from step 8 and run `git tag 1.13.6 96c..sha..d77` then `git push origin 1.13.6` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.14.0 on CRAN. Bump to 1.14.1" +9. 
Take sha from step 8 and run `git tag 1.14.0 96c..sha..d77` then `git push origin 1.14.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/DESCRIPTION b/DESCRIPTION index c7820bbb34..78ca52b485 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.13.7 +Version: 1.14.1 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/Makefile b/Makefile index e1331064d4..2be00d3b74 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.13.7.tar.gz + $(RM) data.table_1.14.1.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.13.7.tar.gz + $(R) CMD INSTALL data.table_1.14.1.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.13.7.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.1.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index 77f7bf31f2..a51de94eb6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,13 +2,22 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.13.7](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.1](https://github.com/Rdatatable/data.table/milestone/20) (in development) + +## NEW FEATURES + +## BUG FIXES + +## NOTES + + +# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (submitted to CRAN on 20 Feb 2021) ## POTENTIALLY BREAKING CHANGES 1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. - At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://twitter.com/enpiar/status/1357729619420475392). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. 
He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. @@ -999,7 +1008,7 @@ has a better chance of working on Mac. ## NOTES -1. The type coercion warning message has been improved, [#2989](https://github.com/Rdatatable/data.table/pull/2989). Thanks to @sarahbeeysian on [Twitter](https://twitter.com/sarahbeeysian/status/1021359529789775872) for highlighting. For example, given the follow statements: +1. The type coercion warning message has been improved, [#2989](https://github.com/Rdatatable/data.table/pull/2989). Thanks to @sarahbeeysian on Twitter for highlighting. For example, given the follow statements: ```R DT = data.table(id=1:3) diff --git a/src/init.c b/src/init.c index 6f3edec64c..714608c408 100644 --- a/src/init.c +++ b/src/init.c @@ -414,6 +414,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.13.7"))); + return(ScalarString(mkChar("1.14.1"))); } diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index ddbb59024d..1dcfe786f5 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -38,7 +38,7 @@ Briefly, if you are interested in reducing *programming* and *compute* time trem ## Data {#data} -In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the [Bureau of Transporation Statistics](https://www.transtats.bts.gov) for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. 
+In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transporation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. We can use `data.table`'s fast-and-friendly file reader `fread` to load `flights` directly as follows: From 3fa8b20435d33b3d4b5c26fd9b0ac14c10b98800 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 25 Feb 2021 11:36:16 -0700 Subject: [PATCH 168/588] .dev-only: revdep tweaks --- .dev/CRAN_Release.cmd | 1 + .dev/revdep.R | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 448b15676e..a2db3058b3 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -529,6 +529,7 @@ sudo apt-get -y install pandoc-citeproc # for basecallQC sudo apt-get -y install libquantlib0-dev # for RQuantLib sudo apt-get -y install cargo # for gifski, a suggest of nasoi sudo apt-get -y install libgit2-dev # for gert +sudo apt-get -y install cmake # for symengine for RxODE sudo R CMD javareconf # ENDIF diff --git a/.dev/revdep.R b/.dev/revdep.R index 225cb67c1b..49aa6e06f9 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -7,6 +7,11 @@ Sys.unsetenv("R_PROFILE_USER") # But if we don't unset it now, anything else from now on that does something like system("R CMD INSTALL"), e.g. update.packages() # and BiocManager::install(), will call this script again recursively. +# options copied from .dev/.Rprofile that aren't run due to the way this script is started via a profile +options(help_type="html") +options(error=quote(dump.frames())) +options(width=200) # for cran() output not to wrap + # Check that env variables have been set correctly: # export R_LIBS_SITE=none # export R_LIBS=~/build/revdeplib/ @@ -233,6 +238,10 @@ status = function(bioc=FALSE) { cran = function() # reports CRAN status of the .cran.fail packages { + if (!length(.fail.cran)) { + cat("No CRAN revdeps in error or warning status\n") + return(invisible()) + } require(data.table) p = proc.time() db = setDT(tools::CRAN_check_results()) From 85adf09e3463838d547977ae9bc75e3b37f9cbaf Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 8 Mar 2021 02:11:19 +0200 Subject: [PATCH 169/588] Internal class aware coerceAs. Already used in nafill and froll (#4491) --- NEWS.md | 11 +++ R/data.table.R | 3 +- R/shift.R | 4 - R/wrappers.R | 3 +- inst/tests/froll.Rraw | 12 +-- inst/tests/nafill.Rraw | 203 ++++++++++++++++++++++++++++++++--------- inst/tests/tests.Rraw | 4 +- man/froll.Rd | 2 +- man/nafill.Rd | 2 +- src/assign.c | 31 ++++--- src/data.table.h | 5 +- src/frollR.c | 67 ++++---------- src/init.c | 8 +- src/nafill.c | 114 +++++++++++++---------- src/types.h | 3 +- src/utils.c | 114 +++++++++++------------ 16 files changed, 353 insertions(+), 233 deletions(-) diff --git a/NEWS.md b/NEWS.md index a51de94eb6..0ec4b3d736 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,10 +6,21 @@ ## NEW FEATURES +1. `nafill()` now applies `fill=` to the front/back of the vector when `type="locf|nocb"`, [#3594](https://github.com/Rdatatable/data.table/issues/3594). Thanks to @ben519 for the feature request. It also now returns a named object based on the input names. 
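For example, a small sketch of the new behaviour (leading/trailing `NA`s, which `locf`/`nocb` cannot fill from the data, now take `fill=`):

```R
library(data.table)
x = c(NA, 1L, NA, 3L, NA)
nafill(x, type="locf", fill=0L)  # should give 0 1 1 3 3 ; the leading NA takes fill=
nafill(x, type="nocb", fill=0L)  # should give 1 1 3 3 0 ; the trailing NA takes fill=
nafill(list(a=x, b=x*2L), type="locf", fill=0L)  # named list in, named list out
```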
Note that if you are considering joining and then using `nafill(...,type='locf|nocb')` afterwards, please review `roll=`/`rollends=` which should achieve the same result in one step more efficiently. `nafill()` is for when filling-while-joining (i.e. `roll=`/`rollends=`/`nomatch=`) cannot be applied. + ## BUG FIXES ## NOTES +1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : + + ```R + x = c(2L,NA,4L,5L) + nafill(x, fill=3) # no warning; requiring 3L too inconvenient + nafill(x, fill="3") # warns in case either x or "3" was a mistake + nafill(x, fill=3.14) # warns that precision has been lost + nafill(x, fill=as.integer(3.14)) # no warning; the as. conveys intent + ``` # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (submitted to CRAN on 20 Feb 2021) diff --git a/R/data.table.R b/R/data.table.R index 2b010db77a..961d9eb857 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -141,7 +141,8 @@ replace_dot_alias = function(e) { return(ans) } if (!missing(verbose)) { - stopifnot(isTRUEorFALSE(verbose)) + if (!is.integer(verbose) && !is.logical(verbose)) stop("verbose must be logical or integer") + if (length(verbose)!=1 || anyNA(verbose)) stop("verbose must be length 1 non-NA") # set the global verbose option because that is fetched from C code without having to pass it through oldverbose = options(datatable.verbose=verbose) on.exit(options(oldverbose)) diff --git a/R/shift.R b/R/shift.R index 63a1cdec42..c73d8b0840 100644 --- a/R/shift.R +++ b/R/shift.R @@ -26,14 +26,10 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA) { type = match.arg(type) - if (type!="const" && !missing(fill)) - warning("argument 'fill' ignored, only make sense for type='const'") .Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL) } setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) { type = match.arg(type) - if (type!="const" && !missing(fill)) - warning("argument 'fill' ignored, only make sense for type='const'") invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols)) } diff --git a/R/wrappers.R b/R/wrappers.R index 5fec33a92f..0c226b9f30 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -9,6 +9,7 @@ fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) colnamesInt = function(x, 
cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) -coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) + +coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy) diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index 84143e587c..f6a4f96a80 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -78,15 +78,15 @@ test(6000.011, frollmean(x, n, adaptive=TRUE), list(c(NA, 1, 1.25), c(NA, 1, 1.2 #### error on unsupported type dx = data.table(real=1:10/2, char=letters[1:10]) -test(6000.012, frollmean(dx, 3), error="x must be list, data.frame or data.table of numeric or logical types") +test(6000.012, frollmean(dx, 3), error="x must be of type numeric or logical, or a list, data.frame or data.table of such") dx = data.table(real=1:10/2, fact=factor(letters[1:10])) -test(6000.013, frollmean(dx, 3), error="x must be list, data.frame or data.table of numeric or logical types") +test(6000.013, frollmean(dx, 3), error="x must be of type numeric or logical, or a list, data.frame or data.table of such") #dx = data.table(real=1:10/2, logi=logical(10)) #test(6000.014, frollmean(dx, 3), error="x must be list, data.frame or data.table of numeric types") # commented out as support added in #3749, tested in .009 dx = data.table(real=1:10/2, list=rep(list(NA), 10)) -test(6000.015, frollmean(dx, 3), error="x must be list, data.frame or data.table of numeric or logical types") +test(6000.015, frollmean(dx, 3), error="x must be of type numeric or logical, or a list, data.frame or data.table of such") x = letters[1:10] -test(6000.016, frollmean(x, 3), error="x must be of type numeric or logical") +test(6000.016, frollmean(x, 3), error="x must be of type numeric or logical, or a list, data.frame or data.table of such") x = 1:10/2 test(6000.017, frollmean(x, "a"), error="n must be integer") test(6000.018, frollmean(x, factor("a")), error="n must be integer") @@ -355,8 +355,8 @@ test(6000.074, frollmean(1:3, 2, fill=0L), c(0, 1.5, 2.5)) test(6000.075, frollmean(1:3, 2, fill=NA_integer_), c(NA_real_, 1.5, 2.5)) test(6000.076, frollmean(1:3, 2, fill=1:2), error="fill must be a vector of length 1") test(6000.077, frollmean(1:3, 2, fill=NA), c(NA_real_, 1.5, 2.5)) -test(6000.078, frollmean(1:3, 2, fill=TRUE), error="fill must be numeric") -test(6000.079, frollmean(1:3, 2, fill=FALSE), error="fill must be numeric") +test(6000.078, frollmean(1:3, 2, fill=TRUE), frollmean(1:3, 2, fill=1)) #error="fill must be numeric") # fill already coerced, as 'x' arg +test(6000.079, frollmean(1:3, 2, fill=FALSE), frollmean(1:3, 2, fill=0)) #error="fill must be numeric") test(6000.080, frollmean(1:3, 2, fill="a"), error="fill must be numeric") test(6000.081, frollmean(1:3, 2, fill=factor("a")), error="fill must be numeric") test(6000.082, frollmean(1:3, 2, fill=list(NA)), error="fill must be numeric") diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index f22a66f702..dcaa0f40d4 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -7,13 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test INT = data.table:::INT colnamesInt = data.table:::colnamesInt - coerceFill = data.table:::coerceFill - # masked by which package? 
- # ================================= - copy = data.table::copy # bit64; copy is used in this file, so this line is needed - setattr = data.table::setattr # bit ; setattr does not appear in this file, so not needed. Here in case that changes. - # use of copy and setattr within data.table's own code is not masked by other packages - # we only have to do this in test files because, like a user would, these test files run like a user + coerceAs = data.table:::coerceAs } sugg = c( @@ -34,7 +28,7 @@ test(1.04, nafill(x, fill=5), INT(5,5,3,4,5,5,7,8,5,5)) test(1.05, nafill(x, fill=NA_integer_), x) test(1.06, nafill(x, fill=NA), x) test(1.07, nafill(x, fill=NA_real_), x) -test(1.08, nafill(x, fill=Inf), x) +test(1.08, nafill(x, fill=Inf), x, warning="precision lost") test(1.09, nafill(x, fill=NaN), x) y = x/2 test(1.11, nafill(y, "locf"), c(NA,NA,3,4,4,4,7,8,8,8)/2) @@ -53,31 +47,31 @@ z[9L] = -Inf test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,4,4,7,8,-Inf,-Inf)/2) test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,7,7,7,8,-Inf,NA)/2) dt = data.table(x, y, z) -test(1.31, nafill(dt, "locf"), unname(lapply(dt, nafill, "locf"))) -test(1.32, nafill(dt, "nocb"), unname(lapply(dt, nafill, "nocb"))) -test(1.33, nafill(dt, fill=0), unname(lapply(dt, nafill, fill=0))) +test(1.31, nafill(dt, "locf"), lapply(dt, nafill, "locf")) +test(1.32, nafill(dt, "nocb"), lapply(dt, nafill, "nocb")) +test(1.33, nafill(dt, fill=0), lapply(dt, nafill, fill=0)) l = list(x, y[1:8], z[1:6]) test(1.41, nafill(l, "locf"), lapply(l, nafill, "locf")) test(1.42, nafill(l, "nocb"), lapply(l, nafill, "nocb")) test(1.43, nafill(l, fill=0), lapply(l, nafill, fill=0)) l = list(a=c(1:2,NA,4:5), b=as.Date(c(1:2,NA,4:5), origin="1970-01-01"), d=c(NA,2L,NA,4L,NA), e=as.Date(c(NA,2L,NA,4L,NA), origin="1970-01-01")) # Date retain class #3617 -test(1.44, nafill(l, "locf"), list(c(1:2,2L,4:5), structure(c(1,2,2,4,5), class="Date"), c(NA,2L,2L,4L,4L), structure(c(NA,2,2,4,4), class="Date"))) -test(1.45, nafill(l, "nocb"), list(c(1:2,4L,4:5), structure(c(1,2,4,4,5), class="Date"), c(2L,2L,4L,4L,NA), structure(c(2,2,4,4,NA), class="Date"))) -test(1.46, nafill(l, fill=0), list(c(1:2,0L,4:5), structure(c(1,2,0,4,5), class="Date"), c(0L,2L,0L,4L,0L), structure(c(0,2,0,4,0), class="Date"))) -test(1.47, nafill(l, fill=as.Date(0, origin="1970-01-01")), list(c(1:2,0L,4:5), structure(c(1,2,0,4,5), class="Date"), c(0L,2L,0L,4L,0L), structure(c(0,2,0,4,0), class="Date"))) -test(1.48, nafill(l, fill=as.Date("2019-06-05")), list(c(1:2,18052L,4:5), structure(c(1,2,18052,4,5), class="Date"), c(18052L,2L,18052L,4L,18052L), structure(c(18052,2,18052,4,18052), class="Date"))) +test(1.44, nafill(l, "locf"), list(a=c(1:2,2L,4:5), b=structure(c(1,2,2,4,5), class="Date"), d=c(NA,2L,2L,4L,4L), e=structure(c(NA,2,2,4,4), class="Date"))) +test(1.45, nafill(l, "nocb"), list(a=c(1:2,4L,4:5), b=structure(c(1,2,4,4,5), class="Date"), d=c(2L,2L,4L,4L,NA), e=structure(c(2,2,4,4,NA), class="Date"))) +test(1.46, nafill(l, fill=0), list(a=c(1:2,0L,4:5), b=structure(c(1,2,0,4,5), class="Date"), d=c(0L,2L,0L,4L,0L), e=structure(c(0,2,0,4,0), class="Date"))) +test(1.47, nafill(l, fill=as.Date(0, origin="1970-01-01")), list(a=c(1:2,0L,4:5), b=structure(c(1,2,0,4,5), class="Date"), d=c(0L,2L,0L,4L,0L), e=structure(c(0,2,0,4,0), class="Date"))) +test(1.48, nafill(l, fill=as.Date("2019-06-05")), list(a=c(1:2,18052L,4:5), b=structure(c(1,2,18052,4,5), class="Date"), d=c(18052L,2L,18052L,4L,18052L), e=structure(c(18052,2,18052,4,18052), class="Date"))) test(1.49, nafill(numeric()), numeric()) if 
(test_bit64) { l = list(a=as.integer64(c(1:2,NA,4:5)), b=as.integer64(c(NA,2L,NA,4L,NA))) - test(1.61, lapply(nafill(l, "locf"), as.character), lapply(list(c(1:2,2L,4:5), c(NA,2L,2L,4L,4L)), as.character)) - test(1.62, lapply(nafill(l, "nocb"), as.character), lapply(list(c(1:2,4L,4:5), c(2L,2L,4L,4L,NA)), as.character)) - test(1.63, lapply(nafill(l, fill=0), as.character), lapply(list(c(1:2,0L,4:5), c(0L,2L,0L,4L,0L)), as.character)) - test(1.64, lapply(nafill(l, fill=as.integer64(0)), as.character), lapply(list(c(1:2,0L,4:5), c(0L,2L,0L,4L,0L)), as.character)) - test(1.65, lapply(nafill(l, fill=as.integer64("3000000000")), as.character), list(c("1","2","3000000000","4","5"), c("3000000000","2","3000000000","4","3000000000"))) + test(1.61, lapply(nafill(l, "locf"), as.character), lapply(list(a=c(1:2,2L,4:5), b=c(NA,2L,2L,4L,4L)), as.character)) + test(1.62, lapply(nafill(l, "nocb"), as.character), lapply(list(a=c(1:2,4L,4:5), b=c(2L,2L,4L,4L,NA)), as.character)) + test(1.63, lapply(nafill(l, fill=0), as.character), lapply(list(a=c(1:2,0L,4:5), b=c(0L,2L,0L,4L,0L)), as.character)) + test(1.64, lapply(nafill(l, fill=as.integer64(0)), as.character), lapply(list(a=c(1:2,0L,4:5), b=c(0L,2L,0L,4L,0L)), as.character)) + test(1.65, lapply(nafill(l, fill=as.integer64("3000000000")), as.character), list(a=c("1","2","3000000000","4","5"), b=c("3000000000","2","3000000000","4","3000000000"))) l = lapply(l, `+`, as.integer64("3000000000")) - test(1.66, lapply(nafill(l, "locf"), as.character), list(c("3000000001","3000000002","3000000002","3000000004","3000000005"), c(NA_character_,"3000000002","3000000002","3000000004","3000000004"))) - test(1.67, lapply(nafill(l, "nocb"), as.character), list(c("3000000001","3000000002","3000000004","3000000004","3000000005"), c("3000000002","3000000002","3000000004","3000000004",NA_character_))) - test(1.68, lapply(nafill(l, fill=as.integer64("3000000000")), as.character), list(c("3000000001","3000000002","3000000000","3000000004","3000000005"), c("3000000000","3000000002","3000000000","3000000004","3000000000"))) + test(1.66, lapply(nafill(l, "locf"), as.character), list(a=c("3000000001","3000000002","3000000002","3000000004","3000000005"), b=c(NA_character_,"3000000002","3000000002","3000000004","3000000004"))) + test(1.67, lapply(nafill(l, "nocb"), as.character), list(a=c("3000000001","3000000002","3000000004","3000000004","3000000005"), b=c("3000000002","3000000002","3000000004","3000000004",NA_character_))) + test(1.68, lapply(nafill(l, fill=as.integer64("3000000000")), as.character), list(a=c("3000000001","3000000002","3000000000","3000000004","3000000005"), b=c("3000000000","3000000002","3000000000","3000000004","3000000000"))) test(1.69, nafill(c(1L,2L,NA,4L), fill=as.integer64(3L)), 1:4) test(1.70, nafill(c(1L,2L,NA,4L), fill=as.integer64(NA)), c(1:2,NA,4L)) test(1.71, nafill(c(1,2,NA,4), fill=as.integer64(3)), c(1,2,3,4)) @@ -90,10 +84,10 @@ if (test_bit64) { } if (test_nanotime) { l = list(a=nanotime(c(1:2,NA,4:5)), b=nanotime(c(NA,2L,NA,4L,NA))) - test(1.91, lapply(nafill(l, "locf"), as.character), lapply(list(nanotime(c(1:2,2L,4:5)), nanotime(c(NA,2L,2L,4L,4L))), as.character)) - test(1.92, lapply(nafill(l, "nocb"), as.character), lapply(list(nanotime(c(1:2,4L,4:5)), nanotime(c(2L,2L,4L,4L,NA))), as.character)) - test(1.93, lapply(nafill(l, fill=0), as.character), lapply(list(nanotime(c(1:2,0L,4:5)), nanotime(c(0L,2L,0L,4L,0L))), as.character)) - test(1.94, lapply(nafill(l, fill=nanotime(0)), as.character), lapply(list(nanotime(c(1:2,0L,4:5)), 
nanotime(c(0L,2L,0L,4L,0L))), as.character)) + test(1.91, lapply(nafill(l, "locf"), as.character), lapply(list(a=nanotime(c(1:2,2L,4:5)), b=nanotime(c(NA,2L,2L,4L,4L))), as.character)) + test(1.92, lapply(nafill(l, "nocb"), as.character), lapply(list(a=nanotime(c(1:2,4L,4:5)), b=nanotime(c(2L,2L,4L,4L,NA))), as.character)) + test(1.93, lapply(nafill(l, fill=0), as.character), lapply(list(a=nanotime(c(1:2,0L,4:5)), b=nanotime(c(0L,2L,0L,4L,0L))), as.character)) + test(1.94, lapply(nafill(l, fill=nanotime(0)), as.character), lapply(list(a=nanotime(c(1:2,0L,4:5)), b=nanotime(c(0L,2L,0L,4L,0L))), as.character)) } # setnafill @@ -120,13 +114,13 @@ test(2.08, unname(l), list(c(1:2,18052L,4:5), structure(c(1,2,18052,4,5), class= # exceptions test coverage x = 1:10 -test(3.01, nafill(x, "locf", fill=0L), nafill(x, "locf"), warning="argument 'fill' ignored") -test(3.02, setnafill(list(copy(x)), "locf", fill=0L), setnafill(list(copy(x)), "locf"), warning="argument 'fill' ignored") +test(3.01, nafill(x, "locf", fill=0L), x) +test(3.02, setnafill(list(copy(x)), "locf", fill=0L), list(x)) test(3.03, setnafill(x, "locf"), error="in-place update is supported only for list") test(3.04, nafill(letters[1:5], fill=0), error="must be numeric type, or list/data.table") test(3.05, setnafill(list(letters[1:5]), fill=0), error="must be numeric type, or list/data.table") test(3.06, nafill(x, fill=1:2), error="fill must be a vector of length 1") -test(3.07, nafill(x, fill="asd"), error="fill argument must be numeric") +test(3.07, nafill(x, fill="asd"), x, warning=c("Coercing.*character.*integer","NAs introduced by coercion")) # colnamesInt helper dt = data.table(a=1, b=2, d=3) @@ -166,32 +160,33 @@ if (test_bit64) { } options(old) -# coerceFill +# coerceAs int/numeric/int64 as used in nafill if (test_bit64) { - test(6.01, coerceFill(1:2), error="fill argument must be length 1") - test(6.02, coerceFill("a"), error="fill argument must be numeric") + coerceFill = function(x) lapply(list(1L, 1.0, as.integer64(1)), coerceAs, x=x) # old function used before #4491 + #test(6.01, coerceFill(1:2), error="fill argument must be length 1") + #test(6.02, coerceFill("a"), error="fill argument must be numeric") test(6.11, identical(coerceFill(NA), list(NA_integer_, NA_real_, as.integer64(NA)))) test(6.21, identical(coerceFill(3L), list(3L, 3, as.integer64(3)))) test(6.22, identical(coerceFill(0L), list(0L, 0, as.integer64(0)))) test(6.23, identical(coerceFill(NA_integer_), list(NA_integer_, NA_real_, as.integer64(NA)))) test(6.31, identical(coerceFill(as.integer64(3)), list(3L, 3, as.integer64(3)))) - test(6.32, identical(coerceFill(as.integer64(3000000003)), list(NA_integer_, 3000000003, as.integer64("3000000003")))) + test(6.32, identical(coerceFill(as.integer64(3000000003)), list(NA_integer_, 3000000003, as.integer64("3000000003"))), warning="out-of-range") test(6.33, identical(coerceFill(as.integer64(0)), list(0L, 0, as.integer64(0)))) test(6.34, identical(coerceFill(as.integer64(NA)), list(NA_integer_, NA_real_, as.integer64(NA)))) test(6.41, identical(coerceFill(3), list(3L, 3, as.integer64(3)))) test(6.42, identical(coerceFill(0), list(0L, 0, as.integer64(0)))) test(6.43, identical(coerceFill(NA_real_), list(NA_integer_, NA_real_, as.integer64(NA)))) test(6.44, identical(coerceFill(NaN), list(NA_integer_, NaN, as.integer64(NA)))) - test(6.45, identical(coerceFill(Inf), list(NA_integer_, Inf, as.integer64(NA)))) - test(6.46, identical(coerceFill(-Inf), list(NA_integer_, -Inf, as.integer64(NA)))) - test(6.47, 
identical(coerceFill(-(2^62)), list(NA_integer_, -(2^62), as.integer64("-4611686018427387904")))) - test(6.48, identical(coerceFill(-(2^64)), list(NA_integer_, -(2^64), as.integer64(NA)))) + test(6.45, identical(coerceFill(Inf), list(NA_integer_, Inf, as.integer64(NA))), warning=c("precision lost","precision lost")) + test(6.46, identical(coerceFill(-Inf), list(NA_integer_, -Inf, as.integer64(NA))), warning=c("precision lost","precision lost")) + test(6.47, identical(coerceFill(-(2^62)), list(NA_integer_, -(2^62), as.integer64("-4611686018427387904"))), warning=c("precision lost","precision lost")) + test(6.48, identical(coerceFill(-(2^64)), list(NA_integer_, -(2^64), as.integer64(NA))), warning=c("precision lost","precision lost")) test(6.49, identical(coerceFill(x<-as.integer64(-2147483647)), list(-2147483647L, -2147483647, x))) - test(6.50, identical(coerceFill(x<-as.integer64(-2147483648)), list(NA_integer_, -2147483648, x))) - test(6.51, identical(coerceFill(x<-as.integer64(-2147483649)), list(NA_integer_, -2147483649, x))) + test(6.50, identical(coerceFill(x<-as.integer64(-2147483648)), list(NA_integer_, -2147483648, x)), warning="out-of-range") + test(6.51, identical(coerceFill(x<-as.integer64(-2147483649)), list(NA_integer_, -2147483649, x)), warning="out-of-range") test(6.52, identical(coerceFill(-2147483647), list(-2147483647L, -2147483647, as.integer64("-2147483647")))) test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648, as.integer64("-2147483648")))) - test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649, as.integer64("-2147483649")))) + test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649, as.integer64("-2147483649"))), warning=c("precision lost","precision lost")) } # nan argument to treat NaN as NA in nafill, #4020 @@ -209,3 +204,127 @@ test(7.07, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) test(7.08, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) test(7.09, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1") test(7.10, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN") + +# new tests for fill list +d = data.table(x = c(1:2,NA,4L), y = c(1,2,NA,4)) +test(8.01, nafill(d, fill=3), list(x=1:4, y=c(1,2,3,4))) +test(8.02, nafill(d, fill=3L), list(x=1:4, y=c(1,2,3,4))) +test(8.03, nafill(d, fill=list(3L,3)), list(x=1:4, y=c(1,2,3,4))) +test(8.04, nafill(d, fill=list(3,3L)), list(x=1:4, y=c(1,2,3,4))) +test(8.05, nafill(d, fill=list(3,NA)), list(x=1:4, y=c(1,2,NA,4))) +test(8.06, nafill(d, fill=list(1,9L)), list(x=c(1:2,1L,4L), y=c(1,2,9,4))) +d = as.data.table(setNames(as.list(seq_along(letters)), letters)) ## test names and scalar returned +test(8.11, names(nafill(d, fill=3)), letters) +test(8.12, nafill(c(1:2,NA,4L), "locf"), c(1:2,2L,4L)) +test(8.13, nafill(list(x=c(1:2,NA,4L)), "locf"), list(x=c(1:2,2L,4L))) + +# Extend functionality of nafill to use 'fill' argument for all types #3594 +test(9.01, nafill(c(NA,1,NA,NA,5,3,NA,0), type="locf", fill=-1), `[<-`(nafill(c(NA,1,NA,NA,5,3,NA,0), type="locf"), 1L, -1)) +x = xx = c(rep(NA,2),3:4,rep(NA,2)) +test(9.11, nafill(x, "locf", 0), `[<-`(nafill(x, "locf"), 1:2, 0L)) +test(9.12, nafill(x, "nocb", 0), `[<-`(nafill(x, "nocb"), 5:6, 0L)) +test(9.13, nafill(x, "locf", -1), `[<-`(nafill(x, "locf"), 1:2, -1L)) +test(9.14, nafill(x, "nocb", -1), `[<-`(nafill(x, "nocb"), 5:6, -1L)) +x = as.double(xx) +test(9.21, nafill(x, "locf", 0), `[<-`(nafill(x, "locf"), 1:2, 0)) +test(9.22, 
nafill(x, "nocb", 0), `[<-`(nafill(x, "nocb"), 5:6, 0)) +test(9.23, nafill(x, "locf", -1), `[<-`(nafill(x, "locf"), 1:2, -1)) +test(9.24, nafill(x, "nocb", -1), `[<-`(nafill(x, "nocb"), 5:6, -1)) +if (test_bit64) { + x = as.integer64(xx) + # `[<-.integer64` does not work + seti64 = function(x, i, value) {x[i] = value; x} + test(9.31, nafill(x, "locf", 0), seti64(nafill(x, "locf"), 1:2, as.integer64(0))) + test(9.32, nafill(x, "nocb", 0), seti64(nafill(x, "nocb"), 5:6, as.integer64(0))) + test(9.33, nafill(x, "locf", -1), seti64(nafill(x, "locf"), 1:2, as.integer64(-1))) + test(9.34, nafill(x, "nocb", -1), seti64(nafill(x, "nocb"), 5:6, as.integer64(-1))) +} + +# coerceAs verbose +options(datatable.verbose=2L) +input = 1 +test(10.01, ans<-coerceAs(input, 1), 1, output="double[numeric] into double[numeric]") +test(10.02, address(input)!=address(ans)) +test(10.03, ans<-coerceAs(input, 1, copy=FALSE), 1, output="copy=false and input already of expected type and class double[numeric]") +test(10.04, address(input), address(ans)) +test(10.05, ans<-coerceAs(input, 1L), 1L, output="double[numeric] into integer[integer]") +test(10.06, address(input)!=address(ans)) +test(10.07, ans<-coerceAs(input, 1L, copy=FALSE), 1L, output="double[numeric] into integer[integer]", notOutput="copy=false") +test(10.08, address(input)!=address(ans)) +test(10.09, coerceAs("1", 1L), 1L, output="character[character] into integer[integer]", warning="Coercing.*character.*integer") +test(10.10, coerceAs("1", 1), 1, output="character[character] into double[numeric]", warning="Coercing.*character.*double") +test(10.11, coerceAs("a", factor("x")), factor("a", levels=c("x","a")), output="character[character] into integer[factor]") ## levels of 'as' are retained! +test(10.12, coerceAs("a", factor()), factor("a"), output="character[character] into integer[factor]") +test(10.13, coerceAs(1, factor("x")), factor("x"), output="double[numeric] into integer[factor]") +test(10.14, coerceAs(1, factor("x", levels=c("x","y"))), factor("x", levels=c("x","y")), output="double[numeric] into integer[factor]") +test(10.15, coerceAs(2, factor("x", levels=c("x","y"))), factor("y", levels=c("x","y")), output="double[numeric] into integer[factor]") +test(10.16, coerceAs(1:2, factor(c("x","y"))), factor(c("x","y")), output="integer[integer] into integer[factor]") +test(10.17, coerceAs(1:3, factor(c("x","y"))), output="integer[integer] into integer[factor]", error="factor numbers.*3 is outside the level range") +test(10.18, coerceAs(c(1,2,3), factor(c("x","y"))), output="double[numeric] into integer[factor]", error="factor numbers.*3.000000 is outside the level range") +test(10.19, coerceAs(factor("x"), factor(c("x","y"))), factor("x", levels=c("x","y")), output="integer[factor] into integer[factor]") +test(10.20, coerceAs(factor("x"), factor(c("x","y")), copy=FALSE), factor("x", levels=c("x","y")), output="input already of expected type and class") ## copy=F has copyMostAttrib +a = structure("a", class="a") +b = structure("b", class="b") +test(10.21, coerceAs(a, b), structure("a", class="b"), output="character[a] into character[b]") +a = structure(1L, class="a") +b = structure(2L, class="b") +test(10.22, coerceAs(a, b), structure(1L, class="b"), output="integer[a] into integer[b]") +a = structure(1, class="a") +b = structure(2, class="b") +test(10.23, coerceAs(a, b), structure(1, class="b"), output="double[a] into double[b]") +a = structure(1, class="a") +b = structure(2L, class="b") +test(10.24, coerceAs(a, b), structure(1L, class="b"), 
output="double[a] into integer[b]") +if (test_bit64) { + x = as.integer64(1L) + test(10.81, coerceAs(x, 1), 1, output="double[integer64] into double[numeric]") + test(10.82, coerceAs(x, 1L), 1L, output="double[integer64] into integer[integer]") + test(10.83, coerceAs(x, "1"), error="please use as.character", output="double[integer64] into character[character]") # not yet implemented + test(10.84, coerceAs(1, x), x, output="double[numeric] into double[integer64]") + test(10.85, coerceAs(1L, x), x, output="integer[integer] into double[integer64]") + test(10.86, coerceAs("1", x), x, output="character[character] into double[integer64]", warning="Coercing.*character") + options(datatable.verbose=3L) + test(10.87, coerceAs(x, 1L), 1L, output=c("double[integer64] into integer[integer]","Zero-copy coerce when assigning 'integer64' to 'integer'")) + test(10.88, coerceAs(1L, x), x, output=c("integer[integer] into double[integer64]","Zero-copy coerce when assigning 'integer' to 'integer64'")) + options(datatable.verbose=2L) +} +if (test_nanotime) { + x = nanotime(1L) + test(10.91, coerceAs(x, 1), 1, output="double[nanotime] into double[numeric]") + test(10.92, coerceAs(x, 1L), 1L, output="double[nanotime] into integer[integer]") + test(10.93, coerceAs(x, "1"), error="please use as.character", output="double[nanotime] into character[character]") # not yet implemented + test(10.94, coerceAs(1, x), x, output="double[numeric] into double[nanotime]") + test(10.95, coerceAs(1L, x), x, output="integer[integer] into double[nanotime]") + test(10.96, coerceAs("1", x), x, output="character[character] into double[nanotime]", warning="Coercing.*character") +} +options(datatable.verbose=FALSE) +test(11.01, coerceAs(list(a=1), 1), error="is not atomic") +test(11.02, coerceAs(1, list(a=1)), error="is not atomic") +test(11.03, coerceAs(sum, 1), error="is not atomic") +test(11.04, coerceAs(quote(1+1), 1), error="is not atomic") +test(11.05, coerceAs(as.name("x"), 1), error="is not atomic") +m = matrix(1:4, 2, 2) +a = array(1:8, c(2,2,2)) +test(11.06, coerceAs(m, 1L), error="must not be matrix or array") +test(11.07, coerceAs(1L, m), error="must not be matrix or array") +test(11.08, coerceAs(a, 1L), error="must not be matrix or array") +test(11.09, coerceAs(1L, a), error="must not be matrix or array") + +# nafill, setnafill for character, factor and other types #3992 +## logical +## character +## factor +## Date +## POSIXct +## IDate +## ITime +## nanotime + +# related to !is.integer(verbose) +test(99.1, data.table(a=1,b=2)[1,1, verbose=1], error="verbose must be logical or integer") +test(99.2, data.table(a=1,b=2)[1,1, verbose=1:2], error="verbose must be length 1 non-NA") +test(99.3, data.table(a=1,b=2)[1,1, verbose=NA], error="verbose must be length 1 non-NA") +options(datatable.verbose=1) +test(99.4, coerceAs(1, 2L), error="verbose option must be length 1 non-NA logical or integer") +options(datatable.verbose=FALSE) + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c5910f5c81..95b8ba4492 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14352,7 +14352,7 @@ test(2005.09, set(DT, 1L, "c", expression(x+2)), error="type 'expression' cannot test(2005.10, set(DT, 1L, "d", expression(x+2)), error="type 'expression' cannot be coerced to 'logical'") test(2005.11, set(DT, 1L, "e", expression(x+2)), error="type 'expression' cannot be coerced to 'double'") test(2005.12, set(DT, 1L, "f", expression(x+2)), error="type 'expression' cannot be coerced to 'complex'") -test(2005.30, 
DT[2:3,c:=c(TRUE,FALSE), verbose=TRUE]$c, as.raw(INT(7,1,0)), +test(2005.30, DT[2:3,c:=c(TRUE,FALSE), verbose=3L]$c, as.raw(INT(7,1,0)), ## note verbose=3L for more deeper verbose output due to memrecycle messages when it is being re-used internally #4491 output="Zero-copy coerce when assigning 'logical' to 'raw' column 3 named 'c'") test(2005.31, set(DT,1L,"c",NA)$c, as.raw(INT(0,1,0))) test(2005.32, set(DT,1:2,"c",INT(-1,255))$c, as.raw(INT(0,255,0)), @@ -14388,7 +14388,7 @@ if (test_bit64) { warning="-1.*integer64.*position 1 taken as 0 when assigning.*raw.*column 3 named 'c'") test(2005.66, DT[2:3, f:=as.integer64(c(NA,"2147483648"))]$f, as.complex(c(-42,NA,2147483648))) DT[,h:=LETTERS[1:3]] - test(2005.67, DT[2:3, h:=as.integer64(1:2)], error="To assign integer64 to a character column, please use as.character.") + test(2005.67, DT[2:3, h:=as.integer64(1:2)], error="To assign integer64 to.*type character, please use as.character.") } # rbindlist raw type, #2819 diff --git a/man/froll.Rd b/man/froll.Rd index 070d28696d..388c47c485 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -25,7 +25,7 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) \item{x}{ vector, list, data.frame or data.table of numeric or logical columns. } \item{n}{ integer vector, for adaptive rolling function also list of integer vectors, rolling window size. } - \item{fill}{ numeric, value to pad by. Defaults to \code{NA}. } + \item{fill}{ numeric or logical, value to pad by. Defaults to \code{NA}. } \item{algo}{ character, default \code{"fast"}. When set to \code{"exact"}, then slower algorithm is used. It suffers less from floating point rounding error, performs extra pass to adjust rounding error diff --git a/man/nafill.Rd b/man/nafill.Rd index f8afb1dcfa..480f6ae118 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -16,7 +16,7 @@ setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) \arguments{ \item{x}{ vector, list, data.frame or data.table of numeric columns. } \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. } - \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } + \item{fill}{ numeric or integer, value to be used to fill. } \item{nan}{ (numeric \code{x} only) Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? } \item{cols}{ numeric or character vector specifying columns to be updated. } } diff --git a/src/assign.c b/src/assign.c index 5c0b808707..27fbccbd0e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -696,6 +696,8 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (colname==NULL) error(_("Internal error: memrecycle has received NULL colname")); // # nocov *memrecycle_message = '\0'; + static char targetDesc[501]; // from 1.14.1 coerceAs reuses memrecycle for a target vector, PR#4491 + snprintf(targetDesc, 500, colnum==0 ? _("target vector") : _("column %d named '%s'"), colnum, colname); int protecti=0; const bool sourceIsFactor=isFactor(source), targetIsFactor=isFactor(target); const bool sourceIsI64=isReal(source) && Rinherits(source, char_integer64); @@ -717,7 +719,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel) { - error(_("Assigning factor numbers to column %d named '%s'. 
But %d is outside the level range [1,%d]"), colnum, colname, val, nlevel); + error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc, val, nlevel); } } } else { @@ -725,7 +727,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel)) { - error(_("Assigning factor numbers to column %d named '%s'. But %f is outside the level range [1,%d], or is not a whole number."), colnum, colname, val, nlevel); + error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc, val, nlevel); } } } @@ -817,27 +819,27 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } } } else if (isString(source) && !isString(target) && !isNewList(target)) { - warning(_("Coercing 'character' RHS to '%s' to match the type of the target column (column %d named '%s')."), - type2char(TYPEOF(target)), colnum, colname); + warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc); // this "Coercing ..." warning first to give context in case coerceVector warns 'NAs introduced by coercion' + // and also because 'character' to integer/double coercion is often a user mistake (e.g. wrong target column, or wrong + // variable on RHS) which they are more likely to appreciate than find inconvenient source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; } else if (isNewList(source) && !isNewList(target)) { if (targetIsI64) { - error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of the target column (column %d named '%s')."), colnum, colname); + error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc); // because R's coerceVector doesn't know about integer64 } // as in base R; e.g. let as.double(list(1,2,3)) work but not as.double(list(1,c(2,4),3)) // relied on by NNS, simstudy and table.express; tests 1294.* - warning(_("Coercing 'list' RHS to '%s' to match the type of the target column (column %d named '%s')."), - type2char(TYPEOF(target)), colnum, colname); + warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc); source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; } else if ((TYPEOF(target)!=TYPEOF(source) || targetIsI64!=sourceIsI64) && !isNewList(target)) { - if (GetVerbose()) { + if (GetVerbose()>=3) { // only take the (small) cost of GetVerbose() (search of options() list) when types don't match - Rprintf(_("Zero-copy coerce when assigning '%s' to '%s' column %d named '%s'.\n"), + Rprintf(_("Zero-copy coerce when assigning '%s' to '%s' %s.\n"), sourceIsI64 ? "integer64" : type2char(TYPEOF(source)), targetIsI64 ? "integer64" : type2char(TYPEOF(target)), - colnum, colname); + targetDesc); } // The following checks are up front here, otherwise we'd need them twice in the two branches // inside BODY that cater for 'where' or not. Maybe there's a way to merge the two macros in future. @@ -850,10 +852,9 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (COND) { \ const char *sType = sourceIsI64 ? "integer64" : type2char(TYPEOF(source)); \ const char *tType = targetIsI64 ? 
"integer64" : type2char(TYPEOF(target)); \ - int n = snprintf(memrecycle_message, MSGSIZE, \ - "%"FMT" (type '%s') at RHS position %d "TO" when assigning to type '%s'", val, sType, i+1, tType); \ - if (colnum>0 && n>0 && ndbl_v[0] = x[0]; if (nan_is_na) { + ans->dbl_v[0] = ISNAN(x[0]) ? fill : x[0]; for (uint_fast64_t i=1; idbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i]; } } else { + ans->dbl_v[0] = ISNA(x[0]) ? fill : x[0]; for (uint_fast64_t i=1; idbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]; } } } else if (type==2) { // nocb - ans->dbl_v[nx-1] = x[nx-1]; if (nan_is_na) { + ans->dbl_v[nx-1] = ISNAN(x[nx-1]) ? fill : x[nx-1]; for (int_fast64_t i=nx-2; i>=0; i--) { ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i+1] : x[i]; } } else { + ans->dbl_v[nx-1] = ISNA(x[nx-1]) ? fill : x[nx-1]; for (int_fast64_t i=nx-2; i>=0; i--) { ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]; } @@ -49,12 +51,12 @@ void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill ans->int_v[i] = x[i]==NA_INTEGER ? fill : x[i]; } } else if (type==1) { // locf - ans->int_v[0] = x[0]; + ans->int_v[0] = x[0]==NA_INTEGER ? fill : x[0]; for (uint_fast64_t i=1; iint_v[i] = x[i]==NA_INTEGER ? ans->int_v[i-1] : x[i]; } } else if (type==2) { // nocb - ans->int_v[nx-1] = x[nx-1]; + ans->int_v[nx-1] = x[nx-1]==NA_INTEGER ? fill : x[nx-1]; for (int_fast64_t i=nx-2; i>=0; i--) { ans->int_v[i] = x[i]==NA_INTEGER ? ans->int_v[i+1] : x[i]; } @@ -71,12 +73,12 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi ans->int64_v[i] = x[i]==NA_INTEGER64 ? fill : x[i]; } } else if (type==1) { // locf - ans->int64_v[0] = x[0]; + ans->int64_v[0] = x[0]==NA_INTEGER64 ? fill : x[0]; for (uint_fast64_t i=1; iint64_v[i] = x[i]==NA_INTEGER64 ? ans->int64_v[i-1] : x[i]; } } else if (type==2) { // nocb - ans->int64_v[nx-1] = x[nx-1]; + ans->int64_v[nx-1] = x[nx-1]==NA_INTEGER64 ? fill : x[nx-1]; for (int_fast64_t i=nx-2; i>=0; i--) { ans->int64_v[i] = x[i]==NA_INTEGER64 ? ans->int64_v[i+1] : x[i]; } @@ -92,25 +94,34 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S if (!xlength(obj)) return(obj); + double tic=0.0; + if (verbose) + tic = omp_get_wtime(); + bool binplace = LOGICAL(inplace)[0]; + if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) + error("nan_is_na must be TRUE or FALSE"); // # nocov + bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; + SEXP x = R_NilValue; - if (isVectorAtomic(obj)) { + bool obj_scalar = isVectorAtomic(obj); + if (obj_scalar) { if (binplace) error(_("'x' argument is atomic vector, in-place update is supported only for list/data.table")); else if (!isReal(obj) && !isInteger(obj)) error(_("'x' argument must be numeric type, or list/data.table of numeric types")); - x = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list - SET_VECTOR_ELT(x, 0, obj); - } else { - SEXP ricols = PROTECT(colnamesInt(obj, cols, ScalarLogical(TRUE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) - x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++; - int *icols = INTEGER(ricols); - for (int i=0; i1) num_threads(getDTthreads(nx, true)) for (R_len_t i=0; iINT32_MAX || rfill<=INT32_MIN) ? NA_INTEGER : (int32_t)rfill; - dfill[0] = (double)rfill; - i64fill[0] = rfill; - } - } else { - double rfill = REAL(fill)[0]; - if (ISNAN(rfill)) { - // NA -> NA, NaN -> NaN - ifill[0] = NA_INTEGER; dfill[0] = rfill; i64fill[0] = NA_INTEGER64; - } else { - ifill[0] = (!R_FINITE(rfill) || rfill>INT32_MAX || rfill<=INT32_MIN) ? 
NA_INTEGER : (int32_t)rfill; - dfill[0] = rfill; - i64fill[0] = (!R_FINITE(rfill) || rfill>(double)INT64_MAX || rfill<=(double)INT64_MIN) ? NA_INTEGER64 : (int64_t)rfill; - } - } - } else if (isLogical(fill) && LOGICAL(fill)[0]==NA_LOGICAL) { - ifill[0] = NA_INTEGER; dfill[0] = NA_REAL; i64fill[0] = NA_INTEGER64; - } else { - error(_("%s: fill argument must be numeric"), __func__); - } -} -SEXP coerceFillR(SEXP fill) { - int protecti=0; - double dfill=NA_REAL; - int32_t ifill=NA_INTEGER; - int64_t i64fill=NA_INTEGER64; - coerceFill(fill, &dfill, &ifill, &i64fill); - SEXP ans = PROTECT(allocVector(VECSXP, 3)); protecti++; - SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 1)); - SET_VECTOR_ELT(ans, 1, allocVector(REALSXP, 1)); - SET_VECTOR_ELT(ans, 2, allocVector(REALSXP, 1)); - INTEGER(VECTOR_ELT(ans, 0))[0] = ifill; - REAL(VECTOR_ELT(ans, 1))[0] = dfill; - ((int64_t *)REAL(VECTOR_ELT(ans, 2)))[0] = i64fill; - setAttrib(VECTOR_ELT(ans, 2), R_ClassSymbol, ScalarString(char_integer64)); - UNPROTECT(protecti); - return ans; -} - inline bool INHERITS(SEXP x, SEXP char_) { // Thread safe inherits() by pre-calling install() in init.c and then // passing those char_* in here for simple and fast non-API pointer compare. @@ -374,6 +319,64 @@ SEXP coerceUtf8IfNeeded(SEXP x) { return(ans); } +// class1 is used by coerseAs only, which is used by frollR.c and nafill.c only +const char *class1(SEXP x) { + SEXP cl = getAttrib(x, R_ClassSymbol); + if (length(cl)) + return(CHAR(STRING_ELT(cl, 0))); + SEXP d = getAttrib(x, R_DimSymbol); + int nd = length(d); + if (nd) { + if (nd==2) + return "matrix"; + else + return "array"; + } + SEXPTYPE t = TYPEOF(x); + // see TypeTable in src/main/utils.c to compare to the differences here vs type2char + switch(t) { + case CLOSXP: case SPECIALSXP: case BUILTINSXP: + return "function"; + case REALSXP: + return "numeric"; + case SYMSXP: + return "name"; + case LANGSXP: + return "call"; + default: + return type2char(t); + } +} + +// main motivation for this function is to have coercion helper that is aware of int64 NAs, unline base R coerce #3913 +SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { + // copyArg does not update in place, but only IF an object is of the same type-class as class to be coerced, it will return with no copy + if (!isVectorAtomic(x)) + error("'x' is not atomic"); + if (!isVectorAtomic(as)) + error("'as' is not atomic"); + if (!isNull(getAttrib(x, R_DimSymbol))) + error("'x' must not be matrix or array"); + if (!isNull(getAttrib(as, R_DimSymbol))) + error("'as' must not be matrix or array"); + bool verbose = GetVerbose()>=2; // verbose level 2 required + if (!LOGICAL(copyArg)[0] && TYPEOF(x)==TYPEOF(as) && class1(x)==class1(as)) { + if (verbose) + Rprintf("copy=false and input already of expected type and class %s[%s]\n", type2char(TYPEOF(x)), class1(x)); + copyMostAttrib(as, x); // so attrs like factor levels are same for copy=T|F + return(x); + } + int len = LENGTH(x); + SEXP ans = PROTECT(allocNAVectorLike(as, len)); + if (verbose) + Rprintf("Coercing %s[%s] into %s[%s]\n", type2char(TYPEOF(x)), class1(x), type2char(TYPEOF(as)), class1(as)); + const char *ret = memrecycle(/*target=*/ans, /*where=*/R_NilValue, /*start=*/0, /*len=*/LENGTH(x), /*source=*/x, /*sourceStart=*/0, /*sourceLen=*/-1, /*colnum=*/0, /*colname=*/""); + if (ret) + warning(_("%s"), ret); + UNPROTECT(1); + return ans; +} + #ifndef NOZLIB #include #endif @@ -386,4 +389,3 @@ SEXP dt_zlib_version() { #endif return ScalarString(mkChar(out)); } - From fbbf8c43cadd32c8db4ae8035029c251f37c18d0 
Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 7 Mar 2021 17:19:26 -0700 Subject: [PATCH 170/588] NEWS-only: publish date in 1.14.0 heading --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0ec4b3d736..d25f3bfb41 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,7 @@ nafill(x, fill=as.integer(3.14)) # no warning; the as. conveys intent ``` -# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (submitted to CRAN on 20 Feb 2021) +# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) ## POTENTIALLY BREAKING CHANGES From 788c585586798107f9eeb3263928f718ca1db81e Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Mon, 8 Mar 2021 13:47:54 -0600 Subject: [PATCH 171/588] Clarify status of reshape2 (#4908) --- NEWS.md | 2 +- R/fcast.R | 4 ++-- R/fmelt.R | 4 ++-- man/dcast.data.table.Rd | 2 +- man/melt.data.table.Rd | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index d25f3bfb41..f6d2f971cb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -697,7 +697,7 @@ has a better chance of working on Mac. 7. Added a note to `?frank` clarifying that ranking is being done according to C sorting (i.e., like `forder`), [#2328](https://github.com/Rdatatable/data.table/issues/2328). Thanks to @cguill95 for the request. -8. Historically, `dcast` and `melt` were built as enhancements to `reshape2`'s own `dcast`/`melt`. We removed dependency on `reshape2` in v1.9.6 but maintained some backward compatibility. As that package has been deprecated since December 2017, we will begin to formally complete the split from `reshape2` by removing some last vestiges. In particular we now warn when redirecting to `reshape2` methods and will later error before ultimately completing the split; see [#3549](https://github.com/Rdatatable/data.table/issues/3549) and [#3633](https://github.com/Rdatatable/data.table/issues/3633). We thank the `reshape2` authors for their original inspiration for these functions, and @ProfFancyPants for testing and reporting regressions in dev which have been fixed before release. +8. Historically, `dcast` and `melt` were built as enhancements to `reshape2`'s own `dcast`/`melt`. We removed dependency on `reshape2` in v1.9.6 but maintained some backward compatibility. As that package has been superseded since December 2017, we will begin to formally complete the split from `reshape2` by removing some last vestiges. In particular we now warn when redirecting to `reshape2` methods and will later error before ultimately completing the split; see [#3549](https://github.com/Rdatatable/data.table/issues/3549) and [#3633](https://github.com/Rdatatable/data.table/issues/3633). We thank the `reshape2` authors for their original inspiration for these functions, and @ProfFancyPants for testing and reporting regressions in dev which have been fixed before release. 9. `DT[col]` where `col` is a column containing row numbers of itself to select, now suggests the correct syntax (`DT[(col)]` or `DT[DT$col]`), [#697](https://github.com/Rdatatable/data.table/issues/697). This expands the message introduced in [#1884](https://github.com/Rdatatable/data.table/issues/1884) for the case where `col` is type `logical` and `DT[col==TRUE]` is suggested. 
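A minimal R sketch of the suggested syntax in item 9 above, assuming a hypothetical data.table whose integer column `col` holds row numbers of the table itself (bare `DT[col]` now points users to these forms):

```r
library(data.table)
DT = data.table(col = c(2L, 1L, 3L), val = c("a", "b", "c"))
DT[(col)]   # parentheses make the column's values act as row numbers: rows 2, 1, 3
DT[DT$col]  # equivalent, passing the vector explicitly
```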
diff --git a/R/fcast.R b/R/fcast.R index db7a4b94b5..dbde95846a 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -17,8 +17,8 @@ dcast <- function( else { data_name = deparse(substitute(data)) ns = tryCatch(getNamespace("reshape2"), error=function(e) - stop("The dcast generic in data.table has been passed a ",class(data)[1L],", but data.table::dcast currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(", data_name, ") or as.data.table(", data_name, "). If you intend to use a reshape2::dcast, try installing that package first, but do note that reshape2 is deprecated and you should be migrating your code away from using it.")) - warning("The dcast generic in data.table has been passed a ", class(data)[1L], " and will attempt to redirect to the reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(", data_name, "). In the next version, this warning will become an error.") + stop("The dcast generic in data.table has been passed a ",class(data)[1L],", but data.table::dcast currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(", data_name, ") or as.data.table(", data_name, "). If you intend to use a reshape2::dcast, try installing that package first, but do note that reshape2 is superseded and is no longer actively developed.")) + warning("The dcast generic in data.table has been passed a ", class(data)[1L], " and will attempt to redirect to the reshape2::dcast; please note that reshape2 is superseded and is no longer actively developed, and this redirection is now deprecated. Please do this redirection yourself like reshape2::dcast(", data_name, "). In the next version, this warning will become an error.") ns$dcast(data, formula, fun.aggregate = fun.aggregate, ..., margins = margins, subset = subset, fill = fill, value.var = value.var) } diff --git a/R/fmelt.R b/R/fmelt.R index 12dd9fa5ac..3594fce8ca 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -12,8 +12,8 @@ melt <- function(data, ..., na.rm = FALSE, value.name = "value") { } else { data_name = deparse(substitute(data)) ns = tryCatch(getNamespace("reshape2"), error=function(e) - stop("The melt generic in data.table has been passed a ",class(data)[1L],", but data.table::melt currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(", data_name, ") or as.data.table(", data_name, "). If you intend to use a method from reshape2, try installing that package first, but do note that reshape2 is deprecated and you should be migrating your code away from using it.")) - warning("The melt generic in data.table has been passed a ", class(data)[1L], " and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(", data_name, "). In the next version, this warning will become an error.") + stop("The melt generic in data.table has been passed a ",class(data)[1L],", but data.table::melt currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(", data_name, ") or as.data.table(", data_name, "). 
If you intend to use a method from reshape2, try installing that package first, but do note that reshape2 is superseded and is no longer actively developed.")) + warning("The melt generic in data.table has been passed a ", class(data)[1L], " and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is superseded and is no longer actively developed, and this redirection is now deprecated. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(", data_name, "). In the next version, this warning will become an error.") ns$melt(data, ..., na.rm=na.rm, value.name=value.name) } # nocov end diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index 20f371a397..daf9fba655 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -51,7 +51,7 @@ From \code{v1.9.4}, \code{dcast} tries to preserve attributes wherever possible. From \code{v1.9.6}, it is possible to cast multiple \code{value.var} columns and also cast by providing multiple \code{fun.aggregate} functions. Multiple \code{fun.aggregate} functions should be provided as a \code{list}, for e.g., \code{list(mean, sum, function(x) paste(x, collapse="")}. \code{value.var} can be either a character vector or list of length one, or a list of length equal to \code{length(fun.aggregate)}. When \code{value.var} is a character vector or a list of length one, each function mentioned under \code{fun.aggregate} is applied to every column specified under \code{value.var} column. When \code{value.var} is a list of length equal to \code{length(fun.aggregate)} each element of \code{fun.aggregate} is applied to each element of \code{value.var} column. -Historical note: \code{dcast.data.table} was originally designed as an enhancement to \code{reshape2::dcast} in terms of computing and memory efficiency. \code{reshape2} has since been deprecated, and \code{dcast} has had a generic defined within \code{data.table} since \code{v1.9.6} in 2015, at which point the dependency between the packages became more etymological than programmatic. We thank the \code{reshape2} authors for the inspiration. +Historical note: \code{dcast.data.table} was originally designed as an enhancement to \code{reshape2::dcast} in terms of computing and memory efficiency. \code{reshape2} has since been superseded in favour of \code{tidyr}, and \code{dcast} has had a generic defined within \code{data.table} since \code{v1.9.6} in 2015, at which point the dependency between the packages became more etymological than programmatic. We thank the \code{reshape2} authors for the inspiration. } \value{ diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index a9d69b5f66..e56a10e4e1 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -75,7 +75,7 @@ be coerced to \code{character} type. To get a \code{factor} column, set \code{value.factor = TRUE}. \code{melt.data.table} also preserves \code{ordered} factors. -Historical note: \code{melt.data.table} was originally designed as an enhancement to \code{reshape2::melt} in terms of computing and memory efficiency. \code{reshape2} has since been deprecated, and \code{melt} has had a generic defined within \code{data.table} since \code{v1.9.6} in 2015, at which point the dependency between the packages became more etymological than programmatic. We thank the \code{reshape2} authors for the inspiration. 
+Historical note: \code{melt.data.table} was originally designed as an enhancement to \code{reshape2::melt} in terms of computing and memory efficiency. \code{reshape2} has since been superseded in favour of \code{tidyr}, and \code{melt} has had a generic defined within \code{data.table} since \code{v1.9.6} in 2015, at which point the dependency between the packages became more etymological than programmatic. We thank the \code{reshape2} authors for the inspiration. } From ec1259af1bf13fc0c96a1d3f9e84d55d8106a9a4 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 16 Mar 2021 00:50:18 +0200 Subject: [PATCH 172/588] mean na.rm=TRUE uses GForce (#4851) --- NEWS.md | 2 + R/data.table.R | 2 +- inst/tests/tests.Rraw | 10 ++ src/gsumm.c | 225 ++++++++++++++++++++++++------------------ 4 files changed, 142 insertions(+), 97 deletions(-) diff --git a/NEWS.md b/NEWS.md index f6d2f971cb..3db78f46c9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ 1. `nafill()` now applies `fill=` to the front/back of the vector when `type="locf|nocb"`, [#3594](https://github.com/Rdatatable/data.table/issues/3594). Thanks to @ben519 for the feature request. It also now returns a named object based on the input names. Note that if you are considering joining and then using `nafill(...,type='locf|nocb')` afterwards, please review `roll=`/`rollends=` which should achieve the same result in one step more efficiently. `nafill()` is for when filling-while-joining (i.e. `roll=`/`rollends=`/`nomatch=`) cannot be applied. +2. `mean(na.rm=TRUE)` by group is now GForce optimized, [#4849](https://github.com/Rdatatable/data.table/issues/4849). Thanks to the [h2oai/db-benchmark](https://github.com/h2oai/db-benchmark) project for spotting this issue. The 1 billion row example in the issue shows 48s reduced to 14s. The optimization also applies to type `integer64` resulting in a difference to the `bit64::mean.integer64` method: `data.table` returns a `double` result whereas `bit64` rounds the mean to the nearest integer. 
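A minimal R sketch of the GForce-optimized grouped mean described in item 2 above (column names are illustrative assumptions); with `verbose=TRUE`, data.table should report that GForce handled the `j` expression:

```r
library(data.table)
DT = data.table(g = rep(1:2, each = 3L), x = c(1, NA, 3, 4, 5, NA))
DT[, mean(x, na.rm = TRUE), by = g, verbose = TRUE]
# expected group means: g=1 -> (1+3)/2 = 2, g=2 -> (4+5)/2 = 4.5
```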
+ ## BUG FIXES ## NOTES diff --git a/R/data.table.R b/R/data.table.R index 961d9eb857..1334a612d0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2856,7 +2856,7 @@ ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the mo gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) -gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm, TRUE) # warnOverflow=TRUE, #986 +gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 95b8ba4492..8f32278523 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17263,3 +17263,13 @@ if (identical(x, enc2native(x))) { # fintersect now preserves order of first argument like intersect, #4716 test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$x, c("c", "a")) + +# mean na.rm=TRUE GForce, #4849 +d = data.table(a=1, b=list(1,2)) +test(2164.1, d[, mean(b), by=a], error="not supported by GForce mean") +if (test_bit64) { + d = data.table(a=INT(1,1,2,2), b=as.integer64(c(2,3,4,NA))) + test(2164.2, d[, mean(b), by=a], data.table(a=INT(1,2), V1=c(2.5, NA))) + test(2164.3, d[, mean(b, na.rm=TRUE), by=a], data.table(a=INT(1,2), V1=c(2.5, 4))) +} + diff --git a/src/gsumm.c b/src/gsumm.c index ed34e76207..9c31f4a761 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -337,11 +337,10 @@ void *gather(SEXP x, bool *anyNA) return gx; } -SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg) +SEXP gsum(SEXP x, SEXP narmArg) { if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); const bool narm = LOGICAL(narmArg)[0]; - const bool warnOverflow = LOGICAL(warnOverflowArg)[0]; if (inherits(x, "factor")) error(_("sum is not meaningful for factors.")); const int n = (irowslen == -1) ? length(x) : irowslen; double started = wallclock(); @@ -401,7 +400,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg) //Rprintf(_("gsum int took %.3f\n"), wallclock()-started); if (overflow) { UNPROTECT(1); // discard the result with overflow - if (warnOverflow) warning(_("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.")); + warning(_("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.")); ans = PROTECT(allocVector(REALSXP, ngrp)); double *restrict ansp = REAL(ans); memset(ansp, 0, ngrp*sizeof(double)); @@ -570,113 +569,147 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg) return(ans); } -SEXP gmean(SEXP x, SEXP narm) +SEXP gmean(SEXP x, SEXP narmArg) { - SEXP ans=R_NilValue; - //clock_t start = clock(); - if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); - if (!isVectorAtomic(x)) error(_("GForce mean can only be applied to columns, not .SD or similar. Likely you're looking for 'DT[,lapply(.SD,mean),by=,.SDcols=]'. 
See ?data.table.")); if (inherits(x, "factor")) error(_("mean is not meaningful for factors.")); - if (!LOGICAL(narm)[0]) { - int protecti=0; - ans = PROTECT(gsum(x, narm, /*#986, warnOverflow=*/ScalarLogical(FALSE))); protecti++; - switch(TYPEOF(ans)) { - case LGLSXP: case INTSXP: - ans = PROTECT(coerceVector(ans, REALSXP)); protecti++; - case REALSXP: { - double *xd = REAL(ans); - for (int i=0; ii /= grpsize[i]; - xd->r /= grpsize[i]; - xd++; - } - } break; - default : - error(_("Internal error: gsum returned type '%s'. typeof(x) is '%s'"), type2char(TYPEOF(ans)), type2char(TYPEOF(x))); // # nocov - } - UNPROTECT(protecti); - return(ans); - } - // na.rm=TRUE. Similar to gsum, but we need to count the non-NA as well for the divisor + if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + const bool narm = LOGICAL(narmArg)[0]; const int n = (irowslen == -1) ? length(x) : irowslen; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); - - long double *s = calloc(ngrp, sizeof(long double)), *si=NULL; // s = sum; si = sum imaginary just for complex - if (!s) error(_("Unable to allocate %d * %d bytes for sum in gmean na.rm=TRUE"), ngrp, sizeof(long double)); - - int *c = calloc(ngrp, sizeof(int)); - if (!c) error(_("Unable to allocate %d * %d bytes for counts in gmean na.rm=TRUE"), ngrp, sizeof(int)); - + double started = wallclock(); + const bool verbose=GetVerbose(); + if (verbose) Rprintf(_("This gmean took (narm=%s) ... "), narm?"TRUE":"FALSE"); // narm=TRUE only at this point + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmean"); + bool anyNA=false; + SEXP ans=R_NilValue; + int protecti=0; switch(TYPEOF(x)) { - case LGLSXP: case INTSXP: { - const int *xd = INTEGER(x); - for (int i=0; iDBL_MAX ? R_PosInf : (s[i] < -DBL_MAX ? R_NegInf : (double)s[i]); + const double *restrict gx = gather(x, &anyNA); + ans = PROTECT(allocVector(REALSXP, ngrp)); protecti++; + double *restrict ansp = REAL(ans); + memset(ansp, 0, ngrp*sizeof(double)); + if (!narm || !anyNA) { + #pragma omp parallel for num_threads(getDTthreads(highSize, false)) + for (int h=0; hDBL_MAX ? R_PosInf : (s[i] < -DBL_MAX ? R_NegInf : (double)s[i]); - ansd[i].i = si[i]>DBL_MAX ? R_PosInf : (si[i]< -DBL_MAX ? R_NegInf : (double)si[i]); + const Rcomplex *restrict gx = gather(x, &anyNA); + ans = PROTECT(allocVector(CPLXSXP, ngrp)); protecti++; + Rcomplex *restrict ansp = COMPLEX(ans); + memset(ansp, 0, ngrp*sizeof(Rcomplex)); + if (!narm || !anyNA) { + #pragma omp parallel for num_threads(getDTthreads(highSize, false)) + for (int h=0; h Date: Tue, 13 Apr 2021 12:19:06 -0400 Subject: [PATCH 173/588] by = .EACHI key fix 4603 (#4917) --- NEWS.md | 2 ++ R/data.table.R | 3 ++- inst/tests/tests.Rraw | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 3db78f46c9..3919e0ca4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ ## BUG FIXES +1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. + ## NOTES 1. 
New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/data.table.R b/R/data.table.R index 1334a612d0..d34026b562 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1385,7 +1385,8 @@ replace_dot_alias = function(e) { byval = i bynames = if (missing(on)) head(key(x),length(leftcols)) else names(on) allbyvars = NULL - bysameorder = haskey(i) || (is.sorted(f__) && ((roll == FALSE) || length(f__) == 1L)) # Fix for #1010 + bysameorder = (haskey(i) && identical(leftcols, chmatch(head(key(i),length(leftcols)), names(i)))) || # leftcols leading subset of key(i); see #4917 + (roll==FALSE && is.sorted(f__)) # roll==FALSE is fix for #1010 ## 'av' correct here ?? *** TO DO *** xjisvars = intersect(av, names_x[rightcols]) # no "x." for xvars. # if 'get' is in 'av' use all cols in 'i', fix for bug #34 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8f32278523..f80e7b0797 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17273,3 +17273,25 @@ if (test_bit64) { test(2164.3, d[, mean(b, na.rm=TRUE), by=a], data.table(a=INT(1,2), V1=c(2.5, 4))) } +# invalid key when by=.EACHI, haskey(i) but on= non-leading-subset of i's key, #4603 #4911 +X = data.table(id = c(6456372L, 6456372L, 6456372L, 6456372L,6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L), + id_round = c(197801L, 199405L, 199501L, 197901L, 197905L, 198001L, 198005L, 198101L, 198105L, 198201L, 198205L, 198301L, 198305L, 198401L), + field = c(NA, NA, NA, "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine"), + key = "id") +Y = data.table(id = c(6456372L, 6456345L, 6456356L), + id_round = c(197705L, 197905L, 201705L), + field = c("medicine", "teaching", "health"), + prio = c(6L, 1L, 10L), + key = c("id_round", "id", "prio", "field" )) +test(2165.1, X[Y, on = .(id, id_round > id_round, field), .(x.id_round[1], i.id_round[1]), by=.EACHI][id==6456372L], + data.table(id=6456372L, id_round=197705L, field='medicine', V1=197901L, V2=197705L)) +# Y$id_round happens to be sorted, so in 2165.2 we test Y$field which is not sorted +test(2165.2, X[Y, on="field", .(x.id_round[1]), by=.EACHI][field=="health"], + data.table(field="health", V1=NA_integer_)) +# a minimal example too ... 
+X = data.table(A=c(4L,2L,3L), B=1:3, key="A") +Y = data.table(A=2:1, B=2:3, key=c("B","A")) +test(2165.3, X[Y], data.table(A=2:3, B=2:3, i.A=2:1, key="A")) # keyed +test(2165.4, X[Y, on=.(A)], data.table(A=2:1, B=c(2L,NA), i.B=2:3)) # no key +test(2165.5, X[Y, on=.(A), x.B, by=.EACHI], data.table(A=2:1, x.B=c(2L,NA))) # no key + From 53e15854bb2c4d2faa7184a3c782cce40e9210f2 Mon Sep 17 00:00:00 2001 From: Bob Jansen Date: Thu, 15 Apr 2021 00:44:25 +0200 Subject: [PATCH 174/588] Fix typo in ?setDTthreads (#4938) --- man/openmp-utils.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index b8d014976e..f3f616a6e4 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -5,7 +5,7 @@ \alias{openmp} \title{ Set or get number of threads that data.table should use } \description{ - Set and get number of threads to be used in \code{data.table} functions that are parallelized with OpenMP. The number of threads is initialized when \code{data.table} is first loaded in the R session using optional envioronment variables. Thereafter, the number of threads may be changed by calling \code{setDTthreads}. If you change an environment variable using \code{Sys.setenv} you will need to call \code{setDTthreads} again to reread the environment variables. + Set and get number of threads to be used in \code{data.table} functions that are parallelized with OpenMP. The number of threads is initialized when \code{data.table} is first loaded in the R session using optional environment variables. Thereafter, the number of threads may be changed by calling \code{setDTthreads}. If you change an environment variable using \code{Sys.setenv} you will need to call \code{setDTthreads} again to reread the environment variables. } \usage{ setDTthreads(threads = NULL, restore_after_fork = NULL, percent = NULL, throttle = NULL) From 189be779dc90044d4c6c2965b674994cc80898fa Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 15 Apr 2021 01:13:22 +0200 Subject: [PATCH 175/588] C data.table docs and minor rename (#4753) --- NEWS.md | 3 +++ inst/include/datatableAPI.h | 5 ++++- man/cdt.Rd | 17 ++++++++++++----- src/init.c | 6 ++++-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3919e0ca4d..01e809f668 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,9 @@ nafill(x, fill=as.integer(3.14)) # no warning; the as. conveys intent ``` +2. `CsubsetDT` exported C function has been renamed to `DT_subsetDT`. This requires `R_GetCCallable("data.table", "CsubsetDT")` to be updated to `R_GetCCallable("data.table", "DT_subsetDT")`. Additionally there is now a dedicated header file for data.table C exports `include/datatableAPI.h`, [#4643](https://github.com/Rdatatable/data.table/issues/4643), thanks to @eddelbuettel, which makes it easier to _import_ data.table C functions. 
+ + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) ## POTENTIALLY BREAKING CHANGES diff --git a/inst/include/datatableAPI.h b/inst/include/datatableAPI.h index 44f52018f4..e2a1b2fd32 100644 --- a/inst/include/datatableAPI.h +++ b/inst/include/datatableAPI.h @@ -21,11 +21,14 @@ extern "C" { /* provided the interface for the function exported in ../src/init.c via R_RegisterCCallable() */ +// subsetDT #3751 inline SEXP attribute_hidden DT_subsetDT(SEXP x, SEXP rows, SEXP cols) { static SEXP(*fun)(SEXP, SEXP, SEXP) = - (SEXP(*)(SEXP,SEXP,SEXP)) R_GetCCallable("data.table", "CsubsetDT"); + (SEXP(*)(SEXP,SEXP,SEXP)) R_GetCCallable("data.table", "DT_subsetDT"); return fun(x,rows,cols); } +// forder #4015 +// setalloccol alloccolwrapper setDT #4439 /* permit opt-in to redefine shorter identifiers */ #if defined(DATATABLE_REMAP_API) diff --git a/man/cdt.Rd b/man/cdt.Rd index ea7c3a76eb..8c0846cac9 100644 --- a/man/cdt.Rd +++ b/man/cdt.Rd @@ -2,18 +2,25 @@ \alias{cdatatable} \title{ data.table exported C routines } \description{ - Note that this interface is going to be changed in next release. Some of internally used C routines are now exported. This interface should be considered experimental. List of exported C routines and their signatures are provided below in the usage section. } \usage{ -# SEXP subsetDT(SEXP x, SEXP rows, SEXP cols); -# p_dtCsubsetDT = R_GetCCallable("data.table", "CsubsetDT"); +# SEXP DT_subsetDT(SEXP x, SEXP rows, SEXP cols); +# p_DT_subsetDT = R_GetCCallable("data.table", "DT_subsetDT"); } \details{ - For details how to use those see \emph{Writing R Extensions} manual \emph{Linking to native routines in other packages} section. + Details how to use those can be found in \emph{Writing R Extensions} manual \emph{Linking to native routines in other packages} section. + An example use with \code{Rcpp}: +\preformatted{ + dt = data.table::as.data.table(iris) + Rcpp::cppFunction("SEXP mysub2(SEXP x, SEXP rows, SEXP cols) { return DT_subsetDT(x,rows,cols); }", + include="#include ", + depends="data.table") + mysub2(dt, 1:4, 1:4) +} } \note{ - Be aware C routines are likely to have less input validation than their corresponding R interface. For example one should not expect \code{DT[-5L]} will be equal to \code{.Call(CsubsetDT, DT, -5L, seq_along(DT))} because translation of \code{i=-5L} to \code{seq_len(nrow(DT))[-5L]} might be happening on R level. Moreover checks that \code{i} argument is in range of \code{1:nrow(DT)}, missingness, etc. might be happening on R level too. + Be aware C routines are likely to have less input validation than their corresponding R interface. For example one should not expect \code{DT[-5L]} will be equal to \code{.Call(DT_subsetDT, DT, -5L, seq_along(DT))} because translation of \code{i=-5L} to \code{seq_len(nrow(DT))[-5L]} might be happening on R level. Moreover checks that \code{i} argument is in range of \code{1:nrow(DT)}, missingness, etc. might be happening on R level too. 
} \references{ \url{https://cran.r-project.org/doc/manuals/r-release/R-exts.html} diff --git a/src/init.c b/src/init.c index b388598301..979cd7b050 100644 --- a/src/init.c +++ b/src/init.c @@ -245,8 +245,10 @@ static void setSizes() { void attribute_visible R_init_datatable(DllInfo *info) // relies on pkg/src/Makevars to mv data.table.so to datatable.so { - // C exported routines, see ?cdt for details - R_RegisterCCallable("data.table", "CsubsetDT", (DL_FUNC) &subsetDT); + // C exported routines + // must be also listed in inst/include/datatableAPI.h + // for end user documentation see ?cdt + R_RegisterCCallable("data.table", "DT_subsetDT", (DL_FUNC) &subsetDT); R_registerRoutines(info, NULL, callMethods, NULL, externalMethods); R_useDynamicSymbols(info, FALSE); From 8f425c69279eef8b743bb049c7c983e0b6f3543e Mon Sep 17 00:00:00 2001 From: Mark Derry <3118171+markderry@users.noreply.github.com> Date: Wed, 14 Apr 2021 17:27:26 -0600 Subject: [PATCH 176/588] Fread documentation (#4745) --- NEWS.md | 2 ++ man/fread.Rd | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 01e809f668..a822736f61 100644 --- a/NEWS.md +++ b/NEWS.md @@ -118,6 +118,8 @@ `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_.tar.gz` has a better chance of working on Mac. +5. In v1.12.4, we added support for fractional `stringsAsFactors` in `fread` -- for example, if `stringsAsFactors=.2`, any character column with fewer than 20% unique strings would be cast as `factor`. This is now reflected in `?fread` as well, [#4706](https://github.com/Rdatatable/data.table/issues/4706). Thanks to @markderry for the PR. + # data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) diff --git a/man/fread.Rd b/man/fread.Rd index 703eb70d3e..c7b7da8566 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -37,7 +37,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. \code{nrows=0} returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. } \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. } \item{na.strings}{ A character vector of strings which are to be interpreted as \code{NA} values. By default, \code{",,"} for columns of all types, including type \code{character} is read as \code{NA} for consistency. \code{,"",} is unambiguous and read as an empty string. To read \code{,NA,} as \code{NA}, set \code{na.strings="NA"}. To read \code{,,} as blank string \code{""}, set \code{na.strings=NULL}. When they occur in the file, the strings in \code{na.strings} should not appear quoted since that is how the string literal \code{,"NA",} is distinguished from \code{,NA,}, for example, when \code{na.strings="NA"}. } - \item{stringsAsFactors}{ Convert all character columns to factors? } + \item{stringsAsFactors}{ Convert all or some character columns to factors? 
Acceptable inputs are \code{TRUE}, \code{FALSE}, or a decimal value between 0.0 and 1.0. For \code{stringsAsFactors = FALSE}, all string columns are stored as \code{character} vs. all stored as \code{factor} when \code{TRUE}. When \code{stringsAsFactors = p} for \code{0 <= p <= 1}, string columns \code{col} are stored as \code{factor} if \code{uniqueN(col)/nrow < p}. + } \item{verbose}{ Be chatty and report timings? } \item{skip}{ If 0 (default) start on the first line and from there finds the first row with a consistent number of columns. This automatically avoids irregular header information before the column names row. \code{skip>0} means ignore the first \code{skip} rows manually. \code{skip="string"} searches for \code{"string"} in the file (e.g. a substring of the column names row) and starts on that line (inspired by read.xls in package gdata). } \item{select}{ A vector of column names or numbers to keep, drop the rest. \code{select} may specify types too in the same way as \code{colClasses}; i.e., a vector of \code{colname=type} pairs, or a \code{list} of \code{type=col(s)} pairs. In all forms of \code{select}, the order that the columns are specified determines the order of the columns in the result. } From 707ef0ca89f71af8a1c34b70d714e9d8faf9eff5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 14 Apr 2021 17:36:17 -0600 Subject: [PATCH 177/588] moved #4745 news item up. Faster to merge PR and make this NEWS-only edit to master afterwards than merge master to the fork and wait for CI just for a news item. --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index a822736f61..6ce883c568 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ 2. `CsubsetDT` exported C function has been renamed to `DT_subsetDT`. This requires `R_GetCCallable("data.table", "CsubsetDT")` to be updated to `R_GetCCallable("data.table", "DT_subsetDT")`. Additionally there is now a dedicated header file for data.table C exports `include/datatableAPI.h`, [#4643](https://github.com/Rdatatable/data.table/issues/4643), thanks to @eddelbuettel, which makes it easier to _import_ data.table C functions. +3. In v1.12.4, fractional `fread(..., stringsAsFactors=)` was added. For example if `stringsAsFactors=0.2`, any character column with fewer than 20% unique strings would be cast as `factor`. This is now documented in `?fread` as well, [#4706](https://github.com/Rdatatable/data.table/issues/4706). Thanks to @markderry for the PR. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) @@ -118,8 +120,6 @@ `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_.tar.gz` has a better chance of working on Mac. -5. In v1.12.4, we added support for fractional `stringsAsFactors` in `fread` -- for example, if `stringsAsFactors=.2`, any character column with fewer than 20% unique strings would be cast as `factor`. This is now reflected in `?fread` as well, [#4706](https://github.com/Rdatatable/data.table/issues/4706). Thanks to @markderry for the PR. 
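A short, self-contained illustration of the fractional rule documented above; the file and column names are made up for the sketch:

```R
library(data.table)
tmp = tempfile(fileext=".csv")   # hypothetical throwaway file
fwrite(data.table(id=paste0("id", 1:10), grp=rep(c("a","b"), 5L)), tmp)
DT = fread(tmp, stringsAsFactors=0.5)
lapply(DT, class)
# id stays character: uniqueN(id)/nrow = 1.0, which is not < 0.5
# grp becomes factor : uniqueN(grp)/nrow = 0.2, which is     < 0.5
unlink(tmp)
```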
- # data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) From d589fb86fae727d467dd41e12f3d8dcf18cde6cb Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 14 Apr 2021 18:22:35 -0600 Subject: [PATCH 178/588] added markdown as well as rmarkdown to Suggests (#4954) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 78ca52b485..7ba34218c3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,7 +64,7 @@ Authors@R: c( person("Ben","Schwen", role="ctb")) Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE From 4673f4862449075115369edae23e3c5580fe5b0b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 14 Apr 2021 18:04:34 -0700 Subject: [PATCH 179/588] add whitespace to distinguish non-equi operators in error msg (#4570) --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index d34026b562..bbc1cf5693 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3119,7 +3119,7 @@ isReallyReal = function(x) { } idx_op = match(operators, ops, nomatch=0L) if (any(idx_op %in% c(0L, 6L))) - stop("Invalid operators ", paste(operators[idx_op %in% c(0L, 6L)], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") + stop(gettextf("Invalid join operators %s. Only allowed operators are %s.", brackify(operators[idx_op %in% c(0L, 6L)]), brackify(ops[1:5]), domain="R-data.table"), domain=NA) ## the final on will contain the xCol as name, the iCol as value on = iCols names(on) = xCols diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f80e7b0797..dfeb47841f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13184,7 +13184,7 @@ test(1948.09, DT[i, on = eval(eval("id<=idi"))], DT[i, on = "id<=idi"]) test(1948.10, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.") test(1948.11, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.") test(1948.12, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. Each 'on' clause must contain one or two column names.") -test(1948.13, DT[i, on = "id != idi"], error = "Invalid operators !=. Only allowed operators are ==<=<>=>.") +test(1948.13, DT[i, on = "id != idi"], error = "Invalid join operators [!=]. 
Only allowed operators are [==, <=, <, >=, >].") test(1948.14, DT[i, on = 1L], error = "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") # helpful error when on= is provided but not i, rather than silently ignoring on= From 54082e40c3bf94feb41f1357d6f5b9917a5424c9 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 15 Apr 2021 04:26:00 +0200 Subject: [PATCH 180/588] improve error message for #4214 (#4343) --- src/forder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index 850cb457c2..464657b5d0 100644 --- a/src/forder.c +++ b/src/forder.c @@ -452,7 +452,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S if (by_i < 1 || by_i > length(DT)) STOP(_("internal error: 'by' value %d out of range [1,%d]"), by_i, length(DT)); // # nocov # R forderv already catch that using C colnamesInt if ( nrow != length(VECTOR_ELT(DT, by_i-1)) ) - STOP(_("Column %d is length %d which differs from length of column 1 (%d)\n"), INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow); + STOP(_("Column %d is length %d which differs from length of column 1 (%d), are you attempting to order by a list column?\n"), INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow); if (TYPEOF(VECTOR_ELT(DT, by_i-1)) == CPLXSXP) n_cplx++; } if (!isLogical(retGrpArg) || LENGTH(retGrpArg)!=1 || INTEGER(retGrpArg)[0]==NA_LOGICAL) From c2b55bf183c7f9485a0cc92b0e5e7e6fc0f1cbd0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 14 Apr 2021 22:54:00 -0700 Subject: [PATCH 181/588] better error message for missing j in cube (#4282) --- NEWS.md | 2 ++ R/groupingsets.R | 4 +++- inst/tests/tests.Rraw | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 6ce883c568..12c05d4dff 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,8 @@ 3. In v1.12.4, fractional `fread(..., stringsAsFactors=)` was added. For example if `stringsAsFactors=0.2`, any character column with fewer than 20% unique strings would be cast as `factor`. This is now documented in `?fread` as well, [#4706](https://github.com/Rdatatable/data.table/issues/4706). Thanks to @markderry for the PR. +4. `cube(DT, by="a")` now gives a more helpful error that `j` is missing, [#4282](https://github.com/Rdatatable/data.table/pull/4282). + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/groupingsets.R b/R/groupingsets.R index 6281615dd5..5c3ad02d4b 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -27,10 +27,12 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, ...) 
{ stop("Argument 'by' must be a character vector of column names used in grouping.") if (!is.logical(id)) stop("Argument 'id' must be a logical scalar.") + if (missing(j)) + stop("Argument 'j' is required") # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) - sets = lapply((2L^n):1L, function(j) by[keepBool[j, ]]) + sets = lapply((2L^n):1L, function(jj) by[keepBool[jj, ]]) # redirect to workhorse function jj = substitute(j) groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dfeb47841f..4535a6e048 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17295,3 +17295,6 @@ test(2165.3, X[Y], data.table(A=2:3, B=2:3, i.A=2:1, ke test(2165.4, X[Y, on=.(A)], data.table(A=2:1, B=c(2L,NA), i.B=2:3)) # no key test(2165.5, X[Y, on=.(A), x.B, by=.EACHI], data.table(A=2:1, x.B=c(2L,NA))) # no key +# missing j was caught in groupingsets but not cube, leading to unexpected error message, #4282 +DT = data.table(a=1) +test(2166, cube(DT, by='a'), error="Argument 'j' is required") From 19065fba64f9b48930d8e51573d01976cbfb25a6 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 14 Apr 2021 23:36:10 -0700 Subject: [PATCH 182/588] dt_width needs to be passed true nrow (#4268) --- NEWS.md | 2 ++ R/print.data.table.R | 16 +++++++++------- inst/tests/tests.Rraw | 18 +++++++++++------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 12c05d4dff..681568a701 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. +2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could display an extra `diff.prev` column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/R/print.data.table.R b/R/print.data.table.R index 31a009d5b4..d53855d79d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -59,13 +59,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), } return(invisible(x)) } - if ((topn*2L+1L)nrows || !topnmiss)) { + n_x = nrow(x) + if ((topn*2L+1L)nrows || !topnmiss)) { toprint = rbindlist(list(head(x, topn), tail(x, topn)), use.names=FALSE) # no need to match names because head and tail of same x, and #3306 - rn = c(seq_len(topn), seq.int(to=nrow(x), length.out=topn)) + rn = c(seq_len(topn), seq.int(to=n_x, length.out=topn)) printdots = TRUE } else { toprint = x - rn = seq_len(nrow(x)) + rn = seq_len(n_x) printdots = FALSE } toprint=format.data.table(toprint, na.encode=FALSE, timezone = timezone, ...) # na.encode=FALSE so that NA in character cols print as @@ -93,7 +94,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), if (quote) colnames(toprint) <- paste0('"', old <- colnames(toprint), '"') if (isTRUE(trunc.cols)) { # allow truncation of columns to print only what will fit in console PR #4074 - widths = dt_width(toprint, class, row.names, col.names) + widths = dt_width(toprint, n_x, class, row.names, col.names) cons_width = getOption("width") cols_to_print = widths < cons_width not_printed = colnames(toprint)[!cols_to_print] @@ -202,12 +203,13 @@ paste_dims = function(x) { # to calculate widths of data.table for PR #4074 # gets the width of the data.table at each column # and compares it to the console width -dt_width = function(x, class, row.names, col.names) { +# pass nrow because x is the head/tail only so nrow(x) is wrong, #4266 +dt_width = function(x, nrow, class, row.names, col.names) { widths = apply(nchar(x, type='width'), 2L, max) if (class) widths = pmax(widths, 6L) - if (col.names != "none") names = sapply(colnames(x), nchar, type = "width") else names = 0L + if (col.names != "none") names = sapply(colnames(x), nchar, type="width") else names = 0L dt_widths = pmax(widths, names) - rownum_width = if (row.names) as.integer(ceiling(log10(nrow(x)))+2) else 0L + rownum_width = if (row.names) as.integer(ceiling(log10(nrow))+2) else 0L cumsum(dt_widths + 1L) + rownum_width } # keeps the dim and dimnames attributes diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4535a6e048..373e075dc1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16654,17 +16654,17 @@ if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) # careful to unset because TZ="" means UTC whereas unset TZ means local # trunc.cols in print.data.table, #4074 -old_width = options("width" = 40) +old_width = options("width" = 40L) # Single row printing (to check issue with losing attributes) DT = data.table(a = "aaaaaaaaaaaaa", b = "bbbbbbbbbbbbb", c = "ccccccccccccc", d = "ddddddddddddd") test(2125.01, - capture.output(print(DT, trunc.cols=TRUE))[3], + capture.output(print(DT, trunc.cols=TRUE))[3L], "2 variables not shown: [c, d]") # Printing with dots -DT = data.table(a = vector("integer", 102), +DT = data.table(a = vector("integer", 102L), b = "bbbbbbbbbbbbb", c = "ccccccccccccc", d = c("ddddddddddddd", "d")) @@ -16696,8 +16696,12 @@ test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)), " 0 bbbbbbbbbbbbb ccccccccccccc", " 0 bbbbbbbbbbbbb ccccccccccccc", "1 variable not shown: [d]" )) -test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14], - "1 variable not shown: [d ]") +# also testing #4266 -- getting width of row 
#s register right +# TODO: understand why 2 variables truncated here. a,b,c combined have width +# _exactly_ 40, but still wraps. If we set options(width=41) it won't truncate. +# seems to be an issue with print.default. +test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14L], + "2 variables not shown: [c , d ]") test(2125.05, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, row.names=FALSE))[c(1,14)], c(" a b c", "1 variable not shown: [d ]" )) @@ -16705,8 +16709,8 @@ test(2125.06, capture.output(print(DT, trunc.cols=TRUE, col.names="none"))[c(1,1 c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc", "1 variable not shown: [d]" )) test(2125.07, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, col.names="none"))[c(1,13)], - c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc", - "1 variable not shown: [d]" ), + c(" 1: 0 bbbbbbbbbbbbb", + "2 variables not shown: [c, d]" ), warning = "Column classes will be suppressed when col.names is 'none'") options("width" = 20) DT = data.table(a = vector("integer", 2), From 306f49c3a8318c1f92e676fe20c9033260b69ca4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 15 Apr 2021 00:06:12 -0700 Subject: [PATCH 183/588] ... -> \dots in fcase.Rd (#4452) --- man/fcase.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fcase.Rd b/man/fcase.Rd index 82e582ca43..dd3a119110 100644 --- a/man/fcase.Rd +++ b/man/fcase.Rd @@ -5,7 +5,7 @@ \code{fcase} is a fast implementation of SQL \code{CASE WHEN} statement for R. Conceptually, \code{fcase} is a nested version of \code{\link{fifelse}} (with smarter implementation than manual nesting). It is comparable to \code{dplyr::case_when} and supports \code{bit64}'s \code{integer64} and \code{nanotime} classes. } \usage{ - fcase(..., default=NA) + fcase(\dots, default=NA) } \arguments{ \item{...}{ A sequence consisting of logical condition (\code{when})-resulting value (\code{value}) \emph{pairs} in the following order \code{when1, value1, when2, value2, ..., whenN, valueN}. Logical conditions \code{when1, when2, ..., whenN} must all have the same length, type and attributes. Each \code{value} may either share length with \code{when} or be length 1. Please see Examples section for further details.} From 1eadb95ae1f2d5be5b1b6bf89d8db06450d4b9cc Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Fri, 16 Apr 2021 05:05:50 +0800 Subject: [PATCH 184/588] fwrite() UTF-8 csv file (#4785) --- NEWS.md | 2 ++ R/fwrite.R | 8 ++++++-- inst/tests/tests.Rraw | 17 +++++++++++++++++ man/fwrite.Rd | 4 +++- src/fwriteR.c | 17 +++++++++++++---- 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 681568a701..29e830fed6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ 2. `mean(na.rm=TRUE)` by group is now GForce optimized, [#4849](https://github.com/Rdatatable/data.table/issues/4849). Thanks to the [h2oai/db-benchmark](https://github.com/h2oai/db-benchmark) project for spotting this issue. The 1 billion row example in the issue shows 48s reduced to 14s. The optimization also applies to type `integer64` resulting in a difference to the `bit64::mean.integer64` method: `data.table` returns a `double` result whereas `bit64` rounds the mean to the nearest integer. +3. `fwrite()` now writes UTF-8 or native csv files by specifying the `encoding=` argument, [#1770](https://github.com/Rdatatable/data.table/pull/1770). Thanks to @shrektan for the request and the PR. + ## BUG FIXES 1. 
`by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/fwrite.R b/R/fwrite.R index 1971c0e4ea..1a71f5ab0f 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -11,8 +11,12 @@ fwrite = function(x, file="", append=FALSE, quote="auto", compress = c("auto", "none", "gzip"), yaml = FALSE, bom = FALSE, - verbose=getOption("datatable.verbose", FALSE)) { + verbose=getOption("datatable.verbose", FALSE), + encoding = "") { na = as.character(na[1L]) # fix for #1725 + if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { + stop("Argument 'encoding' must be '', 'UTF-8' or 'native'.") + } if (missing(qmethod)) qmethod = qmethod[1L] if (missing(compress)) compress = compress[1L] if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] } @@ -108,7 +112,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, yaml, verbose) + showProgress, is_gzip, bom, yaml, verbose, encoding) invisible() } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 373e075dc1..ef23279700 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17302,3 +17302,20 @@ test(2165.5, X[Y, on=.(A), x.B, by=.EACHI], data.table(A=2:1, x.B=c(2L,NA))) # missing j was caught in groupingsets but not cube, leading to unexpected error message, #4282 DT = data.table(a=1) test(2166, cube(DT, by='a'), error="Argument 'j' is required") + +# fwrite support encoding "native" and "UTF-8", #1770 +latin1 = "fa\xE7ile" +Encoding(latin1) = "latin1" +utf8 = iconv(latin1, "latin1", "UTF-8") +text = c(latin1, utf8, "aaaaaaaa") +dt = data.table(A = text, B = as.factor(text)) +dt2 = data.table(A = text, B = text) +csvfile = tempfile(fileext = ".csv") +fwrite(dt, csvfile, encoding = "UTF-8", bom = TRUE) +test(2167.1, fread(csvfile, encoding = "UTF-8"), dt2) +if (identical(text, enc2native(text))) { # ensure native encoding can represent latin1 strings + fwrite(dt, csvfile, encoding = "native") + test(2167.2, fread(csvfile), dt2) +} +test(2167.3, fwrite(dt, csvfile, encoding="nativ"), error="Argument 'encoding' must be") +unlink(csvfile) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index f784b6bc3b..48c7dd3085 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -19,7 +19,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", compress = c("auto", "none", "gzip"), yaml = FALSE, bom = FALSE, - verbose = getOption("datatable.verbose", FALSE)) + verbose = getOption("datatable.verbose", FALSE), + encoding = "") } \arguments{ \item{x}{Any \code{list} of same length vectors; e.g. \code{data.frame} and \code{data.table}. 
If \code{matrix}, it gets internally coerced to \code{data.table} preserving col names but not row names} @@ -59,6 +60,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. } \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} + \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writting raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } } \details{ \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://www.h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. diff --git a/src/fwriteR.c b/src/fwriteR.c index a1cba686b4..1d26aaf287 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -5,18 +5,23 @@ #define DATETIMEAS_EPOCH 2 #define DATETIMEAS_WRITECSV 3 +static bool utf8=false; +static bool native=false; +#define TO_UTF8(s) (utf8 && NEED2UTF8(s)) +#define TO_NATIVE(s) (native && (s)!=NA_STRING && !IS_ASCII(s)) +#define ENCODED_CHAR(s) (TO_UTF8(s) ? translateCharUTF8(s) : (TO_NATIVE(s) ? translateChar(s) : CHAR(s))) + static char sep2; // '\0' if there are no list columns. Otherwise, the within-column separator. static bool logical01=true; // should logicals be written as 0|1 or true|false. Needed by list column writer too in case a cell is a logical vector. static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv static const char *sep2start, *sep2end; // sep2 is in main fwrite.c so that writeString can quote other fields if sep2 is present in them // if there are no list columns, set sep2=='\0' - // Non-agnostic helpers ... const char *getString(SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c SEXP x = col[row]; - return x==NA_STRING ? NULL : CHAR(x); + return x==NA_STRING ? NULL : ENCODED_CHAR(x); } int getStringLen(SEXP *col, int64_t row) { @@ -45,7 +50,7 @@ int getMaxCategLen(SEXP col) { const char *getCategString(SEXP col, int64_t row) { // the only writer that needs to have the header of the SEXP column, to get to the levels int x = INTEGER(col)[row]; - return x==NA_INTEGER ? NULL : CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); + return x==NA_INTEGER ? NULL : ENCODED_CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } writer_fun_t funs[] = { @@ -164,10 +169,12 @@ SEXP fwriteR( SEXP is_gzip_Arg, SEXP bom_Arg, SEXP yaml_Arg, - SEXP verbose_Arg + SEXP verbose_Arg, + SEXP encoding_Arg ) { if (!isNewList(DF)) error(_("fwrite must be passed an object of type list; e.g. 
data.frame, data.table")); + fwriteMainArgs args = {0}; // {0} to quieten valgrind's uninitialized, #4639 args.is_gzip = LOGICAL(is_gzip_Arg)[0]; args.bom = LOGICAL(bom_Arg)[0]; @@ -224,6 +231,8 @@ SEXP fwriteR( dateTimeAs = INTEGER(dateTimeAs_Arg)[0]; logical01 = LOGICAL(logical01_Arg)[0]; args.scipen = INTEGER(scipen_Arg)[0]; + utf8 = !strcmp(CHAR(STRING_ELT(encoding_Arg, 0)), "UTF-8"); + native = !strcmp(CHAR(STRING_ELT(encoding_Arg, 0)), "native"); int firstListColumn = 0; for (int j=0; j Date: Thu, 15 Apr 2021 16:36:46 -0600 Subject: [PATCH 185/588] add check for trunc.cols (#4766) --- R/print.data.table.R | 2 ++ inst/tests/tests.Rraw | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/R/print.data.table.R b/R/print.data.table.R index d53855d79d..96f3e8060c 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -15,6 +15,8 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # trunc.cols - should only the columns be printed that can fit in the console? (FALSE) if (!col.names %chin% c("auto", "top", "none")) stop("Valid options for col.names are 'auto', 'top', and 'none'") + if (length(trunc.cols) != 1L || !is.logical(trunc.cols) || is.na(trunc.cols)) + stop("Valid options for trunc.cols are TRUE and FALSE") if (col.names == "none" && class) warning("Column classes will be suppressed when col.names is 'none'") if (!shouldPrint(x)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ef23279700..88465242c7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17319,3 +17319,10 @@ if (identical(text, enc2native(text))) { # ensure native encoding can represent } test(2167.3, fwrite(dt, csvfile, encoding="nativ"), error="Argument 'encoding' must be") unlink(csvfile) + +# check valid trunc.cols=, #4766 +DT = data.table(x = rnorm(10)) +test(2168.01, print(DT, trunc.cols = 5L), error=c("Valid options for trunc.cols are TRUE and FALSE")) +test(2168.02, print(DT, trunc.cols = NA), error=c("Valid options for trunc.cols are TRUE and FALSE")) +test(2168.03, print(DT, trunc.cols = "thing"), error=c("Valid options for trunc.cols are TRUE and FALSE")) +test(2168.04, print(DT, trunc.cols = c(TRUE, FALSE)), error=c("Valid options for trunc.cols are TRUE and FALSE")) From 374e20813f652b9f1729d26da1bad902979b06d6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 16 Apr 2021 05:36:28 +0200 Subject: [PATCH 186/588] fread nrows=0L fixed to work like nrows=0 (#4694) --- NEWS.md | 2 ++ R/fread.R | 3 ++- inst/tests/tests.Rraw | 19 +++++++++++-------- src/freadR.c | 10 ++++------ 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 29e830fed6..94856c2871 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ 2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could display an extra `diff.prev` column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the PR. +3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. 
Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/fread.R b/R/fread.R index 0da96fe0e4..c03e1299b4 100644 --- a/R/fread.R +++ b/R/fread.R @@ -25,7 +25,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml) ) stopifnot( isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0)) stopifnot( is.numeric(nrows), length(nrows)==1L ) - if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does + nrows=as.double(nrows) #4686 + if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA stopifnot(is.logical(header) && length(header)==1L) # TRUE, FALSE or NA stopifnot(is.numeric(nThread) && length(nThread)==1L) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 88465242c7..81612cceca 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13259,14 +13259,17 @@ test(1957.3, fread("A,B\na,b\nc,d\n", stringsAsFactors=TRUE, verbose=TRUE), data output="stringsAsFactors=TRUE converted 2 column(s): [A, B]") # misc. 
coverage tests in fread -test(1958.1, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be") -test(1958.2, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) -test(1958.3, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) -test(1958.4, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') -test(1958.5, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747 -test(1958.6, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical())) -test(1958.7, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 -test(1958.8, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) +test(1958.01, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be") +test(1958.02, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) +test(1958.03, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) +test(1958.04, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') +test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747 +test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical())) +test(1958.07, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 +test(1958.08, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) +# 4686 +test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical())) +test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=logical(), B=logical(), C=logical())) # Skip should work with all types of newlines #3006 eols = c("\n", "\r\n", "\r", "\n\r") diff --git a/src/freadR.c b/src/freadR.c index 29c75db720..bd93555f8e 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -122,12 +122,10 @@ SEXP freadR( else if (LOGICAL(headerArg)[0]==TRUE) args.header = true; args.nrowLimit = INT64_MAX; - // checked at R level - if (isReal(nrowLimitArg)) { - if (R_FINITE(REAL(nrowLimitArg)[0]) && REAL(nrowLimitArg)[0]>=0.0) args.nrowLimit = (int64_t)(REAL(nrowLimitArg)[0]); - } else { - if (INTEGER(nrowLimitArg)[0]>=1) args.nrowLimit = (int64_t)INTEGER(nrowLimitArg)[0]; - } + if (!isReal(nrowLimitArg) || length(nrowLimitArg)!=1) + error(_("Internal error: freadR nrows not a single real. R level catches this.")); // # nocov + if (R_FINITE(REAL(nrowLimitArg)[0]) && REAL(nrowLimitArg)[0]>=0.0) + args.nrowLimit = (int64_t)(REAL(nrowLimitArg)[0]); args.logical01 = LOGICAL(logical01Arg)[0]; { From b1d25cd85e59a5549d9f319e6f5c513771ee4d98 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 18 Apr 2021 21:22:14 -0700 Subject: [PATCH 187/588] fix frank(.SD) for ties.method="random" / na.last=NA (#4434) --- NEWS.md | 2 ++ R/frank.R | 7 ++++++- inst/tests/tests.Rraw | 9 +++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 94856c2871..1e165bd427 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,8 @@ 3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. 
returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR. +4. Passing `.SD` to `frankv()` with `ties.method='random'` or with `na.last=NA` failed with `.SD is locked`, [#4429](https://github.com/Rdatatable/data.table/issues/4429). Thanks @smarches for the report. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/frank.R b/R/frank.R index 763b8267e5..47e701c4cd 100644 --- a/R/frank.R +++ b/R/frank.R @@ -22,10 +22,13 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a if (!length(cols)) stop("x is a list, 'cols' can not be 0-length") } - x = .shallow(x, cols) # shallow copy even if list.. + # need to unlock for #4429 + x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list.. setDT(x) cols = seq_along(cols) if (is.na(na.last)) { + if ("..na_prefix.." %chin% names(x)) + stop("Input column '..na_prefix..' conflicts with data.table internal usage; please rename") set(x, j = "..na_prefix..", value = is_na(x, cols)) order = if (length(order) == 1L) c(1L, rep(order, length(cols))) else c(1L, order) cols = c(ncol(x), cols) @@ -39,6 +42,8 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a idx = NULL n = nrow(x) } + if ('..stats_runif..' %chin% names(x)) + stop("Input column '..stats_runif..' 
conflicts with data.table internal usage; please rename") set(x, idx, '..stats_runif..', stats::runif(n)) order = if (length(order) == 1L) c(rep(order, length(cols)), 1L) else c(order, 1L) cols = c(cols, ncol(x)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 81612cceca..24b513875b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17329,3 +17329,12 @@ test(2168.01, print(DT, trunc.cols = 5L), error=c("Valid options for trunc.cols test(2168.02, print(DT, trunc.cols = NA), error=c("Valid options for trunc.cols are TRUE and FALSE")) test(2168.03, print(DT, trunc.cols = "thing"), error=c("Valid options for trunc.cols are TRUE and FALSE")) test(2168.04, print(DT, trunc.cols = c(TRUE, FALSE)), error=c("Valid options for trunc.cols are TRUE and FALSE")) + +# shallow copy of .SD must be unlocked for frank using na.last=NA or ties.method='random', #4429 +DT = data.table(a=1:10) +test(2169.1, DT[ , frankv(.SD, ties.method='average', na.last=NA)], as.double(1:10)) +test(2169.2, DT[ , frankv(.SD, ties.method='random')], 1:10) +# coverage tests for some issues discovered on the way +DT[, c('..na_prefix..', '..stats_runif..') := 1L] +test(2169.3, DT[ , frankv(.SD, ties.method='average', na.last=NA)], error="Input column '..na_prefix..' conflicts") +test(2169.4, DT[ , frankv(.SD, ties.method='random')], error="Input column '..stats_runif..' conflicts") From 01a1e03b9f2fb768ebd1c8242f3da1ad82fab2b1 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 19 Apr 2021 07:31:48 +0200 Subject: [PATCH 188/588] which=NA for not optimized i, closes #4411 (#4430) --- NEWS.md | 2 ++ R/data.table.R | 5 +++++ inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 15 insertions(+) diff --git a/NEWS.md b/NEWS.md index 1e165bd427..372dfcde64 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,8 @@ 4. Passing `.SD` to `frankv()` with `ties.method='random'` or with `na.last=NA` failed with `.SD is locked`, [#4429](https://github.com/Rdatatable/data.table/issues/4429). Thanks @smarches for the report. +5. Filtering data.table using `which=NA` to return non-matching indices will now properly work for non-optimized subsetting as well, closes [#4411](https://github.com/Rdatatable/data.table/issues/4411). + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/R/data.table.R b/R/data.table.R index bbc1cf5693..e806850e20 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -553,6 +553,11 @@ replace_dot_alias = function(e) { # i is not a data.table if (!is.logical(i) && !is.numeric(i)) stop("i has evaluated to type ", typeof(i), ". Expecting logical, integer or double.") if (is.logical(i)) { + if (is.na(which)) { # #4411 i filter not optimized to join: DT[A > 1, which = NA] + ## we need this branch here, not below next to which=TRUE because irows=i=which(i) will filter out NAs: DT[A > 10, which = NA] will be incorrect + if (notjoin) stop("internal error: notjoin and which=NA (non-matches), huh? please provide reproducible example to issue tracker") # nocov + return(which(is.na(i) | !i)) + } if (length(i)==1L # to avoid unname copy when length(i)==nrow (normal case we don't want to slow down) && isTRUE(unname(i))) { irows=i=NULL } # unname() for #2152 - length 1 named logical vector. # NULL is efficient signal to avoid creating 1:nrow(x) but still return all rows, fixes #1249 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 24b513875b..9ae4864fe2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17338,3 +17338,11 @@ test(2169.2, DT[ , frankv(.SD, ties.method='random')], 1:10) DT[, c('..na_prefix..', '..stats_runif..') := 1L] test(2169.3, DT[ , frankv(.SD, ties.method='average', na.last=NA)], error="Input column '..na_prefix..' conflicts") test(2169.4, DT[ , frankv(.SD, ties.method='random')], error="Input column '..stats_runif..' conflicts") + +# which=NA inconsistent with ?data.table, #4411 +DT = data.table(A = c(NA, 3, 5, 0, 1, 2), B = c("foo", "foo", "foo", "bar", "bar", "bar")) +test(2170.1, DT[A > 1, which = NA], c(1L,4:5)) +test(2170.2, DT[A > -1, which = NA], 1L) +test(2170.3, DT[A > -1 | is.na(A), which = NA], integer()) +test(2170.4, DT[A > 10, which = NA], seq_len(nrow(DT))) +test(2170.5, DT[!(A > 1), which = NA], c(1:3,6L)) # matches DT[A <= 1, which = NA] From e595fb6949625ef807f0dc7fc0a8b5fb6cb55ab7 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 21 Apr 2021 18:42:55 +0200 Subject: [PATCH 189/588] workaround for #4960 (#4961) --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2f760c2782..9e4a52a06d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -262,6 +262,7 @@ test-rel-win: ## R-release on Windows, test and build binaries test-dev-win: ## R-devel on Windows <<: *test-win + allow_failure: true variables: R_VERSION: "$R_DEVEL_VERSION" before_script: From 7fbb471fe7d3e588c82bfa5648d77ad95d2b10b2 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 26 Apr 2021 22:17:07 +0200 Subject: [PATCH 190/588] r-devel bump to 4.2, install rmarkdown (#4966) --- .gitlab-ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9e4a52a06d..206502a56c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,7 +7,7 @@ variables: ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. 
R_REL_VERSION: "4.0" - R_DEVEL_VERSION: "4.1" + R_DEVEL_VERSION: "4.2" R_OLDREL_VERSION: "3.6" stages: @@ -61,7 +61,7 @@ build: ## build data.table sources as tar.gz archive image: registry.gitlab.com/jangorecki/dockerfiles/r-builder needs: ["mirror-packages"] before_script: - - Rscript -e 'install.packages("knitr", repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' + - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION script: @@ -262,7 +262,6 @@ test-rel-win: ## R-release on Windows, test and build binaries test-dev-win: ## R-devel on Windows <<: *test-win - allow_failure: true variables: R_VERSION: "$R_DEVEL_VERSION" before_script: From 18528fa27e2afc94d5a79159bc93540772c90789 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Tue, 27 Apr 2021 07:20:05 +0800 Subject: [PATCH 191/588] Should never recycle the zero-length vector (#4262) --- NEWS.md | 31 +++++++++++++++++++++++++++++++ R/as.data.table.R | 8 +++++--- inst/tests/tests.Rraw | 15 +++++++++++++-- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 372dfcde64..c9a8280c3f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,37 @@ 3. `fwrite()` now writes UTF-8 or native csv files by specifying the `encoding=` argument, [#1770](https://github.com/Rdatatable/data.table/pull/1770). Thanks to @shrektan for the request and the PR. +4. `data.table()` no longer fills empty vectors with `NA` with warning. Instead a 0-row `data.table` is returned, [#3727](https://github.com/Rdatatable/data.table/issues/3727). Since `data.table()` is used internally by `.()`, this brings the following examples in line with expectations in most cases. Thanks to @shrektan for the suggestion and PR. + + ```R + DT = data.table(A=1:3, B=letters[1:3]) + DT[A>3, .(ITEM='A>3', A, B)] # (1) + DT[A>3][, .(ITEM='A>3', A, B)] # (2) + # the above are now equivalent as expected and return: + Empty data.table (0 rows and 3 cols): ITEM,A,B + # Previously, (2) returned : + ITEM A B + + 1: A>3 NA + Warning messages: + 1: In as.data.table.list(jval, .named = NULL) : + Item 2 has 0 rows but longest item has 1; filled with NA + 2: In as.data.table.list(jval, .named = NULL) : + Item 3 has 0 rows but longest item has 1; filled with NA + ``` + + ```R + DT = data.table(A=1:3, B=letters[1:3], key="A") + DT[.(1:3, double()), B] + # new result : + character(0) + # old result : + [1] "a" "b" "c" + Warning message: + In as.data.table.list(i) : + Item 2 has 0 rows but longest item has 3; filled with NA + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
diff --git a/R/as.data.table.R b/R/as.data.table.R index 308a7b2ffe..47219206a2 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -129,6 +129,7 @@ as.data.table.list = function(x, eachncol = integer(n) missing.check.names = missing(check.names) origListNames = if (missing(.named)) names(x) else NULL # as.data.table called directly, not from inside data.table() which provides .named, #3854 + empty_atomic = FALSE for (i in seq_len(n)) { xi = x[[i]] if (is.null(xi)) next # eachncol already initialized to 0 by integer() above @@ -148,10 +149,13 @@ as.data.table.list = function(x, } eachnrow[i] = NROW(xi) # for a vector (including list() columns) returns the length eachncol[i] = NCOL(xi) # for a vector returns 1 + if (is.atomic(xi) && length(xi)==0L && !is.null(xi)) { + empty_atomic = TRUE # any empty atomic (not empty list()) should result in nrows=0L, #3727 + } } ncol = sum(eachncol) # hence removes NULL items silently (no error or warning), #842. if (ncol==0L) return(null.data.table()) - nrow = max(eachnrow) + nrow = if (empty_atomic) 0L else max(eachnrow) ans = vector("list",ncol) # always return a new VECSXP recycle = function(x, nrow) { if (length(x)==nrow) { @@ -173,8 +177,6 @@ as.data.table.list = function(x, if (is.null(xi)) { n_null = n_null+1L; next } if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") - if (eachnrow[i]==0L && nrow>0L && is.atomic(xi)) # is.atomic to ignore list() since list() is a common way to initialize; let's not insist on list(NULL) - warning("Item ", i, " has 0 rows but longest item has ", nrow, "; filled with NA") # the rep() in recycle() above creates the NA vector if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above prefix = if (!isFALSE(.named[i]) && isTRUE(nchar(names(x)[i])>0L)) paste0(names(x)[i],".") else "" # test 2058.12 for (j in seq_along(xi)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9ae4864fe2..ba7cb0579e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5839,7 +5839,7 @@ test(1380, DT[a==TRUE], DT[3:4]) # Fix #847, as.data.table.list and character(0) issue x <- data.table(a=character(0), b=character(0), c=numeric(0)) setkey(x, a, b) -test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 has 0 rows but longest item has 1; filled with NA") +test(1381, x[J("foo", character(0)), nomatch=0L], x) # Fix for #813 and #758 DT = data.table(x = 1:2) @@ -13754,7 +13754,7 @@ test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) ### if (novname[i]) vnames[[i]] = namesi ### but, on pause for now pending #3193 ### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L)) -test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_), warning="Item 2 has 0 rows but longest item has 5; filled with NA") +test(1967.35, data.table(1:5, integer(0L)), data.table(integer(0L), integer(0L))) # no longer NA-fill zero-length, PR#4262 test(1967.36, data.table(1:5, key = 5L), error = 'must be character') x = data.table(a = 1:5) @@ -17346,3 +17346,14 @@ test(2170.2, DT[A > -1, which = NA], 1L) test(2170.3, DT[A > -1 | is.na(A), which = NA], integer()) test(2170.4, DT[A > 10, which = NA], seq_len(nrow(DT))) test(2170.5, DT[!(A > 1), which = NA], c(1:3,6L)) # matches DT[A <= 1, which = NA] + +# data.table() zero-nrow result if any non-null & atomic element is length 0, #3727 +test(2171.1, data.table(A=double(), B=1:2), data.table(A=double(), 
B=integer())) +DT = data.table(CODE=c('a','b'), DATE=1:2, VALUE=c(1.3, 1.5), key=c('CODE','DATE')) +test(2171.2, DT[J(character(), 1), VALUE], double()) # because "J" is a wrapper of list() +test(2171.3, data.table(A=NULL, B=1.0), data.table(B=1.0)) # NULL is omited +test(2171.4, NROW(data.table(A=list(), B=1.0)), 1L) # empty list() regarded as `list(list())` which is length 1, and recycled +DT = data.table(A=1:3, B=letters[1:3]) +test(2171.5, ans <- DT[A>3, .(ITEM='A>3', A, B)], # now identical as expected + DT[A>3][, .(ITEM='A>3', A, B)]) +test(2171.6, ans, data.table(ITEM=character(), A=integer(), B=character())) # not just identical to each other, but correct too From 16df4238b6c02738703f46355d4dbb32e9d8dc39 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Tue, 27 Apr 2021 14:13:21 +0800 Subject: [PATCH 192/588] Should touch jval class only when it's a plain data.table object (#4354) --- NEWS.md | 9 +++++---- R/data.table.R | 4 +++- inst/tests/tests.Rraw | 32 +++++++++++++++++++------------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index c9a8280c3f..ef3bed1bf2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -55,6 +55,8 @@ 5. Filtering data.table using `which=NA` to return non-matching indices will now properly work for non-optimized subsetting as well, closes [#4411](https://github.com/Rdatatable/data.table/issues/4411). +6. When `j` returns an object whose class `"X"` inherits from `data.table`; i.e. class `c("X", "data.table", "data.frame")`, the derived class `"X"` is no longer incorrectly dropped from the class of the `data.table` returned, [#4324](https://github.com/Rdatatable/data.table/issues/4324). Thanks to @HJAllen for reporting and @shrektan for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -107,7 +109,7 @@ 2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. 
If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. ## NOTES @@ -152,7 +154,7 @@ 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. - + We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. 2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. 
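As an aside for readers unfamiliar with the symbol mentioned in that note: `.NGRP` is the total number of groups, alongside `.GRP` (the current group counter) and `.N` (rows in the current group). A minimal sketch, with made-up data:

```R
library(data.table)
DT = data.table(x = c("a","a","b"), v = 1:3)
DT[, .(grp = .GRP, ngrp = .NGRP, rows = .N), by = x]
# one row per group: grp is 1 then 2, ngrp is 2 for both groups, rows is 2 then 1
```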
@@ -171,7 +173,7 @@ has a better chance of working on Mac. 1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then by read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. - + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. 
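A hedged sketch of the `fread()` behaviour described in that item; the csv text and column names are invented for illustration:

```R
library(data.table)
csv = "id,when\n1,2015-06-01 11:00:00\n2,2015-06-02 12:30:00\n"
fread(csv)            # unmarked datetimes: read as character by default (unless the session TZ is UTC)
fread(csv, tz="UTC")  # unmarked datetimes parsed natively and quickly as POSIXct in UTC
fread("id,when\n1,2015-06-01T11:00:00Z\n")  # UTC-marked (trailing Z): POSIXct automatically
fread("id,day\n1,2015-06-01\n")             # plain dates: integer-backed IDate
```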
## NEW FEATURES @@ -1533,4 +1535,3 @@ When `j` is a symbol (as in the quanteda and xgboost examples above) it will con # data.table v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) - diff --git a/R/data.table.R b/R/data.table.R index e806850e20..cfd6b5b465 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1345,7 +1345,9 @@ replace_dot_alias = function(e) { } if (is.data.table(jval)) { - setattr(jval, 'class', class(x)) # fix for #64 + # should set the parent class only when jval is a plain data.table #4324 + if (identical(class(jval), c('data.table', 'data.frame'))) + setattr(jval, 'class', class(x)) # fix for #64 if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) setattr(jval, 'sorted', key(x)) if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ba7cb0579e..0e7c075106 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4195,7 +4195,7 @@ setNumericRounding(old_rounding) DT = data.table(id=INT(1,2,1), val1=3:1, val2=3:1, val3=list(2:3,4:6,7:10)) # 5380 test(1199.1, DT[, sum(.SD), by=id, .SDcols=2:3], data.table(id=1:2, V1=INT(8,4))) #875 made the .SD case work -test(1199.2, DT[, sum(.SD), by=id], error="data.*frame.*numeric") # this is R's error message so use flexible string pattern to insulate from minor changes in R, #4769 +test(1199.2, DT[, sum(.SD), by=id], error="data.*frame.*numeric") # this is R's error message so use flexible string pattern to insulate from minor changes in R, #4769 test(1199.3, DT[, sum(val3), by=id], error="Type 'list' not supported by GForce sum [(]gsum[)]. 
Either.*or turn off") # Selection of columns, copy column to maintain the same as R <= 3.0.2, in Rdevel, for now @@ -10442,7 +10442,7 @@ test(1728.12, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong if (test_longdouble) { #3258 old = options(datatable.verbose=FALSE) # capture.output() exact tests must not be polluted with verbosity - + test(1729.01, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))), output="V1,V2\n1,10") test(1729.02, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))), @@ -10522,8 +10522,8 @@ if (test_longdouble) { #3258 # 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 test(1729.12, typeof(DT[[1L]]), "double") test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) - - options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on + + options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on } if (test_bit64) { @@ -10846,7 +10846,7 @@ if (TZnotUTC) { # from v1.13.0 these tests work when running under non-UTC because they compare to as.POSIXct which reads these unmarked datetime in local # the new tests 2150.* cover more cases # from v1.14.0, the tz="" is needed - test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character"), tz=""), + test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character"), tz=""), data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae")) test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE, tz=""), ans<-data.table(c=as.POSIXct("2015-06-01 11:00:00"), d="a", e=1.5, f="M", g=9L, h=FALSE)) @@ -13268,7 +13268,7 @@ test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.ta test(1958.07, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 test(1958.08, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) # 4686 -test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical())) +test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical())) test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=logical(), B=logical(), C=logical())) # Skip should work with all types of newlines #3006 @@ -17150,7 +17150,7 @@ test(2153.2, DT[, .(list(.GRP)), by=x], data.table(x=1:2, V1=as.list(1:2))) test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L))) test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. 
address(.N) is not present in j's result - data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) + data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) # warning message segfault when no column names present, #4644 test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0), @@ -17168,7 +17168,7 @@ for (i in 0:4) test(2155+i/10, # dogroups.c eval(j) could create list columns containing altrep references to the specials, #4759 # thanks to revdep testing of 1.13.2 where package tstools revealed this via ts() creating ALTREP, #4758 -# the attr(value,"class")<-"newclass" lines mimics a line at the end of stats::ts(). When the +# the attr(value,"class")<-"newclass" lines mimics a line at the end of stats::ts(). When the # length(value)>=64, R creates an ALTREP REF wrapper. Which dogroups.c now catches. # Hence this test needs to be at least 128 rows, 2 groups of 64 each. DT = data.table(series=c("ts1","ts2"), value=rnorm(128)) @@ -17193,7 +17193,7 @@ test(2158.1, DT[, .(value = list(value)), index], DT = data.table(value=as.list(1:6), index=rep(1:2, each=3)) test(2158.2, DT[, by="index", list(value=list(value))], data.table(index=1:2, value=list(as.list(1:3), as.list(4:6)))) - + # type consistency of empty input to as.matrix.data.table, #4762 DT = data.table(x = 1) test(2159.01, typeof(as.matrix(DT)), "double") @@ -17282,17 +17282,17 @@ if (test_bit64) { # invalid key when by=.EACHI, haskey(i) but on= non-leading-subset of i's key, #4603 #4911 X = data.table(id = c(6456372L, 6456372L, 6456372L, 6456372L,6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L), - id_round = c(197801L, 199405L, 199501L, 197901L, 197905L, 198001L, 198005L, 198101L, 198105L, 198201L, 198205L, 198301L, 198305L, 198401L), + id_round = c(197801L, 199405L, 199501L, 197901L, 197905L, 198001L, 198005L, 198101L, 198105L, 198201L, 198205L, 198301L, 198305L, 198401L), field = c(NA, NA, NA, "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine"), key = "id") -Y = data.table(id = c(6456372L, 6456345L, 6456356L), +Y = data.table(id = c(6456372L, 6456345L, 6456356L), id_round = c(197705L, 197905L, 201705L), - field = c("medicine", "teaching", "health"), + field = c("medicine", "teaching", "health"), prio = c(6L, 1L, 10L), key = c("id_round", "id", "prio", "field" )) test(2165.1, X[Y, on = .(id, id_round > id_round, field), .(x.id_round[1], i.id_round[1]), by=.EACHI][id==6456372L], data.table(id=6456372L, id_round=197705L, field='medicine', V1=197901L, V2=197705L)) -# Y$id_round happens to be sorted, so in 2165.2 we test Y$field which is not sorted +# Y$id_round happens to be sorted, so in 2165.2 we test Y$field which is not sorted test(2165.2, X[Y, on="field", .(x.id_round[1]), by=.EACHI][field=="health"], data.table(field="health", V1=NA_integer_)) # a minimal example too ... 
@@ -17357,3 +17357,9 @@ DT = data.table(A=1:3, B=letters[1:3]) test(2171.5, ans <- DT[A>3, .(ITEM='A>3', A, B)], # now identical as expected DT[A>3][, .(ITEM='A>3', A, B)]) test(2171.6, ans, data.table(ITEM=character(), A=integer(), B=character())) # not just identical to each other, but correct too + +# don't remove 'newclass' from jval's result, #4324 +A = data.table(COL = 'dt') +class(A) = c('newclass', class(A)) +DT = data.table(LIST_COL = list(A, A)) +test(2172, class(DT[1, LIST_COL[[1]]]), class(A)) From 7d1cf043947d037fe057c83829df5227547d7c0c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 27 Apr 2021 12:29:50 -0700 Subject: [PATCH 193/588] when as.data.frame dispatches to list method, force input to list (#4529) --- NEWS.md | 2 ++ R/as.data.table.R | 3 ++- inst/tests/tests.Rraw | 10 ++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ef3bed1bf2..edfb3b8a4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -57,6 +57,8 @@ 6. When `j` returns an object whose class `"X"` inherits from `data.table`; i.e. class `c("X", "data.table", "data.frame")`, the derived class `"X"` is no longer incorrectly dropped from the class of the `data.table` returned, [#4324](https://github.com/Rdatatable/data.table/issues/4324). Thanks to @HJAllen for reporting and @shrektan for the PR. +7. `as.data.table()` failed with `.subset2(x, i, exact = exact): attempt to select less than one element in get1index` when passed an object inheriting from `data.table` with a different `[[` method, such as the class `dfidx` from the `dfidx` package, [#4526](https://github.com/Rdatatable/data.table/issues/4526). Thanks @RicoDiel for the report, and Michael Chirico for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/as.data.table.R b/R/as.data.table.R index 47219206a2..af02140bd1 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -221,7 +221,8 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) { } if (any(vapply_1i(x, function(xi) length(dim(xi))))) { # not is.atomic because is.atomic(matrix) is true # a data.frame with a column that is data.frame needs to be expanded; test 2013.4 - return(as.data.table.list(x, keep.rownames=keep.rownames, ...)) + # x may be a class with [[ method that behaves differently, so as.list first for default [[, #4526 + return(as.data.table.list(as.list(x), keep.rownames=keep.rownames, ...)) } ans = copy(x) # TO DO: change this deep copy to be shallow. 
setattr(ans, "row.names", .set_row_names(nrow(x))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0e7c075106..b230f5f2e8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17363,3 +17363,13 @@ A = data.table(COL = 'dt') class(A) = c('newclass', class(A)) DT = data.table(LIST_COL = list(A, A)) test(2172, class(DT[1, LIST_COL[[1]]]), class(A)) + +# as.data.table.list edits list elements, so must be sure x does not use some other `[[` method, #4526 +x = data.frame(a = 1:5) +x$b = matrix(6:15, ncol=2L) +class(x) = c('foo', 'data.frame') +`[[.foo` = function(x, i) { + if (any(sapply(x, inherits, 'data.table'))) stop('failure') + as.list(x)[[i]] +} +test(2173, as.data.table(x), data.table(a=1:5, b.V1=6:10, b.V2=11:15)) From 20e762186cd9a5038c1b5658103cbb9c15d1b8a2 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 27 Apr 2021 14:07:02 -0700 Subject: [PATCH 194/588] credit Michael for PR (#4959) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index edfb3b8a4d..f908539050 100644 --- a/NEWS.md +++ b/NEWS.md @@ -47,7 +47,7 @@ 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. -2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could display an extra `diff.prev` column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the PR. +2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could incorrectly display an extra column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the bug report and @MichaelChirico for the PR. 3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR. 
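An illustration of the `trunc.cols` printing referred to in item 2 above; the column names and values are made up:

```R
library(data.table)
DT = as.data.table(setNames(as.list(1:20), sprintf("a_rather_long_column_name_%02d", 1:20)))
print(DT, trunc.cols=TRUE)   # prints only the columns that fit the console width,
                             # followed by a note naming the variables not shown
options(datatable.print.trunc.cols=TRUE)  # the same behaviour as a session default
```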
From be1c9878a3c91254993b86b287a9d22dc29cb0ab Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 27 Apr 2021 23:03:01 -0600 Subject: [PATCH 195/588] .dev-only; added debian/ubuntu package for revdep dependency --- .dev/CRAN_Release.cmd | 1 + 1 file changed, 1 insertion(+) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index a2db3058b3..274b55a2dd 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -530,6 +530,7 @@ sudo apt-get -y install libquantlib0-dev # for RQuantLib sudo apt-get -y install cargo # for gifski, a suggest of nasoi sudo apt-get -y install libgit2-dev # for gert sudo apt-get -y install cmake # for symengine for RxODE +sudo apt-get -y install libxslt1-dev # for xslt sudo R CMD javareconf # ENDIF From 6b38f61d893b8fb7c6f25adfa57ce8eb2068abbf Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Thu, 29 Apr 2021 13:49:00 +0800 Subject: [PATCH 196/588] rbind() now works with DTs with zero-length ordered factors (#4803) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 4 ++++ src/rbindlist.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index f908539050..c925bac7ee 100644 --- a/NEWS.md +++ b/NEWS.md @@ -59,6 +59,8 @@ 7. `as.data.table()` failed with `.subset2(x, i, exact = exact): attempt to select less than one element in get1index` when passed an object inheriting from `data.table` with a different `[[` method, such as the class `dfidx` from the `dfidx` package, [#4526](https://github.com/Rdatatable/data.table/issues/4526). Thanks @RicoDiel for the report, and Michael Chirico for the PR. +8. `rbind()` and `rbindlist()` of length-0 ordered factors failed with `Internal error: savetl_init checks failed`, [#4795](https://github.com/Rdatatable/data.table/issues/4795) [#4823](https://github.com/Rdatatable/data.table/issues/4823). Thanks to @shrektan and @dbart79 for reporting, and @shrektan for fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b230f5f2e8..b7a444864c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17373,3 +17373,7 @@ class(x) = c('foo', 'data.frame') as.list(x)[[i]] } test(2173, as.data.table(x), data.table(a=1:5, b.V1=6:10, b.V2=11:15)) + +# rbind two length-0 ordered factors, #4795 +DT = data.table(A = ordered(character())) +test(2174, rbind(DT, DT), DT) diff --git a/src/rbindlist.c b/src/rbindlist.c index bb42502be6..5dab7fff51 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -273,7 +273,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) for(int j=0; j Date: Thu, 29 Apr 2021 15:16:42 +0800 Subject: [PATCH 197/588] speedup %like% for factors (#4750) --- NEWS.md | 2 ++ R/like.R | 5 ++++- inst/tests/tests.Rraw | 22 +++++++++++++--------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index c925bac7ee..9b179489ce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -43,6 +43,8 @@ Item 2 has 0 rows but longest item has 3; filled with NA ``` +5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor lengh 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/like.R b/R/like.R index c66678c643..dd2a8c5b59 100644 --- a/R/like.R +++ b/R/like.R @@ -3,7 +3,10 @@ # returns 'logical' so can be combined with other where clauses. 
like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { if (is.factor(vector)) { - as.integer(vector) %in% grep(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed) + # indexing by factors is equivalent to indexing by the numeric codes, see ?`[` #4748 + ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed)[vector] + ret[is.na(ret)] = FALSE + ret } else { # most usually character, but integer and numerics will be silently coerced by grepl grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b7a444864c..ff1647bea8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7295,17 +7295,21 @@ set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) xsub = substring(x, 1L, 1L) -test(1532.1, y %like% xsub[1L], grepl(xsub[1L], y)) -test(1532.2, y %like% xsub[2L], grepl(xsub[2L], y)) -test(1532.3, like(y, xsub[1L]), grepl(xsub[1L], y)) -test(1532.4, like(y, xsub[2L]), grepl(xsub[2L], y)) +test(1532.01, y %like% xsub[1L], grepl(xsub[1L], y)) +test(1532.02, y %like% xsub[2L], grepl(xsub[2L], y)) +test(1532.03, like(y, xsub[1L]), grepl(xsub[1L], y)) +test(1532.04, like(y, xsub[2L]), grepl(xsub[2L], y)) ## %ilike% and %flike% for #3333 x = c('HEY', 'hey', '()') -test(1532.5, like(x, 'hey', ignore.case = TRUE), c(TRUE, TRUE, FALSE)) -test(1532.6, like(x, '()'), c(TRUE, TRUE, TRUE)) -test(1532.7, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) -test(1532.8, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) -test(1532.9, x %flike% '()', c(FALSE, FALSE, TRUE)) +test(1532.05, like(x, 'hey', ignore.case = TRUE), c(TRUE, TRUE, FALSE)) +test(1532.06, like(x, '()'), c(TRUE, TRUE, TRUE)) +test(1532.07, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) +test(1532.08, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) +test(1532.09, x %flike% '()', c(FALSE, FALSE, TRUE)) +## %like% test for ordered factor with NA +x = c("A", "B", "C", NA_character_) +x = ordered(x, levels = rev(x)[-1L]) +test(1532.10, x %like% "A", c(TRUE, FALSE, FALSE, FALSE)) # coverage for setkey() to 100% dt1 = data.table(x=sample(5), y=1:5, key="y") From 0f942123fc2221f3166caca8c61ce6a2e7ecf36e Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Thu, 29 Apr 2021 17:07:25 -0400 Subject: [PATCH 198/588] Add row names to null data.table on assign (#4609) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 10 ++++++++++ src/assign.c | 8 ++++++++ 3 files changed, 20 insertions(+) diff --git a/NEWS.md b/NEWS.md index 9b179489ce..8cbe8b2f7b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -63,6 +63,8 @@ 8. `rbind()` and `rbindlist()` of length-0 ordered factors failed with `Internal error: savetl_init checks failed`, [#4795](https://github.com/Rdatatable/data.table/issues/4795) [#4823](https://github.com/Rdatatable/data.table/issues/4823). Thanks to @shrektan and @dbart79 for reporting, and @shrektan for fixing. +9. `data.table(NULL)[, firstCol:=1L]` created `data.table(firstCol=1L)` ok but did not update the internal `row.names` attribute, causing `Error in '$<-.data.frame'(x, name, value) : replacement has 1 row, data has 0` when passed to packages like `ggplot` which use `DT` as if it is a `data.frame`, [#4597](https://github.com/Rdatatable/data.table/issues/4597). Thanks to Matthew Son for reporting, and Cole Miller for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. 
Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ff1647bea8..bcdd90f6af 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17381,3 +17381,13 @@ test(2173, as.data.table(x), data.table(a=1:5, b.V1=6:10, b.V2=11:15)) # rbind two length-0 ordered factors, #4795 DT = data.table(A = ordered(character())) test(2174, rbind(DT, DT), DT) + +## set row.names when a null data.table has a column assigned for the first time, #4597 +DT = data.table() +test(2175.1, attr(DT[, x:=1:5], "row.names"), 1:5) +DT = data.table() +set(DT, j=c("v1","v2"), value=list(1:6, 2:7)) +test(2175.2, attr(DT, "row.names"), 1:6) +DT = data.table(x=integer()) +test(2175.3, DT[, y:=3L], data.table(x=integer(), y=integer())) # in keeping with recent #4262, view as recycling the length-1 3L to match the length-0 data + diff --git a/src/assign.c b/src/assign.c index 27fbccbd0e..e811276610 100644 --- a/src/assign.c +++ b/src/assign.c @@ -473,6 +473,14 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) for (i=0; i Date: Fri, 30 Apr 2021 06:15:01 +0200 Subject: [PATCH 199/588] froll partial doc, #4968 (#4971) --- man/froll.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/froll.Rd b/man/froll.Rd index 388c47c485..f1726d0723 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -113,7 +113,7 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) \item{ when \code{adaptive=TRUE}, then \code{n} must be vector of length equal to \code{nrow(x)}, or list of such vectors. } \item{ \code{partial} window feature is not supported, although it can - be accomplished by using \code{adaptive=TRUE}, see examples. } + be accomplished by using \code{adaptive=TRUE}, see examples. \code{NA} is always returned for incomplete windows. } } Be aware that rolling functions operates on the physical order of input. 
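The `adaptive=TRUE` workaround that the amended `froll.Rd` text points to can be sketched as follows; the input vector and window size are invented for illustration:

```R
library(data.table)
x = c(1, 3, 5, 7, 9)
n = 3L
frollmean(x, n)                                     # NA NA 3 5 7 -- NA until the window is complete
frollmean(x, pmin(seq_along(x), n), adaptive=TRUE)  # 1 2 3 5 7   -- "partial" means for the leading rows
```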
From f638d4d599a091a1ed096d2614613ffc03336855 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 5 May 2021 17:24:31 -0700 Subject: [PATCH 200/588] improve error message (#4937) --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index cfd6b5b465..cb0e844247 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1128,7 +1128,7 @@ replace_dot_alias = function(e) { if (is.list(k)) { origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) if (is.character(j)) { - if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length ", length(j)) + if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is ", length(j)) j = match(j, names(k)) if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov } From 89a14baac0e6fe1a87d956cd607282fa6d0ce6b5 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 6 May 2021 08:03:20 +0200 Subject: [PATCH 201/588] keyby=TRUE/FALSE together with by= (#4338) --- NEWS.md | 7 +++++++ R/data.table.R | 31 ++++++++++++++++++------------- inst/tests/tests.Rraw | 18 +++++++++++------- man/data.table.Rd | 10 ++++++---- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8cbe8b2f7b..0ef6485b28 100644 --- a/NEWS.md +++ b/NEWS.md @@ -45,6 +45,13 @@ 5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor lengh 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing. +6. `keyby=` now accepts `TRUE`/`FALSE` together with `by=`, [#4307](https://github.com/Rdatatable/data.table/issues/4307). The primary motivation is benchmarking where `by=` vs `keyby=` is varied across a set of queries. Thanks to Jan Gorecki for the request and the PR. + + ```R + DT[, sum(colB), keyby="colA"] + DT[, sum(colB), by="colA", keyby=TRUE] # same + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
diff --git a/R/data.table.R b/R/data.table.R index cb0e844247..81e30befd0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -149,17 +149,22 @@ replace_dot_alias = function(e) { } .global$print="" missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector - if (!missing(keyby)) { - if (!missing(by)) stop("Provide either by= or keyby= but not both") - if (missing(j)) { warning("Ignoring keyby= because j= is not supplied"); keyby=NULL; } - by=bysub=substitute(keyby) - keyby=TRUE - # Assign to 'by' so that by is no longer missing and we can proceed as if there were one by + if (missingby || missing(j)) { + if (!missingby) warning("Ignoring by/keyby because 'j' is not supplied") + by = bysub = NULL + keyby = FALSE } else { - if (!missing(by) && missing(j)) { warning("Ignoring by= because j= is not supplied"); by=NULL; } - by=bysub= if (missing(by)) NULL else substitute(by) - keyby=FALSE - } + if (missing(by)) { + by = bysub = substitute(keyby) + keyby = TRUE + } else { + by = bysub = substitute(by) + if (missing(keyby)) + keyby = FALSE + else if (!isTRUEorFALSE(keyby)) + stop("When by and keyby are both provided, keyby must be TRUE or FALSE") + } + } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" naturaljoin = FALSE @@ -2333,11 +2338,11 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR list(.ll.tech.split=list(.expr)), list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ? ) - by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L] - dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. + dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. .expr, - list(.expr = if(join) {as.name(".EACHI")} else if (flatten) by else .by) + list(.expr = if (join) as.name(".EACHI") else if (flatten) by else .by) ) + dtq[["keyby"]] = if (join) FALSE else sorted dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by) if (join) dtq[["on"]] = if (flatten) by else .by dtq = as.call(dtq) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bcdd90f6af..abcab25be7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1588,7 +1588,7 @@ test(534, names(transform(data.table('a b'=1), `c d`=`a b`)), c("a b","c d")) # Test keyby, new in v1.8.0 DT = data.table(a=INT(1,3,1,2,3,2),b=1:2,c=1:3,v=1:6) -test(535, DT[,sum(v),by=a, keyby=a], error="not both") +test(535, DT[,sum(v),by=a, keyby=a], error="When.*both.*keyby must be TRUE or FALSE") # updated after #4307 test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retains appearance order ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) @@ -13780,12 +13780,12 @@ test(1967.49, x[ , list(5) := 6], error = 'LHS of := must be a symbol') test(1967.50, x[ , 1 + 3i := 6], error = "LHS of := isn't column names") test(1967.511, x[ , .(5L), by = .EACHI, mult = 'all'], error='logical error. i is not data.table') test(1967.512, x[1+3i], error='i has evaluated to type complex. 
Expecting logical, integer or double') -test(1967.521, x[1:2, by=a], x[1:2,], warning="Ignoring by= because j= is not supplied") -test(1967.522, x[, by=a], x, warning=c("Ignoring by= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.523, x[by=a], x, warning=c("Ignoring by= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.524, x[1:2, keyby=a], x[1:2,], warning="Ignoring keyby= because j= is not supplied") -test(1967.525, x[, keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.526, x[keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.521, x[1:2, by=a], x[1:2,], warning="Ignoring by/keyby because 'j' is not supplied") +test(1967.522, x[, by=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.523, x[by=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.524, x[1:2, keyby=a], x[1:2,], warning="Ignoring by/keyby because 'j' is not supplied") +test(1967.525, x[, keyby=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.526, x[keyby=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) test(1967.53, as.matrix(x, rownames = 2:3), error='length(rownames)==2 but') test(1967.54, as.matrix(x[0L]), structure(integer(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b")))) @@ -17391,3 +17391,7 @@ test(2175.2, attr(DT, "row.names"), 1:6) DT = data.table(x=integer()) test(2175.3, DT[, y:=3L], data.table(x=integer(), y=integer())) # in keeping with recent #4262, view as recycling the length-1 3L to match the length-0 data +# `keyby`=TRUE/FALSE together with by=, #4307 +DT = data.table(a=2:1, b=3:2, d=4:3) +test(2176.1, DT[, .SD, by="a", keyby=FALSE], data.table(a=2:1,b=3:2,d=4:3)) +test(2176.2, DT[, .SD, by="a", keyby=TRUE], data.table(a=1:2,b=2:3,d=3:4, key="a")) diff --git a/man/data.table.Rd b/man/data.table.Rd index 59b6aae1e1..637bdce86f 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -110,7 +110,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \emph{Advanced:} In the \code{X[Y, j]} form of grouping, the \code{j} expression sees variables in \code{X} first, then \code{Y}. We call this \emph{join inherited scope}. If the variable is not in \code{X} or \code{Y} then the calling frame is searched, its calling frame, and so on in the usual way up to and including the global environment.} - \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use `keyby=` routinely when you wish the result to be sorted.} + \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use `keyby=` routinely when you wish the result to be sorted. May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.} \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. 
In case of overlapping variables names inside dataset and in parent scope you can use double dot prefix \code{..cols} to explicitly refer to `\code{cols} variable parent scope and not from your dataset. @@ -200,6 +200,7 @@ The way to read this out loud is: "Take \code{DT}, subset rows by \code{i}, \emp X[, sum(a), by=c:f] # get sum(a) grouped by all columns in between 'c' and 'f' (both inclusive) X[, sum(a), keyby=b] # get sum(a) grouped by 'b', and sort that result by the grouping column 'b' + X[, sum(a), by=b, keyby=TRUE] # same order as above, but using sorting flag X[, sum(a), by=b][order(b)] # same order as above, but by chaining compound expressions X[c>1, sum(a), by=c] # get rows where c>1 is TRUE, and on those rows, get sum(a) grouped by 'c' X[Y, .(a, b), on="c"] # get rows where Y$c == X$c, and select columns 'X$a' and 'X$b' for those rows @@ -220,11 +221,11 @@ See the \code{see also} section for the several other \emph{methods} that are av } \references{ -\url{https://github.com/Rdatatable/data.table/wiki} (\code{data.table} homepage)\cr +\url{https://r-datatable.com} (\code{data.table} homepage)\cr \url{https://en.wikipedia.org/wiki/Binary_search} } -\note{ If \code{keep.rownames} or \code{check.names} are supplied they must be written in full because \R does not allow partial argument names after `\code{\dots}`. For example, \code{data.table(DF, keep=TRUE)} will create a -column called \code{"keep"} containing \code{TRUE} and this is correct behaviour; \code{data.table(DF, keep.rownames=TRUE)} was intended. +\note{ If \code{keep.rownames} or \code{check.names} are supplied they must be written in full because \R does not allow partial argument names after \code{\dots}. For example, \code{data.table(DF, keep=TRUE)} will create a +column called \code{keep} containing \code{TRUE} and this is correct behaviour; \code{data.table(DF, keep.rownames=TRUE)} was intended. \code{POSIXlt} is not supported as a column type because it uses 40 bytes to store a single datetime. They are implicitly converted to \code{POSIXct} type with \emph{warning}. You may also be interested in \code{\link{IDateTime}} instead; it has methods to convert to and from \code{POSIXlt}. 
} @@ -280,6 +281,7 @@ DT[["v"]] # same as DT[, v] but much faster # grouping operations - j and by DT[, sum(v), by=x] # ad hoc by, order of groups preserved in result DT[, sum(v), keyby=x] # same, but order the result on by cols +DT[, sum(v), by=x, keyby=TRUE] # same, but using sorting flag DT[, sum(v), by=x][order(x)] # same but by chaining expressions together # fast ad hoc row subsets (subsets as joins) From 900711c7957b387e3d5a8b912619c281c69eebc7 Mon Sep 17 00:00:00 2001 From: Tony Fischetti Date: Thu, 6 May 2021 03:01:06 -0400 Subject: [PATCH 202/588] add `datatable.fwrite.sep` option (#4956) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/fwrite.R | 3 ++- inst/tests/tests.Rraw | 9 +++++++++ man/fwrite.Rd | 3 ++- 5 files changed, 17 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7ba34218c3..68af8d8857 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -61,7 +61,8 @@ Authors@R: c( person("Vaclav","Tlapak", role="ctb"), person("Kevin","Ushey", role="ctb"), person("Dirk","Eddelbuettel", role="ctb"), - person("Ben","Schwen", role="ctb")) + person("Ben","Schwen", role="ctb"), + person("Tony","Fischetti", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index 0ef6485b28..56fbffbd8e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -51,6 +51,8 @@ DT[, sum(colB), keyby="colA"] DT[, sum(colB), by="colA", keyby=TRUE] # same ``` + +7. `fwrite()` gains a new `datatable.fwrite.sep` option to change the default separator, still `","` by default. Thanks to Tony Fischetti for the PR. As is good practice in R in general, we usually resist new global options for the reason that a user changing the option for their own code can inadvertently change the behaviour of any package using `data.table` too. However, in this case, the global option affects file output rather than code behaviour. In fact, the very reason the user may wish to change the default separator is that they know a different separator is more appropriate for their data being passed to the package using `fwrite` but cannot otherwise change the `fwrite` call within that package. 
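Because the new default is `sep=getOption("datatable.fwrite.sep", ",")`, the option only changes the default; an explicit `sep=` in the call still takes precedence. A small sketch with made-up data:

```R
library(data.table)
DT = data.table(a=1, b=2)
options(datatable.fwrite.sep=";")
fwrite(DT)           # a;b / 1;2   -- picks up the option
fwrite(DT, sep="|")  # a|b / 1|2   -- an explicit argument overrides the option
options(datatable.fwrite.sep=NULL) # back to the shipped default ","
```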
## BUG FIXES diff --git a/R/fwrite.R b/R/fwrite.R index 1a71f5ab0f..e8bc0f3121 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -1,5 +1,6 @@ fwrite = function(x, file="", append=FALSE, quote="auto", - sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", + sep=getOption("datatable.fwrite.sep", ","), + sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", na="", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index abcab25be7..3d80096054 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17395,3 +17395,12 @@ test(2175.3, DT[, y:=3L], data.table(x=integer(), y=integer())) # in keeping wit DT = data.table(a=2:1, b=3:2, d=4:3) test(2176.1, DT[, .SD, by="a", keyby=FALSE], data.table(a=2:1,b=3:2,d=4:3)) test(2176.2, DT[, .SD, by="a", keyby=TRUE], data.table(a=1:2,b=2:3,d=3:4, key="a")) + +# check fwrite output using new default separator option, #4956 +DT = data.table(a=1, b=2) +options(datatable.fwrite.sep='\t') +test(2177.01, fwrite(DT), output='a\tb\n1\t2') +options(datatable.fwrite.sep=';') +test(2177.02, fwrite(DT), output='a;b\n1;2') +options(datatable.fwrite.sep=NULL) +test(2177.03, fwrite(DT), output='a,b\n1,2') diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 48c7dd3085..870acaac75 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -6,7 +6,8 @@ As \code{write.csv} but much faster (e.g. 2 seconds versus 1 minute) and just as } \usage{ fwrite(x, file = "", append = FALSE, quote = "auto", - sep = ",", sep2 = c("","|",""), + sep=getOption("datatable.fwrite.sep", ","), + sep2 = c("","|",""), eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), From 38aa0b1ec39b6a312d774c888f524dbe33cb2cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Thu, 6 May 2021 10:42:46 +0200 Subject: [PATCH 203/588] Keep joins grouped by foreign columns from crashing (#4944) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 6 ++++++ src/data.table.h | 1 + src/dogroups.c | 8 +++++++- src/subset.c | 2 +- 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 56fbffbd8e..aece41efd9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,6 +74,8 @@ 9. `data.table(NULL)[, firstCol:=1L]` created `data.table(firstCol=1L)` ok but did not update the internal `row.names` attribute, causing `Error in '$<-.data.frame'(x, name, value) : replacement has 1 row, data has 0` when passed to packages like `ggplot` which use `DT` as if it is a `data.frame`, [#4597](https://github.com/Rdatatable/data.table/issues/4597). Thanks to Matthew Son for reporting, and Cole Miller for the PR. +10. `X[Y, .SD, by=]` (joining and grouping in the same query) could segfault if i) `by=` is supplied custom data (i.e. not simple expressions of columns), and ii) some rows of `Y` do not match to any rows in `X`, [#4892](https://github.com/Rdatatable/data.table/issues/4892). Thanks to @Kodiologist for reporting, @ColeMiller1 for investigating, and @tlapak for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. 
So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3d80096054..af62363e84 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17404,3 +17404,9 @@ options(datatable.fwrite.sep=';') test(2177.02, fwrite(DT), output='a;b\n1;2') options(datatable.fwrite.sep=NULL) test(2177.03, fwrite(DT), output='a,b\n1,2') + +# segfault when joining and grouping and some rows don't match, #4892 +x = data.table(id = 1:4, key = 'id') +y = data.table(id = 2:5, key = 'id') +z = data.table(c=c(2L, 2L, 1L, 1L), id=c(2L, 4L, 3L, NA)) +test(2178, x[y, .SD, by=.(c(2L, 1L, 2L, 1L))], z) diff --git a/src/data.table.h b/src/data.table.h index 542de5f1af..9c79efb80c 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -138,6 +138,7 @@ SEXP setcolorder(SEXP x, SEXP o); // subset.c void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA); SEXP subsetVector(SEXP x, SEXP idx); +const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out); // fcast.c SEXP int_vec_init(R_len_t n, int val); diff --git a/src/dogroups.c b/src/dogroups.c index 6ef4cb9815..d54dede2c1 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -158,6 +158,12 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX ansloc = 0; const int *istarts = INTEGER(starts); const int *iorder = INTEGER(order); + + // We just want to set anyNA for later. We do it only once for the whole operation + // because it is a rare edge case for it to be true. See #4892. + bool anyNA=false, orderedSubset=false; + check_idx(order, length(VECTOR_ELT(dt, 0)), &anyNA, &orderedSubset); + for(int i=0; i-1)) continue; @@ -233,7 +239,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX for (int k=0; kmax since they should have been dealt with by convertNegAndZeroIdx() called ealier at R level. 
// single cache efficient sweep with prefetch, so very low priority to go parallel From 657a8a4048e90322df69c2894c042757393d4cae Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 01:50:16 -0700 Subject: [PATCH 204/588] tidy up usage of "domain" argument in translations (#4973) --- R/data.table.R | 12 ++++++------ R/devel.R | 2 +- R/last.R | 4 ++-- R/onAttach.R | 11 ++++++----- R/onLoad.R | 19 ++++++++++--------- R/setkey.R | 2 +- R/test.data.table.R | 2 +- R/xts.R | 2 +- 8 files changed, 28 insertions(+), 26 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 81e30befd0..78122771c7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -840,7 +840,7 @@ replace_dot_alias = function(e) { if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]") } tt = vapply_1i(byval,length) - if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table')) + if (any(tt!=xnrow)) stop(domain=NA, gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow)) if (is.null(bynames)) bynames = rep.int("",length(byval)) if (length(idx <- which(!nzchar(bynames))) && !bynull) { # TODO: improve this and unify auto-naming of jsub and bysub @@ -894,7 +894,7 @@ replace_dot_alias = function(e) { # attempt to auto-name unnamed columns for (jj in which(nm=="")) { thisq = q[[jj + 1L]] - if (missing(thisq)) stop(gettextf("Item %d of the .() or list() passed to j is missing", jj, domain="R-data.table")) #3507 + if (missing(thisq)) stop(domain=NA, gettextf("Item %d of the .() or list() passed to j is missing", jj)) #3507 if (is.name(thisq)) nm[jj] = drop_dot(thisq) # TO DO: if call to a[1] for example, then call it 'a' too } @@ -993,7 +993,7 @@ replace_dot_alias = function(e) { # added 'mget' - fix for #994 if (any(c("get", "mget") %chin% av)){ if (verbose) - cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) + cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars))) # get('varname') is too difficult to detect which columns are used in general # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. @@ -1013,7 +1013,7 @@ replace_dot_alias = function(e) { } non_sdvars = setdiff(ansvars, sdvars) ansvals = chmatch(ansvars, names_x) - if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) + if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars))) } else if (length(non_sdvars)) { # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] 
etc., @@ -2946,7 +2946,7 @@ isReallyReal = function(x) { RHS = eval(stub[[3L]], x, enclos) if (is.list(RHS)) RHS = as.character(RHS) # fix for #961 if (length(RHS) != 1L && !operator %chin% c("%in%", "%chin%")){ - if (length(RHS) != nrow(x)) stop(gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x), domain="R-data.table"), domain=NA) + if (length(RHS) != nrow(x)) stop(domain=NA, gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x))) return(NULL) # DT[colA == colB] regular element-wise vector scan } if ( mode(x[[col]]) != mode(RHS) || # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric') @@ -3131,7 +3131,7 @@ isReallyReal = function(x) { } idx_op = match(operators, ops, nomatch=0L) if (any(idx_op %in% c(0L, 6L))) - stop(gettextf("Invalid join operators %s. Only allowed operators are %s.", brackify(operators[idx_op %in% c(0L, 6L)]), brackify(ops[1:5]), domain="R-data.table"), domain=NA) + stop(domain=NA, gettextf("Invalid join operators %s. Only allowed operators are %s.", brackify(operators[idx_op %in% c(0L, 6L)]), brackify(ops[1:5]))) ## the final on will contain the xCol as name, the iCol as value on = iCols names(on) = xCols diff --git a/R/devel.R b/R/devel.R index b0dfb71858..b89d1af3aa 100644 --- a/R/devel.R +++ b/R/devel.R @@ -13,7 +13,7 @@ dcf.repo = function(pkg, repo, field, type) { idx = file(file.path(contrib.url(repo, type=type),"PACKAGES")) on.exit(close(idx)) dcf = read.dcf(idx, fields=c("Package",field)) - if (!pkg %in% dcf[,"Package"]) stop(gettextf("There is no package %s in provided repository.", pkg, domain='R-data.table')) + if (!pkg %in% dcf[,"Package"]) stop(domain=NA, gettextf("There is no package %s in provided repository.", pkg)) dcf[dcf[,"Package"]==pkg, field][[1L]] } diff --git a/R/last.R b/R/last.R index abf4050b40..fe6763b7d5 100644 --- a/R/last.R +++ b/R/last.R @@ -35,7 +35,7 @@ last = function(x, n=1L, ...) { } } else { if (!requireNamespace("xts", quietly=TRUE)) - stop(gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last", domain="R-data.table")) # nocov + stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last")) # nocov if (verbose) cat("last: using xts::last: is.xts(x)\n") xts::last(x, n=n, ...) @@ -76,7 +76,7 @@ first = function(x, n=1L, ...) { } } else { if (!requireNamespace("xts", quietly=TRUE)) - stop(gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first", domain="R-data.table")) # nocov + stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first")) # nocov if (verbose) cat("first: using xts::first: is.xts(x)\n") xts::first(x, n=n, ...) 
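The pattern this patch standardises on is the usual R i18n idiom: `gettextf()` translates the format string within the package's translation domain, and `domain=NA` tells `stop()`/`warning()` not to attempt a second translation of the already-composed message (while `.onLoad`/`.onAttach` need the domain given explicitly, as the patch comments note). A generic, hedged sketch with a made-up message:

```R
n = 2L
tryCatch(stop(domain=NA, gettextf("dropped %d columns", n)),
         error = conditionMessage)
# [1] "dropped 2 columns"
```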
diff --git a/R/onAttach.R b/R/onAttach.R index 75b48eb394..3e93187e2e 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -19,13 +19,14 @@ dev = as.integer(v[1L, 3L]) %% 2L == 1L # version number odd => dev if (!isTRUE(getOption("datatable.quiet"))) { # new option in v1.12.4, #3489 packageStartupMessage("data.table ", v, if(dev)paste0(" IN DEVELOPMENT built ",d,g), - " using ", getDTthreads(verbose=FALSE), " threads (see ?getDTthreads). Latest news: r-datatable.com") - if (gettext("TRANSLATION CHECK", domain='R-data.table') != "TRANSLATION CHECK") - packageStartupMessage(gettext("**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********", domain="R-data.table")) + " using ", getDTthreads(verbose=FALSE), " threads (see ?getDTthreads). Latest news: r-datatable.com", domain="R-data.table") + # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. + if (gettext(domain="R-data.table", "TRANSLATION CHECK") != "TRANSLATION CHECK") + packageStartupMessage(domain="R-data.table", "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) - packageStartupMessage("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") + packageStartupMessage(domain="R-data.table", "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") if (!.Call(ChasOpenMP)) - packageStartupMessage("**********\n", + packageStartupMessage(domain="R-data.table", "**********\n", "This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", if (Sys.info()["sysname"]=="Darwin") "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux." diff --git a/R/onLoad.R b/R/onLoad.R index 230929c4b6..3750510ece 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -25,11 +25,12 @@ if (dllV != RV) { dll = if (.Platform$OS.type=="windows") "dll" else "so" # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 - stop("The datatable.",dll," version (",dllV,") does not match the package (",RV,"). Please close all R sessions to release the old ",toupper(dll)," and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. 
Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.") + # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. + stop(domain="R-data.table", "The datatable.",dll," version (",dllV,") does not match the package (",RV,"). Please close all R sessions to release the old ",toupper(dll)," and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.") } builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { - stop("This is R ", base::getRversion(), " but data.table has been installed using R ",builtUsing,". The major version must match. Please reinstall data.table.") + stop(domain="R-data.table", "This is R ", base::getRversion(), " but data.table has been installed using R ",builtUsing,". The major version must match. Please reinstall data.table.") # the if(R>=4.0.0) in NAMESPACE when registering S3 methods rbind.data.table and cbind.data.table happens on install; #3968 } } @@ -93,14 +94,14 @@ } if (!is.null(getOption("datatable.old.bywithoutby"))) - warning("Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") + warning(domain="R-data.table", "Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") if (!is.null(getOption("datatable.old.unique.by.key"))) - warning("Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") + warning(domain="R-data.table", "Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. 
Please use by=key(DT) instead and stop using this option.") # Test R behaviour that changed in v3.1 and is now depended on x = 1L:3L y = list(x) - if (address(x) != address(y[[1L]])) stop("Unexpected base R behaviour: list(x) has copied x") + if (address(x) != address(y[[1L]])) stop(domain="R-data.table", "Unexpected base R behaviour: list(x) has copied x") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -108,7 +109,7 @@ names(DF) = c("A","B") add3 = address(DF$A) add4 = address(DF$B) - if (add1!=add3 || add2!=add4) stop("Unexpected base R behaviour: names<- has copied column contents") + if (add1!=add3 || add2!=add4) stop(domain="R-data.table", "Unexpected base R behaviour: names<- has copied column contents") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -118,10 +119,10 @@ add4 = address(DF$a) add5 = address(DF$b) add6 = address(DF) - if (add2==add5) stop("Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") - if (add1!=add4) stop("Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") + if (add2==add5) stop(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") + if (add1!=add4) stop(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") - if (add3==add6) warning("Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") + if (add3==add6) warning(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") # R could feasibly in future not copy DF's vecsxp in this case. If that changes in R, we'd like to know via the warning # because tests will likely break too. The warning will quickly tell R-core and us why, so we can then update. diff --git a/R/setkey.R b/R/setkey.R index 1f3763b1f6..91cc87c7e4 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -352,7 +352,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) } } nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0 - if (nrow > .Machine$integer.max) stop(gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max, domain='R-data.table')) + if (nrow > .Machine$integer.max) stop(domain=NA, gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max)) l = .Call(Ccj, l) setDT(l) l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since diff --git a/R/test.data.table.R b/R/test.data.table.R index c5da3e0bac..30cc47feb6 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -46,7 +46,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov start fn2 = paste0(fn,".bz2") if (!file.exists(file.path(fulldir, fn2))) - stop(gettextf("Neither %s nor %s exist in %s",fn, fn2, fulldir, domain="R-data.table")) + stop(domain=NA, gettextf("Neither %s nor %s exist in %s",fn, fn2, fulldir)) fn = fn2 # nocov end # sys.source() below accepts .bz2 directly. diff --git a/R/xts.R b/R/xts.R index bfb6f813a7..121f36f1bd 100644 --- a/R/xts.R +++ b/R/xts.R @@ -7,7 +7,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) 
{ r = setDT(as.data.frame(x, row.names=NULL)) if (identical(keep.rownames, FALSE)) return(r[]) index_nm = if (is.character(keep.rownames)) keep.rownames else "index" - if (index_nm %chin% names(x)) stop(gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm, domain="R-data.table"), domain=NA) + if (index_nm %chin% names(x)) stop(domain=NA, gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm)) r[, c(index_nm) := zoo::index(x)] setcolorder(r, c(index_nm, setdiff(names(r), index_nm))) # save to end to allow for key=index_nm From 95e0a6ef9b3fc5a46d05d2373249aa56f6c77cc4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 02:01:00 -0700 Subject: [PATCH 205/588] Use brackify more to replace paste(collapse=',') (#4929) --- R/data.table.R | 19 ++++++++----------- R/fcast.R | 2 +- R/print.data.table.R | 8 +++++--- R/setops.R | 22 +++++++++++++++------- R/test.data.table.R | 2 +- R/utils.R | 5 ++--- inst/tests/tests.Rraw | 18 +++++++++--------- 7 files changed, 41 insertions(+), 35 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 78122771c7..2484f4c5a1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -112,13 +112,9 @@ replace_dot_alias = function(e) { used = gsub(".*object '([^']+)'.*", "\\1", err$message) found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE) if (length(found)) { - stop("Object '", used, "' not found. Perhaps you intended ", - paste(head(found, 5L), collapse=", "), - if (length(found)<=5L) "" else paste(" or",length(found)-5L, "more")) + stop("Object '", used, "' not found. Perhaps you intended ", brackify(found)) } else { - stop("Object '", used, "' not found amongst ", - paste(head(ref, 5L), collapse=', '), - if (length(ref)<=5L) "" else paste(" and", length(ref)-5L, "more")) + stop("Object '", used, "' not found amongst ", brackify(ref)) } } else { stop(err$message, call.=FALSE) @@ -691,7 +687,7 @@ replace_dot_alias = function(e) { if (!length(ansvals)) return(null.data.table()) if (!length(leftcols)) { if (!anyNA(ansvals)) return(.Call(CsubsetDT, x, irows, ansvals)) - else stop("column(s) not found: ", paste(ansvars[is.na(ansvals)],collapse=", ")) + else stop("column(s) not found: ", brackify(ansvars[is.na(ansvals)])) } # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below @@ -1164,7 +1160,7 @@ replace_dot_alias = function(e) { xcolsAns = seq_along(ansvars) icols = icolsAns = integer() } else { - if (!length(leftcols)) stop("Internal error -- column(s) not found: ", paste(ansvars[wna],collapse=", ")) # nocov + if (!length(leftcols)) stop("Internal error -- column(s) not found: ", brackify(ansvars[wna])) # nocov xcols = w[!wna] xcolsAns = which(!wna) map = c(seq_along(i), leftcols) # this map is to handle dups in leftcols, #3635 @@ -2994,10 +2990,11 @@ isReallyReal = function(x) { if(is.null(idx)){ ## check whether key fits the columns in i. ## order of key columns makes no difference, as long as they are all upfront in the key, I believe. 
- if (all(names(i) %chin% head(key(x), length(i)))){ - if (verbose) {cat("Optimized subsetting with key '", paste0( head(key(x), length(i)), collapse = ", "),"'\n",sep="");flush.console()} + key_head = head(key(x), length(i)) + if (all(names(i) %chin% key_head)){ + if (verbose) {cat("Optimized subsetting with key '", brackify(key_head),"'\n",sep="");flush.console()} idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly. - idxCols = head(key(x), length(i)) ## in correct order! + idxCols = key_head ## in correct order! } } if (is.null(idx)){ diff --git a/R/fcast.R b/R/fcast.R index dbde95846a..a95f03a448 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -57,7 +57,7 @@ value_vars = function(value.var, varnames) { valnames = unique(unlist(value.var)) iswrong = which(!valnames %chin% varnames) if (length(iswrong)) - stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.") + stop("value.var values ", brackify(value.var[iswrong]), " are not found in 'data'.") value.var } diff --git a/R/print.data.table.R b/R/print.data.table.R index 96f3e8060c..b1d5cbad50 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -45,10 +45,12 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), topn = max(as.integer(topn),1L) if (print.keys){ if (!is.null(ky <- key(x))) - cat("Key: <", paste(ky, collapse=", "), ">\n", sep="") + cat("Key: <", toString(ky), ">\n", sep="") if (!is.null(ixs <- indices(x))) - cat("Ind", if (length(ixs) > 1L) "ices" else "ex", ": <", - paste(ixs, collapse=">, <"), ">\n", sep="") + cat(sprintf( + ngettext(length(ixs), "Index: %s\n", "Indices: %s\n", domain="R-data.table"), + paste0("<", ixs, ">", collapse = ", ") + )) } if (any(dim(x)==0L)) { class = if (is.data.table(x)) "table" else "frame" # a data.frame could be passed to print.data.table() directly, #3363 diff --git a/R/setops.R b/R/setops.R index b6dcd7b0b2..89cf3fd81c 100644 --- a/R/setops.R +++ b/R/setops.R @@ -154,17 +154,25 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu k1 = key(target) k2 = key(current) if (!identical(k1, k2)) { - return(sprintf("Datasets has different keys. 'target'%s. 'current'%s.", - if(length(k1)) paste0(": ", paste(k1, collapse=", ")) else " has no key", - if(length(k2)) paste0(": ", paste(k2, collapse=", ")) else " has no key")) + return(gettextf( + "Datasets have different %s. 'target': %s. 'current': %s.", + "keys", + if(length(k1)) brackify(k1) else gettextf("has no key", domain="R-data.table"), + if(length(k2)) brackify(k2) else gettextf("has no key", domain="R-data.table"), + domain="R-data.table" + )) } # check index i1 = indices(target) i2 = indices(current) if (!identical(i1, i2)) { - return(sprintf("Datasets has different indexes. 'target'%s. 'current'%s.", - if(length(i1)) paste0(": ", paste(i1, collapse=", ")) else " has no index", - if(length(i2)) paste0(": ", paste(i2, collapse=", ")) else " has no index")) + return(gettextf( + "Datasets have different %s. 'target': %s. 
'current': %s.", + "indices", + if(length(i1)) brackify(i1) else gettextf("has no index", domain = "R-data.table"), + if(length(i2)) brackify(i2) else gettextf("has no index", domain = "R-data.table"), + domain = "R-data.table" + )) } # Trim any extra row.names attributes that came from some inheritance @@ -173,7 +181,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu a1 = exclude.attrs(attributes(target)) a2 = exclude.attrs(attributes(current)) if (length(a1) != length(a2)) return(sprintf("Datasets has different number of (non-excluded) attributes: target %s, current %s", length(a1), length(a2))) - if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", paste(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2))), collapse=", "))) + if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", brackify(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2)))))) attrs.r = all.equal(a1[nm1], a2[nm2], ..., check.attributes = check.attributes) if (is.character(attrs.r)) return(paste("Attributes: <", attrs.r, ">")) # skip further heavy processing } diff --git a/R/test.data.table.R b/R/test.data.table.R index 30cc47feb6..88eeb321d6 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -163,7 +163,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (nfail > 0L) { # nocov start if (nfail > 1L) {s1="s";s2="s: "} else {s1="";s2=" "} - stop(nfail," error",s1," out of ",ntest,". Search ",names(fn)," for test number",s2,paste(env$whichfail,collapse=", "),".") + stop(nfail," error",s1," out of ",ntest,". Search ",names(fn)," for test number",s2,toString(env$whichfail),".") # important to stop() here, so that 'R CMD check' fails # nocov end } diff --git a/R/utils.R b/R/utils.R index 42e67ea8de..babc9bd6c7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -101,7 +101,7 @@ brackify = function(x, quote=FALSE) { # keep one more than needed to trigger dots if needed if (quote && is.character(x)) x = paste0("'",head(x,CUTOFF+1L),"'") if (length(x) > CUTOFF) x = c(x[1:CUTOFF], '...') - sprintf('[%s]', paste(x, collapse = ', ')) + sprintf('[%s]', toString(x)) } # patterns done via NSE in melt.data.table and .SDcols in `[.data.table` @@ -118,8 +118,7 @@ do_patterns = function(pat_sub, all_cols) { matched = patterns(pats, cols=cols) # replace with lengths when R 3.2.0 dependency arrives if (length(idx <- which(sapply(matched, length) == 0L))) - stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', - paste(pats[idx], collapse = ', '), ']') + stop('Pattern', if (length(idx) > 1L) 's', ' not found: ', brackify(pats[idx])) return(matched) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index af62363e84..21e1ba6197 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2254,7 +2254,7 @@ test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch= DT = data.table(a=1:3,b=4:6,c=7:9) # old tests using with=FALSE retained. Eventually will deprecate with=FALSE. 
test(812.1, DT[,!"b",with=FALSE], DT[,-match("b",names(DT)),with=FALSE]) -test(812.2, DT[,"foo",with=FALSE], error="column(s) not found: foo") +test(812.2, DT[,"foo",with=FALSE], error="column(s) not found: [foo]") test(812.3, DT[,!"foo",with=FALSE], DT, warning="column(s) not removed because not found: [foo]") test(812.4, DT[,!c("b","foo"),with=FALSE], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]") test(812.5, DT[,!2:3,with=FALSE], DT[,-(2:3),with=FALSE]) # for consistency, but ! is really for character column names @@ -2274,7 +2274,7 @@ test(813.4, rownames(DT[2,"a"]), "1") # also repeat 812.* but without with=FALSE since that will be deprecated in future, and cover - as well as ! test(814.01, DT[,!"b"], DT[,c("a","c")]) test(814.02, DT[,-"b"], DT[,c("a","c")]) -test(814.03, DT[,"foo"], error="column(s) not found: foo") +test(814.03, DT[,"foo"], error="column(s) not found: [foo]") test(814.04, DT[,!"foo"], DT, warning="column(s) not removed because not found: [foo]") test(814.05, DT[,-"foo"], DT, warning="column(s) not removed because not found: [foo]") test(814.06, DT[,!c("b","foo")], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]") @@ -8464,17 +8464,17 @@ test(1613.21, all.equal(DT2, DT1, ignore.row.order = TRUE), "Dataset 'current' h # test attributes: key DT1 <- data.table(a = 1:4, b = letters[1:4], key = "a") DT2 <- data.table(a = 1:4, b = letters[1:4]) -test(1613.22, all.equal(DT1, DT2), "Datasets has different keys. 'target': a. 'current' has no key.") +test(1613.22, all.equal(DT1, DT2), "Datasets have different keys. 'target': [a]. 'current': has no key.") test(1613.23, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) test(1613.24, all.equal(DT1, setkeyv(DT2, "a"), check.attributes = TRUE), TRUE) # test attributes: index DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = 1:4, b = letters[1:4]) setindexv(DT1, "b") -test(1613.25, all.equal(DT1, DT2), "Datasets has different indexes. 'target': b. 'current' has no index.") +test(1613.25, all.equal(DT1, DT2), "Datasets have different indices. 'target': [b]. 'current': has no index.") test(1613.26, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) -test(1613.27, all.equal(DT1, setindexv(DT2, "a")), "Datasets has different indexes. 'target': b. 'current': a.") -test(1613.28, all.equal(DT1, setindexv(DT2, "b")), "Datasets has different indexes. 'target': b. 'current': a, b.") +test(1613.27, all.equal(DT1, setindexv(DT2, "a")), "Datasets have different indices. 'target': [b]. 'current': [a].") +test(1613.28, all.equal(DT1, setindexv(DT2, "b")), "Datasets have different indices. 'target': [b]. 'current': [a, b].") test(1613.29, all.equal(DT1, setindexv(setindexv(DT2, NULL), "b")), TRUE) # test custom attribute DT1 <- data.table(a = 1:4, b = letters[1:4]) @@ -8483,7 +8483,7 @@ setattr(DT1, "custom", 1L) test(1613.30, all.equal(DT1, DT2), "Datasets has different number of (non-excluded) attributes: target 3, current 2") test(1613.31, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) setattr(DT2, "custom2", 2L) -test(1613.32, all.equal(DT1, DT2), "Datasets has attributes with different names: custom, custom2") +test(1613.32, all.equal(DT1, DT2), "Datasets has attributes with different names: [custom, custom2]") setattr(DT1, "custom2", 2L) setattr(DT2, "custom", 0L) test(1613.33, all.equal(DT1, DT2), paste0("Attributes: < Component ", dQuote("custom"), ": Mean relative difference: 1 >")) @@ -13032,9 +13032,9 @@ test(1924.1, DT[var_name==1], error='not found\\. 
Perhaps you intended.*varname' test(1924.2, DT[variable==1], error='Object.*not found among') test(1924.3, DT[varname+'a'], error='non-numeric argument') DT[, VAR_NAME:=2] -test(1924.4, DT[var_name==1], error="Object 'var_name' not found. Perhaps you intended varname, VAR_NAME") +test(1924.4, DT[var_name==1], error="Object 'var_name' not found. Perhaps you intended [varname, VAR_NAME]") DT = setDT(lapply(integer(50), function(...) numeric(1L))) -test(1924.5, DT[V==0], error='Perhaps you intended.*V1.*V5 or 45 more') +test(1924.5, DT[V==0], error='Perhaps you intended.*V1.*V10, [.]{3}') # test suite of as.ITime methods (subsumes #2870) s = c('1970-01-01 00:00:00.1234', '2005-10-12 09:45:32.84') From 10463e32613bb9b400e54179d2ac92e655b22701 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 02:32:22 -0700 Subject: [PATCH 206/588] Use scalar logical operators where appropriate (#4928) --- R/as.data.table.R | 2 +- R/data.table.R | 2 +- R/merge.R | 3 ++- R/setkey.R | 2 +- inst/tests/tests.Rraw | 6 +++--- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index af02140bd1..d4d234cb53 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -20,7 +20,7 @@ as.data.table.Date = as.data.table.ITime = function(x, keep.rownames=FALSE, key= tt = deparse(substitute(x))[1L] nm = names(x) # FR #2356 - transfer names of named vector as "rn" column if required - if (!identical(keep.rownames, FALSE) & !is.null(nm)) + if (!identical(keep.rownames, FALSE) && !is.null(nm)) x = list(nm, unname(x)) else x = list(x) if (tt == make.names(tt)) { diff --git a/R/data.table.R b/R/data.table.R index 2484f4c5a1..9ac5ae7d6c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -133,7 +133,7 @@ replace_dot_alias = function(e) { else if (missing(drop)) `[.data.frame`(x,i,j) else `[.data.frame`(x,i,j,drop) # added is.data.table(ans) check to fix bug #81 - if (!missing(i) & is.data.table(ans)) setkey(ans,NULL) # See test 304 + if (!missing(i) && is.data.table(ans)) setkey(ans, NULL) # See test 304 return(ans) } if (!missing(verbose)) { diff --git a/R/merge.R b/R/merge.R index fe3bdb4549..3dc4389965 100644 --- a/R/merge.R +++ b/R/merge.R @@ -11,7 +11,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = key(x) } } - if ((x0 <- length(x)==0L) | (y0 <- length(y)==0L)) warning("You are trying to join data.tables where ", if(x0 & y0) "'x' and 'y' arguments are" else if(x0 & !y0) "'x' argument is" else if(!x0 & y0) "'y' argument is", " 0 columns data.table.") + x0 = length(x)==0L; y0 = length(y)==0L + if (x0 || y0) warning("You are trying to join data.tables where ", if(x0 && y0) "arguments 'x' and 'y' have" else if(x0) "argument 'x' has" else "argument 'y' has", " no columns.") if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.") if (any(duplicated(names(y)))) stop("y has some duplicated column name(s): ",paste(names(y)[duplicated(names(y))],collapse=","),". 
Please remove or rename the duplicate(s) and try again.") diff --git a/R/setkey.R b/R/setkey.R index 91cc87c7e4..95cf4288d1 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -295,7 +295,7 @@ setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last) if (length(o)) { .Call(Creorder, x, o) - if (is.data.frame(x) & !is.data.table(x)) { + if (is.data.frame(x) && !is.data.table(x)) { setattr(x, 'row.names', rownames(x)[o]) } k = key(x) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 21e1ba6197..fb69d4d8b9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8319,9 +8319,9 @@ test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id # warn when merge empty data.table #597 test(1601.1, merge(data.table(a=1),data.table(a=1), by="a"), data.table(a=1, key="a")) -test(1601.2, tryCatch(merge(data.table(a=1),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'y' argument is 0 columns data.table.") -test(1601.3, tryCatch(merge(data.table(NULL),data.table(a=1), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' argument is 0 columns data.table.") -test(1601.4, tryCatch(merge(data.table(NULL),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' and 'y' arguments are 0 columns data.table.") +test(1601.2, tryCatch(merge(data.table(a=1),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where argument 'y' has no columns.") +test(1601.3, tryCatch(merge(data.table(NULL),data.table(a=1), by="a"), warning = function(w) w$message), "You are trying to join data.tables where argument 'x' has no columns.") +test(1601.4, tryCatch(merge(data.table(NULL),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where arguments 'x' and 'y' have no columns.") # fix for #1549 d1 <- data.table(v1=1:2,x=x) From 81676aab62f1f2c48a7fcf464e87f219e7440c5a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 10:36:58 -0700 Subject: [PATCH 207/588] use paste0 in vignette (#4927) --- vignettes/datatable-reshape.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index c9fb43dabd..e8ebe04df9 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -190,8 +190,8 @@ Since we'd like for `data.table`s to perform this operation straightforward and The idea is quite simple. We pass a list of columns to `measure.vars`, where each element of the list contains the columns that should be combined together. 
```{r} -colA = paste("dob_child", 1:3, sep = "") -colB = paste("gender_child", 1:3, sep = "") +colA = paste0("dob_child", 1:3) +colB = paste0("gender_child", 1:3) DT.m2 = melt(DT, measure = list(colA, colB), value.name = c("dob", "gender")) DT.m2 From f018088555a14008d607cde53917393485b5d7f7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 10:40:37 -0700 Subject: [PATCH 208/588] use encodeString over gsub("\n", "\\\\n") (#4926) --- R/test.data.table.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 88eeb321d6..6736714923 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -390,16 +390,16 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no if (length(output) && !string_match(output, out)) { # nocov start cat("Test",numStr,"did not produce correct output:\n") - cat("Expected: <<",gsub("\n","\\\\n",output),">>\n",sep="") # \n printed as '\\n' so the two lines of output can be compared vertically - cat("Observed: <<",gsub("\n","\\\\n",out),">>\n",sep="") + cat("Expected: <<",encodeString(output),">>\n",sep="") # \n printed as '\\n' so the two lines of output can be compared vertically + cat("Observed: <<",encodeString(out),">>\n",sep="") fail = TRUE # nocov end } if (length(notOutput) && string_match(notOutput, out, ignore.case=TRUE)) { # nocov start cat("Test",numStr,"produced output but should not have:\n") - cat("Expected absent (case insensitive): <<",gsub("\n","\\\\n",notOutput),">>\n",sep="") - cat("Observed: <<",gsub("\n","\\\\n",out),">>\n",sep="") + cat("Expected absent (case insensitive): <<",encodeString(notOutput),">>\n",sep="") + cat("Observed: <<",encodeString(out),">>\n",sep="") fail = TRUE # nocov end } From f0f624b0f9afcf8ddbfaca3c2d7afb817ad0bd06 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 10:41:55 -0700 Subject: [PATCH 209/588] remove tab (\t) from source (#4925) --- vignettes/datatable-reference-semantics.Rmd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 4747a76fd2..792bbf3b4b 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -96,12 +96,12 @@ It can be used in `j` in two ways: (b) The functional form - ```{r eval = FALSE} - DT[, `:=`(colA = valA, # valA is assigned to colA - colB = valB, # valB is assigned to colB - ... - )] - ``` + ```{r eval = FALSE} + DT[, `:=`(colA = valA, # valA is assigned to colA + colB = valB, # valB is assigned to colB + ... 
+ )] + ``` #### {.bs-callout .bs-callout-warning} From 1f0d00e3ce2b5156b2b980ee9c6b25f5476b2f2d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 6 May 2021 10:48:49 -0700 Subject: [PATCH 210/588] use %iscall% in cedta() (#4895) This should be a bit safer --- R/cedta.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/cedta.R b/R/cedta.R index 262db0a105..181ad542e5 100644 --- a/R/cedta.R +++ b/R/cedta.R @@ -32,7 +32,7 @@ cedta = function(n=2L) { "data.table" %chin% names(getNamespaceImports(ns)) || # most common and recommended cases first for speed (nsname=="utils" && (exists("debugger.look", parent.frame(n+1L)) || - (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]][[1L]]=='example')) ) || # 'example' for #2972 + (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972 (nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply (nsname %chin% cedta.pkgEvalsUserCode && any(sapply(sys.calls(), function(x) is.name(x[[1L]]) && (x[[1L]]=="eval" || x[[1L]]=="evalq")))) || nsname %chin% cedta.override || From be22eff3d60ace9fdbef996a81e7ece2e2f884a9 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Fri, 7 May 2021 16:08:51 +0800 Subject: [PATCH 211/588] Fix segfaults when assigning all-NA-vector to factor column (#4829) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 5 +++++ src/assign.c | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index aece41efd9..ad673a4417 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,8 @@ 10. `X[Y, .SD, by=]` (joining and grouping in the same query) could segfault if i) `by=` is supplied custom data (i.e. not simple expressions of columns), and ii) some rows of `Y` do not match to any rows in `X`, [#4892](https://github.com/Rdatatable/data.table/issues/4892). Thanks to @Kodiologist for reporting, @ColeMiller1 for investigating, and @tlapak for the PR. +11. Assigning a set of 2 or more all-NA values to a factor column could segfault, [#4824](https://github.com/Rdatatable/data.table/issues/4824). Thanks to @clerousset for reporting and @shrektan for fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fb69d4d8b9..5db6c3fc2c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17410,3 +17410,8 @@ x = data.table(id = 1:4, key = 'id') y = data.table(id = 2:5, key = 'id') z = data.table(c=c(2L, 2L, 1L, 1L), id=c(2L, 4L, 3L, NA)) test(2178, x[y, .SD, by=.(c(2L, 1L, 2L, 1L))], z) + +# assigning all-na length>1 to a factor column was segfault, #4824 +DT = data.table(FACTOR = factor(rep("a", 3L))) +set(DT, i=1:2, j="FACTOR", value=rep(NA, 2L)) +test(2179, DT$FACTOR, factor(c(NA, NA, "a"))) diff --git a/src/assign.c b/src/assign.c index e811276610..3b9aba0074 100644 --- a/src/assign.c +++ b/src/assign.c @@ -690,7 +690,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // sourceLen==1 is used in dogroups to recycle the group values into ans to match the nrow of each group's result; sourceStart is set to each group value row. { if (len<1) return NULL; - const int slen = sourceLen>=0 ? sourceLen : length(source); + int slen = sourceLen>=0 ? sourceLen : length(source); // since source may get reassigned to a scalar, we should not mark it as const if (slen==0) return NULL; if (sourceStart<0 || sourceStart+slen>length(source)) error(_("Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d"), sourceStart, sourceLen, length(source)); // # nocov @@ -718,7 +718,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } else if (!sourceIsFactor && !isString(source)) { // target is factor if (allNA(source, false)) { // return false for list and other types that allNA does not support - source = ScalarLogical(NA_LOGICAL); // a global constant in R and won't allocate; fall through to regular zero-copy coerce + source = ScalarLogical(NA_LOGICAL); slen = 1; // a global constant in R and won't allocate; fall through to regular zero-copy coerce } else if (isInteger(source) || isReal(source)) { // allow assigning level numbers to factor columns; test 425, 426, 429 and 1945 const int nlevel = length(getAttrib(target, R_LevelsSymbol)); From 89c6b1b4b02048b1e5d8fc93ed58b9f6f50715eb Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 7 May 2021 10:48:14 +0200 Subject: [PATCH 212/588] extra test to ensure behavior will not silently change in future (#4836) --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5db6c3fc2c..378d2dc93f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17415,3 +17415,7 @@ test(2178, x[y, .SD, by=.(c(2L, 1L, 2L, 1L))], z) DT = data.table(FACTOR = factor(rep("a", 3L))) set(DT, i=1:2, j="FACTOR", value=rep(NA, 2L)) test(2179, DT$FACTOR, factor(c(NA, NA, "a"))) + +# deleting duplicated column name removes only first +DT = data.table(a=1, b=2, a=3) +test(2180, DT[, a:=NULL], data.table(b=2, a=3)) From 0af42d7b43ec6baa8b939161ebf6d281c45ea562 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 7 May 2021 02:00:58 -0700 Subject: [PATCH 213/588] sync data.table.h signature of forder with that in forder.c (#4859) --- src/data.table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.table.h b/src/data.table.h index 9c79efb80c..67b4fbe82b 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -128,7 +128,7 @@ int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); -SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); +SEXP 
forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); int getNumericRounding_C(); // reorder.c From e0e0e49baf544781968f50300c7afdea3d7f9ad4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 7 May 2021 02:05:22 -0700 Subject: [PATCH 214/588] remove an obsolete reference from the FAQ vignette (#4862) --- vignettes/datatable-faq.Rmd | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index e0cd81b343..816cb99882 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -68,9 +68,20 @@ The `j` expression is the 2nd argument. Try `DT[ , c("x","y","z")]` or `DT[ , .( ## I assigned a variable `mycol = "x"` but then `DT[ , mycol]` returns `"x"`. How do I get it to look up the column name contained in the `mycol` variable? -In v1.9.8 released Nov 2016 there is an ability to turn on new behaviour: `options(datatable.WhenJisSymbolThenCallingScope=TRUE)`. It will then work as you expected, just like data.frame. If you are a new user of data.table, you should probably do this. You can place this command in your .Rprofile file so you don't have to remember again. See the long item in release notes about this. The release notes are linked at the top of the data.table homepage: [NEWS](https://github.com/Rdatatable/data.table/blob/master/NEWS.md). +What's happening is that the `j` expression sees objects in the calling scope. The variable `mycol` does not exist as a column name of `DT` so `data.table` then looked in the calling scope and found `mycol` there and returned its value `"x"`. This is correct behaviour currently. Had `mycol` been a column name, then that column's data would have been returned. -Without turning on that new behaviour, what's happening is that the `j` expression sees objects in the calling scope. The variable `mycol` does not exist as a column name of `DT` so data.table then looked in the calling scope and found `mycol` there and returned its value `"x"`. This is correct behaviour currently. Had `mycol` been a column name, then that column's data would have been returned. What has been done to date has been `DT[ , mycol, with = FALSE]` which will return the `x` column's data as required. That will still work in the future, too. Alternatively, since a data.table _is_ a `list`, too, you have been and still will be able to write and rely on `DT[[mycol]]`. +To get the column `x` from `DT`, there are a few options: + +```r +# using .. to tell data.table the variable should be evaluated +DT[ , ..mycol] +# using with=FALSE to do the same +DT[ , mycol, with=FALSE] +# treating DT as a list and using [[ +DT[[mycol]] +``` + +The `with` argument refers to the `base` function `with` -- when `with=TRUE`, `data.table` operates similar to `with`, i.e. `DT[ , mycol]` behaves like `with(DT, mycol)`. When `with=FALSE`, the standard `data.frame` evaluation rules apply. ## What are the benefits of being able to use column names as if they are variables inside `DT[...]`? 
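The rewritten FAQ entry above lists three ways to look up a column whose name is stored in a variable; a short worked example may help. The toy table and values below are illustrative only:

```r
library(data.table)
DT = data.table(x = 1:3, y = letters[1:3])
mycol = "x"

DT[, mycol]                # "x": mycol is not a column, so j finds it in the calling scope
DT[, ..mycol]              # one-column data.table containing x
DT[, mycol, with = FALSE]  # same result as the ..mycol form
DT[[mycol]]                # the x column as a plain vector: 1L 2L 3L
```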
From dec9383b3e584347b01c2a9d5b90c1e75f340c61 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 7 May 2021 02:16:14 -0700 Subject: [PATCH 215/588] mark a message for translation (#4884) --- src/fsort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fsort.c b/src/fsort.c index 5c1cf946e8..3d29401c03 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -312,7 +312,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { free(myworking); } if (non_monotonic) - error("OpenMP %d did not assign threads to iterations monotonically. Please search Stack Overflow for this message.", MY_OPENMP); // # nocov; #4786 in v1.13.4 + error(_("OpenMP %d did not assign threads to iterations monotonically. Please search Stack Overflow for this message."), MY_OPENMP); // # nocov; #4786 in v1.13.4 if (alloc_fail) error(_("Unable to allocate working memory")); // # nocov } From 2939cb8128c340685792a8b0948499a2200fd8a6 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 7 May 2021 13:47:49 -0700 Subject: [PATCH 216/588] as.data.table(table(NULL)) returns null data.table (#4180) --- NEWS.md | 2 ++ R/as.data.table.R | 2 ++ inst/tests/tests.Rraw | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ad673a4417..0a9e8f0745 100644 --- a/NEWS.md +++ b/NEWS.md @@ -78,6 +78,8 @@ 11. Assigning a set of 2 or more all-NA values to a factor column could segfault, [#4824](https://github.com/Rdatatable/data.table/issues/4824). Thanks to @clerousset for reporting and @shrektan for fixing. +12. `as.data.table(table(NULL))` now returns `data.table(NULL)` rather than error `attempt to set an attribute on NULL`, [#4179](https://github.com/Rdatatable/data.table/issues/4179). The result differs slightly to `as.data.frame(table(NULL))` (0-row, 1-column) because 0-column works better with other `data.table` functions like `rbindlist()`. Thanks to Michael Chirico for the report and fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/as.data.table.R b/R/as.data.table.R index d4d234cb53..9509774075 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -33,6 +33,8 @@ as.data.table.Date = as.data.table.ITime = function(x, keep.rownames=FALSE, key= # as.data.table.table - FR #361 as.data.table.table = function(x, keep.rownames=FALSE, key=NULL, ...) 
{ + # prevent #4179 & just cut out here + if (any(dim(x) == 0L)) return(null.data.table()) # Fix for bug #43 - order of columns are different when doing as.data.table(with(DT, table(x, y))) val = rev(dimnames(provideDimnames(x))) if (is.null(names(val)) || !any(nzchar(names(val)))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 378d2dc93f..43cf3d87ca 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16907,7 +16907,6 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") rm(s1, s2, class2132) - if (test_xts) { # keep.rownames in as.data.table.xts() supports a string, #4232 xts = xts::xts(1:10, structure(1:10, class = "Date")) @@ -17419,3 +17418,6 @@ test(2179, DT$FACTOR, factor(c(NA, NA, "a"))) # deleting duplicated column name removes only first DT = data.table(a=1, b=2, a=3) test(2180, DT[, a:=NULL], data.table(b=2, a=3)) + +# as.data.table(table(NULL)) was error, #4179 +test(2181, as.data.table(table(NULL)), data.table(NULL)) From 7b4bd7da03ca821cd8e840b62fe923d54035b908 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 7 May 2021 13:56:37 -0700 Subject: [PATCH 217/588] Add alt text to vignette plots (#4792) --- vignettes/datatable-sd-usage.Rmd | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 8f23c58554..fda2c4751f 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -143,7 +143,7 @@ Varying model specification is a core feature of robust statistical analysis. Le Here's a short script leveraging the power of `.SD` which explores this question: -```{r sd_for_lm, cache = FALSE} +```{r sd_for_lm, cache = FALSE, fig.cap="Fit OLS coefficient on W, various specifications, depicted as bars with distinct colors."} # this generates a list of the 2^k possible extra variables # for models of the form ERA ~ G + (...) extra_var = c('yearID', 'teamID', 'G', 'L') @@ -199,9 +199,7 @@ Note that the `x[y]` syntax returns `nrow(y)` values (i.e., it's a right join), Often, we'd like to perform some operation on our data _at the group level_. When we specify `by =` (or `keyby = `), the mental model for what happens when `data.table` processes `j` is to think of your `data.table` as being split into many component sub-`data.table`s, each of which corresponds to a single value of your `by` variable(s): -```{r grouping_png, fig.cap = "Grouping, Illustrated", echo = FALSE} -knitr::include_graphics('plots/grouping_illustration.png') -``` +![Grouping, Illustrated](plots/grouping_illustration.png 'A visual depiction of how grouping works. On the left is a grid. The first column is titled "ID COLUMN" with values the capital letters A through G, and the rest of the data is unlabelled, but is in a darker color and simply has "Data" written to indicate that's arbitrary. A right arrow shows how this data is split into groups. Each capital letter A through G has a grid on the right-hand side; the grid on the left has been subdivided to create that on the right.') In the case of grouping, `.SD` is multiple in nature -- it refers to _each_ of these sub-`data.table`s, _one-at-a-time_ (slightly more accurately, the scope of `.SD` is a single sub-`data.table`). 
This allows us to concisely express an operation that we'd like to perform on _each sub-`data.table`_ before the re-assembled result is returned to us. @@ -237,7 +235,7 @@ _NB_: `.SD[1L]` is currently optimized by [_`GForce`_](https://Rdatatable.gitlab Returning to the inquiry above regarding the relationship between `ERA` and `W`, suppose we expect this relationship to differ by team (i.e., there's a different slope for each team). We can easily re-run this regression to explore the heterogeneity in this relationship as follows (noting that the standard errors from this approach are generally incorrect -- the specification `ERA ~ W*teamID` will be better -- this approach is easier to read and the _coefficients_ are OK): -```{r group_lm, results = 'hide'} +```{r group_lm, results = 'hide', fig.cap="A histogram depicting the distribution of fitted coefficients. It is vaguely bell-shaped and concentrated around -.2"} # Overall coefficient for comparison overall_coef = Pitching[ , coef(lm(ERA ~ W))['W']] # use the .N > 20 filter to exclude teams with few observations From ebc5bc37ff4fb71280866bc4fbfdecbca036cbab Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 7 May 2021 14:44:57 -0700 Subject: [PATCH 218/588] fix variable with melt(measure.vars=list), na.rm=T/F consistency (#4723) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 +++++-- man/melt.data.table.Rd | 2 +- src/fmelt.c | 15 +++------------ 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0a9e8f0745..398819815d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -80,6 +80,8 @@ 12. `as.data.table(table(NULL))` now returns `data.table(NULL)` rather than error `attempt to set an attribute on NULL`, [#4179](https://github.com/Rdatatable/data.table/issues/4179). The result differs slightly to `as.data.frame(table(NULL))` (0-row, 1-column) because 0-column works better with other `data.table` functions like `rbindlist()`. Thanks to Michael Chirico for the report and fix. +13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 43cf3d87ca..9f3bb5eb30 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3014,6 +3014,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) error="Unknown 'id.vars' type raw") test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") + test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1)) + test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) + test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] @@ -3175,9 +3178,9 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") x[, c("y1","z1"):=NA] test(1037.405, dim(melt(x, measure.vars=patterns("^y", "^z"))), INT(4,5)) test(1037.406, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE)), INT(2,5)) - test(1037.407, ans$variable, factor(c("1","1"))) + test(1037.407, ans$variable, factor(c("2","2"), c("1", "2"))) test(1037.408, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE, variable.factor=FALSE)), INT(2,5)) - test(1037.409, ans$variable, c("1","1")) + test(1037.409, ans$variable, c("2","2")) test(1037.410, melt(data.table(NULL), verbose=TRUE), data.table(NULL), output="ncol(data) is 0. Nothing to melt") diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index e56a10e4e1..e51c61aaac 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -31,7 +31,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } -\item{variable.name}{name for the measured variable names column. The default name is \code{'variable'}.} +\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors.} \item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. } \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten data.} diff --git a/src/fmelt.c b/src/fmelt.c index 22a4ac1fc5..39b93d2e81 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -538,7 +538,6 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str } else { for (int j=0, ansloc=0, level=1; jlmax; ++j) { const int thislen = data->narm ? 
length(VECTOR_ELT(data->naidx, j)) : data->nrow; - if (thislen==0) continue; // so as not to bump level char buff[20]; snprintf(buff, 20, "%d", level++); SEXP str = PROTECT(mkChar(buff)); @@ -546,11 +545,11 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str UNPROTECT(1); } } - } else { + } else {// varfactor==TRUE SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (data->lvalues == 1) { + if (data->lvalues == 1) {//single output column. SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; @@ -573,24 +572,16 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; for (int k=0; klmax)); protecti++; for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - if (thislen==0) continue; // so as not to bump level char buff[20]; snprintf(buff, 20, "%d", nlevel+1); SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels for (int k=0; klmax) { - // data->narm is true and there are some all-NA items causing at least one 'if (thislen==0) continue' above - // shrink the levels - SEXP newlevels = PROTECT(allocVector(STRSXP, nlevel)); protecti++; - for (int i=0; i Date: Sat, 8 May 2021 23:09:21 -0700 Subject: [PATCH 219/588] support missing values in measure.vars arg to melt (#4720) --- NEWS.md | 2 + inst/tests/tests.Rraw | 11 ++++ man/melt.data.table.Rd | 10 +++- src/chmatch.c | 9 ++-- src/fmelt.c | 115 ++++++++++++++++++++++++----------------- 5 files changed, 97 insertions(+), 50 deletions(-) diff --git a/NEWS.md b/NEWS.md index 398819815d..11250c6c94 100644 --- a/NEWS.md +++ b/NEWS.md @@ -54,6 +54,8 @@ 7. `fwrite()` gains a new `datatable.fwrite.sep` option to change the default separator, still `","` by default. Thanks to Tony Fischetti for the PR. As is good practice in R in general, we usually resist new global options for the reason that a user changing the option for their own code can inadvertently change the behaviour of any package using `data.table` too. However, in this case, the global option affects file output rather than code behaviour. In fact, the very reason the user may wish to change the default separator is that they know a different separator is more appropriate for their data being passed to the package using `fwrite` but cannot otherwise change the `fwrite` call within that package. +8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f3bb5eb30..e13fea88ca 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17424,3 +17424,14 @@ test(2180, DT[, a:=NULL], data.table(b=2, a=3)) # as.data.table(table(NULL)) was error, #4179 test(2181, as.data.table(table(NULL)), data.table(NULL)) + +# some missing variables in melt, #4027 +DT.wide = data.table(a2=2, b1=1, b2=2) +expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) +test(2182.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) +test(2182.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) +DTid = data.table(DT.wide, id=1) +exid = data.table(id=1, expected) +test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) +test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) +test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index e51c61aaac..5ff25005d5 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -64,7 +64,11 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt -into multiple columns in a single function call efficiently. The function +into multiple columns in a single function call efficiently. +If a vector in the list contains missing values, or is shorter than the +max length of the list elements, then the output will include runs of +missing values at the specified position, or at the end. +The function \code{\link{patterns}} can be used to provide regular expression patterns. When used along with \code{melt}, if \code{cols} argument is not provided, the patterns will be matched against \code{names(data)}, for convenience. @@ -134,6 +138,10 @@ melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) +# measure list with missing/short entries results in output with runs of NA +DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] +melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) + } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} diff --git a/src/chmatch.c b/src/chmatch.c index f80e7dd2c7..a091e646f0 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -74,11 +74,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } + // in future if we need NAs in x not to be matched to NAs in table ... + // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0) + // SET_TRUELENGTH(NA_STRING, 0); if (chmatchdup) { // chmatchdup() is basically base::pmatch() but without the partial matching part. 
For example : // chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' @@ -107,7 +110,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch for (int i=0; incol; +} + SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { - int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; + int ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans; SEXP dtnames = PROTECT(getAttrib(DT, R_NamesSymbol)); protecti++; if (isNull(id) && isNull(measure)) { - for (i=0; i ncol) error(_("One or more values in 'id.vars' is invalid.")); else if (!LOGICAL(booltmp)[i]) targetcols++; @@ -176,7 +180,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; u = 0; - for (i=0; i ncol) + for (int i=0; i ncol) error(_("One or more values in 'id.vars' is invalid.")); } @@ -248,8 +252,8 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { if (isNewList(measure)) { tmp = PROTECT(unlist_(tmp2)); protecti++; } - for (i=0; i ncol) + for (int i=0; ilmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); + data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); @@ -296,29 +307,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); - for (i=0; ilvalues; i++) { + // first find max type of each output column. + for (int i=0; ilvalues; ++i) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; - data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i]; - for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if (isFactor(thiscol)) { - data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; - data->maxtype[i] = STRSXP; - } else { - type = TYPEOF(thiscol); - if (type > data->maxtype[i]) data->maxtype[i] = type; + for (int j=0; jleach[i]; ++j) { // for each input column. + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if (isFactor(thiscol)) { + data->isfactor[i] = (isOrdered(thiscol)) ? 
2 : 1; + data->maxtype[i] = STRSXP; + } else { + type = TYPEOF(thiscol); + if (type > data->maxtype[i]) data->maxtype[i] = type; + } } } - for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { - data->isidentical[i] = 0; - break; + for (int j=0; jleach[i]; ++j) { + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || + (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { + data->isidentical[i] = 0; + break; + } } } } @@ -392,6 +410,16 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType return ans; } +SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { + if (in_col < data->leach[out_col]) { + int input_column_num = INTEGER(thisvaluecols)[in_col]; + if (input_column_num != NA_INTEGER) { + return VECTOR_ELT(DT, input_column_num-1); + } + } + return allocNAVector(data->maxtype[out_col], data->nrow); +} + SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { for (int i=0; ilvalues; ++i) { SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); @@ -407,12 +435,8 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int i=0; ilmax; ++i) { SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (int j=0; jlvalues; ++j) { - if (i < data->leach[j]) { - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1)); - } else { - SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); - } + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); + SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i)); } tmp = PROTECT(dt_na(tmp, seqcols)); SEXP w; @@ -427,18 +451,17 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax)); Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean)); SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues)); - for (int i=0; ilvalues; ++i) { + for (int i=0; ilvalues; ++i) {//for each output/value column. bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor; SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy SET_VECTOR_ELT(ansvals, i, target); UNPROTECT(1); // still protected by virtue of being member of protected ansval. - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids. int counter = 0; bool copyattr = false; - for (int j=0; jlmax; ++j) { + for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol = (j < data->leach[i]) ? 
VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1) - : allocNAVector(data->maxtype[i], data->nrow); + SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j); if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = true; From f5c65266983af9f52b5629fbdf8d43c066eaf4f6 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 9 May 2021 02:21:05 -0700 Subject: [PATCH 220/588] melt with custom variable columns using variable_table attribute (#4731) --- .Rbuildignore | 1 + NEWS.md | 2 + R/data.table.R | 2 +- R/fmelt.R | 147 ++++++++++++++++++- R/utils.R | 38 ++--- inst/tests/tests.Rraw | 82 ++++++++++- man/measure.Rd | 64 +++++++++ man/melt.data.table.Rd | 7 +- src/data.table.h | 1 + src/fmelt.c | 244 ++++++++++++++++++++------------ src/init.c | 2 + vignettes/datatable-reshape.Rmd | 91 +++++++++++- 12 files changed, 561 insertions(+), 120 deletions(-) create mode 100644 man/measure.Rd diff --git a/.Rbuildignore b/.Rbuildignore index ad51ae2da7..a6cb72b2a9 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ +.dir-locals.el ^\.Rprofile$ ^data\.table_.*\.tar\.gz$ ^vignettes/plots/figures$ diff --git a/NEWS.md b/NEWS.md index 11250c6c94..ab3d1d97d3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,6 +56,8 @@ 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. +9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
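A minimal R sketch (not part of the patch itself) of the new measure() interface described in NEWS item 9 above; it reuses the iris example added to man/measure.Rd later in this patch:

    library(data.table)
    two.iris = data.table(datasets::iris)[c(1,150)]
    # single value column; column names are split on "." into part/dim variable columns
    melt(two.iris, measure.vars=measure(part, dim, sep="."))
    # two value columns (Sepal, Petal), because the first group is the value.name keyword
    melt(two.iris, measure.vars=measure(value.name, dim, sep="."))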
diff --git a/R/data.table.R b/R/data.table.R index 9ac5ae7d6c..7d3acef2fa 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -948,7 +948,7 @@ replace_dot_alias = function(e) { } else { if (colsub %iscall% 'patterns') { # each pattern gives a new filter condition, intersect the end result - .SDcols = Reduce(intersect, do_patterns(colsub, names_x)) + .SDcols = Reduce(intersect, eval_with_cols(colsub, names_x)) } else { .SDcols = eval(colsub, parent.frame(), parent.frame()) # allow filtering via function in .SDcols, #3950 diff --git a/R/fmelt.R b/R/fmelt.R index 3594fce8ca..936876b4d8 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -3,7 +3,7 @@ # reshape2 package is deprecated since December 2017, so we'll deprecate our # redirection as well -melt <- function(data, ..., na.rm = FALSE, value.name = "value") { +melt = function(data, ..., na.rm = FALSE, value.name = "value") { if (is.data.table(data)) { UseMethod("melt", data) # if data is not data.table and reshape2 is installed, this won't dispatch to reshape2's method; @@ -22,10 +22,144 @@ melt <- function(data, ..., na.rm = FALSE, value.name = "value") { patterns = function(..., cols=character(0L)) { # if ... has no names, names(list(...)) will be ""; # this assures they'll be NULL instead - p = unlist(list(...), use.names = any(nzchar(names(...)))) + L = list(...) + p = unlist(L, use.names = any(nzchar(names(L)))) if (!is.character(p)) stop("Input patterns must be of type character.") - lapply(p, grep, cols) + matched = lapply(p, grep, cols) + # replace with lengths when R 3.2.0 dependency arrives + if (length(idx <- which(sapply(matched, length) == 0L))) + stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', + paste(p[idx], collapse = ', '), ']') + matched +} + +measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { + # 1. basic error checking. + if (!missing(sep) && !missing(pattern)) { + stop("both sep and pattern arguments used in measure; must use either sep or pattern (not both)") + } + if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { + stop("multiple.keyword must be a character string with nchar>0") + } + if (!is.character(cols)) { + stop("cols must be a character vector of column names") + } + # 2. compute conversion function list with group names. + mcall = match.call() + L = as.list(mcall)[-1] + formal.names <- names(formals()) + fun.list = L[-which(names(L) %in% formal.names)] + user.named = names(fun.list) != "" + is.symb = sapply(fun.list, is.symbol) + bad.i = which((!user.named) & (!is.symb)) + if (length(bad.i)) { + stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) + } + names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) + # group names error checking. + group.is.formal <- names(fun.list) %in% formal.names + if (any(group.is.formal)) { + bad.names <- names(fun.list)[group.is.formal] + stop("group names specified in ... conflict with measure argument names; please fix by changing group names: ", paste(bad.names, collapse=",")) + } + err.names.unique <- function(err.what, name.vec) { + name.tab = table(name.vec) + bad.counts = name.tab[1 < name.tab] + if (length(bad.counts)) { + stop(err.what, " names should be unique, problems: ", paste(names(bad.counts), collapse=",")) + } + } + err.args.groups <- function(type, N){ + if (N != length(fun.list)) { + stop("number of ... 
arguments to measure =", length(fun.list), " must be same as ", type, " =", N) + } + } + err.names.unique("measure group", names(fun.list)) + # 3. compute initial group data table, used as variable_table attribute. + group.mat = if (!missing(pattern)) { + if (!is.character(pattern)) { + stop("pattern must be character string") + } + match.vec = regexpr(pattern, cols, perl=TRUE) + measure.vec = which(0 < match.vec) + if (length(measure.vec) == 0L) { + stop("pattern did not match any cols, so nothing would be melted; fix by changing pattern") + } + start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] + if (is.null(start)) { + stop("pattern must contain at least one capture group (parenthesized sub-pattern)") + } + err.args.groups("number of capture groups in pattern", ncol(start)) + end = attr(match.vec, "capture.length")[measure.vec,]+start-1L + names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) + substr(names.mat, start, end) + } else { #pattern not specified, so split using sep. + if (!is.character(sep)) { + stop("sep must be character string") + } + list.of.vectors = strsplit(cols, sep, fixed=TRUE) + vector.lengths = sapply(list.of.vectors, length) + n.groups = max(vector.lengths) + if (n.groups == 1) { + stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") + } + err.args.groups("max number of items after splitting column names", n.groups) + measure.vec = which(vector.lengths==n.groups) + do.call(rbind, list.of.vectors[measure.vec]) + } + err.names.unique("measured column", cols[measure.vec]) + uniq.mat <- unique(group.mat) + if (nrow(uniq.mat) < nrow(group.mat)) { + stop("number of unique column IDs =", nrow(uniq.mat), " is less than number of melted columns =", nrow(group.mat), "; fix by changing pattern/sep") + } + colnames(group.mat) = names(fun.list) + group.dt = data.table(group.mat) + # 4. apply conversion functions to group data table. + for (group.i in which(user.named)) { + group.name = names(fun.list)[[group.i]] + fun = eval(fun.list[[group.name]], parent.frame(1L)) + if (!is.function(fun) || length(formals(args(fun)))==0) { + stop("each ... argument to measure must be a function with at least one argument, problem: ", group.name) + } + group.val = fun(group.dt[[group.name]]) + if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { + stop("each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) + } + if (all(is.na(group.val))) { + stop(group.name, " conversion function returned vector of all NA") + } + set(group.dt, j=group.name, value=group.val) + } + group.uniq <- unique(group.dt) + if (nrow(group.uniq) < nrow(group.dt)) { + stop("number of unique groups after applying type conversion functions less than number of groups, change type conversion") + } + # 5. compute measure.vars list or vector. + if (multiple.keyword %in% names(fun.list)) {# multiple output columns. 
+ if (!is.character(group.dt[[multiple.keyword]])) { + stop(multiple.keyword, " column class=", class(group.dt[[multiple.keyword]])[[1L]], " after applying conversion function, but must be character") + } + is.other = names(group.dt) != multiple.keyword + if (!any(is.other)) { + stop(multiple.keyword, " is the only group; fix by creating at least one more group") + } + other.values = lapply(group.dt[, is.other, with=FALSE], unique) + other.values$stringsAsFactors = FALSE + other.dt = data.table(do.call(expand.grid, other.values)) + measure.list = structure(list(), variable_table=other.dt) + column.values = unique(group.dt[[multiple.keyword]]) + for(column.val in column.values){ + select.dt = data.table(other.dt) + set(select.dt, j=multiple.keyword, value=column.val) + measure.list[[column.val]] = data.table( + measure.vec, group.dt + )[select.dt, measure.vec, on=names(select.dt)] + } + measure.list + } else {# single output column. + structure(measure.vec, variable_table=group.dt) + } } melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", @@ -35,8 +169,11 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl if (missing(id.vars)) id.vars=NULL if (missing(measure.vars)) measure.vars = NULL measure.sub = substitute(measure.vars) - if (measure.sub %iscall% "patterns") { - measure.vars = do_patterns(measure.sub, names(data)) + if (is.call(measure.sub)) { + eval.result = eval_with_cols(measure.sub, names(data)) + if (!is.null(eval.result)) { + measure.vars = eval.result + } } if (is.list(measure.vars) && length(measure.vars) > 1L) { meas.nm = names(measure.vars) diff --git a/R/utils.R b/R/utils.R index babc9bd6c7..45678f5a4d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -105,22 +105,28 @@ brackify = function(x, quote=FALSE) { } # patterns done via NSE in melt.data.table and .SDcols in `[.data.table` -do_patterns = function(pat_sub, all_cols) { - # received as substitute(patterns(...)) - pat_sub = as.list(pat_sub)[-1L] - # identify cols = argument if present - idx = which(names(pat_sub) == "cols") - if (length(idx)) { - cols = eval(pat_sub[["cols"]], parent.frame(2L)) - pat_sub = pat_sub[-idx] - } else cols = all_cols - pats = lapply(pat_sub, eval, parent.frame(2L)) - matched = patterns(pats, cols=cols) - # replace with lengths when R 3.2.0 dependency arrives - if (length(idx <- which(sapply(matched, length) == 0L))) - stop('Pattern', if (length(idx) > 1L) 's', ' not found: ', brackify(pats[idx])) - - return(matched) +# was called do_patterns() before PR#4731 +eval_with_cols = function(orig_call, all_cols) { + parent = parent.frame(2L) + fun_uneval = orig_call[[1L]] + # take fun from either calling env (parent) or from data.table + fun = tryCatch({ + maybe_fun = eval(fun_uneval, parent) + # parent env could have a non-function with this name, which we + # should ignore. + stopifnot(is.function(maybe_fun)) + maybe_fun + }, error=function(e) { + eval(fun_uneval)#take function from data.table namespace. 
+ }) + if (!is.primitive(fun)) { + named_call = match.call(fun, orig_call) + if ("cols" %in% names(formals(fun)) && !"cols" %in% names(named_call)) { + named_call[["cols"]] = all_cols + } + named_call[[1L]] = fun + eval(named_call, parent) + } } # check UTC status diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e13fea88ca..819c1ba0ca 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15623,7 +15623,7 @@ DT <- data.table( f_1 = factor(c('a', 'c', 'b', NA, 'c', 'b', 'c', 'c', NA, 'c', NA, 'c', 'a', 'b', NA, NA, NA, 'a')), c_1 = c("a", "c", NA, NA, NA, "c", "b", NA, "a", "b", NA, "a", "c", "b", "c", "b", "a", "b") ) -test(2063.1, melt(DT, id=1:2, measure=3:4), melt(DT, id=c("i_1", "i_2"), measure=c("f_1", "c_1"))) +test(2063.1, melt(DT, id=1:2, measure.vars=3:4), melt(DT, id=c("i_1", "i_2"), measure.vars=c("f_1", "c_1"))) ## fun --> fun.aggregate DT = melt(as.data.table(ChickWeight), id.vars=2:4) setnames(DT, tolower(names(DT))) @@ -17435,3 +17435,83 @@ exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 + +# new variable.name attribute for measure.vars, PR#4731 for multiple issues +measure = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. +test(2183.01, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +measure = list("foo", "bar")#measure below should not use this since it is not a function. 
+test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2183.03, melt(DTid, measure.vars=structure(list(a=c(NA,"a2"),b=c("b1","b2")), variable_table=data.table(number=as.complex(1:2)))), error="variable_table does not support column type 'complex' for column 'number'") +test(2183.04, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2183.05, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword +test(2183.06, melt(DTid, measure.vars=structure(list(1, 2), variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") +test(2183.07, melt(DTid, measure.vars=structure(1:3, variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") +test(2183.08, melt(DTid, measure.vars=structure(1:3, variable_table=data.table())), error="variable_table attribute of measure.vars should be a data table with at least one column") +test(2183.09, melt(DTid, measure.vars=structure(1:3, variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") +test(2183.10, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +test(2183.11, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2")#make sure to check each list element, not just the first. +# general measure errors. +iris.dt = data.table(datasets::iris) +test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") +# school example. 
+schools.wide <- data.table( + school = c("A","B"), + read_1 = c(1.1,2.1), read_1_sp = c(T,T), + read_2 = c(1.2,2.2), + math_1 = c(10.1,20.1), math_1_sp = c(T,T), + math_2 = c(NA,20.2), math_2_sp = c(NA,F)) +schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=measure(subject, number=as.integer, value.name=function(x)ifelse(x=="", "score", "sp"), pattern="([^_]+)_([12])(.*)")) +schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) +test(2183.21, schools.tall, schools.expected) +who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) +test(2183.22, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))")), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) +# measure with sep= +test(2183.23, melt(wide.again, na.rm=TRUE, measure.vars=measure(value.name, subject, number=as.integer))[order(score)], schools.expected)#should work without sep due to same default _ as dcast. +test(2183.24, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) +test(2183.25, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width")) +test(2183.26, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) +test(2183.27, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim") +test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2") +test(2183.29, melt(iris.dt, measure.vars=measure()), error="each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") +# patterns with iris data. +test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) +# measure with pattern= +test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: istr") +test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... 
arguments to measure =3 must be same as number of capture groups in pattern =2") +test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") +test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") +test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="measure group names should be unique, problems: value.name") +# measure with factor conversion. +myfac = function(x)factor(x)#user-defined conversion function. +test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) +# measure errors. +iris.i <- 1 +iris.num <- datasets::iris[iris.i, 1:4] +iris.days <- data.table( + day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) +test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning="NAs introduced by coercion") +test(2183.62, melt(iris.days, measure.vars=measure(before=function(x)rep(4, length(x)), value.name, dim, sep=".")), error="number of unique groups after applying type conversion functions less than number of groups, change type conversion") +test(2183.63, melt(iris.days, measure.vars=measure(before, value.name, dim, pattern="(day)[12][.](.*)[.](.*)")), error="number of unique column IDs =4 is less than number of melted columns =8; fix by changing pattern/sep") +test(2183.64, melt(iris.days, measure.vars=measure(day=as.integer, value.name, dim, pattern="day(.)[.](.*)[.](.*)")), data.table(Species=factor("setosa"), day=as.integer(c(1,2,1,2)), dim=c("Length","Length","Width","Width"), Sepal=c(5.1,5.1,3.5,3.5), Petal=c(1.4,1.4,0.2,0.2))) +test(2183.65, melt(iris.days, measure.vars=measure(pattern="day")), error="pattern must contain at least one capture group (parenthesized sub-pattern)") +test(2183.66, melt(iris.days, measure.vars=measure(value.name, pattern="(.*)")), error="value.name is the only group; fix by creating at least one more group") +test(2183.67, melt(iris.days, measure.vars=measure(foo, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") +test(2183.68, melt(iris.days, measure.vars=measure(value.name, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") +test(2183.69, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured column names should be unique, problems: ff") +test(2183.70, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured column names should be unique, problems: f_f") +test(2183.71, melt(iris.days, measure.vars=measure(value.name=as.integer, variable, pattern="day(.)[.](.*)")), error="value.name column class=integer after applying conversion function, but must be character") +test(2183.72, melt(data.table(ff=1, ff=2, a=3, b=4), measure.vars=measure(letter, pattern="([ab])"), id.vars="ff"), data.table(ff=1, letter=c("a","b"), value=c(3,4)))#duplicate column names are fine if they are not matched by pattern. 
+test(2183.73, melt(DTid, measure.vars=measure(letter, multiple.keyword, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: multiple.keyword") +test(2183.74, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=as.integer, pattern="([ab])([12])")), error="multiple.keyword must be a character string") +test(2183.75, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=NA_character_, pattern="([ab])([12])")), error="multiple.keyword must be a character string") +test(2183.76, melt(DTid, measure.vars=measure(letter, number, multiple.keyword="", pattern="([ab])([12])")), error="multiple.keyword must be a character string with nchar>0") +test(2183.77, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: cols") +test(2183.78, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="cols must be a character vector of column names") +test(2183.79, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer)), error="pattern must be character string") +test(2183.80, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") +##melt(DTid, measure.vars=measure(letter, number, sep=NA_character_) +##melt(DTid, measure.vars=measure(letter, number, sep=character()) diff --git a/man/measure.Rd b/man/measure.Rd new file mode 100644 index 0000000000..964660b6f8 --- /dev/null +++ b/man/measure.Rd @@ -0,0 +1,64 @@ +\name{measure} +\alias{measure} +\title{Specify measure.vars via regex or separator} +\description{ +\code{measure} computes an integer vector or list which can be passed as +the \code{measure.vars} argument to \code{melt}. +See the \code{Efficient reshaping using +data.tables} vignette linked below to learn more. +} +\usage{ +measure(\dots, sep, pattern, cols, multiple.keyword="value.name") +} +\arguments{ + \item{\dots}{One or more (1) symbols (without argument name; symbol + is used for output variable column name) or (2) + functions (with argument name that is used for output variable + column name). Must have same number of arguments as groups that are + specified by either \code{sep} or \code{pattern} arguments.} + \item{sep}{Separator to split \code{cols} into groups. Columns that + result in the maximum number of groups are considered measure variables.} + \item{pattern}{Perl-compatible regex with capture groups to match to + \code{cols}. Columns that match the regex are considered measure variables.} + \item{cols}{A character vector of column names.} + \item{multiple.keyword}{A string, if used in \code{\dots}, then + measure returns a list and melt returns multiple + value columns (with names defined by the unique values in that + group). Otherwise if the string not used in \code{\dots}, then + measure returns a vector and melt returns a single value column.} +} +\seealso{ + \code{\link{melt}}, + \url{https://github.com/Rdatatable/data.table/wiki/Getting-started} +} +\examples{ +(two.iris = data.table(datasets::iris)[c(1,150)]) +# melt into a single value column. +melt(two.iris, measure.vars = measure(part, dim, sep=".")) +# melt into two value columns, one for each part. +melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) +# melt into two value columns, one for each dim. 
+melt(two.iris, measure.vars = measure(part, value.name, sep=".")) +# melt using either sep or pattern, converting child number to integer. +(two.families = data.table(sex_child1="M", sex_child2="F", age_child1=10, age_child2=20)) +print(melt(two.families, measure.vars = measure( + value.name, child=as.integer, + sep="_child" +)), class=TRUE) +print(melt(two.families, measure.vars = measure( + value.name, child=as.integer, + pattern="(.*)_child(.)" +)), class=TRUE) +# inspired by data(who, package="tidyr") +(who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) +# melt to three variable columns, all character. +melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)(.*)")) +# melt to five variable columns, two numeric (with custom conversion). +print(melt(who, measure.vars = measure( + diagnosis, gender, ages, + ymin=as.numeric, + ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), + pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))" +)), class=TRUE) +} +\keyword{data} diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 5ff25005d5..3794231f99 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -31,7 +31,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } -\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors.} +\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors. If \code{measure.vars} has attribute \code{variable_table} then it must be a data table with nrow = length of \code{measure.vars} vector(s), each row describing the corresponding measured variables(s), (typically created via \code{measure}) and its columns will be output instead of the \code{variable.name} column.} \item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. } \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten data.} @@ -142,6 +142,11 @@ melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) +# specifying columns to melt via separator. +melt(DT.missing.cols, measure=measure(value.name, number=as.integer, sep="_")) + +# specifying columns to melt via regex. 
+melt(DT.missing.cols, measure=measure(value.name, number=as.integer, pattern="(.)_(.)")) } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} diff --git a/src/data.table.h b/src/data.table.h index 67b4fbe82b..6cb5413918 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -101,6 +101,7 @@ extern SEXP sym_inherits; extern SEXP sym_datatable_locked; extern SEXP sym_tzone; extern SEXP sym_old_fread_datetime_character; +extern SEXP sym_variable_table; extern double NA_INT64_D; extern long long NA_INT64_LL; extern Rcomplex NA_CPLX; // initialized in init.c; see there for comments diff --git a/src/fmelt.c b/src/fmelt.c index 5357f905c0..3a1da3bdc8 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -104,17 +104,17 @@ SEXP measurelist(SEXP measure, SEXP dtnames) { for (int i=0; i ncol) @@ -242,18 +242,18 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } idcols = PROTECT(tmp); protecti++; switch(TYPEOF(measure)) { - case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0)); protecti++; break; - case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; - case INTSXP : tmp2 = measure; break; - case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; - default : error(_("Unknown 'measure.vars' type %s, must be character or integer vector"), type2char(TYPEOF(measure))); + case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0)); protecti++; break; + case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; + case INTSXP : tmp2 = measure; break; + case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; + default : error(_("Unknown 'measure.vars' type %s, must be character or integer vector"), type2char(TYPEOF(measure))); } tmp = tmp2; if (isNewList(measure)) { tmp = PROTECT(unlist_(tmp2)); protecti++; } for (int i=0; ileach[i]; ++j) { int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ @@ -343,6 +346,26 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna if (data->narm) { SET_VECTOR_ELT(data->RCHK, 1, data->naidx = allocVector(VECSXP, data->lmax)); } + // TDH 1 Oct 2020 variable table. + data->variable_table = getAttrib(measure, sym_variable_table); + if (isNull(data->variable_table)) { + // We need to include this check first because isNewList(NULL) == + // TRUE + data->lvars = 1; + } else if (isNewList(data->variable_table)) { + data->lvars = length(data->variable_table); + if (data->lvars == 0) { + error(_("variable_table attribute of measure.vars should be a data table with at least one column")); + } + for (int i=0; ivariable_table); ++i) { + int nrow = length(VECTOR_ELT(data->variable_table, i)); + if (data->lmax != nrow) { + error(_("variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); + } + } + } else {//neither NULL nor DT. 
+ error(_("variable_table attribute of measure.vars should be either NULL or a data table")); + } } static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType, Rboolean * isRowOrdered) @@ -418,7 +441,7 @@ SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int } } return allocNAVector(data->maxtype[out_col], data->nrow); -} +} SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { for (int i=0; ilvalues; ++i) { @@ -545,69 +568,99 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str // reworked in PR#3455 to create character/factor directly for efficiency, and handle duplicates (#1754) // data->nrow * data->lmax == data->totlen int protecti=0; - SEXP ansvars=PROTECT(allocVector(VECSXP, 1)); protecti++; + SEXP ansvars=PROTECT(allocVector(VECSXP, data->lvars)); protecti++; SEXP target; if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov - if (!varfactor) { - SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (data->lvalues == 1) { - const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); - for (int j=0, ansloc=0; jlmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - SEXP str = STRING_ELT(dtnames, thisvaluecols[j]-1); - for (int k=0; kvariable_table)) { + if (!varfactor) { + SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); + if (data->lvalues == 1) {//one value column to output. + const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); + for (int j=0, ansloc=0; jlmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + SEXP str = STRING_ELT(dtnames, thisvaluecols[j]-1); + for (int k=0; klmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + char buff[20]; + snprintf(buff, 20, "%d", level++); + for (int k=0; klmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - char buff[20]; - snprintf(buff, 20, "%d", level++); - SEXP str = PROTECT(mkChar(buff)); - for (int k=0; ktotlen)); + SEXP levels; + int *td = INTEGER(target); + if (data->lvalues == 1) {//one value column to output. + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); + int len = length(thisvaluecols); + levels = PROTECT(allocVector(STRSXP, len)); protecti++; + const int *vd = INTEGER(thisvaluecols); + for (int j=0; jnarm && length(VECTOR_ELT(data->naidx, j))==0)) { numRemove++; md[j]=0; } + } + if (numRemove) { + SEXP newlevels = PROTECT(allocVector(STRSXP, len-numRemove)); protecti++; + for (int i=0, loc=0; ilmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + for (int k=0; klmax)); protecti++; + for (int j=0, ansloc=0; jlmax; ++j) { + const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + char buff[20]; + snprintf(buff, 20, "%d", nlevel+1); + SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels + for (int k=0; ktotlen)); - SEXP levels; - int *td = INTEGER(target); - if (data->lvalues == 1) {//single output column. 
- SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); - int len = length(thisvaluecols); - levels = PROTECT(allocVector(STRSXP, len)); protecti++; - const int *vd = INTEGER(thisvaluecols); - for (int j=0; jnarm && length(VECTOR_ELT(data->naidx, j))==0)) { numRemove++; md[j]=0; } - } - if (numRemove) { - SEXP newlevels = PROTECT(allocVector(STRSXP, len-numRemove)); protecti++; - for (int i=0, loc=0; ilmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - for (int k=0; klmax)); protecti++; + } else { //variable_table specified + for (int out_col_i=0; out_col_ilvars; ++out_col_i) { + SEXP out_col = VECTOR_ELT(data->variable_table, out_col_i); + SET_VECTOR_ELT(ansvars, out_col_i, target=allocVector(TYPEOF(out_col), data->totlen)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; - char buff[20]; - snprintf(buff, 20, "%d", nlevel+1); - SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels - for (int k=0; kvariable_table, R_NamesSymbol), out_col_i))); + } } } - setAttrib(target, R_LevelsSymbol, levels); - setAttrib(target, R_ClassSymbol, ScalarString(char_factor)); } UNPROTECT(protecti); return(ansvars); @@ -729,22 +782,31 @@ SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP ansids = PROTECT(getidcols(DT, dtnames, verbose, &data)); protecti++; // populate 'ans' - ans = PROTECT(allocVector(VECSXP, data.lids+1+data.lvalues)); protecti++; // 1 is for variable column + int ncol_ans = data.lids+data.lvars+data.lvalues; + ans = PROTECT(allocVector(VECSXP, ncol_ans)); protecti++; // 1 is for variable column for (int i=0; i Date: Sun, 9 May 2021 03:59:55 -0600 Subject: [PATCH 221/588] added output= to new test 1035.013 (#4983) --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 819c1ba0ca..6eebfb2222 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3014,7 +3014,8 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) error="Unknown 'id.vars' type raw") test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") - test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1)) + test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1), + output="Duplicate column names found") test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") From d8e0fb37efb7673a20ad79afb1685002c3fd65aa Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 04:37:40 -0600 Subject: [PATCH 222/588] remove trailing whitespace (#4984) --- R/IDateTime.R | 12 ++++++------ R/as.data.table.R | 2 +- R/data.table.R | 4 ++-- R/setops.R | 2 +- src/assign.c | 8 ++++---- src/dogroups.c | 10 +++++----- src/fifelse.c | 6 +++--- src/forder.c | 4 ++-- src/fread.c | 4 ++-- src/fread.h | 2 +- src/freadR.c | 2 +- src/fsort.c | 8 ++++---- src/fwriteR.c | 2 +- src/nqrecreateindices.c | 2 +- src/openmp-utils.c | 2 +- src/snprintf.c | 30 +++++++++++++++--------------- src/utils.c | 8 ++++---- 17 files changed, 54 insertions(+), 54 deletions(-) diff --git a/R/IDateTime.R 
b/R/IDateTime.R index 0c0be82e83..0bf7bf0fac 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -240,20 +240,20 @@ rep.ITime = function (x, ...) class(y) = "ITime" # unlass and rep could feasibly not copy, hence use class<- not setattr() y } - -round.ITime <- function(x, digits = c("hours", "minutes"), ...) + +round.ITime <- function(x, digits = c("hours", "minutes"), ...) { (setattr(switch(match.arg(digits), hours = as.integer(round(unclass(x)/3600)*3600), - minutes = as.integer(round(unclass(x)/60)*60)), + minutes = as.integer(round(unclass(x)/60)*60)), "class", "ITime")) -} +} -trunc.ITime <- function(x, units = c("hours", "minutes"), ...) +trunc.ITime <- function(x, units = c("hours", "minutes"), ...) { (setattr(switch(match.arg(units), hours = as.integer(unclass(x)%/%3600*3600), - minutes = as.integer(unclass(x)%/%60*60)), + minutes = as.integer(unclass(x)%/%60*60)), "class", "ITime")) } diff --git a/R/as.data.table.R b/R/as.data.table.R index 9509774075..a706aecb86 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -131,7 +131,7 @@ as.data.table.list = function(x, eachncol = integer(n) missing.check.names = missing(check.names) origListNames = if (missing(.named)) names(x) else NULL # as.data.table called directly, not from inside data.table() which provides .named, #3854 - empty_atomic = FALSE + empty_atomic = FALSE for (i in seq_len(n)) { xi = x[[i]] if (is.null(xi)) next # eachncol already initialized to 0 by integer() above diff --git a/R/data.table.R b/R/data.table.R index 7d3acef2fa..8af2e6cf0b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -160,7 +160,7 @@ replace_dot_alias = function(e) { else if (!isTRUEorFALSE(keyby)) stop("When by and keyby are both provided, keyby must be TRUE or FALSE") } - } + } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" naturaljoin = FALSE @@ -1347,7 +1347,7 @@ replace_dot_alias = function(e) { if (is.data.table(jval)) { # should set the parent class only when jval is a plain data.table #4324 - if (identical(class(jval), c('data.table', 'data.frame'))) + if (identical(class(jval), c('data.table', 'data.frame'))) setattr(jval, 'class', class(x)) # fix for #64 if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) setattr(jval, 'sorted', key(x)) diff --git a/R/setops.R b/R/setops.R index 89cf3fd81c..bd3a4eed27 100644 --- a/R/setops.R +++ b/R/setops.R @@ -63,7 +63,7 @@ fintersect = function(x, y, all=FALSE) { x = shallow(x)[, ".seqn" := rowidv(x)] y = shallow(y)[, ".seqn" := rowidv(y)] jn.on = c(".seqn",setdiff(names(y),".seqn")) - # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) + # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) y[x, .SD, .SDcols=setdiff(names(y),".seqn"), nomatch=NULL, on=jn.on] } else { z = funique(x) # fixes #3034. When .. prefix in i= is implemented (TODO), this can be x[funique(..y), on=, multi=] diff --git a/src/assign.c b/src/assign.c index 3b9aba0074..bcd7f29265 100644 --- a/src/assign.c +++ b/src/assign.c @@ -156,21 +156,21 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) SET_OBJECT(newdt, OBJECT(dt)); IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt); // To support S4 objects that incude data.table //SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0 - + // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. 
It // also increases truelength. Perhaps make that distinction, then, and split out, but marked // so that the next change knows to duplicate. // keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing // some of its elements. - + // We copy all attributes that refer to column names so that calling setnames on either // the original or the shallow copy doesn't break anything. SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++; setAttrib(newdt, sym_index, shallow_duplicate(index)); - + SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++; setAttrib(newdt, sym_sorted, duplicate(sorted)); - + SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; if (isNull(cols)) { diff --git a/src/dogroups.c b/src/dogroups.c index d54dede2c1..5aa827166b 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -49,7 +49,7 @@ static bool anySpecialStatic(SEXP x) { if (isNewList(x)) { if (TRUELENGTH(x)<0) return true; // test 2158 - for (int i=0; i-1)) continue; diff --git a/src/fifelse.c b/src/fifelse.c index 398cefb212..5218cf7b4f 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -154,7 +154,7 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { "Note that the default argument must be named explicitly, e.g., default=0"), narg); } if (narg==0) return R_NilValue; - + SEXP cons0 = PROTECT(eval(SEXPPTR_RO(args)[0], rho)); SEXP value0 = PROTECT(eval(SEXPPTR_RO(args)[1], rho)); // value0 will be compared to from loop so leave it protected throughout SEXPTYPE type0 = TYPEOF(value0); @@ -167,7 +167,7 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { SEXP tracker = PROTECT(allocVector(INTSXP, len0)); int *restrict p = INTEGER(tracker); copyMostAttrib(value0, ans); - + bool nonna=!isNull(na); if (nonna) { if (xlength(na) != 1) { @@ -194,7 +194,7 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { } } } - + const int n = narg/2; for (int i=0; iy - return strcmp(CHAR(x), CHAR(y)); // bmerge calls ENC2UTF8 on x and y before passing here + return strcmp(CHAR(x), CHAR(y)); // bmerge calls ENC2UTF8 on x and y before passing here } static void cradix_r(SEXP *xsub, int n, int radix) @@ -1257,7 +1257,7 @@ SEXP issorted(SEXP x, SEXP by) // returning NA when NA present, and is multi-column. // TODO: test in big steps first to return faster if unsortedness is at the end (a common case of rbind'ing data to end) // These are all sequential access to x, so quick and cache efficient. Could be parallel by checking continuity at batch boundaries. 
- + if (!isNull(by) && !isInteger(by)) STOP(_("Internal error: issorted 'by' must be NULL or integer vector")); if (isVectorAtomic(x) || length(by)==1) { // one-column special case is very common so specialize it by avoiding column-type switches inside the row-loop later diff --git a/src/fread.c b/src/fread.c index 7b1ba6df03..aee28faf07 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1056,7 +1056,7 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx) if (!args.noTZasUTC) goto fail; // if neither Z nor UTC offset is present, then it's local time and that's not directly supported yet; see news for v1.13.0 - // but user can specify that the unmarked datetimes are UTC by passing tz="UTC" + // but user can specify that the unmarked datetimes are UTC by passing tz="UTC" // if local time is UTC (env variable TZ is "" or "UTC", not unset) then local time is UTC, and that's caught by fread at R level too } } @@ -2271,7 +2271,7 @@ int freadMain(freadMainArgs _args) { fun[abs(thisType)](&fctx); if (*tch!=sep) break; int8_t thisSize = size[j]; - if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0 + if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0 tch++; j++; } diff --git a/src/fread.h b/src/fread.h index 0f365511cc..c0e9669d01 100644 --- a/src/fread.h +++ b/src/fread.h @@ -145,7 +145,7 @@ typedef struct freadMainArgs bool logical01; bool keepLeadingZeros; - + // should datetime with no Z or UTZ-offset be read as UTC? bool noTZasUTC; diff --git a/src/freadR.c b/src/freadR.c index bd93555f8e..842baf00a3 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -335,7 +335,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int type[i]=CT_STRING; // e.g. CT_ISO8601_DATE changed to character here so that as.POSIXct treats the date-only as local time in tests 1743.122 and 2150.11 SET_STRING_ELT(colClassesAs, i, tt); } - } else { + } else { type[i] = typeEnum[w-1]; // freadMain checks bump up only not down if (w==NUT) SET_STRING_ELT(colClassesAs, i, tt); } diff --git a/src/fsort.c b/src/fsort.c index 3d29401c03..c50f8bc3eb 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -154,7 +154,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { // TODO: -0ULL should allow negatives // avoid twiddle function call as expensive in recent tests (0.34 vs 2.7) // possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep - + union {double d; uint64_t u64;} u; u.d = max; uint64_t maxULL = u.u64; @@ -262,7 +262,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { double *restrict myworking = NULL; // the working memory for the largest group per thread is allocated when the thread receives its first iteration int myfirstmsb = -1; // for the monotonicity check - + #pragma omp for schedule(monotonic_dynamic,1) // We require here that a thread can never be assigned to an earlier iteration; e.g. threads 0:(nth-1) // get iterations 0:(nth-1), possibly out of order, then first-come-first-served in order after that. 
@@ -291,7 +291,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { if (myfirstmsb==-1 || msb Date: Sun, 9 May 2021 04:34:06 -0700 Subject: [PATCH 223/588] fix for melt with na.rm=TRUE and list column value (#4737) --- inst/tests/tests.Rraw | 6 +++++- src/fmelt.c | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6eebfb2222..4efddd4699 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3019,6 +3019,10 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") + # na.rm=TRUE with list column value, PR#4737 + test(1035.016, melt(data.table(a1=1, b1=list(1:2), b2=list(c('foo','bar'))), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=list(1:2))) + test(1035.017, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=1))#this worked even before the PR. + ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] test(1035.02, melt(DT, id.vars=c("i_1", "i_2", "l_2"), measure.vars=c("l_1")), ans1) @@ -3041,7 +3045,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.051, ans1, melt(DT, id.vars="id", measure.vars=list(c(5, 6), c(7, 8)))) test(1035.052, melt(DT, id.vars="id", measure.vars=list(as.raw(0))), error="Unknown 'measure.vars' type raw") - test(1035.06, ans1, melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) # should've no effect + test(1035.06, na.omit(ans1), melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) test(1035.07, ans1, melt(DT, id.vars="id", measure.vars=patterns("d_", "l_"))) # melt retains ordered factors! test(1035.08, melt(DT, id.vars="id", measure.vars=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=TRUE)) diff --git a/src/fmelt.c b/src/fmelt.c index 3a1da3bdc8..8c204cb5ce 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -448,10 +448,6 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); if (!data->isidentical[i]) warning(_("'measure.vars' [%s] are not all of the same type. By order of hierarchy, the molten data value column will be of type '%s'. All measure variables not of type '%s' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion.\n"), concat(dtnames, thisvaluecols), type2char(data->maxtype[i]), type2char(data->maxtype[i])); - if (data->maxtype[i] == VECSXP && data->narm) { - if (verbose) Rprintf(_("The molten data value type is a list at item %d. 
'na.rm=TRUE' is ignored.\n"), i+1); - data->narm = FALSE; - } } if (data->narm) { SEXP seqcols = PROTECT(seq_int(data->lvalues, 1)); From 531be3705290c98562530b64994b15777dc6c08d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 05:55:44 -0600 Subject: [PATCH 224/588] news item, and added Ofek to DESCRIPTION --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 68af8d8857..9e5302f2bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,8 @@ Authors@R: c( person("Kevin","Ushey", role="ctb"), person("Dirk","Eddelbuettel", role="ctb"), person("Ben","Schwen", role="ctb"), - person("Tony","Fischetti", role="ctb")) + person("Tony","Fischetti", role="ctb"), + person("Ofek","Shilon", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index ab3d1d97d3..e14b6d4373 100644 --- a/NEWS.md +++ b/NEWS.md @@ -86,6 +86,8 @@ 13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. +14. `by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
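A minimal base-R sketch of the root cause behind bug-fix item 14 above and the two patches that follow (#4981/#4982): `all.vars()` cannot see a column that is referenced only through a string passed to `get()`, so it cannot be trusted to infer the columns used in `by=`. The `bysub` object below is a standalone illustration, not code taken from the patch:

    bysub = quote(list(b = get("b"), c))
    all.vars(bysub)   # [1] "c"
    # the "b" inside get("b") is a character constant, not a symbol, so the
    # column b is invisible to all.vars()
    all.names(bysub)  # [1] "list" "get"  "c"
    # all.names() at least exposes that get() is being called, which is what
    # the follow-up fix tests for

Consequently `intersect(all.vars(bysub), names_x)` would return only `"c"`, and an optimisation that subsets just the inferred by-columns would drop `b`, giving the `object not found` error reported in #4873.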
For example : From e61905b012db68d379c0603b1c48616c959fe273 Mon Sep 17 00:00:00 2001 From: Ofek Date: Sun, 9 May 2021 14:58:49 +0300 Subject: [PATCH 225/588] don't trust all.vars when its arguments contains get/eval (#4982) --- R/data.table.R | 9 ++++++++- inst/tests/tests.Rraw | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 8af2e6cf0b..33c2c13d77 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -756,7 +756,14 @@ replace_dot_alias = function(e) { bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]] bysubl = as.list.default(bysub) } - allbyvars = intersect(all.vars(bysub), names_x) + # Fix 4981: when the 'by' expression includes get/mget/eval, all.vars + # cannot be trusted to infer all used columns + bysub.elems <- rapply(as.list(bysub), as.character) + if (any(c("eval","evalq","eval.parent","local","get","mget","dynGet") %chin% bysub.elems)) + allbyvars = NULL + else + allbyvars = intersect(all.vars(bysub), names_x) + orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__) bysameorder = byindex = FALSE if (!bysub %iscall% ":" && ##Fix #4285 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4efddd4699..9c06b8b239 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17520,3 +17520,14 @@ test(2183.79, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer test(2183.80, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") ##melt(DTid, measure.vars=measure(letter, number, sep=NA_character_) ##melt(DTid, measure.vars=measure(letter, number, sep=character()) + +# `keyby` allows mixing eval/get with direct columns, #4981 +dt <- data.table(a=c(1,2), b=c(3,4), c=c(1,0)) +dt2 <- dt[,.(suma=sum(a)), keyby=.(b=get("b"),c)] +test(2184.1, dt2[1, suma], 1) +dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=b,c)] +test(2184.2, dt2[1, suma], 2) +dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=get("b"))] +test(2184.3, dt2[1, suma], 2) +dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=get("b"),c)] +test(2184.4, dt2[1, suma], 2) \ No newline at end of file From d96cf66a16bba71712da5daa302b607604eb6aa4 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 9 May 2021 14:30:49 +0200 Subject: [PATCH 226/588] unit test for closed issue #4873 (#4985) --- inst/tests/tests.Rraw | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9c06b8b239..d691ed8f76 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17530,4 +17530,10 @@ test(2184.2, dt2[1, suma], 2) dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=get("b"))] test(2184.3, dt2[1, suma], 2) dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=get("b"),c)] -test(2184.4, dt2[1, suma], 2) \ No newline at end of file +test(2184.4, dt2[1, suma], 2) +# #4873 +IDT = as.data.table(iris) +vr = "Species" +IDT[, virginca := get(vr) == "virginica"] +ans = data.table(round = c(3, 3, 3, 2, 2, 4, 2, 4), k = c(6, 7, 8, 5, 7, 7, 6, 8), kar = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("setosa", "versicolor", "virginica"), class = "factor"), N = c(24L, 14L, 4L, 1L, 1L, 1L, 3L, 2L)) +test(2184.5, IDT[(virginca), .N, by = .(round(Sepal.Width), k = round(Sepal.Length), kar = get(vr))] , ans) From c8b75cb2e65d0c740d2e32f1e9af301660972d00 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 07:05:36 -0600 Subject: [PATCH 227/588] #4982 follow up to pass R 3.1.0; using 
all.names() also cleaner than rapply..as.character --- R/data.table.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 33c2c13d77..88071b99a7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -756,10 +756,8 @@ replace_dot_alias = function(e) { bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]] bysubl = as.list.default(bysub) } - # Fix 4981: when the 'by' expression includes get/mget/eval, all.vars - # cannot be trusted to infer all used columns - bysub.elems <- rapply(as.list(bysub), as.character) - if (any(c("eval","evalq","eval.parent","local","get","mget","dynGet") %chin% bysub.elems)) + if (any(c("eval","evalq","eval.parent","local","get","mget","dynGet") %chin% all.names(bysub))) + # when the 'by' expression includes get/mget/eval, all.vars cannot be trusted to infer all used columns, #4981 allbyvars = NULL else allbyvars = intersect(all.vars(bysub), names_x) From cecf528b4a079fc16fad23daf13cc36e3b18ada5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 9 May 2021 23:18:46 -0700 Subject: [PATCH 228/588] More translations: cat() output (#4151) --- R/between.R | 8 +- R/bmerge.R | 52 ++-- R/cedta.R | 2 +- R/data.table.R | 114 ++++----- R/devel.R | 6 +- R/fmelt.R | 2 +- R/foverlaps.R | 4 +- R/fread.R | 4 +- R/fwrite.R | 2 +- R/last.R | 24 +- R/print.data.table.R | 12 +- R/setkey.R | 8 +- R/tables.R | 4 +- R/test.data.table.R | 54 ++-- R/utils.R | 5 + inst/tests/tests.Rraw | 2 +- po/R-data.table.pot | 330 +++++++++++++++++++++---- po/R-zh_CN.po | 554 ++++++++++++++++++++++++++++++++++-------- po/zh_CN.po | 10 +- 19 files changed, 892 insertions(+), 305 deletions(-) diff --git a/R/between.R b/R/between.R index f5a6600da6..c9ca8d0429 100644 --- a/R/between.R +++ b/R/between.R @@ -44,7 +44,7 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) # length(upper) can be 1 or length(x) independently of lower .Call(Cbetween, x, lower, upper, incbounds, NAbounds, check) } else { - if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") + if (isTRUE(getOption("datatable.verbose"))) catf("optimised between not available for this data type, fallback to slow R routine\n") if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stop("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") if (check && any(lower>upper, na.rm=TRUE)) stop("Some lower>upper for this non-numeric and non-character type") if (incbounds) x>=lower & x<=upper @@ -78,7 +78,7 @@ inrange = function(x,lower,upper,incbounds=TRUE) { subject = setDT(list(l=lower, u=upper)) ops = if (incbounds) c(4L, 2L) else c(5L, 3L) # >=,<= and >,< verbose = isTRUE(getOption("datatable.verbose")) - if (verbose) {last.started.at=proc.time();cat("forderv(query) took ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("forderv(query) took ... ");flush.console()} if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} ans = bmerge(shallow(subject), query, 1L:2L, c(1L,1L), 0, c(FALSE, TRUE), 0L, "all", ops, verbose) # fix for #1819, turn on verbose messages @@ -86,9 +86,9 @@ inrange = function(x,lower,upper,incbounds=TRUE) { options(datatable.verbose=FALSE) setDT(ans[c("starts", "lens")], key=c("starts", "lens")) options(datatable.verbose=verbose) - if (verbose) {last.started.at=proc.time();cat("Generating final logical vector ... 
");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Generating final logical vector ... ");flush.console()} .Call(Cinrange, idx <- vector("logical", length(x)), xo, ans[["starts"]], ans[["lens"]]) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console} idx } diff --git a/R/bmerge.R b/R/bmerge.R index 3d6ab028f3..6bafd0e5bc 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -43,23 +43,25 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos xc = xcols[a] xclass = getClass(x[[xc]]) iclass = getClass(i[[ic]]) + xname = paste0("x.", names(x)[xc]) + iname = paste0("i.", names(i)[ic]) if (!xclass %chin% supported) stop("x.", names(x)[xc]," is type ", xclass, " which is not supported by data.table join") if (!iclass %chin% supported) stop("i.", names(i)[ic]," is type ", iclass, " which is not supported by data.table join") if (xclass=="factor" || iclass=="factor") { if (roll!=0.0 && a==length(icols)) stop("Attempting roll join on factor column when joining x.",names(x)[xc]," to i.",names(i)[ic],". Only integer, double or character columns may be roll joined.") if (xclass=="factor" && iclass=="factor") { - if (verbose) cat("Matching i.",names(i)[ic]," factor levels to x.",names(x)[xc]," factor levels.\n",sep="") + if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values next } else { if (xclass=="character") { - if (verbose) cat("Coercing factor column i.",names(i)[ic]," to type character to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname) set(i, j=ic, value=val<-as.character(i[[ic]])) set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 next } else if (iclass=="character") { - if (verbose) cat("Matching character column i.",names(i)[ic]," to factor levels in x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L) if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 set(i, j=ic, value=newvalue) @@ -69,29 +71,29 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos stop("Incompatible join types: x.", names(x)[xc], " (",xclass,") and i.", names(i)[ic], " (",iclass,"). Factor columns must join to factor or character columns.") } if (xclass == iclass) { - if (verbose) cat("i.",names(i)[ic]," has same type (",xclass,") as x.",names(x)[xc],". No coercion needed.\n", sep="") + if (verbose) catf("%s has same type (%s) as %s. 
No coercion needed.\n", iname, xclass, xname) next } if (xclass=="character" || iclass=="character" || xclass=="logical" || iclass=="logical" || xclass=="factor" || iclass=="factor") { if (anyNA(i[[ic]]) && allNA(i[[ic]])) { - if (verbose) cat("Coercing all-NA i.",names(i)[ic]," (",iclass,") to type ",xclass," to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, iclass, xclass, xname) set(i, j=ic, value=match.fun(paste0("as.", xclass))(i[[ic]])) next } else if (anyNA(x[[xc]]) && allNA(x[[xc]])) { - if (verbose) cat("Coercing all-NA x.",names(x)[xc]," (",xclass,") to type ",iclass," to match type of i.",names(i)[ic],".\n",sep="") + if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, xclass, iclass, iname) set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]])) next } stop("Incompatible join types: x.", names(x)[xc], " (",xclass,") and i.", names(i)[ic], " (",iclass,")") } if (xclass=="integer64" || iclass=="integer64") { - nm = paste0(c("i.","x."), c(names(i)[ic], names(x)[xc])) + nm = c(iname, xname) if (xclass=="integer64") { w=i; wc=ic; wclass=iclass; } else { w=x; wc=xc; wclass=xclass; nm=rev(nm) } # w is which to coerce if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) { - if (verbose) cat("Coercing ",wclass," column ", nm[1L], if(wclass=="double")" (which contains no fractions)"," to type integer64 to match type of ", nm[2L],".\n",sep="") + if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L]) set(w, j=wc, value=bit64::as.integer64(w[[wc]])) } else stop("Incompatible join types: ", nm[2L], " is type integer64 but ", nm[1L], " is type double and contains fractions") } else { @@ -100,17 +102,17 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (!isReallyReal(i[[ic]])) { # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys # we've always coerced to int and returned int, for convenience. - if (verbose) cat("Coercing double column i.",names(i)[ic]," (which contains no fractions) to type integer to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s", iname, xname) val = as.integer(i[[ic]]) if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679 set(i, j=ic, value=val) set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too. } else { - if (verbose) cat("Coercing integer column x.",names(x)[xc]," to type double to match type of i.",names(i)[ic]," which contains fractions.\n",sep="") + if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname) set(x, j=xc, value=as.double(x[[xc]])) } } else { - if (verbose) cat("Coercing integer column i.",names(i)[ic]," to type double for join to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname) set(i, j=ic, value=as.double(i[[ic]])) } } @@ -126,17 +128,17 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos # equi join. 
use existing key (#1825) or existing secondary index (#1439) if (identical(xcols, head(chmatch(key(x), names(x)), length(xcols)))) { xo = integer(0L) - if (verbose) cat("on= matches existing key, using key\n") + if (verbose) catf("on= matches existing key, using key\n") } else { xo = NULL if (isTRUE(getOption("datatable.use.index"))) { xo = getindex(x, names(x)[xcols]) - if (verbose && !is.null(xo)) cat("on= matches existing index, using index\n") + if (verbose && !is.null(xo)) catf("on= matches existing index, using index\n") } if (is.null(xo)) { if (verbose) {last.started.at=proc.time(); flush.console()} xo = forderv(x, by = xcols) - if (verbose) {cat("Calculated ad hoc index in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("Calculated ad hoc index in %s\n", timetaken(last.started.at)); flush.console()} # TODO: use setindex() instead, so it's cached for future reuse } } @@ -147,9 +149,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos # non-equi operators present.. investigate groups.. nqgrp = integer(0L) nqmaxgrp = 1L - if (verbose) cat("Non-equi join operators detected ... \n") + if (verbose) catf("Non-equi join operators detected ... \n") if (roll != FALSE) stop("roll is not implemented for non-equi joins yet.") - if (verbose) {last.started.at=proc.time();cat(" forder took ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()} # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well! xo = forderv(x, xcols, retGrp=TRUE) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} @@ -158,28 +160,28 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (length(resetcols)) { # TODO: can we get around having to reorder twice here? # or at least reuse previous order? - if (verbose) {last.started.at=proc.time();cat(" Generating group lengths ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" Generating group lengths ... ");flush.console()} resetlen = attr(forderv(x, resetcols, retGrp=TRUE), 'starts', exact=TRUE) resetlen = .Call(Cuniqlengths, resetlen, nrow(x)) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} } else resetlen = integer(0L) - if (verbose) {last.started.at=proc.time();cat(" Generating non-equi group ids ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" Generating non-equi group ids ... ");flush.console()} nqgrp = .Call(Cnestedid, x, xcols[non_equi:length(xcols)], xo, xg, resetlen, mult) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf. if (nqmaxgrp > 1L) { # got some non-equi join work to do if ("_nqgrp_" %in% names(x)) stop("Column name '_nqgrp_' is reserved for non-equi joins.") - if (verbose) {last.started.at=proc.time();cat(" Recomputing forder with non-equi ids ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... 
");flush.console()} set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp) xo = forderv(nqx, c(ncol(nqx), xcols)) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} } else nqgrp = integer(0L) - if (verbose) cat(" Found", nqmaxgrp, "non-equi group(s) ...\n") + if (verbose) catf(" Found %d non-equi group(s) ...\n", nqmaxgrp) } - if (verbose) {last.started.at=proc.time();cat("Starting bmerge ...\n");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()} ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) - if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()} # TO DO: xo could be moved inside Cbmerge ans$xo = xo # for further use by [.data.table diff --git a/R/cedta.R b/R/cedta.R index 181ad542e5..d3a90e93cc 100644 --- a/R/cedta.R +++ b/R/cedta.R @@ -40,7 +40,7 @@ cedta = function(n=2L) { tryCatch("data.table" %chin% get(".Depends",paste("package",nsname,sep=":"),inherits=FALSE),error=function(e)FALSE) # both ns$.Depends and get(.Depends,ns) are not sufficient if (!ans && getOption("datatable.verbose")) { # nocov start - cat("cedta decided '",nsname,"' wasn't data.table aware. Here is call stack with [[1L]] applied:\n",sep="") + catf("cedta decided '%s' wasn't data.table aware. Here is call stack with [[1L]] applied:\n", nsname) print(sapply(sys.calls(), "[[", 1L)) # nocov end # so we can trace the namespace name that may need to be added (very unusually) diff --git a/R/data.table.R b/R/data.table.R index 88071b99a7..15d067eb19 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -419,9 +419,11 @@ replace_dot_alias = function(e) { len_common_names = length(common_names) if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables") if (verbose) { - which_cols_msg = if (len_common_names == length(x)) " all 'x' columns" - else paste(":", brackify(common_names)) - cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "") + which_cols_msg = if (len_common_names == length(x)) { + catf("Joining but 'x' has no key, natural join using all 'x' columns") + } else { + catf("Joining but 'x' has no key, natural join using: %s", brackify(common_names)) + } } on = common_names } @@ -449,10 +451,10 @@ replace_dot_alias = function(e) { # Implementation for not-join along with by=.EACHI, #604 if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571 notjoin = FALSE - if (verbose) {last.started.at=proc.time();cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()} + if (verbose) {last.started.at=proc.time();catf("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()} orignames = copy(names(i)) i = setdiff_(x, i, rightcols, leftcols) # part of #547 - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} setnames(i, orignames[leftcols]) setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted } @@ -480,7 +482,7 @@ replace_dot_alias = function(e) { if (!byjoin || nqbyjoin) { # Really, `anyDuplicated` in base is AWESOME! 
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates - if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} irows = if (allLen1) f__ else vecseq(f__,len__, if (allow.cartesian || notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). @@ -494,7 +496,7 @@ replace_dot_alias = function(e) { if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] } else { if (length(xo) && missing(on)) - stop("Internal error. Cannot by=.EACHI when joining to a secondary key, yet") # nocov + stop("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov # since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too. if (length(irows)) stop("Internal error. irows has length in by=.EACHI") # nocov @@ -519,7 +521,7 @@ replace_dot_alias = function(e) { if (length(xo) && length(irows)) { irows = xo[irows] # TO DO: fsort here? if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit. - if (verbose) {last.started.at=proc.time();cat("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()} irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]] if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } @@ -531,13 +533,13 @@ replace_dot_alias = function(e) { ## restore original order. This is a very expensive operation. ## benchmarks have shown that starting with 1e6 irows, a tweak can significantly reduce time ## (see #2366) - if (verbose) {last.started.at=proc.time()[3L];cat("Reordering", length(irows), "rows after bmerge done in ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Reordering %d rows after bmerge done in ... ", length(irows));flush.console()} if(length(irows) < 1e6){ irows = fsort(irows, internal=TRUE) ## internally, fsort on integer falls back to forderv } else { irows = as.integer(fsort(as.numeric(irows))) ## nocov; parallelized for numeric, but overhead of type conversion } - if (verbose) {cat(round(proc.time()[3L]-last.started.at,3L),"secs\n");flush.console()} + if (verbose) {cat(timetaken(last.started.at), "\n");flush.console()} } ## make sure, all columns are taken from x and not from i. ## This is done by simply telling data.table to continue as if there was a simple subset @@ -588,9 +590,9 @@ replace_dot_alias = function(e) { if (notjoin) { if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov irows = irows[irows!=0L] - if (verbose) {last.started.at=proc.time()[3L];cat("Inverting irows for notjoin done in ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()} i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) - if (verbose) cat(round(proc.time()[3L]-last.started.at, 3L), "sec\n") + if (verbose) cat(timetaken(last.started.at), "\n") leftcols = integer() # proceed as if row subset from now on, length(leftcols) is switched on later rightcols = integer() # Doing this once here, helps speed later when repeatedly subsetting each column. 
R's [irows] would do this for each @@ -776,7 +778,7 @@ replace_dot_alias = function(e) { if (!is.na(w)) { byindex = indices(x)[w] if (!length(getindex(x, byindex))) { - if (verbose) cat("by index '", byindex, "' but that index has 0 length. Ignoring.\n", sep="") + if (verbose) catf("by index '%s' but that index has 0 length. Ignoring.\n", byindex) byindex=FALSE } } @@ -799,10 +801,10 @@ replace_dot_alias = function(e) { # TO DO: Make xss directly, rather than recursive call. if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### - if (verbose) cat("i clause present and columns used in by detected, only these subset:",paste(allbyvars,collapse=","),"\n") + if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars)) xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] } else { - if (verbose) cat("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '",deparse(by),"'\n",sep="") + if (verbose) catf("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '%s'\n", deparse(by)) xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] } if (bysub %iscall% ':' && length(bysub)==3L) { @@ -860,8 +862,7 @@ replace_dot_alias = function(e) { if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { bynames[jj] = deparse(bysubl[[jj+1L]]) if (verbose) - cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, - "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") + catf("by-expression '%s' is not named, and the auto-generated name '%s' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", bynames[jj], tt) } else bynames[jj] = tt # if user doesn't like this inferred name, user has to use by=list() to name the column @@ -1014,7 +1015,7 @@ replace_dot_alias = function(e) { } non_sdvars = setdiff(ansvars, sdvars) ansvals = chmatch(ansvars, names_x) - if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars))) + if (verbose) catf("New ansvars: %s \n", brackify(ansvars)) } else if (length(non_sdvars)) { # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., @@ -1026,7 +1027,7 @@ replace_dot_alias = function(e) { if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.") allcols = c(names_x, xdotprefix, names_i, idotprefix) ansvars = sdvars = setdiff(intersect(av, allcols), bynames) - if (verbose) cat("Detected that j uses these columns:",if (!length(ansvars)) "" else paste(ansvars,collapse=","),"\n") + if (verbose) catf("Detected that j uses these columns: %s\n",if (!length(ansvars)) "" else brackify(ansvars)) # using a few named columns will be faster # Consider: DT[,max(diff(date)),by=list(month=month(date))] # and: DT[,lapply(.SD,sum),by=month(date)] @@ -1088,8 +1089,7 @@ replace_dot_alias = function(e) { # fix errors in their RHS when called on empty edge cases, even when the result won't be # used anyway (so it would be annoying to have to fix it.) if (verbose) { - cat("No rows match i. 
No new columns to add so not evaluating RHS of :=\n") - cat("Assigning to 0 row subset of",nrow(x),"rows\n") + catf("No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows\n", nrow(x)) } .Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated .global$print = address(x) @@ -1111,9 +1111,9 @@ replace_dot_alias = function(e) { # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). name = substitute(x) if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) - cat("Growing vector of column pointers from truelength ", truelength(x), " to ", n, ". A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n") + catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) # #1729 -- copying to the wrong environment here can cause some confusion - if (ok == -1L) cat("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") + if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") # Verbosity should not issue warnings, so cat rather than warning. # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. @@ -1384,7 +1384,7 @@ replace_dot_alias = function(e) { SDenv$`-.POSIXt` = function(e1, e2) { if (inherits(e2, 'POSIXt')) { if (verbose && !exists('done_units_report', parent.frame())) { - cat('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units') + catf('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units\n') assign('done_units_report', TRUE, parent.frame()) } return(difftime(e1, e2, units='secs')) @@ -1421,7 +1421,7 @@ replace_dot_alias = function(e) { if (length(byval) && length(byval[[1L]])) { if (!bysameorder && isFALSE(byindex)) { - if (verbose) {last.started.at=proc.time();cat("Finding groups using forderv ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Finding groups using forderv ... ");flush.console()} o__ = forderv(byval, sort=keyby, retGrp=TRUE) # The sort= argument is called sortGroups at C level. It's primarily for saving the sort of unique strings at # C level for efficiency when by= not keyby=. Other types also retain appearance order, but at byte level to @@ -1435,7 +1435,7 @@ replace_dot_alias = function(e) { if (verbose) { cat(timetaken(last.started.at),"\n") last.started.at=proc.time() - cat("Finding group sizes from the positions (can be avoided to save RAM) ... ") + catf("Finding group sizes from the positions (can be avoided to save RAM) ... 
") flush.console() # for windows } f__ = attr(o__, "starts", exact=TRUE) @@ -1443,7 +1443,7 @@ replace_dot_alias = function(e) { if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} if (!bysameorder && !keyby) { # TO DO: lower this into forder.c - if (verbose) {last.started.at=proc.time();cat("Getting back original order ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Getting back original order ... ");flush.console()} firstofeachgroup = o__[f__] if (length(origorder <- forderv(firstofeachgroup))) { f__ = f__[origorder] @@ -1455,11 +1455,11 @@ replace_dot_alias = function(e) { } else { if (verbose) last.started.at=proc.time(); if (bysameorder) { - if (verbose) {cat("Finding groups using uniqlist on key ... ");flush.console()} + if (verbose) {catf("Finding groups using uniqlist on key ... ");flush.console()} f__ = uniqlist(byval) } else { if (!is.character(byindex) || length(byindex)!=1L) stop("Internal error: byindex not the index name") # nocov - if (verbose) {cat("Finding groups using uniqlist on index '", byindex, "' ... ", sep="");flush.console()} + if (verbose) {catf("Finding groups using uniqlist on index '%s' ... ", byindex);flush.console()} o__ = getindex(x, byindex) if (is.null(o__)) stop("Internal error: byindex not found") # nocov f__ = uniqlist(byval, order=o__) @@ -1467,7 +1467,7 @@ replace_dot_alias = function(e) { if (verbose) { cat(timetaken(last.started.at),"\n") last.started.at=proc.time() - cat("Finding group sizes from the positions (can be avoided to save RAM) ... ") + catf("Finding group sizes from the positions (can be avoided to save RAM) ... ") flush.console() # for windows } len__ = uniqlengths(f__, xnrow) @@ -1658,9 +1658,9 @@ replace_dot_alias = function(e) { } if (verbose) { if (!identical(oldjsub, jsub)) - cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") + catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) else - cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") + catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) } dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. 
Although joins could work with @@ -1670,7 +1670,7 @@ replace_dot_alias = function(e) { GForce = FALSE if ( (is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N") ) { GForce = TRUE - if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") + if (verbose) catf("GForce optimized j to '%s'\n",deparse(jsub, width.cutoff=200L, nlines=1L)) } } else { # Apply GForce @@ -1704,8 +1704,8 @@ replace_dot_alias = function(e) { jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } - if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") - } else if (verbose) cat("GForce is on, left j unchanged\n"); + if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } else if (verbose) catf("GForce is on, left j unchanged\n"); } } if (!GForce && !is.name(jsub)) { @@ -1728,9 +1728,9 @@ replace_dot_alias = function(e) { } if (verbose) { if (!identical(oldjsub, jsub)) - cat("Old mean optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") + catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) else - cat("Old mean optimization is on, left j unchanged.\n") + catf("Old mean optimization is on, left j unchanged.\n") } assign("Cfastmean", Cfastmean, SDenv) # Old comments still here for now ... @@ -1740,8 +1740,8 @@ replace_dot_alias = function(e) { # when fastmean can do trim. } } else if (verbose) { - if (getOption("datatable.optimize")<1L) cat("All optimizations are turned off\n") - else cat("Optimization is on but left j unchanged (single plain symbol): '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") + if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") + else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } if (byjoin) { groups = i @@ -1770,7 +1770,7 @@ replace_dot_alias = function(e) { # for consistency of empty case in test 184 f__=len__=0L } - if (verbose) {last.started.at=proc.time();cat("Making each group and running j (GForce ",GForce,") ... ",sep="");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Making each group and running j (GForce %s) ... ", GForce);flush.console()} if (GForce) { thisEnv = new.env() # not parent=parent.frame() so that gsum is found for (ii in ansvars) assign(ii, x[[ii]], thisEnv) @@ -1816,7 +1816,7 @@ replace_dot_alias = function(e) { cnames = as.character(bysubl)[-1L] cnames = gsub('^`|`$', '', cnames) # the wrapping backticks that were added above can be removed now, #3378 if (all(cnames %chin% names_x)) { - if (verbose) {last.started.at=proc.time();cat("setkey() after the := with keyby= ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("setkey() after the := with keyby= ... ");flush.console()} setkeyv(x,cnames) # TO DO: setkey before grouping to get memcpy benefit. if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } @@ -1843,7 +1843,7 @@ replace_dot_alias = function(e) { setnames(ans,seq_along(bynames),bynames) # TO DO: reinvestigate bynames flowing from dogroups here and simplify } if (byjoin && keyby && !bysameorder) { - if (verbose) {last.started.at=proc.time();cat("setkey() afterwards for keyby=.EACHI ... 
");flush.console()} + if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()} setkeyv(ans,names(ans)[seq_along(byval)]) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } else if (keyby || (haskey(x) && bysameorder && (byjoin || (length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))))))) { @@ -2347,7 +2347,7 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by) if (join) dtq[["on"]] = if (flatten) by else .by dtq = as.call(dtq) - if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="") + if (isTRUE(verbose)) catf("Processing split.data.table with: %s\n", deparse(dtq, width.cutoff=500L)) tmp = eval(dtq) # add names on list setattr(ll <- tmp$.ll.tech.split, @@ -2979,7 +2979,7 @@ isReallyReal = function(x) { ## convert i to data.table with all combinations in rows. if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){ ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635 - if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} + if (verbose) {catf("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} return(NULL) } ## Care is needed with names as we construct i @@ -2992,15 +2992,15 @@ isReallyReal = function(x) { i = do.call(CJ, i) setnames(i, colNames) idx = NULL - if(is.null(idx)){ - ## check whether key fits the columns in i. - ## order of key columns makes no difference, as long as they are all upfront in the key, I believe. - key_head = head(key(x), length(i)) - if (all(names(i) %chin% key_head)){ - if (verbose) {cat("Optimized subsetting with key '", brackify(key_head),"'\n",sep="");flush.console()} - idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly. - idxCols = key_head ## in correct order! - } + if (is.null(idx)) { + ## check whether key fits the columns in i. + ## order of key columns makes no difference, as long as they are all upfront in the key, I believe. + key_head = head(key(x), length(i)) + if (all(names(i) %chin% key_head)) { + if (verbose) {catf("Optimized subsetting with key %s", brackify(key_head)); flush.console()} + idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly. + idxCols = key_head ## in correct order! + } } if (is.null(idx)){ if (!getOption("datatable.use.index")) return(NULL) # #1422 @@ -3016,17 +3016,17 @@ isReallyReal = function(x) { } } if (!is.null(idx)){ - if (verbose) {cat("Optimized subsetting with index '", paste0( idxCols, collapse = "__"),"'\n",sep="");flush.console()} + if (verbose) {catf("Optimized subsetting with index '%s'\n", paste0( idxCols, collapse = "__"));flush.console()} } } if (is.null(idx)){ ## if nothing else helped, auto create a new index that can be used if (!getOption("datatable.auto.index")) return(NULL) - if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} - if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... 
");flush.console()} + if (verbose) {catf("Creating new index '%s'\n", paste0(names(i), collapse = "__"));flush.console()} + if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste0(names(i), collapse = "__"));flush.console()} setindexv(x, names(i)) if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} - if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} + if (verbose) {catf("Optimized subsetting with index '%s'\n", paste0(names(i), collapse = "__"));flush.console()} idx = attr(attr(x, "index", exact=TRUE), paste0("__", names(i), collapse = ""), exact=TRUE) idxCols = names(i) } diff --git a/R/devel.R b/R/devel.R index b89d1af3aa..1da19b7c98 100644 --- a/R/devel.R +++ b/R/devel.R @@ -28,8 +28,8 @@ update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i # get Revision field from remote repository PACKAGES file una = is.na(ups<-dcf.repo(pkg, repo, field, type)) if (una) - cat(sprintf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. Otherwise package will be re-installed every time, proceeding to installation.\n", - pkg, field, contrib.url(repo, type=type))) + catf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. Otherwise package will be re-installed every time, proceeding to installation.\n", + pkg, field, contrib.url(repo, type=type)) # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) # update.dev.pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 @@ -50,7 +50,7 @@ update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i .git = function(quiet=FALSE, lib.loc=NULL) { ans = unname(read.dcf(system.file("DESCRIPTION", package="data.table", lib.loc=lib.loc, mustWork=TRUE), fields="Revision")[, "Revision"]) if (!quiet && is.na(ans)) - cat("Git revision is not available. Most likely data.table was installed from CRAN or local archive.\nGit revision is available when installing from our repositories 'https://Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data.table'.\n") + catf("Git revision is not available. Most likely data.table was installed from CRAN or local archive.\nGit revision is available when installing from our repositories 'https://Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data.table'.\n") ans } diff --git a/R/fmelt.R b/R/fmelt.R index 936876b4d8..362a21695a 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -200,7 +200,7 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl as.logical(verbose)) setDT(ans) if (any(duplicated(names(ans)))) { - cat("Duplicate column names found in molten data.table. Setting unique names using 'make.names'\n") + catf("Duplicate column names found in molten data.table. 
Setting unique names using 'make.names'\n") setnames(ans, make.unique(names(ans))) } setattr(ans, 'sorted', NULL) diff --git a/R/foverlaps.R b/R/foverlaps.R index 8028482abb..fc0b706ccd 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -128,7 +128,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k end = yintervals[2L], any =, within =, equal = yintervals) call = construct(head(ynames, -2L), uycols, type) - if (verbose) {last.started.at=proc.time();cat("unique() + setkey() operations done in ...");flush.console()} + if (verbose) {last.started.at=proc.time();catf("unique() + setkey() operations done in ...");flush.console()} uy = unique(y[, eval(call)]) # this started to fail from R 4.1 due to c(POSIXct, numeric) setkey(uy)[, `:=`(lookup = list(list(integer(0L))), type_lookup = list(list(integer(0L))), count=0L, type_count=0L)] if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} @@ -154,7 +154,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k .Call(Clookup, uy, nrow(y), indices(uy, y, yintervals, nomatch=0L, roll=roll), maxgap, minoverlap, mult, type, verbose) if (maxgap == 0L && minoverlap == 1L) { # iintervals = tail(names(x), 2L) # iintervals not yet used so commented out for now - if (verbose) {last.started.at=proc.time();cat("binary search(es) done in ...");flush.console()} + if (verbose) {last.started.at=proc.time();catf("binary search(es) done in ...");flush.console()} xmatches = indices(uy, x, xintervals, nomatch=0L, roll=roll) if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} olaps = .Call(Coverlaps, uy, xmatches, mult, type, nomatch, verbose) diff --git a/R/fread.R b/R/fread.R index c03e1299b4..a36cbfda28 100644 --- a/R/fread.R +++ b/R/fread.R @@ -194,7 +194,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") yaml_header = yaml::yaml.load(yaml_string) yaml_names = names(yaml_header) - if (verbose) cat('Processed', n_read, 'lines of YAML metadata with the following top-level fields:', brackify(yaml_names), '\n') + if (verbose) catf('Processed %d lines of YAML metadata with the following top-level fields: %s\n', n_read, brackify(yaml_names)) # process header first since it impacts how to handle colClasses if ('header' %chin% yaml_names) { if ('header' %chin% call_args) message("User-supplied 'header' will override that found in metadata.") @@ -326,7 +326,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } else { cols_to_factor = which(vapply_1b(ans, is.character)) } - if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="") + if (verbose) catf("stringsAsFactors=%s converted %d column(s): %s\n", stringsAsFactors, length(cols_to_factor), brackify(names(ans)[cols_to_factor])) for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j))) } diff --git a/R/fwrite.R b/R/fwrite.R index e8bc0f3121..8325f137d3 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -63,7 +63,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", file = path.expand(file) # "~/foo/bar" if (append && (file=="" || file.exists(file))) { if (missing(col.names)) col.names = FALSE - if (verbose) cat("Appending to existing file so setting bom=FALSE and yaml=FALSE\n") + if (verbose) catf("Appending to existing file so setting bom=FALSE and yaml=FALSE\n") bom = FALSE yaml = FALSE } diff --git a/R/last.R b/R/last.R index fe6763b7d5..8dff3271a1 100644 --- a/R/last.R +++ b/R/last.R @@ -7,12 
+7,12 @@ last = function(x, n=1L, ...) { if (nargs()>1L) { if ("package:xts" %chin% search()) { if (verbose) - cat("last: using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "last", "xts::last", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()") xts::last(x, n=n, ...) } else { # nocov start if (verbose) - cat("last: using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()") utils::tail(x, n=n, ...) # nocov end } @@ -20,16 +20,16 @@ last = function(x, n=1L, ...) { dx = dim(x) if (is.null(dx)) { if (verbose) - cat("last: using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))\n") + catf("%s: using %s: %s\n", "last", "'x[[length(x)]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))") lx = length(x) if (!lx) x else x[[lx]] } else if (is.data.frame(x)) { if (verbose) - cat("last: using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)\n") + catf("%s: using %s: %s\n", "last", "'x[nrow(x),]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)") x[dx[1L], , drop=FALSE] } else { if (verbose) - cat("last: using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)\n") + catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") utils::tail(x, n=n, ...) } } @@ -37,7 +37,7 @@ last = function(x, n=1L, ...) { if (!requireNamespace("xts", quietly=TRUE)) stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last")) # nocov if (verbose) - cat("last: using xts::last: is.xts(x)\n") + catf("%s: using %s: %s\n", "last", "xts::last", "is.xts(x)") xts::last(x, n=n, ...) } } @@ -48,12 +48,12 @@ first = function(x, n=1L, ...) { if (nargs()>1L) { if ("package:xts" %chin% search()) { if (verbose) - cat("first: using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "first", "xts::first", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()") xts::first(x, n=n, ...) } else { # nocov start if (verbose) - cat("first: using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()") utils::head(x, n=n, ...) # nocov end } @@ -61,16 +61,16 @@ first = function(x, n=1L, ...) { dx = dim(x) if (is.null(dx)) { if (verbose) - cat("first: using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))\n") + catf("%s: using %s: %s\n", "first", "'x[[1L]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))") lx = length(x) if (!lx) x else x[[1L]] } else if (is.data.frame(x)) { if (verbose) - cat("first: using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)\n") + catf("%s: using %s: %s\n", "first", "'x[1L,]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)") if (!dx[1L]) x else x[1L, , drop=FALSE] } else { if (verbose) - cat("first: using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)\n") + catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") utils::head(x, n=n, ...) } } @@ -78,7 +78,7 @@ first = function(x, n=1L, ...) 
{ if (!requireNamespace("xts", quietly=TRUE)) stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first")) # nocov if (verbose) - cat("first: using xts::first: is.xts(x)\n") + catf("%s: using %s: %s\n", "first", "xts::first", "is.xts(x)") xts::first(x, n=n, ...) } } diff --git a/R/print.data.table.R b/R/print.data.table.R index b1d5cbad50..4f2ab7bf0e 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -43,21 +43,21 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), if (!is.numeric(topn)) topn = 5L topnmiss = missing(topn) topn = max(as.integer(topn),1L) - if (print.keys){ + if (print.keys) { if (!is.null(ky <- key(x))) - cat("Key: <", toString(ky), ">\n", sep="") + catf("Key: <%s>\n", toString(ky)) if (!is.null(ixs <- indices(x))) cat(sprintf( - ngettext(length(ixs), "Index: %s\n", "Indices: %s\n", domain="R-data.table"), + ngettext(length(ixs), "Index: %s\n", "Indices: %s\n"), paste0("<", ixs, ">", collapse = ", ") )) } if (any(dim(x)==0L)) { class = if (is.data.table(x)) "table" else "frame" # a data.frame could be passed to print.data.table() directly, #3363 if (all(dim(x)==0L)) { - cat("Null data.",class," (0 rows and 0 cols)\n", sep="") # See FAQ 2.5 and NEWS item in v1.8.9 + catf("Null data.%s (0 rows and 0 cols)\n", class) # See FAQ 2.5 and NEWS item in v1.8.9 } else { - cat("Empty data.",class," (", dim(x)[1L], " rows and ",length(x)," cols)", sep="") + catf("Empty data.%s (%d rows and %d cols)", class, NROW(x), NCOL(x)) if (length(x)>0L) cat(": ",paste(head(names(x),6L),collapse=","),if(length(x)>6L)"...",sep="") cat("\n") } @@ -192,7 +192,7 @@ shouldPrint = function(x) { # for removing the head (column names) of matrix output entirely, # as opposed to printing a blank line, for excluding col.names per PR #1483 -cut_top = function(x) cat(capture.output(x)[-1L], sep = '\n') +cut_top = function(x) writeLines(capture.output(x)[-1L]) # for printing the dims for list columns #3671; used by format.data.table() paste_dims = function(x) { diff --git a/R/setkey.R b/R/setkey.R index 95cf4288d1..b9b324ac4c 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -88,12 +88,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (verbose) { tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode - cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) } else { o = forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { - if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="") + if (verbose) catf("setkey on columns %s using existing index '%s'\n", brackify(cols), newkey) o = getindex(x, newkey) } if (!physical) { @@ -105,9 +105,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (length(o)) { if (verbose) { last.started.at = proc.time() } .Call(Creorder,x,o) - if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } + if (verbose) { catf("reorder took %s\n", timetaken(last.started.at)); flush.console() } } else { - if (verbose) cat("x is already ordered by these columns, no need to call reorder\n") + if (verbose) catf("x is already ordered by these columns, no need to call reorder\n") } # else 
empty integer() from forderv means x is already ordered by those cols, nothing to do. setattr(x,"sorted",cols) invisible(x) diff --git a/R/tables.R b/R/tables.R index bcfab0c674..b94441c626 100644 --- a/R/tables.R +++ b/R/tables.R @@ -8,7 +8,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, all_obj = objects(envir=env, all.names=TRUE) is_DT = which(vapply_1b(all_obj, function(x) is.data.table(get(x, envir=env)))) if (!length(is_DT)) { - if (!silent) cat("No objects of class data.table exist in", if (identical(env,.GlobalEnv)) ".GlobalEnv" else format(env), "\n") + if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } DT_names = all_obj[is_DT] @@ -36,7 +36,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, tt[ , NCOL := pretty_format(NCOL, width=4L)] if (mb) tt[ , MB := pretty_format(MB, width=2L)] print(tt, class=FALSE, nrows=Inf) - if (mb) cat("Total: ", prettyNum(sum(info$MB), big.mark=","), "MB\n", sep="") + if (mb) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=",")) } invisible(info) } diff --git a/R/test.data.table.R b/R/test.data.table.R index 6736714923..ef834e8438 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -92,7 +92,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F cat("getDTthreads(verbose=TRUE):\n") # for tracing on CRAN; output to log before anything is attempted getDTthreads(verbose=TRUE) # includes the returned value in the verbose output (rather than dangling '[1] 4'); e.g. "data.table is using 4 threads" - cat("test.data.table() running:", fn, "\n") # print fn to log before attempting anything on it (in case it is missing); on same line for slightly easier grep + catf("test.data.table() running: %s\n", fn) # print fn to log before attempting anything on it (in case it is missing); on same line for slightly easier grep env = new.env(parent=.GlobalEnv) assign("testDir", function(x) file.path(fulldir, x), envir=env) @@ -101,8 +101,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F foreign = txt != "object 'not__exist__' not found" if (foreign) { # nocov start - cat("\n**** This R session's language is not English. Each test will still check that the correct number of errors and/or\n", - "**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en\n\n", sep="") + catf("\n**** This R session's language is not English. Each test will still check that the correct number of errors and/or\n**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en\n\n") # nocov end } assign("foreign", foreign, envir=env) @@ -162,8 +161,14 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ntest = env$ntest if (nfail > 0L) { # nocov start - if (nfail > 1L) {s1="s";s2="s: "} else {s1="";s2=" "} - stop(nfail," error",s1," out of ",ntest,". Search ",names(fn)," for test number",s2,toString(env$whichfail),".") + # domain=NA since it's already translated by then + stop(domain = NA, sprintf( + ngettext( + nfail, + "%d error out of %d. Search %s for test number %s", + "%d errors out of %d. 
Search %s for test numbers %s" + ), nfail, ntest, names(fn), paste(env$whichfail, collapse=", ") + )) # important to stop() here, so that 'R CMD check' fails # nocov end } @@ -174,10 +179,10 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if ((x<-sum(timings[["nTest"]])) != ntest) { warning("Timings count mismatch:",x,"vs",ntest) # nocov } - cat("10 longest running tests took ", as.integer(tt<-DT[, sum(time)]), "s (", as.integer(100*tt/(ss<-timings[,sum(time)])), "% of ", as.integer(ss), "s)\n", sep="") + catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) print(DT, class=FALSE) - cat("All ",ntest," tests (last ",env$prevtest,") in ",names(fn)," completed ok in ",timetaken(env$started.at),"\n",sep="") + catf("All %d tests (last %s) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) ## this chunk requires to include new suggested deps: graphics, grDevices #memtest.plot = function(.inittime) { @@ -284,7 +289,8 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ] } ) if (showProgress) - cat("\rRunning test id", numStr, " ") # nocov. + # \r can't be in gettextf msg + cat("\rRunning test id", numStr, " ") # nocov. # See PR #4090 for comments about change here in Dec 2019. # If a segfault error occurs in future and we'd like to know after which test, then arrange for the # try(sys.source()) in test.data.table() to be run in a separate R process. That process could write out @@ -341,7 +347,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no if (.test.data.table) { if (num>\n",sep="") # \n printed as '\\n' so the two lines of output can be compared vertically - cat("Observed: <<",encodeString(out),">>\n",sep="") + catf("Test %s did not produce correct output:\n", numStr) + catf("Expected: <<%s>>\n", encodeString(output)) # \n printed as '\\n' so the two lines of output can be compared vertically + catf("Observed: <<%s>>\n", encodeString(out)) fail = TRUE # nocov end } if (length(notOutput) && string_match(notOutput, out, ignore.case=TRUE)) { # nocov start - cat("Test",numStr,"produced output but should not have:\n") - cat("Expected absent (case insensitive): <<",encodeString(notOutput),">>\n",sep="") - cat("Observed: <<",encodeString(out),">>\n",sep="") + catf("Test %s produced output but should not have:\n", numStr) + catf("Expected absent (case insensitive): <<%s>>\n", encodeString(notOutput)) + catf("Observed: <<%s>>\n", encodeString(out)) fail = TRUE # nocov end } @@ -411,7 +413,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no if (is.data.table(x) && is.data.table(y)) { if (!selfrefok(x) || !selfrefok(y)) { # nocov start - cat("Test ",numStr," ran without errors but selfrefok(", if(!selfrefok(x))"x"else"y", ") is FALSE\n", sep="") + catf("Test %s ran without errors but selfrefok(%s) is FALSE\n", numStr, if (selfrefok(x)) "y" else "x") fail = TRUE # nocov end } else { @@ -434,12 +436,12 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # For test 617 on r-prerel-solaris-sparc on 7 Mar 2013 # nocov start if (!fail) { - cat("Test", numStr, "ran without errors but failed check that x equals y:\n") + catf("Test %s ran without errors but failed check that x equals y:\n", numStr) failPrint = function(x, xsub) { 
cat(">", substitute(x), "=", xsub, "\n") if (is.data.table(x)) compactprint(x) else { nn = length(x) - cat(sprintf("First %d of %d (type '%s'): \n", min(nn, 6L), length(x), typeof(x))) + catf("First %d of %d (type '%s'): \n", min(nn, 6L), length(x), typeof(x)) # head.matrix doesn't restrict columns if (length(d <- dim(x))) do.call(`[`, c(list(x, drop = FALSE), lapply(pmin(d, 6L), seq_len))) else print(head(x)) diff --git a/R/utils.R b/R/utils.R index 45678f5a4d..ecffb64226 100644 --- a/R/utils.R +++ b/R/utils.R @@ -145,3 +145,8 @@ edit.data.table = function(name, ...) { setDT(NextMethod('edit', name))[] } # nocov end + +catf = function(fmt, ...) { + cat(gettextf(fmt, ...)) +} + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d691ed8f76..f17961e760 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13854,7 +13854,7 @@ test(1967.75, x[!y, sum(i4), on = 'i1', by = .EACHI, verbose = TRUE], data.table(i1 = c(169L, 369L), V1 = c(270L, 179L)), output = "not-join called with 'by=.EACHI'.*done") test(1967.76, x[!y, sum(i4), on = 'i1', verbose = TRUE], 510L, - output = 'Inverting irows for notjoin.*sec') + output = 'Inverting irows for notjoin.*[0-9]s') x[ , v := 0] ### hitting by = A:B branch test(1967.77, x[ , .(v = sum(v)), by = i1:i4], x[-10L]) diff --git a/po/R-data.table.pot b/po/R-data.table.pot index 8e6d641240..ad00f12772 100644 --- a/po/R-data.table.pot +++ b/po/R-data.table.pot @@ -106,6 +106,9 @@ msgstr "" msgid "trying to use integer64 class when 'bit64' package is not installed" msgstr "" +msgid "optimised between not available for this data type, fallback to slow R routine" +msgstr "" + msgid "Not yet implemented NAbounds=TRUE for this non-numeric and non-character type" msgstr "" @@ -130,57 +133,99 @@ msgstr "" msgid "the second element should be the upper bound(s)." msgstr "" -msgid "x." +msgid "forderv(query) took ..." +msgstr "" + +msgid "Generating final logical vector ..." +msgstr "" + +msgid "done in" +msgstr "" + +msgid "%s is type %s which is not supported by data.table join" +msgstr "" + +msgid "Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined." +msgstr "" + +msgid "Matching %s factor levels to %s factor levels." msgstr "" -msgid "is type" +msgid "Coercing factor column %s to type character to match type of %s." msgstr "" -msgid "which is not supported by data.table join" +msgid "Matching character column %s to factor levels in %s." msgstr "" -msgid "i." +msgid "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns." msgstr "" -msgid "Attempting roll join on factor column when joining x." +msgid "%s has same type (%s) as %s. No coercion needed." msgstr "" -msgid "to i." +msgid "Coercing all-NA %s (%s) to type %s to match type of %s." msgstr "" -msgid ". Only integer, double or character columns may be roll joined." +msgid "Incompatible join types: %s (%s) and %s (%s)" msgstr "" -msgid "Incompatible join types: x." +msgid "Coercing %s column %s%s to type integer64 to match type of %s." msgstr "" -msgid "(" +msgid "Incompatible join types: %s is type integer64 but %s is type double and contains fractions" msgstr "" -msgid ") and i." +msgid "Coercing double column %s (which contains no fractions) to type integer to match type of %s" msgstr "" -msgid "). Factor columns must join to factor or character columns." +msgid "Coercing integer column %s to type double to match type of %s which contains fractions." 
msgstr "" -msgid ")" +msgid "Coercing integer column %s to type double for join to match type of %s." msgstr "" -msgid "Incompatible join types:" +msgid "on= matches existing key, using key" msgstr "" -msgid "is type integer64 but" +msgid "on= matches existing index, using index" msgstr "" -msgid "is type double and contains fractions" +msgid "Calculated ad hoc index in %s" +msgstr "" + +msgid "Non-equi join operators detected ..." msgstr "" msgid "roll is not implemented for non-equi joins yet." msgstr "" +msgid "forder took ..." +msgstr "" + +msgid "Generating group lengths ..." +msgstr "" + +msgid "Generating non-equi group ids ..." +msgstr "" + msgid "Column name '_nqgrp_' is reserved for non-equi joins." msgstr "" +msgid "Recomputing forder with non-equi ids ..." +msgstr "" + +msgid "Found %d non-equi group(s) ..." +msgstr "" + +msgid "Starting bmerge ..." +msgstr "" + +msgid "bmerge done in" +msgstr "" + +msgid "cedta decided '%s' wasn't data.table aware. Here is call stack with [[1L]] applied:" +msgstr "" + msgid "key argument of data.table() must be character" msgstr "" @@ -322,12 +367,27 @@ msgstr "" msgid "Attempting to do natural join but no common columns in provided tables" msgstr "" -msgid "Internal error. Cannot by=.EACHI when joining to a secondary key, yet" +msgid "Joining but 'x' has no key, natural join using" +msgstr "" + +msgid "not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ..." +msgstr "" + +msgid "Constructing irows for '!byjoin || nqbyjoin' ..." +msgstr "" + +msgid "Internal error. Cannot by=.EACHI when joining to an index, yet" msgstr "" msgid "Internal error. irows has length in by=.EACHI" msgstr "" +msgid "Reorder irows for 'mult==\"all\" && !allGrp1' ..." +msgstr "" + +msgid "Reordering %d rows after bmerge done in ..." +msgstr "" + msgid "logical error. i is not a data.table, but 'on' argument is provided." msgstr "" @@ -349,6 +409,9 @@ msgstr "" msgid "Internal error: notjoin but byjoin or !integer or nomatch==NA" msgstr "" +msgid "Inverting irows for notjoin done in ..." +msgstr "" + msgid "with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning." msgstr "" @@ -385,9 +448,18 @@ msgstr "" msgid "but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities." msgstr "" +msgid "by index '%s' but that index has 0 length. Ignoring." +msgstr "" + msgid "Internal error: irows isn't integer" msgstr "" +msgid "i clause present and columns used in by detected, only these subset:" +msgstr "" + +msgid "i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '" +msgstr "" + msgid "'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval" msgstr "" @@ -409,6 +481,9 @@ msgstr "" msgid "The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided)." msgstr "" +msgid "by-expression '%s' is not named, and the auto-generated name '%s' clashed with variable(s) in j. Therefore assigning the entire by-expression as name." 
+msgstr "" + msgid "Internal error: drop_dot passed" msgstr "" @@ -457,6 +532,15 @@ msgstr "" msgid "This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table." msgstr "" +msgid "Detected that j uses these columns:" +msgstr "" + +msgid "'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld:" +msgstr "" + +msgid "New:" +msgstr "" + msgid ".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference." msgstr "" @@ -472,9 +556,18 @@ msgstr "" msgid "LHS of := isn't column names ('character') or positions ('integer' or 'numeric')" msgstr "" +msgid "No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows" +msgstr "" + msgid "Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved." msgstr "" +msgid "Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option." +msgstr "" + +msgid "Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected." +msgstr "" + msgid "Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length" msgstr "" @@ -517,24 +610,72 @@ msgstr "" msgid "The column '.I' can't be grouped because it conflicts with the special .I variable. Try setnames(DT,'.I','I') first." msgstr "" +msgid "Note: forcing units=\"secs\" on implicit difftime by group; call difftime explicitly to choose custom units" +msgstr "" + msgid "logical error. i is not data.table, but mult='all' and 'by'=.EACHI" msgstr "" msgid "Internal error: by= is missing" msgstr "" +msgid "Finding groups using forderv ..." +msgstr "" + +msgid "Finding group sizes from the positions (can be avoided to save RAM) ..." +msgstr "" + +msgid "Getting back original order ..." +msgstr "" + +msgid "Finding groups using uniqlist on key ..." +msgstr "" + msgid "Internal error: byindex not the index name" msgstr "" +msgid "Finding groups using uniqlist on index '%s' ..." +msgstr "" + msgid "Internal error: byindex not found" msgstr "" +msgid "lapply optimization changed j from '%s' to '%s'" +msgstr "" + +msgid "lapply optimization is on, j unchanged as '%s'" +msgstr "" + +msgid "GForce optimized j to '" +msgstr "" + +msgid "GForce is on, left j unchanged" +msgstr "" + msgid "Unable to optimize call to mean() and could be very slow. 
You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized." msgstr "" +msgid "Old mean optimization changed j from '%s' to '%s'" +msgstr "" + +msgid "Old mean optimization is on, left j unchanged." +msgstr "" + +msgid "All optimizations are turned off" +msgstr "" + +msgid "Optimization is on but left j unchanged (single plain symbol): '%s'" +msgstr "" + msgid "Internal error: length(irows)!=length(o__)" msgstr "" +msgid "Making each group and running j (GForce %s) ..." +msgstr "" + +msgid "setkey() after the := with keyby= ..." +msgstr "" + msgid "The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=." msgstr "" @@ -547,6 +688,9 @@ msgstr "" msgid "and bynames is" msgstr "" +msgid "setkey() afterwards for keyby=.EACHI ..." +msgstr "" + msgid "rownames and rownames.value cannot both be used at the same time" msgstr "" @@ -649,6 +793,9 @@ msgstr "" msgid "Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic:" msgstr "" +msgid "Processing split.data.table with:" +msgstr "" + msgid "x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table" msgstr "" @@ -820,6 +967,21 @@ msgstr "" msgid "Internal error in .isFastSubsettable. Please report to data.table developers" msgstr "" +msgid "Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems." +msgstr "" + +msgid "Optimized subsetting with key '" +msgstr "" + +msgid "Optimized subsetting with index '" +msgstr "" + +msgid "Creating new index '" +msgstr "" + +msgid "Creating index %s done in ..." +msgstr "" + msgid "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'." msgstr "" @@ -850,6 +1012,9 @@ msgstr "" msgid "There is no package %s in provided repository." msgstr "" +msgid "Git revision is not available. Most likely data.table was installed from CRAN or local archive.\nGit revision is available when installing from our repositories 'https://Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data.table'." +msgstr "" + msgid "'fromLast' must be TRUE or FALSE" msgstr "" @@ -949,6 +1114,9 @@ msgstr "" msgid "Please provide a name to each element of 'measure.vars'." msgstr "" +msgid "Duplicate column names found in molten data.table. Setting unique names using 'make.names'" +msgstr "" + msgid "y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying." msgstr "" @@ -1042,6 +1210,12 @@ msgstr "" msgid "POSIXct interval cols have mixed timezones. Overlaps are performed on the internal numerical representation of POSIXct objects (always in UTC epoch time), therefore printed values may give the impression that values don't overlap but their internal representations do Please ensure that POSIXct type interval cols have identical 'tzone' attributes to avoid confusion." msgstr "" +msgid "unique() + setkey() operations done in ..." +msgstr "" + +msgid "binary search(es) done in ..." +msgstr "" + msgid "Not yet implemented" msgstr "" @@ -1171,6 +1345,9 @@ msgstr "" msgid "\". 
Please double check the input file is a valid csvy." msgstr "" +msgid "Processed %d lines of YAML metadata with the following top-level fields: %s" +msgstr "" + msgid "User-supplied 'header' will override that found in metadata." msgstr "" @@ -1231,6 +1408,9 @@ msgstr "" msgid "so the column has been left as type '" msgstr "" +msgid "stringsAsFactors=%s converted %d column(s): %s" +msgstr "" + msgid "key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)" msgstr "" @@ -1249,6 +1429,9 @@ msgstr "" msgid "x being coerced from class: matrix to data.table" msgstr "" +msgid "Appending to existing file so setting bom=FALSE and yaml=FALSE" +msgstr "" + msgid "Input has no columns; doing nothing." msgstr "" @@ -1315,6 +1498,9 @@ msgstr "" msgid "Using integer64 class columns require to have 'bit64' package installed." msgstr "" +msgid "%s: using %s: %s" +msgstr "" + msgid "'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already" msgstr "" @@ -1408,19 +1594,7 @@ msgstr "" msgid "The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option." msgstr "" -msgid "The datatable." -msgstr "" - -msgid "version (" -msgstr "" - -msgid ") does not match the package (" -msgstr "" - -msgid "). Please close all R sessions to release the old" -msgstr "" - -msgid "and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check." +msgid "The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check." msgstr "" msgid "This is R" @@ -1474,6 +1648,15 @@ msgstr "" msgid "Column classes will be suppressed when col.names is 'none'" msgstr "" +msgid "Key: <%s>" +msgstr "" + +msgid "Null data.%s (0 rows and 0 cols)" +msgstr "" + +msgid "Empty data.%s (%d rows and %d cols)" +msgstr "" + msgid "Internal structure doesn't seem to be a list. Possibly corrupt data.table." msgstr "" @@ -1516,6 +1699,18 @@ msgstr "" msgid "Internal error. 'cols' should be character at this point in setkey; please report." 
msgstr "" +msgid "forder took" +msgstr "" + +msgid "setkey on columns %s using existing index '%s'" +msgstr "" + +msgid "reorder took" +msgstr "" + +msgid "x is already ordered by these columns, no need to call reorder" +msgstr "" + msgid "Internal error: index '" msgstr "" @@ -1576,25 +1771,13 @@ msgstr "" msgid "length(by.x) != length(by.y)" msgstr "" -msgid "When x's column ('" -msgstr "" - -msgid "') is character, the corresponding column in y ('" -msgstr "" - -msgid "') should be factor or character, but found incompatible type '" -msgstr "" - -msgid "') is factor, the corresponding column in y ('" +msgid "When x's column ('%s') is character, the corresponding column in y ('%s') should be factor or character, but found incompatible type '%s'." msgstr "" -msgid "') should be character or factor, but found incompatible type '" +msgid "When x's column ('%s') is factor, the corresponding column in y ('%s') should be character or factor, but found incompatible type '%s'." msgstr "" -msgid "') is integer or numeric, the corresponding column in y ('" -msgstr "" - -msgid "') can not be character or logical types, but found incompatible type '" +msgid "When x's column ('%s') is integer or numeric, the corresponding column in y ('%s') can not be character or logical types, but found incompatible type '%s'." msgstr "" msgid "argument 'all' should be logical of length one" @@ -1645,12 +1828,18 @@ msgstr "" msgid "argument 'fill' ignored, only make sense for type='const'" msgstr "" +msgid "No objects of class data.table exist in %s" +msgstr "" + msgid "order.col='" msgstr "" msgid "' not a column name of info" msgstr "" +msgid "Total:" +msgstr "" + msgid "data.table package is loaded. Unload or start a fresh R session." msgstr "" @@ -1660,25 +1849,31 @@ msgstr "" msgid "Neither %s nor %s exist in %s" msgstr "" +msgid "test.data.table() running:" +msgstr "" + +msgid "**** This R session's language is not English. Each test will still check that the correct number of errors and/or\n**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en" +msgstr "" + msgid "Failed after test" msgstr "" msgid "before the next test() call in" msgstr "" -msgid "out of" +msgid "Timings count mismatch:" msgstr "" -msgid ". 
Search" +msgid "vs" msgstr "" -msgid "for test number" +msgid "10 longest running tests took" msgstr "" -msgid "Timings count mismatch:" +msgid "All %d tests in %s completed ok in %s" msgstr "" -msgid "vs" +msgid "Running test id %s" msgstr "" msgid "Test" @@ -1687,6 +1882,33 @@ msgstr "" msgid "is invalid: when error= is provided it does not make sense to pass y as well" msgstr "" +msgid "Test id %s is not in increasing order" +msgstr "" + +msgid "Test %s produced %d %ss but expected %d" +msgstr "" + +msgid "Test %s didn't produce the correct %s:\nExpected: %s\nObserved: %s" +msgstr "" + +msgid "Output captured before unexpected warning/error/message:" +msgstr "" + +msgid "Test %s did not produce the correct output:\nExpected: <<%s>>\nObserved <<%s>>" +msgstr "" + +msgid "Test %s produced output but should not have:\nExpected absent (case insensitive): <<%s>>\nObserved: <<%s>>" +msgstr "" + +msgid "Test %s ran without errors but selfrefok(%s) is FALSE" +msgstr "" + +msgid "Test %s ran without errors but failed check that x equals y:" +msgstr "" + +msgid "First %d of %d (type '%s'):" +msgstr "" + msgid "Use started.at=proc.time() not Sys.time() (POSIXt and slow)" msgstr "" @@ -1756,7 +1978,17 @@ msgstr "" msgid "Following columns are not numeric and will be omitted:" msgstr "" +msgid "Index: " +msgid_plural "Indices: " +msgstr[0] "" +msgstr[1] "" + msgid "%d variable not shown: %s\n" msgid_plural "%d variables not shown: %s\n" msgstr[0] "" msgstr[1] "" + +msgid "%d error out of %d. Search %s for test number %s" +msgid_plural "%d errors out of %d. Search %s for test numbers %s" +msgstr[0] "" +msgstr[1] "" diff --git a/po/R-zh_CN.po b/po/R-zh_CN.po index a73b8e4a1b..7e78584fd7 100644 --- a/po/R-zh_CN.po +++ b/po/R-zh_CN.po @@ -136,6 +136,11 @@ msgstr "。将采用 UTC 时间进行比较。" msgid "trying to use integer64 class when 'bit64' package is not installed" msgstr "试图使用 intger64 类型但 'bit64' 包尚未安装" +msgid "" +"optimised between not available for this data type, fallback to slow R " +"routine" +msgstr "对这种数据类型的优化尚未实现,使用备用较慢的R方法。" + msgid "" "Not yet implemented NAbounds=TRUE for this non-numeric and non-character type" msgstr "" @@ -165,61 +170,118 @@ msgstr "第一个元素应为下界;" msgid "the second element should be the upper bound(s)." msgstr "第二个元素应为上界。" -msgid "x." -msgstr "x." +msgid "forderv(query) took ..." +msgstr "forderv(query) 用了 ..." + +msgid "Generating final logical vector ..." +msgstr "产生最后的逻辑向量 ..." -msgid "is type" -msgstr "的类型为" +msgid "done in" +msgstr "用了" -msgid "which is not supported by data.table join" -msgstr ",该类型无法用于 data.table 的联接" +msgid "%s is type %s which is not supported by data.table join" +msgstr "%s的类型为%s,该类型无法用于 data.table 的联接" + +msgid "" +"Attempting roll join on factor column when joining %s to %s. Only integer, " +"double or character columns may be roll joined." +msgstr "" +"联接%s与%s时试图滚动联接(roll join)因子类型(factor)的列。但只有整数" +"(integer)、双精度(double)或字符(character)类型的列可以使用滚动联接。" -msgid "i." -msgstr "i." +msgid "Matching %s factor levels to %s factor levels." +msgstr "匹配 %s 的因子水平和 %s 的因子水平。" -msgid "Attempting roll join on factor column when joining x." -msgstr "试图滚动联接(roll join)因子类型(factor)的列,这发生于将 x." +msgid "Coercing factor column %s to type character to match type of %s." +msgstr "将因子类型列 %s 强制转换成字符来匹配目 %s。" -msgid "to i." -msgstr "与 i." +msgid "Matching character column %s to factor levels in %s." +msgstr "匹配字符类型列 %s 和 %s 的因子水平。" -msgid ". Only integer, double or character columns may be roll joined." +msgid "" +"Incompatible join types: %s (%s) and %s (%s). 
Factor columns must join to " +"factor or character columns." msgstr "" -"联接时。但只有整数(integer)、双精度(double)或字符(character)类型的列可" -"以使用滚动联接(roll join)。" +"不兼容的联结类型: %s (%s) 和 %s (%s)。 因子类型的列必须与因子类型或字符类型的" +"列才可以联结" -msgid "Incompatible join types: x." -msgstr "不兼容的联结类型: x。" +msgid "%s has same type (%s) as %s. No coercion needed." +msgstr "%s 有 %s 的类型。不需要强制转换。" -msgid "(" -msgstr "(" +msgid "Coercing all-NA %s (%s) to type %s to match type of %s." +msgstr "强制转换 all-NA %s (%s) 为 %s 类型用来匹配 %s 类型。" -msgid ") and i." -msgstr ")和 i。" +msgid "Incompatible join types: %s (%s) and %s (%s)" +msgstr "不兼容的联结类型: %s (%s) 和 %s (%s)。" -msgid "). Factor columns must join to factor or character columns." -msgstr ")。 因子类型的列必须与因子类型或字符类型的列才可以联结" +msgid "Coercing %s column %s%s to type integer64 to match type of %s." +msgstr "强制转换 %s 个列 %s%s 为整数64类型用来匹配 %s 类型。" -msgid ")" -msgstr ")" +msgid "" +"Incompatible join types: %s is type integer64 but %s is type double and " +"contains fractions" +msgstr "" +"不兼容的联结类型: %s 是 integer64 类型的列但 %s 是有分数的双精度类型列。" -msgid "Incompatible join types:" -msgstr "不兼容的联结类型" +msgid "" +"Coercing double column %s (which contains no fractions) to type integer to " +"match type of %s" +msgstr "强制转换双精度列 %s (不含有分数) 为整数用来匹配 %s 类型" -msgid "is type integer64 but" -msgstr "是 integer64 类型但是" +msgid "" +"Coercing integer column %s to type double to match type of %s which contains " +"fractions." +msgstr "强制转换整数列 %s 为双精度用来匹配含有分数的 %s 类型。" + +msgid "Coercing integer column %s to type double for join to match type of %s." +msgstr "强制转换整数列 %s 为双精度用来与类型 %s 进行联结。" + +msgid "on= matches existing key, using key" +msgstr "on=和现有键(key)相等,用键" + +msgid "on= matches existing index, using index" +msgstr "on=和现有索引(index)相等,用索引" + +msgid "Calculated ad hoc index in %s" +msgstr "计算临时索引用了 %s" -msgid "is type double and contains fractions" -msgstr "是 double 类型并且包含分数" +msgid "Non-equi join operators detected ..." +msgstr "侦测到不等长联结操作符(operator)..." msgid "roll is not implemented for non-equi joins yet." msgstr "不等长联结还不能执行 roll " +msgid "forder took ..." +msgstr "forder 用了 ..." + +msgid "Generating group lengths ..." +msgstr "正在生成组的长度。。。" + +msgid "Generating non-equi group ids ..." +msgstr "正在生成不等长的组标识符 . . . " + msgid "Column name '_nqgrp_' is reserved for non-equi joins." msgstr "列名 '_nqgrp_' 是为不等长联结保留的" +msgid "Recomputing forder with non-equi ids ..." +msgstr "用不等长的组标志符重新计算 forder . . . " + +msgid "Found %d non-equi group(s) ..." +msgstr "找到%d不等长分组 ..." + +msgid "Starting bmerge ..." +msgstr "bmerge开始..." + +msgid "bmerge done in" +msgstr "bmerge 用了" + +msgid "" +"cedta decided '%s' wasn't data.table aware. Here is call stack with [[1L]] " +"applied:" +msgstr "cedta决定data.table不识别 '%s'。使用[[1L]]后的呼叫堆叠就是:" + msgid "key argument of data.table() must be character" -msgstr "data.table() 的主参数必须是字符" +msgstr "data.table() 的key参数必须是字符" msgid "Object '" msgstr "对象 '" @@ -427,18 +489,34 @@ msgid "" msgstr "" "但i是一个 data.table (或者是字符向量),必须使用 'on=' 参数指明参与连接的列 " "(参见 ?data.table),可以是keying x(比如,已排序过,和标记已排序过,请参见?" -"setkey),或者是在x和i共用列的名字(比如,自然连接)。如果x有在内存被排序过," -"Keyed连接的速度会在非常大的数据上有较明显的提高。" +"setkey),或者是在x和i共用列的名字(比如,自然连接)。如果x有在内存被排序过,键" +"(keyed)连接的速度会在非常大的数据上有较明显的提高。" msgid "Attempting to do natural join but no common columns in provided tables" msgstr "尝试进行自然连接然而并没有找到表格中相同的列" -msgid "Internal error. 
Cannot by=.EACHI when joining to a secondary key, yet" -msgstr "内部错误:目前尚无法对次键使用by=.EACH命令" +msgid "Joining but 'x' has no key, natural join using" +msgstr "联结但 'x' 没有键 (key),自然联结用" + +msgid "not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ..." +msgstr "" +"配套使用了 not-join 和 'by=.EACHI' 的命令; 用 !i 取代 i=setdiff_(x,i) ..." + +msgid "Constructing irows for '!byjoin || nqbyjoin' ..." +msgstr "构造 irows 用来对应于 '!byjoin || nqbyjoin' ..." + +msgid "Internal error. Cannot by=.EACHI when joining to an index, yet" +msgstr "内部错误:目前尚无法对索引(index)使用by=.EACH命令" msgid "Internal error. irows has length in by=.EACHI" msgstr "内部错误:by=.EACHI 中 irows 有长度" +msgid "Reorder irows for 'mult==\"all\" && !allGrp1' ..." +msgstr "对'mult==\"all\" && !allGrp1'再排序irows ..." + +msgid "Reordering %d rows after bmerge done in ..." +msgstr "bmerge 之后再排序%d行用了..." + msgid "logical error. i is not a data.table, but 'on' argument is provided." msgstr "逻辑错误。当 i 并非一个 data.table时,不应提供'on'参数" @@ -465,6 +543,9 @@ msgstr "" msgid "Internal error: notjoin but byjoin or !integer or nomatch==NA" msgstr "内部错误。原因可能为:notjoin 而非 byjoin;非整数;nomatch 为空" +msgid "Inverting irows for notjoin done in ..." +msgstr "对 notjoin 求逆 irows 用了 ..." + msgid "" "with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. " "Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] " @@ -519,9 +600,22 @@ msgstr "" "包含逗号),或传入一个长度为1,由逗号分隔的列名组成的向量输入 ?data.table查看" "其他的选项。" +msgid "by index '%s' but that index has 0 length. Ignoring." +msgstr "by 索引(index) '%s' 但那索引的长度为0。将被忽视。" + msgid "Internal error: irows isn't integer" msgstr "内部错误:irows 不是整型" +msgid "i clause present and columns used in by detected, only these subset:" +msgstr "有 i 子句和在 by 用的列被侦测, 子集只有这个:" + +msgid "" +"i clause present but columns used in by not detected. Having to subset all " +"columns before evaluating 'by': '" +msgstr "" +"有 i 子句但是在 by 用的列并没有被侦测。于是所有的列将用于接下里的 'by': 运" +"算。" + msgid "" "'by' appears to evaluate to column names but isn't c() or key(). Use " "by=list(...) if you can. Otherwise, by=eval" @@ -560,6 +654,13 @@ msgstr "" "在'by'或'keyby'列表中的项长度为 %s。每一项的长度须均为%d,即应与 x (或经 i " "筛选后的子集)中所包含行数相同。" +msgid "" +"by-expression '%s' is not named, and the auto-generated name '%s' clashed " +"with variable(s) in j. Therefore assigning the entire by-expression as name." +msgstr "" +"by-expression '%s' 没有命名,自动生成的名字 '%s' 与 j 中的变量名冲突。将用 " +"by-expression 用来命名。" + msgid "Internal error: drop_dot passed" msgstr "内部错误:drop_dot 传入的参数有" @@ -622,6 +723,22 @@ msgid "" "data.table." msgstr "此处 j 不使用 .SD 但提供了 .SDcols ,因此忽略 .SDcols详见 ?data.table" +msgid "Detected that j uses these columns:" +msgstr "侦测 j 用这个列:" + +msgid "" +"'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a " +"single j=eval(macro) instead. Both will detect the columns used which is " +"important for efficiency.\n" +"Old:" +msgstr "" +"j 中找到了 '(m)get'。ansvars 将应用到所有的列。请考虑使用 .SDcols 或者一个单" +"独的 j=eval(macro)两个命令都会侦测影响效率的列。\n" +"旧:" + +msgid "New:" +msgstr "新:" + msgid "" ".SD is locked. Using := in .SD's j is reserved for possible future use; a " "tortuously flexible way to modify by group. Use := in j directly to modify " @@ -647,6 +764,13 @@ msgid "" "'numeric')" msgstr ":= 的 LHS 不是列名('字符')或列的位置('整数'或'数值')" +msgid "" +"No rows match i. 
No new columns to add so not evaluating RHS of :=\n" +"Assigning to 0 row subset of %d rows" +msgstr "" +"没有找到匹配 i 的行。无法增加新的列所以无法运算 RHS of :=\n" +"指定一个 0 行的子集" + msgid "" "Invalid .internal.selfref detected and fixed by taking a (shallow) copy of " "the data.table so that := can add this new column by reference. At an " @@ -664,6 +788,28 @@ msgstr "" "及 ?setattr如果以上讯息无法提供帮助,请回报你的案例至 data.table 问题追踪以助" "于修复根本原因或改进本讯息" +msgid "" +"Growing vector of column pointers from truelength %d to %d. A shallow copy " +"has been taken, see ?setalloccol. Only a potential issue if two variables " +"point to the same data (we can't yet detect that well) and if not you can " +"safely ignore this. To avoid this message you could setalloccol() first, " +"deep copy first using copy(), wrap with suppressWarnings() or increase the " +"'datatable.alloccol' option." +msgstr "" +"列指针向量从 truelength %d 增加为 %d。浅拷贝已经完成,详见 ?setalloccol。如果" +"两个变量指向同一个数据 (这个我们无法侦测),会导致潜在的问题。如果并没有,你" +"可以:忽视这个问题。如果想要避免警告,可以使用以下任一命令,像是 " +"setalloccol(),用 copy() 深度拷贝,套用 suppressWarnings() 或者是增加 " +"'datatable.alloccol' 的选项。" + +msgid "" +"Note that the shallow copy will assign to the environment from which := was " +"called. That means for example that if := was called within a function, the " +"original table may be unaffected." +msgstr "" +"需要注意的是这个浅拷贝会被指向给调用了 which := 的环境。意思就是说,如果在函" +"数内部调用了 if :=, 原先的 table 可能不会有任何变化。" + msgid "" "Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] " "syntax is only valid when i is length 1, but it's length" @@ -735,18 +881,52 @@ msgstr "" "无法对 '.I' 列进行分组,因为与 data.table 特有的 .I 变量冲突请先尝试 " "setnames(DT,'.I','I')" +msgid "" +"Note: forcing units=\"secs\" on implicit difftime by group; call difftime " +"explicitly to choose custom units" +msgstr "" +"注意:在隐含的 difftime 强制分组使用了 units=\"secs\"; 请明确的调用 difftime " +"来选择自定义的单位。" + msgid "logical error. i is not data.table, but mult='all' and 'by'=.EACHI" msgstr "逻辑错误: i 不是data.table,但 mult='all' 及 'by'=.EACHI" msgid "Internal error: by= is missing" msgstr "内部错误 : 缺少 by=" +msgid "Finding groups using forderv ..." +msgstr "搜寻组中配套使用了 forderv . . . " + +msgid "Finding group sizes from the positions (can be avoided to save RAM) ..." +msgstr "从位置中搜寻组的大小 (避免此举来节省内存) . . ." + +msgid "Getting back original order ..." +msgstr "恢复原有的顺序 . . . " + +msgid "Finding groups using uniqlist on key ..." +msgstr "搜寻组并配套使用了将 uniqlist 用在键 (key) ... " + msgid "Internal error: byindex not the index name" -msgstr "内部错误 : byindex 不是索引名称" +msgstr "内部错误 : byindex 不是索引(index)名称" + +msgid "Finding groups using uniqlist on index '%s' ..." +msgstr "搜寻组并配套使用了将 uniqlist 用在索引 (index) '%s'... " msgid "Internal error: byindex not found" msgstr "内部错误 : 找不到 byindex" +msgid "lapply optimization changed j from '%s' to '%s'" +msgstr "lapply优化改变j从'%s'成'%s'" + +msgid "lapply optimization is on, j unchanged as '%s'" +msgstr "lapply优化打开了, j ('%s')没有区别" + +msgid "GForce optimized j to '" +msgstr "GForce优化 j 到 '" + +msgid "GForce is on, left j unchanged" +msgstr "GForce打开了, j 没有区别" + msgid "" "Unable to optimize call to mean() and could be very slow. You must name 'na." "rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean " @@ -756,9 +936,27 @@ msgstr "" "果您直接使用 mean(x,TRUE)会被认定为 trim=TRUE,trim 是 mean() 中尚未被优化的" "第二顺位参数" +msgid "Old mean optimization changed j from '%s' to '%s'" +msgstr "旧mean优化改变j 从'%s'成'%s'" + +msgid "Old mean optimization is on, left j unchanged." 
+msgstr "旧mean优化打开了,j没有区别。" + +msgid "All optimizations are turned off" +msgstr "所有优化关掉了" + +msgid "Optimization is on but left j unchanged (single plain symbol): '%s'" +msgstr "优化打开了但是并没有改变 j (一个普通符号):'%s'" + msgid "Internal error: length(irows)!=length(o__)" msgstr "内部错误:length(irows)!=length(o__)" +msgid "Making each group and running j (GForce %s) ..." +msgstr "进行分组中,并且运行 j (GForce %s) ..." + +msgid "setkey() after the := with keyby= ..." +msgstr "keyby=中,:=后setkey() ..." + msgid "" "The setkey() normally performed by keyby= has been skipped (as if by= was " "used) because := is being used together with keyby= but the keyby= contains " @@ -778,6 +976,9 @@ msgstr "但是ans(答案)是" msgid "and bynames is" msgstr "同时bynames是" +msgid "setkey() afterwards for keyby=.EACHI ..." +msgstr "keyby=.EACHI中到底setkey() ..." + msgid "rownames and rownames.value cannot both be used at the same time" msgstr "rownames和rownames.value 不能同时使用" @@ -798,7 +999,7 @@ msgstr "" "行名长度为零,`length(rownames)==0`,但应该为单一列名,单一数值,或NULL" msgid "rownames is TRUE but key has multiple columns" -msgstr "rownames是TRUE但key不只一个列" +msgstr "rownames是TRUE但键(key)不只一个列" msgid "; taking first column x[,1] as rownames" msgstr "; 取第一列, `column x[,1]`, 为rownames" @@ -901,6 +1102,9 @@ msgid "" "columns are non-atomic:" msgstr "参数 'by' 只适用于原子类型的纵列,但现在关联的纵列不是原子类型" +msgid "Processing split.data.table with:" +msgstr "运行 split.data.table 中使用: " + msgid "" "x is not a data.table. Shallow copy is a copy of the vector of column " "pointers (only), so is only meaningful for data.table" @@ -1125,6 +1329,23 @@ msgid "" "Internal error in .isFastSubsettable. Please report to data.table developers" msgstr ".isFastSubsettable 产生了内部错误。请向 data.table 开发者报告" +msgid "" +"Subsetting optimization disabled because the cross-product of RHS values " +"exceeds 1e4, causing memory problems." +msgstr "筛选子集优化被停止,因为叉积后的RHS值将超过 1e4,会造成内存问题。" + +msgid "Optimized subsetting with key '" +msgstr "优化的子集用键(key) '" + +msgid "Optimized subsetting with index '" +msgstr "优化的子集用索引(index) '" + +msgid "Creating new index '" +msgstr "造成新索引(index) '" + +msgid "Creating index %s done in ..." +msgstr "造成新索引(index) %s 用了 ..." + msgid "" "'on' argument should be a named atomic vector of column names indicating " "which columns in 'i' should be joined with which columns in 'x'." @@ -1159,6 +1380,17 @@ msgstr "." msgid "There is no package %s in provided repository." msgstr "所提供的资料库中不含包%s" +msgid "" +"Git revision is not available. Most likely data.table was installed from " +"CRAN or local archive.\n" +"Git revision is available when installing from our repositories 'https://" +"Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data." +"table'." +msgstr "" +"Git 修订并不存在。可能是因为 data.table 是从 CRAN 或者是本地档案安装。\n" +"Git 修订存在的情况只限于从我们资料库 'https://Rdatatable.gitlab.io/data." +"table' 或者'https://Rdatatable.github.io/data.table'下载。" + msgid "'fromLast' must be TRUE or FALSE" msgstr "'fromLast' 必须为 TRUE 或 FALSE" @@ -1295,6 +1527,13 @@ msgstr "将被优先使用。" msgid "Please provide a name to each element of 'measure.vars'." msgstr "请为 'measure.vars' 中的每个元素提供一个名称。" +msgid "" +"Duplicate column names found in molten data.table. Setting unique names " +"using 'make.names'" +msgstr "" +"重复的列名存在于在 molten 之后 data.table。请使用 'make.names' 设置唯一的列" +"名。" + msgid "" "y and x must both be data.tables. Use `setDT()` to convert list/data.frames " "to data.tables by reference or as.data.table() to convert to data.tables by " @@ -1322,8 +1561,8 @@ msgid "" "'y' must be keyed (i.e., sorted, and, marked as sorted). 
Call setkey(y, ...) " "first, see ?setkey. Also check the examples in ?foverlaps." msgstr "" -"'y' 必须有主键(已经排序并且标记为已排序)。请先用 setkey(y, ...) 设置主键," -"可以参考 ?setkey 以及 ?foverlaps 中提供的例子。" +"'y' 必须有键(key:已经排序并且标记为已排序)。请先用 setkey(y, ...) 设置主" +"键,可以参考 ?setkey 以及 ?foverlaps 中提供的例子。" msgid "" "'by.x' and 'by.y' should contain at least two column names (or numbers) each " @@ -1354,7 +1593,7 @@ msgid "The first" msgstr "首先" msgid "columns of y's key must be identical to the columns specified in by.y." -msgstr "在'by.y'中,y键的列必须与指定的列相同" +msgstr "在'by.y'中,y键(key)的列必须与指定的列相同" msgid "Elements listed in 'by.x' must be valid names in data.table 'x'" msgstr "对于data.table中的'X','by.x'中的元素必须是有效名称" @@ -1434,6 +1673,12 @@ msgstr "" "显示却重叠'的印象,(所以)请确保POSIXct类型的间隔列具有相同的'时区'属性以避" "免混乱。" +msgid "unique() + setkey() operations done in ..." +msgstr "unique() + setkey() 执行用了 ..." + +msgid "binary search(es) done in ..." +msgstr "二进制搜索用了 . . . " + msgid "Not yet implemented" msgstr "尚未实现" @@ -1447,7 +1692,7 @@ msgid "length(na.last) > 1, only the first element will be used" msgstr "当na.last长度大于1时,只会使用第一个元素" msgid "x is a single vector, non-NULL 'cols' doesn't make sense" -msgstr "x是单个向量,非空的'cols'没有意义" +msgstr "x是单个向量,非NULL的'cols'没有意义" msgid "x is a list, 'cols' can not be 0-length" msgstr "x是一个list, 'cols'不能为0长度" @@ -1633,6 +1878,10 @@ msgstr "正则 \"" msgid "\". Please double check the input file is a valid csvy." msgstr "从这里开始" +msgid "" +"Processed %d lines of YAML metadata with the following top-level fields: %s" +msgstr "处理了YAML元数据中的排列最前的 %d 行: %s" + msgid "User-supplied 'header' will override that found in metadata." msgstr "用户提供的'header'将覆盖元数据中的表头" @@ -1699,11 +1948,14 @@ msgstr ":" msgid "so the column has been left as type '" msgstr "所以该列已经被保存为类型" +msgid "stringsAsFactors=%s converted %d column(s): %s" +msgstr "stringsAsFactors=%s 改变 %d 列: %s" + msgid "" "key argument of data.table() must be a character vector naming columns (NB: " "col.names are applied before this)" msgstr "" -"data.table()的关键参数必须是字符向量命名的列(NB:col.names在这之前被使用过)" +"data.table()的key参数必须是字符向量命名的列(NB:col.names在这之前被使用过)" msgid "" "index argument of data.table() must be a character vector naming columns " @@ -1725,6 +1977,9 @@ msgstr "" msgid "x being coerced from class: matrix to data.table" msgstr "x 的类将强制从 matrix 转变为 data.table" +msgid "Appending to existing file so setting bom=FALSE and yaml=FALSE" +msgstr "并入了已存在的文件,所以设置 bom=FALSE 和 yaml=FALSE" + msgid "Input has no columns; doing nothing." msgstr "输入没有列,不执行任何操作。" @@ -1820,6 +2075,9 @@ msgid "" "Using integer64 class columns require to have 'bit64' package installed." msgstr "要在列中使用 integer64 类,需要先安装 'bit64' 包。" +msgid "%s: using %s: %s" +msgstr "%s: 用 %s: %s" + msgid "" "'xts' class passed to %s function but 'xts' is not available, you should " "have 'xts' installed already" @@ -1895,7 +2153,7 @@ msgid "" "**********" msgstr "" "**********\n" -"用中文运行data.table。软件包只提供英语支持。当在在线搜索帮助时,也要确保检查" +"用中文执行data.table。软件包只提供英语支持。当在在线搜索帮助时,也要确保检查" "英语错误信息。这个可以通过查看软件包源文件中的po/R-zh_CN.po和po/zh_CN.po文件" "获得,这个文件可以并排找到母语和英语错误信息。\n" "**********" @@ -1963,35 +2221,26 @@ msgstr "" "用,但在未来不会被使用。相关的详细信息和动机,请参阅1.12.4的信息。要指定内部" "连接,请在调用中明确指定`nomatch = NULL`,而不要使用此选项更改默认值。" -msgid "The datatable." -msgstr "datatable" - -msgid "version (" -msgstr "版本(" - -msgid ") does not match the package (" -msgstr ")和包不匹配 (" - -msgid "). Please close all R sessions to release the old" -msgstr ").请关闭所有R会话以释放旧版本" - msgid "" -"and reinstall data.table in a fresh R session. 
The root cause is that R's " -"package installer can in some unconfirmed circumstances leave a package in a " -"state that is apparently functional but where new R code is calling old C " -"code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. " -"Once a package is in this mismatch state it may produce wrong results " -"silently until you next upgrade the package. Please help by adding precise " -"circumstances to 17478 to move the status to confirmed. This mismatch " -"between R and C code can happen with any package not just data.table. It is " -"just that data.table has added this check." -msgstr "" -"并在全新的R会话中重新安装data.table。根本原因是R包安装程序可能在某些未经确认" -"的条件下将包置于显然可以正常工作的状态,但是新的R代码正在默默地调用旧的C代" -"码:https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478。一旦安装包处于" -"这不匹配的状态下,在您下次升级程序包之前,它可能会默默地产生错误的结果请提交" -"具体的情况至17478协助我们确认这个Bug。R和C代码之间的这种不匹配可能发生在任何" -"包中,而不仅仅是在data.table中。只是data.table添加了这个检查" +"The datatable.%s version (%s) does not match the package (%s). Please close " +"all R sessions to release the old %s and reinstall data.table in a fresh R " +"session. The root cause is that R's package installer can in some " +"unconfirmed circumstances leave a package in a state that is apparently " +"functional but where new R code is calling old C code silently: https://bugs." +"r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this " +"mismatch state it may produce wrong results silently until you next upgrade " +"the package. Please help by adding precise circumstances to 17478 to move " +"the status to confirmed. This mismatch between R and C code can happen with " +"any package not just data.table. It is just that data.table has added this " +"check." +msgstr "" +"data.table.%s版本(%s)和包不匹配版本(%s)。请关闭所有R会话以释放旧%s并在全新的R" +"会话中重新安装data.table。根本原因是R包安装程序可能在某些未经确认的条件下将包" +"置于显然可以正常工作的状态,但是新的R代码正在默默地调用旧的C代码:https://" +"bugs.r-project.org/bugzilla/show_bug.cgi?id=17478。一旦安装包处于这不匹配的状" +"态下,在您下次升级程序包之前,它可能会默默地产生错误的结果请提交具体的情况至" +"17478协助我们确认这个Bug。R和C代码之间的这种不匹配可能发生在任何包中,而不仅" +"仅是在data.table中。只是data.table添加了这个检查" msgid "This is R" msgstr "这是R" @@ -2059,6 +2308,15 @@ msgstr "对col.names有效的参数为'auto', 'top', and 'none'" msgid "Column classes will be suppressed when col.names is 'none'" msgstr "当col.names为'none'时,列的类型将被抑制" +msgid "Key: <%s>" +msgstr "键(key): <%s>" + +msgid "Null data.%s (0 rows and 0 cols)" +msgstr "NULL data.%s (0行,0列)" + +msgid "Empty data.%s (%d rows and %d cols)" +msgstr "空的 data.%s (%d行,%d列)" + msgid "" "Internal structure doesn't seem to be a list. Possibly corrupt data.table." msgstr "内部类型可能不是一个列表,该操作可能会损坏data.table" @@ -2088,8 +2346,8 @@ msgid "" "the original data's order by group. Try setindex() instead. Or, set*(copy(." "SD)) as a (slow) last resort." msgstr "" -"在.SD设置一个物理的键的功能被保留,以备未来的需求; 如需通过分组修改原数据顺序" -"请使用setindex(), 或者set*(copy(.SD))作为最终(该方式缓慢)的方法" +"在.SD设置一个物理的键(key)的功能被保留,以备未来的需求; 如需通过分组修改原数" +"据顺序请使用setindex(), 或者set*(copy(.SD))作为最终(该方式缓慢)的方法" msgid "" "cols is a character vector of zero length. Removed the key, but use NULL " @@ -2099,7 +2357,7 @@ msgstr "" "来避免警告" msgid "cols is the empty string. Use NULL to remove the key." -msgstr "列为一个空字符串,请使用NULL以删除键值。" +msgstr "列为一个空字符串,请使用NULL以删除键(key)值。" msgid "cols contains some blanks." msgstr "列中包含空白" @@ -2115,15 +2373,27 @@ msgid "' is type '" msgstr "是类型" msgid "' which is not supported as a key column type, currently." -msgstr "目前不是一种被支持的列类型" +msgstr "目前不是一种被支持的键(key)列类型" msgid "" "Internal error. 'cols' should be character at this point in setkey; please " "report." 
msgstr "内部错误: 目前在setkey中,'cols'应该是字符类型, 请报告" +msgid "forder took" +msgstr "forder 用了" + +msgid "setkey on columns %s using existing index '%s'" +msgstr "setkey到列%s用现有索引(index) '%s'" + +msgid "reorder took" +msgstr "reorder 用了" + +msgid "x is already ordered by these columns, no need to call reorder" +msgstr "x 已根据这些列进行了排序,无需调用 reorder" + msgid "Internal error: index '" -msgstr "内部错误:索引" +msgstr "内部错误:索引(index) '" msgid "' exists but is invalid" msgstr "存在但无效" @@ -2203,26 +2473,27 @@ msgstr "x 和 y 均需为 data.table" msgid "length(by.x) != length(by.y)" msgstr "length(by.x) != length(by.y)" -msgid "When x's column ('" -msgstr "当 x 的列 ('" - -msgid "') is character, the corresponding column in y ('" -msgstr "') 是字符,y 中相应的列 ('" - -msgid "') should be factor or character, but found incompatible type '" -msgstr "') 应该是因子或字符,然而此类型并不兼容:'" - -msgid "') is factor, the corresponding column in y ('" -msgstr "') 是因子,y 中相应的列 ('" - -msgid "') should be character or factor, but found incompatible type '" -msgstr "') 应该是字符或因子,然而此类型并不兼容:'" +msgid "" +"When x's column ('%s') is character, the corresponding column in y ('%s') " +"should be factor or character, but found incompatible type '%s'." +msgstr "" +"当 x 的列('%s') 是字符,y 中相应的列 ('%s') 应该是因子或字符,然而此类型并不" +"兼容:'%s'." -msgid "') is integer or numeric, the corresponding column in y ('" -msgstr "') 是整数或数值,y 中相应的列 ('" +msgid "" +"When x's column ('%s') is factor, the corresponding column in y ('%s') " +"should be character or factor, but found incompatible type '%s'." +msgstr "" +"当 x 的列('%s') 是因子, y 中相应的列 ('%s') 应该是字符或因子,然而此类型并不" +"兼容:'%s'." -msgid "') can not be character or logical types, but found incompatible type '" -msgstr "') 不能是字符或逻辑类型,然而此类型不兼容:'" +msgid "" +"When x's column ('%s') is integer or numeric, the corresponding column in y " +"('%s') can not be character or logical types, but found incompatible type " +"'%s'." +msgstr "" +"当 x 的列('%s') 是整数或数值,y 中相应的列('%s') 不能是字符或逻辑类型,然而此" +"类型不兼容:'%s'." msgid "argument 'all' should be logical of length one" msgstr "参数 'all' 应该是长度为 1 的逻辑型" @@ -2284,12 +2555,18 @@ msgstr "内部错误:此时不匹配的因子类型应已被发现" msgid "argument 'fill' ignored, only make sense for type='const'" msgstr "参数 'fill' 将被忽略,因其仅当 type='const'时有意义" +msgid "No objects of class data.table exist in %s" +msgstr "%s中没有 data.table类型的对象" + msgid "order.col='" msgstr "order.col='" msgid "' not a column name of info" msgstr "' 并非info的一个列名" +msgid "Total:" +msgstr "共计:" + msgid "data.table package is loaded. Unload or start a fresh R session." msgstr "data.table 包已被加载。请将其卸载或启动一个新的 R 会话。" @@ -2303,27 +2580,40 @@ msgstr "" msgid "Neither %s nor %s exist in %s" msgstr "%3$s 中 %1$s 也 %2$s 不存在" +msgid "test.data.table() running:" +msgstr "test.data.table() 执行:" + +msgid "" +"**** This R session's language is not English. Each test will still check " +"that the correct number of errors and/or\n" +"**** warnings are produced. However, to test the text of each error/warning " +"too, please restart R with LANGUAGE=en" +msgstr "" +"**** 此 R 会话的语言并非英文。每个测试仍将检查生成的警告或错误的个数是否正" +"确。**** 然而,若需同时测试警告和错误的文本内容,请用 LANGUAGE=en 重新启动 " +"R。" + msgid "Failed after test" msgstr "错误出现于测试" msgid "before the next test() call in" msgstr "后,先于下一调用test()于" -msgid "out of" -msgstr "总数为" - -msgid ". Search" -msgstr ". 
搜索" - -msgid "for test number" -msgstr "以获得测试编号" - msgid "Timings count mismatch:" msgstr "计时不一致:" msgid "vs" msgstr "vs" +msgid "10 longest running tests took" +msgstr "最慢10个测试用了" + +msgid "All %d tests in %s completed ok in %s" +msgstr "%2$s中每%1$d个测试在%3$s结束了ok" + +msgid "Running test id %s" +msgstr "执行测试 id %s" + msgid "Test" msgstr "测试" @@ -2331,6 +2621,51 @@ msgid "" "is invalid: when error= is provided it does not make sense to pass y as well" msgstr "无效:当使用了error=,不应再输入y" +msgid "Test id %s is not in increasing order" +msgstr "测试标识符 %s 不是递增的顺序" + +msgid "Test %s produced %d %ss but expected %d" +msgstr "测试 %s 生成了%d %ss 但预计生成 %d" + +msgid "" +"Test %s didn't produce the correct %s:\n" +"Expected: %s\n" +"Observed: %s" +msgstr "" +"测试 %s 没有生成正确的 %s:\n" +"预计生成:%s\n" +" 实际生成:%s " + +msgid "Output captured before unexpected warning/error/message:" +msgstr "在意外的警告/错误/提示之前,输入已被记录:" + +msgid "" +"Test %s did not produce the correct output:\n" +"Expected: <<%s>>\n" +"Observed <<%s>>" +msgstr "" +"测试 %s 没有生成正确的输入: \n" +"预计生成: <<%s>>\n" +"实际生成:<<%s>>" + +msgid "" +"Test %s produced output but should not have:\n" +"Expected absent (case insensitive): <<%s>>\n" +"Observed: <<%s>>" +msgstr "" +"测试 %s 生成输出但是不应当出现以下:\n" +"预计不存在(不区分大小写): <<%s>>\n" +"实际生成:<<%s>>" + +msgid "Test %s ran without errors but selfrefok(%s) is FALSE" +msgstr "测试 %s 可以无报错运行但是 selfrefok(%s) 是否:" + +msgid "Test %s ran without errors but failed check that x equals y:" +msgstr "测试 %s 可以无报错运行但是在检查 x 与 y 相同时候有报错:" + +msgid "First %d of %d (type '%s'):" +msgstr "第%d之%d (类型 '%s'):" + msgid "Use started.at=proc.time() not Sys.time() (POSIXt and slow)" msgstr "使用started.at=proc.time()而非Sys.time() (返回POSIXt类型,处理较慢)" @@ -2417,6 +2752,17 @@ msgstr "" msgid "Following columns are not numeric and will be omitted:" msgstr "以下的列并非数值类型,将被忽略:" +msgid "Index: " +msgid_plural "Indices: " +msgstr[0] "索引(index): " + msgid "%d variable not shown: %s\n" msgid_plural "%d variables not shown: %s\n" msgstr[0] "%d变量没显示: %s\n" + +msgid "%d error out of %d. Search %s for test number %s" +msgid_plural "%d errors out of %d. Search %s for test numbers %s" +msgstr[0] "%d错误总数为%d. %s中搜索测试编号%s" + +#~ msgid "'target' and 'current' must both be data.tables" +#~ msgstr "'target' 和 'current' 都必须是 data.table" diff --git a/po/zh_CN.po b/po/zh_CN.po index d9b54a4435..57242f7044 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -442,12 +442,12 @@ msgid "" "Dropping index '%s' as it doesn't have '__' at the beginning of its name. It " "was very likely created by v1.9.4 of data.table.\n" msgstr "" -"丢掉索引 '%s' 因为它的名字前面没有 '__' 。这个很可能是 data.table v1.9.4 创建" -"的\n" +"丢掉索引(index) '%s' 因为它的名字前面没有 '__' 。这个很可能由data.table " +"v1.9.4 创建\n" #: assign.c:574 msgid "Internal error: index name ends with trailing __" -msgstr "内部错误: 索引名称以 __ 结尾" +msgstr "内部错误: 索引(index)名称以 __ 结尾" #: assign.c:579 msgid "Internal error: Couldn't allocate memory for s4." 
@@ -460,12 +460,12 @@ msgstr "内部错误: 不能给 s5 分配内存"
 #: assign.c:611 assign.c:627
 #, c-format
 msgid "Dropping index '%s' due to an update on a key column\n"
-msgstr " 因为一个主列的更新,丢掉索引 '%s'\n"
+msgstr " 因为一个键(key)列的更新,丢掉索引(index) '%s'\n"
 #: assign.c:620
 #, c-format
 msgid "Shortening index '%s' to '%s' due to an update on a key column\n"
-msgstr "因为一个主列的更新,缩短索引 '%s' 到 '%s'\n"
+msgstr "因为一个键(key)列的更新,缩短索引(index) '%s' 到 '%s'\n"
 #: assign.c:650
 #, c-format
From 79c3d3e1d2b1842edc948f2d8c895fe45a9070ac Mon Sep 17 00:00:00 2001
From: Jan Gorecki
Date: Mon, 10 May 2021 09:11:25 +0200
Subject: [PATCH 229/588] programming on data.table (#4304)
---
 .Rbuildignore | 1 +
 NAMESPACE | 1 +
 NEWS.md | 29 ++
 R/data.table.R | 33 +-
 R/programming.R | 80 ++++
 R/test.data.table.R | 11 +-
 inst/tests/programming.Rraw | 600 ++++++++++++++++++++++++++++
 man/data.table.Rd | 4 +-
 man/substitute2.Rd | 77 ++++
 src/data.table.h | 2 +
 src/init.c | 1 +
 src/programming.c | 32 ++
 tests/programming.R | 2 +
 vignettes/datatable-programming.Rmd | 413 +++++++++++++++++++
 14 files changed, 1274 insertions(+), 12 deletions(-)
 create mode 100644 R/programming.R
 create mode 100644 inst/tests/programming.Rraw
 create mode 100644 man/substitute2.Rd
 create mode 100644 src/programming.c
 create mode 100644 tests/programming.R
 create mode 100644 vignettes/datatable-programming.Rmd
diff --git a/.Rbuildignore b/.Rbuildignore
index a6cb72b2a9..9a939aae81 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -32,6 +32,7 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^\.idea$
+^\.libs$
 ^.*\.dll$
diff --git a/NAMESPACE b/NAMESPACE
index 57271aa04d..277a6a2892 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -56,6 +56,7 @@ export(nafill)
 export(setnafill)
 export(.Last.updated)
 export(fcoalesce)
+export(substitute2)
 S3method("[", data.table)
 S3method("[<-", data.table)
diff --git a/NEWS.md b/NEWS.md
index e14b6d4373..bcaf69f92c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -58,6 +58,35 @@
 9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing.
+10. A new interface for _programming on data.table_ has been added, [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing.
+ + ```R + DT = data.table(x = 1:5, y = 5:1) + + # parameters + in_col_name = "x" + fun = "sum" + fun_arg1 = "na.rm" + fun_arg1val = TRUE + out_col_name = "sum_x" + + # parameterized query + #DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))] + + # desired query + DT[, .(sum_x = sum(x, na.rm=TRUE))] + + # new interface + DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + )] + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index 15d067eb19..0d51beafff 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -121,7 +121,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL) +"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) 
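The hunks that follow thread the new `env` argument through `by`/`keyby`, `j` and `i` using an `eval(substitute(substitute2(...)))` idiom. A minimal sketch of that idiom, mirroring the helper shown in the `man/substitute2.Rd` examples added later in this patch (it assumes a data.table build that already exports `substitute2`):

```r
library(data.table)

# Capture the caller's unevaluated expression, then let substitute2() inject
# names/values from the 'env' list into it -- the same pattern the hunks below
# apply to the by/keyby, j and i arguments of [.data.table.
f = function(expr, env) {
  eval(substitute(
    substitute2(.expr, env),          # perform substitution using the supplied 'env'
    list(.expr = substitute(expr))    # .expr carries the still-unevaluated expression
  ))
}
f(list(var1 = var2), list(var1 = "c1", var2 = 5L))
# quote(list(c1 = 5L))
```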
@@ -151,15 +151,19 @@ replace_dot_alias = function(e) { keyby = FALSE } else { if (missing(by)) { - by = bysub = substitute(keyby) + by = bysub = if (is.null(env)) substitute(keyby) + else eval(substitute(substitute2(.keyby, env), list(.keyby = substitute(keyby)))) keyby = TRUE } else { - by = bysub = substitute(by) + by = bysub = if (is.null(env)) substitute(by) + else eval(substitute(substitute2(.by, env), list(.by = substitute(by)))) if (missing(keyby)) keyby = FALSE else if (!isTRUEorFALSE(keyby)) stop("When by and keyby are both provided, keyby must be TRUE or FALSE") } + if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304 + else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" @@ -215,7 +219,16 @@ replace_dot_alias = function(e) { av = NULL jsub = NULL if (!missing(j)) { - jsub = replace_dot_alias(substitute(j)) + if (is.null(env)) jsub = substitute(j) else { + jsub = eval(substitute( + substitute2(.j, env), + list(.j = substitute(j)) + )) + if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") + } + } + if (!missing(j)) { + jsub = replace_dot_alias(jsub) root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" if (root == ":" || (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || @@ -291,10 +304,18 @@ replace_dot_alias = function(e) { # setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead. dupdiff = function(x, y) x[!x %chin% y] - + isub = NULL + if (!missing(i)) { + if (is.null(env)) isub = substitute(i) else { + isub = eval(substitute( + substitute2(.i, env), + list(.i = substitute(i)) + )) + if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") + } + } if (!missing(i)) { xo = NULL - isub = substitute(i) if (identical(isub, NA)) { # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. 
DT[NA] # replace NA in this case with NA_integer_ as that's almost surely what user intended to diff --git a/R/programming.R b/R/programming.R new file mode 100644 index 0000000000..b4d25012f8 --- /dev/null +++ b/R/programming.R @@ -0,0 +1,80 @@ +is.AsIs = function(x) { + inherits(x, "AsIs") +} +rm.AsIs = function(x) { + cl = oldClass(x) + oldClass(x) = cl[cl!="AsIs"] + x +} +list2lang = function(x) { + if (!is.list(x)) + stop("'x' must be a list") + if (is.AsIs(x)) + return(rm.AsIs(x)) + asis = vapply(x, is.AsIs, FALSE) + char = vapply(x, is.character, FALSE) + to.name = !asis & char + if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience + if (any(non.scalar.char <- vapply(x[to.name], length, 0L)!=1L)) { + stop("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: ", + paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) + } + x[to.name] = lapply(x[to.name], as.name) + } + if (isTRUE(getOption("datatable.enlist", TRUE))) { ## recursively enlist for nested lists, see note section in substitute2 manual + islt = vapply(x, is.list, FALSE) + to.enlist = !asis & islt + if (any(to.enlist)) { + x[to.enlist] = lapply(x[to.enlist], enlist) + } + } + if (any(asis)) { + x[asis] = lapply(x[asis], rm.AsIs) + } + x +} +enlist = function(x) { + if (!is.list(x)) + stop("'x' must be a list") + if (is.AsIs(x)) + return(rm.AsIs(x)) + as.call(c(quote(list), list2lang(x))) +} + +substitute2 = function(expr, env) { + if (missing(expr)) + return(substitute()) + if (missing(env)) { + stop("'env' must not be missing") + } else if (is.null(env)) { + # null is fine, will be escaped few lines below + } else if (is.environment(env)) { + env = as.list(env, all.names=TRUE, sorted=TRUE) + } else if (!is.list(env)) { + stop("'env' must be a list or an environment") + } + if (!length(env)) { + return(substitute(expr)) + } + env.names = names(env) + if (is.null(env.names)) { + stop("'env' argument does not have names") + } else if (!all(nzchar(env.names))) { + stop("'env' argument has zero char names") + } else if (anyNA(env.names)) { + stop("'env' argument has NA names") + } else if (anyDuplicated(env.names)) { + stop("'env' argument has duplicated names") + } + # character to name/symbol, and list to list call + env = list2lang(env) + # R substitute + expr.sub = eval(substitute( + substitute(.expr, env), + env = list(.expr = substitute(expr)) + )) + if (missing(expr.sub)) + return(substitute()) ## nested emptiness + # substitute call argument names + .Call(Csubstitute_call_arg_namesR, expr.sub, env) +} diff --git a/R/test.data.table.R b/R/test.data.table.R index ef834e8438..0b47d8e18b 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -177,7 +177,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F timings = env$timings DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT if ((x<-sum(timings[["nTest"]])) != ntest) { - warning("Timings count mismatch:",x,"vs",ntest) # nocov + warning("Timings count mismatch: ",x," vs ",ntest) # nocov } catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) print(DT, class=FALSE) @@ -260,6 +260,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # iv) if warning is supplied, y is checked to equal x, and x should result in a warning 
message matching the pattern # v) if output is supplied, x is evaluated and printed and the output is checked to match the pattern # num just needs to be numeric and unique. We normally increment integers at the end, but inserts can be made using decimals e.g. 10,11,11.1,11.2,12,13,... + # num=0 to escape global failure tracking so we can test behaviour of test function itself: test(1.1, test(0, TRUE, FALSE), FALSE, output="1 element mismatch") # Motivations: # 1) we'd like to know all tests that fail not just stop at the first. This often helps by revealing a common feature across a set of # failing tests @@ -273,7 +274,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no prevtest = get("prevtest", parent.frame()) nfail = get("nfail", parent.frame()) # to cater for both test.data.table() and stepping through tests in dev whichfail = get("whichfail", parent.frame()) - assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE) # bump number of tests run + assign("ntest", get("ntest", parent.frame()) + if (num>0) 1L else 0L, parent.frame(), inherits=TRUE) # bump number of tests run lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) @@ -282,7 +283,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) time = nTest = NULL # to avoid 'no visible binding' note - on.exit( { + if (num>0) on.exit( { now = proc.time()[3L] took = now-lasttime # so that prep time between tests is attributed to the following test assign("lasttime", now, parent.frame(), inherits=TRUE) @@ -344,7 +345,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov } fail = FALSE - if (.test.data.table) { + if (.test.data.table && num>0) { if (num0) { # nocov start assign("nfail", nfail+1L, parent.frame(), inherits=TRUE) assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw new file mode 100644 index 0000000000..3d8a056e3a --- /dev/null +++ b/inst/tests/programming.Rraw @@ -0,0 +1,600 @@ +require(methods) +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + is.AsIs = data.table:::is.AsIs + rm.AsIs = data.table:::rm.AsIs + enlist = data.table:::enlist + list2lang = data.table:::list2lang +} + +# test that 'test' catches the difference in language object +cl1 = substitute(f(1L, list(2L))) +cl2 = substitute(f(1L, .v), list(.v=list(2L))) +test(1.01, all.equal(cl1, cl2), TRUE) +test(1.02, identical(cl1, cl2), FALSE) +test(1.03, test(0, cl1, cl2), FALSE, output="f(1L, list(2L))") +# AsIs +test(1.11, is.AsIs(1L), FALSE) +test(1.12, is.AsIs(I(1L)), TRUE) +test(1.13, is.AsIs("a"), FALSE) +test(1.14, is.AsIs(I("a")), TRUE) +test(1.15, is.AsIs(list(1L)), FALSE) +test(1.16, is.AsIs(I(list(1L))), TRUE) +test(1.17, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 +test(1.18, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) +test(1.19, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 +test(1.20, is.AsIs(I(getClass("MethodDefinition"))), TRUE) +test(1.21, 
is.AsIs(rm.AsIs(1L)), FALSE) +test(1.22, is.AsIs(rm.AsIs(I(1L))), FALSE) +test(1.23, is.AsIs(rm.AsIs(list(1L))), FALSE) +test(1.24, is.AsIs(rm.AsIs(I(list(1L)))), FALSE) + +# substitute2 simple +test(2.01, substitute2(list(var = val), env = list(var="my_var", val=5L)), quote(list(my_var = 5L))) +# substitute2 + I to handle char and symbol +test(2.02, substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))), quote(list(my_var="my_val"))) +test(2.03, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))), quote(list(my_var="my_val"))) +# substitute2 handle symbol anyway +test(2.04, substitute2(list(var = val), env = list(var=as.name("my_var"), val=I("my_val"))), quote(list(my_var="my_val"))) +# substitute2 complex use case +test(2.11, substitute2( + .(fun_ans_var = fun(farg1, farg2=farg2val), timestamp=Sys.time(), col_head = head(head_arg, n=1L)), + list( + fun_ans_var = "my_mean_res", + fun = "mean", + farg1 = "my_x_col", + farg2 = "na.rm", + farg2val = TRUE, + col_head = "first_y", + head_arg = "y" + ) +), quote(.(my_mean_res=mean(my_x_col, na.rm=TRUE), timestamp=Sys.time(), first_y=head(y, n=1L)))) +# substitute2 PR example +test(2.12, substitute2( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) +), quote(.(sum_x = sum(x, na.rm=TRUE)))) +# substitute2 nested calls argument names substitute +test(2.13, substitute2( + f1(a1 = f2(a2 = f3(a3 = f4(a4 = v1, extra=v2), v3, a3b = v4)), a1b=c("a","b")), + list(f1="fun1", f2="fun2", f3="fun3", f4="fun4", a1="arg1", a2="arg2", a3="arg3", a4="arg4", v1="col1", extra="n", v2=6L, v3="col2", a3b="arg3b", v4=c(3.5,4.5), a1b="arg1b") +), substitute( + fun1(arg1 = fun2(arg2 = fun3(arg3 = fun4(arg4 = col1, n=6L), col2, arg3b = v4)), arg1b=c("a","b")), + list(v4=c(3.5,4.5)) +)) +# calls of length 0 args +const1 = function() 1L +test(2.21, substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")), quote(list(int1=const1()))) +test(2.22, substitute2(.(), env=list(a="b", fun="const1", nm="int1")), quote(.())) +test(2.23, identical(substitute2(), substitute())) +# substitute2 AsIs class properly removed or kept +test(2.31, class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L]]), "character") +test(2.32, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]), "character") +test(2.33, class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]), "integer") +test(2.34, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]), "integer") +cl = substitute2(var3%in%values, I(list(var3=as.name("a"), values=I(c("a","b","c"))))) ## keeping AsIs by extra I on whole env arg +test(2.35, cl, substitute(a %in% .v, list(.v=I(c("a","b","c"))))) +test(2.36, class(cl[[3L]]), "AsIs") +cl = substitute2(var3%in%values, I(list(var3="a", values=I(1:3)))) +test(2.37, cl, substitute("a" %in% .v, list(.v=I(1:3)))) +test(2.38, class(cl[[3L]]), "AsIs") +# substitute2 non-scalar char as name +test(2.41, substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))), error="are not scalar") +test(2.42, substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) ## note that quote(list(my_var=c("a","b")))) will not work because 'c("a","b")' will be a 'language' class (a 'c()' call), but we need to have it as 'character' class instead 
+test(2.43, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) +# substitute2 non-symbol +test(2.44, substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")), error="type 'character' but it has to be 'symbol'") +test(2.45, substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))), error="type 'character' but it has to be 'symbol'") +test(2.46, substitute2(.(v1=v2), list(v1=1L, v2=2L)), error="type 'integer' but it has to be 'symbol'") +test(2.47, substitute2(.(v1=v2), list(v1=FALSE, v2=2L)), error="type 'logical' but it has to be 'symbol'") +# substitute2 NA_character_ becomes valid 'NA' name +test(2.48, substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = NA_character_, "." = "list")), quote(list(`NA` = `NA`))) +cl = substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = I(NA_character_), "." = "list")) +test(2.49, cl, quote(list(`NA` = NA_character_))) +test(2.50, eval(cl), list("NA" = NA_character_)) +# substitute2 duplicate matches +test(2.51, substitute2(list(v1=v2, v1=v2), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 2L))) +test(2.52, substitute2(list(v1=v2, v1=v3), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 3L))) +# substitute2 nested unnamed call +test(2.53, substitute2(c(list(v1=v2, v1=v2)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 2L)))) +test(2.54, substitute2(c(list(v1=v2, v1=v3)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 3L)))) + +# substitute2 env as environment class +e = as.environment(list(v=1L, .v=2L)) +test(2.81, substitute2(.(v, .v), e), quote(.(1L, 2L))) +# unline in base R substitute, the env arg is always evaluated +e = new.env() +delayedAssign("a_promise", stop("I am the error"), assign.env=e) +e$x = 5L +promises = function(env) { + f = function(x, env) eval(substitute(substitute(.x, env), list(.x=x))) + sym = lapply(setNames(nm=ls(env)), as.name) + lapply(sym, f, env) +} +test(2.820, promises(e), list(a_promise=quote(stop("I am the error")), x=5L)) +test(2.821, substitute(x + 1L, e), quote(5L + 1L)) +test(2.822, substitute2(x + 1L, e), error="I am the error", ignore.warning="restarting interrupted promise evaluation") +# substitute2 env various corner cases +test(2.901, substitute2(.(v), NULL), quote(.(v))) +test(2.902, substitute2(.(v), list()), quote(.(v))) +test(2.903, substitute2(.(v), emptyenv()), quote(.(v))) +test(2.91, substitute2(.()), error="'env' must not be missing") +test(2.92, substitute2(v, c(v=1L)), error="'env' must be a list or an environment") +test(2.93, substitute2(.(v), list(1L, 2L)), error="'env' argument does not have names") +test(2.94, substitute2(.(v), structure(list(1L,2L), names=c("","v"))), error="'env' argument has zero char names") +test(2.95, substitute2(.(v), structure(list(1,2), names=c(NA,"v"))), error="'env' argument has NA names") +test(2.96, substitute2(.(v), list(v=1,v=2)), error="'env' argument has duplicated names") + +# substitute2 re-use inside another function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +cl = f( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) +) +test(3.01, cl, quote(.(sum_x = sum(x, na.rm = TRUE)))) +# substitute2 nested re-use inside another function +cl = substitute2(list(nm = fun(.(out_col_name = fun(in_col_name, 
fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = tf_var, ## note a parameter here + out_col_name = "sum_x" +))), list(nm="my_call", fun="f", tf_var=FALSE)) +test(3.02, eval(cl), list(my_call = quote(.(sum_x = sum(x, na.rm = FALSE))))) + +# enlist +test(4.01, enlist(c("a")), error="'x' must be a list") +test(4.02, enlist(list("V1","V2")), quote(list(V1, V2))) +test(4.03, enlist(list(V1="V1", V2="V2")), quote(list(V1=V1, V2=V2))) +test(4.04, enlist(I(list(V1="V1", V2="V2"))), list(V1="V1", V2="V2")) +test(4.05, enlist(list(V1=I("V1"), V2=I("V2"))), quote(list(V1="V1", V2="V2"))) +test(4.06, enlist(list(V1="V1", V2=I("V2"))), quote(list(V1=V1, V2="V2"))) +test(4.07, enlist(list(V1="V1", V2=I("V2"), V3=list("X1", "X2"))), quote(list(V1=V1, V2="V2", V3=list(X1, X2)))) +test(4.08, enlist(list(V1="V1", V2=I("V2"), V3=list(X1="X1", X2=I("X2")))), quote(list(V1=V1, V2="V2", V3=list(X1=X1, X2="X2")))) +test(4.09, enlist(list(V1="V1", V2=I("V2"), V3=enlist(list("X1","X2")))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.10, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(list("X1","X2"))))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.11, enlist(list(V1="V1", V2=I("V2"), V3=enlist(I(list("X1","X2"))))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.12, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(I(list("X1","X2")))))), substitute(list(V1 = V1, V2 = "V2", V3 = lst), list(lst = list("X1", "X2")))) +test(4.13, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(list(I("X1"),I("X2")))))), quote(list(V1 = V1, V2 = "V2", V3 = list("X1", "X2")))) +test(4.14, enlist(I(list(V1="V1", V2=list("V2")))), list(V1="V1", V2=list("V2"))) +test(4.15, enlist(I(list(V1="V1", V2=I(list("V2"))))), list(V1="V1", V2=I(list("V2")))) + +# list2lang +test(5.01, list2lang(c("a")), error="'x' must be a list") +test(5.02, list2lang(list("a", 1L)), list(as.name("a"), 1L)) +test(5.03, list2lang(I(list("a", 1L))), list("a", 1L)) +test(5.04, list2lang(list(I("a"), 1L)), list("a", 1L)) +test(5.05, list2lang(list("a", 1L, list("b"))), list(as.name("a"), 1L, call("list", as.name("b")))) +test(5.06, list2lang(list("a", 1L, list(I("b")))), list(as.name("a"), 1L, call("list", "b"))) +test(5.07, list2lang(list("a", 1L, I(list("b")))), list(as.name("a"), 1L, list("b"))) +test(5.08, list2lang(I(list("a", 1L, list("b")))), list("a", 1L, list("b"))) +test(5.09, list2lang(I(list("a", 1L, I(list("b"))))), list("a", 1L, I(list("b")))) +test(5.10, list2lang(list("a", 1L, c(1L, 2L))), list(as.name("a"), 1L, c(1L,2L))) ## no 'enlist' like feature for 'c()' function, see next test +test(5.11, list2lang(list("a", 1L, call("c", 1L, 2L))), list(as.name("a"), 1L, quote(c(1L, 2L)))) + +# datatable.enlist +op = options(datatable.enlist=NULL) +test(6.01, + substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), + quote(list(int = 1L, lst = list(a, b, list(c, d))))) +options(datatable.enlist=FALSE) +test(6.02, + substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), + substitute(list(int = 1L, lst = lst), list(lst = list("a", "b", list("c", "d"))))) +options(datatable.enlist=NULL) +test(6.03, + enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), + quote(list(v1 = 1L, v2 = list(v3 = b, v4 = list(v5 = c))))) +options(datatable.enlist=FALSE) +test(6.04, + enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), + substitute(list(v1 
= 1L, v2 = lst), list(lst=list(v3 = "b", v4 = list(v5 = "c"))))) +options(datatable.enlist=NULL) +test(6.05, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), + quote(list(V1, V2, list(V4, V5)))) +options(datatable.enlist=FALSE) +test(6.06, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), + quote(list(V1, V2, list(V4, V5)))) +test(6.07, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", list("V6"))))), + substitute(list(V1, V2, list(V4, V5, lst)), list(lst=list("V6")))) +test(6.08, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", enlist(list("V6")))))), + quote(list(V1, V2, list(V4, V5, list(V6))))) +options(op) + +# documentation examples +test(7.01, substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) ## works also on names +test(7.02, substitute2(var1, list(var1 = I("c1"))), "c1") ## enforce character with I +test(7.03, substitute2(var1, list(var1 = "c1")), quote(c1)) ## turn character into symbol, for convenience +test(7.04, substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))), quote(list(c1 = "some_character"))) ## mix symbols and characters +test(7.05, substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))), quote(list(c1 = "some_character"))) +test(7.06, substitute2(f(lst), I(list(lst = list(1L, 2L)))), substitute(f(lst), list(lst=list(1L,2L)))) ## list elements are enlist'ed into list calls +test(7.07, substitute2(f(lst), list(lst = I(list(1L, 2L)))), substitute(f(lst), list(lst=list(1L,2L)))) +test(7.08, substitute2(f(lst), list(lst = call("list", 1L, 2L))), quote(f(list(1L, 2L)))) +test(7.09, substitute2(f(lst), list(lst = list(1L, 2L))), quote(f(list(1L, 2L)))) +test(7.10, substitute2(f(lst), list(lst = list(1L, list(2L)))), quote(f(list(1L, list(2L))))) ## character to name and list into list calls works recursively +test(7.11, substitute2(f(lst), I(list(lst = list(1L, list(2L))))), substitute(f(lst), list(lst=list(1L, list(2L))))) +f = function(expr, env) { ## using substitute2 from another function + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +test(7.12, f(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) + +# data.table i, j, by +d = data.table(a = 2:1, b = 1:4) +test(11.01, d[var3%in%values, .(var1 = f(var2)), by=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), + verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) +# data.table symbols and chars +d = data.table(a = c("b","a"), b = 1:4) +out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=I(c("a","b","c"))), + verbose=TRUE]) # could not use output arg in test, so test it manually +test(11.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.021, length(out), 3L) # we expect i, j, by only here, ensure about that +test(11.022, "Argument 'by' after substitute: a" %in% out, TRUE) +test(11.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) +out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, + 
env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("b","c"))), + verbose=TRUE]) +test(11.03, ans, data.table(a=c("b"), res=c(4L), key="a")) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.031, length(out), 3L) +test(11.032, "Argument 'by' after substitute: a" %in% out, TRUE) +test(11.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) +# substitute2 during join +d1 = data.table(id1=1:4, v1=5) +d2 = data.table(id1=c(0L,2:3), v1=6) +out = capture.output(ans <- d1[d2, on="id1<=id1", .(c1, c2, c3, c4), env=list(c1="x.id1", c2="i.id1", c3="x.v1", c4="i.v1"), verbose=TRUE]) +test(11.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.042, length(out), 2L) ## 2L because i is non-missing attempt to substitute is made +test(11.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) +d1 = data.table(id1=c(2L,4L,2L,4L), v1=5) +d2 = data.table(id1=c(0L,2:3), v1=6) +out = capture.output(ans <- d1[dd, on="id1<=id1", .(sum(c3), sum(c4)), by=by, env=list(dd="d2", c3="x.v1", c4="i.v1", by=".EACHI"), verbose=TRUE]) +test(11.044, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.045, length(out), 3L) +test(11.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) +test(11.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) +test(11.048, "Argument 'i' after substitute: d2" %in% out, TRUE) +dt1 = data.table(x = letters[1:5], y = 1:5) +dt2 = data.table(x = letters[1:3], y = 11:13) +target_v = "y" +source_v = paste0("i.", target_v) +on_v = "x" +out = capture.output(invisible(dt1[dt2, target_v := source_v, on = on_v, env = list(target_v = target_v, source_v = source_v), verbose=TRUE])) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.049, length(out), 2L) +test(11.050, dt1, data.table(x = c("a", "b", "c", "d", "e"), y = c(11L, 12L, 13L, 4L, 5L))) +# substitute special symbols +d = data.table(V1=1:2, V2=1:4) +test(11.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) +test(11.052, d[, j, by, env=list(j=".SD", by="V1")], data.table(V1=c(1L,1L,2L,2L), V2=c(1L,3L,2L,4L))) +test(11.053, d[, j, env=I(list(j=as.name(".N")))], 4L) +test(11.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", by="V1")], data.table(V1=1:2, N=c(2L,2L), V2=c(4L,6L))) +# get and mget use cases +d = as.data.table(lapply(1:5, rep, 2L)) +setnames(d, paste0("c",1:5)) +v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" +test(11.061, d[, v1, env=list(v1=v1)], d[, get(v1)]) ## symbol c1 +test(11.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" +test(11.063, d[, list(v1), env=list(v1=v1)], d[, mget(v1)]) ## symbol c1 in list +test(11.064, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") +test(11.065, d[, v1v2, env=list(v1v2=as.list(c(v1,v2)))], d[, mget(c(v1, v2))]) ## call list(c1,c2) ## auto-enlist +test(11.066, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns +test(11.067, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] +test(11.068, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) 
+test(11.069, d[, lapply(vN, sum), env=list(vN=as.list(setNames(nm = c(v1, v3))))], d[, lapply(mget(c(v1,v3)), sum)]) +test(11.070, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed +# empty input +d = data.table(x=1:2, y=1:4) +test(11.081, d[.i, env=list(.i=substitute()), verbose=TRUE], d, notOutput="after substitute") +test(11.082, d[.i, .j, .by, env=list(.i=substitute(), .j=substitute(), .by=substitute()), verbose=TRUE], d, notOutput="after substitute") +f = function(x, i, j, by) { + x[.i, .j, .by, env=list(.i=substitute(i), .j=substitute(j), .by=substitute(by)), verbose=TRUE] +} +test(11.083, f(d), d) +test(11.084, f(d, 1), d[1], output="Argument 'i' after substitute", notOutput="Argument 'j' after substitute") +test(11.085, f(d,, 1), d[,1], output="Argument 'j' after substitute", notOutput="Argument 'i' after substitute") +test(11.086, f(d, 1, 1), d[1, 1], output="Argument 'j' after substitute.*Argument 'i' after substitute") + +#1985 weird exception when by contains get +tb = data.table(x=c(1,2), y=c(3,4), z=c(5,6), w=c("a","b")) +test(11.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.table(y=3, zz=5, x=1)) +dtIris = as.data.table(iris) +speciesVar = "Species" +test(11.102, dtIris[Sepal.Length > 4, .N, by = .(var = .speciesVar, Petal.Width), env = list(.speciesVar = speciesVar)], dtIris[Sepal.Length > 4, .N, by = .(var = Species, Petal.Width)]) +#2589 Need an easier way to use dynamically determined symbols +dt = data.table(x1 = 1:10, x2 = 10:1, x3 = 1:10) +s1 = "x2"; s2 = "x3" +test(11.103, dt[, s1 * s2, env=list(s1=s1,s2=s2)], c(10L, 18L, 24L, 28L, 30L, 30L, 28L, 24L, 18L, 10L)) +#2884 Alternative way to dynamic symbol usage in `j` +dt = data.table(id = rep(1:2, 5), x1 = rnorm(10), x2 = rnorm(10), y1 = rnorm(10), y2 = rnorm(10)) +test(11.104, dt[, .(xsum = sum(x), ysum = sum(y)), by = id, env = list(x = "x1", y = "y2")], dt[, .(xsum=sum(x1), ysum=sum(y2)), by=id]) +#2816 Possible regression for programmatic use in `j` +dt = data.table(x=1:3) +var = "x" +dt[, var := var+1L, env=list(var="x")] +test(11.105, dt, data.table(x=2:4)) +# injecting quoted expressions +#750 `by=list(eval(as.name("colA")))` renames column +DT = data.table(colA=1:4, colB=5:8, colC=9:12) +test(11.106, DT[, sum(colA), by=list(grp_name=grp), env=list(grp_name="colA", grp="colA")], data.table(colA=1:4, V1=1:4)) +#2432 Add Programmable NSE +co2 = as.data.table(CO2) +Jexp1 = quote(max(conc)) +Jexp2 = quote(mean(conc)) +Jexp = substitute(list(Jexp1, round(Jexp2)), list(Jexp1=Jexp1, Jexp2=Jexp2)) +out = capture.output(ans <- co2[, j, by=Type, env=list(j=Jexp), verbose=TRUE]) +test(11.107, ans, data.table(Type=factor(c("Quebec","Mississippi"), levels=c("Quebec","Mississippi")), V1=c(1000,1000), V2=c(435,435))) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.108, length(out), 2L) +test(11.109, "Argument 'by' after substitute: Type" %in% out, TRUE) +test(11.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) +#628 Change j=list(xout=eval(...))'s eval to eval within scope of DT +dat = data.table(x_one=1:10, x_two=1:10, y_one=1:10, y_two=1:10) +f = function(vars) as.call(c(quote(list), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) +test(11.111, dat[, j, env=list(j = f(c("x","y")))], dat[, list(x_out = x_one - x_two, y_out = 
y_one - y_two)]) + +# vignette examples +square = function(x) x^2 +test(12.01, + substitute2(outer(inner(var1) + inner(var2)), env = list(outer = "sqrt", inner = "square", var1 = "a", var2 = "b")), + quote(sqrt(square(a) + square(b)))) +DT = as.data.table(iris) +test(12.02, + DT[, outer(inner(var1) + inner(var2)), env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width")], + DT[, sqrt(square(Sepal.Length) + square(Sepal.Width))]) +test(12.03, # return as data.table, substitute call argument name + DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width", out = "Sepal.Hypotenuse")], + DT[, .(Species, Sepal.Length, Sepal.Width, Sepal.Hypotenuse = sqrt(square(Sepal.Length) + square(Sepal.Width)))]) +test(12.04, # i, j, by + DT[filter_col %in% filter_val, .(var1, var2, out = outer(inner(var1) + inner(var2))), by = by_col, env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width", out = "Sepal.Hypotenuse", filter_col = "Species", filter_val = I(c("versicolor", "virginica")), by_col = "Species")], + DT[Species %in% c("versicolor","virginica"), .(Sepal.Length, Sepal.Width, Sepal.Hypotenuse = sqrt(square(Sepal.Length) + square(Sepal.Width))), by = Species]) +test(12.05, # like base R, env AsIs class + substitute2(rank(input, ties.method = ties), env = I(list(input = as.name("Sepal.Width"), ties = "first"))), + quote(rank(Sepal.Width, ties.method = "first"))) +test(12.06, # only particular elements of env are AsIs class + substitute2(rank(input, ties.method = ties), env = list(input = "Sepal.Width", ties = I("first"))), + quote(rank(Sepal.Width, ties.method = "first"))) +test(12.07, # all are symbols + substitute2(f(v1, v2), list(v1 = "a", v2 = list("b", list("c", "d")))), + quote(f(a, list(b, list(c, d))))) +test(12.08, # 'a' and 'd' should stay as character + substitute2(f(v1, v2), list(v1 = I("a"), v2 = list("b", list("c", I("d"))))), + quote(f("a", list(b, list(c, "d"))))) +cols = c("Sepal.Length", "Sepal.Width") +test(12.09, # data.table automatically enlist nested lists into list calls + DT[, j, env = list(j = as.list(cols))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.10, # turning above 'j' list into a list call + DT[, j, env = list(j = quote(list(Sepal.Length, Sepal.Width)))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.11, # the same as above but accepts character vector + DT[, j, env = list(j = as.call(c(quote(list), lapply(cols, as.name))))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.12, # list of symbols + DT[, j, env = I(list(j = lapply(cols, as.name))), verbose = TRUE], + error = "j-argument should be", + output = "list(Sepal.Length, Sepal.Width)") +test(12.13, substitute2(j, env = I(list(j = lapply(cols, as.name)))), lapply(cols, as.name)) +test(12.14, substitute2(j, env = list(j = as.list(cols))), as.call(c(quote(list), lapply(cols, as.name)))) +outer = "sqrt"; inner = "square"; vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") +syms = lapply(vars, as.name) +to_inner_call = function(var, fun) call(fun, var) +inner_calls = lapply(syms, to_inner_call, inner) +test(12.15, inner_calls, list(quote(square(Sepal.Length)), quote(square(Sepal.Width)), quote(square(Petal.Length)), quote(square(Petal.Width)))) +to_add_call = function(x, y) call("+", x, y) +add_calls = Reduce(to_add_call, inner_calls) +test(12.16, add_calls, quote(square(Sepal.Length) + square(Sepal.Width) + 
square(Petal.Length) + square(Petal.Width))) +rms = substitute2(expr = outer((add_calls) / len), env = list(outer = outer, add_calls = add_calls, len = length(vars))) +test(12.17, rms, quote(sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))) +test(12.18, + DT[, j, env = list(j = rms)], + DT[, sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L)]) +test(12.19, # same but skipping last substitute2 call and using add_calls directly + DT[, outer((add_calls) / len), env = list(outer = outer, add_calls = add_calls, len = length(vars))], + DT[, sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L)]) +j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) # return as data.table +j[["rms"]] = rms +test(12.20, + DT[, j, env = list(j = j)], + DT[, .(Sepal.Length=Sepal.Length, Sepal.Width=Sepal.Width, Petal.Length=Petal.Length, Petal.Width=Petal.Width, Species, rms = sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))]) +j = as.call(c( # alternatively + quote(list), + lapply(setNames(nm = vars), as.name), + list(Species = as.name("Species")), + list(rms = rms) +)) +test(12.21, + DT[, j, env = list(j = j)], + DT[, .(Sepal.Length=Sepal.Length, Sepal.Width=Sepal.Width, Petal.Length=Petal.Length, Petal.Width=Petal.Width, Species, rms = sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))]) +v1 = "Petal.Width" # get +v2 = "Sepal.Width" +test(12.22, + DT[, .(total = sum(v1, v2)), env = list(v1 = v1, v2 = v2)], + DT[, .(total = sum(get(v1), get(v2)))]) +v = c("Petal.Width", "Sepal.Width") # mget +test(12.23, + DT[, lapply(v, mean), env = list(v = as.list(v))], + DT[, lapply(list(Petal.Width, Sepal.Width), mean)]) +test(12.24, + DT[, lapply(v, mean), env = list(v = as.list(setNames(nm = v)))], + DT[, lapply(mget(v), mean)]) +cl = quote(.(Petal.Width = mean(Petal.Width), Sepal.Width = mean(Sepal.Width))) +test(12.25, DT[, cl, env = list(cl = cl)], DT[, eval(cl)]) + +####################### +# contributed use cases +####################### + +# renkun-ken +dt = as.data.table(list( ## RNGversion("3.5.0"); set.seed(108); round(numeric(), 4) + symbol = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), + date = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), + grp1 = c(1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L), + grp2 = c(3L, 3L, 3L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 3L, 2L), + x0 = c(1.1396, -0.2706, -2.2801, -0.1572, -1.0671, -0.9666, -0.8071, -0.23, -0.1626, 1.4347, -0.2234, 0.5613, -0.7084, 0.2598, -0.2023, 1.8624, 0.5209, -1.561, -1.2297, -1.0064, -0.9782, -0.1291, -2.275, 0.5268, -0.5316, 2.3234, 0.0556, -0.3623, -0.5695, -0.0142), + x1 = c(1.3553, 1.2909, -0.8958, -0.3677, 1.0041, 1.1247, -0.0595, 0.7503, 0.3503, -1.559, -1.6823, -0.0906, 0.7874, 0.2785, -0.1712, -1.5325, 0.408, 0.5981, -1.1464, -0.2233, -0.0635, 0.4461, -1.9813, -0.7281, 1.1216, -0.0516, 1.373, 0.2388, 0.6257, -0.0551), + x2 = c(-0.2457, -0.9797, 0.3957, -1.094, -1.1973, 0.3137, 0.2004, -1.9404, 1.6927, -0.4063, 0.0731, -0.3338, -2.2683, -1.1105, 0.2115, -0.0163, 0.2139, 0.5016, 0.2296, 0.4189, 
0.3295, 0.0408, 1.4633, -0.7118, 0.4811, 0.4499, -0.4214, 0.1503, -0.2222, 0.4573), + x3 = c(1.3439, 0.3841, -0.4787, -0.6312, -0.5481, -0.8703, -1.2684, -1.4851, 0.6789, 0.1575, 2.7873, -1.1201, 0.1337, -0.6053, -0.6538, 0.4597, -0.8955, 0.1625, 1.3767, 0.6024, -1.2141, -1.3534, -0.6583, -0.095, 1.1923, 0.3062, -0.6818, 0.2407, -0.8534, -1.4521), + y1 = c(-0.2159, 0.8934, 0.0216, -1.0682, 1.2549, -0.1517, 1.4404, 1.3436, -2.1388, -0.2453, -1.4628, -1.7654, 0.6437, -0.9685, -0.9393, 0.0962, -0.2041, 1.1007, -1.8705, 0.2053, -0.9238, -0.6301, 1.9876, 1.2862, 0.3363, -0.334, -1.5149, -1.3254, 0.5716, -0.7165), + y2 = c(-0.5962, 0.3394, -0.2971, -0.6241, -0.5279, 1.1945, -0.152, 0.8207, 0.8731, 0.2281, 0.3466, -1.4862, -0.4694, 0.0435, 0.9888, -0.0797, 0.7109, -0.6636, -0.4402, 1.0093, -0.0655, 0.5099, 1.5415, 1.8833, -1.2365, 0.5085, 0.7073, -0.2191, 0.2442, 0.1501), + y3 = c(0.6222, -0.7174, -1.9616, -0.0117, -0.114, 0.1313, -1.3854, 1.5021, -0.7115, 0.4822, 1.8474, 1.1742, 0.8192, 0.2819, -1.3365, -0.6179, -0.9706, 0.2179, -1.2654, 1.0065, -2.2514, -0.7161, 0.9578, -0.0335, 0.3166, 0.0471, -0.9983, -0.6455, 1.4064, 0.2954))) +xs = c("x", "y") ## apply same formula to different set of columns +out = vector("list", length(xs)) +names(out) = xs +for (x in xs) { + out[[x]] = capture.output(invisible(dt[, RATIO := (R3 - R2) * (R2 - R1) * (R3 - R1) / sqrt(R1^2 + R2^2 + R3^2), + env = list(RATIO = paste0(x, "_ratio"), R1 = paste0(x, 1), R2 = paste0(x, 2), R3 = paste0(x, 3)), + verbose = TRUE])) # assign to nul, other +} +x_rat = c(0.0150761734954921, 1.68603966340262, -0.432117480975587, 0.0673302370985585, +1.3396117186265, -1.31542975195976, 0.358990921654875, 1.07137398842599, -0.240804570258909, 0.689134697166349, 6.53944855876942, -0.167936293758913, 1.99518595021054, 0.478886131900058, 0.225672526235629, 0.898595029001403, -0.278725254056844, -0.0178774591562397, 2.20493313305713, 0.126869315798536, 0.554130827073314, -0.713268530169861, -3.79227895596263, 0.00622410754980975, -0.0188758915276097, -0.0471688415642347, -0.60391972591766, -4.09856489441073e-05, -0.732101471917737, 0.897197218930381) +y_rat = c(-0.437137931952723, -0.789182136098114, -0.530238437504097, 0.232242653273211, 0.739369921650875, -0.334413400872578, -2.76908561851941, -0.0259528361203494, -2.81810697204509, 0.149050554297973, 3.77409495341661, 0.84329199487865, -0.220290266022232, 0.298795199314652, 0.932599183107379, -0.107238527606129, 0.966425089066359, 1.05320054480325, -0.310406226974414, -0.00125245906648534, 1.02314586034282, 0.111130598215941, -0.0996278782862306, 0.66222170820334, 0.0364570881136429, -0.242779893874194, -1.00552326863148, -0.215191768368067, -0.206580227824426, 0.16140646232964) +test(101.01, dt$x_ratio, x_rat) +test(101.02, dt$y_ratio, y_rat) +test(101.03, length(grep("Argument.*substitute", out[["x"]], value=TRUE)), 1L) +test(101.04, length(grep("Argument.*substitute", out[["y"]], value=TRUE)), 1L) +test(101.05, "Argument 'j' after substitute: `:=`(x_ratio, (x3 - x2) * (x2 - x1) * (x3 - x1)/sqrt(x1^2 + x2^2 + x3^2))" %in% out[["x"]], TRUE) +test(101.06, "Argument 'j' after substitute: `:=`(y_ratio, (y3 - y2) * (y2 - y1) * (y3 - y1)/sqrt(y1^2 + y2^2 + y3^2))" %in% out[["y"]], TRUE) +daily_cor = function(data, x, y) { ## daily correlation of user input features + data[, .(cor = cor(x, y)), + keyby = date, + env = list(x = x, y = y), + verbose = TRUE] +} +out = capture.output(ans <- daily_cor(dt, "x0", "y2")) +test(101.07, length(grep("Argument.*substitute", out, value=TRUE)), 2L) ## 'by' 
(or 'keyby') is not substituted here but it still goes via substitute2 because it is non-missing +test(101.08, "Argument 'by' after substitute: date" %in% out, TRUE) +test(101.09, "Argument 'j' after substitute: .(cor = cor(x0, y2))" %in% out, TRUE) +group_cor = function(data, x, y, g) { ## group cor comparison of user input features + cor_dt = data[, lapply(.SD, function(x) cor(x, Y)), + keyby = .(group = GROUP), + .SDcols = x, + env = list(Y = y, GROUP = g), + verbose = TRUE] + melt.data.table(cor_dt, id.vars = "group", measure.vars = x, variable.name = "x", value.name = "cor", variable.factor = FALSE) ## not relevant but lets keep it for completeness +} +out = capture.output(dt1 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp1")) +test(101.10, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(101.11, "Argument 'by' after substitute: .(group = grp1)" %in% out, TRUE) +test(101.12, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +out = capture.output(dt2 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp2")) +test(101.13, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(101.14, "Argument 'by' after substitute: .(group = grp2)" %in% out, TRUE) +test(101.15, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +stats_dt1 = as.data.table(list( + x = c("x0", "x1", "x2"), + min = c(-0.325967794724422, -0.126026585686073, -0.398950077203113), + mean = c(-0.277318407860876, -0.0164428001010045, -0.220868266148565), + max = c(-0.22866902099733, 0.0931409854840638, -0.0427864550940165) +), key="x") +test(101.16, dt1[, .(min = min(cor), mean = mean(cor), max = max(cor)), keyby = x], stats_dt1) ## post aggregation with known colnames, not relevant but lets keep it for completeness +stats_dt2 = as.data.table(list( + x = c("x0", "x1", "x2"), + min = c(-0.392714958827804, -0.339274985404091, -0.45937864657761), + mean = c(-0.279968323960171, 0.150866984990403, 0.0838779176840593), + max = c(-0.180337725136444, 0.697473394580653, 0.714679537878464) +), key="x") +test(101.17, dt2[, .(min = min(cor), mean = mean(cor), max = max(cor)), keyby = x], stats_dt2) +set.seed(108) ## to many values to hardcode +yn = c(1, 5, 10, 20) +ycols = paste0("y", yn) +ydt = data.table(symbol = rep(1:3, each = 100)) +ydt[, date := seq_len(.N), by = symbol] +ydt[, ret := rnorm(.N)] +ydt[, (ycols) := shift(ret, yn, type = "lead"), by = symbol] +xdt = data.table(symbol = rep(1:2, each = 20)) +xdt[, date := seq_len(.N), by = symbol] +xdt[, `:=`(x1 = rnorm(.N), x2 = rnorm(.N))] +cor_xy = function(xdt, ydt, x, y) { ## cor between each x and a single y + xdt[ydt, y := Y, on = .(symbol, date), + env = list(Y = y), + verbose = TRUE] + on.exit(xdt[, y := NULL]) + xdt[, lapply(.SD, cor, y = y), keyby = symbol, .SDcols = x] +} +out = capture.output(ans <- cor_xy(xdt, ydt, c("x1", "x2"), "y10")) +exp = as.data.table(list(symbol = 1:2, x1 = c(0.529292252112253, 0.0301956035638738), x2 = c(0.287076866252898, -0.335969587268599)), key="symbol") +test(102.01, ans, exp) +test(102.02, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(102.03, "Argument 'j' after substitute: `:=`(y, y10)" %in% out, TRUE) +test(102.04, "Argument 'i' after substitute: ydt" %in% out, TRUE) +cor_xy2 = function(xdt, ydt, x, y) { ## cor between each pair of x and y + rbindlist(lapply(y, function(yi) { + xdt[ydt, y := Y, on = .(symbol, date), + env = list(Y = yi)] + on.exit(xdt[, y := NULL]) + rbindlist(lapply(x, function(xi) { + xdt[, .(x = xi, y = yi, cor = 
cor(X, y)), keyby = symbol, + env = list(X = xi)] + })) + })) +} +cor_dt = cor_xy2(xdt, ydt, c("x1", "x2"), ycols) +exp = as.data.table(list( + symbol = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), + x = c("x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2"), + y = c("y1", "y1", "y1", "y1", "y5", "y5", "y5", "y5", "y10", "y10", "y10", "y10", "y20", "y20", "y20", "y20"), + cor = c(0.0963296961360529, -0.155702586981777, 0.45855688298414, -0.0867798048307359, -0.272158447799069, 0.0969909109333228, -0.172091337596075, -0.231918279862371, 0.529292252112253, 0.0301956035638738, 0.287076866252898, -0.335969587268599, 0.489259093604126, 0.190094143537513, 0.382176633086643, -0.0481151265706696) +)) +test(102.05, cor_dt, exp) +cor_xy3 = function(xdt, ydt, x, y) { ## cor matrix of existing columns and dynamically in-place merged columns + cl = as.call(lapply(setNames(c(":=", y), c("", y)), as.name)) + xdt[ydt, j, on = .(symbol, date), + env = list(j=cl)] + on.exit(xdt[, (y) := NULL]) + xdt[, cor(.SD), .SDcols = c(x, y)] +} +cor_mx = cor_xy3(xdt, ydt, c("x1", "x2"), ycols) +exp = structure(c( + 1, 0.242249239102964, -0.0286729531730845, -0.0936087330415663, 0.245575245812681, 0.323778522797129, 0.242249239102964, 1, 0.199165327684089, -0.160954354243643, 0.0034174556771777, 0.185518712777259, -0.0286729531730845, 0.199165327684089, 1, -0.164047186655086, -0.0689536633998918, -0.0326400434160486, -0.0936087330415663, -0.160954354243643, -0.164047186655086, 1, -0.0810998892055976, -0.106457956110047, 0.245575245812681, 0.0034174556771777, -0.0689536633998918, -0.0810998892055976, 1, 0.324977066952494, 0.323778522797129, 0.185518712777259, -0.0326400434160486, -0.106457956110047, 0.324977066952494, 1 + ), .Dim = c(6L, 6L), .Dimnames = list( + c("x1", "x2", "y1", "y5", "y10", "y20"), + c("x1", "x2", "y1", "y5", "y10", "y20") +)) +test(102.06, cor_mx, exp) +nadt = data.table(x1 = c(1, 2, NA, Inf), x2 = c(2, NA, 3, Inf), x3 = c(NA, 1, 2, 0)) ## fill abnormal values of multiple columns +dt_fill = function(data, columns, selector, fill) { + selector = match.fun(selector) + for (col in columns) { + data[selector(X), X := fill, env = list(X = col)] + } +} +dt_fill(nadt, c("x1", "x2", "x3"), is.na, 0) +test(103.01, nadt, data.table(x1 = c(1, 2, 0, Inf), x2 = c(2, 0, 3, Inf), x3 = c(0, 1, 2, 0))) +dt_fill(nadt, c("x1", "x2", "x3"), is.infinite, 0) +test(103.02, nadt, data.table(x1 = c(1, 2, 0, 0), x2 = c(2, 0, 3, 0), x3 = c(0, 1, 2, 0))) diff --git a/man/data.table.Rd b/man/data.table.Rd index 637bdce86f..e934028a3b 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -31,7 +31,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL) + drop = NULL, on = NULL, env = NULL) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} @@ -170,6 +170,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. 
 }
+
+  \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. }
 }
 \details{
 \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr
diff --git a/man/substitute2.Rd b/man/substitute2.Rd
new file mode 100644
index 0000000000..3b8d536141
--- /dev/null
+++ b/man/substitute2.Rd
@@ -0,0 +1,77 @@
+\name{substitute2}
+\alias{substitute2}
+\alias{substitute}
+\alias{I}
+\title{ Substitute expression }
+\description{
+  Experimental, more robust, and more user-friendly version of base R \code{\link[base]{substitute}}.
+}
+\usage{
+  substitute2(expr, env)
+}
+\arguments{
+  \item{expr}{ Unevaluated expression in which substitution has to take place. }
+  \item{env}{ List, or an environment that will be coerced to a list, from which variables will be taken and injected into \code{expr}. }
+}
+\details{
+  For convenience, the function will turn any character elements of the \code{env} argument into symbols. If a character element is of length 2 or more, it will raise an error. It will also turn any list elements into list calls. This behaviour can be changed by wrapping \code{env} in an \code{\link[base]{I}} call; in that case any symbols must be created explicitly, for example using the \code{as.name} function. Alternatively, it is possible to wrap only particular elements of \code{env} in \code{\link[base]{I}} calls, in which case only those elements will retain their original class.
+
+  Compared to base R \code{\link[base]{substitute}}, the \code{substitute2} function:
+\enumerate{
+  \item substitutes call argument names as well
+  \item by default converts character elements of the \code{env} argument to symbols
+  \item by default converts list elements of the \code{env} argument to list calls
+  \item does not accept a missing \code{env} argument
+  \item evaluates elements of the \code{env} argument
+}
+}
+\note{
+  Conversion of \emph{character to symbol} and \emph{list to list call} works recursively for each list element of the \code{env} list. If this behaviour is not desired for your use case, we would like to hear about it via our issue tracker. For the time being there is an option to disable it: \code{options(datatable.enlist=FALSE)}. This option is provided only for debugging and will be removed in the future. Please do not write code that depends on it; use \code{\link[base]{I}} calls instead.
+}
+\value{
+  Quoted expression having variables and call argument names substituted.
+} +\seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}}, \code{\link[base]{call}}, \code{\link[base]{name}}, \code{\link[base]{eval}} } +\examples{ +## base R substitute vs substitute2 +substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)) ## works also on names + +substitute(var1, list(var1 = "c1")) +substitute2(var1, list(var1 = I("c1"))) ## enforce character with I + +substitute(var1, list(var1 = as.name("c1"))) +substitute2(var1, list(var1 = "c1")) ## turn character into symbol, for convenience + +## mix symbols and characters using 'I' function, both lines will yield same result +substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))) +substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))) + +## list elements are enlist'ed into list calls +(cl1 = substitute(f(lst), list(lst = list(1L, 2L)))) +(cl2 = substitute2(f(lst), I(list(lst = list(1L, 2L))))) +(cl3 = substitute2(f(lst), list(lst = I(list(1L, 2L))))) +(cl4 = substitute2(f(lst), list(lst = quote(list(1L, 2L))))) +(cl5 = substitute2(f(lst), list(lst = list(1L, 2L)))) +cl1[[2L]] ## base R substitute with list element +cl2[[2L]] ## same +cl3[[2L]] ## same +cl4[[2L]] ## desired +cl5[[2L]] ## automatically + +## character to name and list into list calls works recursively +(cl1 = substitute2(f(lst), list(lst = list(1L, list(2L))))) +(cl2 = substitute2(f(lst), I(list(lst = list(1L, list(2L)))))) ## unless I() used +last(cl1[[2L]]) ## enlisted recursively +last(cl2[[2L]]) ## AsIs + +## using substitute2 from another function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +f(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 6cb5413918..9fb386567d 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -255,3 +255,5 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args); //snprintf.c int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...); +// programming.c +SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); diff --git a/src/init.c b/src/init.c index f168a5b8be..5e4c854962 100644 --- a/src/init.c +++ b/src/init.c @@ -220,6 +220,7 @@ R_CallMethodDef callMethods[] = { {"CcoerceAs", (DL_FUNC) &coerceAs, -1}, {"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1}, {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, +{"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, {NULL, NULL, 0} }; diff --git a/src/programming.c b/src/programming.c new file mode 100644 index 0000000000..4f6cf1a19f --- /dev/null +++ b/src/programming.c @@ -0,0 +1,32 @@ +#include "data.table.h" + +static void substitute_call_arg_names(SEXP expr, SEXP env) { + R_len_t len = length(expr); + if (len && isLanguage(expr)) { // isLanguage is R's is.call + SEXP arg_names = getAttrib(expr, R_NamesSymbol); + if (!isNull(arg_names)) { + SEXP env_names = getAttrib(env, R_NamesSymbol); + int *imatches = INTEGER(PROTECT(chmatch(arg_names, env_names, 0))); + const SEXP *env_sub = SEXPPTR_RO(env); + SEXP tmp = expr; + for (int i=0; i + %\VignetteIndexEntry{Programming on data.table} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r init, include = FALSE} +require(data.table) +knitr::opts_chunk$set( + comment = "#", + error = FALSE, + tidy = FALSE, + cache = FALSE, + collapse = TRUE +) +``` + + +## Introduction + +`data.table` from the very first releases 
enabled interface of `subset` and `with` (or `within`) functions to be used inside `[.data.frame` by defining `[.data.table` method. `subset` and `with` are base R functions that are useful to reduce repetition in code, to make it more easily readable, and usually reduce number of total characters user has to type. Those functions are possible in R because of its quite unique feature called *lazy evaluation*. Feature that allows for a function to catch its arguments before they are evaluated, and for example to evaluate them in a scope different than scope in which they were called. Let's recap interface of `subset` function. + +```{r opt_max_print_10, include = FALSE} +options(max.print = 10L) # 2 rows +``` + +```{r subset} +subset(iris, Species == "setosa") +``` + +It takes the second argument and evaluates it inside the scope of the first argument, a data.frame. It removes variable repetition, makes code more readable, making it less prone to errors. + +## Problem description + +Problem of this kind of interface is that we cannot easily parameterize the code that uses it. It is because expressions passed to those functions are substituted before being evaluated. Those substituted expressions are different when they are passed by user interactively from those that passed as arguments of another function. + +### Example + +```{r subset_error, error=TRUE} +my_subset = function(data, col, val) { + subset(data, col == val) +} +my_subset(iris, Species, "setosa") +``` + +### Approaching the problem + +There are multiple ways to work around the problem. + +#### Avoid *lazy evaluation* + +The easiest workaround is to avoid *lazy evaluation* in the first place, and fall back to less intuitive, more error-prone machnisms that uses `df[["variable"]]`, etc. + +```{r subset_nolazy} +my_subset = function(data, col, val) { + data[data[[col]] == val, ] +} +my_subset(iris, col = "Species", val = "setosa") +``` + +We basically compute logical vector (of length of nrow of our dataset) first, then logical vector is supplied to `i` argument of `[.data.frame` to perform ordinary subsetting by logical vector. It works well for this simple example, but it lacks flexibility, incorporates variable repetition, and requires user to change the interface to pass column name as character rather than unquoted symbol. The more complex is the expression we need to parameterize the less practical is this approach. + +#### Use of `eval parse` + +This method is usually preferred by newcomers to R language. Conceptually it is the most straightforward way. It is to produce required statement by using string concatenation, then parse it and evaluate. + +```{r subset_parse} +my_subset = function(data, col, val) { + data = deparse(substitute(data)) + col = deparse(substitute(col)) + val = paste0("'", val, "'") + text = paste0("subset(", data, ", ", col, " == ", val, ")") + eval(parse(text = text)[[1L]]) +} +my_subset(iris, Species, "setosa") +``` + +We have to use `deparse(substitute(.))` to catch the actual names of objects passed to function, so we can construct the `subset` function call using those original names. Although It gives unlimited flexibility with a relatively low complexity, **use of `eval(parse(.))` should be avoided**. 
The main reasons are: + +- lack of syntax validation +- [vulnerability to code injection](https://github.com/Rdatatable/data.table/issues/2655#issuecomment-376781159) +- better alternatives + +Martin Machler, R Project Core Developer, [once said](https://stackoverflow.com/a/40164111/2490497): + +> Sorry but I don't understand why too many people even think a string was something that could be evaluated. You must change your mindset, really. Forget all connections between strings on one side and expressions, calls, evaluation on the other side. +The (possibly) only connection is via `parse(text = ....)` and all good R programmers should know that this is rarely an efficient or safe means to construct expressions (or calls). Rather learn more about `substitute()`, `quote()`, and possibly the power of using `do.call(substitute, ......)`. + +#### Use *computing one the language* + +Mentioned functions, along with some others (including `as.call`, `as.name`/`as.symbol`, `bquote`, `eval`), can be categorized as functions to *compute on the language*, as they operate on language objects (`call`, `name`/`symbol`). + +```{r subset_substitute} +my_subset = function(data, col, val) { + eval(substitute(subset(data, col == val))) +} +my_subset(iris, Species, "setosa") +``` + +We used base R `substitute` function to transform call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col` and `val` with their original names (or values) from their parent environment. We can see this solution is superior to former ones. Note that we operate on the language objects layer not touching string manipulation routines, thus we refer to that process as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter, for the sake of better understanding of the R's powerful and pretty unique feature. + +#### Use third party packages + +There are third party packages that can achieve what base R computing on the language routines do. To name a few `pryr`, `lazyeval` or `rlang`. We will not discuss them here as they are not solving any problem that could not be addressed by base R routines. + +## Programming on data.table + +Now, once we established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main discussion of this vignette, *programming on data.table*. + +Starting from version 1.12.10 data.table provides robust mechanism for parameterizing expressions passed to `i`, `j` and `by` (or `keyby`) arguments of `[.data.table`. It is built upon base R `substitute` function, and mimics its interface. For that purpose `substitute2` has been added as a more robust, and more user-friendly, version of base R `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). + +### Substitute variables and names + +Let's assume we want to have a general function that applies a function to sum of two arguments that has been applied another function. Code of the particular example function will make this example clearer. Below we have a function to compute length of hypotenuse in a right-angled triangle, knowing length of its legs. 
+ +${\displaystyle c = \sqrt{a^2 + b^2}}$ + +```{r hypotenuse} +square = function(x) x^2 +quote( + sqrt(square(a) + square(b)) +) +``` + +The goal is the make every name in the above call to be able to be passed as parameter. + +```{r hypotenuse_substitute2} +substitute2( + outer(inner(var1) + inner(var2)), + env = list( + outer = "sqrt", + inner = "square", + var1 = "a", + var2 = "b" + ) +) +``` + +We can see that both functions names has been replaced, as well names of the variables passed to functions. We used `substitute2` for convenience. In this simple case, base R `substitute` could be used as well, although it requires extra `lapply(env, as.name)`. + +Now, to use substitution inside `[.data.table` we don't need to call `substitute2` function. It is being used internally, all we have to do is to provide `env` argument, the same way as we provide it to `substitute2` function. Substitution is applied to `i`, `j` and `by` (or `keyby`) arguments of `[.data.table` method. Note that `verbose = TRUE` argument can be used to print expressions after substitution is applied. + +```{r opt_max_print_8, include = FALSE} +options(max.print = 8L) # 2 rows +``` + +```{r hypotenuse_datatable} +DT = as.data.table(iris) + +DT[, outer(inner(var1) + inner(var2)), + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width" + )] + +# return as data.table, substitute call argument name +DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width", + out = "Sepal.Hypotenuse" + )] +``` + +In the last call we added another parameter `out = "Sepal.Hypotenuse"` that conveys the name of output column. Unlike base R, `substitute2` will handle substitution of call arguments names as well. + +Substitution works on `i` and `by` (or `keyby`) as well. + +```{r hypotenuse_datatable_i_j_by} +DT[filter_col %in% filter_val, + .(var1, var2, out = outer(inner(var1) + inner(var2))), + by = by_col, + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width", + out = "Sepal.Hypotenuse", + filter_col = "Species", + filter_val = I(c("versicolor", "virginica")), + by_col = "Species" + )] +``` + +### Substitute variables and character values + +In the above example we have seen convenient feature of `substitute2` to automatically converts character to names/symbols. Obvious question arises, what if we actually want to substitute parameter with a character value, so to have base R `substitute` behaviour. We provide mechanism to escape automatic conversion by wrapping elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing conversion to take place, read `?AsIs` for more details. If base R behaviour is desired for a whole `env` argument, then best to wrap `env` argument into `I()`, otherwise each list element can be wrapped into `I()` individually, see both use cases below. + +```{r rank} +substitute( # base R + rank(input, ties.method = ties), + env = list(input = as.name("Sepal.Width"), ties = "first") +) + +substitute2( # like base R, env AsIs class + rank(input, ties.method = ties), + env = I(list(input = as.name("Sepal.Width"), ties = "first")) +) + +substitute2( # only particular elements of env are AsIs class + rank(input, ties.method = ties), + env = list(input = "Sepal.Width", ties = I("first")) +) +``` + +Note that conversion works recursively on each list element, including of course the escape mechanism. 
+ +```{r substitute2_recursive} +substitute2( # all are symbols + f(v1, v2), + list(v1 = "a", v2 = list("b", list("c", "d"))) +) +substitute2( # 'a' and 'd' should stay as character + f(v1, v2), + list(v1 = I("a"), v2 = list("b", list("c", I("d")))) +) +``` + +### Substitute list of arbitrary length + +Example discussed above presents neat and powerful way to make your code more dynamic. Although there are many other, much more complex cases that developer might have to deal with. One of the common problems is to handle arbitrary length list of arguments. + +An obvious use case could be to mimic `.SD` functionality by injecting a `list` call into `j` argument. + +```{r opt_max_print_4, include = FALSE} +options(max.print = 4L) # 2 rows +``` + +```{r splice_sd} +cols = c("Sepal.Length", "Sepal.Width") +DT[, .SD, .SDcols = cols] +``` + +Having `cols` parameter we want to splice it into a `list` call making `j` argument look like below. + +```{r splice_tobe} +DT[, list(Sepal.Length, Sepal.Width)] +``` + +*Splice* is an operation where list of objects has to be inlined into expression as a sequence of arguments to call. +In base R splice `cols` into a `list` call can be achieved using `as.call(c(quote(list), cols))`. Additionally starting from R 4.0.0, there is new interface for such operation in `bquote` function. +In data.table we make it easier, by automatically _enlist_-ing list of objects into list call to those objects. It means that any `list` object inside `env` list argument will be turned into list `call`. Making the API for that as simple as presented below. + +```{r splice_datatable} +# this works +DT[, j, + env = list(j = as.list(cols)), + verbose = TRUE] + +# this will not work +#DT[, list(cols), +# env = list(cols = cols)] +``` + +It is important to provide a call to list, rather than a list, inside the `env` list argument. It is exactly what is happening in the above example, let's explain _enlist_ list into list call more in details. + +```{r splice_enlist} +DT[, j, # data.table automatically enlist nested lists into list calls + env = list(j = as.list(cols)), + verbose = TRUE] + +DT[, j, # turning above 'j' list into a list call + env = list(j = quote(list(Sepal.Length, Sepal.Width))), + verbose = TRUE] + +DT[, j, # the same as above but accepts character vector + env = list(j = as.call(c(quote(list), lapply(cols, as.name)))), + verbose = TRUE] +``` + +Now let's try to pass a list of symbols, rather than list call to those symbols. We will use `I()` to escape automatic _enlist_-ing, it will also turn off character to symbol conversion, so we also have to use `as.name`. + +```{r splice_not, error=TRUE} +DT[, j, # list of symbols + env = I(list(j = lapply(cols, as.name))), + verbose = TRUE] + +DT[, j, # again the proper way, enlist list to list call automatically + env = list(j = as.list(cols)), + verbose = TRUE] +``` + +Note that both expressions visually look the same, although they are not identical. + +```{r splice_substitute2_not} +str(substitute2(j, env = I(list(j = lapply(cols, as.name))))) + +str(substitute2(j, env = list(j = as.list(cols)))) +``` + +For more detailed explanation on that matter please see examples in [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). + +### Substitution of a complex query + +Let's take as an example more complex function that calculates root mean square. 
+ +${\displaystyle x_{\text{RMS}}={\sqrt{{\frac{1}{n}}\left(x_{1}^{2}+x_{2}^{2}+\cdots +x_{n}^{2}\right)}}}$ + +It takes arbitrary number of variables on input, but now we cannot just *splice* list of arguments into a list call because each of those arguments has to be wrapped into `square` call. In this case we have to *splice* by hand rather than relying on data.table automatic _enlist_. + +First we have to construct calls to `square` function for each of the variables (see `inner_calls`). Then we have to reduce list of calls into a single call, having nested sequence of `+` calls (see `add_calls`). Lastly we have to substitute constructed call into surrounding expression (see `rms`). + +```{r opt_max_print_12, include = FALSE} +options(max.print = 12L) # 2 rows +``` + +```{r complex} +outer = "sqrt" +inner = "square" +vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") + +syms = lapply(vars, as.name) +to_inner_call = function(var, fun) call(fun, var) +inner_calls = lapply(syms, to_inner_call, inner) +print(inner_calls) + +to_add_call = function(x, y) call("+", x, y) +add_calls = Reduce(to_add_call, inner_calls) +print(add_calls) + +rms = substitute2( + expr = outer((add_calls) / len), + env = list( + outer = outer, + add_calls = add_calls, + len = length(vars) + ) +) +print(rms) + +DT[, j, env = list(j = rms)] + +# same but skipping last substitute2 call and using add_calls directly +DT[, outer((add_calls) / len), + env = list( + outer = outer, + add_calls = add_calls, + len = length(vars) + )] + +# return as data.table +j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) +j[["rms"]] = rms +print(j) +DT[, j, env = list(j = j)] + +# alternatively +j = as.call(c( + quote(list), + lapply(setNames(nm = vars), as.name), + list(Species = as.name("Species")), + list(rms = rms) +)) +print(j) +DT[, j, env = list(j = j)] +``` + +## Retired interfaces + +In `[.data.table` it is also possible to use other interfaces for variable substitution, or passing quoted expressions. Those are `get` and `mget` for inline injection of variables by providing their names as character, and `eval` that tells `[.data.table` that expression we passed into an argument is a quoted expression, so should be handled differently. Those interfaces should be considered as retired and we recommended to use new `env` argument instead. + +### `get` + +```{r opt_max_print_4b, include = FALSE} +options(max.print = 4L) # 2 rows +``` + +```{r old_get} +v1 = "Petal.Width" +v2 = "Sepal.Width" + +DT[, .(total = sum(get(v1), get(v2)))] + +DT[, .(total = sum(v1, v2)), + env = list(v1 = v1, v2 = v2)] +``` + +### `mget` + +```{r old_mget} +v = c("Petal.Width", "Sepal.Width") + +DT[, lapply(mget(v), mean)] + +DT[, lapply(v, mean), + env = list(v = as.list(v))] + +DT[, lapply(v, mean), + env = list(v = as.list(setNames(nm = v)))] +``` + +### `eval` + +Instead of using `eval` function we can provide quoted expression into the element of `env` argument, no extra `eval` call is needed then. 
+ +```{r old_eval} +cl = quote( + .(Petal.Width = mean(Petal.Width), Sepal.Width = mean(Sepal.Width)) +) + +DT[, eval(cl)] + +DT[, cl, env = list(cl = cl)] +``` From 1bc1b4095b99ce101b87b38017b6854614934380 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 10 May 2021 19:11:22 +0200 Subject: [PATCH 230/588] address new warning in R-devel (#4986) --- inst/tests/programming.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 3d8a056e3a..88c6a99e6f 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -27,7 +27,7 @@ test(1.16, is.AsIs(I(list(1L))), TRUE) test(1.17, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 test(1.18, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) test(1.19, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 -test(1.20, is.AsIs(I(getClass("MethodDefinition"))), TRUE) +test(1.20, is.AsIs(suppressWarnings(I(getClass("MethodDefinition")))), TRUE) ## suppressWarnings due new warning in R 4.1 test(1.21, is.AsIs(rm.AsIs(1L)), FALSE) test(1.22, is.AsIs(rm.AsIs(I(1L))), FALSE) test(1.23, is.AsIs(rm.AsIs(list(1L))), FALSE) From 16d26fe37ad45ccb7f7f75728f48130880ebd229 Mon Sep 17 00:00:00 2001 From: Tony Fischetti Date: Mon, 10 May 2021 15:51:25 -0400 Subject: [PATCH 231/588] Typo/grammar changes in datatable-programming vignette (#4988) --- NEWS.md | 2 +- vignettes/datatable-programming.Rmd | 109 +++++++++++++++------------- 2 files changed, 59 insertions(+), 52 deletions(-) diff --git a/NEWS.md b/NEWS.md index bcaf69f92c..f1e5fc1789 100644 --- a/NEWS.md +++ b/NEWS.md @@ -58,7 +58,7 @@ 9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing. -10. A new interface for _programming on data.table_ has been added, [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing. +10. A new interface for _programming on data.table_ has been added, closing [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. 
Thanks to numerous users for filing requests, and Jan Gorecki for implementing. ```R DT = data.table(x = 1:5, y = 5:1) diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 9f904138a8..3fb59f4497 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -23,7 +23,7 @@ knitr::opts_chunk$set( ## Introduction -`data.table` from the very first releases enabled interface of `subset` and `with` (or `within`) functions to be used inside `[.data.frame` by defining `[.data.table` method. `subset` and `with` are base R functions that are useful to reduce repetition in code, to make it more easily readable, and usually reduce number of total characters user has to type. Those functions are possible in R because of its quite unique feature called *lazy evaluation*. Feature that allows for a function to catch its arguments before they are evaluated, and for example to evaluate them in a scope different than scope in which they were called. Let's recap interface of `subset` function. +`data.table`, from its very first releases, enabled the usage of `subset` and `with` (or `within`) functions by defining the`[.data.table` method. `subset` and `with` are base R functions that are useful for reducing repetition in code, enhancing readability, and reducing number the total characters the user has to type. This functionality is possible in R because of a quite unique feature called *lazy evaluation*. This feature allows a function to catch its arguments, before they are evaluated, and to evaluate them in a different scope than the one in which they were called. Let's recap usage of the `subset` function. ```{r opt_max_print_10, include = FALSE} options(max.print = 10L) # 2 rows @@ -33,11 +33,11 @@ options(max.print = 10L) # 2 rows subset(iris, Species == "setosa") ``` -It takes the second argument and evaluates it inside the scope of the first argument, a data.frame. It removes variable repetition, makes code more readable, making it less prone to errors. +Here, `subset` takes the second argument and evaluates it within the scope of the `data.frame` given as its first argument. This removes the need for variable repetition, making it less prone to errors, and makes the code more readable. ## Problem description -Problem of this kind of interface is that we cannot easily parameterize the code that uses it. It is because expressions passed to those functions are substituted before being evaluated. Those substituted expressions are different when they are passed by user interactively from those that passed as arguments of another function. +The problem with this kind of interface is that we cannot easily parameterize the code that uses it. This is because the expressions passed to those functions are substituted before being evaluated. ### Example @@ -48,13 +48,13 @@ my_subset = function(data, col, val) { my_subset(iris, Species, "setosa") ``` -### Approaching the problem +### Approaches to the problem -There are multiple ways to work around the problem. +There are multiple ways to work around this problem. #### Avoid *lazy evaluation* -The easiest workaround is to avoid *lazy evaluation* in the first place, and fall back to less intuitive, more error-prone machnisms that uses `df[["variable"]]`, etc. +The easiest workaround is to avoid *lazy evaluation* in the first place, and fall back to less intuitive, more error-prone approaches like `df[["variable"]]`, etc. 
 ```{r subset_nolazy}
 my_subset = function(data, col, val) {
@@ -63,37 +63,37 @@ my_subset = function(data, col, val) {
 my_subset(iris, col = "Species", val = "setosa")
 ```
 
-We basically compute logical vector (of length of nrow of our dataset) first, then logical vector is supplied to `i` argument of `[.data.frame` to perform ordinary subsetting by logical vector. It works well for this simple example, but it lacks flexibility, incorporates variable repetition, and requires user to change the interface to pass column name as character rather than unquoted symbol. The more complex is the expression we need to parameterize the less practical is this approach.
+Here, we compute a logical vector of length `nrow(iris)`, then this vector is supplied to the `i` argument of `[.data.frame` to perform ordinary logical vector subsetting. It works well for this simple example, but it lacks flexibility, introduces variable repetition, and requires the user to change the function interface to pass the column name as a character rather than an unquoted symbol. The more complex the expression we need to parameterize, the less practical this approach becomes.
 
-#### Use of `eval parse`
+#### Use of `parse` / `eval`
 
-This method is usually preferred by newcomers to R language. Conceptually it is the most straightforward way. It is to produce required statement by using string concatenation, then parse it and evaluate.
+This method is usually preferred by newcomers to R as it is, perhaps, the most straightforward conceptually. This approach requires producing the required expression using string concatenation, parsing it, and then evaluating it.
 
 ```{r subset_parse}
 my_subset = function(data, col, val) {
   data = deparse(substitute(data))
-  col = deparse(substitute(col))
-  val = paste0("'", val, "'")
+  col  = deparse(substitute(col))
+  val  = paste0("'", val, "'")
   text = paste0("subset(", data, ", ", col, " == ", val, ")")
   eval(parse(text = text)[[1L]])
 }
 my_subset(iris, Species, "setosa")
 ```
 
-We have to use `deparse(substitute(.))` to catch the actual names of objects passed to function, so we can construct the `subset` function call using those original names. Although It gives unlimited flexibility with a relatively low complexity, **use of `eval(parse(.))` should be avoided**. The main reasons are:
+We have to use `deparse(substitute(...))` to catch the actual names of objects passed to the function so we can construct the `subset` function call using those original names. Although this provides unlimited flexibility with relatively low complexity, **use of `eval(parse(...))` should be avoided**. The main reasons are:
 
 - lack of syntax validation
 - [vulnerability to code injection](https://github.com/Rdatatable/data.table/issues/2655#issuecomment-376781159)
-- better alternatives
+- the existence of better alternatives
 
 Martin Machler, R Project Core Developer, [once said](https://stackoverflow.com/a/40164111/2490497):
 
-> Sorry but I don't understand why too many people even think a string was something that could be evaluated. You must change your mindset, really. Forget all connections between strings on one side and expressions, calls, evaluation on the other side.
+> Sorry but I don't understand why too many people even think a string was something that could be evaluated. You must change your mindset, really. Forget all connections between strings on one side and expressions, calls, evaluation on the other side.
The (possibly) only connection is via `parse(text = ....)` and all good R programmers should know that this is rarely an efficient or safe means to construct expressions (or calls). Rather learn more about `substitute()`, `quote()`, and possibly the power of using `do.call(substitute, ......)`. -#### Use *computing one the language* +#### Computing on the language -Mentioned functions, along with some others (including `as.call`, `as.name`/`as.symbol`, `bquote`, `eval`), can be categorized as functions to *compute on the language*, as they operate on language objects (`call`, `name`/`symbol`). +The aforementioned functions, along with some others (including `as.call`, `as.name`/`as.symbol`, `bquote`, and `eval`), can be categorized as functions to *compute on the language*, as they operate on _language_ objects (e.g. `call`, `name`/`symbol`). ```{r subset_substitute} my_subset = function(data, col, val) { @@ -102,21 +102,23 @@ my_subset = function(data, col, val) { my_subset(iris, Species, "setosa") ``` -We used base R `substitute` function to transform call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col` and `val` with their original names (or values) from their parent environment. We can see this solution is superior to former ones. Note that we operate on the language objects layer not touching string manipulation routines, thus we refer to that process as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter, for the sake of better understanding of the R's powerful and pretty unique feature. +Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. #### Use third party packages -There are third party packages that can achieve what base R computing on the language routines do. To name a few `pryr`, `lazyeval` or `rlang`. We will not discuss them here as they are not solving any problem that could not be addressed by base R routines. +There are third party packages that can achieve what base R computing on the language routines do (`pryr`, `lazyeval` and `rlang`, to name a few). + +Though these can be helpful, we will be discussing a `data.table`-unique approach here. ## Programming on data.table -Now, once we established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main discussion of this vignette, *programming on data.table*. +Now that we've established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main subject of this vignette, *programming on data.table*. 
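As a quick recap of the *computing on the language* approach before turning to data.table itself, the `do.call(substitute, ...)` idiom mentioned in the quotation above can be sketched as follows. This is a minimal illustration only; the `my_subset2` name and its body are not taken from the package, they simply rebuild the same `subset` call without any string parsing.

```r
# a minimal sketch: build the call subset(iris, Species == "setosa") by substitution,
# then evaluate it; do.call() hands substitute() an already-built expression object
# and substitution list instead of letting it capture them lazily
my_subset2 = function(data, col, val) {
  e = do.call(substitute,
              list(quote(subset(data, col == val)),
                   list(data = substitute(data), col = substitute(col), val = val)))
  eval(e, envir = parent.frame())
}
my_subset2(iris, Species, "setosa")
```

A similar two-step substitute-then-eval pattern is used by the `f()` helper shown in the `substitute2` examples above.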
-Starting from version 1.12.10 data.table provides robust mechanism for parameterizing expressions passed to `i`, `j` and `by` (or `keyby`) arguments of `[.data.table`. It is built upon base R `substitute` function, and mimics its interface. For that purpose `substitute2` has been added as a more robust, and more user-friendly, version of base R `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html).
+Starting from version 1.12.10, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html).
 
-### Substitute variables and names
+### Substituting variables and names
 
-Let's assume we want to have a general function that applies a function to sum of two arguments that has been applied another function. Code of the particular example function will make this example clearer. Below we have a function to compute length of hypotenuse in a right-angled triangle, knowing length of its legs.
+Let's say we want to have a general function that applies a function to the sum of two arguments, each of which has had another function applied to it. As a concrete example, below we have a function to compute the length of the hypotenuse in a right triangle, knowing the length of its legs.
 
 ${\displaystyle c = \sqrt{a^2 + b^2}}$
 
@@ -127,7 +129,7 @@ quote(
 )
 ```
 
-The goal is the make every name in the above call to be able to be passed as parameter.
+The goal is to make every name in the above call able to be passed as a parameter.
 
 ```{r hypotenuse_substitute2}
 substitute2(
@@ -141,9 +143,11 @@ substitute2(
 )
 ```
 
-We can see that both functions names has been replaced, as well names of the variables passed to functions. We used `substitute2` for convenience. In this simple case, base R `substitute` could be used as well, although it requires extra `lapply(env, as.name)`.
+We can see in the output that both the function names, as well as the names of the variables passed to those functions, have been replaced. We used `substitute2` for convenience. In this simple case, base R's `substitute` could have been used as well, though it would have required an extra `lapply(env, as.name)`.
+
+Now, to use substitution inside `[.data.table`, we don't need to call the `substitute2` function. As it is now being used internally, all we have to do is to provide the `env` argument, the same way as we've provided it to the `substitute2` function in the example above. Substitution can be applied to the `i`, `j` and `by` (or `keyby`) arguments of the `[.data.table` method. Note that setting the `verbose` argument to `TRUE` can be used to print expressions after substitution is applied. This is very useful for debugging.
 
-Now, to use substitution inside `[.data.table` we don't need to call `substitute2` function. It is being used internally, all we have to do is to provide `env` argument, the same way as we provide it to `substitute2` function.
Substitution is applied to `i`, `j` and `by` (or `keyby`) arguments of `[.data.table` method. Note that `verbose = TRUE` argument can be used to print expressions after substitution is applied. +Let's use the `iris` data set as a demonstration. Just as an example, let's pretend we want to compute the `Sepal.Hypotenuse`, treating the sepal width and length as if they were legs of a right triangle. ```{r opt_max_print_8, include = FALSE} options(max.print = 8L) # 2 rows @@ -160,7 +164,7 @@ DT[, outer(inner(var1) + inner(var2)), var2 = "Sepal.Width" )] -# return as data.table, substitute call argument name +# return as a data.table DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), env = list( outer = "sqrt", @@ -171,9 +175,9 @@ DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), )] ``` -In the last call we added another parameter `out = "Sepal.Hypotenuse"` that conveys the name of output column. Unlike base R, `substitute2` will handle substitution of call arguments names as well. +In the last call, we added another parameter, `out = "Sepal.Hypotenuse"`, that conveys the intended name of output column. Unlike base R's `substitute`, `substitute2` will handle the substitution of the names of call arguments, as well. -Substitution works on `i` and `by` (or `keyby`) as well. +Substitution works on `i` and `by` (or `keyby`), as well. ```{r hypotenuse_datatable_i_j_by} DT[filter_col %in% filter_val, @@ -193,26 +197,26 @@ DT[filter_col %in% filter_val, ### Substitute variables and character values -In the above example we have seen convenient feature of `substitute2` to automatically converts character to names/symbols. Obvious question arises, what if we actually want to substitute parameter with a character value, so to have base R `substitute` behaviour. We provide mechanism to escape automatic conversion by wrapping elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing conversion to take place, read `?AsIs` for more details. If base R behaviour is desired for a whole `env` argument, then best to wrap `env` argument into `I()`, otherwise each list element can be wrapped into `I()` individually, see both use cases below. +In the above example, we have seen a convenient feature of `substitute2`: automatic conversion from strings into names/symbols. An obvious question arises: what if we actually want to substitute a parameter with a *character* value, so as to have base R `substitute` behaviour. We provide a mechanism to escape automatic conversion by wrapping the elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing its arguments from substitution. (Read the `?AsIs` documentation for more details.) If base R behaviour is desired for the whole `env` argument, then it's best to wrap the whole argument in `I()`. Alternatively, each list element can be wrapped in `I()` individually. Let's explore both cases below. 
```{r rank} -substitute( # base R +substitute( # base R behaviour rank(input, ties.method = ties), env = list(input = as.name("Sepal.Width"), ties = "first") ) -substitute2( # like base R, env AsIs class +substitute2( # mimicking base R's "substitute" using "I" rank(input, ties.method = ties), env = I(list(input = as.name("Sepal.Width"), ties = "first")) ) -substitute2( # only particular elements of env are AsIs class +substitute2( # only particular elements of env are used "AsIs" rank(input, ties.method = ties), env = list(input = "Sepal.Width", ties = I("first")) ) ``` -Note that conversion works recursively on each list element, including of course the escape mechanism. +Note that conversion works recursively on each list element, including the escape mechanism of course. ```{r substitute2_recursive} substitute2( # all are symbols @@ -225,11 +229,11 @@ substitute2( # 'a' and 'd' should stay as character ) ``` -### Substitute list of arbitrary length +### Substituting lists of arbitrary length -Example discussed above presents neat and powerful way to make your code more dynamic. Although there are many other, much more complex cases that developer might have to deal with. One of the common problems is to handle arbitrary length list of arguments. +The example presented above illustrates a neat and powerful way to make your code more dynamic. However, there are many other much more complex cases that a developer might have to deal with. One common problem handling a list of arguments of arbitrary length. -An obvious use case could be to mimic `.SD` functionality by injecting a `list` call into `j` argument. +An obvious use case could be to mimic `.SD` functionality by injecting a `list` call into the `j` argument. ```{r opt_max_print_4, include = FALSE} options(max.print = 4L) # 2 rows @@ -240,15 +244,16 @@ cols = c("Sepal.Length", "Sepal.Width") DT[, .SD, .SDcols = cols] ``` -Having `cols` parameter we want to splice it into a `list` call making `j` argument look like below. +Having `cols` parameter, we'd want to splice it into a `list` call, making `j` argument look like in the code below. ```{r splice_tobe} DT[, list(Sepal.Length, Sepal.Width)] ``` -*Splice* is an operation where list of objects has to be inlined into expression as a sequence of arguments to call. -In base R splice `cols` into a `list` call can be achieved using `as.call(c(quote(list), cols))`. Additionally starting from R 4.0.0, there is new interface for such operation in `bquote` function. -In data.table we make it easier, by automatically _enlist_-ing list of objects into list call to those objects. It means that any `list` object inside `env` list argument will be turned into list `call`. Making the API for that as simple as presented below. +*Splicing* is an operation where a list of objects have to be inlined into an expression as a sequence of arguments to call. +In base R, splicing `cols` into a `list` can be achieved using `as.call(c(quote(list), cols))`. Additionally, starting from R 4.0.0, there is new interface for such an operation in the `bquote` function. + +In data.table, we make it easier by automatically _enlist_-ing a list of objects into a list call with those objects. This means that any `list` object inside the `env` list argument will be turned into list `call`, making the API for that use case as simple as presented below. 
```{r splice_datatable} # this works @@ -261,14 +266,16 @@ DT[, j, # env = list(cols = cols)] ``` -It is important to provide a call to list, rather than a list, inside the `env` list argument. It is exactly what is happening in the above example, let's explain _enlist_ list into list call more in details. +It is important to provide a call to `as.list`, rather than simply a list, inside the `env` list argument, as is shown in the above example. + +Let's explore _enlist_-ing in more detail. ```{r splice_enlist} -DT[, j, # data.table automatically enlist nested lists into list calls +DT[, j, # data.table automatically enlists nested lists into list calls env = list(j = as.list(cols)), verbose = TRUE] -DT[, j, # turning above 'j' list into a list call +DT[, j, # turning the above 'j' list into a list call env = list(j = quote(list(Sepal.Length, Sepal.Width))), verbose = TRUE] @@ -277,7 +284,7 @@ DT[, j, # the same as above but accepts character vector verbose = TRUE] ``` -Now let's try to pass a list of symbols, rather than list call to those symbols. We will use `I()` to escape automatic _enlist_-ing, it will also turn off character to symbol conversion, so we also have to use `as.name`. +Now let's try to pass a list of symbols, rather than list call to those symbols. We'll use `I()` to escape automatic _enlist_-ing but, as this will also turn off character to symbol conversion, we also have to use `as.name`. ```{r splice_not, error=TRUE} DT[, j, # list of symbols @@ -289,7 +296,7 @@ DT[, j, # again the proper way, enlist list to list call automatically verbose = TRUE] ``` -Note that both expressions visually look the same, although they are not identical. +Note that both expressions, although visually appearing to be the same, are not identical. ```{r splice_substitute2_not} str(substitute2(j, env = I(list(j = lapply(cols, as.name))))) @@ -297,17 +304,17 @@ str(substitute2(j, env = I(list(j = lapply(cols, as.name))))) str(substitute2(j, env = list(j = as.list(cols)))) ``` -For more detailed explanation on that matter please see examples in [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). +For more detailed explanation on that matter, please see the examples in the [`substitute2` documentation](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). ### Substitution of a complex query -Let's take as an example more complex function that calculates root mean square. +Let's take, as an example of a more complex function, calculating root mean square. ${\displaystyle x_{\text{RMS}}={\sqrt{{\frac{1}{n}}\left(x_{1}^{2}+x_{2}^{2}+\cdots +x_{n}^{2}\right)}}}$ -It takes arbitrary number of variables on input, but now we cannot just *splice* list of arguments into a list call because each of those arguments has to be wrapped into `square` call. In this case we have to *splice* by hand rather than relying on data.table automatic _enlist_. +It takes arbitrary number of variables on input, but now we cannot just *splice* a list of arguments into a list call because each of those arguments have to be wrapped in a `square` call. In this case, we have to *splice* by hand rather than relying on data.table's automatic _enlist_. -First we have to construct calls to `square` function for each of the variables (see `inner_calls`). Then we have to reduce list of calls into a single call, having nested sequence of `+` calls (see `add_calls`). Lastly we have to substitute constructed call into surrounding expression (see `rms`). 
+First, we have to construct calls to the `square` function for each of the variables (see `inner_calls`). Then, we have to reduce the list of calls into a single call, having a nested sequence of `+` calls (see `add_calls`). Lastly, we have to substitute the constructed call into the surrounding expression (see `rms`). ```{r opt_max_print_12, include = FALSE} options(max.print = 12L) # 2 rows @@ -339,7 +346,7 @@ print(rms) DT[, j, env = list(j = rms)] -# same but skipping last substitute2 call and using add_calls directly +# same, but skipping last substitute2 call and using add_calls directly DT[, outer((add_calls) / len), env = list( outer = outer, @@ -366,7 +373,7 @@ DT[, j, env = list(j = j)] ## Retired interfaces -In `[.data.table` it is also possible to use other interfaces for variable substitution, or passing quoted expressions. Those are `get` and `mget` for inline injection of variables by providing their names as character, and `eval` that tells `[.data.table` that expression we passed into an argument is a quoted expression, so should be handled differently. Those interfaces should be considered as retired and we recommended to use new `env` argument instead. +In `[.data.table`, it is also possible to use other mechanisms for variable substitution or for passing quoted expressions. These include `get` and `mget` for inline injection of variables by providing their names as strings, and `eval` that tells `[.data.table` that the expression we passed into an argument is a quoted expression and that it should be handled differently. Those interfaces should now be considered retired and we recommend using the new `env` argument, instead. ### `get` From 7e89b522b2a6ce45765e2ff31f63b51b2af46b6c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 10 May 2021 16:08:11 -0700 Subject: [PATCH 232/588] get Rraw files to highlight well on GitHub (#4811) --- .gitattributes | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitattributes b/.gitattributes index fa1385d99a..9c72b27aea 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ * -text +*.Rraw linguist-language=R + From 3764969747d217d982738de67940c3c8b54d0600 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 10 May 2021 17:46:06 -0700 Subject: [PATCH 233/588] use writeLines for fread(text=.) 
(#4805) --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index a36cbfda28..b8271ce0c1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -36,7 +36,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!is.character(text)) stop("'text=' is type ", typeof(text), " but must be character.") if (!length(text)) return(data.table()) if (length(text) > 1L) { - cat(text, file=(tmpFile<-tempfile(tmpdir=tmpdir)), sep="\n") # avoid paste0() which could create a new very long single string in R's memory + writeLines(text, tmpFile<-tempfile(tmpdir=tmpdir)) # avoid paste0() which could create a new very long single string in R's memory file = tmpFile on.exit(unlink(tmpFile), add=TRUE) } else { From 19c61c0ed8b75b292212702e3308e78d7b9f2add Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 10 May 2021 17:50:06 -0700 Subject: [PATCH 234/588] add a note about when .N is computed (#4696) --- man/special-symbols.Rd | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 30cfedc5fa..9bfa72fceb 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -10,7 +10,7 @@ \alias{.NGRP} \title{ Special symbols } \description{ - \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes and examples here and in \code{\link{data.table}}. + \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. } \details{ @@ -28,6 +28,8 @@ } \code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. + + Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. } \seealso{ \code{\link{data.table}}, \code{\link{:=}}, \code{\link{set}}, \code{\link{datatable-optimize}} @@ -52,5 +54,9 @@ DT[, c(.(y=max(y)), lapply(.SD, min)), DT[, grp := .GRP, by=x] # add a group counter DT[, grp_pct := .GRP/.NGRP, by=x] # add a group "progress" counter X[, DT[.BY, y, on="x"], by=x] # join within each group + +# .N can be different in i and j +DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2}, + {cat(sprintf('in j, .N is \%d\n', .N)); mean(a)}] } \keyword{ data } From 8f7ef88cda14f9e6d7c19c3e60c29fb3d3275d8e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 12 May 2021 17:00:38 -0700 Subject: [PATCH 235/588] testing is more robust to changes in base messages (#4806) --- R/test.data.table.R | 3 +- inst/tests/tests.Rraw | 98 ++++++++++++++++++++++++++++++------------- 2 files changed, 70 insertions(+), 31 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 0b47d8e18b..da12144f66 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -97,8 +97,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("testDir", function(x) file.path(fulldir, x), envir=env) # are R's messages being translated to a foreign language? 
#3039, #630 - txt = eval(parse(text="tryCatch(mean(not__exist__), error = function(e) e$message)"), envir=.GlobalEnv) - foreign = txt != "object 'not__exist__' not found" + foreign = gettext("object '%s' not found", domain="R") != "object '%s' not found" if (foreign) { # nocov start catf("\n**** This R session's language is not English. Each test will still check that the correct number of errors and/or\n**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en\n\n") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f17961e760..2857d7322f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -101,6 +101,42 @@ if (!test_longdouble) { # e.g. under valgrind, longdouble.digits==53; causing these to fail: 1262, 1729.04, 1729.08, 1729.09, 1729.11, 1729.13, 1830.7; #4639 } +# generate simple error messages from base that are checked against in our tests. this helps +# protect us against these messages evolving in base in the future, and against these messages +# potentially not being produced in English. +# Three use cases: +# (1) match message exactly [missing delim] +# (2) match message pattern after dropping anything between delimeters [delim, fmt=FALSE] +# (3) function factory for matching messages exactly by substituting anything between delimeters [delim, fmt=TRUE] +get_msg = function(e, delim, fmt=FALSE) { + msg = tryCatch(e, error=identity, warning=identity)$message + if (missing(delim)) return(msg) + if (length(delim) == 1L) delim[2L] = delim[1L] + msg = gsub( + sprintf("%1$s[^%2$s]+%2$s", delim[1L], delim[2L]), + sprintf("%s%s%s", delim[1L], if (fmt) "%s" else ".+", delim[2L]), + msg + ) + if (fmt) return(function(x) sprintf(msg, x)) + return(msg) +} +base_messages = list( + missing_object = get_msg(`__dt_test_missing_` + 1, "'", fmt=TRUE), + missing_function = get_msg(`__dt_test_missing_`(), '"', fmt=TRUE), + invalid_arg_unary_operator = get_msg(-'a'), + invalid_arg_binary_operator = get_msg(1 + 'a'), + invalid_arg_sum = get_msg(sum('a'), c("\\(", "\\)"), fmt=TRUE), + arg_length_mismatch = get_msg(base::order(1, 1:2)), + empty_max = get_msg(max(numeric())), + empty_min = get_msg(min(numeric())), + coerce_na = get_msg(as.integer('a')), + locked_binding = get_msg({e = new.env(); e$x = 1; lockBinding('x', e); e$x = 2}, "'", fmt=TRUE), + missing_file = get_msg({tmp <- tempfile(tmpdir=tempfile("xxx")); file(tmp, "w")}, "'"), + # gives both error & warning but tryCatch returns the warning first, so suppress + cant_open_file = get_msg(suppressWarnings({con<-file(tempfile()); open(con, 'r')})), + mixed_subscripts = get_msg(letters[-1:1]) +) + ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") @@ -977,7 +1013,7 @@ DT = data.table(a=1:5, b=6:10, c=11:15) test(327, within(DT,rm(a,b)), data.table(c=11:15)) test(328, within(DT,rm(b,c)), data.table(a=1:5)) test(329, within(DT,rm(b,a)), data.table(c=11:15)) -test(330, within(DT,rm(b,c,d)), data.table(a=1:5), warning="object 'd' not found") +test(330, within(DT,rm(b,c,d)), data.table(a=1:5), warning=base_messages$missing_object("d")) DT[,c("b","a")]=NULL test(332, DT, data.table(c=11:15)) test(333, within(DT,rm(c)), data.table(NULL)) @@ -1119,8 +1155,8 @@ test(378, cbind(), NULL) test(379, rbind(), NULL) DT = data.table(a=rep(1:3,1:3),b=1:6) -test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error="locked binding") # .SD locked for 1st group -test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error="locked 
binding") # .SD locked in 2nd group onwards too +test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error=base_messages$locked_binding(".SD")) # .SD locked for 1st group +test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error=base_messages$locked_binding(".SD")) # .SD locked in 2nd group onwards too # test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557. test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3))) @@ -1672,7 +1708,7 @@ test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't be g DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) -test(573, DT[,sum(v),by="b, a"], error="object ' a' not found") +test(573, DT[,sum(v),by="b, a"], error=base_messages$missing_object(" a")) # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) @@ -2036,7 +2072,7 @@ if (ncol(DT)==2L) setnames(DT,c("A","B")) # else don't stop under torture with s test(714, DT[,z:=6:10], data.table(A=1:5,B=5,z=6:10)) # Test J alias is now removed outside DT[...] from v1.8.7 (to resolve rJava::J conflict) -test(715, J(a=1:3,b=4), error="could not find function.*J") +test(715, J(a=1:3,b=4), error=base_messages$missing_function("J")) # Test get in j DT = data.table(a=1:3,b=4:6) @@ -3792,7 +3828,7 @@ test(1137.03, DT[, .SD, .SDcols=-"y"], DT[, c(1,3), with=FALSE]) test(1137.04, DT[, .SD, .SDcols=-c("y", "x")], DT[, 3, with=FALSE]) test(1137.05, DT[, .SD, .SDcols=-which(names(DT) %in% c("x", "y", "z"))], null.data.table()) test(1137.06, DT[, .SD, .SDcols=c(1, -2)], error=".SDcols is numeric but has both") -test(1137.07, DT[, .SD, .SDcols=c("x", -"y")], error="invalid argument to unary") +test(1137.07, DT[, .SD, .SDcols=c("x", -"y")], error=base_messages$invalid_arg_unary_operator) test(1137.08, DT[, .SD, .SDcols=c(-1, "x")], error="Some items of .SDcols are") DT <- data.table(x=1:5, y=6:10, z=11:15, zz=letters[1:5]) @@ -4535,8 +4571,7 @@ ix = with(DT, order(1-DT$x, decreasing=TRUE)) test(1251.07, DT[order(1-DT$x, decreasing=TRUE)], DT[ix]) test(1251.08, DT[order(x, list(-y), decreasing=TRUE)], error = "Column 2 is length 1 which differs from length of column 1.*10") -test(1251.09, DT[base::order(x, list(-y), decreasing=TRUE)], - error = "argument lengths differ") # data.table's error is more helpful than base's +test(1251.09, DT[base::order(x, list(-y), decreasing=TRUE)], error=base_messages$arg_length_mismatch) # data.table's error is more helpful than base's # more "edge cases" to ensure we're consistent with base test(1251.10, DT[order("a")], DT[1L]) test(1251.11, DT[order("b", "a")], DT[1L]) @@ -4915,7 +4950,7 @@ test(1290.34, DT[, names(DT) == "x", with=FALSE], as.data.table(ll[c(1,3,4)])) dt1 = data.table(a=character(0),b=numeric(0)) ans1 = data.table(a=character(0), b=numeric(0), c=numeric(0)) ans2 = data.table(a=character(0), b=numeric(0), c=numeric(0), d=integer(0)) -test(1291.1, dt1[, c:=max(b), by='a'], ans1, warning="no non-missing arguments to max") +test(1291.1, dt1[, c:=max(b), by='a'], ans1, warning=base_messages$empty_max) test(1291.2, dt1[, d := integer(0), by=a], ans2) # Bug #21 @@ -4955,7 +4990,7 @@ test(1294.02, dt[, a := 1.5]$a, rep(1L, 3L), test(1294.03, dt[, a := NA]$a, rep(NA_integer_, 3L)) test(1294.04, dt[, a := "a"]$a, rep(NA_integer_, 3L), warning=c("Coercing 
'character' RHS to 'integer'.*column 1 named 'a'", - "NAs introduced by coercion")) + base_messages$coerce_na)) test(1294.05, dt[, a := list(list(1))]$a, rep(1L, 3L), warning="Coercing 'list' RHS to 'integer' to match.*column 1 named 'a'") test(1294.06, dt[, a := list(1L)]$a, rep(1L, 3L)) @@ -4965,7 +5000,7 @@ test(1294.09, dt[, b := 1L]$b, rep(1,3)) test(1294.10, dt[, b := NA]$b, rep(NA_real_,3)) test(1294.11, dt[, b := "bla"]$b, rep(NA_real_, 3), warning=c("Coercing 'character' RHS to 'double' to match.*column 2 named 'b'", - "NAs introduced by coercion")) + base_messages$coerce_na)) test(1294.12, dt[, b := list(list(1))]$b, rep(1,3), warning="Coercing 'list' RHS to 'double' to match.*column 2 named 'b'") test(1294.13, dt[, b := TRUE]$b, rep(1,3)) @@ -9960,7 +9995,8 @@ test(1670.2, class(as.data.table(x)), class(x)[2:3]) # #1676, `:=` with by shouldn't add cols on supported types dt = data.table(x=1, y=2) -test(1671, dt[, z := sd, by=x], error="invalid type/length (closure/1)") +test(1671, dt[, z := sd, by=x], + error=gettextf("invalid type/length (%s/%d) in vector allocation", "closure", 1L, domain="R")) # 1683 DT <- data.table(V1 = rep(1:2, 3), V2 = 1:6) @@ -10327,7 +10363,8 @@ if (.Platform$OS.type=="unix") { test(1703.15, fread("."), error="File '.' is a directory. Not yet implemented.") # tmpdir argument d = tempfile("dir") -test(1703.16, fread(text=c('a,b','1,2'), tmpdir=d), error="cannot open the connection", warning="No such file or directory") +test(1703.16, fread(text=c('a,b','1,2'), tmpdir=d), + error=base_messages$cant_open_file, warning=base_messages$missing_file) dir.create(d) test(1703.17, fread(text=c('a,b','1,2'), tmpdir=d), data.table(a=1L,b=2L)) test(1703.18, fread(text=c('a,b','1,2')), data.table(a=1L, b=2L)) @@ -10394,8 +10431,8 @@ test(1722.2, DT[,(!is.na(as.numeric(FieldName)))], c(TRUE,TRUE,FALSE,TRUE,FALSE, test(1723.1, DT[removalIndex>0,rowId-(2*removalIndex-1)], c(-2,-11,-5,-14)) test(1723.2, DT[removalIndex>0,(rowId-(2*removalIndex-1))], c(-2,-11,-5,-14)) DT = data.table(FieldName = c("1", "2", "3", "four", "five", "6")) -test(1724.1, DT[, is.na(as.numeric(FieldName))], c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE), warning="NAs introduced by coercion") -test(1724.2, DT[, !is.na(as.numeric(FieldName))], c(TRUE,TRUE,TRUE,FALSE,FALSE,TRUE), warning="NAs introduced by coercion") +test(1724.1, DT[, is.na(as.numeric(FieldName))], c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE), warning=base_messages$coerce_na) +test(1724.2, DT[, !is.na(as.numeric(FieldName))], c(TRUE,TRUE,TRUE,FALSE,FALSE,TRUE), warning=base_messages$coerce_na) # Ensure NA's are added properly when a new column is added, not all the target rows are joined to, and the number of i # rows is equal or greater than the number of rows in the target table. 
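The hunks above replace hard-coded English strings such as "NAs introduced by coercion" with entries of a `base_messages` lookup, so the expected warnings and errors still match when R runs in a non-English locale. The lookup itself is defined elsewhere in tests.Rraw and is not part of this patch; the sketch below is only an editorial illustration of how such entries could be captured, reusing the `gettextf(..., domain="R")` idiom that test 1671 above now uses (the entry names and the domain are assumptions):

```r
# Editorial sketch only -- not the helper shipped in tests.Rraw.
base_messages = list(
  coerce_na      = gettext("NAs introduced by coercion", domain = "R"),
  missing_object = function(nm) gettextf("object '%s' not found", nm, domain = "R")
)
base_messages$missing_object(" a")  # in an English session: "object ' a' not found"
```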
@@ -10846,7 +10883,8 @@ test(1743.217, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor test(1743.218, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 4, 2, 3)), class), y = c(e = "character", d = "factor", b = "factor", c = "factor")) test(1743.22, fread("a,b,c\n1999/01/01,2,f", colClasses=list(Date=1L), drop="a"), data.table(b=2L, c="f")) -test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), warning="NAs introduced by coercion.*left as type 'character'") +test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), + warning=paste0(base_messages$coerce_na, ".*left as type 'character'")) test(1743.232, fread("a,b,c\n2,1,3+4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c=3+4i)) test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b"), drop="a"), data.table(b=2L, c="f")) test(1743.242, fread("a,b,c\n2,2,f", colClasses = c("integer", "integer", "factor"), drop="a"), data.table(b=2L, c=factor("f"))) @@ -10886,7 +10924,9 @@ test(1743.308, fread(data1743, colClasses=list(NULL=c("C","D")), drop=1:2), data test(1743.311, fread(data1743, colClasses="NULL"), ans<-data.table(A=1:2, B=3:4, C=5:6, D=7:8), warning="colClasses.*quoted.*interpreted as colClasses.*NULL") test(1743.312, fread(data1743, colClasses=character()), ans) test(1743.32, fread("A,B\na,0+1i", colClasses="complex"), data.table(A="a", B=1i), - warning="Column 'A' was requested to be 'complex'.*NAs introduced by coercion.*column has been left as.*character") + warning=paste0("Column 'A' was requested to be 'complex'.*", + base_messages$coerce_na, + ".*column has been left as.*character")) test(1743.33, fread(data1743, colClasses=list("character"=4, "numeric"=c(2,NA,1))), data.table(A=c(1,2), B=c(3,4), C=5:6, D=c("7","8")), warning="colClasses[[2]][2] is NA") test(1743.34, fread(data1743, select=list("character"=4, "numeric"=c(2,NA,1))), data.table(D=c("7","8"), B=c(3,4), A=c(1,2)), warning="colClasses[[2]][2] is NA") old = options(warn=2) @@ -11021,7 +11061,7 @@ test(1750.10, # groupingsets on aggregate using grouping col char type and sum - error test(1750.11, groupingsets(dt, j = lapply(.SD, sum), by = c("status","year"), sets=list(character()), .SDcols="color"), - error = "invalid 'type' (character) of argument" + error=base_messages$invalid_arg_sum("character") ) # groupingsets on aggregate using grouping col factor type and sum - error test(1750.12, @@ -11071,9 +11111,9 @@ test(1750.19, uniqueN({ ), 1L, warning = "'sets' contains a duplicate") # entries in `by` / `sets` not exists in data.table test(1750.20, exists("notexist"), FALSE) # https://github.com/Rdatatable/data.table/issues/3055#issuecomment-423364960 -test(1750.21, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","notexist"), sets=list(c("color"), character()), id=TRUE), error = "object 'notexist' not found") +test(1750.21, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","notexist"), sets=list(c("color"), character()), id=TRUE), error=base_messages$missing_object("notexist")) test(1750.22, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color"), "stat"), id=TRUE), error = "Columns used in 'sets' but not present in 'by': [stat]") -test(1750.23, groupingsets(dt, j = .(a=sum(notexist)), by = 
c("color","year","status"), sets=list(c("color"), character()), id=TRUE), error = "object 'notexist' not found") +test(1750.23, groupingsets(dt, j = .(a=sum(notexist)), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE), error=base_messages$missing_object("notexist")) # update by ref `:=` forbidden test(1750.24, groupingsets(dt, j = sum_value := sum(value), by = c("color","year","status"), sets=list(c("color"), character())), @@ -13038,7 +13078,7 @@ test(1923.2, indices(DT, vectors=TRUE), list(c("V1"))) DT = data.table(varname = 1) test(1924.1, DT[var_name==1], error='not found\\. Perhaps you intended.*varname') test(1924.2, DT[variable==1], error='Object.*not found among') -test(1924.3, DT[varname+'a'], error='non-numeric argument') +test(1924.3, DT[varname+'a'], error=base_messages$invalid_arg_binary_operator) DT[, VAR_NAME:=2] test(1924.4, DT[var_name==1], error="Object 'var_name' not found. Perhaps you intended [varname, VAR_NAME]") DT = setDT(lapply(integer(50), function(...) numeric(1L))) @@ -13201,10 +13241,10 @@ test(1948.14, DT[i, on = 1L], error = "'on' argument should be a named atomic ve # helpful error when on= is provided but not i, rather than silently ignoring on= DT = data.table(A=1:3) -test(1949.1, DT[,,on=A], error="object 'A' not found") # tests .1 to .4 amended after #3621 -test(1949.2, DT[,1,on=A], error="object 'A' not found") -test(1949.3, DT[on=A], error="object 'A' not found") -test(1949.4, DT[,on=A], error="object 'A' not found") +test(1949.1, DT[,,on=A], error=base_messages$missing_object("A")) # tests .1 to .4 amended after #3621 +test(1949.2, DT[,1,on=A], error=base_messages$missing_object("A")) +test(1949.3, DT[on=A], error=base_messages$missing_object("A")) +test(1949.4, DT[,on=A], error=base_messages$missing_object("A")) test(1949.5, DT[1,,with=FALSE], error="j must be provided when with=FALSE") test(1949.6, DT[], output="A.*1.*2.*3") # no error test(1949.7, DT[,], output="A.*1.*2.*3") # no error, #3163 @@ -13808,7 +13848,7 @@ test(1967.57, setnames(x), error = 'x has 2 columns but its names are length 0') names(x) = c('a', 'b') test(1967.58, names(setnames(x, new = c('b', 'c'))), c('b', 'c')) test(1967.59, setnames(x, 1:2, c(8L, 9L)), error = "'new' is not a character") -test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = "mixed.*negative") +test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = base_messages$mixed_subscripts) test(1967.61, setnames(x, 1+3i, 'cplx'), error = "'old' is type complex") test(1967.62, setnames(x, 1, c('d', 'e')), error = "'old' is length 1 but 'new'") test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10)) @@ -17209,11 +17249,11 @@ test(2158.2, DT[, by="index", list(value=list(value))], DT = data.table(x = 1) test(2159.01, typeof(as.matrix(DT)), "double") test(2159.02, typeof(as.matrix(DT[0L])), "double") -test(2159.03, min(DT[0L]), Inf, warning="missing") # R's warning message; use one word 'missing' to insulate from possible future changes to R's message +test(2159.03, min(DT[0L]), Inf, warning=base_messages$empty_min) DT = data.table(x = 1L) test(2159.04, typeof(as.matrix(DT)), "integer") test(2159.05, typeof(as.matrix(DT[0L])), "integer") -test(2159.06, min(DT[0L]), Inf, warning="missing") +test(2159.06, min(DT[0L]), Inf, warning=base_messages$empty_min) DT = data.table(x = TRUE) test(2159.07, typeof(as.matrix(DT)), "logical") test(2159.08, typeof(as.matrix(DT[0L])), "logical") @@ -17498,7 +17538,7 @@ iris.i <- 1 iris.num <- datasets::iris[iris.i, 1:4] iris.days <- 
data.table( day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) -test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning="NAs introduced by coercion") +test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning=base_messages$coerce_na) test(2183.62, melt(iris.days, measure.vars=measure(before=function(x)rep(4, length(x)), value.name, dim, sep=".")), error="number of unique groups after applying type conversion functions less than number of groups, change type conversion") test(2183.63, melt(iris.days, measure.vars=measure(before, value.name, dim, pattern="(day)[12][.](.*)[.](.*)")), error="number of unique column IDs =4 is less than number of melted columns =8; fix by changing pattern/sep") test(2183.64, melt(iris.days, measure.vars=measure(day=as.integer, value.name, dim, pattern="day(.)[.](.*)[.](.*)")), data.table(Species=factor("setosa"), day=as.integer(c(1,2,1,2)), dim=c("Length","Length","Width","Width"), Sepal=c(5.1,5.1,3.5,3.5), Petal=c(1.4,1.4,0.2,0.2))) From 9a4ace127c27388a0aa65a41e4ba709ea3b7c18b Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 13 May 2021 22:28:00 +0200 Subject: [PATCH 236/588] split.data.table minor changes (#3476) --- R/data.table.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 0d51beafff..b36c0599a3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2357,8 +2357,8 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR join = TRUE } dtq[["j"]] = substitute( - list(.ll.tech.split=list(.expr)), - list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ? + list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=".")), + list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) ) dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. .expr, @@ -2371,11 +2371,9 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR if (isTRUE(verbose)) catf("Processing split.data.table with: %s\n", deparse(dtq, width.cutoff=500L)) tmp = eval(dtq) # add names on list - setattr(ll <- tmp$.ll.tech.split, - "names", - as.character( - if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(lapply(.SD, as.character)), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split - )) + ll = tmp$.ll.tech.split + nm = tmp$.ll.tech.split.names + setattr(ll, "names", nm) # handle nested split if (flatten || length(by) == 1L) { for (x in ll) .Call(C_unlock, x) From 334d51e32b645edce231e75bfc51484560907105 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 13 May 2021 14:51:11 -0700 Subject: [PATCH 237/588] add a regression test for fixed dcast fun.aggregate issue (#4251) --- NEWS.md | 5 +++-- inst/tests/tests.Rraw | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index f1e5fc1789..e26cf65c1e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -135,6 +135,8 @@ 4. `cube(DT, by="a")` now gives a more helpful error that `j` is missing, [#4282](https://github.com/Rdatatable/data.table/pull/4282). +5. 
v1.13.0 (July 2020) fixed a segfault/corruption/error (depending on version of R and circumstances) in `dcast()` when `fun.aggregate` returned `NA` (type `logical`) in an otherwise `character` result, [#2394](https://github.com/Rdatatable/data.table/issues/2394). This fix was the result of other internal rework and there was no news item at the time. A new test to cover this case has now been added. Thanks Vadim Khotilovich for reporting, and Michael Chirico for investigating, pinpointing when the fix occurred and adding the test. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) @@ -340,7 +342,6 @@ has a better chance of working on Mac. 11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. 12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. - 13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). 14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. @@ -355,7 +356,7 @@ has a better chance of working on Mac. 19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. -20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). +20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. 
For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). ## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2857d7322f..a14dbe2928 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17577,3 +17577,10 @@ vr = "Species" IDT[, virginca := get(vr) == "virginica"] ans = data.table(round = c(3, 3, 3, 2, 2, 4, 2, 4), k = c(6, 7, 8, 5, 7, 7, 6, 8), kar = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("setosa", "versicolor", "virginica"), class = "factor"), N = c(24L, 14L, 4L, 1L, 1L, 1L, 3L, 2L)) test(2184.5, IDT[(virginca), .N, by = .(round(Sepal.Width), k = round(Sepal.Length), kar = get(vr))] , ans) + +# dcast() segfault or 'STRING_ELT() can only be applied to character not logical' fixed in v1.13.0, #2394 +agg = function(x) if(length(x) > 0) min(x) else NA +DT = data.table(id=c(1,1,2,2), x=c('y','y','y','z'), v=c('a','b','c','d')) +test(2185, dcast(DT, formula=id~x, fun.aggregate=agg, value.var='v'), + data.table(id=c(1,2), y=c('a','c'), z=c(NA,'d'), key="id")) + From 336187baa31083eb739acc5686a89fab1f0428fd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 13 May 2021 16:23:33 -0700 Subject: [PATCH 238/588] & --> && where appropriate, and some tidying of tests/CRAN_release (#4269) --- .dev/CRAN_Release.cmd | 9 ++++++++- R/between.R | 4 ++-- R/data.table.R | 5 +++-- R/merge.R | 33 ++++++++++++++++++++------------- inst/tests/other.Rraw | 6 ++---- inst/tests/tests.Rraw | 31 +++++++++++++++++++------------ 6 files changed, 54 insertions(+), 34 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 274b55a2dd..88f9f52752 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -157,11 +157,18 @@ grep -Enr "^[^#]*(?:\[|==|>|<|>=|<=|,|\(|\+)\s*[-]?[0-9]+[^0-9L:.e]" R | grep -E grep -Enr "\bifelse" R # No system.time in main tests.Rraw.
Timings should be in benchmark.Rraw -grep -n "system[.]time" ./inst/tests/tests.Rraw +grep -Fn "system.time" ./inst/tests/*.Rraw | grep -Fv "benchmark.Rraw" | grep -Fv "this system.time usage ok" + +# No tryCatch in *.Rraw -- tryCatch should be handled only in test() itself to avoid silently missed warnings/errors/output +grep -Fn "tryCatch" ./inst/tests/*.Rraw # All % in *.Rd should be escaped otherwise text gets silently chopped grep -n "[^\]%" ./man/*.Rd +# if (a & b) is either invalid or inefficient (ditto for replace & with |); +# if(any(a [&|] b)) is appropriate b/c of collapsing the logical vector to scalar +grep -nr "^[^#]*if[^&#]*[^&#\"][&][^&]" R | grep -Ev "if\s*[(](?:any|all)" + # seal leak potential where two unprotected API calls are passed to the same # function call, usually involving install() or mkChar() # Greppable thanks to single lines and wide screens diff --git a/R/between.R b/R/between.R index c9ca8d0429..61fee332b4 100644 --- a/R/between.R +++ b/R/between.R @@ -47,8 +47,8 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) if (isTRUE(getOption("datatable.verbose"))) catf("optimised between not available for this data type, fallback to slow R routine\n") if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stop("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") if (check && any(lower>upper, na.rm=TRUE)) stop("Some lower>upper for this non-numeric and non-character type") - if (incbounds) x>=lower & x<=upper - else x>lower & x=lower & x<=upper # this & is correct not && + else x> lower & x< upper } } diff --git a/R/data.table.R b/R/data.table.R index b36c0599a3..fd0e9fd1cd 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1638,7 +1638,8 @@ replace_dot_alias = function(e) { jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__) idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) - if (any(idx)) jn__[idx & (jn__ == "")] = "I" + if (any(idx)) + jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && jvnames = c(jvnames, jn__) jsubl[[i_]] = jl__ } @@ -2554,7 +2555,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { } } } - if (any(w <- new==names(x)[i] & Encoding(new)==Encoding(names(x)[i]))) { + if (any(w <- new==names(x)[i] & Encoding(new)==Encoding(names(x)[i]))) { # this & is correct not && w = which(!w) new = new[w] i = i[w] diff --git a/R/merge.R b/R/merge.R index 3dc4389965..8dc59e018b 100644 --- a/R/merge.R +++ b/R/merge.R @@ -11,10 +11,17 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = key(x) } } - x0 = length(x)==0L; y0 = length(y)==0L - if (x0 || y0) warning("You are trying to join data.tables where ", if(x0 && y0) "arguments 'x' and 'y' have" else if(x0) "argument 'x' has" else "argument 'y' has", " no columns.") - if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.") - if (any(duplicated(names(y)))) stop("y has some duplicated column name(s): ",paste(names(y)[duplicated(names(y))],collapse=","),". 
Please remove or rename the duplicate(s) and try again.") + x0 = length(x)==0L + y0 = length(y)==0L + if (x0 || y0) warning(sprintf(ngettext(x0+y0, + "You are trying to join data.tables where %s has 0 columns.", + "You are trying to join data.tables where %s have 0 columns."), + if (x0 && y0) "'x' and 'y'" else if (x0) "'x'" else "'y'" + )) + nm_x = names(x) + nm_y = names(y) + if (anyDuplicated(nm_x)) stop(gettextf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "x", brackify(nm_x[duplicated(nm_x)]))) + if (anyDuplicated(nm_y)) stop(gettextf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "y", brackify(nm_y[duplicated(nm_y)]))) ## set up 'by'/'by.x'/'by.y' if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) ) @@ -22,11 +29,11 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!missing(by) && !missing(by.x)) warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") if (!is.null(by.x)) { - if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) + if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y)) stop("A non-empty vector of column names is required for `by.x` and `by.y`.") - if (!all(by.x %chin% names(x))) + if (!all(by.x %chin% nm_x)) stop("Elements listed in `by.x` must be valid column names in x.") - if (!all(by.y %chin% names(y))) + if (!all(by.y %chin% nm_y)) stop("Elements listed in `by.y` must be valid column names in y.") by = by.x names(by) = by.y @@ -36,10 +43,10 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (is.null(by)) by = key(x) if (is.null(by)) - by = intersect(names(x), names(y)) + by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) stop("A non-empty vector of column names for `by` is required.") - if (!all(by %chin% intersect(colnames(x), colnames(y)))) + if (!all(by %chin% intersect(nm_x, nm_y))) stop("Elements listed in `by` must be valid column names in x and y") by = unname(by) by.x = by.y = by @@ -48,8 +55,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL ## sidestep the auto-increment column number feature-leading-to-bug by ## ensuring no names end in ".1", see unit test ## "merge and auto-increment columns in y[x]" in test-data.frame.like.R - start = setdiff(names(x), by.x) - end = setdiff(names(y), by.y) + start = setdiff(nm_x, by.x) + end = setdiff(nm_y, by.y) dupnames = intersect(start, end) if (length(dupnames)) { start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) @@ -69,7 +76,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) { yy = y[missingyidx] - othercolsx = setdiff(names(x), by) + othercolsx = setdiff(nm_x, by) if (length(othercolsx)) { tmp = rep.int(NA_integer_, length(missingyidx)) # TO DO: use set() here instead.. @@ -81,7 +88,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. 
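The rewritten 0-column warning above picks the singular or plural wording at run time with `ngettext()`. The following standalone sketch (not part of the patch, with a hypothetical helper name) shows how that call resolves for the two cases exercised by tests 1601.2-1601.4 below:

```r
# Editorial sketch of the ngettext() pluralisation used in merge.data.table() above.
zero_col_msg = function(x0, y0) sprintf(
  ngettext(x0 + y0,
    "You are trying to join data.tables where %s has 0 columns.",
    "You are trying to join data.tables where %s have 0 columns."),
  if (x0 && y0) "'x' and 'y'" else if (x0) "'x'" else "'y'")
zero_col_msg(x0 = FALSE, y0 = TRUE)  # "... where 'y' has 0 columns."
zero_col_msg(x0 = TRUE,  y0 = TRUE)  # "... where 'x' and 'y' have 0 columns."
```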
- newend = setdiff(names(y), by.y) + newend = setdiff(nm_y, by.y) # fix for #1290, make sure by.y order is set properly before naming setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) setnames(dt, c(by.x, start, end)) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 1bd91286f9..03d62b4389 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -186,8 +186,6 @@ if (loaded[["parallel"]]) { } # example(":=", local=TRUE) triggered cedta==FALSE and then error, #2972 -res = tryCatch(example(':=', package='data.table', local=TRUE)) -test(14.1, !inherits(res, 'error')) -res = tryCatch(example('CJ', package='data.table', local=TRUE)) -test(14.2, !inherits(res, 'error')) +test(14.1, {example(':=', package='data.table', local=TRUE); TRUE}) +test(14.2, {example('CJ', package='data.table', local=TRUE); TRUE}) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a14dbe2928..f87a2363d8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2361,8 +2361,8 @@ test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda")) # setcolorder and merge check for dup column names, #2193(ii) setnames(DT2,"b","a") test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): a. Please remove or rename") -test(829, merge(DT1,DT2), error="y has some duplicated column name(s): a. Please remove or rename") -test(830, merge(DT2,DT1), error="x has some duplicated column name(s): a. Please remove or rename") +test(829, merge(DT1,DT2), error="y has some duplicated column name(s): [a]. Please remove or rename") +test(830, merge(DT2,DT1), error="x has some duplicated column name(s): [a]. Please remove or rename") # attribs such as "comments" should be retained, #2270 DT1 <- data.table(id = seq.int(1, 10), A = LETTERS[1:10], key = "id") @@ -6937,13 +6937,12 @@ test(1486.1, as.data.frame(ans1.1), as.data.frame(ans1.2)) test(1486.2, as.data.frame(ans2.1), as.data.frame(ans2.1)) # Fix for #832 -x <- matrix(1:9, ncol=3) -setattr(x, "names", paste("V", seq_len(length(x)), sep = "")) +x <- matrix(1:9, ncol=3L) +setattr(x, "names", paste0("V", seq_along(x))) test(1487.1, setattr(x, "class", c("data.table", "data.frame")), error="Internal structure doesn't seem to be a list") -x <- matrix(1:9, ncol=3) +x <- matrix(1:9, ncol=3L) class(x) = c("data.table", "data.frame") -# not sure how to test this one, so using `tryCatch` -test(1487.2, tryCatch(print(x), error=function(k) "bla"), "bla") +test(1487.2, print(x), error="dim.data.table expects a data.table as input") # Fix for #1043 DT = data.table(grp=LETTERS[1:2], categ=rep(c("X","Y"), each=2L), condition=rep(c("P","Q"), each=4L), value=sample(8)) @@ -8361,10 +8360,18 @@ DT2 = data.table(id1=c("c", "w", "b"), val=50:52) test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla")) # warn when merge empty data.table #597 -test(1601.1, merge(data.table(a=1),data.table(a=1), by="a"), data.table(a=1, key="a")) -test(1601.2, tryCatch(merge(data.table(a=1),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where argument 'y' has no columns.") -test(1601.3, tryCatch(merge(data.table(NULL),data.table(a=1), by="a"), warning = function(w) w$message), "You are trying to join data.tables where argument 'x' has no columns.") -test(1601.4, tryCatch(merge(data.table(NULL),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where arguments 'x' and 'y' have no columns.") +DT0 
= data.table(NULL) +DT1 = data.table(a=1) +test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a")) +test(1601.2, merge(DT1, DT0, by="a"), + warning="You are trying to join data.tables where 'y' has 0 columns.", + error="Elements listed in `by`") +test(1601.3, merge(DT0, DT1, by="a"), + warning="You are trying to join data.tables where 'x' has 0 columns.", + error="Elements listed in `by`") +test(1601.4, merge(DT0, DT0, by="a"), + warning="You are trying to join data.tables where 'x' and 'y' have 0 columns.", + error="Elements listed in `by`") # fix for #1549 d1 <- data.table(v1=1:2,x=x) @@ -10338,7 +10345,7 @@ if (.Platform$OS.type=="unix") { options(datatable.fread.input.cmd.message = NULL) # when option is missing as it is by default, then TRUE test(1703.02, fread(cmd), ans, message="security concern") options(datatable.fread.input.cmd.message = FALSE) - test(1703.03, tryCatch(fread(cmd), message=stop), ans) + test(1703.03, fread(cmd), ans) options(datatable.fread.input.cmd.message = NULL) test(1703.04, fread(cmd=cmd), ans) test(1703.05, fread(file=cmd), error=sprintf("File '%s' does not exist", cmd)) From 2d88099a63ed46cf9ca8b63127665c194b1a7f31 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Fri, 14 May 2021 08:43:59 +0800 Subject: [PATCH 239/588] need to call as.POSIXct() with tz after "TZ" ENV VAR gets changed (#4261) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f87a2363d8..7959217d97 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16713,7 +16713,7 @@ t0 = as.POSIXct('2019-10-01') test(2124.1, format(as.ITime(t0)), '00:00:00') test(2124.2, format(as.IDate(t0)), '2019-10-01') if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) -# careful to unset because TZ="" means UTC whereas unset TZ means local +# careful to unset because TZ="" means UTC whereas unset TZ means local, #4261 and #4464 # trunc.cols in print.data.table, #4074 old_width = options("width" = 40L) From 2e68d74b6e3ecf0e954cbbf730fdaecad5fddb85 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 13 May 2021 18:55:56 -0700 Subject: [PATCH 240/588] detect new edge case in auto-naming j (#4275) --- NEWS.md | 2 ++ R/data.table.R | 13 ++++++++++--- inst/tests/tests.Rraw | 11 ++++++++--- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index e26cf65c1e..fe861cebcf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -86,6 +86,8 @@ out_col_name = "sum_x" )] ``` + +11. `DT[, if (...) .(a=1L) else .(a=1L, b=2L), by=group]` now returns a 1-column result with warning `j may not evaluate to the same number of columns for each group`, rather than error `'names' attribute [2] must be the same length as the vector`, [#4274](https://github.com/Rdatatable/data.table/issues/4274). Thanks to @robitalec for reporting, and Michael Chirico for the PR. ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index fd0e9fd1cd..3a3cf1f29d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -921,8 +921,12 @@ replace_dot_alias = function(e) { if (is.name(thisq)) nm[jj] = drop_dot(thisq) # TO DO: if call to a[1] for example, then call it 'a' too } - if (!is.null(jvnames) && any(idx <- nm != jvnames)) - warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. 
= FALSE) + if (!is.null(jvnames)) { + if (length(nm) != length(jvnames)) + warning("j may not evaluate to the same number of columns for each group; if you're sure this warning is in error, please put the branching logic outside of [ for efficiency") + else if (any(idx <- nm != jvnames)) + warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) + } jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result. } @@ -1367,7 +1371,10 @@ replace_dot_alias = function(e) { setattr(jval,"names",NULL) # discard names of named vectors otherwise each cell in the column would have a name jval = list(jval) } - if (!is.null(jvnames) && !all(jvnames=="")) setattr(jval, 'names', jvnames) # e.g. jvnames=="N" for DT[,.N,] + if (!is.null(jvnames) && any(nzchar(jvnames))) { + if (length(jvnames) > length(jval)) jvnames = jvnames[seq_along(jval)] #4274 + setattr(jval, 'names', jvnames[seq_along(jval)]) # e.g. jvnames=="N" for DT[,.N,] + } jval = as.data.table.list(jval, .named=NULL) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7959217d97..3001616f90 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3058,7 +3058,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) # na.rm=TRUE with list column value, PR#4737 test(1035.016, melt(data.table(a1=1, b1=list(1:2), b2=list(c('foo','bar'))), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=list(1:2))) test(1035.017, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=1))#this worked even before the PR. - + ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] test(1035.02, melt(DT, id.vars=c("i_1", "i_2", "l_2"), measure.vars=c("l_1")), ans1) @@ -10890,7 +10890,7 @@ test(1743.217, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor test(1743.218, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 4, 2, 3)), class), y = c(e = "character", d = "factor", b = "factor", c = "factor")) test(1743.22, fread("a,b,c\n1999/01/01,2,f", colClasses=list(Date=1L), drop="a"), data.table(b=2L, c="f")) -test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), +test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), warning=paste0(base_messages$coerce_na, ".*left as type 'character'")) test(1743.232, fread("a,b,c\n2,1,3+4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c=3+4i)) test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b"), drop="a"), data.table(b=2L, c="f")) @@ -17531,7 +17531,7 @@ test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", " # measure with pattern= test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... 
argument to measure must be a function with at least one argument, problem: istr") -test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: istr") test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") @@ -17591,3 +17591,8 @@ DT = data.table(id=c(1,1,2,2), x=c('y','y','y','z'), v=c('a','b','c','d')) test(2185, dcast(DT, formula=id~x, fun.aggregate=agg, value.var='v'), data.table(id=c(1,2), y=c('a','c'), z=c(NA,'d'), key="id")) +# compatible branches might seem incompatible if the condition is global, #4274 +DT = data.table(a=1L) +test(2186, DT[, if (TRUE) .(a=1L) else .(a=1L, b=2L)], DT, + warning='j may not evaluate to the same number of columns for each group') + From a83de0921f1e8386eb98753ac466722f0a1e60e4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 13 May 2021 19:52:05 -0700 Subject: [PATCH 241/588] .checkTypos drops dependency on English locale (#4990) --- NEWS.md | 2 ++ R/data.table.R | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index fe861cebcf..f5badb48df 100644 --- a/NEWS.md +++ b/NEWS.md @@ -89,6 +89,8 @@ 11. `DT[, if (...) .(a=1L) else .(a=1L, b=2L), by=group]` now returns a 1-column result with warning `j may not evaluate to the same number of columns for each group`, rather than error `'names' attribute [2] must be the same length as the vector`, [#4274](https://github.com/Rdatatable/data.table/issues/4274). Thanks to @robitalec for reporting, and Michael Chirico for the PR. +12. Typo checking in `i` available since 1.11.4 is extended to work in non-English sessions, [#4989](https://github.com/Rdatatable/data.table/issues/4989). Thanks to Michael Chirico for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
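The `.checkTypos()` rework below regenerates R's "object not found" message pattern at run time instead of grepping for the English wording, so the near-match suggestion keeps working in translated sessions. The user-visible behaviour being preserved is the one already covered by tests 1924.1 and 1924.4 earlier in this series; roughly (illustrative session, output abbreviated):

```r
# Illustrative only; the exact wording comes from tests 1924.1/1924.4 above.
DT = data.table(varname = 1)
DT[var_name == 1]
# Error: Object 'var_name' not found. Perhaps you intended [varname]
```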
diff --git a/R/data.table.R b/R/data.table.R index 3a3cf1f29d..c070f7dcb6 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -108,8 +108,25 @@ replace_dot_alias = function(e) { } .checkTypos = function(err, ref) { - if (grepl('object.*not found', err$message)) { - used = gsub(".*object '([^']+)'.*", "\\1", err$message) + # a slightly wonky workaround so that this still works in non-English sessions, #4989 + # generate this at run time (as opposed to e.g. onAttach) since session language is + # technically OK to update (though this should be rare), and since it's low-cost + # to do so here because we're about to error anyway. + missing_obj_fmt = gsub( + "'missing_datatable_variable____'", + "'(?[^']+)'", + tryCatch(eval(parse(text="missing_datatable_variable____")), error=identity)$message + # eval(parse()) to avoid "no visible binding for global variable" note from R CMD check + # names starting with _ don't parse, so no leading _ in the name + ) + idx <- regexpr(missing_obj_fmt, err$message, perl=TRUE) + if (idx > 0L) { + start = attr(idx, "capture.start", exact=TRUE)[ , "obj_name"] + used = substr( + err$message, + start, + start + attr(idx, "capture.length", exact=TRUE)[ , "obj_name"] - 1L + ) found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE) if (length(found)) { stop("Object '", used, "' not found. Perhaps you intended ", brackify(found)) From 98474454bbe9af3d96b8eec5aa44f2946b44159d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 13 May 2021 23:22:20 -0700 Subject: [PATCH 242/588] print(col.names="none") more robust (#4271) --- NEWS.md | 2 ++ R/print.data.table.R | 7 ++++--- inst/tests/tests.Rraw | 4 ++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index f5badb48df..ae047be820 100644 --- a/NEWS.md +++ b/NEWS.md @@ -121,6 +121,8 @@ 14. `by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing. +15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/R/print.data.table.R b/R/print.data.table.R index 4f2ab7bf0e..4e666ca22e 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -114,7 +114,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) rownames(toprint) = format(rownames(toprint), justify="right") if (col.names == "none") { - cut_top(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote)) } else { print(toprint, right=TRUE, quote=quote) } @@ -129,7 +129,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # option to shut this off per request of Oleg Bondar on SO, #1482 toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 if (col.names == "none") { - cut_top(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote)) } else { print(toprint, right=TRUE, quote=quote) } @@ -192,7 +192,8 @@ shouldPrint = function(x) { # for removing the head (column names) of matrix output entirely, # as opposed to printing a blank line, for excluding col.names per PR #1483 -cut_top = function(x) writeLines(capture.output(x)[-1L]) +# be sure to remove colnames from any row where they exist, #4270 +cut_colnames = function(x) writeLines(grep("^\\s*(?:[0-9]+:|---)", capture.output(x), value=TRUE)) # for printing the dims for list columns #3671; used by format.data.table() paste_dims = function(x) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3001616f90..9f2cafb3c5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17596,3 +17596,7 @@ DT = data.table(a=1L) test(2186, DT[, if (TRUE) .(a=1L) else .(a=1L, b=2L)], DT, warning='j may not evaluate to the same number of columns for each group') +# col.names='none' should apply when wrapping too, #4270 +DT = setDT(replicate(getOption('width'), 1, simplify = FALSE)) +test(2187, {print(DT, col.names='none'); TRUE}, notOutput="V") + From 03dff9121f74f7c638d08619e8b6d2185c41cb98 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Sat, 15 May 2021 06:21:55 +0800 Subject: [PATCH 243/588] fifelse() coerces NA to other types and supports vectorized na argument (#4289) --- NEWS.md | 2 + inst/tests/tests.Rraw | 24 ++++- man/fifelse.Rd | 4 +- src/fifelse.c | 205 ++++++++++++++++++++++++++---------------- 4 files changed, 155 insertions(+), 80 deletions(-) diff --git a/NEWS.md b/NEWS.md index ae047be820..b80eb35c63 100644 --- a/NEWS.md +++ b/NEWS.md @@ -91,6 +91,8 @@ 12. Typo checking in `i` available since 1.11.4 is extended to work in non-English sessions, [#4989](https://github.com/Rdatatable/data.table/issues/4989). Thanks to Michael Chirico for the PR. +13. `fifelse()` now coerces logical `NA` to other types and the `na` argument supports vectorized input, [#4277](https://github.com/Rdatatable/data.table/issues/4277) [#4286](https://github.com/Rdatatable/data.table/issues/4286) [#4287](https://github.com/Rdatatable/data.table/issues/4287). Thanks to @michaelchirico and @shrektan for reporting, and @shrektan for implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. 
An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f2cafb3c5..48c502581e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15859,7 +15859,7 @@ test(2072.009, fifelse(test_vec, rep(1L,11L), rep(0L,10L)), error="Length o test(2072.010, fifelse(test_vec, rep(1,10L), rep(0,11L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).") test(2072.011, fifelse(test_vec, rep(TRUE,10L), rep(FALSE,10L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).") test(2072.012, fifelse(0:1, rep(TRUE,2L), rep(FALSE,2L)), error="Argument 'test' must be logical.") -test(2072.013, fifelse(test_vec, TRUE, "FALSE"), error="'yes' is of type logical but 'no' is of type character. Please") +test(2072.013, fifelse(test_vec, TRUE, "FALSE"), error="'no' is of type character but 'yes' is logical. Please") test(2072.014, fifelse(test_vec, list(1),list(2,4)), error="Length of 'no' is 2 but must be 1 or length of 'test' (11).") test(2072.015, fifelse(test_vec, list(1,3),list(2,4)), error="Length of 'yes' is 2 but must be 1 or length of 'test' (11).") test(2072.016, fifelse(test_vec, list(1), list(0)), as.list(as.numeric(out_vec))) @@ -15885,7 +15885,7 @@ test(2072.031, fifelse(test_vec_na, "1", rep("0",12L)), as.character(out_vec_na) test(2072.032, fifelse(test_vec_na, rep("1",12L), "0"), as.character(out_vec_na)) test(2072.033, fifelse(test_vec_na, rep("1",12L), rep("0",12L)), as.character(out_vec_na)) test(2072.034, fifelse(test_vec_na, "1", "0"), as.character(out_vec_na)) -test(2072.035, fifelse(test_vec, as.Date("2011-01-01"), FALSE), error="'yes' is of type double but 'no' is of type logical. Please") +test(2072.035, fifelse(test_vec, as.Date("2011-01-01"), FALSE), error="'no' is of type logical but 'yes' is double. Please") test(2072.036, fifelse(test_vec_na, 1+0i, 0+0i), as.complex(out_vec_na)) test(2072.037, fifelse(test_vec_na, rep(1+0i,12L), 0+0i), as.complex(out_vec_na)) test(2072.038, fifelse(test_vec_na, rep(1+0i,12L), rep(0+0i,12L)), as.complex(out_vec_na)) @@ -16322,7 +16322,7 @@ test(2100.03, fifelse(test_vec_na, TRUE, FALSE, TRUE), as.logical(out_vec_na)) test(2100.04, fifelse(test_vec_na, "1", "0","2"), as.character(out_vec_na)) test(2100.05, fifelse(test_vec_na, 1+0i, 0+0i, 2+0i), as.complex(out_vec_na)) test(2100.06, fifelse(c(TRUE,FALSE,NA), list(1:5), list(5:1), list(15:11)), list(1:5,5:1,15:11)) -test(2100.07, fifelse(test_vec_na, 1, 0, 2L), error = "'yes' is of type double but 'na' is of type integer. Please make sure that both arguments have the same type.") +test(2100.07, fifelse(test_vec_na, 1, 0, 2L), c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2)) # corece na test(2100.08, fifelse(test_vec_na, 1, 0, c(2,3)), error = "Length of 'na' is 2 but must be 1") test(2100.09, fifelse(date_vec_na, as.Date("2019-08-31"), as.Date("2019-08-30"), as.Date("2019-08-29")), as.Date(c(18139, 18138, 18138, 18138, 18138, 18137), origin = '1970-01-01')) test(2100.10, fifelse(date_vec_na, as.Date("2019-08-31"), as.Date("2019-08-30"), 18137), error = "'yes' has different class than 'na'. 
Please make sure that both arguments have the same class.") @@ -17600,3 +17600,21 @@ test(2186, DT[, if (TRUE) .(a=1L) else .(a=1L, b=2L)], DT, DT = setDT(replicate(getOption('width'), 1, simplify = FALSE)) test(2187, {print(DT, col.names='none'); TRUE}, notOutput="V") +# fifelse now supports vector na arguments and coerces NA to other types, PR#4289 +test(2188.01, fifelse(c(TRUE, FALSE, TRUE, NA), 1L, 2L, 1.0), c(1, 2, 1, 1)) +test(2188.02, fifelse(c(TRUE, FALSE, TRUE, NA), 1, 2, 1L), c(1, 2, 1, 1)) +test(2188.03, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, 11:14, 101:104), c(1L, 12L, 3L, 104L)) +test(2188.04, fifelse(c(TRUE, FALSE, TRUE, NA), NA, 11:14, 101:104), c(NA, 12L, NA, 104L)) +test(2188.05, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, NA, 101:104), c(1L, NA, 3L, 104L)) +test(2188.06, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, 11:14, NA), c(1L, 12L, 3L, NA)) +test(2188.07, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, NA, NA), c(1L, NA, 3L, NA)) +test(2188.08, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, NA), c(NA, NA, NA, NA)) +test(2188.09, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, NA_character_), rep(NA_character_, 4L)) +test(2188.10, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, 101:104), c(NA, NA, NA, 104L)) +test(2188.11, fifelse(c(TRUE, FALSE, TRUE, NA), NA, 11:14, NA), c(NA, 12L, NA, NA)) +test(2188.12, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, as.Date("2020-01-01")), as.Date(c(NA, NA, NA, "2020-01-01"))) +test(2188.13, fifelse(TRUE, 1L, 2.0, "a"), error="'na' is of type character but 'no' is double. Please") # smart error message +test(2188.14, fifelse(TRUE, NA, 2, as.Date("2019-07-07")), error="'no' has different class than 'na'. Please") +test(2188.15, fifelse(TRUE, NA, factor('a'), factor('a', levels = c('a','b'))), error="'no' and 'na' are both type factor but their levels are different") +test(2188.16, fifelse(c(NA, NA), 1L, 2L, NULL), c(NA_integer_, NA_integer_)) # NULL `na` is treated as NA + diff --git a/man/fifelse.Rd b/man/fifelse.Rd index 2fe355c98c..4165dd796d 100644 --- a/man/fifelse.Rd +++ b/man/fifelse.Rd @@ -11,10 +11,10 @@ \arguments{ \item{test}{ A logical vector. } \item{yes, no}{ Values to return depending on \code{TRUE}/\code{FALSE} element of \code{test}. They must be the same type and be either length \code{1} or the same length of \code{test}. } - \item{na}{ Value to return if an element of \code{test} is \code{NA}. It must be the same type as \code{yes} and \code{no} and length \code{1}. Default value \code{NA}. \code{NULL} is treated as \code{NA}. } + \item{na}{ Value to return if an element of \code{test} is \code{NA}. It must be the same type as \code{yes} and \code{no} and its length must be either \code{1} or the same length of \code{test}. Default value \code{NA}. \code{NULL} is treated as \code{NA}. } } \details{ -In contrast to \code{\link[base]{ifelse}} attributes are copied from \code{yes} to the output. This is useful when returning \code{Date}, \code{factor} or other classes. +In contrast to \code{\link[base]{ifelse}} attributes are copied from the first non-\code{NA} argument to the output. This is useful when returning \code{Date}, \code{factor} or other classes. } \value{ A vector of the same length as \code{test} and attributes as \code{yes}. Data values are taken from the values of \code{yes} and \code{no}, eventually \code{na}. 
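Before the C changes below, it may help to see the behaviour the new 2188.x tests above pin down: a logical `NA` supplied for any of `yes`/`no`/`na` is now coerced to the type of the other arguments, and `na` may be a full-length vector. An illustrative sketch (values taken from tests 2100.07 and 2188.04 above, not a new example from the patch):

```r
# Illustrative usage of the new fifelse() behaviour described in the NEWS entry above.
fifelse(c(TRUE, FALSE, NA), 1, 0, 2L)            # na=2L coerced to double: 1 0 2
fifelse(c(TRUE, FALSE, NA), NA, 11:13, 101:103)  # logical NA coerced to integer: NA 12 103
```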
diff --git a/src/fifelse.c b/src/fifelse.c index 5218cf7b4f..e5d22a1eea 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -11,127 +11,182 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) { const int64_t len0 = xlength(l); const int64_t len1 = xlength(a); const int64_t len2 = xlength(b); + const int64_t len3 = xlength(na); SEXPTYPE ta = TYPEOF(a); SEXPTYPE tb = TYPEOF(b); - int nprotect = 0; + SEXPTYPE tn = TYPEOF(na); + // na_a/b/n means a scalar NA (or NULL for the na argument), which is considered to be coerced into other types + bool na_a = len1==1 && ta==LGLSXP && LOGICAL(a)[0]==NA_LOGICAL; + bool na_b = len2==1 && tb==LGLSXP && LOGICAL(b)[0]==NA_LOGICAL; + bool na_n = isNull(na) || (len3==1 && tn==LGLSXP && LOGICAL(na)[0]==NA_LOGICAL); - if (ta != tb) { - if (ta == INTSXP && tb == REALSXP) { - SEXP tmp = PROTECT(coerceVector(a, REALSXP)); nprotect++; - a = tmp; + if (!na_a && len1!=1 && len1!=len0) + error(_("Length of 'yes' is %"PRId64" but must be 1 or length of 'test' (%"PRId64")."), len1, len0); + if (!na_b && len2!=1 && len2!=len0) + error(_("Length of 'no' is %"PRId64" but must be 1 or length of 'test' (%"PRId64")."), len2, len0); + if (!na_n && len3!=1 && len3!=len0) + error(_("Length of 'na' is %"PRId64" but must be 1 or length of 'test' (%"PRId64")."), len3, len0); + + int nprotect = 0; + SEXPTYPE tans = !na_a ? ta : !na_b ? tb : !na_n ? tn : LGLSXP; + if (!(na_a && na_b && na_n)) { + SEXPTYPE ta0 = ta, tb0 = tb, tn0 = tn; // record the original type for error message use + if (!na_b && tans==INTSXP && tb==REALSXP) tans = tb; + if (!na_n && tans==INTSXP && tn==REALSXP) tans = tn; + if (!na_a && tans==REALSXP && ta==INTSXP) { + a = PROTECT(coerceVector(a, REALSXP)); nprotect++; ta = REALSXP; - } else if (ta == REALSXP && tb == INTSXP) { - SEXP tmp = PROTECT(coerceVector(b, REALSXP)); nprotect++; - b = tmp; + } + // it's not possible that non-NA `yes`' type will be different from `tans` + if (!na_b && tans==REALSXP && tb==INTSXP) { + b = PROTECT(coerceVector(b, REALSXP)); nprotect++; tb = REALSXP; - } else { - error(_("'yes' is of type %s but 'no' is of type %s. Please make sure that both arguments have the same type."), type2char(ta), type2char(tb)); } + if (!na_b && tans != tb) + error(_("'no' is of type %s but '%s' is %s. Please make all arguments have the same type."), type2char(tb0), tans==ta0 ? "yes" : "na", tans==ta0 ? type2char(ta0) : type2char(tn0)); + if (!na_n && tans==REALSXP && tn==INTSXP) { + na = PROTECT(coerceVector(na, REALSXP)); nprotect++; + tn = REALSXP; + } + if (!na_n && tans != tn) + error(_("'na' is of type %s but '%s' is %s. Please make all arguments have the same type."), type2char(tn0), tans==ta0 ? "yes" : "no", tans==ta0 ? type2char(ta0) : type2char(tb0)); } - - if (!R_compute_identical(PROTECT(getAttrib(a,R_ClassSymbol)), PROTECT(getAttrib(b,R_ClassSymbol)), 0)) - error(_("'yes' has different class than 'no'. Please make sure that both arguments have the same class.")); - UNPROTECT(2); - - if (isFactor(a)) { - if (!R_compute_identical(PROTECT(getAttrib(a,R_LevelsSymbol)), PROTECT(getAttrib(b,R_LevelsSymbol)), 0)) - error(_("'yes' and 'no' are both type factor but their levels are different.")); + + if (!na_a && !na_b) { + if (!R_compute_identical(PROTECT(getAttrib(a,R_ClassSymbol)), PROTECT(getAttrib(b,R_ClassSymbol)), 0)) + error(_("'yes' has different class than 'no'. 
Please make sure that both arguments have the same class.")); + UNPROTECT(2); + } + if (!na_a && !na_n) { + if (!R_compute_identical(PROTECT(getAttrib(a,R_ClassSymbol)), PROTECT(getAttrib(na,R_ClassSymbol)), 0)) + error(_("'yes' has different class than 'na'. Please make sure that both arguments have the same class.")); + UNPROTECT(2); + } + if (!na_b && !na_n) { + if (!R_compute_identical(PROTECT(getAttrib(b,R_ClassSymbol)), PROTECT(getAttrib(na,R_ClassSymbol)), 0)) + error(_("'no' has different class than 'na'. Please make sure that both arguments have the same class.")); UNPROTECT(2); } + + if (isFactor(a) || isFactor(b)) { + if (!na_a && !na_b) { + if (!R_compute_identical(PROTECT(getAttrib(a,R_LevelsSymbol)), PROTECT(getAttrib(b,R_LevelsSymbol)), 0)) + error(_("'yes' and 'no' are both type factor but their levels are different.")); + UNPROTECT(2); + } + if (!na_a && !na_n) { + if (!R_compute_identical(PROTECT(getAttrib(a,R_LevelsSymbol)), PROTECT(getAttrib(na,R_LevelsSymbol)), 0)) + error(_("'yes' and 'na' are both type factor but their levels are different.")); + UNPROTECT(2); + } + if (!na_b && !na_n) { + if (!R_compute_identical(PROTECT(getAttrib(b,R_LevelsSymbol)), PROTECT(getAttrib(na,R_LevelsSymbol)), 0)) + error(_("'no' and 'na' are both type factor but their levels are different.")); + UNPROTECT(2); + } + } - if (len1!=1 && len1!=len0) - error(_("Length of 'yes' is %"PRId64" but must be 1 or length of 'test' (%"PRId64")."), len1, len0); - if (len2!=1 && len2!=len0) - error(_("Length of 'no' is %"PRId64" but must be 1 or length of 'test' (%"PRId64")."), len2, len0); const int64_t amask = len1>1 ? INT64_MAX : 0; // for scalar 'a' bitwise AND will reset iterator to first element: pa[i & amask] -> pa[0] const int64_t bmask = len2>1 ? INT64_MAX : 0; + const int64_t nmask = len3>1 ? INT64_MAX : 0; const int *restrict pl = LOGICAL(l); - SEXP ans = PROTECT(allocVector(ta, len0)); nprotect++; - copyMostAttrib(a, ans); - - bool nonna = !isNull(na); - if (nonna) { - if (xlength(na) != 1) - error(_("Length of 'na' is %"PRId64" but must be 1"), (int64_t)xlength(na)); - SEXPTYPE tn = TYPEOF(na); - if (tn == LGLSXP && LOGICAL(na)[0]==NA_LOGICAL) { - nonna = false; - } else { - if (tn != ta) - error(_("'yes' is of type %s but 'na' is of type %s. Please make sure that both arguments have the same type."), type2char(ta), type2char(tn)); - if (!R_compute_identical(PROTECT(getAttrib(a,R_ClassSymbol)), PROTECT(getAttrib(na,R_ClassSymbol)), 0)) - error(_("'yes' has different class than 'na'. Please make sure that both arguments have the same class.")); - UNPROTECT(2); - if (isFactor(a)) { - if (!R_compute_identical(PROTECT(getAttrib(a,R_LevelsSymbol)), PROTECT(getAttrib(na,R_LevelsSymbol)), 0)) - error(_("'yes' and 'na' are both type factor but their levels are different.")); - UNPROTECT(2); - } - } - } + SEXP ans = PROTECT(allocVector(tans, len0)); nprotect++; + if (!na_a) + copyMostAttrib(a, ans); + else if (!na_b) + copyMostAttrib(b, ans); + else if (!na_n) + copyMostAttrib(na, ans); - switch(ta) { + switch(tans) { case LGLSXP: { int *restrict pans = LOGICAL(ans); - const int *restrict pa = LOGICAL(a); - const int *restrict pb = LOGICAL(b); - const int pna = nonna ? 
LOGICAL(na)[0] : NA_LOGICAL; + const int *restrict pa; if (!na_a) pa = LOGICAL(a); + const int *restrict pb; if (!na_b) pb = LOGICAL(b); + const int *restrict pna; if (!na_n) pna = LOGICAL(na); + const int na = NA_LOGICAL; #pragma omp parallel for num_threads(getDTthreads(len0, true)) for (int64_t i=0; i Date: Fri, 14 May 2021 22:24:51 -0500 Subject: [PATCH 244/588] Bug fix for gmin and blank strings (#4847) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 ++++--- src/data.table.h | 1 + src/gsumm.c | 6 +++--- src/init.c | 2 ++ 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9e5302f2bb..8ab2deaa0d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -63,7 +63,8 @@ Authors@R: c( person("Dirk","Eddelbuettel", role="ctb"), person("Ben","Schwen", role="ctb"), person("Tony","Fischetti", role="ctb"), - person("Ofek","Shilon", role="ctb")) + person("Ofek","Shilon", role="ctb"), + person("Vadim","Khotilovich", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index b80eb35c63..fd48fdcade 100644 --- a/NEWS.md +++ b/NEWS.md @@ -125,6 +125,8 @@ 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank instead of blank, [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 48c502581e..df6b7bf6a4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5164,7 +5164,8 @@ test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) -DT <- data.table(x=rep(1:6, each=3), y=sample(c("", letters[1:3], NA), 18, TRUE)) +DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) +DT[x==7, y := c("","b","c")] test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) @@ -5172,8 +5173,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_character_] test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("a","a","c","","a",NA)), warning="No non-missing") -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("b","a","c","a","c",NA)), warning="No non-missing") +test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing") +test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing") # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = diff --git a/src/data.table.h b/src/data.table.h index 9fb386567d..50d43a34a5 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -89,6 +89,7 @@ extern SEXP char_ordered; extern SEXP char_datatable; extern SEXP char_dataframe; extern SEXP char_NULL; +extern SEXP char_maxString; extern SEXP sym_sorted; extern SEXP sym_index; extern SEXP sym_BY; diff --git a/src/gsumm.c b/src/gsumm.c index 9c31f4a761..064131f3cd 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -760,15 +760,15 @@ SEXP gmin(SEXP x, SEXP narm) case STRSXP: ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++; if (!LOGICAL(narm)[0]) { - for (i=0; i Date: Sat, 15 May 2021 02:42:01 -0600 Subject: [PATCH 245/588] Localize remaining loop variables (#5000) --- src/assign.c | 47 ++++---- src/dogroups.c | 2 +- src/fastmean.c | 18 ++-- src/fcast.c | 14 ++- src/frank.c | 70 +++++------- src/gsumm.c | 288 ++++++++++++++++++++++++------------------------- src/ijoin.c | 224 +++++++++++++++++++------------------- src/inrange.c | 15 +-- 8 files changed, 330 insertions(+), 348 deletions(-) diff --git a/src/assign.c b/src/assign.c index bcd7f29265..bc1c91188f 100644 --- a/src/assign.c +++ b/src/assign.c @@ -149,7 +149,6 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) // NEW: cols argument to specify the columns to shallow copy on. If NULL, all columns. // called from alloccol where n is checked carefully, or from shallow() at R level // where n is set to truelength (i.e. a shallow copy only with no size change) - R_len_t i,l; int protecti=0; SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt))); @@ -173,21 +172,20 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; + const int l = isNull(cols) ? 
LENGTH(dt) : length(cols); if (isNull(cols)) { - l = LENGTH(dt); - for (i=0; i0 but nrow) error(_("i[%d] is %d which is out of range [1,nrow=%d]."),i+1,rowsd[i],nrow); // set() reaches here (test 2005.2); := reaches the same error in subset.c first if (rowsd[i]>=1) numToDo++; @@ -364,13 +362,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) PROTECT(tmp = chmatch(cols, names, 0)); protecti++; buf = (int *) R_alloc(length(cols), sizeof(int)); int k=0; - for (i=0; i0) { if (!isDataTable) error(_("set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that. data.table's are over-allocated and don't shallow copy.")); newcolnames = PROTECT(allocVector(STRSXP, k)); protecti++; - for (i=0; ioldncol+length(newcolnames)) { if (!isDataTable) error(_("Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that."), i+1, coln, oldncol); @@ -436,8 +434,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } // RHS of assignment to new column is zero length but we'll use its type to create all-NA column of that type } - if (isMatrix(thisvalue) && (j=INTEGER(getAttrib(thisvalue, R_DimSymbol))[1]) > 1) // matrix passes above (considered atomic vector) - warning(_("%d column matrix RHS of := will be treated as one vector"), j); + { + int j; + if (isMatrix(thisvalue) && (j=INTEGER(getAttrib(thisvalue, R_DimSymbol))[1]) > 1) // matrix passes above (considered atomic vector) + warning(_("%d column matrix RHS of := will be treated as one vector"), j); + } const SEXP existing = (coln+1)<=oldncol ? VECTOR_ELT(dt,coln) : R_NilValue; if (isFactor(existing) && !isString(thisvalue) && TYPEOF(thisvalue)!=INTSXP && TYPEOF(thisvalue)!=LGLSXP && !isReal(thisvalue) && !isNewList(thisvalue)) { // !=INTSXP includes factor @@ -470,7 +471,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) error(_("Internal error: selfrefnames is ok but tl names [%d] != tl [%d]"), TRUELENGTH(names), oldtncol); // # nocov SETLENGTH(dt, oldncol+LENGTH(newcolnames)); SETLENGTH(names, oldncol+LENGTH(newcolnames)); - for (i=0; i= 0")); // ans = PROTECT(allocVector(INTSXP, n)); -// for (i=0; i= 1 -// for (i=0; iLENGTH(x)) error(_("Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]"), i+1, elem, LENGTH(x)); if (!n) n = length(VECTOR_ELT(x, elem-1)); } - SEXP ans = PROTECT(allocVector(LGLSXP, 1)); - LOGICAL(ans)[0]=0; - for (i=0; i grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; } - k = ff[i]+val-2; + int k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; ians[i] = ix[k]; @@ -1225,9 +1220,9 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { const int *ix = INTEGER(x); ans = PROTECT(allocVector(INTSXP, ngrp)); int *ians = INTEGER(ans); - for (i=0; i grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; } - k = ff[i]+val-2; + int k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; ians[i] = ix[k]; @@ -1238,9 +1233,9 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { const double *dx = REAL(x); ans = PROTECT(allocVector(REALSXP, ngrp)); double *dans = REAL(ans); - for (i=0; i grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; } - k = ff[i]+val-2; + int k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? 
k : irows[k]-1; dans[i] = dx[k]; @@ -1251,9 +1246,9 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { const Rcomplex *dx = COMPLEX(x); ans = PROTECT(allocVector(CPLXSXP, ngrp)); Rcomplex *dans = COMPLEX(ans); - for (i=0; i grpsize[i]) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; } - k = ff[i]+val-2; + int k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; dans[i] = dx[k]; @@ -1261,9 +1256,9 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { } break; case STRSXP: ans = PROTECT(allocVector(STRSXP, ngrp)); - for (i=0; i grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } - k = ff[i]+val-2; + int k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; SET_STRING_ELT(ans, i, STRING_ELT(x, k)); @@ -1271,9 +1266,9 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { break; case VECSXP: ans = PROTECT(allocVector(VECSXP, ngrp)); - for (i=0; i grpsize[i]) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; } - k = ff[i]+val-2; + int k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k)); @@ -1294,33 +1289,32 @@ SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD) if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); if (!isVectorAtomic(x)) error(_("GForce var/sd can only be applied to columns, not .SD or similar. For the full covariance matrix of all items in a list such as .SD, either add the prefix stats::var(.SD) (or stats::sd(.SD)) or turn off GForce optimization using options(datatable.optimize=1). Alternatively, if you only need the diagonal elements, 'DT[,lapply(.SD,var),by=,.SDcols=]' is the optimized way to do this.")); if (inherits(x, "factor")) error(_("var/sd is not meaningful for factors.")); - long double m, s, v; - R_len_t i, j, ix, thisgrpsize = 0, n = (irowslen == -1) ? length(x) : irowslen; + const int n = (irowslen == -1) ? length(x) : irowslen; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gvar"); SEXP sub, ans = PROTECT(allocVector(REALSXP, ngrp)); - Rboolean ans_na; switch(TYPEOF(x)) { case LGLSXP: case INTSXP: sub = PROTECT(allocVector(INTSXP, maxgrpn)); // allocate once upfront if (!LOGICAL(narm)[0]) { - for (i=0; i DBL_MAX) REAL(ans)[i] = R_PosInf; else if (s[i] < -DBL_MAX) REAL(ans)[i] = R_NegInf; else REAL(ans)[i] = (double)s[i]; } break; case REALSXP: - for (i=0; i DBL_MAX) REAL(ans)[i] = R_PosInf; else if (s[i] < -DBL_MAX) REAL(ans)[i] = R_NegInf; else REAL(ans)[i] = (double)s[i]; diff --git a/src/ijoin.c b/src/ijoin.c index 59bfd8fed4..96a9deae4f 100644 --- a/src/ijoin.c +++ b/src/ijoin.c @@ -9,7 +9,7 @@ SEXP lookup(SEXP ux, SEXP xlen, SEXP indices, SEXP gaps, SEXP overlaps, SEXP multArg, SEXP typeArg, SEXP verbose) { SEXP vv, tt, lookup, type_lookup; - R_len_t i,j,k,*idx,*count,*type_count,xrows=INTEGER(xlen)[0],uxrows=LENGTH(VECTOR_ELT(ux, 0)),uxcols=LENGTH(ux); + R_len_t *idx,*count,*type_count,xrows=INTEGER(xlen)[0],uxrows=LENGTH(VECTOR_ELT(ux, 0)),uxcols=LENGTH(ux); int *from = (int *)INTEGER(VECTOR_ELT(indices, 0)); int *to = (int *)INTEGER(VECTOR_ELT(indices, 1)); clock_t pass1, pass2, pass3, start; @@ -37,19 +37,19 @@ SEXP lookup(SEXP ux, SEXP xlen, SEXP indices, SEXP gaps, SEXP overlaps, SEXP mul case FIRST: switch(type) { case EQUAL: - for (i=0; i 0 && type_count[from[i]-1]) ? type_count[from[i]-1] : 1; break; case EQUAL: - for (i=0; i0) ? from[i] : 1; + for (int i=0; i0) ? 
from[i] : 1; if (k == to[i]) { wlen = count[k-1]; } else if (k < to[i]) { @@ -284,13 +285,13 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr break; case ANY: - for (i=0; i 0) ? from[i] : 1; - k = from[i]; + const int k = from[i]; if (k<=to[i]) totlen += count[k-1]; - for (j=k+1; j<=to[i]; j++) + for (int j=k+1; j<=to[i]; ++j) totlen += type_count[j-1]; if (len == totlen) ++totlen; @@ -298,9 +299,10 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr break; case WITHIN: - for (i=0; i 0) { if (k == to[i]) { totlen += count[k-1]; @@ -342,12 +344,12 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr case ALL: switch (type) { case START : case END : - for (i=0; i 0) { - k = from[i]; + const int k = from[i]; tmp2 = VECTOR_ELT(type_lookup, k-1); - for (j=0; j 0 && to[i] > 0) { - k = from[i]; + const int k = from[i]; if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); - for (j=0; j0) ? from[i] : 1; - k = from[i]; + const int k = from[i]; if (k<=to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); - for (m=0; m 0) { if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); - for (j=0; j0) ? from[i] : 1; + const int k = (from[i]>0) ? from[i] : 1; if (k <= to[i]) { // count[k-1] is equal to type_count[k-1] and will always be >0, so no length check necessary. tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[0]; @@ -485,17 +487,17 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr break; case EQUAL : - for (i=0; i 0 && to[i] > 0) { - k = from[i]; + const int k = from[i]; if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[0]; ++thislen; } else if (k < to[i]) { - j=0; m=0; + int j=0, m=0; tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); while (j0) ? from[i] : 1; - k = from[i]; - for (j=k; j<=to[i]; j++) { + const int k = from[i]; + for (int j=k; j<=to[i]; ++j) { if (type_count[j-1]) { tmp2 = VECTOR_ELT(type_lookup, j-1); INTEGER(f2__)[thislen] = INTEGER(tmp2)[0]; @@ -538,17 +540,17 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr break; case WITHIN: - for (i=0; i 0) { if (k == to[i] && count[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[0]; ++thislen; } else if (k < to[i]) { - j=0; m=0; + int j=0, m=0; tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(lookup, to[i]-1); while (j0) ? from[i] : 1; + const int k = (from[i]>0) ? from[i] : 1; if (k <= to[i]) { // count[k-1] is equal to type_count[k-1] and will always be >0, so no length check necessary. 
tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[count[k-1]-1]; @@ -600,11 +602,11 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr // n (next line) // p val # for native C objects // call Rf_PrintValue(val) # for SEXP objects, to print whole vector/vals - for (i=0; i 0 && to[i] > 0) { - k = from[i]; + const int k = from[i]; if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[count[k-1]-1]; @@ -612,7 +614,7 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr } else if (k < to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); - j=count[k-1]-1; m=type_count[to[i]-1]-1; // bug fix, k=from[i] but should be to[i] + int j=count[k-1]-1, m=type_count[to[i]-1]-1; // bug fix, k=from[i] but should be to[i] while (j>=0 && m>=0) { if ( INTEGER(tmp1)[j] == INTEGER(tmp2)[m] ) { INTEGER(f2__)[thislen] = INTEGER(tmp1)[j]; @@ -635,37 +637,37 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr // for 'first' we need to just get the minimum of first non-zero-length element, but not the same case for 'last'. // We've to loop over from[i]:to[i] and get maximum of all tmp2 values (each is of length 1 already conveniently set uo) in that range // case ANY: - // for (i=0; i0) ? from[i] : 1; - // k = from[i]; - // for (j=k; j<=to[i]; j++) { - // if (type_count[j-1]) { - // tmp2 = VECTOR_ELT(type_lookup, j-1); - // INTEGER(f2__)[thislen] = (INTEGER(f2__)[thislen] < INTEGER(tmp2)[type_count[j-1]-1]) ? INTEGER(tmp2)[type_count[j-1]-1] : INTEGER(f2__)[thislen]; - // } + // for (int i=0; i0) ? from[i] : 1; + // k = from[i]; + // for (int j=k; j<=to[i]; ++j) { + // if (type_count[j-1]) { + // tmp2 = VECTOR_ELT(type_lookup, j-1); + // INTEGER(f2__)[thislen] = (INTEGER(f2__)[thislen] < INTEGER(tmp2)[type_count[j-1]-1]) ? INTEGER(tmp2)[type_count[j-1]-1] : INTEGER(f2__)[thislen]; // } - // if (INTEGER(f2__)[thislen] == 0) - // INTEGER(f2__)[thislen] = nomatch; - // ++thislen; + // } + // if (INTEGER(f2__)[thislen] == 0) + // INTEGER(f2__)[thislen] = nomatch; + // ++thislen; // } // break; case ANY: - for (i=0; i0) ? 
from[i] : 1; - k = from[i]; + const int k = from[i]; if (k <= to[i]) { if (k==to[i] && count[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[count[k-1]-1]; ++thislen; } else { - for (j=to[i]; j>k; j--) { + for (int j=to[i]; j>k; --j) { if (type_count[j-1]) { tmp2 = VECTOR_ELT(type_lookup, j-1); INTEGER(f2__)[thislen] = INTEGER(tmp2)[0]; // tmp2 will be length 1 @@ -687,10 +689,10 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr break; case WITHIN: - for (i=0; i 0) { if (k == to[i] && count[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); @@ -699,7 +701,7 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr } else if (k < to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(lookup, to[i]-1); - j=count[k-1]-1; m=count[to[i]-1]-1; + int j=count[k-1]-1, m=count[to[i]-1]-1; while (j>=0 && m>=0) { if ( INTEGER(tmp1)[j] == INTEGER(tmp2)[m] ) { INTEGER(f2__)[thislen] = INTEGER(tmp1)[j]; diff --git a/src/inrange.c b/src/inrange.c index 158f0ad2b6..626fe70bc2 100644 --- a/src/inrange.c +++ b/src/inrange.c @@ -4,11 +4,12 @@ SEXP inrange(SEXP ansArg, SEXP xoArg, SEXP startsArg, SEXP lenArg) { - int *ans = INTEGER(ansArg), *xo = INTEGER(xoArg); - int *starts = INTEGER(startsArg), *len = INTEGER(lenArg); - R_len_t i, j, n = length(startsArg), nxo = length(xoArg); - for (i = 0; i < n; i++) { - for (j = starts[i]-1; j < starts[i]-1+len[i]; j++) { + int *ans = INTEGER(ansArg); + const int *xo = INTEGER(xoArg); + const int *starts = INTEGER(startsArg), *len = INTEGER(lenArg); + const int n = length(startsArg), nxo = length(xoArg); + for (int i=0; i new_ee ? ee : new_ee; // } // // Rprintf(_("Moved to %d, start=%d, end=%d\n"), i, ss, ee); - // for (j=ss; j<=ee; j++) ans[nxo ? xo[j]-1 : j] = 1; + // for (int j=ss; j<=ee; j++) ans[nxo ? xo[j]-1 : j] = 1; // } return (R_NilValue); } From 2da34fadc360427c59d3a429cd057d40d8c3cbe3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 15 May 2021 03:50:01 -0600 Subject: [PATCH 246/588] clear gcc warnings (#5001) --- src/fifelse.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/fifelse.c b/src/fifelse.c index e5d22a1eea..c2497b566d 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -102,9 +102,9 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) { switch(tans) { case LGLSXP: { int *restrict pans = LOGICAL(ans); - const int *restrict pa; if (!na_a) pa = LOGICAL(a); - const int *restrict pb; if (!na_b) pb = LOGICAL(b); - const int *restrict pna; if (!na_n) pna = LOGICAL(na); + const int *restrict pa = na_a ? NULL : LOGICAL(a); + const int *restrict pb = na_b ? NULL : LOGICAL(b); + const int *restrict pna = na_n ? 
NULL : LOGICAL(na); const int na = NA_LOGICAL; #pragma omp parallel for num_threads(getDTthreads(len0, true)) for (int64_t i=0; i Date: Sun, 16 May 2021 23:21:06 +0200 Subject: [PATCH 247/588] unit tests for fixed #1913 (#5004) --- inst/tests/tests.Rraw | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index df6b7bf6a4..467f9b270d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17619,3 +17619,11 @@ test(2188.14, fifelse(TRUE, NA, 2, as.Date("2019-07-07")), error="'no' has diff test(2188.15, fifelse(TRUE, NA, factor('a'), factor('a', levels = c('a','b'))), error="'no' and 'na' are both type factor but their levels are different") test(2188.16, fifelse(c(NA, NA), 1L, 2L, NULL), c(NA_integer_, NA_integer_)) # NULL `na` is treated as NA +# rolling join expected output on non-matching join column has been fixed #1913 +dt = data.table(ID=1:5, A=c(1.3, 1.7, 2.4, 0.9, 0.6)) +buckets = data.table(BucketID=1:4, BinA=1:4) +dt[, A.copy := A] +test(2189.1, buckets[dt, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) +buckets[, BinA := as.numeric(BinA)] +test(2189.2, buckets[dt, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) + From c3f1f6f7e52eea0b9b0d287291eada6541434a77 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 17 May 2021 16:05:52 -0700 Subject: [PATCH 248/588] tag non-local loops with #skip_loop_scope (#5003) --- .dev/CRAN_Release.cmd | 4 ++++ src/dogroups.c | 2 +- src/fread.c | 4 ++-- src/froll.c | 12 ++++++------ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 88f9f52752..f5ffdae7d7 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -203,6 +203,10 @@ grep allocVector *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAtt grep coerceVector *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAttrib | grep -v return grep asCharacter *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAttrib | grep -v return +# Enforce local scope for loop index (`for (int i=0; ...)` instead of `int i; for (i=0; ...)`) +# exceptions are tagged with #loop_counter_not_local_scope_ok +grep -En "for\s*[(]\s*[a-zA-Z0-9_]+\s*=" src/*.c | grep -Fv "#loop_counter_not_local_scope_ok" + cd .. R cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html diff --git a/src/dogroups.c b/src/dogroups.c index 53fbf30dcd..5bb9983408 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -117,7 +117,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" // fetch rownames of .SD. 
rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c + for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok if (s==R_NilValue) error(_("row.names attribute of .SD not found")); rownames = CAR(s); if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); diff --git a/src/fread.c b/src/fread.c index aee28faf07..33c707c4f1 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1616,7 +1616,7 @@ int freadMain(freadMainArgs _args) { int topSkip=0; // how many rows to auto-skip const char *topStart=NULL; - for (quoteRule=quote?0:3; quoteRule<4; quoteRule++) { + for (quoteRule=quote?0:3; quoteRule<4; quoteRule++) { // #loop_counter_not_local_scope_ok // quote rule in order of preference. // when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc for (int s=0; s1 field to disambiguate // choose quote rule 0 or 1 based on for which 100 rows gets furthest into file - for (quoteRule=0; quoteRule<=1; quoteRule++) { + for (quoteRule=0; quoteRule<=1; quoteRule++) { // #loop_counter_not_local_scope_ok int thisRow=0, thisncol=0; ch = pos; while (ch=0) {}; diff --git a/src/froll.c b/src/froll.c index b044431ded..3ab7bd927a 100644 --- a/src/froll.c +++ b/src/froll.c @@ -49,8 +49,8 @@ void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool long double w = 0.0; // sliding window aggregate bool truehasna = hasna>0; // flag to re-run with NA support if NAs detected if (!truehasna) { - int i; // iterator declared here because it is being used after foor loop - for (i=0; idbl_v[i] = fill; // answers are fill for partial window } @@ -85,8 +85,8 @@ void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } if (truehasna) { int nc = 0; // NA counter within sliding window - int i; // iterator declared here because it is being used after foor loop - for (i=0; i0; if (!truehasna) { int i; - for (i=0; idbl_v[i] = fill; } @@ -289,7 +289,7 @@ void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool n if (truehasna) { int nc = 0; int i; - for (i=0; i Date: Mon, 17 May 2021 20:12:02 -0700 Subject: [PATCH 249/588] added test for fixed issue (#4191) --- inst/tests/tests.Rraw | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 467f9b270d..290e263823 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15130,6 +15130,8 @@ test(2041.1, DT[, median(date), by=g], data.table(g=c("a","b"), V1=as.Date(c("20 test(2041.2, DT[, median(time), by=g], DT[c(2,5),.(g=g, V1=time)]) # 'invalid trim argument' with optimization level 1; #1876 +# these tests check via output= that level 1 is on, and also that level 2 is on (which includes level 1). +# They could run in level 1 with level 2 off, but output= would need to be changed and there's no need. 
test(2042.1, DT[ , as.character(mean(date)), by=g, verbose=TRUE ], data.table(g=c("a","b"), V1=c("2018-01-04","2018-01-21")), output=msg<-"GForce is on, left j unchanged.*Old mean optimization is on, left j unchanged") @@ -15138,7 +15140,19 @@ test(2042.1, DT[ , as.character(mean(date)), by=g, verbose=TRUE ], Jan.2018 = format(strptime("2018-01-01", "%Y-%m-%d"), "%b-%Y") test(2042.2, DT[ , format(mean(date),"%b-%Y")], Jan.2018) test(2042.3, DT[ , format(mean(date),"%b-%Y"), by=g, verbose=TRUE ], # just this case generated the error - data.table(g=c("a","b"), V1=c(Jan.2018, Jan.2018)), output=msg) + data.table(g=c("a","b"), V1=c(Jan.2018, Jan.2018)), output=msg) +# also incidentally fixed #2491 +DT = data.table( + Group = c("A", "A", "B", "B", "C", "C"), + Date1 = .Date(c(17446.0291040738, 17470.0221205444, 17445.0765226481, + 17456.0360002079, 17440.0230725919, 17451.0572453837)), + Date2 = .Date(c(17459.1561177987, 17451.1086757995, 17449.0820898537, + 17443.1175238448, 17461.0463715783, 17448.1033968224)) +) +DT[ , DiffTime := abs(difftime(Date1, Date2, units = 'days'))] +test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], + data.table(Group=c("A", "B", "C"), V1=structure(c(16, 8, 12), class="difftime", units="days")), + output="Old mean optimization is on, left j unchanged.*GForce.*FALSE") # gforce wrongly applied to external variable; #875 DT = data.table(x=INT(1,1,1,2,2), y=1:5) From 0cbc418cf4d25f51704239783045a3cb4e795e43 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 17 May 2021 23:22:37 -0600 Subject: [PATCH 250/588] .Date() replaced to pass R 3.1.0 (#5009) --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 290e263823..00fba2de01 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15144,10 +15144,10 @@ test(2042.3, DT[ , format(mean(date),"%b-%Y"), by=g, verbose=TRUE ], # just thi # also incidentally fixed #2491 DT = data.table( Group = c("A", "A", "B", "B", "C", "C"), - Date1 = .Date(c(17446.0291040738, 17470.0221205444, 17445.0765226481, - 17456.0360002079, 17440.0230725919, 17451.0572453837)), - Date2 = .Date(c(17459.1561177987, 17451.1086757995, 17449.0820898537, - 17443.1175238448, 17461.0463715783, 17448.1033968224)) + Date1 = `class<-`(c(17446.0291040738, 17470.0221205444, 17445.0765226481, # `class<-`() == .Date() to pass on R 3.1.0 + 17456.0360002079, 17440.0230725919, 17451.0572453837), "Date"), + Date2 = `class<-`(c(17459.1561177987, 17451.1086757995, 17449.0820898537, + 17443.1175238448, 17461.0463715783, 17448.1033968224), "Date") ) DT[ , DiffTime := abs(difftime(Date1, Date2, units = 'days'))] test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], From eb9337d8e4ce087d0b60c93416c5e8d448fedcad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Tue, 18 May 2021 10:31:18 +0200 Subject: [PATCH 251/588] Fix segfaults when assigning to list column (#4350) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 22 ++++++++++++++++++---- src/assign.c | 31 +++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index fd48fdcade..aae1234553 100644 --- a/NEWS.md +++ b/NEWS.md @@ -127,6 +127,8 @@ 16. 
`DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank instead of blank, [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. +17. Assigning a wrong-length or non-list vector to a list column could segfault, [#4166](https://github.com/Rdatatable/data.table/issues/4166) [#4667](https://github.com/Rdatatable/data.table/issues/4667) [#4678](https://github.com/Rdatatable/data.table/issues/4678) [#4729](https://github.com/Rdatatable/data.table/issues/4729). Thanks to @fklirono, Kun Ren, @kevinvzandvoort and @peterlittlejohn for reporting, and to Václav Tlapák for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 00fba2de01..29a76866cc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17634,10 +17634,24 @@ test(2188.15, fifelse(TRUE, NA, factor('a'), factor('a', levels = c('a','b'))), test(2188.16, fifelse(c(NA, NA), 1L, 2L, NULL), c(NA_integer_, NA_integer_)) # NULL `na` is treated as NA # rolling join expected output on non-matching join column has been fixed #1913 -dt = data.table(ID=1:5, A=c(1.3, 1.7, 2.4, 0.9, 0.6)) +DT = data.table(ID=1:5, A=c(1.3, 1.7, 2.4, 0.9, 0.6)) buckets = data.table(BucketID=1:4, BinA=1:4) -dt[, A.copy := A] -test(2189.1, buckets[dt, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) +DT[, A.copy := A] +test(2189.1, buckets[DT, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) buckets[, BinA := as.numeric(BinA)] -test(2189.2, buckets[dt, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) +test(2189.2, buckets[DT, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) + +# segfault subassigning non-list type to list column, #4166 +DT = data.table(a=list(1:2, 3, 4)) +test(2190.1, DT[, a:=1:4], error="Supplied 4 items to be assigned to 3 items of column 'a'.*please use rep") +test(2190.2, DT[1:2, a:=structure(c(1L, 2L), att='t') ]$a, list(structure(1L, att='t'), structure(2L, att='t'), 4)) +test(2190.3, DT[1:2, a:=structure(c(1, 2), att='t') ]$a, list(structure(1, att='t'), structure(2, att='t'), 4)) +test(2190.4, DT[1:2, a:=structure(as.raw(c(1, 2)), att='t') ]$a, list(structure(as.raw(1), att='t'), structure(as.raw(2), att='t'), 4)) +test(2190.5, DT[1:2, a:=structure(as.complex(c(1, 2)), att='t')]$a, list(structure(as.complex(1), att='t'), structure(as.complex(2), att='t'), 4)) +test(2190.6, DT[1:2, a:=structure(c(TRUE, FALSE), att='t') ]$a, list(structure(TRUE, att='t'), structure(FALSE, att='t'), 4)) +test(2190.7, DT[1:2, a:=structure(c('a', 'b'), att='t') ]$a, list(structure('a', att='t'), structure('b', att='t'), 4)) +if (test_bit64) { + test(2190.8, DT[1:2, a:=as.integer64(1:2) ]$a, list(as.integer64(1), as.integer64(2), 4)) +} +test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerced to 'list'") diff --git a/src/assign.c b/src/assign.c index bc1c91188f..c87d99bdb9 100644 --- a/src/assign.c +++ b/src/assign.c @@ -434,7 +434,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } // RHS of assignment to new column is zero length but we'll use its type to create all-NA column of that type } - { + { int j; if (isMatrix(thisvalue) && (j=INTEGER(getAttrib(thisvalue, R_DimSymbol))[1]) > 1) // matrix passes above (considered atomic vector) warning(_("%d column matrix RHS of := will be treated as one vector"), j); @@ -445,8 +445,9 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) error(_("Can't assign to column '%s' (type 'factor') a value of type '%s' (not character, factor, integer or numeric)"), CHAR(STRING_ELT(names,coln)),type2char(TYPEOF(thisvalue))); } - if (nrow>0 && targetlen>0 && vlen>1 && vlen!=targetlen && (TYPEOF(existing)!=VECSXP || TYPEOF(thisvalue)==VECSXP)) { - // note 
that isNewList(R_NilValue) is true so it needs to be TYPEOF(existing)!=VECSXP above + if (nrow>0 && targetlen>0 && vlen>1 && vlen!=targetlen && !(TYPEOF(existing)==VECSXP && targetlen==1)) { + // We allow assigning objects of arbitrary to single items of list columns for convenience. + // Note that isNewList(R_NilValue) is true so it needs to be !(TYPEOF(existing)==VECSXP) above error(_("Supplied %d items to be assigned to %d items of column '%s'. If you wish to 'recycle' the RHS please use rep() to make this intent clear to readers of your code."), vlen, targetlen, CHAR(colnam)); } } @@ -1065,11 +1066,25 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con BODY(SEXP, STRING_PTR, SEXP, val, SET_STRING_ELT(target, off+i, cval)) } case VECSXP : - case EXPRSXP : // #546 - if (TYPEOF(source)!=VECSXP && TYPEOF(source)!=EXPRSXP) - BODY(SEXP, &, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) - else - BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) + case EXPRSXP : { // #546 #4350 + if (len == 1 && TYPEOF(source)!=VECSXP && TYPEOF(source)!=EXPRSXP) { + BODY(SEXP, &, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) + } else { + switch (TYPEOF(source)) { + // no protect of CAST needed because SET_VECTOR_ELT protects it, and it can't get released by copyMostAttrib or anything else inside BODY + // copyMostAttrib is appended to CAST so as to be outside loop + case RAWSXP: BODY(Rbyte, RAW, SEXP, ScalarRaw(val); copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval)) + case LGLSXP: BODY(int, INTEGER, SEXP, ScalarLogical(val);copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval)) + case INTSXP: BODY(int, INTEGER, SEXP, ScalarInteger(val);copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval)) + case REALSXP: BODY(double, REAL, SEXP, ScalarReal(val); copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval)) + case CPLXSXP: BODY(Rcomplex, COMPLEX, SEXP, ScalarComplex(val);copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval)) + case STRSXP: BODY(SEXP, STRING_PTR, SEXP, ScalarString(val); copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval)) + case VECSXP: + case EXPRSXP: BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target,off+i,cval)) + default: COERCE_ERROR("list"); + } + } + } break; default : error(_("Unsupported column type in assign.c:memrecycle '%s'"), type2char(TYPEOF(target))); // # nocov } From f2f0a8bd8ce77e7535453b2b7330234198171ba1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 18 May 2021 17:14:36 -0600 Subject: [PATCH 252/588] state 'contains a space' in fread input= message (#5011) --- R/fread.R | 2 +- inst/tests/tests.Rraw | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fread.R b/R/fread.R index b8271ce0c1..53d2f55acb 100644 --- a/R/fread.R +++ b/R/fread.R @@ -81,7 +81,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command cmd = input if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { - message("Taking input= as a system command ('",cmd,"') and a variable has been used in the expression passed to `input=`. Please use fread(cmd=...). There is a security concern if you are creating an app, and the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. 
Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.") + message("Taking input= as a system command because it contains a space ('",cmd,"'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.") } } else { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 29a76866cc..029f33bbb8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10342,7 +10342,7 @@ if (.Platform$OS.type=="unix") { cat("a,b\n4,2", file=f<-tempfile()) cmd <- sprintf("cat %s", f) options(datatable.fread.input.cmd.message = TRUE) - test(1703.01, fread(cmd), ans<-data.table(a=4L, b=2L), message="Please use fread.cmd=.*security concern.*Please read item 5 in the NEWS file for v1.11.6") + test(1703.01, fread(cmd), ans<-data.table(a=4L, b=2L), message="security concern.*Please read item 5 in the NEWS file for v1.11.6") options(datatable.fread.input.cmd.message = NULL) # when option is missing as it is by default, then TRUE test(1703.02, fread(cmd), ans, message="security concern") options(datatable.fread.input.cmd.message = FALSE) From 591ce5e74d28aa49b292220de964306701d8ff74 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 20 May 2021 23:26:59 -0700 Subject: [PATCH 253/588] Use anyDuplicated as appropriate (#5015) --- R/data.table.R | 2 +- R/fmelt.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index c070f7dcb6..bbb484c9c1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -906,7 +906,7 @@ replace_dot_alias = function(e) { # if user doesn't like this inferred name, user has to use by=list() to name the column } # Fix for #1334 - if (any(duplicated(bynames))) { + if (anyDuplicated(bynames)) { bynames = make.unique(bynames) } } diff --git a/R/fmelt.R b/R/fmelt.R index 362a21695a..56b075691d 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -199,7 +199,7 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl variable.name, value.name, as.logical(na.rm), as.logical(verbose)) setDT(ans) - if (any(duplicated(names(ans)))) { + if (anyDuplicated(names(ans))) { catf("Duplicate column names found in molten data.table. 
Setting unique names using 'make.names'\n") setnames(ans, make.unique(names(ans))) } From 9ed02559a0d15f359c8929a12a7527fccbc984e1 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 20 May 2021 23:52:30 -0700 Subject: [PATCH 254/588] combine consecutive stopifnot calls (#5016) --- R/fread.R | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R/fread.R b/R/fread.R index 53d2f55acb..2f918fb2bd 100644 --- a/R/fread.R +++ b/R/fread.R @@ -21,15 +21,19 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } - stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), - isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml) ) - stopifnot( isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0)) - stopifnot( is.numeric(nrows), length(nrows)==1L ) + stopifnot( + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), + isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), + is.numeric(nrows), length(nrows)==1L + ) nrows=as.double(nrows) #4686 if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA - stopifnot(is.logical(header) && length(header)==1L) # TRUE, FALSE or NA - stopifnot(is.numeric(nThread) && length(nThread)==1L) + stopifnot( + is.logical(header) && length(header)==1L, # TRUE, FALSE or NA + is.numeric(nThread) && length(nThread)==1L + ) nThread=as.integer(nThread) stopifnot(nThread>=1L) if (!is.null(text)) { From e6276cc97fedf0c5ca021328306c0310fef3f10f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 20 May 2021 23:58:14 -0700 Subject: [PATCH 255/588] Flip any(!x) to !all(x) and all(!x) to !any(x) (#5017) --- R/as.data.table.R | 2 +- R/data.table.R | 2 +- R/xts.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index a706aecb86..9d286d7f16 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -102,7 +102,7 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va dnx } else dnx val = rev(val) - if (is.null(names(val)) || all(!nzchar(names(val)))) + if (is.null(names(val)) || !any(nzchar(names(val)))) setattr(val, 'names', paste0("V", rev(seq_along(val)))) if (value.name %chin% names(val)) stop("Argument 'value.name' should not overlap with column names in result: ", brackify(rev(names(val)))) diff --git a/R/data.table.R b/R/data.table.R index bbb484c9c1..dbede49855 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1116,7 +1116,7 @@ replace_dot_alias = function(e) { lhs = names_x[m] } else stop("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')") - if (all(!is.na(m))) { + if (!anyNA(m)) { # updates by reference to existing columns cols = as.integer(m) newnames=NULL diff --git a/R/xts.R b/R/xts.R index 121f36f1bd..31c5ad2309 100644 --- a/R/xts.R +++ b/R/xts.R @@ -19,7 +19,7 @@ as.xts.data.table = 
function(x, ...) { stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x)) if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index - if (any(!colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric])) + if (!all(colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric])) r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]]) return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]])) } From 4785417044fd54518a7db59c834d4beccb286f62 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 21 May 2021 09:55:08 +0200 Subject: [PATCH 256/588] by dot work with verbose=T, closes #3196 (#4359) --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 029f33bbb8..1c2b9eb45b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17655,3 +17655,6 @@ if (test_bit64) { } test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerced to 'list'") +# adding test for (since fixed) 'could not find function "."' when verbose=TRUE, #3196 +DT = data.table(i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L), v = 0) +test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1:0, V1=c(0,0)), output="gforce") From 4796603118de9216fa0f97fe618f9ceec299cee5 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 21 May 2021 11:46:11 +0200 Subject: [PATCH 257/588] eval and := fail fixed, closes #1181 (#4361) --- inst/tests/tests.Rraw | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1c2b9eb45b..ceed569e30 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5929,7 +5929,11 @@ test(1388, as.character(x), c("00:00:01", "-00:00:01", "-01:01:40")) # Fix for #880. Another eval(parse(.)) issue. DT <- as.data.table(iris) DT[, foo := "Species"] -test(1389, copy(DT)[,bar := eval(parse(text=foo[1]), envir=.SD)], copy(DT)[, bar := Species]) +test(1389.1, copy(DT)[,bar := eval(parse(text=foo[1]), envir=.SD)], copy(DT)[, bar := Species]) +# another test from #1181 for completeness +DT1 = data.table(a = 1, key = 'a') +DT2 = data.table(c = 1, fn = list(quote(5*a)), key = 'c') +test(1389.2, DT1[, n:=eval(DT2[a]$fn[[1]], .SD)], data.table(a=1, n=5, key="a")) # Fix for foverlaps() floating point interval (double) types. 
Should increment them by machine tolerance, not by 1L DT1 = data.table(start=c(0.88), end=c(0.88)) From 7511c2d82c5857456c0af75b7855498aeb598307 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 23 May 2021 14:30:09 +0200 Subject: [PATCH 258/588] nested dt addcol, tests for #1629 (#4366) --- inst/tests/tests.Rraw | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ceed569e30..36925a4998 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15391,6 +15391,19 @@ options(old) test(2049.2, outer$ab, list(data.table(a=1:3, b=4L))) test(2049.3, outer$ab[[1]][, b := 5L], data.table(a=1:3, b=5L)) test(2049.4, outer$ab, list(data.table(a=1:3, b=5L))) +test(2049.5, {DT=data.table(d=list(data.table(a=1))); DT$d[[1]][, new_col:=NA]; DT}, # verbatim from #1629 + data.table(d = list(data.table(a=1, new_col=NA)))) +# extra tests on similar theme to #1629 added in PR#4366 ... +add_col1 = function(dt) { + if (is.data.table(dt)) dt[, new_col:=NA] + if (is.list(dt)) lapply(dt, add_col1) + invisible() +} +DT = data.table(a=c(1,2), b=list(data.table(d=c("a", "b"), e=c(100, 200)))) +test(2049.6, add_col1(DT), NULL) +test(2049.7, names(DT), c("a","b","new_col")) +test(2049.8, names(DT$b[[1L]]), c("d","e","new_col")) +test(2049.9, names(DT$b[[2L]]), c("d","e","new_col")) # rbindlist zero row DT should retain its (unused) levels, #3508 DT = data.table(f = factor(c("a", "b", "c"))) @@ -17662,3 +17675,4 @@ test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerce # adding test for (since fixed) 'could not find function "."' when verbose=TRUE, #3196 DT = data.table(i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L), v = 0) test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1:0, V1=c(0,0)), output="gforce") + From 5dafccbb41fb537cfc2589669c80d057976bace4 Mon Sep 17 00:00:00 2001 From: avimallu <66530011+avimallu@users.noreply.github.com> Date: Sun, 23 May 2021 18:03:01 +0530 Subject: [PATCH 259/588] froll partial documentation (#5019) --- man/froll.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/froll.Rd b/man/froll.Rd index f1726d0723..b1fc2cc970 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -22,7 +22,7 @@ frollsum(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) } \arguments{ - \item{x}{ vector, list, data.frame or data.table of numeric or logical columns. } + \item{x}{ vector, data.frame or data.table of numeric or logical columns. May also be a list, in which case the rolling function is applied to each of its elements. } \item{n}{ integer vector, for adaptive rolling function also list of integer vectors, rolling window size. } \item{fill}{ numeric or logical, value to pad by. Defaults to \code{NA}. } From 7a9eb51b1ff0f725e4455a7ad4afa4e93328ea96 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 23 May 2021 06:22:34 -0700 Subject: [PATCH 260/588] detect a function-cum-column in i (#5021) --- NEWS.md | 2 ++ R/data.table.R | 15 +++++++++------ inst/tests/tests.Rraw | 14 ++++++++------ 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index aae1234553..5f2c8210cf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -149,6 +149,8 @@ 5. 
v1.13.0 (July 2020) fixed a segfault/corruption/error (depending on version of R and circumstances) in `dcast()` when `fun.aggregate` returned `NA` (type `logical`) in an otherwise `character` result, [#2394](https://github.com/Rdatatable/data.table/issues/2394). This fix was the result of other internal rework and there was no news item at the time. A new test to cover this case has now been added. Thanks Vadim Khotilovich for reporting, and Michael Chirico for investigating, pinpointing when the fix occurred and adding the test. +6. `DT[subset]` where `DT[(subset)]` or `DT[subset==TRUE]` was intended; i.e., subsetting by a logical column whose name conflicts with an existing function, now gives a friendlier error message, [#5014](https://github.com/Rdatatable/data.table/issues/5014). Thanks @michaelchirico for the suggestion and PR, and @ColeMiller1 for helping with the fix. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/data.table.R b/R/data.table.R index dbede49855..fa92561489 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -405,14 +405,17 @@ replace_dot_alias = function(e) { } else { # isub is a single symbol name such as B in DT[B] i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE) - if (inherits(i,"try-error")) { + if (inherits(i,"try-error") || is.function(i)) { # must be "not found" since isub is a mere symbol col = try(eval(isub, x), silent=TRUE) # is it a column name? - msg = if (inherits(col,"try-error")) " and it is not a column name either." - else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE", - ", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.") - stop(as.character(isub), " is not found in calling scope", msg, - " When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.") + msg = if (inherits(col, "try-error")) gettextf( + "'%s' is not found in calling scope and it is not a column name either. ", + as.character(isub) + ) else gettextf( + "'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE} is particularly clear and is optimized. ", + as.character(isub), typeof(col) + ) + stop(msg, "When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.") } } if (restore.N) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 36925a4998..070f7cf379 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11452,16 +11452,18 @@ if (exists("B")) rm(B) if (exists("NOTEXIST")) rm(NOTEXIST) if (exists("MyCol")) rm(MyCol) DT <- data.table(A = c(FALSE, TRUE), B = 2:1, C=c(2,3), MyCol=c(2,2)) -test(1773.01, DT[A], error = "A is not found in calling scope but it is a column of type logical.*==TRUE.*When the first argument") -test(1773.02, DT[B], error = "B is not found in calling scope but it is a column of type integer.*DT\\[\\(col\\)\\].*When the first argument") # 697 -test(1773.03, DT[C], error = "i has evaluated to type closure. 
Expecting logical, integer or double") # C picks up stats::C in calling scope -test(1773.04, DT[MyCol], error="MyCol is not found in calling scope but it is a column of type double.*DT\\[\\(col\\)\\].*When the first argument") -test(1773.05, DT[NOTEXIST], error = "NOTEXIST is not found in calling scope and it is not a column name either. When the first argument") +test(1773.01, DT[A], error = "'A' is not found in calling scope, but it is a column of type logical.*==TRUE.*When the first argument") +test(1773.02, DT[B], error = "'B' is not found in calling scope, but it is a column of type integer.*DT\\[\\(col\\)\\].*When the first argument") # 697 +test(1773.03, DT[C], error = "'C' is not found in calling scope, but it is a column of type double") # C picks up stats::C in calling scope +test(1773.04, DT[MyCol], error="'MyCol' is not found in calling scope, but it is a column of type double.*DT\\[\\(col\\)\\].*When the first argument") +test(1773.05, DT[NOTEXIST], error = "'NOTEXIST' is not found in calling scope and it is not a column name either. When the first argument") test(1773.06, DT[(A)], DT[2]) test(1773.07, DT[A==TRUE], DT[2]) test(1773.08, DT[(B)], data.table(A=c(TRUE,FALSE), B=1:2, C=c(3,2), MyCol=2)) test(1773.09, DT[(MyCol)], data.table(A=c(TRUE,TRUE), B=INT(1,1), C=c(3,3), MyCol=2)) test(1773.10, DT[(C)], data.table(A=c(TRUE,NA), B=c(1L,NA), C=c(3,NA), MyCol=c(2,NA))) +test(1773.11, data.table(subset=c(TRUE,FALSE))[subset], # i being a function name that's also a column name, #5014 + error="'subset' is not found in calling scope, but") # New as.data.table.array method in v1.10.5 set.seed(1L) @@ -16703,7 +16705,7 @@ set.seed(1) vDT = data.table(i_id = unique(iDT$i_id))[, .(v = runif(5,0,10), p = sample(c(5,5,10,10,10))), by=i_id] test(2120.01, !exists("i_id")) # quick verify in case there's an i_id in .GlobalEnv when testing in dev test(2120.02, iDT[i_id, order(e_date, e_time)], # first of all, the correct error - error="i_id is not found in calling scope but it is a column of type character") + error="'i_id' is not found in calling scope, but it is a column of type character") tmp = vDT[c("B","C","A"), on=.(i_id), .N, by=.EACHI] # split long statement in 2120.05 up as per demo in #3669 test(2120.03, tmp, data.table(i_id=c("B","C","A"), N=5L)) # just make sure the helper tmp is correct test(2120.04, tmp[iDT[i_id, order(e_date, e_time)]], # i_id obtained from tmp; this is what broke in dev 1.12.3 From c179b5e755554602087192031dfe42e5fc479972 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 23 May 2021 15:58:26 +0200 Subject: [PATCH 261/588] programming vignette tweak (#5010) --- _pkgdown.yml | 4 +++- vignettes/datatable-programming.Rmd | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 6d2ef397d3..4b02b39491 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -36,8 +36,10 @@ navbar: href: articles/datatable-keys-fast-subset.html - text: "Secondary indices and auto indexing" href: articles/datatable-secondary-indices-and-auto-indexing.html - - text: "Efficient reshaping using data.tables" + - text: "Efficient reshaping using data.table" href: articles/datatable-reshape.html + - text: "Programming on data.table" + href: articles/datatable-programming.html - text: "Frequently asked questions" href: articles/datatable-faq.html - text: "Importing data.table" diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 3fb59f4497..46008e7045 100644 --- a/vignettes/datatable-programming.Rmd +++ 
b/vignettes/datatable-programming.Rmd @@ -114,7 +114,7 @@ Though these can be helpful, we will be discussing a `data.table`-unique approac Now that we've established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main subject of this vignette, *programming on data.table*. -Starting from version 1.12.10, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). +Starting from version 1.14.2, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). ### Substituting variables and names From 77b20ae1767087caed8ced063b960b37a109daf1 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 25 May 2021 03:23:36 -0400 Subject: [PATCH 262/588] new measurev function (#5022) --- NEWS.md | 2 +- R/fmelt.R | 90 ++++++++++++++++++++++++++++--------------- inst/tests/tests.Rraw | 38 ++++++++++++++---- man/measure.Rd | 52 +++++++++++++++++++------ 4 files changed, 129 insertions(+), 53 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5f2c8210cf..fbd536ea37 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,7 +56,7 @@ 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. -9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing. +9. 
`melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to @tdhock for implementing. 10. A new interface for _programming on data.table_ has been added, closing [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing. diff --git a/R/fmelt.R b/R/fmelt.R index 56b075691d..009369ea9e 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -35,21 +35,11 @@ patterns = function(..., cols=character(0L)) { } measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { - # 1. basic error checking. - if (!missing(sep) && !missing(pattern)) { - stop("both sep and pattern arguments used in measure; must use either sep or pattern (not both)") - } - if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { - stop("multiple.keyword must be a character string with nchar>0") - } - if (!is.character(cols)) { - stop("cols must be a character vector of column names") - } - # 2. compute conversion function list with group names. mcall = match.call() L = as.list(mcall)[-1] - formal.names <- names(formals()) - fun.list = L[-which(names(L) %in% formal.names)] + formal.names = names(formals()) + formal.i.vec = which(names(L) %in% formal.names) + fun.list = L[-formal.i.vec] user.named = names(fun.list) != "" is.symb = sapply(fun.list, is.symbol) bad.i = which((!user.named) & (!is.symb)) @@ -57,26 +47,61 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) } names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) + fun.list[!user.named] = list(NULL) # group names error checking. - group.is.formal <- names(fun.list) %in% formal.names + group.is.formal = names(fun.list) %in% formal.names if (any(group.is.formal)) { - bad.names <- names(fun.list)[group.is.formal] + bad.names = names(fun.list)[group.is.formal] stop("group names specified in ... conflict with measure argument names; please fix by changing group names: ", paste(bad.names, collapse=",")) } - err.names.unique <- function(err.what, name.vec) { + # evaluate each value in ... 
and stop if not function. + for (fun.i in which(user.named)) { + fun = eval(fun.list[[fun.i]], parent.frame(1L)) + if (!is.function(fun) || length(formals(args(fun)))==0) { + stop("each ... argument to measure must be a function with at least one argument, problem: ", names(fun.list)[[fun.i]]) + } + fun.list[[fun.i]] = fun + } + measurev.args = c( + list(fun.list), + L[formal.i.vec], + list(group.desc="... arguments to measure")) + do.call(measurev, measurev.args) +} + +measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name", group.desc="elements of fun.list"){ + # 1. basic error checking. + if (!missing(sep) && !missing(pattern)) { + stop("both sep and pattern arguments used; must use either sep or pattern (not both)") + } + if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { + stop("multiple.keyword must be a character string with nchar>0") + } + if (!is.character(cols)) { + stop("cols must be a character vector of column names") + } + prob.i <- if (is.null(names(fun.list))) { + seq_along(fun.list) + } else { + which(names(fun.list) == "") + } + if (length(prob.i)) { + stop("in measurev, ", group.desc, " must be named, problems: ", paste(prob.i, collapse=",")) + } + err.names.unique = function(err.what, name.vec) { name.tab = table(name.vec) bad.counts = name.tab[1 < name.tab] if (length(bad.counts)) { - stop(err.what, " names should be unique, problems: ", paste(names(bad.counts), collapse=",")) + stop(err.what, " should be uniquely named, problems: ", paste(names(bad.counts), collapse=",")) } } - err.args.groups <- function(type, N){ + err.args.groups = function(type, N){ if (N != length(fun.list)) { - stop("number of ... arguments to measure =", length(fun.list), " must be same as ", type, " =", N) + stop("number of ", group.desc, " =", length(fun.list), " must be same as ", type, " =", N) } } - err.names.unique("measure group", names(fun.list)) - # 3. compute initial group data table, used as variable_table attribute. + err.names.unique(group.desc, names(fun.list)) + # 2. compute initial group data table, used as variable_table attribute. group.mat = if (!missing(pattern)) { if (!is.character(pattern)) { stop("pattern must be character string") @@ -108,34 +133,35 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { measure.vec = which(vector.lengths==n.groups) do.call(rbind, list.of.vectors[measure.vec]) } - err.names.unique("measured column", cols[measure.vec]) - uniq.mat <- unique(group.mat) + err.names.unique("measured columns", cols[measure.vec]) + uniq.mat = unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { stop("number of unique column IDs =", nrow(uniq.mat), " is less than number of melted columns =", nrow(group.mat), "; fix by changing pattern/sep") } colnames(group.mat) = names(fun.list) group.dt = data.table(group.mat) - # 4. apply conversion functions to group data table. - for (group.i in which(user.named)) { + # 3. apply conversion functions to group data table. + fun.i.vec = which(!sapply(fun.list, is.null)) + for (group.i in fun.i.vec) { group.name = names(fun.list)[[group.i]] - fun = eval(fun.list[[group.name]], parent.frame(1L)) + fun = fun.list[[group.i]] if (!is.function(fun) || length(formals(args(fun)))==0) { - stop("each ... 
argument to measure must be a function with at least one argument, problem: ", group.name) + stop("in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: ", group.name) } group.val = fun(group.dt[[group.name]]) if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { - stop("each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: ", group.name) + stop("each conversion function must return an atomic vector with same length as its first argument, problem: ", group.name) } if (all(is.na(group.val))) { stop(group.name, " conversion function returned vector of all NA") } set(group.dt, j=group.name, value=group.val) } - group.uniq <- unique(group.dt) + group.uniq = unique(group.dt) if (nrow(group.uniq) < nrow(group.dt)) { stop("number of unique groups after applying type conversion functions less than number of groups, change type conversion") } - # 5. compute measure.vars list or vector. + # 4. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. if (!is.character(group.dt[[multiple.keyword]])) { stop(multiple.keyword, " column class=", class(group.dt[[multiple.keyword]])[[1L]], " after applying conversion function, but must be character") @@ -149,7 +175,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { other.dt = data.table(do.call(expand.grid, other.values)) measure.list = structure(list(), variable_table=other.dt) column.values = unique(group.dt[[multiple.keyword]]) - for(column.val in column.values){ + for (column.val in column.values) { select.dt = data.table(other.dt) set(select.dt, j=multiple.keyword, value=column.val) measure.list[[column.val]] = data.table( @@ -160,7 +186,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { } else {# single output column. structure(measure.vec, variable_table=group.dt) } -} +} melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", value.name = "value", ..., na.rm = FALSE, variable.factor = TRUE, value.factor = FALSE, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 070f7cf379..13ca8f3b62 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17522,7 +17522,31 @@ test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 -# new variable.name attribute for measure.vars, PR#4731 for multiple issues +### First block testing measurev +# new variable_table attribute for measure.vars, PR#4731 for multiple issues +measurev = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. +test(2183.00001, melt(DT.wide, measure.vars=measurev()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +measurev = list("foo", "bar")#measurev below should not use this since it is not a function. 
+test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword +iris.dt = data.table(datasets::iris) +test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") +test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: 1") +test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: 1,2") +test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim") +test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list =3 must be same as max number of items after splitting column names =2") +test(2183.00042, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function()1), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") +test(2183.00043, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=interactive), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") +test(2183.00044, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function(x)1), pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") +test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list =3 must be same as number of capture groups in pattern =2") +test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: value.name") +# measure with factor conversion. +myfac = function(x)factor(x)#user-defined conversion function. +test(2183.00060, melt(DTid, measure.vars=measurev(list(letter=myfac, value.name=NULL), pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) + +### Second block testing measure +# new variable_table attribute for measure.vars, PR#4731 for multiple issues measure = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. test(2183.01, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) measure = list("foo", "bar")#measure below should not use this since it is not a function. 
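For orientation, a minimal sketch of how the new list-based `measurev()` interface exercised by the tests above is intended to be called (illustrative only; assumes a data.table build that includes this patch):

```r
# minimal sketch, assuming a build that includes the measurev() change above
library(data.table)
DT = data.table(id = 1, sex_child1 = "M", sex_child2 = "F",
                age_child1 = 10, age_child2 = 20)
# group names and conversion functions are supplied as data (a named list), not as R code
spec = list(value.name = NULL, child = as.integer)
melt(DT, measure.vars = measurev(spec, sep = "_child"))
# one row per child; two value columns ('sex' and 'age') plus an integer 'child' column
# produced by the as.integer conversion function
```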
@@ -17538,7 +17562,7 @@ test(2183.10, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table test(2183.11, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2")#make sure to check each list element, not just the first. # general measure errors. iris.dt = data.table(datasets::iris) -test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used in measure; must use either sep or pattern (not both)") +test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") # school example. schools.wide <- data.table( school = c("A","B"), @@ -17566,11 +17590,11 @@ test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", " test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") -test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each ... argument to measure must be a function that returns an atomic vector with same length as its first argument, problem: istr") +test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") -test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="measure group names should be unique, problems: value.name") +test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: value.name") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. 
test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) @@ -17587,8 +17611,8 @@ test(2183.65, melt(iris.days, measure.vars=measure(pattern="day")), error="patte test(2183.66, melt(iris.days, measure.vars=measure(value.name, pattern="(.*)")), error="value.name is the only group; fix by creating at least one more group") test(2183.67, melt(iris.days, measure.vars=measure(foo, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") test(2183.68, melt(iris.days, measure.vars=measure(value.name, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") -test(2183.69, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured column names should be unique, problems: ff") -test(2183.70, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured column names should be unique, problems: f_f") +test(2183.69, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured columns should be uniquely named, problems: ff") +test(2183.70, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured columns should be uniquely named, problems: f_f") test(2183.71, melt(iris.days, measure.vars=measure(value.name=as.integer, variable, pattern="day(.)[.](.*)")), error="value.name column class=integer after applying conversion function, but must be character") test(2183.72, melt(data.table(ff=1, ff=2, a=3, b=4), measure.vars=measure(letter, pattern="([ab])"), id.vars="ff"), data.table(ff=1, letter=c("a","b"), value=c(3,4)))#duplicate column names are fine if they are not matched by pattern. test(2183.73, melt(DTid, measure.vars=measure(letter, multiple.keyword, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: multiple.keyword") @@ -17599,8 +17623,6 @@ test(2183.77, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12] test(2183.78, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="cols must be a character vector of column names") test(2183.79, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer)), error="pattern must be character string") test(2183.80, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") -##melt(DTid, measure.vars=measure(letter, number, sep=NA_character_) -##melt(DTid, measure.vars=measure(letter, number, sep=character()) # `keyby` allows mixing eval/get with direct columns, #4981 dt <- data.table(a=c(1,2), b=c(3,4), c=c(1,0)) diff --git a/man/measure.Rd b/man/measure.Rd index 964660b6f8..73a315e006 100644 --- a/man/measure.Rd +++ b/man/measure.Rd @@ -1,31 +1,50 @@ \name{measure} \alias{measure} +\alias{measurev} \title{Specify measure.vars via regex or separator} \description{ -\code{measure} computes an integer vector or list which can be passed as -the \code{measure.vars} argument to \code{melt}. -See the \code{Efficient reshaping using -data.tables} vignette linked below to learn more. + These functions compute an integer vector or list for use as + the \code{measure.vars} argument to \code{melt}. + Each measured variable name is converted into several groups that occupy + different columns in the output melted data. 
+ \code{measure} allows specifying group names/conversions in R code + (each group and conversion specified as an argument) + whereas \code{measurev} allows specifying group names/conversions using + data values + (each group and conversion specified as a list element). + See + \href{../doc/datatable-reshape.html}{\code{vignette("datatable-reshape")}} + for more info. } \usage{ measure(\dots, sep, pattern, cols, multiple.keyword="value.name") +measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name", + group.desc="elements of fun.list") } \arguments{ \item{\dots}{One or more (1) symbols (without argument name; symbol - is used for output variable column name) or (2) - functions (with argument name that is used for output variable - column name). Must have same number of arguments as groups that are + is used for group name) or (2) functions to convert the groups + (with argument name that is used for group name). + Must have same number of arguments as groups that are specified by either \code{sep} or \code{pattern} arguments.} - \item{sep}{Separator to split \code{cols} into groups. Columns that - result in the maximum number of groups are considered measure variables.} + \item{fun.list}{Named list which must have the same number of + elements as groups that are specified by either \code{sep} or + \code{pattern} arguments. Each name used for a group + name, and each value must be either a function + (to convert the group from a character vector to an atomic vector of the + same size) or NULL (no conversion).} + \item{sep}{Separator to split each element of \code{cols} into + groups. Columns that result in the maximum number of groups + are considered measure variables.} \item{pattern}{Perl-compatible regex with capture groups to match to \code{cols}. Columns that match the regex are considered measure variables.} \item{cols}{A character vector of column names.} - \item{multiple.keyword}{A string, if used in \code{\dots}, then + \item{multiple.keyword}{A string, if used as a group name, then measure returns a list and melt returns multiple value columns (with names defined by the unique values in that - group). Otherwise if the string not used in \code{\dots}, then + group). Otherwise if the string not used as a group name, then measure returns a vector and melt returns a single value column.} + \item{group.desc}{Internal, used in error messages.} } \seealso{ \code{\link{melt}}, @@ -35,20 +54,29 @@ measure(\dots, sep, pattern, cols, multiple.keyword="value.name") (two.iris = data.table(datasets::iris)[c(1,150)]) # melt into a single value column. melt(two.iris, measure.vars = measure(part, dim, sep=".")) +# do the same, programmatically with measurev +my.list = list(part=NULL, dim=NULL) +melt(two.iris, measure.vars=measurev(my.list, sep=".")) # melt into two value columns, one for each part. melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) # melt into two value columns, one for each dim. melt(two.iris, measure.vars = measure(part, value.name, sep=".")) -# melt using either sep or pattern, converting child number to integer. +# melt using sep, converting child number to integer. (two.families = data.table(sex_child1="M", sex_child2="F", age_child1=10, age_child2=20)) print(melt(two.families, measure.vars = measure( value.name, child=as.integer, sep="_child" )), class=TRUE) +# same melt using pattern. 
print(melt(two.families, measure.vars = measure( value.name, child=as.integer, pattern="(.*)_child(.)" )), class=TRUE) +# same melt with pattern and measurev function list. +print(melt(two.families, measure.vars = measurev( + list(value.name=NULL, child=as.integer), + pattern="(.*)_child(.)" +)), class=TRUE) # inspired by data(who, package="tidyr") (who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) # melt to three variable columns, all character. From 0c1e9cdc56f283027221c62a858588846b37e79a Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 25 May 2021 11:12:47 +0200 Subject: [PATCH 263/588] fixes coerce from xts where a column x was preent (#4898) --- NEWS.md | 2 ++ R/xts.R | 2 +- inst/tests/tests.Rraw | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index fbd536ea37..1c415854d3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -129,6 +129,8 @@ 17. Assigning a wrong-length or non-list vector to a list column could segfault, [#4166](https://github.com/Rdatatable/data.table/issues/4166) [#4667](https://github.com/Rdatatable/data.table/issues/4667) [#4678](https://github.com/Rdatatable/data.table/issues/4678) [#4729](https://github.com/Rdatatable/data.table/issues/4729). Thanks to @fklirono, Kun Ren, @kevinvzandvoort and @peterlittlejohn for reporting, and to Václav Tlapák for the PR. +18. `as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/xts.R b/R/xts.R index 31c5ad2309..fce6aad3b5 100644 --- a/R/xts.R +++ b/R/xts.R @@ -8,7 +8,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { if (identical(keep.rownames, FALSE)) return(r[]) index_nm = if (is.character(keep.rownames)) keep.rownames else "index" if (index_nm %chin% names(x)) stop(domain=NA, gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. 
Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm)) - r[, c(index_nm) := zoo::index(x)] + r[, c(index_nm) := zoo::index(x), env=list(x=x)] setcolorder(r, c(index_nm, setdiff(names(r), index_nm))) # save to end to allow for key=index_nm setkeyv(r, key) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 13ca8f3b62..7fdbc8f56b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6714,6 +6714,10 @@ if (test_xts) { " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", " 9: 1970-01-10 9", "10: 1970-01-11 10")) options(old) + + # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 + M = as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) + test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) } @@ -17700,3 +17704,4 @@ test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerce DT = data.table(i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L), v = 0) test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1:0, V1=c(0,0)), output="gforce") + From 60427ca58f8f97e881d279625e7565ec7e7d3391 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 25 May 2021 12:13:14 +0200 Subject: [PATCH 264/588] update and clarify := docs (#4481) --- man/assign.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/assign.Rd b/man/assign.Rd index 5cfc42b9a9..f622755606 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -63,7 +63,7 @@ For additional resources, please read \href{../doc/datatable-faq.html}{\code{vig When \code{LHS} is a factor column and \code{RHS} is a character vector with items missing from the factor levels, the new level(s) are automatically added (by reference, efficiently), unlike base methods. -Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerced to match the type of the (often small) RHS. Instead the RHS is coerced to match the type of the LHS, if necessary. Where this involves double precision values being coerced to an integer column, a warning is given (whether or not fractional data is truncated). The motivation for this is efficiency. It is best to get the column types correct up front and stick to them. Changing a column type is possible but deliberately harder: provide a whole column as the RHS. This RHS is then \emph{plonked} into that column slot and we call this \emph{plonk syntax}, or \emph{replace column syntax} if you prefer. By needing to construct a full length vector of a new type, you as the user are more aware of what is happening, and it is clearer to readers of your code that you really do intend to change the column type. +Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerced to match the type of the (often small) RHS. Instead the RHS is coerced to match the type of the LHS, if necessary. Where this involves double precision values being coerced to an integer column, a warning is given when fractional data is truncated. It is best to get the column types correct up front and stick to them. Changing a column type is possible but deliberately harder: provide a whole column as the RHS. This RHS is then \emph{plonked} into that column slot and we call this \emph{plonk syntax}, or \emph{replace column syntax} if you prefer. 
By needing to construct a full length vector of a new type, you as the user are more aware of what is happening and it is clearer to readers of your code that you really do intend to change the column type; e.g., \code{DT[, colA:=as.integer(colA)]}. A plonk occurs whenever you provide a RHS value to `:=` which is \code{nrow} long. When a column is \emph{plonked}, the original column is not updated by reference because that would entail updating every single element of that column whereas the plonk is just one column pointer update. \code{data.table}s are \emph{not} copied-on-change by \code{:=}, \code{setkey} or any of the other \code{set*} functions. See \code{\link{copy}}. } @@ -72,7 +72,7 @@ Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerc Since \code{[.data.table} incurs overhead to check the existence and type of arguments (for example), \code{set()} provides direct (but less flexible) assignment by reference with low overhead, appropriate for use inside a \code{for} loop. See examples. \code{:=} is more powerful and flexible than \code{set()} because \code{:=} is intended to be combined with \code{i} and \code{by} in single queries on large datasets. } -\section{Note:}{ +\note{ \code{DT[a > 4, b := c]} is different from \code{DT[a > 4][, b := c]}. The first expression updates (or adds) column \code{b} with the value \code{c} on those rows where \code{a > 4} evaluates to \code{TRUE}. \code{X} is updated \emph{by reference}, therefore no assignment needed. The second expression on the other hand updates a \emph{new} \code{data.table} that's returned by the subset operation. Since the subsetted data.table is ephemeral (it is not assigned to a symbol), the result would be lost; unless the result is assigned, for example, as follows: \code{ans <- DT[a > 4][, b := c]}. From 5cb17c0b76afccee81a91af859e26d53677e41f3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 25 May 2021 03:29:11 -0700 Subject: [PATCH 265/588] remove obsolete Makefile (#4557) --- vignettes/Makefile | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 vignettes/Makefile diff --git a/vignettes/Makefile b/vignettes/Makefile deleted file mode 100644 index bdc2822fc6..0000000000 --- a/vignettes/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# Makefile to use knitr for package vignettes - -clean: - rm -rf *.tex *.bbl *.blg *.aux *.out *.toc *.log *.spl *tikzDictionary *.md figure/ - -%.html: %.Rmd - $(R_HOME)/bin/Rscript -e "if (getRversion() < '3.0.0') knitr::knit2html('$*.Rmd')" From 73204cde7d16f9421ae7482a47d3b422fcc8216b Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 25 May 2021 21:57:36 +0200 Subject: [PATCH 266/588] CI: r-rel and r-oldrel upgrade (#5025) --- .gitlab-ci.yml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 206502a56c..d36f99fbcd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,9 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. 
- R_REL_VERSION: "4.0" + R_REL_VERSION: "4.1" R_DEVEL_VERSION: "4.2" - R_OLDREL_VERSION: "3.6" + R_OLDREL_VERSION: "4.0" stages: - dependencies @@ -96,16 +96,14 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.3/R-4.0.3-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.0/R-4.1.0-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win - - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/3.6.3/R-3.6.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.5/R-4.0.5-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait -.test-install-rtools35-win: &install-rtools35-win - - curl.exe -s -o ../Rtools35.exe https://cloud.r-project.org/bin/windows/Rtools/Rtools35.exe; Start-Process -FilePath ..\Rtools35.exe -ArgumentList "/VERYSILENT /DIR=C:\Rtools" -NoNewWindow -Wait .test-template: &test stage: test @@ -191,7 +189,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual variables: _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 - _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "2 NOTEs" + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0 before_script: - *install-deps - *cp-src @@ -205,7 +203,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 2 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 2 NOTEs"), " (size of tarball) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (installed package size) but ", shQuote(l)) else q("no")' test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin @@ -285,8 +283,8 @@ 
test-old-win: ## R-oldrel on Windows R_VERSION: "$R_OLDREL_VERSION" before_script: - *install-r-oldrel-win - - *install-rtools35-win - - $ENV:PATH = "C:\R\bin;C:\Rtools\bin;$ENV:PATH" + - *install-rtools-win + - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - *install-deps-win - *cp-src-win - rm.exe -r bus From 5f86efd0c8ed5035dca4511d0b2b23e549dcf087 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 25 May 2021 20:15:28 -0600 Subject: [PATCH 267/588] test failed under R 3.5.0 and R 3.4.4 (#5026) --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7fdbc8f56b..12a38ffae4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6704,6 +6704,7 @@ if (test_xts) { setcolorder(dt, c(2, 3, 1)) dt[ , char_col := 'a'] test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') + if (base::getRversion() < "3.6.0") rm(as.xts) # 890 -- key argument for as.data.table.xts x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) @@ -6716,7 +6717,7 @@ if (test_xts) { options(old) # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 - M = as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) + M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) From 4789e556cf2aecbe4bb873104afd873f2971f784 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 26 May 2021 15:36:57 -0700 Subject: [PATCH 268/588] a pass at clarifying ?froll (#4551) --- man/froll.Rd | 80 ++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/man/froll.Rd b/man/froll.Rd index b1fc2cc970..a18ef9545e 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -12,71 +12,65 @@ \alias{frollapply} \title{Rolling functions} \description{ - Fast rolling functions to calculate aggregates on sliding window. Function name and arguments are experimental. + Fast rolling functions to calculate aggregates on sliding windows. Function name and arguments are experimental. } \usage{ -frollmean(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", - "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollsum(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", - "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) +frollmean(x, n, fill=NA, algo=c("fast", "exact"), + align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) +frollsum(x, n, fill=NA, algo=c("fast","exact"), + align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) } \arguments{ - \item{x}{ vector, data.frame or data.table of numeric or logical columns. May also be a list, in which case the rolling function is applied to each of its elements. } - \item{n}{ integer vector, for adaptive rolling function also list of - integer vectors, rolling window size. } - \item{fill}{ numeric or logical, value to pad by. Defaults to \code{NA}. } - \item{algo}{ character, default \code{"fast"}. When set to \code{"exact"}, - then slower algorithm is used. It suffers less from floating point - rounding error, performs extra pass to adjust rounding error - correction and carefully handles all non-finite values. If available - it will use multiple cores. See details for more information. 
} - \item{align}{ character, define if rolling window covers preceding rows - (\code{"right"}), following rows (\code{"left"}) or centered - (\code{"center"}). Defaults to \code{"right"}. } - \item{na.rm}{ logical. Should missing values be removed when - calculating window? Defaults to \code{FALSE}. For details on handling - other non-finite values, see details below. } - \item{hasNA}{ logical. If it is known that \code{x} contains \code{NA} - then setting to \code{TRUE} will speed up. Defaults to \code{NA}. } - \item{adaptive}{ logical, should adaptive rolling function be - calculated, default \code{FALSE}. See details below. } - \item{FUN}{ the function to be applied in rolling fashion; see Details for restrictions } - \item{\dots}{ extra arguments passed to \code{FUN} in \code{frollapply}. } + \item{x}{ Vector, \code{data.frame} or \code{data.table} of integer, numeric or logical columns over which to calculate the windowed aggregations. May also be a list, in which case the rolling function is applied to each of its elements. } + \item{n}{ Integer vector giving rolling window size(s). This is the \emph{total} number of included values. Adaptive rolling functions also accept a list of integer vectors. } + \item{fill}{ Numeric; value to pad by. Defaults to \code{NA}. } + \item{algo}{ Character, default \code{"fast"}. When set to \code{"exact"}, a slower (but more accurate) algorithm is used. It + suffers less from floating point rounding errors by performing an extra pass, and carefully handles all non-finite values. + It will use mutiple cores where available. See Details for more information. } + \item{align}{ Character, specifying the "alignment" of the rolling window, defaulting to \code{"right"}. \code{"right"} covers preceding rows (the window \emph{ends} on the current value); \code{"left"} covers following rows (the window \emph{starts} on the current value); \code{"center"} is halfway in between (the window is \emph{centered} on the current value, biased towards \code{"left"} when \code{n} is even). } + \item{na.rm}{ Logical, default \code{FALSE}. Should missing values be removed when + calculating window? For details on handling other non-finite values, see Details. } + \item{hasNA}{ Logical. If it is known that \code{x} contains \code{NA} + then setting this to \code{TRUE} will speed up calculation. Defaults to \code{NA}. } + \item{adaptive}{ Logical, default \code{FALSE}. Should the rolling function be calculated adaptively? See Details below. } + \item{FUN}{ The function to be applied to the rolling window; see Details for restrictions. } + \item{\dots}{ Extra arguments passed to \code{FUN} in \code{frollapply}. } } \details{ - \code{froll*} functions accepts vectors, lists, data.frames or - data.tables. They always return a list except when the input is a - \code{vector} and \code{length(n)==1} in which case a \code{vector} - is returned, for convenience. Thus rolling functions can be used - conveniently within data.table syntax. + \code{froll*} functions accept vectors, lists, \code{data.frames} or + \code{data.tables}. They always return a list except when the input is a + \code{vector} and \code{length(n)==1}, in which case a \code{vector} + is returned, for convenience. Thus, rolling functions can be used + conveniently within \code{data.table} syntax. Argument \code{n} allows multiple values to apply rolling functions on - multiple window sizes. If \code{adaptive=TRUE}, then it expects a list. + multiple window sizes. 
If \code{adaptive=TRUE}, then \code{n} must be a list. Each list element must be integer vector of window sizes corresponding - to every single observation in each column. + to every single observation in each column; see Examples. - When \code{algo="fast"} then \emph{on-line} algorithm is used, also - any \code{NaN, +Inf, -Inf} is treated as \code{NA}. - Setting \code{algo="exact"} will make rolling functions to use - compute-intensive algorithm that suffers less from floating point - rounding error. It also handles \code{NaN, +Inf, -Inf} consistently to + When \code{algo="fast"} an \emph{"on-line"} algorithm is used, and + all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. + Setting \code{algo="exact"} will make rolling functions to use a more + computationally-intensive algorithm that suffers less from floating point + rounding error (the same consideration applies to \code{\link[base]{mean}}). + \code{algo="exact"} also handles \code{NaN, +Inf, -Inf} consistently to base R. In case of some functions (like \emph{mean}), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) when using multiple threads. - Adaptive rolling functions are special cases where for each single - observation has own corresponding rolling window width. Due to the logic - of adaptive rolling functions, following restrictions apply: + Adaptive rolling functions are a special case where each + observation has its own corresponding rolling window width. Due to the logic + of adaptive rolling functions, the following restrictions apply: \itemize{ \item{ \code{align} only \code{"right"}. } \item{ if list of vectors is passed to \code{x}, then all - list vectors must have equal length. } + vectors within it must have equal length. } } When multiple columns or multiple windows width are provided, then they - are run in parallel. Except for the \code{algo="exact"} which runs in + are run in parallel. The exception is for \code{algo="exact"}, which runs in parallel already. \code{frollapply} computes rolling aggregate on arbitrary R functions. From b97568a07e1669612e243119b3611bc90e514469 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 26 May 2021 17:57:18 -0700 Subject: [PATCH 269/588] mirror fix to base R for as.IDate (#4692) --- NEWS.md | 4 +++- R/IDateTime.R | 4 ++++ inst/tests/tests.Rraw | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1c415854d3..4e108fae47 100644 --- a/NEWS.md +++ b/NEWS.md @@ -43,7 +43,7 @@ Item 2 has 0 rows but longest item has 3; filled with NA ``` -5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor lengh 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing. +5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor length 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing. 6. `keyby=` now accepts `TRUE`/`FALSE` together with `by=`, [#4307](https://github.com/Rdatatable/data.table/issues/4307). The primary motivation is benchmarking where `by=` vs `keyby=` is varied across a set of queries. 
Thanks to Jan Gorecki for the request and the PR. @@ -131,6 +131,8 @@ 18. `as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR. +19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/IDateTime.R b/R/IDateTime.R index 0bf7bf0fac..cfacfacbb2 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -7,6 +7,10 @@ as.IDate = function(x, ...) 
UseMethod("as.IDate") as.IDate.default = function(x, ..., tz = attr(x, "tzone", exact=TRUE)) { if (is.null(tz)) tz = "UTC" + if (is.character(x)) { + # 4676 mimics for back-compatibility a similar patch applied to as.Date.character in r79119 + is.na(x) = !nzchar(x) + } as.IDate(as.Date(x, tz = tz, ...)) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 12a38ffae4..218de465da 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17705,4 +17705,7 @@ test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerce DT = data.table(i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L), v = 0) test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1:0, V1=c(0,0)), output="gforce") +# base::as.Date was error when first item blank, affecting as.IDate, #4676 +test(2192.1, as.IDate(c('', '2020-01-01')), structure(c(NA_integer_, 18262L), class=c("IDate","Date"))) +test(2192.2, as.IDate(c('2020-01-01', '')), structure(c(18262L, NA_integer_), class=c("IDate","Date"))) From df8009dda8a6b59f3dac98da3c27ab4b93fb4316 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 28 May 2021 17:33:39 -0600 Subject: [PATCH 270/588] comment only, follow up to #4692 --- R/IDateTime.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/IDateTime.R b/R/IDateTime.R index cfacfacbb2..832424091f 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -8,7 +8,7 @@ as.IDate = function(x, ...) UseMethod("as.IDate") as.IDate.default = function(x, ..., tz = attr(x, "tzone", exact=TRUE)) { if (is.null(tz)) tz = "UTC" if (is.character(x)) { - # 4676 mimics for back-compatibility a similar patch applied to as.Date.character in r79119 + # backport of similar patch to base::as.Date.character in R 4.0.3, #4676 is.na(x) = !nzchar(x) } as.IDate(as.Date(x, tz = tz, ...)) From e35bea257a1b346e02c926c3d608fe05947a1331 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 28 May 2021 17:43:08 -0600 Subject: [PATCH 271/588] tweak froll.Rd only, follow up to #4551 --- man/froll.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/froll.Rd b/man/froll.Rd index a18ef9545e..090b397a90 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -38,8 +38,8 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) \item{\dots}{ Extra arguments passed to \code{FUN} in \code{frollapply}. } } \details{ - \code{froll*} functions accept vectors, lists, \code{data.frames} or - \code{data.tables}. They always return a list except when the input is a + \code{froll*} functions accept vectors, lists, \code{data.frame}s or + \code{data.table}s. They always return a list except when the input is a \code{vector} and \code{length(n)==1}, in which case a \code{vector} is returned, for convenience. Thus, rolling functions can be used conveniently within \code{data.table} syntax. 
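To make the revised `?froll` wording above concrete, here is a small sketch of the documented behaviour (alignment, `fill`, and adaptive windows); values are chosen only for illustration and assume a recent data.table release:

```r
# small sketch of the rolling-window semantics described in the ?froll edits above
library(data.table)
x = c(1, 3, 5, 7, 9)
frollmean(x, n = 3)                    # right-aligned: the window ends on the current value
frollmean(x, n = 3, align = "center")  # window centered on the current value
frollsum(x, n = 2, fill = 0)           # pad the incomplete leading window with 0 instead of NA
# adaptive = TRUE: each observation gets its own window width
frollmean(x, n = list(c(1L, 1L, 2L, 2L, 3L)), adaptive = TRUE)
```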
From 8fc2066518de8b3f50525a2f515eafb677cbdc8d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 28 May 2021 21:04:12 -0700 Subject: [PATCH 272/588] Extend error message for auto-naming (#5029) --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index fa92561489..b405a3c2af 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -945,7 +945,7 @@ replace_dot_alias = function(e) { if (length(nm) != length(jvnames)) warning("j may not evaluate to the same number of columns for each group; if you're sure this warning is in error, please put the branching logic outside of [ for efficiency") else if (any(idx <- nm != jvnames)) - warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) + warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names. If this was intentional (e.g., you know only one branch will ever be used in a given query because the branch is controlled by a function argument), please (1) pull this branch out of the call; (2) explicitly provide missing defaults for each branch in all cases; or (3) use the same name for each branch and re-name it in a follow-up call.', call. = FALSE) } jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result. From feebb0e3700e0a38d53588ce420071c480170c51 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 8 Jun 2021 17:42:26 -0600 Subject: [PATCH 273/588] .dev-only: revdep.R's dump.frames needed utils:: prefix and improved incorrect Makevars handling --- .dev/revdep.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 49aa6e06f9..7471671bcc 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -9,7 +9,7 @@ Sys.unsetenv("R_PROFILE_USER") # options copied from .dev/.Rprofile that aren't run due to the way this script is started via a profile options(help_type="html") -options(error=quote(dump.frames())) +options(error=quote(utils::dump.frames())) options(width=200) # for cran() output not to wrap # Check that env variables have been set correctly: @@ -36,10 +36,12 @@ stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"true")) # e.g. https://github.com/reimandlab/ActivePathways/issues/14 cflags = system("grep \"^[^#]*CFLAGS\" ~/.R/Makevars", intern=TRUE) -cat("~/.R/Makevars contains", cflags, "ok\n") -if (!grepl("^CFLAGS=-O[0-3]$", cflags)) { +cat("~/.R/Makevars contains", cflags) +if (!grepl("^CFLAGS=-O[0-3] *$", cflags)) { stop("Some packages have failed to install in the past (e.g. processx and RGtk2) when CFLAGS contains -pedandic, -Wall, and similar. ", - "So for revdepr keep CFLAGS simple; i.e. -O[0-3] only.") + "So for revdepr keep CFLAGS simple; i.e. -O[0-3] only. 
Check ~/.R/Makevars.") +} else { + cat(" ok\n") } options(repos = c("CRAN"=c("http://cloud.r-project.org"))) From d9e1afd5519a3fe32caf02358cdf5f374e3831e8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 9 Jun 2021 16:36:18 -0600 Subject: [PATCH 274/588] .dev-only: cran() now has 2 macos for r-release --- .dev/revdep.R | 1 - 1 file changed, 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 7471671bcc..c172eb163f 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -250,7 +250,6 @@ cran = function() # reports CRAN status of the .cran.fail packages cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") rel = unique(db$Flavor) rel = sort(rel[grep("release",rel)]) - stopifnot(identical(rel, c("r-release-linux-x86_64", "r-release-macos-x86_64", "r-release-windows-ix86+x86_64"))) cat("R-release is used for revdep checking so comparing to CRAN results for R-release\n") ans = db[Package %chin% .fail.cran & Flavor %chin% rel, Status, keyby=.(Package, Flavor)] dcast(ans, Package~Flavor, value.var="Status", fill="")[.fail.cran,] From 7f05222f46b83d235ddc1847dd32d54f6d90f4b4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 10 Jun 2021 10:42:08 -0700 Subject: [PATCH 275/588] remove domain= usage in setops.R (#5038) --- R/setops.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/R/setops.R b/R/setops.R index bd3a4eed27..d8fcb9dfcf 100644 --- a/R/setops.R +++ b/R/setops.R @@ -157,9 +157,8 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu return(gettextf( "Datasets have different %s. 'target': %s. 'current': %s.", "keys", - if(length(k1)) brackify(k1) else gettextf("has no key", domain="R-data.table"), - if(length(k2)) brackify(k2) else gettextf("has no key", domain="R-data.table"), - domain="R-data.table" + if(length(k1)) brackify(k1) else gettextf("has no key"), + if(length(k2)) brackify(k2) else gettextf("has no key") )) } # check index @@ -169,9 +168,8 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu return(gettextf( "Datasets have different %s. 'target': %s. 'current': %s.", "indices", - if(length(i1)) brackify(i1) else gettextf("has no index", domain = "R-data.table"), - if(length(i2)) brackify(i2) else gettextf("has no index", domain = "R-data.table"), - domain = "R-data.table" + if(length(i1)) brackify(i1) else gettextf("has no index"), + if(length(i2)) brackify(i2) else gettextf("has no index") )) } From f08fc1585bd5f2c9c831bf24c0b251ff5f39d3b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20Lu=C5=A1trik?= Date: Fri, 11 Jun 2021 00:25:44 +0200 Subject: [PATCH 276/588] Mention the different default of data.frame stringsAsFactors in R >= 4.0 (#4560) --- vignettes/datatable-faq.Rmd | 2 +- vignettes/datatable-intro.Rmd | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index 816cb99882..1df42e166c 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -395,7 +395,7 @@ A key advantage of column vectors in R is that they are _ordered_, unlike SQL[^2 - `DT[ColA == ColB]` is simpler than `DF[!is.na(ColA) & !is.na(ColB) & ColA == ColB, ]` - `data.frame(list(1:2, "k", 1:4))` creates 3 columns, data.table creates one `list` column. - `check.names` is by default `TRUE` in `data.frame` but `FALSE` in data.table, for convenience. - - `stringsAsFactors` is by default `TRUE` in `data.frame` but `FALSE` in data.table, for efficiency. 
Since a global string cache was added to R, characters items are a pointer to the single cached string and there is no longer a performance benefit of converting to `factor`. + - `data.table` has always set `stringsAsFactors=FALSE` by default. In R 4.0.0 (Apr 2020), `data.frame`'s default was changed from `TRUE` to `FALSE` and there is no longer a difference in this regard; see [stringsAsFactors, Kurt Hornik, Feb 2020](https://developer.r-project.org/Blog/public/2020/02/16/stringsasfactors/). - Atomic vectors in `list` columns are collapsed when printed using `", "` in `data.frame`, but `","` in data.table with a trailing comma after the 6th item to avoid accidental printing of large embedded objects. In `[.data.frame` we very often set `drop = FALSE`. When we forget, bugs can arise in edge cases where single columns are selected and all of a sudden a vector is returned rather than a single column `data.frame`. In `[.data.table` we took the opportunity to make it consistent and dropped `drop`. diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 1dcfe786f5..c5da5d87d8 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -88,8 +88,6 @@ You can also convert existing objects to a `data.table` using `setDT()` (for `da #### Note that: {.bs-callout .bs-callout-info} -* Unlike `data.frame`s, columns of `character` type are *never* converted to `factors` by default. - * Row numbers are printed with a `:` in order to visually separate the row number from the first column. * When the number of rows to print exceeds the global option `datatable.print.nrows` (default = `r getOption("datatable.print.nrows")`), it automatically prints only the top 5 and bottom 5 rows (as can be seen in the [Data](#data) section). If you've had a lot of experience with `data.frame`s, you may have found yourself waiting around while larger tables print-and-page, sometimes seemingly endlessly. 
You can query the default number like so: From 11f2d7e0f7f7ad355ddd108276fb1948d4703719 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 11 Jun 2021 01:47:52 +0200 Subject: [PATCH 277/588] unit tests, closes #2530 (#4578) --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 218de465da..e63e5cf33d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17709,3 +17709,10 @@ test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1: test(2192.1, as.IDate(c('', '2020-01-01')), structure(c(NA_integer_, 18262L), class=c("IDate","Date"))) test(2192.2, as.IDate(c('2020-01-01', '')), structure(c(18262L, NA_integer_), class=c("IDate","Date"))) +# subassign coerce to integer64 was fixed in 1.12.4, #2530 +if (test_bit64) { + DT = data.table(a = as.integer64(1:10)) + DT[a==1, a:=12] + DT[a==2, a:=as.integer64(13)] + test(2193, DT, data.table(a = as.integer64(c(12,13,3:10)))) +} From ad5b4275b1861c7a0f6bcd509fb804e5210a3ba0 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 11 Jun 2021 02:25:30 +0200 Subject: [PATCH 278/588] unit tests for #3779 (#4576) --- inst/tests/tests.Rraw | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e63e5cf33d..5460aec8cc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17709,10 +17709,15 @@ test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1: test(2192.1, as.IDate(c('', '2020-01-01')), structure(c(NA_integer_, 18262L), class=c("IDate","Date"))) test(2192.2, as.IDate(c('2020-01-01', '')), structure(c(18262L, NA_integer_), class=c("IDate","Date"))) -# subassign coerce to integer64 was fixed in 1.12.4, #2530 if (test_bit64) { + # subassign coerce to integer64 was fixed in 1.12.4, #2530 DT = data.table(a = as.integer64(1:10)) DT[a==1, a:=12] DT[a==2, a:=as.integer64(13)] - test(2193, DT, data.table(a = as.integer64(c(12,13,3:10)))) + test(2193.1, DT, data.table(a = as.integer64(c(12,13,3:10)))) + + # X[Y,,by=.EACHI] when Y contains integer64 also fixed in 1.12.4, #3779 + X = data.table(x=1:3) + Y = data.table(x=1:2, y=as.integer64(c(10,20))) + test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } From 36e1b02c2a42533cabb5e729c2a7b53c5ade723c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 11 Jun 2021 11:28:02 -0700 Subject: [PATCH 279/588] some sapply->vapply replacements (#4502) --- R/as.data.table.R | 2 +- R/cedta.R | 2 +- R/data.table.R | 2 +- R/fread.R | 6 +++--- R/groupingsets.R | 4 ++-- R/setkey.R | 2 +- R/utils.R | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index 9d286d7f16..75e8d23ae0 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -97,7 +97,7 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va # NULL dimnames will create integer keys, not character as in table method val = if (is.null(dnx)) { lapply(dx, seq.int) - } else if (any(nulldnx<-sapply(dnx, is.null))) { + } else if (any(nulldnx <- vapply_1b(dnx, is.null))) { dnx[nulldnx] = lapply(dx[nulldnx], seq.int) #3636 dnx } else dnx diff --git a/R/cedta.R b/R/cedta.R index d3a90e93cc..7ace210079 100644 --- a/R/cedta.R +++ b/R/cedta.R @@ -34,7 +34,7 @@ cedta = function(n=2L) { (exists("debugger.look", parent.frame(n+1L)) || (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for 
#2972 (nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply - (nsname %chin% cedta.pkgEvalsUserCode && any(sapply(sys.calls(), function(x) is.name(x[[1L]]) && (x[[1L]]=="eval" || x[[1L]]=="evalq")))) || + (nsname %chin% cedta.pkgEvalsUserCode && any(vapply_1b(sys.calls(), function(x) is.name(x[[1L]]) && (x[[1L]]=="eval" || x[[1L]]=="evalq")))) || nsname %chin% cedta.override || isTRUE(ns$.datatable.aware) || # As of Sep 2018: RCAS, caretEnsemble, dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist tryCatch("data.table" %chin% get(".Depends",paste("package",nsname,sep=":"),inherits=FALSE),error=function(e)FALSE) # both ns$.Depends and get(.Depends,ns) are not sufficient diff --git a/R/data.table.R b/R/data.table.R index b405a3c2af..85f1d9edb3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1404,7 +1404,7 @@ replace_dot_alias = function(e) { setattr(jval, 'class', class(x)) # fix for #64 if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) setattr(jval, 'sorted', key(x)) - if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov + if (any(vapply_1b(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov } return(jval) } diff --git a/R/fread.R b/R/fread.R index 2f918fb2bd..236a30bb76 100644 --- a/R/fread.R +++ b/R/fread.R @@ -206,7 +206,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if ('schema' %chin% yaml_names) { new_types = sapply(yaml_header$schema$fields, `[[`, 'type') - if (any(null_idx <- sapply(new_types, is.null))) + if (any(null_idx <- vapply_1b(new_types, is.null))) new_types = do.call(c, new_types) synonms = rbindlist(list( character = list(syn = c('character', 'string')), @@ -346,10 +346,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (yaml) setattr(ans, 'yaml_metadata', yaml_header) if (!is.null(index) && data.table) { - if (!all(sapply(index, is.character))) + if (!all(vapply_1b(index, is.character))) stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (is.list(index)) { - to_split = sapply(index, length) == 1L + to_split = vapply_1i(index, length) == 1L if (any(to_split)) index[to_split] = sapply(index[to_split], strsplit, split = ",", fixed = TRUE) } else { diff --git a/R/groupingsets.R b/R/groupingsets.R index 5c3ad02d4b..2300d09da0 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -53,7 +53,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("Argument 'by' must be a character vector of column names used in grouping.") if (anyDuplicated(by) > 0L) stop("Argument 'by' must have unique column names for grouping.") - if (!is.list(sets) || !all(sapply(sets, is.character))) + if (!is.list(sets) || !all(vapply_1b(sets, is.character))) stop("Argument 'sets' must be a list of character vectors.") if (!is.logical(id)) stop("Argument 'id' must be a logical scalar.") @@ -62,7 +62,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("All columns used in 'sets' argument must be in 'by' too. 
Columns used in 'sets' but not present in 'by': ", brackify(setdiff(sets.all.by, by))) if (id && "grouping" %chin% names(x)) stop("When using `id=TRUE` the 'x' data.table must not have a column named 'grouping'.") - if (any(sapply(sets, anyDuplicated))) + if (any(vapply_1i(sets, anyDuplicated))) # anyDuplicated returns index of first duplicate, otherwise 0L stop("Character vectors in 'sets' list must not have duplicated column names within a single grouping set.") if (length(sets) > 1L && (idx<-anyDuplicated(lapply(sets, sort)))) warning("'sets' contains a duplicate (i.e., equivalent up to sorting) element at index ", idx, "; as such, there will be duplicate rows in the output -- note that grouping by A,B and B,A will produce the same aggregations. Use `sets=unique(lapply(sets, sort))` to eliminate duplicates.") diff --git a/R/setkey.R b/R/setkey.R index b9b324ac4c..e9f18398ab 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -184,7 +184,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las forder = function(..., na.last=TRUE, decreasing=FALSE) { sub = substitute(list(...)) - tt = sapply(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) + tt = vapply_1b(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) if (any(tt)) sub[tt] = NULL # remove any NULL or empty arguments; e.g. test 1962.052: forder(DT, NULL) and forder(DT, ) if (length(sub)<2L) return(NULL) # forder() with no arguments returns NULL consistent with base::order asc = rep.int(1L, length(sub)-1L) # ascending (1) or descending (-1) per column diff --git a/R/utils.R b/R/utils.R index ecffb64226..75a45b8991 100644 --- a/R/utils.R +++ b/R/utils.R @@ -45,7 +45,7 @@ which.last = function(x) require_bit64_if_needed = function(DT) { # called in fread and print.data.table - if (!isNamespaceLoaded("bit64") && any(sapply(DT,inherits,"integer64"))) { + if (!isNamespaceLoaded("bit64") && any(vapply_1b(DT, inherits, "integer64"))) { # nocov start # a test was attempted to cover the requireNamespace() by using unloadNamespace() first, but that fails when nanotime is loaded because nanotime also uses bit64 if (!requireNamespace("bit64",quietly=TRUE)) { @@ -84,7 +84,7 @@ name_dots = function(...) 
{ } notnamed = vnames=="" if (any(notnamed)) { - syms = sapply(dot_sub, is.symbol) # save the deparse() in most cases of plain symbol + syms = vapply_1b(dot_sub, is.symbol) # save the deparse() in most cases of plain symbol for (i in which(notnamed)) { tmp = if (syms[i]) as.character(dot_sub[[i]]) else deparse(dot_sub[[i]])[1L] if (tmp == make.names(tmp)) vnames[i]=tmp From 78264b3b182a157c1a030269919f0133026405f2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 11 Jun 2021 16:08:38 -0700 Subject: [PATCH 280/588] catch some partial matching issues in examples (#4506) --- .dev/CRAN_Release.cmd | 5 ++++ man/address.Rd | 4 +++ man/dcast.data.table.Rd | 36 +++++++++++------------ man/deprecated.Rd | 3 ++ man/melt.data.table.Rd | 64 ++++++++++++++++++++++------------------- man/openmp-utils.Rd | 3 ++ man/shouldPrint.Rd | 4 ++- man/test.data.table.Rd | 5 ++++ 8 files changed, 76 insertions(+), 48 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index f5ffdae7d7..3c8a2ee0ae 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -254,6 +254,11 @@ require(data.table) test.data.table(script="other.Rraw") test.data.table(script="*.Rraw") test.data.table(verbose=TRUE) # since main.R no longer tests verbose mode + +# check example() works on every exported function, with these sticter options too, and also that all help pages have examples +options(warn=2, warnPartialMatchArgs=TRUE, warnPartialMatchAttr=TRUE, warnPartialMatchDollar=TRUE) +invisible(lapply(objects(pos="package:data.table"), example, character.only=TRUE, echo=FALSE, ask=FALSE)) + gctorture2(step=50) system.time(test.data.table(script="*.Rraw")) # apx 8h = froll 3h + nafill 1m + main 5h diff --git a/man/address.Rd b/man/address.Rd index 258c0241f2..2c390a3a61 100644 --- a/man/address.Rd +++ b/man/address.Rd @@ -19,5 +19,9 @@ Sometimes useful in determining whether a value has been copied or not, programm \references{ \url{https://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) } +\examples{ +x=1 +address(x) +} \keyword{ data } diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index daf9fba655..2aa265a96c 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -61,16 +61,16 @@ Historical note: \code{dcast.data.table} was originally designed as an enhanceme \examples{ ChickWeight = as.data.table(ChickWeight) setnames(ChickWeight, tolower(names(ChickWeight))) -DT <- melt(as.data.table(ChickWeight), id=2:4) # calls melt.data.table +DT <- melt(as.data.table(ChickWeight), id.vars=2:4) # calls melt.data.table # dcast is an S3 method in data.table from v1.9.6 -dcast(DT, time ~ variable, fun=mean) # using partial matching of argument -dcast(DT, diet ~ variable, fun=mean) +dcast(DT, time ~ variable, fun.aggregate=mean) +dcast(DT, diet ~ variable, fun.aggregate=mean) dcast(DT, diet+chick ~ time, drop=FALSE) dcast(DT, diet+chick ~ time, drop=FALSE, fill=0) # using subset -dcast(DT, chick ~ time, fun=mean, subset=.(time < 10 & chick < 20)) +dcast(DT, chick ~ time, fun.aggregate=mean, subset=.(time < 10 & chick < 20)) # drop argument, #1512 DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), @@ -78,37 +78,37 @@ DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), v3 = factor(c(2L, 3L, 5L, 1L, 2L, 6L), levels=1:6), v4 = c(3L, 2L, 2L, 5L, 4L, 3L)) # drop=TRUE -dcast(DT, v1 + v2 ~ v3) # default is drop=TRUE -dcast(DT, v1 + v2 ~ v3, drop=FALSE) # all missing combinations of both LHS and RHS -dcast(DT, v1 + v2 ~ v3, drop=c(FALSE, 
TRUE)) # all missing combinations of only LHS -dcast(DT, v1 + v2 ~ v3, drop=c(TRUE, FALSE)) # all missing combinations of only RHS +dcast(DT, v1+v2~v3, value.var='v4') # default is drop=TRUE +dcast(DT, v1+v2~v3, value.var='v4', drop=FALSE) # all missing combinations of LHS and RHS +dcast(DT, v1+v2~v3, value.var='v4', drop=c(FALSE, TRUE)) # all missing combinations of LHS only +dcast(DT, v1+v2~v3, value.var='v4', drop=c(TRUE, FALSE)) # all missing combinations of RHS only # using . and ... DT <- data.table(v1 = rep(1:2, each = 6), v2 = rep(rep(1:3, 2), each = 2), v3 = rep(1:2, 6), v4 = rnorm(6)) -dcast(DT, \dots ~ v3, value.var = "v4") #same as v1 + v2 ~ v3, value.var = "v4" -dcast(DT, v1 + v2 + v3 ~ ., value.var = "v4") +dcast(DT, \dots ~ v3, value.var="v4") # same as v1+v2 ~ v3, value.var="v4" +dcast(DT, v1+v2+v3 ~ ., value.var="v4") ## for each combination of (v1, v2), add up all values of v4 -dcast(DT, v1 + v2 ~ ., value.var = "v4", fun.aggregate = sum) +dcast(DT, v1+v2 ~ ., value.var="v4", fun.aggregate=sum) # fill and types -dcast(DT, v2 ~ v3, value.var = 'v1', fill = 0L) # 0L --> 0 -dcast(DT, v2 ~ v3, value.var = 'v4', fill = 1.1) # 1.1 --> 1L +dcast(DT, v2~v3, value.var='v1', fun.aggregate=length, fill=0L) # 0L --> 0 +dcast(DT, v2~v3, value.var='v4', fun.aggregate=length, fill=1.1) # 1.1 --> 1L # multiple value.var and multiple fun.aggregate DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), - z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L) + z=sample(letters[1:2], 20,TRUE), d1=runif(20), d2=1L) # multiple value.var -dcast(DT, x + y ~ z, fun=sum, value.var=c("d1","d2")) +dcast(DT, x+y ~ z, fun.aggregate=sum, value.var=c("d1","d2")) # multiple fun.aggregate -dcast(DT, x + y ~ z, fun=list(sum, mean), value.var="d1") +dcast(DT, x+y ~ z, fun.aggregate=list(sum, mean), value.var="d1") # multiple fun.agg and value.var (all combinations) -dcast(DT, x + y ~ z, fun=list(sum, mean), value.var=c("d1", "d2")) +dcast(DT, x+y ~ z, fun.aggregate=list(sum, mean), value.var=c("d1", "d2")) # multiple fun.agg and value.var (one-to-one) -dcast(DT, x + y ~ z, fun=list(sum, mean), value.var=list("d1", "d2")) +dcast(DT, x+y ~ z, fun.aggregate=list(sum, mean), value.var=list("d1", "d2")) } \seealso{ \code{\link{melt.data.table}}, \code{\link{rowid}}, \url{https://cran.r-project.org/package=reshape} diff --git a/man/deprecated.Rd b/man/deprecated.Rd index c1bb9afc16..da138d8734 100644 --- a/man/deprecated.Rd +++ b/man/deprecated.Rd @@ -8,6 +8,9 @@ \usage{ key(x) <- value # warning since 2012; DEPRECATED since Mar 2019 } +\examples{ +# dummy example section to pass release check that all .Rd files have examples +} \arguments{ \item{x}{ Deprecated. } } diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 3794231f99..ddca733fe8 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -91,62 +91,68 @@ An unkeyed \code{data.table} containing the molten data. 
set.seed(45) require(data.table) DT <- data.table( - i_1 = c(1:5, NA), - i_2 = c(NA,6,7,8,9,10), - f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), - f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), - c_1 = sample(c(letters[1:3], NA), 6, TRUE), - d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), - d_2 = as.Date(6:1, origin="2012-01-01")) + i_1 = c(1:5, NA), + n_1 = c(NA, 6, 7, 8, 9, 10), + f_1 = factor(sample(c(letters[1:3], NA), 6L, TRUE)), + f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), + c_1 = sample(c(letters[1:3], NA), 6L, TRUE), + c_2 = sample(c(LETTERS[1:2], NA), 6L, TRUE), + d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), + d_2 = as.Date(6:1, origin="2012-01-01") +) # add a couple of list cols -DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] -DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c] +DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5, 1L)))), by = i_1]$c] +DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5, 1L)))), by = i_1]$c] -# id, measure as character/integer/numeric vectors -melt(DT, id=1:2, measure="f_1") -melt(DT, id=c("i_1", "i_2"), measure=3) # same as above -melt(DT, id=1:2, measure=3L, value.factor=TRUE) # same, but 'value' is factor -melt(DT, id=1:2, measure=3:4, value.factor=TRUE) # 'value' is *ordered* factor +# id.vars, measure.vars as character/integer/numeric vectors +melt(DT, id.vars=1:2, measure.vars="f_1") +melt(DT, id.vars=c("i_1", "n_1"), measure.vars=3) # same as above +melt(DT, id.vars=1:2, measure.vars=3L, value.factor=TRUE) # same, but 'value' is factor +melt(DT, id.vars=1:2, measure.vars=3:4, value.factor=TRUE) # 'value' is *ordered* factor # preserves attribute when types are identical, ex: Date -melt(DT, id=3:4, measure=c("d_1", "d_2")) -melt(DT, id=3:4, measure=c("i_1", "d_1")) # attribute not preserved +melt(DT, id.vars=3:4, measure.vars=c("d_1", "d_2")) +melt(DT, id.vars=3:4, measure.vars=c("n_1", "d_1")) # attribute not preserved # on list -melt(DT, id=1, measure=c("l_1", "l_2")) # value is a list -melt(DT, id=1, measure=c("c_1", "l_1")) # c1 coerced to list +melt(DT, id.vars=1, measure.vars=c("l_1", "l_2")) # value is a list +suppressWarnings( + melt(DT, id.vars=1, measure.vars=c("c_1", "l_1")) # c1 coerced to list, with warning +) # on character -melt(DT, id=1, measure=c("c_1", "f_1")) # value is char -melt(DT, id=1, measure=c("c_1", "i_2")) # i2 coerced to char +melt(DT, id.vars=1, measure.vars=c("c_1", "f_1")) # value is char +suppressWarnings( + melt(DT, id.vars=1, measure.vars=c("c_1", "n_1")) # n_1 coerced to char, with warning +) # on na.rm=TRUE. 
NAs are removed efficiently, from within C -melt(DT, id=1, measure=c("c_1", "i_2"), na.rm=TRUE) # remove NA +melt(DT, id.vars=1, measure.vars=c("c_1", "c_2"), na.rm=TRUE) # remove NA # measure.vars can be also a list # melt "f_1,f_2" and "d_1,d_2" simultaneously, retain 'factor' attribute # convenient way using internal function patterns() -melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns("^f_", "^d_"), value.factor=TRUE) # same as above, but provide list of columns directly by column names or indices -melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE) +melt(DT, id.vars=1:2, measure.vars=list(3:4, c("d_1", "d_2")), value.factor=TRUE) # same as above, but provide names directly: -melt(DT, id=1:2, measure=patterns(f="^f_", d="^d_"), value.factor=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns(f="^f_", d="^d_"), value.factor=TRUE) # na.rm=TRUE removes rows with NAs in any 'value' columns -melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column -melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns("l_", "c_"), na.rm=TRUE) # measure list with missing/short entries results in output with runs of NA DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] -melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) +melt(DT.missing.cols, measure.vars=list(d=1:2, c="c_1", f=c(NA, "f_2"))) # specifying columns to melt via separator. -melt(DT.missing.cols, measure=measure(value.name, number=as.integer, sep="_")) +melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, sep="_")) # specifying columns to melt via regex. -melt(DT.missing.cols, measure=measure(value.name, number=as.integer, pattern="(.)_(.)")) +melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="(.)_(.)")) } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index f3f616a6e4..71e469ed72 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -51,4 +51,7 @@ \item{\file{types.c} - Internal testing usage} } } +\examples{ + getDTthreads(verbose=TRUE) +} \keyword{ data } diff --git a/man/shouldPrint.Rd b/man/shouldPrint.Rd index 80851f53d8..b3e1bcdc9b 100644 --- a/man/shouldPrint.Rd +++ b/man/shouldPrint.Rd @@ -21,5 +21,7 @@ \url{https://github.com/IRkernel/IRkernel/issues/127}\cr \url{https://github.com/Rdatatable/data.table/issues/933}\cr } - +\examples{ +# dummy example section to pass release check that all .Rd files have examples +} diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index e84ae4797d..ba0fe25f9c 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -25,4 +25,9 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", If all tests were successful, \code{TRUE} is returned. Otherwise, see the \code{silent} argument above. \code{silent=TRUE} is intended for use at the start of production scripts; e.g. \code{stopifnot(test.data.table(silent=TRUE))} to check \code{data.table} is passing its own tests before proceeding. 
} \seealso{ \code{\link{data.table}}, \code{\link{test}} } +\examples{ + \dontrun{ + test.data.table() + } +} \keyword{ data } From a58f72b7f2fe868328496dd16c03947cf1245540 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 12 Jun 2021 01:53:30 +0200 Subject: [PATCH 281/588] tests for fread on 1 row datetime input (#4475) --- inst/tests/tests.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5460aec8cc..41f8978b17 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17228,6 +17228,10 @@ if (TZnotUTC) { test(2150.20, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), ans, output=ans_print) } +# fread single row single column datetime field, #2609 +test(2150.21, fread("c1\n2018-01-31 03:16:57"), data.table(V1=as.IDate("2018-01-31"), c1="03:16:57"), + warning="Detected 1 column names but the data has 2 columns") +test(2150.22, fread("c1\n2018-01-31 03:16:57", sep=""), data.table(c1=as.POSIXct("2018-01-31 03:16:57", tz="UTC"))) options(old) # 1 is treated as . in dcast formula, #4615 @@ -17721,3 +17725,4 @@ if (test_bit64) { Y = data.table(x=1:2, y=as.integer64(c(10,20))) test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } + From b074df12e8ff28d5c8b243b8e0ae5f0171d3e76f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 11 Jun 2021 22:26:20 -0700 Subject: [PATCH 282/588] convenience macros for fread (#4462) --- src/fread.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/fread.c b/src/fread.c index 33c707c4f1..da3e0e18ea 100644 --- a/src/fread.c +++ b/src/fread.c @@ -103,7 +103,8 @@ static void Field(FieldParseContext *ctx); #define ASSERT(cond, msg, ...) 
\ if (!(cond)) STOP(_("Internal error in line %d of fread.c, please report on data.table GitHub: " msg), __LINE__, __VA_ARGS__) // # nocov - +#define AS_DIGIT(x) (uint_fast8_t)(x - '0') +#define IS_DIGIT(x) AS_DIGIT(x) < 10 //================================================================================================= // @@ -575,7 +576,7 @@ static void str_to_i32_core(const char **pch, int32_t *target) { const char *ch = *pch; - if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return; + if (*ch=='0' && args.keepLeadingZeros && IS_DIGIT(ch[1])) return; bool neg = *ch=='-'; ch += (neg || *ch=='+'); const char *start = ch; // to know if at least one digit is present @@ -590,7 +591,7 @@ static void str_to_i32_core(const char **pch, int32_t *target) // number significant figures = digits from the first non-zero onwards including trailing zeros while (*ch=='0') ch++; uint_fast32_t sf = 0; - while ( (digit=(uint_fast8_t)(ch[sf]-'0'))<10 ) { + while ( (digit=AS_DIGIT(ch[sf]))<10 ) { acc = 10*acc + digit; sf++; } @@ -619,7 +620,7 @@ static void StrtoI64(FieldParseContext *ctx) { const char *ch = *(ctx->ch); int64_t *target = (int64_t*) ctx->targets[sizeof(int64_t)]; - if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return; + if (*ch=='0' && args.keepLeadingZeros && IS_DIGIT(ch[1])) return; bool neg = *ch=='-'; ch += (neg || *ch=='+'); const char *start = ch; @@ -627,7 +628,7 @@ static void StrtoI64(FieldParseContext *ctx) uint_fast64_t acc = 0; // important unsigned not signed here; we now need the full unsigned range uint_fast8_t digit; uint_fast32_t sf = 0; - while ( (digit=(uint_fast8_t)(ch[sf]-'0'))<10 ) { + while ( (digit=AS_DIGIT(ch[sf]))<10 ) { acc = 10*acc + digit; sf++; } @@ -677,7 +678,7 @@ static void parse_double_regular_core(const char **pch, double *target) #define FLOAT_MAX_DIGITS 18 const char *ch = *pch; - if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return; + if (*ch=='0' && args.keepLeadingZeros && IS_DIGIT(ch[1])) return; bool neg, Eneg; ch += (neg = *ch=='-') + (*ch=='+'); @@ -691,7 +692,7 @@ static void parse_double_regular_core(const char **pch, double *target) // Read the first, integer part of the floating number (but no more than // FLOAT_MAX_DIGITS digits). int_fast32_t sflimit = FLOAT_MAX_DIGITS; - while ((digit=(uint_fast8_t)(*ch-'0'))<10 && sflimit) { + while ((digit=AS_DIGIT(*ch))<10 && sflimit) { acc = 10*acc + digit; sflimit--; ch++; @@ -701,8 +702,8 @@ static void parse_double_regular_core(const char **pch, double *target) // we will read and discard those extra digits, but only if they are followed // by a decimal point (otherwise it's a just big integer, which should be // treated as a string instead of losing precision). 
- if (sflimit==0 && (uint_fast8_t)(*ch-'0')<10) { - while ((uint_fast8_t)(*ch-'0')<10) { + if (sflimit==0 && IS_DIGIT(*ch)) { + while (IS_DIGIT(*ch)) { ch++; e++; } @@ -725,7 +726,7 @@ static void parse_double_regular_core(const char **pch, double *target) // Now read the significant digits in the fractional part of the number int_fast32_t k = 0; - while ((digit=(uint_fast8_t)(ch[k]-'0'))<10 && sflimit) { + while ((digit=AS_DIGIT(ch[k]))<10 && sflimit) { acc = 10*acc + digit; k++; sflimit--; @@ -735,7 +736,7 @@ static void parse_double_regular_core(const char **pch, double *target) // If more digits are present, skip them if (sflimit==0) { - while ((uint_fast8_t)(*ch-'0')<10) ch++; + while (IS_DIGIT(*ch)) ch++; } // Check that at least 1 digit was present in either the integer or // fractional part ("+1" here accounts for the decimal point char). @@ -752,13 +753,13 @@ static void parse_double_regular_core(const char **pch, double *target) if (ch==start) goto fail; // something valid must be between [+|-] and E, character E alone is invalid. ch += 1/*E*/ + (Eneg = ch[1]=='-') + (ch[1]=='+'); int_fast32_t E = 0; - if ((digit=(uint_fast8_t)(*ch-'0'))<10) { + if ((digit=AS_DIGIT(*ch))<10) { E = digit; ch++; - if ((digit=(uint_fast8_t)(*ch-'0'))<10) { + if ((digit=AS_DIGIT(*ch))<10) { E = E*10 + digit; ch++; - if ((digit=(uint_fast8_t)(*ch-'0'))<10) { + if ((digit=AS_DIGIT(*ch))<10) { E = E*10 + digit; ch++; } @@ -825,11 +826,11 @@ static void parse_double_extended(FieldParseContext *ctx) } if (ch[0]=='N' && (ch[1]=='A' || ch[1]=='a') && ch[2]=='N' && (ch += 3)) { if (ch[-2]=='a' && (*ch=='%' || *ch=='Q' || *ch=='S')) ch++; - while ((uint_fast8_t)(*ch-'0') < 10) ch++; + while (IS_DIGIT(*ch)) ch++; goto return_nan; } if ((ch[0]=='q' || ch[0]=='s') && ch[1]=='N' && ch[2]=='a' && ch[3]=='N' && (ch += 4)) { - while ((uint_fast8_t)(*ch-'0') < 10) ch++; + while (IS_DIGIT(*ch)) ch++; goto return_nan; } if (ch[0]=='1' && ch[1]=='.' && ch[2]=='#') { @@ -915,7 +916,7 @@ static void parse_double_hexadecimal(FieldParseContext *ctx) acc <<= (13 - ndigits) * 4; ch += 1 + (Eneg = ch[1]=='-') + (ch[1]=='+'); uint64_t E = 0; - while ((digit = (uint8_t)(*ch-'0')) < 10) { + while ((digit = AS_DIGIT(*ch)) < 10) { E = 10*E + digit; ch++; } @@ -1079,7 +1080,7 @@ static void parse_bool_numeric(FieldParseContext *ctx) { const char *ch = *(ctx->ch); int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)]; - uint8_t d = (uint8_t)(*ch - '0'); // '0'=>0, '1'=>1, everything else > 1 + uint_fast8_t d = AS_DIGIT(*ch); // '0'=>0, '1'=>1, everything else > 1 if (d <= 1) { *target = (int8_t) d; *(ctx->ch) = ch + 1; From 5421e7ae706af2d5366a9ffbee798dac5e994f54 Mon Sep 17 00:00:00 2001 From: Xu Ren Date: Sat, 12 Jun 2021 03:00:54 -0400 Subject: [PATCH 283/588] minor things in vignette (#4454) --- vignettes/datatable-keys-fast-subset.Rmd | 6 +++--- vignettes/datatable-reference-semantics.Rmd | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index 541e8bb5ba..917a904136 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -242,7 +242,7 @@ flights[.(unique(origin), "MIA")] * *"MIA"* is automatically recycled to fit the length of `unique(origin)` which is *3*. -## 2) Combining keys with `j` and `by` +## 2. Combining keys with `j` and `by` All we have seen so far is the same concept -- obtaining *row indices* in `i`, but just using a different method -- using `keys`. 
It shouldn't be surprising that we can do exactly the same things in `j` and `by` as seen from the previous vignettes. We will highlight this with a few examples. @@ -340,7 +340,7 @@ key(ans) * We use `keyby` to automatically key that result by *month*. Now we understand what that means. In addition to ordering, it also sets *month* as the `key` column. -## 3) Additional arguments - `mult` and `nomatch` +## 3. Additional arguments - `mult` and `nomatch` ### a) The *mult* argument @@ -376,7 +376,7 @@ flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", nomatch = NULL] * The query “JFK”, “XNA” doesn’t match any rows in flights and therefore is skipped. -## 4) binary search vs vector scans +## 4. binary search vs vector scans We have seen so far how we can set and use keys to subset. But what's the advantage? For example, instead of doing: diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 792bbf3b4b..2f3457056c 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -266,7 +266,7 @@ flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL] head(flights) ``` -## 3) `:=` and `copy()` +## 3. `:=` and `copy()` `:=` modifies the input object by reference. Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. And at other times it may not be desirable to modify the original object, in which case we can use `copy()` function, as we will see in a moment. From 24191e7a16cbe1df4e1fcae0517226e2421a9f21 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 14 Jun 2021 13:37:07 -0700 Subject: [PATCH 284/588] expand & improve error message for grouping by unsupported types (#4309) --- NEWS.md | 2 ++ R/data.table.R | 4 +++- inst/tests/tests.Rraw | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4e108fae47..8c3693dbc4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -155,6 +155,8 @@ 6. `DT[subset]` where `DT[(subset)]` or `DT[subset==TRUE]` was intended; i.e., subsetting by a logical column whose name conflicts with an existing function, now gives a friendlier error message, [#5014](https://github.com/Rdatatable/data.table/issues/5014). Thanks @michaelchirico for the suggestion and PR, and @ColeMiller1 for helping with the fix. +7. Grouping by a `list` column has its error message improved stating this is unsupported, [#4308](https://github.com/Rdatatable/data.table/issues/4308). Thanks @sindribaldur for filing, and @michaelchirico for the PR. Please add your vote and especially use cases to the [#1597](https://github.com/Rdatatable/data.table/issues/1597) feature request. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/data.table.R b/R/data.table.R index 85f1d9edb3..392599da71 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -881,7 +881,9 @@ replace_dot_alias = function(e) { if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)") if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)() if (!bynull) for (jj in seq_len(length(byval))) { - if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. 
Usage: DT[,sum(colC),by=list(colA,month(colB))]") + if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) { + stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type)) + } } tt = vapply_1i(byval,length) if (any(tt!=xnrow)) stop(domain=NA, gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 41f8978b17..5b21448fa6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14086,7 +14086,9 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE], ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...) test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3)) test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors') -test(1984.08, DT[, sum(a), by=as.raw(0)], error='column or expression.*type raw') +test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*type 'raw'.*not.*supported") +test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 + error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting') options('datatable.optimize' = Inf) test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], From 2791043ff29efcfe776896006d0b2dbf013bd7e5 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 14 Jun 2021 23:41:59 +0200 Subject: [PATCH 285/588] improve copy docs (#4590) --- man/address.Rd | 5 ++++- man/copy.Rd | 10 +++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/man/address.Rd b/man/address.Rd index 2c390a3a61..8363d3c7ba 100644 --- a/man/address.Rd +++ b/man/address.Rd @@ -16,8 +16,11 @@ Sometimes useful in determining whether a value has been copied or not, programm \value{ A character vector length 1. } +\seealso{ + \code{\link{copy}} +} \references{ -\url{https://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) + \url{https://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) } \examples{ x=1 diff --git a/man/copy.Rd b/man/copy.Rd index 819fa2a509..587f216805 100644 --- a/man/copy.Rd +++ b/man/copy.Rd @@ -16,11 +16,15 @@ copy(x) \code{data.table} provides functions that operate on objects \emph{by reference} and minimise full object copies as much as possible. Still, it might be necessary in some situations to work on an object's copy which can be done using \code{DT.copy <- copy(DT)}. It may also be sometimes useful before \code{:=} (or \code{set}) is used to subassign to a column by reference. A \code{copy()} may be required when doing \code{dt_names = names(DT)}. Due to R's \emph{copy-on-modify}, \code{dt_names} still points to the same location in memory as \code{names(DT)}. Therefore modifying \code{DT} \emph{by reference} now, say by adding a new column, \code{dt_names} will also get updated. 
To avoid this, one has to \emph{explicitly} copy: \code{dt_names <- copy(names(DT))}. - } +} +\note{ + To confirm precisely whether an object is a copy of another, compare their exact memory address with \code{\link{address}}. +} \value{ - Returns a copy of the object. + Returns a copy of the object. } -\seealso{ \code{\link{data.table}}, \code{\link{setkey}}, \code{\link{setDT}}, \code{\link{setDF}}, \code{\link{set}} \code{\link{:=}}, \code{\link{setorder}}, \code{\link{setattr}}, \code{\link{setnames}} +\seealso{ + \code{\link{data.table}}, \code{\link{address}}, \code{\link{setkey}}, \code{\link{setDT}}, \code{\link{setDF}}, \code{\link{set}} \code{\link{:=}}, \code{\link{setorder}}, \code{\link{setattr}}, \code{\link{setnames}} } \examples{ # Type 'example(copy)' to run these at prompt and browse output From 80365ff18106fb9a45f940e75c60a65f4cca931b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 16 Jun 2021 16:26:56 -0700 Subject: [PATCH 286/588] replace substring globally with substr (#4447) --- .dev/CRAN_Release.cmd | 5 ++++- .dev/revdep.R | 2 +- R/data.table.R | 29 +++++++++++++++-------------- R/fread.R | 23 ++++++++++------------- R/test.data.table.R | 4 ++-- R/utils.R | 7 +++++++ inst/tests/benchmark.Rraw | 4 ++-- inst/tests/tests.Rraw | 18 +++++++++++++++--- 8 files changed, 56 insertions(+), 36 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 3c8a2ee0ae..1dfec0a02a 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -154,7 +154,10 @@ grep -n "[^A-Za-z0-9]F[^A-Za-z0-9]" ./inst/tests/tests.Rraw grep -Enr "^[^#]*(?:\[|==|>|<|>=|<=|,|\(|\+)\s*[-]?[0-9]+[^0-9L:.e]" R | grep -Ev "stop|warning|tolerance" # Never use ifelse. fifelse for vectors when necessary (nothing yet) - grep -Enr "\bifelse" R +grep -Enr "\bifelse" R + +# use substr() instead of substring(), #4447 +grep -Fnr "substring" R # No system.time in main tests.Rraw. Timings should be in benchmark.Rraw grep -Fn "system.time" ./inst/tests/*.Rraw | grep -Fv "benchmark.Rraw" | grep -Fv "this system.time usage ok" diff --git a/.dev/revdep.R b/.dev/revdep.R index c172eb163f..38c5a93a66 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -157,7 +157,7 @@ status0 = function(bioc=FALSE) { if (file.exists(fn)) { v = suppressWarnings(system(paste0("grep 'Status:' ",fn), intern=TRUE)) if (!length(v)) return("RUNNING") - return(substring(v,9)) + return(substr(v, 9L, nchar(v))) } if (file.exists(paste0("./",x,".Rcheck"))) return("RUNNING") return("NOT STARTED") diff --git a/R/data.table.R b/R/data.table.R index 392599da71..79b8e6483d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -249,7 +249,7 @@ replace_dot_alias = function(e) { root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" if (root == ":" || (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || - ( (!length(av<-all.vars(jsub)) || all(substring(av,1L,2L)=="..")) && + ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) && root %chin% c("","c","paste","paste0","-","!") && missingby )) { # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed) # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find. @@ -266,8 +266,8 @@ replace_dot_alias = function(e) { with=FALSE if (length(av)) { for (..name in av) { - name = substring(..name, 3L) - if (name=="") stop("The symbol .. is invalid. The .. 
prefix must be followed by at least one character.") + name = substr(..name, 3L, nchar(..name)) + if (!nzchar(name)) stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") if (!exists(name, where=parent.frame())) { stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. prefix.", if (exists(..name, where=parent.frame())) @@ -283,7 +283,7 @@ replace_dot_alias = function(e) { ..syms = av } } else if (is.name(jsub)) { - if (substring(jsub, 1L, 2L) == "..") stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov + if (startsWith(as.character(jsub), "..")) stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov if (!with && !exists(as.character(jsub), where=parent.frame())) stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.") } @@ -709,7 +709,7 @@ replace_dot_alias = function(e) { j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) # else j will be evaluated for the first time on next line } else { names(..syms) = ..syms - j = eval(jsub, lapply(substring(..syms,3L), get, pos=parent.frame()), parent.frame()) + j = eval(jsub, lapply(substr(..syms, 3L, nchar(..syms)), get, pos=parent.frame()), parent.frame()) } if (is.logical(j)) j <- which(j) if (!length(j) && !notj) return( null.data.table() ) @@ -815,7 +815,7 @@ replace_dot_alias = function(e) { # TODO: could be allowed if length(irows)>1 but then the index would need to be squashed for use by uniqlist, #3062 # find if allbyvars is leading subset of any of the indices; add a trailing "__" to fix #3498 where a longer column name starts with a shorter column name tt = paste0(c(allbyvars,""), collapse="__") - w = which.first(substring(paste0(indices(x),"__"),1L,nchar(tt)) == tt) + w = which.first(startsWith(paste0(indices(x), "__"), tt)) if (!is.na(w)) { byindex = indices(x)[w] if (!length(getindex(x, byindex))) { @@ -921,8 +921,8 @@ replace_dot_alias = function(e) { jvnames = NULL drop_dot = function(x) { if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items") # nocov - if (identical(substring(x<-as.character(x), 1L, 1L), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY")) - substring(x, 2L) + if (startsWith(x<-as.character(x), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY")) + substr(x, 2L, nchar(x)) else x } @@ -1242,8 +1242,8 @@ replace_dot_alias = function(e) { } syms = all.vars(jsub) - syms = syms[ substring(syms,1L,2L)==".." ] - syms = syms[ substring(syms,3L,3L)!="." ] # exclude ellipsis + syms = syms[ startsWith(syms, "..") ] + syms = syms[ substr(syms, 3L, 3L) != "." ] # exclude ellipsis for (sym in syms) { if (sym %chin% names_x) { # if "..x" exists as column name, use column, for backwards compatibility; e.g. package socialmixr in rev dep checks #2779 @@ -1251,7 +1251,7 @@ replace_dot_alias = function(e) { # TODO in future, as warned in NEWS item for v1.11.0 : # warning(sym," in j is looking for ",getName," in calling scope, but a column '", sym, "' exists. Column names should not start with ..") } - getName = substring(sym, 3L) + getName = substr(sym, 3L, nchar(sym)) if (!exists(getName, parent.frame())) { if (exists(sym, parent.frame())) next # user did 'manual' prefix; i.e. variable in calling scope has .. prefix stop("Variable '",getName,"' is not found in calling scope. 
Looking in calling scope because this symbol was prefixed with .. in the j= parameter.") @@ -1731,8 +1731,9 @@ replace_dot_alias = function(e) { # is.symbol() is for #1369, #1974 and #2949 if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 - if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE) - # ... head-tail uses default value n=6 which as of now should not go gforce ^^ + if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE) + # ^^ base::startWith errors on NULL unfortunately + # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: # 1) head/tail(x, 1) or 2) x[n], n>0 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && @@ -1907,7 +1908,7 @@ replace_dot_alias = function(e) { if (length(expr)==2L) # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE return(call(".External",quote(Cfastmean),expr[[2L]], FALSE)) # return(call(".Internal",expr)) # slightly faster than .External, but R now blocks .Internal in coerce.c from apx Sep 2012 - if (length(expr)==3L && identical("na",substring(names(expr)[3L], 1L, 2L))) # one parameter passed to mean() + if (length(expr)==3L && startsWith(names(expr)[3L], "na")) # one parameter passed to mean() return(call(".External",quote(Cfastmean),expr[[2L]], expr[[3L]])) # faster than .Call assign("nomeanopt",TRUE,parent.frame()) expr # e.g. trim is not optimized, just na.rm diff --git a/R/fread.R b/R/fread.R index 236a30bb76..eb765fe639 100644 --- a/R/fread.R +++ b/R/fread.R @@ -55,13 +55,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (input=="" || length(grep('\\n|\\r', input))) { # input is data itself containing at least one \n or \r } else { - if (substring(input,1L,1L)==" ") { + if (startsWith(input, " ")) { stop("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") } - str6 = substring(input,1L,6L) # avoid grepl() for #2531 - str7 = substring(input,1L,7L) - str8 = substring(input,1L,8L) - if (str7=="ftps://" || str8=="https://") { + str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 + if (str7=="ftps://" || startsWith(input, "https://")) { # nocov start if (!requireNamespace("curl", quietly = TRUE)) stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov @@ -71,7 +69,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) # nocov end } - else if (str6=="ftp://" || str7== "http://" || str7=="file://") { + else if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { # nocov start method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") # force "auto" when file:// to ensure we don't use an invalid option (e.g. 
wget), #1668 @@ -107,12 +105,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (data.table) 'data.table' else 'data.frame', ".") return(if (data.table) data.table(NULL) else data.frame(NULL)) } - ext2 = substring(file, nchar(file)-2L, nchar(file)) # last 3 characters ".gz" - ext3 = substring(file, nchar(file)-3L, nchar(file)) # last 4 characters ".bz2" - if (ext2==".gz" || ext3==".bz2") { + if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { if (!requireNamespace("R.utils", quietly = TRUE)) stop("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (ext2==".gz") gzfile else bzfile + FUN = if (is_gz) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) @@ -174,9 +170,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") yaml_border_re = '^#?---' if (!grepl(yaml_border_re, first_line)) { close(f) - stop('Encountered <', substring(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...', '> at the first ', - 'unskipped line (', 1L+skip, '), which does not constitute the start to a valid YAML header ', - '(expecting something matching regex "', yaml_border_re, '"); please check your input and try again.') + stop(gettextf( + 'Encountered <%s%s> at the first unskipped line (%d), which does not constitute the start to a valid YAML header (expecting something matching regex "%s"); please check your input and try again.', + substr(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...' 
else '', 1L+skip, yaml_border_re + )) } yaml_comment_re = '^#' diff --git a/R/test.data.table.R b/R/test.data.table.R index da12144f66..cf778c68b6 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -215,10 +215,10 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F compactprint = function(DT, topn=2L) { tt = vapply_1c(DT,function(x)class(x)[1L]) tt[tt=="integer64"] = "i64" - tt = substring(tt, 1L, 3L) + tt = substr(tt, 1L, 3L) makeString = function(x) paste(x, collapse = ",") # essentially toString.default cn = paste0(" [Key=",makeString(key(DT)), - " Types=", makeString(substring(sapply(DT, typeof), 1L, 3L)), + " Types=", makeString(substr(sapply(DT, typeof), 1L, 3L)), " Classes=", makeString(tt), "]") if (nrow(DT)) { print(copy(DT)[,(cn):="",verbose=FALSE], topn=topn, class=FALSE) diff --git a/R/utils.R b/R/utils.R index 75a45b8991..7a698131c6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -25,6 +25,13 @@ if (base::getRversion() < "3.2.0") { # Apr 2015 isNamespaceLoaded = function(x) x %chin% loadedNamespaces() } +if (!exists('startsWith', 'package:base', inherits=FALSE)) { # R 3.3.0; Apr 2016 + startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub +} +if (!exists('endsWith', 'package:base', inherits=FALSE)) { + endsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} +} + # which.first which.first = function(x) { diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 1c8bf146a6..bf0bf77e9f 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -161,10 +161,10 @@ set.seed(1) L = lapply(1:1e6, sample, x=100, size=2) x = capture.output(fwrite(L)) test(1742.1, nchar(x), c(2919861L, 2919774L)) # tests 2 very long lines, too -test(1742.2, substring(x,1,10), c("27,58,21,9","38,91,90,6")) +test(1742.2, substr(x, 1L, 10L), c("27,58,21,9", "38,91,90,6")) test(1742.3, L[[1L]], c(27L,38L)) test(1742.4, L[[1000000L]], c(76L, 40L)) -test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40")) +test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40")) # Add scaled-up non-ASCII forder test 1896 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5b21448fa6..668b63ff8c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3470,7 +3470,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, set.seed(3) DT = data.table(a=5:1, b=runif(5)) ans = dcast(DT, a ~ b, value.var="b")[c(4,.N), c(2,6)] - setnames(ans, substring(names(ans),1,6)) + setnames(ans, substr(names(ans), 1L, 6L)) test(1102.06, ans, data.table("0.1680"=c(NA,DT[1,b]), "0.8075"=c(DT[2,b],NA))) # Fix for case 2 in bug report #71 - dcast didn't aggregate properly when formula RHS has "." 
@@ -7346,7 +7346,7 @@ test(1530.4, which.last(x), tail(which(x), 1L)) set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) -xsub = substring(x, 1L, 1L) +xsub = substr(x, 1L, 1L) test(1532.01, y %like% xsub[1L], grepl(xsub[1L], y)) test(1532.02, y %like% xsub[2L], grepl(xsub[2L], y)) test(1532.03, like(y, xsub[1L]), grepl(xsub[1L], y)) @@ -9564,7 +9564,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { runcmb = as.data.table(runcmb[, 1:min(100L, ncol(runcmb)), drop=FALSE]) # max 100 combinations to test runops = lapply(runcmb, function(cols) { thisops = sample(ops, k, TRUE) - thisops[substring(cols,1,1)=="c"] = "==" + thisops[startsWith(cols, "c")] = "==" thisops }) is_only_na <- function(x) is.na(x) & !is.nan(x) @@ -17728,3 +17728,15 @@ if (test_bit64) { test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } +# compatibility of endsWith backport with base::endsWith +if (exists('endsWith', 'package:base', inherits=FALSE)) { + DTendsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} + BSendsWith = base::endsWith + test(2194.1, DTendsWith('abcd', 'd'), BSendsWith('abcd', 'd')) + test(2194.2, DTendsWith(letters, 'e'), BSendsWith(letters, 'e')) + test(2194.3, DTendsWith(NA_character_, 'a'), BSendsWith(NA_character_, 'a')) + test(2194.4, DTendsWith(character(), 'a'), BSendsWith(character(), 'a')) + # file used in encoding tests + txt = readLines(testDir("issue_563_fread.txt")) + test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B')) +} From 80bb6b3e64218ca1969374deb05eca516e7a0f27 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 17 Jun 2021 00:29:43 -0700 Subject: [PATCH 287/588] uniqueN(by=character()) returns nrow, as does by=NULL (#4595) --- NEWS.md | 2 ++ R/duplicated.R | 16 ++++++++-------- inst/tests/tests.Rraw | 20 +++++++++++++++++--- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8c3693dbc4..0acc628a72 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,6 +133,8 @@ 19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. +20. `uniqueN(DT, by=character())` is now equivalent to `uniqueN(DT)` rather than internal error `'by' is either not integer or is length 0`, [#4594](https://github.com/Rdatatable/data.table/issues/4594). Thanks Marco Colombo for the report, and Michael Chirico for the PR. Similarly for `unique()`, `duplicated()` and `anyDuplicated()`. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. 
However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/duplicated.R b/R/duplicated.R index 1ae7e8a6e4..249a5470c5 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -1,14 +1,12 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { if (!cedta()) return(NextMethod("duplicated")) #nocov - if (!identical(incomparables, FALSE)) { + if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") } if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0L)) # fix for bug #28 if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE") + if (!length(by)) by = NULL #4594 query = .duplicated.helper(x, by) - # fix for bug #44 - unique on null data table returns error (because of 'forderv') - # however, in this case we can bypass having to go to forderv at all. - if (!length(query$by)) return(logical(0L)) if (query$use.keyprefix) { f = uniqlist(shallow(x, query$by)) @@ -27,10 +25,11 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { if (!cedta()) return(NextMethod("unique")) # nocov - if (!identical(incomparables, FALSE)) { + if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") } if (nrow(x) <= 1L) return(x) + if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't @@ -105,14 +104,15 @@ uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm)) x = as_list(x) } + if (!length(by)) by = NULL #4594 o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA) starts = attr(o, 'starts', exact=TRUE) - if (!na.rm) { - length(starts) - } else { + if (na.rm) { # TODO: internal efficient sum # fix for #1771, account for already sorted input sum( (if (length(o)) o[starts] else starts) != 0L) + } else { + length(starts) } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 668b63ff8c..426151ad9c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13406,8 +13406,7 @@ test(1962.004, duplicated(DT, by = -1L), error = 'specify non existing column*.*-1') test(1962.005, duplicated(DT, by = 'y'), error = 'specify non existing column*.*y') -test(1962.0061, duplicated(data.table(NULL)), logical(0L)) -test(1962.0062, duplicated(data.table(a = 1L), by = character()), logical()) +test(1962.006, duplicated(data.table(NULL)), logical(0L)) test(1962.007, unique(DT, incomparables = TRUE), error = 'not used (yet)') @@ -17700,12 +17699,16 @@ test(2190.2, DT[1:2, a:=structure(c(1L, 2L), att='t') ]$a, list(structu test(2190.3, DT[1:2, a:=structure(c(1, 2), att='t') ]$a, 
list(structure(1, att='t'), structure(2, att='t'), 4)) test(2190.4, DT[1:2, a:=structure(as.raw(c(1, 2)), att='t') ]$a, list(structure(as.raw(1), att='t'), structure(as.raw(2), att='t'), 4)) test(2190.5, DT[1:2, a:=structure(as.complex(c(1, 2)), att='t')]$a, list(structure(as.complex(1), att='t'), structure(as.complex(2), att='t'), 4)) -test(2190.6, DT[1:2, a:=structure(c(TRUE, FALSE), att='t') ]$a, list(structure(TRUE, att='t'), structure(FALSE, att='t'), 4)) +test(2190.61, DT[1:2, a:=structure(c(TRUE, FALSE), att='t') ]$a, list(structure(TRUE, att='t'), structure(FALSE, att='t'), 4)) +test(2190.62, attributes(TRUE), NULL) # ensure R's internal global TRUE/FALSE didn't receive attribute att='t'; discovered when merging #4595 +test(2190.63, attributes(FALSE), NULL) test(2190.7, DT[1:2, a:=structure(c('a', 'b'), att='t') ]$a, list(structure('a', att='t'), structure('b', att='t'), 4)) if (test_bit64) { test(2190.8, DT[1:2, a:=as.integer64(1:2) ]$a, list(as.integer64(1), as.integer64(2), 4)) } test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerced to 'list'") +test(2190.91, attributes(TRUE), NULL) # ensure R's internal global TRUE/FALSE didn't receive attribute att='t'; discovered when merging #4595 +test(2190.92, attributes(FALSE), NULL) # adding test for (since fixed) 'could not find function "."' when verbose=TRUE, #3196 DT = data.table(i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L), v = 0) @@ -17740,3 +17743,14 @@ if (exists('endsWith', 'package:base', inherits=FALSE)) { txt = readLines(testDir("issue_563_fread.txt")) test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B')) } + +# uniqueN(x, by=character()) was internal error, #4594 +DT = data.table(idx=c(1L,2L,1L,3L), value="val") +test(2195.1, uniqueN(DT, by=character(0L)), 3L) +test(2195.2, uniqueN(DT, by=NULL), 3L) +test(2195.3, unique(DT, by=character(0L)), ans<-data.table(idx=1:3, value="val")) +test(2195.4, unique(DT, by=NULL), ans) +test(2195.5, duplicated(DT, by=character(0L)), ans<-c(FALSE, FALSE, TRUE, FALSE)) +test(2195.6, duplicated(DT, by=NULL), ans) +test(2195.7, anyDuplicated(DT, by=character(0L)), 3L) +test(2195.8, anyDuplicated(DT, by=NULL), 3L) From cfc29dba6793a862ea84f84e2cd95043e23cb8a1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 17 Jun 2021 01:37:36 -0600 Subject: [PATCH 288/588] follow up to #4447 to pass R 3.1.0 --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 426151ad9c..a0e92fc729 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -30,6 +30,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = data.table:::dcast.data.table + if (!exists('endsWith', 'package:base', inherits=FALSE)) endsWith = data.table:::endsWith forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table @@ -53,6 +54,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { shallow = data.table:::shallow # until exported .shallow = data.table:::.shallow split.data.table = data.table:::split.data.table + if (!exists('startsWith', 'package:base', inherits=FALSE)) startsWith = data.table:::startsWith test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist From 2e8691185f8a0f7d26212fa4674d1d64f27151ee Mon Sep 17 
00:00:00 2001
From: Matt Dowle
Date: Thu, 17 Jun 2021 14:22:47 -0600
Subject: [PATCH 289/588] Don't call copyMostAttrib on ScalarLogical result (#5047)

---
 src/assign.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/assign.c b/src/assign.c
index c87d99bdb9..1955fe502a 100644
--- a/src/assign.c
+++ b/src/assign.c
@@ -1071,16 +1071,21 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
       BODY(SEXP, &, SEXP, val, SET_VECTOR_ELT(target, off+i, cval))
     } else {
       switch (TYPEOF(source)) {
-      // no protect of CAST needed because SET_VECTOR_ELT protects it, and it can't get released by copyMostAttrib or anything else inside BODY
-      // copyMostAttrib is appended to CAST so as to be outside loop
-      case RAWSXP: BODY(Rbyte, RAW, SEXP, ScalarRaw(val); copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval))
-      case LGLSXP: BODY(int, INTEGER, SEXP, ScalarLogical(val);copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval))
-      case INTSXP: BODY(int, INTEGER, SEXP, ScalarInteger(val);copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval))
-      case REALSXP: BODY(double, REAL, SEXP, ScalarReal(val); copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval))
-      case CPLXSXP: BODY(Rcomplex, COMPLEX, SEXP, ScalarComplex(val);copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval))
-      case STRSXP: BODY(SEXP, STRING_PTR, SEXP, ScalarString(val); copyMostAttrib(source,cval), SET_VECTOR_ELT(target,off+i,cval))
+      // allocVector instead of ScalarLogical to avoid copyMostAttrib on R's internal global TRUE/FALSE values; #4595. Then because
+      // ScalarInteger may now or in future R also return R internal global small integer constants, the same for that. Then
+      // because we do that here for logical and integer, use allocVector too for the other types to follow the same pattern and possibly
+      // in future R will also have some global constants for those types too.
+      // the UNPROTECT can be at the end of the CAST before the SET_VECTOR_ELT, because SET_VECTOR_ELT will protect it and there's no other code in between
+      // the PROTECT is now needed because of the call to LOGICAL() which could feasibly gc inside it.
+      // copyMostAttrib is inside CAST so as to be outside loop.
See the history in #4350 and its follow up + case RAWSXP: BODY(Rbyte, RAW, SEXP, PROTECT(allocVector(RAWSXP, 1));RAW(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case LGLSXP: BODY(int, LOGICAL, SEXP, PROTECT(allocVector(LGLSXP, 1));LOGICAL(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case INTSXP: BODY(int, INTEGER, SEXP, PROTECT(allocVector(INTSXP, 1));INTEGER(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case REALSXP: BODY(double, REAL, SEXP, PROTECT(allocVector(REALSXP, 1));REAL(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case CPLXSXP: BODY(Rcomplex, COMPLEX, SEXP, PROTECT(allocVector(CPLXSXP, 1));COMPLEX(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case STRSXP: BODY(SEXP, STRING_PTR, SEXP, PROTECT(allocVector(STRSXP, 1));SET_STRING_ELT(cval, 0, val);copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) case VECSXP: - case EXPRSXP: BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target,off+i,cval)) + case EXPRSXP: BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target,off+i,cval)) default: COERCE_ERROR("list"); } } From 1b4fc68ee1554151ce306a9793957ff9b6a69fe1 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Mon, 21 Jun 2021 21:59:41 +0200 Subject: [PATCH 290/588] Changes buffer size for writing header in fwrite (#5049) --- NEWS.md | 2 ++ src/fwrite.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0acc628a72..465b3505df 100644 --- a/NEWS.md +++ b/NEWS.md @@ -159,6 +159,8 @@ 7. Grouping by a `list` column has its error message improved stating this is unsupported, [#4308](https://github.com/Rdatatable/data.table/issues/4308). Thanks @sindribaldur for filing, and @michaelchirico for the PR. Please add your vote and especially use cases to the [#1597](https://github.com/Rdatatable/data.table/issues/1597) feature request. +8. OpenBSD 6.9 released May 2021 apparently uses a 16 year old version of zlib (v1.2.3 from 2005) which induces `Compress gzip error: -9` from `fwrite()`, [#5048](https://github.com/Rdatatable/data.table/issues/5048). Thanks to Philippe Chataignon for investigating and for the PR which attempts a solution. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/src/fwrite.c b/src/fwrite.c index b85d513a6f..d6f9322982 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -738,7 +738,9 @@ void fwriteMain(fwriteMainArgs args) free(buff); // # nocov STOP(_("Can't allocate gzip stream structure")); // # nocov } - size_t zbuffSize = deflateBound(&stream, headerLen); + // by default, buffsize is the same used for writing rows (#5048 old openbsd zlib) + // takes the max with headerLen size in case of very long header + size_t zbuffSize = deflateBound(&stream, headerLen > buffSize ? 
headerLen : buffSize); char *zbuff = malloc(zbuffSize); if (!zbuff) { free(buff); // # nocov From 4c600a256f2f571b36600a58de906a256670d548 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 21 Jun 2021 13:00:24 -0700 Subject: [PATCH 291/588] simplify checking for untranslated messages (#5050) --- .dev/CRAN_Release.cmd | 33 +++------------------------------ src/fwrite.c | 2 +- src/init.c | 2 +- src/nafill.c | 6 +++--- src/programming.c | 2 +- src/utils.c | 14 +++++++------- 6 files changed, 16 insertions(+), 43 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 1dfec0a02a..8be3ada82d 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -7,36 +7,9 @@ ## ideally, we are including _() wrapping in ## new PRs throughout dev cycle, and this step ## becomes about tying up loose ends -## Appending _() char array wrapping to all messages -## that might be shown to the user. This step is slightly -## too greedy, as it includes too many msg, some of which -## need not be translated [more work to do here to make -## this less manual] some things to watch out for: -## * quote embedded (and escaped) within message [could be fixed with smarter regex] -## * multi-line implicit-concat arrays (in C, `"a" "b"` is the same as `"ab"`) should be wrapped "on the outside" not individually -## * `data.table` shares some of its `src` with `pydatatable`, so the requirement to `#include ` before the `#define _` macro meant we need to be careful about including this macro only in the R headers for these files (hence I created `po.h`) -## * Can't use `_()` _inside_ another functional macro. Only wrap the string passed to the macro later. -for MSG in error warning DTWARN DTPRINT Rprintf STOP Error; - do for SRC_FILE in src/*.c; - # no inplace -i in default mac sed - do sed -E "s/$MSG[(](\"[^\"]*\")/$MSG(_(\1)/g" $SRC_FILE > out; - mv out $SRC_FILE; - done -done - -## checking for other lines calling these that didn't get _()-wrapped -for MSG in error warning DTWARN DTPRINT Rprintf STOP Error; - do grep -Er "\b$MSG[(]" src --include=*.c | grep -v _ | grep -Ev "(?:\s*//|[*]).*$MSG[(]" -done - -## similar, but a bit more manual to check snprintf usage - -## look for char array that haven't been covered yet -grep -Er '"[^"]+"' src --include=*.c | grep -Fv '_("' | \ - grep -Ev '#include|//.*".*"|strcmp|COERCE_ERROR|install\("|\{"' - -## look for lines starting with a char array (likely continued from prev line & can be combined) -grep -Er '^\s*"' src/*.c +## Check the output here for translatable messages +xgettext -o /dev/stdout ./*.c \ + --keyword=Rprintf --keyword=error --keyword=warning --keyword=STOP --keyword=DTWARN --keyword=Error --keyword=DTPRINT --keyword=snprintf:3 ## (b) Update R template file: src/R-data.table.pot ## NB: this relies on R >= 4.0 to remove a bug in update_pkg_po diff --git a/src/fwrite.c b/src/fwrite.c index d6f9322982..7bad0cd168 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -802,7 +802,7 @@ void fwriteMain(fwriteMainArgs args) if(init_stream(&stream)) STOP(_("Can't allocate gzip stream structure")); // # nocov zbuffSize = deflateBound(&stream, buffSize); - if (verbose) DTPRINT("zbuffSize=%d returned from deflateBound\n", (int)zbuffSize); + if (verbose) DTPRINT(_("zbuffSize=%d returned from deflateBound\n"), (int)zbuffSize); deflateEnd(&stream); #endif } diff --git a/src/init.c b/src/init.c index 2c65f1d980..083649685e 100644 --- a/src/init.c +++ b/src/init.c @@ -395,7 +395,7 @@ int GetVerbose() { // don't call repetitively; save first in that case 
SEXP opt = GetOption(sym_verbose, R_NilValue); if ((!isLogical(opt) && !isInteger(opt)) || LENGTH(opt)!=1 || INTEGER(opt)[0]==NA_INTEGER) - error("verbose option must be length 1 non-NA logical or integer"); + error(_("verbose option must be length 1 non-NA logical or integer")); return INTEGER(opt)[0]; } diff --git a/src/nafill.c b/src/nafill.c index ac5e28aacf..7c24152dcf 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -84,7 +84,7 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi } } if (verbose) - snprintf(ans->message[0], 500, "%s: took %.3fs\n", __func__, omp_get_wtime()-tic); + snprintf(ans->message[0], 500, _("%s: took %.3fs\n"), __func__, omp_get_wtime()-tic); } SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols) { @@ -100,7 +100,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S bool binplace = LOGICAL(inplace)[0]; if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) - error("nan_is_na must be TRUE or FALSE"); // # nocov + error(_("nan_is_na must be TRUE or FALSE")); // # nocov bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; SEXP x = R_NilValue; @@ -184,7 +184,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S SET_VECTOR_ELT(fill, i, fill1); } if (!isNewList(fill)) - error("internal error: 'fill' should be recycled as list already"); // # nocov + error(_("internal error: 'fill' should be recycled as list already")); // # nocov for (R_len_t i=0; i=2; // verbose level 2 required if (!LOGICAL(copyArg)[0] && TYPEOF(x)==TYPEOF(as) && class1(x)==class1(as)) { if (verbose) - Rprintf("copy=false and input already of expected type and class %s[%s]\n", type2char(TYPEOF(x)), class1(x)); + Rprintf(_("copy=false and input already of expected type and class %s[%s]\n"), type2char(TYPEOF(x)), class1(x)); copyMostAttrib(as, x); // so attrs like factor levels are same for copy=T|F return(x); } int len = LENGTH(x); SEXP ans = PROTECT(allocNAVectorLike(as, len)); if (verbose) - Rprintf("Coercing %s[%s] into %s[%s]\n", type2char(TYPEOF(x)), class1(x), type2char(TYPEOF(as)), class1(as)); + Rprintf(_("Coercing %s[%s] into %s[%s]\n"), type2char(TYPEOF(x)), class1(x), type2char(TYPEOF(as)), class1(as)); const char *ret = memrecycle(/*target=*/ans, /*where=*/R_NilValue, /*start=*/0, /*len=*/LENGTH(x), /*source=*/x, /*sourceStart=*/0, /*sourceLen=*/-1, /*colnum=*/0, /*colname=*/""); if (ret) warning(_("%s"), ret); @@ -385,7 +385,7 @@ SEXP dt_zlib_version() { #ifndef NOZLIB snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); #else - snprintf(out, 50, "zlib header files were not found when data.table was compiled"); + snprintf(out, 50, _("zlib header files were not found when data.table was compiled")); #endif return ScalarString(mkChar(out)); } From 793cec001e36c505d1562511adc36f878044d3ce Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 21 Jun 2021 16:58:44 -0400 Subject: [PATCH 292/588] fix melt(na.rm=TRUE) with list columns (#5044) --- NEWS.md | 3 +++ inst/tests/tests.Rraw | 9 +++++++- man/melt.data.table.Rd | 7 ++++-- src/fmelt.c | 18 +++++++++++++--- src/frank.c | 48 +++++++++++++++++++++++++++++++++++++----- 5 files changed, 74 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index 465b3505df..6a218bb6ba 100644 --- a/NEWS.md +++ b/NEWS.md @@ -135,6 +135,9 @@ 20. 
`uniqueN(DT, by=character())` is now equivalent to `uniqueN(DT)` rather than internal error `'by' is either not integer or is length 0`, [#4594](https://github.com/Rdatatable/data.table/issues/4594). Thanks Marco Colombo for the report, and Michael Chirico for the PR. Similarly for `unique()`, `duplicated()` and `anyDuplicated()`. +21. `melt()` on a `data.table` with `list` columns for `measure.vars` would silently ignore `na.rm=TRUE`, [#5044](https://github.com/Rdatatable/data.table/issues/5044). Now the same logic as `is.na()` from base R is used; i.e. if list element is scalar NA then it is considered missing and removed. Thanks to Toby Dylan Hocking for the PRs. + + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a0e92fc729..c1a6b1ca34 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3060,6 +3060,13 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) # na.rm=TRUE with list column value, PR#4737 test(1035.016, melt(data.table(a1=1, b1=list(1:2), b2=list(c('foo','bar'))), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=list(1:2))) test(1035.017, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=1))#this worked even before the PR. 
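# Illustrative sketch, not part of the patch: after #5044, melt(na.rm=TRUE) treats a list element
# as missing only when it is a length-1 NA, the same rule base::is.na() applies to lists.
# The DT below is hypothetical (not one of the test objects) and assumes data.table is attached.
library(data.table)
DT = data.table(id = 1:2, l = list(NA, 1:2))
is.na(DT$l)                                            # TRUE FALSE
melt(DT, id.vars="id", measure.vars="l", na.rm=TRUE)   # should keep only the id==2 row, value=list(1:2)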
+ DT.list.missing = data.table(l1=list(1,NA), l2=list(NA,2), n34=c(3,4), NA5=c(NA,5)) + test(1035.0180, melt(DT.list.missing, measure.vars=c("n34","NA5"), na.rm=TRUE)[["value"]], c(3,4,5)) + test(1035.0181, melt(DT.list.missing, measure.vars=c("l1","l2"), na.rm=TRUE)[["value"]], list(1,2)) + test(1035.0182, melt(DT.list.missing, measure.vars=c("l1","n34"), na.rm=TRUE)[["value"]], list(1,3,4), warning="are not all of the same type") + test(1035.0183, melt(DT.list.missing, measure.vars=c("l1","NA5"), na.rm=TRUE)[["value"]], list(1,5), warning="are not all of the same type") + test(1035.0184, melt(DT.list.missing, measure.vars=list(l=c("l1","l2"), n=c("n34","NA5")), na.rm=TRUE), data.table(variable=factor(1:2), l=list(1,2), n=c(3,5))) + test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_, if(test_bit64)NA_integer64_)), measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] @@ -6514,7 +6521,7 @@ test(1459.12, .Call("CsubsetDT", DT, 5L, seq_along(DT)), setDT(as.data.frame(DT) # Test for na.omit with list, raw and complex types DT = data.table(x=c(1L,1L,NA), y=c(NA, NA, 1), z=as.raw(1:3), w=list(1,NA,2), v=c(1+5i, NA, NA)) -test(1460.1, na.omit(DT, cols="w"), DT) +test(1460.1, na.omit(DT, cols="w"), DT[c(1,3)]) test(1460.2, na.omit(DT, cols="v"), DT[1]) test(1460.3, na.omit(DT, cols=c("v", "y")), DT[0]) test(1460.4, na.omit(DT, cols=c("z", "v")), DT[1]) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index ddca733fe8..b31017356b 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -141,8 +141,11 @@ melt(DT, id.vars=1:2, measure.vars=patterns(f="^f_", d="^d_"), value.factor=TRUE # na.rm=TRUE removes rows with NAs in any 'value' columns melt(DT, id.vars=1:2, measure.vars=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) -# return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column -melt(DT, id.vars=1:2, measure.vars=patterns("l_", "c_"), na.rm=TRUE) +# 'na.rm=TRUE' also works with list column, but note that is.na only +# returns TRUE if the list element is a length=1 vector with an NA. 
+is.na(list(one.NA=NA, two.NA=c(NA,NA))) +melt(DT, id.vars=1:2, measure.vars=patterns("l_", "d_"), na.rm=FALSE) +melt(DT, id.vars=1:2, measure.vars=patterns("l_", "d_"), na.rm=TRUE) # measure list with missing/short entries results in output with runs of NA DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] diff --git a/src/fmelt.c b/src/fmelt.c index 8c204cb5ce..b33d99adb4 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -520,6 +520,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int k=0; knrow; ++k) SET_STRING_ELT(target, j*data->nrow + k, STRING_ELT(thiscol, k)); } break; + //TODO complex value type: case CPLXSXP: { } break; case REALSXP : { double *dtarget = REAL(target); const double *dthiscol = REAL(thiscol); @@ -729,10 +730,21 @@ SEXP getidcols(SEXP DT, SEXP dtnames, Rboolean verbose, struct processData *data } break; case VECSXP : { - for (int j=0; jlmax; ++j) { - for (int k=0; knrow; ++k) { - SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + if (data->narm) { + for (int j=0; jlmax; ++j) { + SEXP thisidx = VECTOR_ELT(data->naidx, j); + const int *ithisidx = INTEGER(thisidx); + const int thislen = length(thisidx); + for (int k=0; klmax; ++j) { + for (int k=0; knrow; ++k) { + SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + } + } } } break; diff --git a/src/frank.c b/src/frank.c index 810baf85c5..2e9e14bcf1 100644 --- a/src/frank.c +++ b/src/frank.c @@ -19,7 +19,7 @@ SEXP dt_na(SEXP x, SEXP cols) { for (int i=0; i Date: Mon, 21 Jun 2021 23:37:42 +0200 Subject: [PATCH 293/588] Fix fread crashes (#5046) --- NEWS.md | 3 +++ inst/tests/tests.Rraw | 12 ++++++++++++ src/fread.c | 7 +++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6a218bb6ba..17d6622203 100644 --- a/NEWS.md +++ b/NEWS.md @@ -137,6 +137,9 @@ 21. `melt()` on a `data.table` with `list` columns for `measure.vars` would silently ignore `na.rm=TRUE`, [#5044](https://github.com/Rdatatable/data.table/issues/5044). Now the same logic as `is.na()` from base R is used; i.e. if list element is scalar NA then it is considered missing and removed. Thanks to Toby Dylan Hocking for the PRs. +22. `fread(fill=TRUE)` could segfault if the input contained an improperly quoted character field, [#4774](https://github.com/Rdatatable/data.table/issues/4774) [#5041](https://github.com/Rdatatable/data.table/issues/5041). Thanks to @AndeolEvain and @e-nascimento for reporting and to Václav Tlapák for the PR. + +23. `fread(fill=TRUE, verbose=TRUE)` would segfault on the out-of-sample type bump verbose output if the input did not contain column names, [5046](https://github.com/Rdatatable/data.table/pull/5046). Thanks to Václav Tlapák for the PR. 
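# Illustrative sketch, not part of the patch: a minimal input of the kind covered by the two
# fread() fixes above (#4774, #5041, #5046). Assumes data.table is attached; the exact warning
# wording may differ between versions.
library(data.table)
bad = paste0(paste(rep("a;b", 100), collapse="\n"), "\n\"a\" x;b")   # improperly quoted field in the final row
fread(bad, fill=TRUE, quote="\"")   # previously could segfault; should now parse and warn about the improper quoting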
## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c1a6b1ca34..21972d44bd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17763,3 +17763,15 @@ test(2195.5, duplicated(DT, by=character(0L)), ans<-c(FALSE, FALSE, TRUE, FALSE) test(2195.6, duplicated(DT, by=NULL), ans) test(2195.7, anyDuplicated(DT, by=character(0L)), 3L) test(2195.8, anyDuplicated(DT, by=NULL), 3L) + +# Improperly quoted character field with fill=TRUE would segfault, #4774 and #5041 +test(2196, + fread(paste0(paste(rep(c('a; b'), 100), collapse='\n'), c('\n"a" 2;b')), fill=TRUE, quote='\"'), + data.table(a=c(rep('a', 99), '"a" 2'), b=rep('b', 100)), warning='Found and resolved improper quoting') + +# Verbose output would segfault when no header present, out-of-sample type error and fill=TRUE, similar to #4644 +# Test vaildity may depend on sampling behaviour of fread since the type bump needs to occur out of sample to trigger the segfault +sampleText = paste0(paste(rep(c('1; 2'), 100), collapse='\n'), c('\n"a";2\n1; 2\n'), paste(rep(c('1; 2'), 100), collapse='\n')) +test(2197, fread(sampleText, fill=TRUE, quote='\"', verbose=TRUE, header=FALSE), + data.table(rep(c("1","a","1"),c(100,1,101)), 2L), + output='Column 1 bumped') diff --git a/src/fread.c b/src/fread.c index da3e0e18ea..d00cffcd1f 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2361,6 +2361,9 @@ int freadMain(freadMainArgs _args) { if (j+fieldsRemaining != ncol) break; checkedNumberOfFields = true; } + if (thisType <= -NUMTYPE) { + break; // Improperly quoted char field needs to be healed below, other columns will be filled #5041 and #4774 + } #pragma omp critical { joldType = type[j]; // fetch shared value again in case another thread bumped it while I was waiting. @@ -2369,8 +2372,8 @@ int freadMain(freadMainArgs _args) { if (verbose) { char temp[1001]; int len = snprintf(temp, 1000, - _("Column %d (\"%.*s\") bumped from '%s' to '%s' due to <<%.*s>> on row %"PRIu64"\n"), - j+1, colNames[j].len, colNamesAnchor + colNames[j].off, + _("Column %d%s%.*s%s bumped from '%s' to '%s' due to <<%.*s>> on row %"PRIu64"\n"), + j+1, colNames?" <<":"", colNames?(colNames[j].len):0, colNames?(colNamesAnchor+colNames[j].off):"", colNames?">>":"", typeName[abs(joldType)], typeName[abs(thisType)], (int)(tch-fieldStart), fieldStart, (uint64_t)(ctx.DTi+myNrow)); if (len > 1000) len = 1000; From 628fdeecbf699a54bfc430de4ad30a2fa1c31e1a Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 21 Jun 2021 23:48:07 +0200 Subject: [PATCH 294/588] as.df removes index (#5043) --- R/data.table.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/data.table.R b/R/data.table.R index 79b8e6483d..b3e6cc826d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2142,6 +2142,7 @@ as.data.frame.data.table = function(x, ...) 
setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names setattr(ans,"class","data.frame") setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted + setattr(ans,"index",NULL) #5042 setattr(ans,".internal.selfref",NULL) # leave tl intact, no harm, ans From 5440244e8abe73310ad755150d42e08053adf2f8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 21 Jun 2021 15:17:38 -0700 Subject: [PATCH 295/588] internal cleanup -- new helper clip_msec (#4924) --- R/IDateTime.R | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/R/IDateTime.R b/R/IDateTime.R index 832424091f..c84c173a72 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -146,13 +146,8 @@ as.ITime.POSIXct = function(x, tz = attr(x, "tzone", exact=TRUE), ...) { } as.ITime.numeric = function(x, ms = 'truncate', ...) { - secs = switch(ms, - 'truncate' = as.integer(x), - 'nearest' = as.integer(round(x)), - 'ceil' = as.integer(ceiling(x)), - stop("Valid options for ms are 'truncate', ", - "'nearest', and 'ceil'.")) %% 86400L - (setattr(secs, "class", "ITime")) # the %% here ^^ ensures a local copy is obtained; the truncate as.integer() may not copy + secs = clip_msec(x, ms) %% 86400L # the %% here ensures a local copy is obtained; the truncate as.integer() may not copy + (setattr(secs, "class", "ITime")) } as.ITime.character = function(x, format, ...) { @@ -181,23 +176,13 @@ as.ITime.character = function(x, format, ...) { } as.ITime.POSIXlt = function(x, ms = 'truncate', ...) { - secs = switch(ms, - 'truncate' = as.integer(x$sec), - 'nearest' = as.integer(round(x$sec)), - 'ceil' = as.integer(ceiling(x$sec)), - stop("Valid options for ms are 'truncate', ", - "'nearest', and 'ceil'.")) + secs = clip_msec(x$sec, ms) (setattr(with(x, secs + min * 60L + hour * 3600L), "class", "ITime")) # () wrap to return visibly } as.ITime.times = function(x, ms = 'truncate', ...) { secs = 86400 * (unclass(x) %% 1) - secs = switch(ms, - 'truncate' = as.integer(secs), - 'nearest' = as.integer(round(secs)), - 'ceil' = as.integer(ceiling(secs)), - stop("Valid options for ms are 'truncate', ", - "'nearest', and 'ceil'.")) + secs = clip_msec(secs, ms) (setattr(secs, "class", "ITime")) # the first line that creates sec will create a local copy so we can use setattr() to avoid potential copy of class()<- } @@ -311,6 +296,15 @@ as.POSIXlt.ITime = function(x, ...) { as.POSIXlt(as.POSIXct(x, ...)) } +clip_msec = function(secs, action) { + switch(action, + truncate = as.integer(secs), + nearest = as.integer(round(secs)), + ceil = as.integer(ceiling(secs)), + stop("Valid options for ms are 'truncate', 'nearest', and 'ceil'.") + ) +} + ################################################################### # Date - time extraction functions # Adapted from Hadley Wickham's routines cited below to ensure From 5295db281834f148ab907a26ed3be3f3882003b4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 21 Jun 2021 18:48:46 -0700 Subject: [PATCH 296/588] Add . and .. aliases to ?data.table (#4408) --- NEWS.md | 2 ++ R/data.table.R | 2 +- man/data.table.Rd | 10 +++++++--- vignettes/datatable-faq.Rmd | 19 +++++++++---------- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 17d6622203..a2576e71d2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -167,6 +167,8 @@ 8. 
OpenBSD 6.9 released May 2021 apparently uses a 16 year old version of zlib (v1.2.3 from 2005) which induces `Compress gzip error: -9` from `fwrite()`, [#5048](https://github.com/Rdatatable/data.table/issues/5048). Thanks to Philippe Chataignon for investigating and for the PR which attempts a solution. +9. `?"."`, `?".."`, `?".("`, and `?".()"` now point to `?data.table`, [#4385](https://github.com/Rdatatable/data.table/issues/4385) [#4407](https://github.com/Rdatatable/data.table/issues/4407). To help users find the documentation for these convenience features available inside `DT[...]`. Recall that `.` is an alias for `list`, and `..var` tells `data.table` to look for `var` in the calling environment as opposed to a column of the table. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/data.table.R b/R/data.table.R index b3e6cc826d..06fadd9ee6 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1351,7 +1351,7 @@ replace_dot_alias = function(e) { # There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write). if (is.name(jsub) && is.null(lhs) && !exists(jsubChar<-as.character(jsub), SDenv, inherits=FALSE)) { - stop("j (the 2nd argument inside [...]) is a single symbol but column name '",jsubChar,"' is not found. Perhaps you intended DT[, ..",jsubChar,"]. This difference to data.frame is deliberate and explained in FAQ 1.1.") + stop("j (the 2nd argument inside [...]) is a single symbol but column name '",jsubChar,"' is not found. If you intended to select columns using a variable in calling scope, please try DT[, ..",jsubChar,"]. The .. prefix conveys one-level-up similar to a file system path.") } jval = eval(jsub, SDenv, parent.frame()) diff --git a/man/data.table.Rd b/man/data.table.Rd index e934028a3b..9df490f77d 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -5,6 +5,10 @@ \alias{Ops.data.table} \alias{is.na.data.table} \alias{[.data.table} +\alias{.} +\alias{.(} +\alias{.()} +\alias{..} \title{ Enhanced data.frame } \description{ \code{data.table} \emph{inherits} from \code{data.frame}. It offers fast and memory efficient: file reader and writer, aggregations, updates, equi, non-equi, rolling, range and interval joins, in a short and flexible syntax, for faster development. @@ -276,9 +280,9 @@ DT[2:5, cat(v, "\n")] # just for j's side effect # select columns the data.frame way DT[, 2] # 2nd column, returns a data.table always -colNum = 2 # to refer vars in `j` from the outside of data use `..` prefix -DT[, ..colNum] # same, equivalent to DT[, .SD, .SDcols=colNum] -DT[["v"]] # same as DT[, v] but much faster +colNum = 2 +DT[, ..colNum] # same, .. prefix conveys to look for colNum one-level-up in calling scope +DT[["v"]] # same as DT[, v] but faster if called in a loop # grouping operations - j and by DT[, sum(v), by=x] # ad hoc by, order of groups preserved in result diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index 1df42e166c..f66f9611f1 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -66,22 +66,21 @@ Also continue reading and see the FAQ after next. Skim whole documents before ge The `j` expression is the 2nd argument. Try `DT[ , c("x","y","z")]` or `DT[ , .(x,y,z)]`. -## I assigned a variable `mycol = "x"` but then `DT[ , mycol]` returns `"x"`. How do I get it to look up the column name contained in the `mycol` variable? +## I assigned a variable `mycol="x"` but then `DT[, mycol]` returns an error. 
How do I get it to look up the column name contained in the `mycol` variable? -What's happening is that the `j` expression sees objects in the calling scope. The variable `mycol` does not exist as a column name of `DT` so `data.table` then looked in the calling scope and found `mycol` there and returned its value `"x"`. This is correct behaviour currently. Had `mycol` been a column name, then that column's data would have been returned. +The error is that column named `"mycol"` cannot be found, and this error is correct. `data.table`'s scoping is different to `data.frame` in that you can use column names as if they are variables directly inside `DT[...]` without prefixing each column name with `DT$`; see FAQ 1.1 above. -To get the column `x` from `DT`, there are a few options: +To use `mycol` to select the column `x` from `DT`, there are a few options: ```r -# using .. to tell data.table the variable should be evaluated -DT[ , ..mycol] -# using with=FALSE to do the same -DT[ , mycol, with=FALSE] -# treating DT as a list and using [[ -DT[[mycol]] +DT[, ..mycol] # .. prefix conveys to look for the mycol one level up in calling scope +DT[, mycol, with=FALSE] # revert to data.frame behavior +DT[[mycol]] # treat DT as a list and use [[ from base R ``` -The `with` argument refers to the `base` function `with` -- when `with=TRUE`, `data.table` operates similar to `with`, i.e. `DT[ , mycol]` behaves like `with(DT, mycol)`. When `with=FALSE`, the standard `data.frame` evaluation rules apply. +See `?data.table` for more details about the `..` prefix. + +The `with` argument takes its name from the `base` function `with()`. When `with=TRUE` (default), `data.table` operates similar to `with()`, i.e. `DT[, mycol]` behaves like `with(DT, mycol)`. When `with=FALSE`, the standard `data.frame` evaluation rules apply to all variables in `j` and you can no longer use column names directly. ## What are the benefits of being able to use column names as if they are variables inside `DT[...]`? From 3a5de9675630866266c28a87c3830c7a54b7f14b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 21 Jun 2021 20:35:59 -0600 Subject: [PATCH 297/588] #5044 follow-up: when bit64 is not available test_bit64 is FALSE but the NULL is not removed. 
Thanks to GLCI which has two instances running without bit64 to catch this --- inst/tests/tests.Rraw | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 21972d44bd..4d952e7144 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3066,7 +3066,10 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.0182, melt(DT.list.missing, measure.vars=c("l1","n34"), na.rm=TRUE)[["value"]], list(1,3,4), warning="are not all of the same type") test(1035.0183, melt(DT.list.missing, measure.vars=c("l1","NA5"), na.rm=TRUE)[["value"]], list(1,5), warning="are not all of the same type") test(1035.0184, melt(DT.list.missing, measure.vars=list(l=c("l1","l2"), n=c("n34","NA5")), na.rm=TRUE), data.table(variable=factor(1:2), l=list(1,2), n=c(3,5))) - test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_, if(test_bit64)NA_integer64_)), measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) + test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_, + if (test_bit64) NA_integer64_ else NA)), # 'else NA' otherwise NULL is not removed when test_bit64 is FALSE + measure.vars="l", na.rm=TRUE)[["value"]], + list(c(NA,NA))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From f26ed0110bd32b743920cfc98349678400f1eb17 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 21 Jun 2021 21:20:47 -0600 Subject: [PATCH 298/588] #4408 follow-up: line width in .Rd --- man/data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index 9df490f77d..4c6f9961e7 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -281,7 +281,7 @@ DT[2:5, cat(v, "\n")] # just for j's side effect # select columns the data.frame way DT[, 2] # 2nd column, returns a data.table always colNum = 2 -DT[, ..colNum] # same, .. prefix conveys to look for colNum one-level-up in calling scope +DT[, ..colNum] # same, .. prefix conveys one-level-up in calling scope DT[["v"]] # same as DT[, v] but faster if called in a loop # grouping operations - j and by From db2669862ff16602d6ef286491a4453e3ebce0ca Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 22 Jun 2021 10:26:59 +0200 Subject: [PATCH 299/588] easier way lkp all columns on join, closes #3184 (#5052) --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d952e7144..79fee4fcd1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17778,3 +17778,10 @@ sampleText = paste0(paste(rep(c('1; 2'), 100), collapse='\n'), c('\n"a";2\n1; 2\ test(2197, fread(sampleText, fill=TRUE, quote='\"', verbose=TRUE, header=FALSE), data.table(rep(c("1","a","1"),c(100,1,101)), 2L), output='Column 1 bumped') + +# Need an easier way to in-place merge multiple columns #3184 +d1 = data.table(id = 1:5, x1=5:1, x2=5:1/2) +d2 = data.table(id = 2:4, y1=4:2, y2=4:2/2) +test(2198.1, d1[d2, paste0("z", 1:2) := Y, on = "id", env = list(Y = as.list(paste0("i.y", 1:2)))], data.table(id=1:5, x1=5:1, x2=5:1/2, z1=c(NA,4:2,NA), z2=c(NA,4:2/2,NA))) ## using i. prefix +test(2198.2, d1[d2, paste0("z", 1:2) := Y, on = "id", env = list(Y = as.list(paste0("y", 1:2)))], data.table(id=1:5, x1=5:1, x2=5:1/2, z1=c(NA,4:2,NA), z2=c(NA,4:2/2,NA))) ## no i. 
prefix should still work + From 7ac2120205b39748c338419c6ae6e6c1515839db Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 22 Jun 2021 11:00:01 +0200 Subject: [PATCH 300/588] set and := no longer warn when LHS is empty (#4339) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 11 +++++++++-- src/assign.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index a2576e71d2..af8d5b0270 100644 --- a/NEWS.md +++ b/NEWS.md @@ -169,6 +169,8 @@ 9. `?"."`, `?".."`, `?".("`, and `?".()"` now point to `?data.table`, [#4385](https://github.com/Rdatatable/data.table/issues/4385) [#4407](https://github.com/Rdatatable/data.table/issues/4407). To help users find the documentation for these convenience features available inside `DT[...]`. Recall that `.` is an alias for `list`, and `..var` tells `data.table` to look for `var` in the calling environment as opposed to a column of the table. +10. `DT[, lhs:=rhs]` and `set(DT, , lhs, rhs)` no longer raise a warning on zero length `lhs`, [#4086](https://github.com/Rdatatable/data.table/issues/4086). Thanks to Jan Gorecki for the suggestion and PR. For example, `DT[, grep("foo", names(dt)) := NULL]` no longer warns if there are no column names containing `"foo"`. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 79fee4fcd1..a934fec48a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5045,9 +5045,16 @@ if (test_bit64) { error="Cannot coerce 'list' RHS to 'integer64' to match.*column 6 named 'f'") } -# FR #343, when LHS evaluates to integer(0), provide warning and return dt, not an error. +# FR #343, when LHS evaluates to integer(0), provide warning and return dt, not an error... but then... 
#4086 set could allow empty input without warning dt = data.table(a = 1:5, b1 = 1:5, b2 = 1:5) -test(1295, dt[, grep("c", names(dt)) := NULL], dt, warning="length(LHS)==0; no columns to delete or assign RHS to") +test(1295.1, dt[, grep("c", names(dt)) := NULL], dt) +test(1295.2, set(dt, NULL, character(), 1L), dt) +test(1295.3, set(dt, 1L, character(), 1L), dt) +op = options(datatable.verbose=TRUE) +test(1295.4, dt[, grep("c", names(dt)) := NULL], dt, output="length(LHS)==0; no columns to delete or assign RHS to") +test(1295.5, set(dt, NULL, character(), 1L), dt, output="length(LHS)==0; no columns to delete or assign RHS to") +test(1295.6, set(dt, 1L, character(), 1L), dt, output="length(LHS)==0; no columns to delete or assign RHS to") +options(op) # Updating logical column in one-row DT (corruption of new R 3.1 internal globals for TRUE, FALSE and NA) DT = data.table(a=1:6, b=c(TRUE,FALSE)) diff --git a/src/assign.c b/src/assign.c index 1955fe502a..af3768f81a 100644 --- a/src/assign.c +++ b/src/assign.c @@ -352,7 +352,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } } if (!length(cols)) { - warning(_("length(LHS)==0; no columns to delete or assign RHS to.")); // test 1295 covers + if (verbose) Rprintf(_("length(LHS)==0; no columns to delete or assign RHS to.")); // test 1295 covers *_Last_updated = 0; UNPROTECT(protecti); return(dt); From 0a62718a1040d0c62067c8440d012f251f829534 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 22 Jun 2021 20:12:25 +0200 Subject: [PATCH 301/588] .SDcols peeling outer ( and edge case of :, and a logical of diff length (#4470) --- NEWS.md | 5 +++++ R/data.table.R | 9 ++++++--- inst/tests/tests.Rraw | 11 +++++++++-- man/data.table.Rd | 2 +- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index af8d5b0270..cef5c3918b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -141,6 +141,10 @@ 23. `fread(fill=TRUE, verbose=TRUE)` would segfault on the out-of-sample type bump verbose output if the input did not contain column names, [5046](https://github.com/Rdatatable/data.table/pull/5046). Thanks to Václav Tlapák for the PR. +24. `.SDcols=-V2:-V1` and `.SDcols=(-1)` could error with `xcolAns does not pass checks` and `argument specifying columns specify non existing column(s)`, [#4231](https://github.com/Rdatatable/data.table/issues/4231). Thanks to Jan Gorecki for reporting and the PR. + +25. `.SDcols=` is now documented in `?data.table` and it is now an error if the logical vector's length is not equal to the number of columns (consistent with `data.table`'s no-recycling policy; see new feature 1 in v1.12.2 Apr 2019), [#4115](https://github.com/Rdatatable/data.table/issues/4115). Thanks to @Henrik-P for reporting and Jan Gorecki for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. 
The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -376,6 +380,7 @@ has a better chance of working on Mac. 11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. 12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. + 13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). 14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. diff --git a/R/data.table.R b/R/data.table.R index 06fadd9ee6..aaa13da799 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -988,6 +988,8 @@ replace_dot_alias = function(e) { } else { # FR #355 - negative numeric and character indices for SDcols colsub = substitute(.SDcols) + # peel from parentheses before negation so (-1L) works as well: as.data.table(as.list(1:3))[, .SD,.SDcols=(-1L)] #4231 + while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] # fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol. if (colsub %iscall% c("!", "-")) { negate_sdcols = TRUE @@ -995,8 +997,8 @@ replace_dot_alias = function(e) { } else negate_sdcols = FALSE # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4))) while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] - if (colsub %iscall% ':' && length(colsub)==3L) { - # .SDcols is of the format a:b + if (colsub %iscall% ':' && length(colsub)==3L && !is.call(colsub[[2L]]) && !is.call(colsub[[3]])) { + # .SDcols is of the format a:b, ensure none of : arguments is a call data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1] #4231 .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) } else { if (colsub %iscall% 'patterns') { @@ -1016,7 +1018,8 @@ replace_dot_alias = function(e) { if (anyNA(.SDcols)) stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols)))) if (is.logical(.SDcols)) { - ansvals = which_(rep(.SDcols, length.out=length(x)), !negate_sdcols) + if (length(.SDcols)!=length(x)) stop(gettextf(".SDcols is a logical vector length %d but there are %d columns", length(.SDcols), length(x))) + ansvals = which_(.SDcols, !negate_sdcols) ansvars = sdvars = names_x[ansvals] } else if (is.numeric(.SDcols)) { .SDcols = as.integer(.SDcols) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a934fec48a..65c595d273 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7080,8 +7080,10 @@ test(1497, DT[, .SD, .SDcols = !c("a", "c")], DT[, !c("a", "c"), with=FALSE]) # Fix for #1060 DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6) -test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE]) -test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE]) +test(1498.1, DT[, 
.SD, .SDcols=c(TRUE,FALSE)], error="logical.*length 2 but.*6 columns") # #4115 #4470 +test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], error="logical.*length 2 but.*6 columns") +test(1498.3, DT[, .SD, .SDcols=c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE)], data.table(x=1, z=3, b=5)) +test(1498.4, DT[, .SD, .SDcols=!c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE)], data.table(y=2, a=4, c=6)) # Fix for #1072 dt <- data.table(group1 = "a", group2 = "z", value = 1) @@ -17792,3 +17794,8 @@ d2 = data.table(id = 2:4, y1=4:2, y2=4:2/2) test(2198.1, d1[d2, paste0("z", 1:2) := Y, on = "id", env = list(Y = as.list(paste0("i.y", 1:2)))], data.table(id=1:5, x1=5:1, x2=5:1/2, z1=c(NA,4:2,NA), z2=c(NA,4:2/2,NA))) ## using i. prefix test(2198.2, d1[d2, paste0("z", 1:2) := Y, on = "id", env = list(Y = as.list(paste0("y", 1:2)))], data.table(id=1:5, x1=5:1, x2=5:1/2, z1=c(NA,4:2,NA), z2=c(NA,4:2/2,NA))) ## no i. prefix should still work +# internal error when specifying .SDcols, #4231 +test(2199.1, as.data.table(as.list(1:2))[, .SD,.SDcols=(-1L)], data.table(V2=2L)) +test(2199.2, as.data.table(as.list(1:2))[, .SD,.SDcols=(-(1L))], data.table(V2=2L)) +test(2199.3, as.data.table(as.list(1:3))[, .SD,.SDcols=(-1L)], data.table(V2=2L, V3=3L)) +test(2199.4, data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1], error="not found") diff --git a/man/data.table.Rd b/man/data.table.Rd index 4c6f9961e7..7418d12118 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -145,7 +145,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{which}{\code{TRUE} returns the row numbers of \code{x} that \code{i} matches to. If \code{NA}, returns the row numbers of \code{i} that have no match in \code{x}. By default \code{FALSE} and the rows in \code{x} that match are returned.} - \item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names or numeric positions. This is useful for speed when applying a function through a subset of (possible very many) columns; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}. + \item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names, numeric positions, logical, a function name such as `is.numeric`, or a function call such as `patterns()`. `.SDcols` is particularly useful for speed when applying a function through a subset of (possible very many) columns by group; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}. For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}. 
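# Illustrative sketch, not part of the patch: the .SDcols behaviour covered by #4231 and #4115,
# mirroring tests 1498.* and 2199.* above. Assumes data.table is attached.
library(data.table)
DT = data.table(x=1, y=2, z=3)
DT[, .SD, .SDcols=(-1)]                  # parenthesised negative index now works: returns y and z
DT[, .SD, .SDcols=c(TRUE, FALSE, TRUE)]  # logical selector of full length: returns x and z
# DT[, .SD, .SDcols=c(TRUE, FALSE)]      # too-short logical is now an error rather than being recycled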
From fd24a3105953f7785ea7414678ed8e04524e6955 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 23 Jun 2021 00:58:45 -0600 Subject: [PATCH 302/588] .dev-only: revdep now halts if the same package is installed in both paths --- .dev/revdep.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.dev/revdep.R b/.dev/revdep.R index 38c5a93a66..5d80c32612 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -28,6 +28,20 @@ if (grepl("devel", .libPaths()[2L])) { stopifnot(tt[2L]=="root") R = "R" # R-release } +if (length(dup_pkgs <- intersect(dir(.libPaths()[1]), dir(.libPaths()[2])))) { + stop("Package(s) installed in both paths: ", paste(dup_pkgs, collapse=" ")) + # S4 issues are frustrating as it seems that dependencies need to be reinstalled to reset the nobody-knows-how-it-works S4 cache? + # For example, to fix prioritizr's error in its examples : + # error in evaluating the argument 'x' in selecting a method for function 'print': + # error in evaluating the argument 'y' in selecting a method for function 'intersecting_units': + # package slot missing from signature for generic ‘raster’ + # I reinstalled raster. But that didn't help. Then raster's depend sp but that didn't help. Then sp followed by raster + # again but that didn't help. Then all dependencies but that didn't help. Then lattice too but that didn't help. + # Then after a few hours, I stumbled on the realization when I reinstalled lattice it got installed in revdeplib rather + # than R's base library where the recommended packages are; lattice was in two places on .libPaths(). Removing it from + # revdeplib didn't help. But reinstalling lattice (using sudo R) in /usr/lib/R/library did appear to finally fix prioritizr. + # Why it was lattice that needed to be reinstalled just beats me, and why do S4 packages need to be reinstalled anyway? +} stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"true")) # _R_CHECK_FORCE_SUGGESTS_=true explicitly in .dev/.bash_aliases From ed72e398df76a0fcfd134a4ad92356690e4210ea Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 26 Jun 2021 12:11:08 -0400 Subject: [PATCH 303/588] remove allocNAVector in melt for memory efficiency (#5054) --- NEWS.md | 2 + src/fmelt.c | 204 ++++++++++++++++++++++++++++------------------------ 2 files changed, 111 insertions(+), 95 deletions(-) diff --git a/NEWS.md b/NEWS.md index cef5c3918b..3eed2775fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -175,6 +175,8 @@ 10. `DT[, lhs:=rhs]` and `set(DT, , lhs, rhs)` no longer raise a warning on zero length `lhs`, [#4086](https://github.com/Rdatatable/data.table/issues/4086). Thanks to Jan Gorecki for the suggestion and PR. For example, `DT[, grep("foo", names(dt)) := NULL]` no longer warns if there are no column names containing `"foo"`. +11. `melt()`'s internal C code is now more memory efficient, [#5054](https://github.com/Rdatatable/data.table/pull/5054). Thanks to Toby Dylan Hocking for the PR. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/src/fmelt.c b/src/fmelt.c index b33d99adb4..0ee7e70ef8 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -270,11 +270,11 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } struct processData { - SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 + SEXP RCHK; // a 2 item list holding vars (result of checkVars) and not_NA_indices. 
PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 SEXP idcols, // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively variable_table, // NULL or data for variable column(s). valuecols, // list with one element per output/value column, each element is an integer vector. - naidx; + not_NA_indices; int *isfactor, *leach, // length of each element of the valuecols(measure.vars) list. *isidentical; // are all inputs for this value column the same type? @@ -313,10 +313,12 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna for (int i=0; ilvalues; ++i) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); + if (data->leach[i] > data->lmax) { + data->lmax = data->leach[i]; + } data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below - data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; for (int j=0; jleach[i]; ++j) { // for each input column. int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ @@ -344,7 +346,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna } } if (data->narm) { - SET_VECTOR_ELT(data->RCHK, 1, data->naidx = allocVector(VECSXP, data->lmax)); + SET_VECTOR_ELT(data->RCHK, 1, data->not_NA_indices = allocVector(VECSXP, data->lmax)); } // TDH 1 Oct 2020 variable table. data->variable_table = getAttrib(measure, sym_variable_table); @@ -360,7 +362,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna for (int i=0; ivariable_table); ++i) { int nrow = length(VECTOR_ELT(data->variable_table, i)); if (data->lmax != nrow) { - error(_("variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); + error(_("variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =%d"), data->lmax); } } } else {//neither NULL nor DT. @@ -433,14 +435,14 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType return ans; } -SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { +SEXP input_col_or_NULL(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { if (in_col < data->leach[out_col]) { int input_column_num = INTEGER(thisvaluecols)[in_col]; if (input_column_num != NA_INTEGER) { return VECTOR_ELT(DT, input_column_num-1); } } - return allocNAVector(data->maxtype[out_col], data->nrow); + return R_NilValue; } SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { @@ -451,17 +453,27 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s } if (data->narm) { SEXP seqcols = PROTECT(seq_int(data->lvalues, 1)); - for (int i=0; ilmax; ++i) { - SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); - for (int j=0; jlvalues; ++j) { + for (int i=0; ilmax; ++i) {//element in measure vector. + SEXP valuecols_data = PROTECT(allocVector(VECSXP, data->lvalues)); + int N_missing_columns = 0; + for (int j=0; jlvalues; ++j) {//which measure vector/output col. 
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i)); + SEXP vec_or_NULL = input_col_or_NULL(DT, data, thisvaluecols, j, i); + if (vec_or_NULL == R_NilValue) { + N_missing_columns++; + } + SET_VECTOR_ELT(valuecols_data, j, vec_or_NULL); + } + if (N_missing_columns==0) { + SEXP any_missing = PROTECT(dt_na(valuecols_data, seqcols)); + SEXP missing_indices; + SET_VECTOR_ELT(data->not_NA_indices, i, missing_indices=which(any_missing, FALSE)); + data->totlen += length(missing_indices); + UNPROTECT(1); // any_missing + } else { + SET_VECTOR_ELT(data->not_NA_indices, i, allocVector(INTSXP, 0)); } - tmp = PROTECT(dt_na(tmp, seqcols)); - SEXP w; - SET_VECTOR_ELT(data->naidx, i, w=which(tmp, FALSE)); - data->totlen += length(w); - UNPROTECT(2); // tmp twice + UNPROTECT(1); // valuecols_data } UNPROTECT(1); // seqcols } else { @@ -480,80 +492,82 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s bool copyattr = false; for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j); - if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { - copyMostAttrib(thiscol, target); - copyattr = true; - } - if (TYPEOF(thiscol) != TYPEOF(target) && (data->maxtype[i] == VECSXP || !isFactor(thiscol))) { - thiscol = PROTECT(coerceVector(thiscol, TYPEOF(target))); thisprotecti++; - } - const int *ithisidx = NULL; - int thislen = 0; - if (data->narm) { - SEXP thisidx = VECTOR_ELT(data->naidx, j); - ithisidx = INTEGER(thisidx); - thislen = length(thisidx); - } - size_t size = SIZEOF(thiscol); - switch (TYPEOF(target)) { - case VECSXP : - if (data->narm) { - for (int k=0; knrow; ++k) SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + SEXP thiscol = input_col_or_NULL(DT, data, thisvaluecols, i, j); + if (thiscol == R_NilValue) { + if (!data->narm) { + writeNA(target, j*data->nrow, data->nrow); } - break; - case STRSXP : - if (data->isfactor[i]) { - if (isFactor(thiscol)) { - SET_VECTOR_ELT(flevels, j, getAttrib(thiscol, R_LevelsSymbol)); - thiscol = PROTECT(asCharacterFactor(thiscol)); thisprotecti++; - isordered[j] = isOrdered(thiscol); - } else SET_VECTOR_ELT(flevels, j, thiscol); + }else{ + if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { + copyMostAttrib(thiscol, target); + copyattr = true; } - if (data->narm) { - for (int k=0; knrow; ++k) SET_STRING_ELT(target, j*data->nrow + k, STRING_ELT(thiscol, k)); + if (TYPEOF(thiscol) != TYPEOF(target) && (data->maxtype[i] == VECSXP || !isFactor(thiscol))) { + thiscol = PROTECT(coerceVector(thiscol, TYPEOF(target))); thisprotecti++; } - break; - //TODO complex value type: case CPLXSXP: { } break; - case REALSXP : { - double *dtarget = REAL(target); - const double *dthiscol = REAL(thiscol); + const int *ithisidx = NULL; + int thislen = 0; if (data->narm) { - for (int k=0; knrow, dthiscol, data->nrow*size); + SEXP thisidx = VECTOR_ELT(data->not_NA_indices, j); + ithisidx = INTEGER(thisidx); + thislen = length(thisidx); } - } - break; - case INTSXP : - case LGLSXP : { - int *itarget = INTEGER(target); - const int *ithiscol = INTEGER(thiscol); - if (data->narm) { - for (int k=0; knrow, ithiscol, data->nrow*size); + size_t size = SIZEOF(thiscol); + switch (TYPEOF(target)) { + case VECSXP : + if (data->narm) { + for (int k=0; knrow; ++k) SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + } + break; + case STRSXP : + if (data->isfactor[i]) { + if 
(isFactor(thiscol)) { + SET_VECTOR_ELT(flevels, j, getAttrib(thiscol, R_LevelsSymbol)); + thiscol = PROTECT(asCharacterFactor(thiscol)); thisprotecti++; + isordered[j] = isOrdered(thiscol); + } else SET_VECTOR_ELT(flevels, j, thiscol); + } + if (data->narm) { + for (int k=0; knrow; ++k) SET_STRING_ELT(target, j*data->nrow + k, STRING_ELT(thiscol, k)); + } + break; + //TODO complex value type: case CPLXSXP: { } break; + case REALSXP : { + double *dtarget = REAL(target); + const double *dthiscol = REAL(thiscol); + if (data->narm) { + for (int k=0; knrow, dthiscol, data->nrow*size); + } + } + break; + case INTSXP : + case LGLSXP : { + int *itarget = INTEGER(target); + const int *ithiscol = INTEGER(thiscol); + if (data->narm) { + for (int k=0; knrow, ithiscol, data->nrow*size); + } + } break; + default : + error(_("Unknown column type '%s' for column '%s'."), type2char(TYPEOF(thiscol)), CHAR(STRING_ELT(dtnames, INTEGER(thisvaluecols)[i]-1))); } - } break; - default : - error(_("Unknown column type '%s' for column '%s'."), type2char(TYPEOF(thiscol)), CHAR(STRING_ELT(dtnames, INTEGER(thisvaluecols)[i]-1))); + if (data->narm) counter += thislen; } - if (data->narm) counter += thislen; UNPROTECT(thisprotecti); // inside inner loop (note that it's double loop) so as to limit use of protection stack } if (thisvalfactor && data->isfactor[i] && TYPEOF(target) != VECSXP) { - //SEXP clevels = PROTECT(combineFactorLevels(flevels, &(data->isfactor[i]), isordered)); - //SEXP factorLangSxp = PROTECT(lang3(install(data->isfactor[i] == 1 ? "factor" : "ordered"), target, clevels)); - //SET_VECTOR_ELT(ansvals, i, eval(factorLangSxp, R_GlobalEnv)); - //UNPROTECT(2); // clevels, factorLangSxp SET_VECTOR_ELT(ansvals, i, combineFactorLevels(flevels, target, &(data->isfactor[i]), isordered)); } } @@ -575,13 +589,13 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (data->lvalues == 1) {//one value column to output. const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; jlmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; SEXP str = STRING_ELT(dtnames, thisvaluecols[j]-1); for (int k=0; klmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; char buff[20]; snprintf(buff, 20, "%d", level++); for (int k=0; knarm && length(VECTOR_ELT(data->naidx, j))==0)) { numRemove++; md[j]=0; } + if (md[j]!=j+1 /*dup*/ || (data->narm && length(VECTOR_ELT(data->not_NA_indices, j))==0)) { numRemove++; md[j]=0; } } if (numRemove) { SEXP newlevels = PROTECT(allocVector(STRSXP, len-numRemove)); protecti++; @@ -611,14 +625,14 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str levels = newlevels; } for (int j=0, ansloc=0; jlmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; for (int k=0; klmax)); protecti++; for (int j=0, ansloc=0; jlmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + const int thislen = data->narm ? 
length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; char buff[20]; snprintf(buff, 20, "%d", nlevel+1); SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels @@ -633,7 +647,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SEXP out_col = VECTOR_ELT(data->variable_table, out_col_i); SET_VECTOR_ELT(ansvars, out_col_i, target=allocVector(TYPEOF(out_col), data->totlen)); for (int j=0, ansloc=0; jlmax; ++j) { - const int thislen = data->narm ? length(VECTOR_ELT(data->naidx, j)) : data->nrow; + const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; switch (TYPEOF(target)) { case STRSXP : for (int k=0; knarm) { for (int j=0; jlmax; ++j) { - SEXP thisidx = VECTOR_ELT(data->naidx, j); + SEXP thisidx = VECTOR_ELT(data->not_NA_indices, j); const int *ithisidx = INTEGER(thisidx); const int thislen = length(thisidx); for (int k=0; knarm) { for (int j=0; jlmax; ++j) { - SEXP thisidx = VECTOR_ELT(data->naidx, j); + SEXP thisidx = VECTOR_ELT(data->not_NA_indices, j); const int *ithisidx = INTEGER(thisidx); const int thislen = length(thisidx); for (int k=0; knarm) { for (int j=0; jlmax; ++j) { - SEXP thisidx = VECTOR_ELT(data->naidx, j); + SEXP thisidx = VECTOR_ELT(data->not_NA_indices, j); const int *ithisidx = INTEGER(thisidx); const int thislen = length(thisidx); for (int k=0; knarm) { for (int j=0; jlmax; ++j) { - SEXP thisidx = VECTOR_ELT(data->naidx, j); + SEXP thisidx = VECTOR_ELT(data->not_NA_indices, j); const int *ithisidx = INTEGER(thisidx); const int thislen = length(thisidx); for (int k=0; klmax; ++j) { - for (int k=0; knrow; ++k) { - SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); - } - } + for (int j=0; jlmax; ++j) { + for (int k=0; knrow; ++k) { + SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + } + } } } break; From 1759f3c750e90c2313fbec065b96231a17e68e8c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 7 Jul 2021 20:40:21 -0700 Subject: [PATCH 304/588] Add new template functions for internal error/warning/message to ease translation (#5056) --- .gitignore | 1 - R/as.data.table.R | 4 +- R/between.R | 20 ++--- R/bmerge.R | 12 +-- R/data.table.R | 160 +++++++++++++++++------------------- R/devel.R | 2 +- R/fcast.R | 10 +-- R/fmelt.R | 35 ++++---- R/foverlaps.R | 26 +++--- R/fread.R | 40 ++++----- R/fwrite.R | 8 +- R/groupingsets.R | 4 +- R/last.R | 4 +- R/merge.R | 12 +-- R/onAttach.R | 24 +++--- R/onLoad.R | 4 +- R/openmp-utils.R | 4 +- R/print.data.table.R | 4 +- R/programming.R | 3 +- R/setkey.R | 14 ++-- R/setops.R | 15 ++-- R/tables.R | 2 +- R/test.data.table.R | 11 +-- R/translation.R | 21 +++++ R/transpose.R | 10 +-- R/utils.R | 5 -- R/xts.R | 4 +- inst/tests/programming.Rraw | 38 ++++----- inst/tests/tests.Rraw | 38 ++++----- 29 files changed, 264 insertions(+), 271 deletions(-) create mode 100644 R/translation.R diff --git a/.gitignore b/.gitignore index 51cc13cd69..00d0d0e8be 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,6 @@ vignettes/plots/figures .Renviron lib library -*.R *.csv *.csvy *.RDS diff --git a/R/as.data.table.R b/R/as.data.table.R index 75e8d23ae0..2f1a336868 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -105,7 +105,7 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va if (is.null(names(val)) || !any(nzchar(names(val)))) setattr(val, 'names', paste0("V", rev(seq_along(val)))) if (value.name %chin% names(val)) - stop("Argument 'value.name' should not overlap with column 
names in result: ", brackify(rev(names(val)))) + stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val)))) N = NULL ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x)) if (isTRUE(na.rm)) @@ -178,7 +178,7 @@ as.data.table.list = function(x, xi = x[[i]] if (is.null(xi)) { n_null = n_null+1L; next } if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow - warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") + warningf("Item %d has %d rows but longest item has %d; recycled with remainder.", i, eachnrow[i], nrow) if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above prefix = if (!isFALSE(.named[i]) && isTRUE(nchar(names(x)[i])>0L)) paste0(names(x)[i],".") else "" # test 2058.12 for (j in seq_along(xi)) { diff --git a/R/between.R b/R/between.R index 61fee332b4..0ece7f3ef5 100644 --- a/R/between.R +++ b/R/between.R @@ -9,10 +9,10 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) if (is.px(x) && (is.character(lower) || is.character(upper))) { tz = attr(x, "tzone", exact=TRUE) if (is.null(tz)) tz = "" - if (is.character(lower)) lower = tryCatch(as.POSIXct(lower, tz=tz), error=function(e)stop( - "'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", e$message)) - if (is.character(upper)) upper = tryCatch(as.POSIXct(upper, tz=tz), error=function(e)stop( - "'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", e$message)) + if (is.character(lower)) lower = tryCatch(as.POSIXct(lower, tz=tz), error=function(e)stopf( + "'between' function the 'x' argument is a POSIX class while '%s' was not, coercion to POSIX failed with: %s", 'lower', e$message)) + if (is.character(upper)) upper = tryCatch(as.POSIXct(upper, tz=tz), error=function(e)stopf( + "'between' function the 'x' argument is a POSIX class while '%s' was not, coercion to POSIX failed with: %s", 'upper', e$message)) stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal } # POSIX check timezone match @@ -24,11 +24,11 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) # lower/upper should be more tightly linked than x/lower, so error # if the former don't match but only inform if they latter don't if (tzs[2L]!=tzs[3L]) { - stop("'between' lower= and upper= are both POSIXct but have different tzone attributes: ", brackify(tzs[2:3],quote=TRUE), ". Please align their time zones.") + stopf("'between' lower= and upper= are both POSIXct but have different tzone attributes: %s. Please align their time zones.", brackify(tzs[2:3], quote=TRUE)) # otherwise the check in between.c that lower<=upper can (correctly) fail for this reason } if (tzs[1L]!=tzs[2L]) { - message("'between' arguments are all POSIXct but have mismatched tzone attributes: ", brackify(tzs,quote=TRUE),". The UTC times will be compared.") + messagef("'between' arguments are all POSIXct but have mismatched tzone attributes: %s. The UTC times will be compared.", brackify(tzs, quote=TRUE)) # the underlying numeric is always UTC anyway in POSIXct so no coerce is needed; just compare as-is. As done by CoSMoS::example(analyzeTS), #3581 } } @@ -60,12 +60,8 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) y = eval.parent(ysub) } if ((l <- length(y)) != 2L) { - stop("RHS has length() ", l, "; expecting length 2. 
", - if (ysub %iscall% 'c') - sprintf("Perhaps you meant %s? ", - capture.output(print(`[[<-`(ysub, 1L, quote(list))))), - "The first element should be the lower bound(s); ", - "the second element should be the upper bound(s).") + suggestion <- if (ysub %iscall% 'c') gettextf("Perhaps you meant %s? ", capture.output(print(`[[<-`(ysub, 1L, quote(list))))) else "" + stopf("RHS has length() %d; expecting length 2. %sThe first element should be the lower bound(s); the second element should be the upper bound(s).", l, suggestion) } between(x, y[[1L]], y[[2L]], incbounds=TRUE) } diff --git a/R/bmerge.R b/R/bmerge.R index 6bafd0e5bc..6d5b30a244 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -45,11 +45,11 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos iclass = getClass(i[[ic]]) xname = paste0("x.", names(x)[xc]) iname = paste0("i.", names(i)[ic]) - if (!xclass %chin% supported) stop("x.", names(x)[xc]," is type ", xclass, " which is not supported by data.table join") - if (!iclass %chin% supported) stop("i.", names(i)[ic]," is type ", iclass, " which is not supported by data.table join") + if (!xclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, xclass) + if (!iclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, iclass) if (xclass=="factor" || iclass=="factor") { if (roll!=0.0 && a==length(icols)) - stop("Attempting roll join on factor column when joining x.",names(x)[xc]," to i.",names(i)[ic],". Only integer, double or character columns may be roll joined.") + stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname) if (xclass=="factor" && iclass=="factor") { if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values @@ -68,7 +68,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos next } } - stop("Incompatible join types: x.", names(x)[xc], " (",xclass,") and i.", names(i)[ic], " (",iclass,"). Factor columns must join to factor or character columns.") + stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, xclass, iname, iclass) } if (xclass == iclass) { if (verbose) catf("%s has same type (%s) as %s. 
No coercion needed.\n", iname, xclass, xname) @@ -87,7 +87,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]])) next } - stop("Incompatible join types: x.", names(x)[xc], " (",xclass,") and i.", names(i)[ic], " (",iclass,")") + stopf("Incompatible join types: %s (%s) and %s (%s)", xname, xclass, iname, iclass) } if (xclass=="integer64" || iclass=="integer64") { nm = c(iname, xname) @@ -95,7 +95,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) { if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L]) set(w, j=wc, value=bit64::as.integer64(w[[wc]])) - } else stop("Incompatible join types: ", nm[2L], " is type integer64 but ", nm[1L], " is type double and contains fractions") + } else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L]) } else { # just integer and double left if (iclass=="double") { diff --git a/R/data.table.R b/R/data.table.R index aaa13da799..0faa04d871 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -129,9 +129,9 @@ replace_dot_alias = function(e) { ) found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE) if (length(found)) { - stop("Object '", used, "' not found. Perhaps you intended ", brackify(found)) + stopf("Object '%s' not found. Perhaps you intended %s", used, brackify(found)) } else { - stop("Object '", used, "' not found amongst ", brackify(ref)) + stopf("Object '%s' not found amongst %s", used, brackify(ref)) } } else { stop(err$message, call.=FALSE) @@ -180,7 +180,7 @@ replace_dot_alias = function(e) { stop("When by and keyby are both provided, keyby must be TRUE or FALSE") } if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304 - else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") + else if (verbose) catf("Argument '%s' after substitute: %s\n", "by", paste(deparse(bysub, width.cutoff=500L), collapse=" ")) } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" @@ -189,7 +189,7 @@ replace_dot_alias = function(e) { if (missing(i) && !missing(on)) { tt = eval.parent(.massagei(substitute(on))) if (!is.list(tt) || !length(names(tt))) { - warning("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join (i.e. join on common names) is invoked. Ignoring on= which is '",class(tt)[1L],"'.") + warningf("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join (i.e. join on common names) is invoked. Ignoring on= which is '%s'.", class(tt)[1L]) on = NULL } else { i = tt @@ -210,7 +210,7 @@ replace_dot_alias = function(e) { missingroll = missing(roll) if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'") if (is.character(roll)) { - if (roll!="nearest") stop("roll is '",roll,"' (type character). Only valid character value is 'nearest'.") + if (roll!="nearest") stopf("roll is '%s' (type character). 
Only valid character value is 'nearest'.", roll) } else { roll = if (isTRUE(roll)) +Inf else as.double(roll) } @@ -225,7 +225,7 @@ replace_dot_alias = function(e) { if (!is.na(nomatch) && nomatch!=0L) stop("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") nomatch = as.integer(nomatch) if (!is.logical(which) || length(which)>1L) stop("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") - if ((isTRUE(which)||is.na(which)) && !missing(j)) stop("which==",which," (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.") + if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stop("j must be provided when with=FALSE") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. @@ -241,7 +241,7 @@ replace_dot_alias = function(e) { substitute2(.j, env), list(.j = substitute(j)) )) - if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) catf("Argument '%s' after substitute: %s\n", "j", paste(deparse(jsub, width.cutoff=500L), collapse=" ")) } } if (!missing(j)) { @@ -269,15 +269,14 @@ replace_dot_alias = function(e) { name = substr(..name, 3L, nchar(..name)) if (!nzchar(name)) stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") if (!exists(name, where=parent.frame())) { - stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. prefix.", - if (exists(..name, where=parent.frame())) - paste0(" Variable '..",name,"' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.") - # We have recommended 'manual' .. prefix in the past, so try to be helpful - else - "" - ) + suggested = if (exists(..name, where=parent.frame())) + gettextf(" Variable '..%s' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.", name) + # We have recommended 'manual' .. prefix in the past, so try to be helpful + else + "" + stopf("Variable '%s' is not found in calling scope. Looking in calling scope because you used the .. prefix.%s", name, suggested) } else if (exists(..name, where=parent.frame())) { - warning("Both '",name,"' and '..", name, "' exist in calling scope. Please remove the '..", name,"' variable in calling scope for clarity.") + warningf("Both '%1$s' and '..%1$s' exist in calling scope. Please remove the '..%1$s' variable in calling scope for clarity.", name) } } ..syms = av @@ -285,7 +284,7 @@ replace_dot_alias = function(e) { } else if (is.name(jsub)) { if (startsWith(as.character(jsub), "..")) stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov if (!with && !exists(as.character(jsub), where=parent.frame())) - stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. 
symbol prefix and remove with=FALSE.") + stopf("Variable '%s' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.", as.character(jsub)) } if (root=="{") { if (length(jsub) == 2L) { @@ -328,7 +327,7 @@ replace_dot_alias = function(e) { substitute2(.i, env), list(.i = substitute(i)) )) - if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) catf("Argument '%s' after substitute: %s\n", "i", paste(deparse(isub, width.cutoff=500L), collapse=" ")) } } if (!missing(i)) { @@ -409,13 +408,13 @@ replace_dot_alias = function(e) { # must be "not found" since isub is a mere symbol col = try(eval(isub, x), silent=TRUE) # is it a column name? msg = if (inherits(col, "try-error")) gettextf( - "'%s' is not found in calling scope and it is not a column name either. ", + "'%s' is not found in calling scope and it is not a column name either", as.character(isub) ) else gettextf( - "'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE} is particularly clear and is optimized. ", + "'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE} is particularly clear and is optimized", as.character(isub), typeof(col) ) - stop(msg, "When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.") + stopf("%s. When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.", msg) } } if (restore.N) { @@ -595,7 +594,7 @@ replace_dot_alias = function(e) { } # TO DO: TODO: Incorporate which_ here on DT[!i] where i is logical. Should avoid i = !i (above) - inefficient. # i is not a data.table - if (!is.logical(i) && !is.numeric(i)) stop("i has evaluated to type ", typeof(i), ". Expecting logical, integer or double.") + if (!is.logical(i) && !is.numeric(i)) stopf("i has evaluated to type %s. Expecting logical, integer or double.", typeof(i)) if (is.logical(i)) { if (is.na(which)) { # #4411 i filter not optimized to join: DT[A > 1, which = NA] ## we need this branch here, not below next to which=TRUE because irows=i=which(i) will filter out NAs: DT[A > 10, which = NA] will be incorrect @@ -617,7 +616,7 @@ replace_dot_alias = function(e) { # Also this which() is for consistency of DT[colA>3,which=TRUE] and which(DT[,colA>3]) # Assigning to 'i' here as well to save memory, #926. - else stop("i evaluates to a logical vector length ", length(i), " but there are ", nrow(x), " rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.") + else stopf("i evaluates to a logical vector length %d but there are %d rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.", length(i), nrow(x)) } else { irows = as.integer(i) # e.g. 
DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning) @@ -716,7 +715,7 @@ replace_dot_alias = function(e) { if (is.factor(j)) j = as.character(j) # fix for FR: #358 if (is.character(j)) { if (notj) { - if (anyNA(idx <- chmatch(j, names_x))) warning("column(s) not removed because not found: ", brackify(j[is.na(idx)])) + if (anyNA(idx <- chmatch(j, names_x))) warningf("column(s) not removed because not found: %s", brackify(j[is.na(idx)])) # all duplicates of the name in names(x) must be removed; e.g. data.table(x=1, y=2, x=3)[, !"x"] should just output 'y'. w = !names_x %chin% j ansvars = names_x[w] @@ -730,13 +729,13 @@ replace_dot_alias = function(e) { if (!length(ansvals)) return(null.data.table()) if (!length(leftcols)) { if (!anyNA(ansvals)) return(.Call(CsubsetDT, x, irows, ansvals)) - else stop("column(s) not found: ", brackify(ansvars[is.na(ansvals)])) + else stopf("column(s) not found: %s", brackify(ansvars[is.na(ansvals)])) } # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below } else if (is.numeric(j)) { j = as.integer(j) - if (any(w<-(j>ncol(x)))) stop("Item ",which.first(w)," of j is ",j[which.first(w)]," which is outside the column number range [1,ncol=", ncol(x),"]") + if (any(w <- (j>ncol(x)))) stopf("Item %d of j is %d which is outside the column number range [1,ncol=%d]", idx <- which.first(w), j[idx], ncol(x)) j = j[j!=0L] if (any(j<0L)) { if (any(j>0L)) stop("j mixes positives and negatives") @@ -789,7 +788,7 @@ replace_dot_alias = function(e) { if (mode(bysub) == "character") { if (length(grep(",", bysub, fixed = TRUE))) { - if (length(bysub)>1L) stop("'by' is a character vector length ",length(bysub)," but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.") + if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.", length(bysub)) bysub = strsplit(bysub,split=",")[[1L]] } backtick_idx = grep("^[^`]+$",bysub) @@ -867,7 +866,7 @@ replace_dot_alias = function(e) { } else bynames = names(byval) if (is.atomic(byval)) { if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) { - stop("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval",deparse(bysub)," should work. This is for efficiency so data.table can detect which columns are needed.") + stopf("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval%s should work. This is for efficiency so data.table can detect which columns are needed.", deparse(bysub)) } else { # by may be a single unquoted column name but it must evaluate to list so this is a convenience to users. 
Could also be a single expression here such as DT[,sum(v),by=colA%%2] byval = list(byval) @@ -886,7 +885,7 @@ replace_dot_alias = function(e) { } } tt = vapply_1i(byval,length) - if (any(tt!=xnrow)) stop(domain=NA, gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow)) + if (any(tt!=xnrow)) stopf("The items in the 'by' or 'keyby' list are length(s) %s. Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", brackify(tt), xnrow) if (is.null(bynames)) bynames = rep.int("",length(byval)) if (length(idx <- which(!nzchar(bynames))) && !bynull) { # TODO: improve this and unify auto-naming of jsub and bysub @@ -920,7 +919,7 @@ replace_dot_alias = function(e) { jvnames = NULL drop_dot = function(x) { - if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items") # nocov + if (length(x)!=1L) stopf("Internal error: drop_dot passed %d items", length(x)) # nocov if (startsWith(x<-as.character(x), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY")) substr(x, 2L, nchar(x)) else @@ -939,7 +938,7 @@ replace_dot_alias = function(e) { # attempt to auto-name unnamed columns for (jj in which(nm=="")) { thisq = q[[jj + 1L]] - if (missing(thisq)) stop(domain=NA, gettextf("Item %d of the .() or list() passed to j is missing", jj)) #3507 + if (missing(thisq)) stopf("Item %d of the .() or list() passed to j is missing", jj) #3507 if (is.name(thisq)) nm[jj] = drop_dot(thisq) # TO DO: if call to a[1] for example, then call it 'a' too } @@ -947,7 +946,7 @@ replace_dot_alias = function(e) { if (length(nm) != length(jvnames)) warning("j may not evaluate to the same number of columns for each group; if you're sure this warning is in error, please put the branching logic outside of [ for efficiency") else if (any(idx <- nm != jvnames)) - warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names. If this was intentional (e.g., you know only one branch will ever be used in a given query because the branch is controlled by a function argument), please (1) pull this branch out of the call; (2) explicitly provide missing defaults for each branch in all cases; or (3) use the same name for each branch and re-name it in a follow-up call.', call. = FALSE) + warningf('Different branches of j expression produced different auto-named columns: %s; using the most "last" names. If this was intentional (e.g., you know only one branch will ever be used in a given query because the branch is controlled by a function argument), please (1) pull this branch out of the call; (2) explicitly provide missing defaults for each branch in all cases; or (3) use the same name for each branch and re-name it in a follow-up call.', brackify(sprintf('%s!=%s', nm[idx], jvnames[idx]))) } jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result. 
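
As a side note on the j auto-naming logic handled just above (unnamed bare symbols keep their column name via drop_dot, remaining unnamed results later fall back to V1, V2, ...), a minimal sketch of the behaviour; DT and its columns here are hypothetical and not part of this patch:

DT = data.table(grp=c("a","a","b"), v=1:3)
DT[, .(v, total=sum(v)), by=grp]   # bare symbol 'v' keeps its own name; 'total' is named explicitly
DT[, .(sum(v), .N), by=grp]        # unnamed expression becomes V1; .N is auto-named N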
@@ -1010,15 +1009,15 @@ replace_dot_alias = function(e) { if (is.function(.SDcols)) { .SDcols = lapply(x, .SDcols) if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA))) - stop("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: ", brackify(names(x)[idx])) + stopf("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: %s", brackify(names(x)[idx])) .SDcols = unlist(.SDcols, use.names = FALSE) } } } if (anyNA(.SDcols)) - stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols)))) + stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) if (is.logical(.SDcols)) { - if (length(.SDcols)!=length(x)) stop(gettextf(".SDcols is a logical vector length %d but there are %d columns", length(.SDcols), length(x))) + if (length(.SDcols)!=length(x)) stopf(".SDcols is a logical vector length %d but there are %d columns", length(.SDcols), length(x)) ansvals = which_(.SDcols, !negate_sdcols) ansvars = sdvars = names_x[ansvals] } else if (is.numeric(.SDcols)) { @@ -1026,13 +1025,13 @@ replace_dot_alias = function(e) { # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices") if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) - stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx))) + stopf(".SDcols is numeric but out of bounds [1, %d] at: %s", ncol(x), brackify(which(idx))) ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols] ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols } else { if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names") if (!all(idx <- .SDcols %chin% names_x)) - stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx])) + stopf("Some items of .SDcols are not column names: %s", brackify(.SDcols[!idx])) ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols # dups = FALSE here. 
DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove) ansvals = chmatch(ansvars, names_x) @@ -1184,9 +1183,9 @@ replace_dot_alias = function(e) { if (is.list(k)) { origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) if (is.character(j)) { - if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is ", length(j)) + if (length(j)!=1L) stopf("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is %d", length(j)) j = match(j, names(k)) - if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov + if (is.na(j)) stopf("Internal error -- item '%s' not found in names of list", origj) # nocov } .Call(Csetlistelt,k,as.integer(j), x) } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { @@ -1215,7 +1214,7 @@ replace_dot_alias = function(e) { xcolsAns = seq_along(ansvars) icols = icolsAns = integer() } else { - if (!length(leftcols)) stop("Internal error -- column(s) not found: ", brackify(ansvars[wna])) # nocov + if (!length(leftcols)) stopf("Internal error -- column(s) not found: %s", brackify(ansvars[wna])) # nocov xcols = w[!wna] xcolsAns = which(!wna) map = c(seq_along(i), leftcols) # this map is to handle dups in leftcols, #3635 @@ -1228,7 +1227,7 @@ replace_dot_alias = function(e) { if (any(w2na <- is.na(w2))) { ivars[leftcols] = paste0("i.",ivars[leftcols]) w2[w2na] = chmatch(ansvars[wna][w2na], ivars) - if (any(w2na <- is.na(w2))) stop("Internal error -- column(s) not found: ", paste(ansvars[wna][w2na],sep=", ")) # nocov + if (any(w2na <- is.na(w2))) stopf("Internal error -- column(s) not found: %s", brackify(ansvars[wna][w2na])) # nocov } } icols = w2 @@ -1257,7 +1256,7 @@ replace_dot_alias = function(e) { getName = substr(sym, 3L, nchar(sym)) if (!exists(getName, parent.frame())) { if (exists(sym, parent.frame())) next # user did 'manual' prefix; i.e. variable in calling scope has .. prefix - stop("Variable '",getName,"' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.") + stopf("Variable '%s' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.", getName) } assign(sym, get(getName, parent.frame()), SDenv) } @@ -1269,7 +1268,7 @@ replace_dot_alias = function(e) { if (!(length(i) && length(icols))) { # new in v1.12.0 to redirect to CsubsetDT in this case if (!identical(xcolsAns, seq_along(xcolsAns)) || length(xcols)!=length(xcolsAns) || length(ansvars)!=length(xcolsAns)) { - stop("Internal error: xcolAns does not pass checks: ", length(xcolsAns), length(ansvars), length(xcols), paste(xcolsAns,collapse=",")) # nocov + stopf("Internal error: xcolAns does not pass checks: %d/%d/%d/%s", length(xcolsAns), length(ansvars), length(xcols), brackify(xcolsAns)) # nocov } # Retained from old R way below (test 1542.01 checks shallow at this point) # ' Temp fix for #921 - skip COPY until after evaluating 'jval' (scroll down). @@ -1354,7 +1353,7 @@ replace_dot_alias = function(e) { # There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write). 
if (is.name(jsub) && is.null(lhs) && !exists(jsubChar<-as.character(jsub), SDenv, inherits=FALSE)) { - stop("j (the 2nd argument inside [...]) is a single symbol but column name '",jsubChar,"' is not found. If you intended to select columns using a variable in calling scope, please try DT[, ..",jsubChar,"]. The .. prefix conveys one-level-up similar to a file system path.") + stopf("j (the 2nd argument inside [...]) is a single symbol but column name '%1$s' is not found. If you intended to select columns using a variable in calling scope, please try DT[, ..%1$s]. The .. prefix conveys one-level-up similar to a file system path.", jsubChar) } jval = eval(jsub, SDenv, parent.frame()) @@ -1890,7 +1889,7 @@ replace_dot_alias = function(e) { # Efficiency gain of dropping names has been successful. Ordinarily this will run. if (is.null(jvnames)) jvnames = character(length(ans)-length(bynames)) if (length(bynames)+length(jvnames)!=length(ans)) - stop("Internal error: jvnames is length ",length(jvnames), " but ans is ",length(ans)," and bynames is ", length(bynames)) # nocov + stopf("Internal error: jvnames is length %d but ans is %d and bynames is %d", length(jvnames), length(ans), length(bynames)) # nocov ww = which(jvnames=="") if (any(ww)) jvnames[ww] = paste0("V",ww) setattr(ans, "names", c(bynames, jvnames)) @@ -1949,8 +1948,7 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { # TODO in future as warned in NEWS for 1.11.6: # warning("length(rownames)>1 is deprecated. Please use rownames.value= instead") if (length(rownames)!=nrow(x)) - stop("length(rownames)==", length(rownames), " but nrow(DT)==", nrow(x), - ". The rownames argument specifies a single column name or number. Consider rownames.value= instead.") + stopf("length(rownames)==%d but nrow(DT)==%d. The rownames argument specifies a single column name or number. Consider rownames.value= instead.", length(rownames), nrow(x)) rownames.value = rownames rownames = NULL } else if (length(rownames)==0L) { @@ -1958,8 +1956,7 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { } else { if (isTRUE(rownames)) { if (length(key(x))>1L) { - warning("rownames is TRUE but key has multiple columns ", - brackify(key(x)), "; taking first column x[,1] as rownames") + warningf("rownames is TRUE but key has multiple columns %s; taking first column x[,1] as rownames", brackify(key(x))) } rownames = if (length(key(x))==1L) chmatch(key(x),names(x)) else 1L } @@ -1969,20 +1966,18 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { } else if (is.character(rownames)) { w = chmatch(rownames, names(x)) - if (is.na(w)) stop("'", rownames, "' is not a column of x") + if (is.na(w)) stopf("'%s' is not a column of x", rownames) rownames = w } else { # rownames is a column number already rownames = as.integer(rownames) if (is.na(rownames) || rownames<1L || rownames>ncol(x)) - stop("as.integer(rownames)==", rownames, - " which is outside the column number range [1,ncol=", ncol(x), "].") + stopf("as.integer(rownames)==%d which is outside the column number range [1,ncol=%d].", rownames, ncol(x)) } } } else if (!is.null(rownames.value)) { if (length(rownames.value)!=nrow(x)) - stop("length(rownames.value)==", length(rownames.value), - " but should be nrow(x)==", nrow(x)) + stopf("length(rownames.value)==%d but should be nrow(x)==%d", length(rownames.value), nrow(x)) } if (!is.null(rownames)) { # extract that column and drop it. 
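
For context on the as.matrix.data.table() rownames handling above, a minimal sketch of the argument forms whose messages were rewritten; DT here is hypothetical and not part of this patch:

DT = data.table(id=c("r1","r2"), a=1:2, b=3:4)
as.matrix(DT, rownames="id")                 # a column name: 'id' supplies the rownames and is dropped
as.matrix(DT, rownames=1L)                   # a column number behaves the same way
as.matrix(DT, rownames.value=c("x","y"))     # explicit values; length must equal nrow(DT)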
@@ -2044,7 +2039,7 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { if (any(dm==0L)) { # retain highest type of input for empty output, #4762 if (length(X)!=0L) - stop("Internal error: as.matrix.data.table length(X)==", length(X), " but a dimension is zero") # nocov + stopf("Internal error: as.matrix.data.table length(X)==%d but a dimension is zero", length(X)) # nocov return(array(if (is.null(X)) NA else X, dim = dm, dimnames = list(rownames.value, cn))) } dim(X) <- c(n, length(X)/n) @@ -2181,7 +2176,7 @@ dimnames.data.table = function(x) { if (!cedta()) return(`dimnames<-.data.frame`(x,value)) # nocov ; will drop key but names<-.data.table (below) is more common usage and does retain the key if (!is.list(value) || length(value) != 2L) stop("attempting to assign invalid object to dimnames of a data.table") if (!is.null(value[[1L]])) stop("data.tables do not have rownames") - if (ncol(x) != length(value[[2L]])) stop("Can't assign ", length(value[[2L]]), " colnames to a ", ncol(x), "-column data.table") + if (ncol(x) != length(value[[2L]])) stopf("Can't assign %d names to a %d-column data.table", length(value[[2L]]), ncol(x)) setnames(x,as.character(value[[2L]])) x # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change. } @@ -2358,7 +2353,7 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR if (".ll.tech.split" %chin% names(x)) stop("Column '.ll.tech.split' is reserved for split.data.table processing") if (".nm.tech.split" %chin% by) stop("Column '.nm.tech.split' is reserved for split.data.table processing") if (!all(by %chin% names(x))) stop("Argument 'by' must refer to column names in x") - if (!all(by.atomic <- vapply_1b(by, function(.by) is.atomic(x[[.by]])))) stop("Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic: ", brackify(by[!by.atomic])) + if (!all(by.atomic <- vapply_1b(by, function(.by) is.atomic(x[[.by]])))) stopf("Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic: %s", brackify(by[!by.atomic])) # list of data.tables (flatten) or list of lists of ... data.tables make.levels = function(x, cols, sorted) { by.order = if (!sorted) x[, funique(.SD), .SDcols=cols] # remember order of data, only when not sorted=FALSE @@ -2547,15 +2542,15 @@ setnames = function(x,old,new,skip_absent=FALSE) { # duplicates are permitted to be created without warning; e.g. in revdeps and for example, and setting spacer columns all with "" if (!is.data.frame(x)) stop("x is not a data.table or data.frame") ncol = length(x) - if (length(names(x)) != ncol) stop("x has ",ncol," columns but its names are length ",length(names(x))) + if (length(names(x)) != ncol) stopf("x has %d columns but its names are length %d", ncol, length(names(x))) stopifnot(isTRUEorFALSE(skip_absent)) if (missing(new) || missing(old)) { # usage: setnames(DT, new = letters[1:n]) if (missing(old)) { old = new; new = NULL } # for setnames(DT,new); e.g., setnames(DT,c("A","B")) where ncol(DT)==2 if (is.function(old)) old = old(names(x)) - if (!is.character(old)) stop("Passed a vector of type '",typeof(old),"'. Needs to be type 'character'.") - if (length(old) != ncol) stop("Can't assign ",length(old)," names to a ",ncol," column data.table") + if (!is.character(old)) stopf("Passed a vector of type '%s'. 
Needs to be type 'character'.", typeof(old)) + if (length(old) != ncol) stopf("Can't assign %d names to a %d-column data.table", length(old), ncol) if (anyNA(names(x))) { # if x somehow has some NA names, which() needs help to return them, #2475 w = which((names(x) != old) | (Encoding(names(x)) != Encoding(old)) | (is.na(names(x)) & !is.na(old))) @@ -2569,15 +2564,15 @@ setnames = function(x,old,new,skip_absent=FALSE) { if (is.function(new)) new = if (is.numeric(old)) new(names(x)[old]) else new(old) if (!is.character(new)) stop("'new' is not a character vector or a function") # if (anyDuplicated(new)) warning("Some duplicates exist in 'new': ", brackify(new[duplicated(new)])) # dups allowed without warning; warn if and when the dup causes an ambiguity - if (anyNA(new)) stop("NA in 'new' at positions ", brackify(which(is.na(new)))) - if (anyDuplicated(old)) stop("Some duplicates exist in 'old': ", brackify(old[duplicated(old)])) + if (anyNA(new)) stopf("NA in 'new' at positions %s", brackify(which(is.na(new)))) + if (anyDuplicated(old)) stopf("Some duplicates exist in 'old': %s", brackify(old[duplicated(old)])) if (is.numeric(old)) i = old = seq_along(x)[old] # leave it to standard R to manipulate bounds and negative numbers - else if (!is.character(old)) stop("'old' is type ",typeof(old)," but should be integer, double or character") - if (length(new)!=length(old)) stop("'old' is length ",length(old)," but 'new' is length ",length(new)) - if (anyNA(old)) stop("NA (or out of bounds) in 'old' at positions ", brackify(which(is.na(old)))) + else if (!is.character(old)) stopf("'old' is type %s but should be integer, double or character", typeof(old)) + if (length(new)!=length(old)) stopf("'old' is length %d but 'new' is length %d", length(old), length(new)) + if (anyNA(old)) stopf("NA (or out of bounds) in 'old' at positions %s", brackify(which(is.na(old)))) if (is.character(old)) { i = chmatchdup(c(old,old), names(x)) # chmatchdup returns the second of any duplicates matched to in names(x) (if any) - if (!all(tt<-is.na(tail(i,length(old))))) warning("Item ",w<-which.first(!tt)," of 'old' is '", old[w],"' which appears several times in column names. Just the first will be changed. There are ", sum(!tt)-1L," other items in old that are also duplicated in column names.") + if (!all(tt<-is.na(tail(i,length(old))))) warningf("Item %d of 'old' is '%s' which appears several times in column names. Just the first will be changed. There are %d other items in 'old' that are also duplicated in column names.", w <- which.first(!tt), old[w], sum(!tt)-1L) i = head(i,length(old)) if (anyNA(i)) { if (isTRUE(skip_absent)) { @@ -2585,7 +2580,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { new = new[w] i = i[w] } else { - stop("Items of 'old' not found in column names: ", brackify(old[is.na(i)]), ". Consider skip_absent=TRUE.") + stopf("Items of 'old' not found in column names: %s. Consider skip_absent=TRUE.", brackify(old[is.na(i)])) } } } @@ -2624,7 +2619,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { setcolorder = function(x, neworder=key(x)) { if (is.character(neworder) && anyDuplicated(names(x))) - stop("x has some duplicated column name(s): ", paste(names(x)[duplicated(names(x))], collapse=","), ". Please remove or rename the duplicate(s) and try again.") + stopf("x has some duplicated column name(s): %s. 
Please remove or rename the duplicate(s) and try again.", brackify(unique(names(x)[duplicated(names(x))]))) # if (!is.data.table(x)) stop("x is not a data.table") neworder = colnamesInt(x, neworder, check_dups=FALSE) # dups are now checked inside Csetcolorder below if (length(neworder) != length(x)) { @@ -2677,7 +2672,7 @@ cbind.data.table = data.table rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { if (is.null(l)) return(null.data.table()) - if (!is.list(l) || is.data.frame(l)) stop("Input is ", class(l)[1L]," but should be a plain list of items to be stacked") + if (!is.list(l) || is.data.frame(l)) stopf("Input is %s but should be a plain list of items to be stacked", class(l)[1L]) if (isFALSE(idcol)) { idcol = NULL } else if (!is.null(idcol)) { if (isTRUE(idcol)) idcol = ".id" @@ -2715,7 +2710,7 @@ setDF = function(x, rownames=NULL) { rn = .set_row_names(nrow(x)) } else { if (length(rownames) != nrow(x)) - stop("rownames incorrect length; expected ", nrow(x), " names, got ", length(rownames)) + stopf("rownames incorrect length; expected %d names, got %d", nrow(x), length(rownames)) rn = rownames } setattr(x, "row.names", rn) @@ -2725,7 +2720,7 @@ setDF = function(x, rownames=NULL) { } else if (is.data.frame(x)) { if (!is.null(rownames)) { if (length(rownames) != nrow(x)) - stop("rownames incorrect length; expected ", nrow(x), " names, got ", length(rownames)) + stopf("rownames incorrect length; expected %d names, got %d", nrow(x), length(rownames)) setattr(x, "row.names", rownames) } x @@ -2748,7 +2743,7 @@ setDF = function(x, rownames=NULL) { rn = .set_row_names(mn) } else { if (length(rownames) != mn) - stop("rownames incorrect length; expected ", mn, " names, got ", length(rownames)) + stopf("rownames incorrect length; expected %d names, got %d", mn, length(rownames)) rn = rownames } setattr(x,"row.names", rn) @@ -2762,21 +2757,21 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { if (is.name(name)) { home = function(x, env) { if (identical(env, emptyenv())) - stop("Cannot find symbol ", cname, call. = FALSE) + stopf("Cannot find symbol %s", cname) else if (exists(x, env, inherits=FALSE)) env else home(x, parent.env(env)) } cname = as.character(name) envir = home(cname, parent.frame()) if (bindingIsLocked(cname, envir)) { - stop("Cannot convert '", cname, "' to data.table by reference because binding is locked. It is very likely that '", cname, "' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.") + stopf("Cannot convert '%1$s' to data.table by reference because binding is locked. It is very likely that '%1$s' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.", cname) } } # check no matrix-like columns, #3760. Other than a single list(matrix) is unambiguous and depended on by some revdeps, #3581 if (length(x)>1L) { idx = vapply_1i(x, function(xi) length(dim(xi)))>1L if (any(idx)) - warning("Some columns are a multi-column type (such as a matrix column): ", brackify(which(idx)),". setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. 
Please consider as.data.table() instead which will create a new column for each embedded column.") + warningf("Some columns are a multi-column type (such as a matrix column): %s. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column.", brackify(which(idx))) } if (is.data.table(x)) { # fix for #1078 and #1128, see .resetclass() for explanation. @@ -2808,14 +2803,13 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { # many operations still work in the presence of NULL columns and it might be convenient # e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which # fail for NULL columns will give helpful error at that point, #3480 and #3471 - if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + if (inherits(x[[i]], "POSIXlt")) stopf("Column %d is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.", i) } n = vapply_1i(x, length) n_range = range(n) if (n_range[1L] != n_range[2L]) { tbl = sort(table(n)) - stop("All elements in argument 'x' to 'setDT' must be of same length, but the profile of input lengths (length:frequency) is: ", - brackify(sprintf('%s:%d', names(tbl), tbl)), "\nThe first entry with fewer than ", n_range[2L], " entries is ", which.max(n 1L) 's', ' not found: [', - paste(p[idx], collapse = ', '), ']') + stop(domain = NA, sprintf(ngettext(length(idx), 'Pattern not found: [%s]', 'Patterns not found: [%s]'), brackify(p[idx]))) matched } @@ -44,7 +43,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { is.symb = sapply(fun.list, is.symbol) bad.i = which((!user.named) & (!is.symb)) if (length(bad.i)) { - stop("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) + stopf("each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: %s", brackify(bad.i)) } names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) fun.list[!user.named] = list(NULL) @@ -52,13 +51,13 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { group.is.formal = names(fun.list) %in% formal.names if (any(group.is.formal)) { bad.names = names(fun.list)[group.is.formal] - stop("group names specified in ... conflict with measure argument names; please fix by changing group names: ", paste(bad.names, collapse=",")) + stopf("group names specified in ... conflict with measure argument names; please fix by changing group names: %s", brackify(bad.names)) } # evaluate each value in ... and stop if not function. for (fun.i in which(user.named)) { fun = eval(fun.list[[fun.i]], parent.frame(1L)) if (!is.function(fun) || length(formals(args(fun)))==0) { - stop("each ... argument to measure must be a function with at least one argument, problem: ", names(fun.list)[[fun.i]]) + stopf("each ... 
argument to measure must be a function with at least one argument, problem: %s", names(fun.list)[[fun.i]]) } fun.list[[fun.i]] = fun } @@ -86,18 +85,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na which(names(fun.list) == "") } if (length(prob.i)) { - stop("in measurev, ", group.desc, " must be named, problems: ", paste(prob.i, collapse=",")) + stopf("in measurev, %s must be named, problems: %s", group.desc, brackify(prob.i)) } err.names.unique = function(err.what, name.vec) { name.tab = table(name.vec) bad.counts = name.tab[1 < name.tab] if (length(bad.counts)) { - stop(err.what, " should be uniquely named, problems: ", paste(names(bad.counts), collapse=",")) + stopf("%s should be uniquely named, problems: %s", err.what, brackify(names(bad.counts))) } } err.args.groups = function(type, N){ if (N != length(fun.list)) { - stop("number of ", group.desc, " =", length(fun.list), " must be same as ", type, " =", N) + stopf("number of %s =%d must be same as %s =%d", group.desc, length(fun.list), type, N) } } err.names.unique(group.desc, names(fun.list)) @@ -136,7 +135,7 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na err.names.unique("measured columns", cols[measure.vec]) uniq.mat = unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { - stop("number of unique column IDs =", nrow(uniq.mat), " is less than number of melted columns =", nrow(group.mat), "; fix by changing pattern/sep") + stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat)) } colnames(group.mat) = names(fun.list) group.dt = data.table(group.mat) @@ -146,14 +145,14 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na group.name = names(fun.list)[[group.i]] fun = fun.list[[group.i]] if (!is.function(fun) || length(formals(args(fun)))==0) { - stop("in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: ", group.name) + stopf("in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: %s", group.name) } group.val = fun(group.dt[[group.name]]) if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { - stop("each conversion function must return an atomic vector with same length as its first argument, problem: ", group.name) + stopf("each conversion function must return an atomic vector with same length as its first argument, problem: %s", group.name) } if (all(is.na(group.val))) { - stop(group.name, " conversion function returned vector of all NA") + stopf("%s conversion function returned vector of all NA", group.name) } set(group.dt, j=group.name, value=group.val) } @@ -164,11 +163,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na # 4. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. 
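The measurev() hunks above, like the rest of this patch, collapse multi-argument stop()/warning() calls into a single sprintf-style template so translators see one complete sentence, with brackify() rendering offending names as a bracketed list and positional specifiers such as %1$s letting one argument be reused. A minimal sketch of that formatting pattern follows; brackify_demo() is a simplified stand-in for data.table's internal brackify() helper (the real one also truncates long vectors), and the second message is shortened from the setDT() locked-binding message earlier in this patch.

## illustration only: simplified stand-in for data.table's internal brackify()
brackify_demo = function(x) sprintf("[%s]", paste(x, collapse=", "))

bad.names = c("sep", "pattern")
## one complete, translatable sentence with a single %s placeholder
gettextf("group names specified in ... conflict with measure argument names; please fix by changing group names: %s",
         brackify_demo(bad.names))

## positional specifiers such as %1$s reuse the same argument
cname = "var"
gettextf("Cannot convert '%1$s' to data.table by reference because binding is locked. Try %1$s <- copy(%1$s) and then setDT again.", cname)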
if (!is.character(group.dt[[multiple.keyword]])) { - stop(multiple.keyword, " column class=", class(group.dt[[multiple.keyword]])[[1L]], " after applying conversion function, but must be character") + stopf("%s column class=%s after applying conversion function, but must be character", multiple.keyword, class(group.dt[[multiple.keyword]])[1L]) } is.other = names(group.dt) != multiple.keyword if (!any(is.other)) { - stop(multiple.keyword, " is the only group; fix by creating at least one more group") + stopf("%s is the only group; fix by creating at least one more group", multiple.keyword) } other.values = lapply(group.dt[, is.other, with=FALSE], unique) other.values$stringsAsFactors = FALSE @@ -210,9 +209,7 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl } } else { if (length(value.name) > 1L) { - warning("'value.name' provided in both 'measure.vars'", - "and 'value.name argument'; value provided in", - "'measure.vars' is given precedence.") + warning("'value.name' provided in both 'measure.vars' and 'value.name argument'; value provided in 'measure.vars' is given precedence.") } if (anyNA(meas.nm) || !all(nzchar(meas.nm))) { stop("Please provide a name to each element of 'measure.vars'.") diff --git a/R/foverlaps.R b/R/foverlaps.R index fc0b706ccd..e663d0a3cb 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -19,7 +19,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k if (maxgap != 0L || minoverlap != 1L) stop("maxgap and minoverlap arguments are not yet implemented.") if (is.null(by.y)) - stop("'y' must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) first, see ?setkey. Also check the examples in ?foverlaps.") + stop("y must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) first, see ?setkey. Also check the examples in ?foverlaps.") if (length(by.x) < 2L || length(by.y) < 2L) stop("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.") if (is.numeric(by.x)) { @@ -37,39 +37,39 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k if (!is.character(by.y)) stop("A non-empty vector of column names or numbers is required for by.y") if (!identical(by.y, key(y)[seq_along(by.y)])) - stop("The first ", length(by.y), " columns of y's key must be identical to the columns specified in by.y.") + stopf("The first %d columns of y's key must be identical to the columns specified in by.y.", length(by.y)) if (anyNA(chmatch(by.x, names(x)))) - stop("Elements listed in 'by.x' must be valid names in data.table 'x'") + stop("Elements listed in 'by.x' must be valid names in data.table x") if (anyDuplicated(by.x) || anyDuplicated(by.y)) stop("Duplicate columns are not allowed in overlap joins. This may change in the future.") if (length(by.x) != length(by.y)) stop("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.") if (any(dup.x<-duplicated(names(x)))) #1730 - handling join possible but would require workarounds on setcolorder further, it is really better just to rename dup column - stop("x has some duplicated column name(s): ",paste(unique(names(x)[dup.x]),collapse=","),". Please remove or rename the duplicate(s) and try again.") + stopf("%s has some duplicated column name(s): %s. 
Please remove or rename the duplicate(s) and try again.", "x", brackify(unique(names(x)[dup.x]))) if (any(dup.y<-duplicated(names(y)))) - stop("y has some duplicated column name(s): ",paste(unique(names(y)[dup.y]),collapse=","),". Please remove or rename the duplicate(s) and try again.") + stopf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "y", brackify(unique(names(y)[dup.y]))) xnames = by.x; xintervals = tail(xnames, 2L) ynames = by.y; yintervals = tail(ynames, 2L) xval1 = x[[xintervals[1L]]]; xval2 = x[[xintervals[2L]]] yval1 = y[[yintervals[1L]]]; yval2 = y[[yintervals[2L]]] if (!storage.mode(xval1) %chin% c("double", "integer") || !storage.mode(xval2) %chin% c("double", "integer") || is.factor(xval1) || is.factor(xval2)) # adding factors to the bunch, #2645 - stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table 'x' and must be integer/numeric type.") + stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table x and must be integer/numeric type.") if ( isTRUEorNA(any(xval2 - xval1 < 0L)) ) { # better error messages as suggested by @msummersgill in #3007. Thanks for the code too. Placing this inside so that it only runs if the general condition is satisfied. Should error anyway here.. So doesn't matter even if runs all if-statements; takes about 0.2s for anyNA check on 200 million elements .. acceptable speed for stoppage, I think, at least for now. if ( anyNA(xval1) ) { - stop("NA values in data.table 'x' start column: '", xintervals[1L],"'. All rows with NA values in the range columns must be removed for foverlaps() to work.") + stopf("NA values in data.table %s '%s' column: '%s'. All rows with NA values in the range columns must be removed for foverlaps() to work.", "x", "start", xintervals[1L]) } else if ( anyNA(xval2) ) { - stop("NA values in data.table 'x' end column: '", xintervals[2L],"'. All rows with NA values in the range columns must be removed for foverlaps() to work.") - } else stop("All entries in column ", xintervals[1L], " should be <= corresponding entries in column ", xintervals[2L], " in data.table 'x'.") + stopf("NA values in data.table %s '%s' column: '%s'. All rows with NA values in the range columns must be removed for foverlaps() to work.", "x", "end", xintervals[2L]) + } else stopf("All entries in column '%s' should be <= corresponding entries in column '%s' in data.table x.", xintervals[1L], xintervals[2L]) } if (!storage.mode(yval1) %chin% c("double", "integer") || !storage.mode(yval2) %chin% c("double", "integer") || is.factor(yval1) || is.factor(yval2)) # adding factors to the bunch, #2645 - stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table 'y' and must be integer/numeric type.") + stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table y and must be integer/numeric type.") if ( isTRUEorNA(any(yval2 - yval1 < 0L) )) { if ( anyNA(yval1) ) { - stop("NA values in data.table 'y' start column: '", yintervals[1L],"'. All rows with NA values in the range columns must be removed for foverlaps() to work.") + stopf("NA values in data.table %s '%s' column: '%s'. All rows with NA values in the range columns must be removed for foverlaps() to work.", "y", "start", yintervals[1L]) } else if ( anyNA(yval2) ) { - stop("NA values in data.table 'y' end column: '", yintervals[2L],"'. 
All rows with NA values in the range columns must be removed for foverlaps() to work.") - } else stop("All entries in column ", yintervals[1L], " should be <= corresponding entries in column ", yintervals[2L], " in data.table 'y'.") + stopf("NA values in data.table %s '%s' column: '%s'. All rows with NA values in the range columns must be removed for foverlaps() to work.", "y", "end", yintervals[2L]) + } else stopf("All entries in column '%s' should be <= corresponding entries in column '%s' in data.table y.", yintervals[1L], yintervals[2L]) } # POSIXct interval cols error check posx_chk = sapply(list(xval1, xval2, yval1, yval2), inherits, 'POSIXct') diff --git a/R/fread.R b/R/fread.R index eb765fe639..c724f22353 100644 --- a/R/fread.R +++ b/R/fread.R @@ -37,7 +37,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") nThread=as.integer(nThread) stopifnot(nThread>=1L) if (!is.null(text)) { - if (!is.character(text)) stop("'text=' is type ", typeof(text), " but must be character.") + if (!is.character(text)) stopf("'text=' is type %s but must be character.", typeof(text)) if (!length(text)) return(data.table()) if (length(text) > 1L) { writeLines(text, tmpFile<-tempfile(tmpdir=tmpdir)) # avoid paste0() which could create a new very long single string in R's memory @@ -83,7 +83,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command cmd = input if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { - message("Taking input= as a system command because it contains a space ('",cmd,"'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.") + messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) } } else { @@ -98,11 +98,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (!is.null(file)) { file_info = file.info(file) - if (is.na(file_info$size)) stop("File '",file,"' does not exist or is non-readable. getwd()=='", getwd(), "'") - if (isTRUE(file_info$isdir)) stop("File '",file,"' is a directory. Not yet implemented.") # dir.exists() requires R v3.2+, #989 + if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd()) + if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # dir.exists() requires R v3.2+, #989 if (!file_info$size) { - warning("File '", file, "' has size 0. Returning a NULL ", - if (data.table) 'data.table' else 'data.frame', ".") + warningf("File '%s' has size 0. 
Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame') return(if (data.table) data.table(NULL) else data.frame(NULL)) } if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { @@ -140,16 +139,16 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopifnot(is.null(na.strings) || is.character(na.strings)) tt = grep("^\\s+$", na.strings) if (length(tt)) { - msg = paste0('na.strings[', tt[1L], ']=="',na.strings[tt[1L]],'" consists only of whitespace, ignoring. ') + msg = gettextf('na.strings[%d]=="%s" consists only of whitespace, ignoring', tt[1L], na.strings[tt[1L]]) if (strip.white) { if (any(na.strings=="")) { - warning(msg, 'strip.white==TRUE (default) and "" is present in na.strings, so any number of spaces in string columns will already be read as .') + warningf('%s. strip.white==TRUE (default) and "" is present in na.strings, so any number of spaces in string columns will already be read as .', msg) } else { - warning(msg, 'Since strip.white=TRUE (default), use na.strings="" to specify that any number of spaces in a string column should be read as .') + warningf('%s. Since strip.white=TRUE (default), use na.strings="" to specify that any number of spaces in a string column should be read as .', msg) } na.strings = na.strings[-tt] } else { - stop(msg, 'But strip.white=FALSE. Use strip.white=TRUE (default) together with na.strings="" to turn any number of spaces in string columns into ') + stopf('%s. But strip.white=FALSE. Use strip.white=TRUE (default) together with na.strings="" to turn any number of spaces in string columns into ', msg) } # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } @@ -159,9 +158,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") # for tracking which YAML elements may be overridden by being declared explicitly call_args = names(match.call()) if (is.character(skip)) - warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ", - "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ", - "the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.") + warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.") # create connection to stream header lines from file: # https://stackoverflow.com/questions/9871307 f = base::file(input, 'r') @@ -170,10 +167,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") yaml_border_re = '^#?---' if (!grepl(yaml_border_re, first_line)) { close(f) - stop(gettextf( + stopf( 'Encountered <%s%s> at the first unskipped line (%d), which does not constitute the start to a valid YAML header (expecting something matching regex "%s"); please check your input and try again.', substr(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...' else '', 1L+skip, yaml_border_re - )) + ) } yaml_comment_re = '^#' @@ -183,8 +180,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") n_read = n_read + 1L if (!length(this_line)){ close(f) - stop('Reached the end of the file before finding a completion to the YAML header. A valid YAML header is bookended by lines matching ', - 'the regex "', yaml_border_re, '". 
Please double check the input file is a valid csvy.') + stopf('Reached the end of the file before finding a completion to the YAML header. A valid YAML header is bookended by lines matching the regex "%s". Please double check the input file is a valid csvy.', yaml_border_re) } if (grepl(yaml_border_re, this_line)) break if (grepl(yaml_comment_re, this_line)) @@ -225,11 +221,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!all(idx_type <- sapply(matched_name_idx, function(ii) { new_names[ii] %chin% colClasses[[ new_types[ii] ]] }))) { - plural = sum(idx_type) > 1L - message('colClasses dictated by user input and those read from YAML header are in conflict (specifically, for column', if (plural) 's', - ' [', paste(new_names[matched_name_idx[!idx_type]], collapse = ','), ']); the proceeding assumes the user input was ', - 'an intentional override and will ignore the types implied by the YAML header; please exclude ', - if (plural) 'these columns' else 'this column from colClasses if this was unintentional.') + messagef('colClasses dictated by user input and those read from YAML header are in conflict (specifically, for column(s) [%s]); the proceeding assumes the user input was an intentional override and will ignore the type(s) implied by the YAML header; please exclude the column(s) from colClasses if this was unintentional.', + brackify(new_names[matched_name_idx[!idx_type]])) } } # only add unmentioned columns @@ -311,8 +304,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") methods::as(v, new_class)) }, warning = fun <- function(e) { - warning("Column '", names(ans)[j], "' was requested to be '", new_class, "' but fread encountered the following ", - if (inherits(e, "error")) "error" else "warning", ":\n\t", e$message, "\nso the column has been left as type '", typeof(v), "'", call.=FALSE) + warningf("Column '%s' was requested to be '%s' but fread encountered the following %s:\n\t%s\nso the column has been left as type '%s'", names(ans)[j], new_class, if (inherits(e, "error")) "error" else "warning", e$message, typeof(v)) return(v) }, error = fun) diff --git a/R/fwrite.R b/R/fwrite.R index 8325f137d3..ab2353464a 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -76,13 +76,11 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } if (NCOL(x)==0L && file!="") { if (file.exists(file)) { - warning("Input has no columns; doing nothing.", - if (!append) - paste("\nIf you intended to overwrite the file at", - file, "with an empty one, please use file.remove first.")) + suggested <- if (append) "" else gettextf("\nIf you intended to overwrite the file at %s with an empty one, please use file.remove first.", file) + warningf("Input has no columns; doing nothing.%s", suggested) return(invisible()) } else { - warning("Input has no columns; creating an empty file at '", file, "' and exiting.") + warningf("Input has no columns; creating an empty file at '%s' and exiting.", file) file.create(file) return(invisible()) } diff --git a/R/groupingsets.R b/R/groupingsets.R index 2300d09da0..6e7ce8131a 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -59,13 +59,13 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("Argument 'id' must be a logical scalar.") # logic constraints validation if (!all((sets.all.by <- unique(unlist(sets))) %chin% by)) - stop("All columns used in 'sets' argument must be in 'by' too. 
Columns used in 'sets' but not present in 'by': ", brackify(setdiff(sets.all.by, by))) + stopf("All columns used in 'sets' argument must be in 'by' too. Columns used in 'sets' but not present in 'by': %s", brackify(setdiff(sets.all.by, by))) if (id && "grouping" %chin% names(x)) stop("When using `id=TRUE` the 'x' data.table must not have a column named 'grouping'.") if (any(vapply_1i(sets, anyDuplicated))) # anyDuplicated returns index of first duplicate, otherwise 0L stop("Character vectors in 'sets' list must not have duplicated column names within a single grouping set.") if (length(sets) > 1L && (idx<-anyDuplicated(lapply(sets, sort)))) - warning("'sets' contains a duplicate (i.e., equivalent up to sorting) element at index ", idx, "; as such, there will be duplicate rows in the output -- note that grouping by A,B and B,A will produce the same aggregations. Use `sets=unique(lapply(sets, sort))` to eliminate duplicates.") + warningf("'sets' contains a duplicate (i.e., equivalent up to sorting) element at index %d; as such, there will be duplicate rows in the output -- note that grouping by A,B and B,A will produce the same aggregations. Use `sets=unique(lapply(sets, sort))` to eliminate duplicates.", idx) # input arguments handling jj = if (!missing(jj)) jj else substitute(j) av = all.vars(jj, TRUE) diff --git a/R/last.R b/R/last.R index 8dff3271a1..0a889e6227 100644 --- a/R/last.R +++ b/R/last.R @@ -35,7 +35,7 @@ last = function(x, n=1L, ...) { } } else { if (!requireNamespace("xts", quietly=TRUE)) - stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last")) # nocov + stopf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last") # nocov if (verbose) catf("%s: using %s: %s\n", "last", "xts::last", "is.xts(x)") xts::last(x, n=n, ...) @@ -76,7 +76,7 @@ first = function(x, n=1L, ...) { } } else { if (!requireNamespace("xts", quietly=TRUE)) - stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first")) # nocov + stopf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first") # nocov if (verbose) catf("%s: using %s: %s\n", "first", "xts::first", "is.xts(x)") xts::first(x, n=n, ...) diff --git a/R/merge.R b/R/merge.R index 8dc59e018b..8ad01de420 100644 --- a/R/merge.R +++ b/R/merge.R @@ -13,9 +13,12 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } x0 = length(x)==0L y0 = length(y)==0L - if (x0 || y0) warning(sprintf(ngettext(x0+y0, - "You are trying to join data.tables where %s has 0 columns.", - "You are trying to join data.tables where %s have 0 columns."), + if (x0 || y0) warning(domain=NA, sprintf( + ngettext( + x0+y0, + "You are trying to join data.tables where %s has 0 columns.", + "You are trying to join data.tables where %s have 0 columns." 
+ ), if (x0 && y0) "'x' and 'y'" else if (x0) "'x'" else "'y'" )) nm_x = names(x) @@ -100,8 +103,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) resultdupnames = names(dt)[duplicated(names(dt))] if (length(resultdupnames)) { - warning("column names ", paste0("'", resultdupnames, "'", collapse=", "), - " are duplicated in the result") + warningf("column names %s are duplicated in the result", brackify(resultdupnames)) } # retain custom classes of first argument that resulted in dispatch to this method, #1378 diff --git a/R/onAttach.R b/R/onAttach.R index 3e93187e2e..110cab69d4 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -18,21 +18,25 @@ } dev = as.integer(v[1L, 3L]) %% 2L == 1L # version number odd => dev if (!isTRUE(getOption("datatable.quiet"))) { # new option in v1.12.4, #3489 - packageStartupMessage("data.table ", v, if(dev)paste0(" IN DEVELOPMENT built ",d,g), - " using ", getDTthreads(verbose=FALSE), " threads (see ?getDTthreads). Latest news: r-datatable.com", domain="R-data.table") + # NB: we need to supply domain= for translation below since the below is technically not run in the data.table namespace + nth = getDTthreads(verbose=FALSE) + if (dev) + packageStartupMessagef(domain="R-data.table", "data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads). ", v, d, g, nth, appendLF=FALSE) + else + packageStartupMessagef(domain="R-data.table", "data.table %s using %d threads (see ?getDTthreads). ", v, nth, appendLF=FALSE) + packageStartupMessage(domain="R-data.table", "Latest news: r-datatable.com") # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. if (gettext(domain="R-data.table", "TRANSLATION CHECK") != "TRANSLATION CHECK") packageStartupMessage(domain="R-data.table", "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) packageStartupMessage(domain="R-data.table", "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") - if (!.Call(ChasOpenMP)) - packageStartupMessage(domain="R-data.table", "**********\n", - "This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", - if (Sys.info()["sysname"]=="Darwin") - "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux." - else - paste0("This is ", Sys.info()["sysname"], ". This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. 
If you see this warning on Windows or Linux, please file a GitHub issue."), - "\n**********") + if (!.Call(ChasOpenMP)) { + packageStartupMessage(domain="R-data.table", "**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) + if (Sys.info()["sysname"] == "Darwin") + packageStartupMessage(domain="R-data.table", "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.\n**********") + else + packageStartupMessagef(domain="R-data.table", "This is %s. This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) + } } } diff --git a/R/onLoad.R b/R/onLoad.R index 3750510ece..0dcfee82a8 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -26,11 +26,11 @@ dll = if (.Platform$OS.type=="windows") "dll" else "so" # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. - stop(domain="R-data.table", "The datatable.",dll," version (",dllV,") does not match the package (",RV,"). Please close all R sessions to release the old ",toupper(dll)," and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.") + stopf(domain="R-data.table", "The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, Rv, toupper(dll)) } builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { - stop(domain="R-data.table", "This is R ", base::getRversion(), " but data.table has been installed using R ",builtUsing,". The major version must match. 
Please reinstall data.table.") + stopf(domain="R-data.table", "This is R %s but data.table has been installed using R %s. The major version must match. Please reinstall data.table.", base::getRversion(), builtUsing) # the if(R>=4.0.0) in NAMESPACE when registering S3 methods rbind.data.table and cbind.data.table happens on install; #3968 } } diff --git a/R/openmp-utils.R b/R/openmp-utils.R index 9df55f1148..1d21937b5f 100644 --- a/R/openmp-utils.R +++ b/R/openmp-utils.R @@ -1,9 +1,9 @@ setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, throttle=NULL) { if (!missing(percent)) { if (!missing(threads)) stop("Provide either threads= or percent= but not both") - if (length(percent)!=1) stop("percent= is provided but is length ", length(percent)) + if (length(percent)!=1) stopf("percent= is provided but is length %d", length(percent)) percent=as.integer(percent) - if (is.na(percent) || percent<2L || percent>100L) stop("percent==",percent," but should be a number between 2 and 100") + if (is.na(percent) || percent<2L || percent>100L) stopf("percent==%d but should be a number between 2 and 100", percent) invisible(.Call(CsetDTthreads, percent, restore_after_fork, TRUE, as.integer(throttle))) } else { invisible(.Call(CsetDTthreads, as.integer(threads), restore_after_fork, FALSE, as.integer(throttle))) diff --git a/R/print.data.table.R b/R/print.data.table.R index 4e666ca22e..e935c01da4 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -235,9 +235,7 @@ trunc_cols_message = function(not_printed, abbs, class, col.names){ n = length(not_printed) if (class && col.names != "none") classes = paste0(" ", tail(abbs, n)) else classes = "" cat(sprintf( - ngettext(n, - "%d variable not shown: %s\n", - "%d variables not shown: %s\n"), + ngettext(n, "%d variable not shown: %s\n", "%d variables not shown: %s\n"), n, brackify(paste0(not_printed, classes)) )) } diff --git a/R/programming.R b/R/programming.R index b4d25012f8..c0b9574a9a 100644 --- a/R/programming.R +++ b/R/programming.R @@ -16,8 +16,7 @@ list2lang = function(x) { to.name = !asis & char if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience if (any(non.scalar.char <- vapply(x[to.name], length, 0L)!=1L)) { - stop("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: ", - paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) + stopf("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: %s", brackify(names(non.scalar.char)[non.scalar.char])) } x[to.name] = lapply(x[to.name], as.name) } diff --git a/R/setkey.R b/R/setkey.R index e9f18398ab..cca6361cce 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -54,7 +54,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (!all(nzchar(cols))) stop("cols contains some blanks.") cols = gsub("`", "", cols, fixed = TRUE) miss = !(cols %chin% colnames(x)) - if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) + if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss])) ## determine, whether key is already present: if (identical(key(x),cols)) { @@ -79,7 +79,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (".xi" %chin% names(x)) stop("x contains a column called '.xi'. 
Conflicts with internal use by data.table.") for (i in cols) { .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column - if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") + if (!typeof(.xi) %chin% ORDERING_TYPES) stopf("Column '%s' is type '%s' which is not supported as a key column type, currently.", i, typeof(.xi)) } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov @@ -128,7 +128,7 @@ getindex = function(x, name) { # name can be "col", or "col1__col2", or c("col1","col2") ans = attr(attr(x, 'index', exact=TRUE), paste0("__",name,collapse=""), exact=TRUE) if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) { - stop("Internal error: index '",name,"' exists but is invalid") # nocov + stopf("Internal error: index '%s' exists but is invalid", name) # nocov } ans } @@ -284,11 +284,11 @@ setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) # remove backticks from cols cols = gsub("`", "", cols, fixed = TRUE) miss = !(cols %chin% colnames(x)) - if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) + if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss])) if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") for (i in cols) { .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column - if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.") + if (!typeof(.xi) %chin% ORDERING_TYPES) stopf("Column '%s' is type '%s' which is not supported for ordering currently.", i, typeof(.xi)) } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 
'cols' should be character at this point in setkey; please report.") # nocov @@ -337,7 +337,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) y = l[[i]] if (!length(y)) next if (sorted) { - if (!is.atomic(y)) stop("'sorted' is TRUE but element ", i, " is non-atomic, which can't be sorted; try setting sorted = FALSE") + if (!is.atomic(y)) stopf("'sorted' is TRUE but element %d is non-atomic, which can't be sorted; try setting sorted = FALSE", i) o = forderv(y, retGrp=TRUE) thisdups = attr(o, 'maxgrpn', exact=TRUE)>1L if (thisdups) { @@ -352,7 +352,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) } } nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0 - if (nrow > .Machine$integer.max) stop(domain=NA, gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max)) + if (nrow > .Machine$integer.max) stopf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max) l = .Call(Ccj, l) setDT(l) l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since diff --git a/R/setops.R b/R/setops.R index d8fcb9dfcf..7f21603b59 100644 --- a/R/setops.R +++ b/R/setops.R @@ -14,11 +14,11 @@ setdiff_ = function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE) icnam = names(y)[lc] xcnam = names(x)[rc] if ( is.character(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) { - stop("When x's column ('",xcnam,"') is character, the corresponding column in y ('",icnam,"') should be factor or character, but found incompatible type '",typeof(y[[lc]]),"'.") + stopf("When x's column ('%s') is character, the corresponding column in y ('%s') should be factor or character, but found incompatible type '%s'.", xcnam, icnam, typeof(y[[lc]])) } else if ( is.factor(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) { - stop("When x's column ('",xcnam,"') is factor, the corresponding column in y ('",icnam,"') should be character or factor, but found incompatible type '",typeof(y[[lc]]),"'.") + stopf("When x's column ('%s') is factor, the corresponding column in y ('%s') should be character or factor, but found incompatible type '%s'.", xcnam, icnam, typeof(y[[lc]])) } else if ( (is.integer(x[[rc]]) || is.double(x[[rc]])) && (is.logical(y[[lc]]) || is.character(y[[lc]])) ) { - stop("When x's column ('",xcnam,"') is integer or numeric, the corresponding column in y ('",icnam,"') can not be character or logical types, but found incompatible type '",typeof(y[[lc]]),"'.") + stopf("When x's column ('%s') is integer or numeric, the corresponding column in y ('%s') can not be character or logical types, but found incompatible type '%s'.", xcnam, icnam, typeof(y[[lc]])) } } ux = unique(shallow(x, by.x)) @@ -42,8 +42,7 @@ funique = function(x) { if (!identical(names(x), names(y))) stop("x and y must have the same column order") bad_types = c("raw", "complex", if (block_list) "list") found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof)) - if (any(found)) stop("unsupported column type", if (sum(found) > 1L) "s" else "", - " found in x or y: ", brackify(bad_types[found])) + if (any(found)) stop(domain=NA, sprintf(ngettext(sum(found), "unsupported column type found in x or y: %s", "unsupported column types found in x or y: %s"), brackify(bad_types[found]))) super = function(x) { # allow character->factor and integer->numeric 
because from v1.12.4 i's type is retained by joins, #3820 ans = class(x)[1L] @@ -51,7 +50,7 @@ funique = function(x) { } if (!identical(sx<-sapply(x, super), sy<-sapply(y, super))) { w = which.first(sx!=sy) - stop("Item ",w," of x is '",class(x[[w]])[1L],"' but the corresponding item of y is '", class(y[[w]])[1L], "'.") + stopf("Item %d of x is '%s' but the corresponding item of y is '%s'.", w, class(x[[w]])[1L], class(y[[w]])[1L]) } if (.seqn && ".seqn" %chin% names(x)) stop("None of the datasets should contain a column named '.seqn'") } @@ -189,9 +188,9 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu stop("None of the datasets to compare should contain a column named '.seqn'") bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list")) if (any(bad.type)) - stop("Datasets to compare with 'ignore.row.order' must not have unsupported column types: ", brackify(names(bad.type)[bad.type])) + stopf("Datasets to compare with 'ignore.row.order' must not have unsupported column types: %s", brackify(names(bad.type)[bad.type])) if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) { - warning("Argument 'tolerance' was forced to lowest accepted value `sqrt(.Machine$double.eps)` from provided ", format(tolerance, scientific=FALSE)) + warningf("Argument 'tolerance' was forced to lowest accepted value `sqrt(.Machine$double.eps)` from provided %s", format(tolerance, scientific=FALSE)) tolerance = sqrt(.Machine$double.eps) } target_dup = as.logical(anyDuplicated(target)) diff --git a/R/tables.R b/R/tables.R index b94441c626..99c59f0c4d 100644 --- a/R/tables.R +++ b/R/tables.R @@ -23,7 +23,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, KEY = list(key(DT)), INDICES = if (index) list(indices(DT))) })) - if (!order.col %chin% names(info)) stop("order.col='",order.col,"' not a column name of info") + if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names if (!silent) { # prettier printing on console diff --git a/R/test.data.table.R b/R/test.data.table.R index cf778c68b6..d28319aa78 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -46,7 +46,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov start fn2 = paste0(fn,".bz2") if (!file.exists(file.path(fulldir, fn2))) - stop(domain=NA, gettextf("Neither %s nor %s exist in %s",fn, fn2, fulldir)) + stopf("Neither %s nor %s exist in %s",fn, fn2, fulldir) fn = fn2 # nocov end # sys.source() below accepts .bz2 directly. @@ -151,7 +151,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (inherits(err,"try-error")) { # nocov start if (silent) return(FALSE) - stop("Failed after test ", env$prevtest, " before the next test() call in ",fn) + stopf("Failed after test %s before the next test() call in %s", env$prevtest, fn) # the try() above with silent=FALSE will have already printed the error itself # nocov end } @@ -166,7 +166,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F nfail, "%d error out of %d. Search %s for test number %s", "%d errors out of %d. 
Search %s for test numbers %s" - ), nfail, ntest, names(fn), paste(env$whichfail, collapse=", ") + ), + nfail, ntest, names(fn), toString(env$whichfail) )) # important to stop() here, so that 'R CMD check' fails # nocov end @@ -176,7 +177,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F timings = env$timings DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT if ((x<-sum(timings[["nTest"]])) != ntest) { - warning("Timings count mismatch: ",x," vs ",ntest) # nocov + warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov } catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) print(DT, class=FALSE) @@ -305,7 +306,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no showProgress = FALSE # nocov } if (!missing(error) && !missing(y)) - stop("Test ",numStr," is invalid: when error= is provided it does not make sense to pass y as well") # nocov + stopf("Test %s is invalid: when error= is provided it does not make sense to pass y as well", numStr) # nocov string_match = function(x, y, ignore.case=FALSE) { length(grep(x, y, fixed=TRUE)) || # try treating x as literal first; useful for most messages containing ()[]+ characters diff --git a/R/translation.R b/R/translation.R new file mode 100644 index 0000000000..192c425d85 --- /dev/null +++ b/R/translation.R @@ -0,0 +1,21 @@ +# templated warning/error functions to smooth translation & development + +catf = function(fmt, ..., sep=" ", domain=NULL) { + cat(gettextf(fmt, ..., domain=domain), sep=sep) +} + +stopf = function(fmt, ..., domain=NULL) { + stop(gettextf(fmt, ..., domain=domain), domain=NA, call. = FALSE) +} + +warningf = function(fmt, ..., immediate.=FALSE, noBreaks.=FALSE, domain=NULL) { + warning(gettextf(fmt, ..., domain=domain), domain=NA, call.=FALSE, immediate.=immediate., noBreaks.=noBreaks.) 
+} + +messagef = function(fmt, ..., appendLF=TRUE, domain=NULL) { + message(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) +} + +packageStartupMessagef = function(fmt, ..., appendLF=TRUE, domain=NULL) { + packageStartupMessage(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) +} diff --git a/R/transpose.R b/R/transpose.R index 25085c5c21..61dc56abb9 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -4,12 +4,12 @@ transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names if (is.character(make.names)) { m = chmatch(make.names, names(l)) if (is.na(m)) - stop("make.names='",make.names,"' not found in names of input") + stopf("make.names='%s' not found in names of input", make.names) make.names = m } else { make.names = as.integer(make.names) if (is.na(make.names) || make.names<1L || make.names>length(l)) - stop("make.names=",make.names," is out of range [1,ncol=",length(l),"]") + stopf("make.names=%d is out of range [1,ncol=%d]", make.names, length(l)) } colnames = as.character(l[[make.names]]) l = if (is.data.table(l)) l[,-make.names,with=FALSE] else l[-make.names] @@ -31,8 +31,7 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { keep = suppressWarnings(as.integer(keep)) chk = min(keep) >= min(1L, length(ans)) & max(keep) <= length(ans) if (!isTRUE(chk)) # handles NA case too - stop("'keep' should contain integer values between ", - min(1L, length(ans)), " and ", length(ans), ".") + stopf("'keep' should contain integer values between %d and %d.", min(1L, length(ans)), length(ans)) ans = ans[keep] } # Implementing #1094, but default FALSE @@ -41,8 +40,7 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { else if (isTRUE(names)) names = paste0("V", seq_along(ans)) if (length(names) != length(ans)) { str = if (missing(keep)) "ans" else "keep" - stop("length(names) (= ", length(names), - ") is not equal to length(", str, ") (= ", length(ans), ").") + stopf("length(names) (= %d) is not equal to length(%s) (= %d).", length(names), str, length(ans)) } setattr(ans, 'names', names) ans diff --git a/R/utils.R b/R/utils.R index 7a698131c6..3a180dc951 100644 --- a/R/utils.R +++ b/R/utils.R @@ -152,8 +152,3 @@ edit.data.table = function(name, ...) { setDT(NextMethod('edit', name))[] } # nocov end - -catf = function(fmt, ...) { - cat(gettextf(fmt, ...)) -} - diff --git a/R/xts.R b/R/xts.R index fce6aad3b5..0a89bf3892 100644 --- a/R/xts.R +++ b/R/xts.R @@ -7,7 +7,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { r = setDT(as.data.frame(x, row.names=NULL)) if (identical(keep.rownames, FALSE)) return(r[]) index_nm = if (is.character(keep.rownames)) keep.rownames else "index" - if (index_nm %chin% names(x)) stop(domain=NA, gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm)) + if (index_nm %chin% names(x)) stopf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm) r[, c(index_nm) := zoo::index(x), env=list(x=x)] setcolorder(r, c(index_nm, setdiff(names(r), index_nm))) # save to end to allow for key=index_nm @@ -19,7 +19,7 @@ as.xts.data.table = function(x, ...) 
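The helpers added in R/translation.R above route every message through gettextf() before raising the condition, so a single format string serves as both the translation key and the runtime template. A brief usage sketch, assuming the stopf() definition just added (repeated verbatim so the snippet is self-contained); the column index 3L is a made-up value for illustration, and the ngettext() call mirrors the plural-aware warning added in the merge.data.table hunk earlier in this patch.

stopf = function(fmt, ..., domain=NULL) {
  stop(gettextf(fmt, ..., domain=domain), domain=NA, call. = FALSE)
}

## capture the formatted condition message rather than aborting the session
tryCatch(
  stopf("Column %d is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again.", 3L),
  error = conditionMessage
)

## count-dependent messages keep both full sentences visible to translators
n = 2L
sprintf(
  ngettext(n,
    "You are trying to join data.tables where %s has 0 columns.",
    "You are trying to join data.tables where %s have 0 columns."),
  "'x' and 'y'")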
{ stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x)) if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index - if (!all(colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric])) + if (!all(colsNumeric)) warningf("Following columns are not numeric and will be omitted: %s", brackify(names(colsNumeric)[!colsNumeric])) r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]]) return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]])) } diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 88c6a99e6f..f2026259ce 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -251,7 +251,7 @@ test(7.12, f(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5 d = data.table(a = 2:1, b = 1:4) test(11.01, d[var3%in%values, .(var1 = f(var2)), by=var3, env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), - verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) + verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) # data.table symbols and chars d = data.table(a = c("b","a"), b = 1:4) out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, @@ -261,8 +261,8 @@ test(11.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) out = grep("Argument.*substitute", out, value=TRUE) test(11.021, length(out), 3L) # we expect i, j, by only here, ensure about that test(11.022, "Argument 'by' after substitute: a" %in% out, TRUE) -test(11.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) -test(11.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) +test(11.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("b","c"))), verbose=TRUE]) @@ -270,8 +270,8 @@ test(11.03, ans, data.table(a=c("b"), res=c(4L), key="a")) out = grep("Argument.*substitute", out, value=TRUE) test(11.031, length(out), 3L) test(11.032, "Argument 'by' after substitute: a" %in% out, TRUE) -test(11.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) -test(11.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) +test(11.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) # substitute2 during join d1 = data.table(id1=1:4, v1=5) d2 = data.table(id1=c(0L,2:3), v1=6) @@ -279,7 +279,7 @@ out = capture.output(ans <- d1[d2, on="id1<=id1", .(c1, c2, c3, c4), env=list(c1 test(11.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) out = grep("Argument.*substitute", out, value=TRUE) test(11.042, length(out), 2L) ## 2L because i is non-missing attempt 
to substitute is made -test(11.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) +test(11.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) d1 = data.table(id1=c(2L,4L,2L,4L), v1=5) d2 = data.table(id1=c(0L,2:3), v1=6) out = capture.output(ans <- d1[dd, on="id1<=id1", .(sum(c3), sum(c4)), by=by, env=list(dd="d2", c3="x.v1", c4="i.v1", by=".EACHI"), verbose=TRUE]) @@ -287,8 +287,8 @@ test(11.044, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) out = grep("Argument.*substitute", out, value=TRUE) test(11.045, length(out), 3L) test(11.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) -test(11.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) -test(11.048, "Argument 'i' after substitute: d2" %in% out, TRUE) +test(11.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) +test(11.048, "Argument 'i' after substitute: d2" %in% out, TRUE) dt1 = data.table(x = letters[1:5], y = 1:5) dt2 = data.table(x = letters[1:3], y = 11:13) target_v = "y" @@ -326,9 +326,9 @@ f = function(x, i, j, by) { x[.i, .j, .by, env=list(.i=substitute(i), .j=substitute(j), .by=substitute(by)), verbose=TRUE] } test(11.083, f(d), d) -test(11.084, f(d, 1), d[1], output="Argument 'i' after substitute", notOutput="Argument 'j' after substitute") -test(11.085, f(d,, 1), d[,1], output="Argument 'j' after substitute", notOutput="Argument 'i' after substitute") -test(11.086, f(d, 1, 1), d[1, 1], output="Argument 'j' after substitute.*Argument 'i' after substitute") +test(11.084, f(d, 1), d[1], output="Argument 'i' after substitute", notOutput="Argument 'j' after substitute") +test(11.085, f(d,, 1), d[,1], output="Argument 'j' after substitute", notOutput="Argument 'i' after substitute") +test(11.086, f(d, 1, 1), d[1, 1], output="Argument 'j' after substitute.*Argument 'i' after substitute") #1985 weird exception when by contains get tb = data.table(x=c(1,2), y=c(3,4), z=c(5,6), w=c("a","b")) @@ -362,7 +362,7 @@ test(11.107, ans, data.table(Type=factor(c("Quebec","Mississippi"), levels=c("Qu out = grep("Argument.*substitute", out, value=TRUE) test(11.108, length(out), 2L) test(11.109, "Argument 'by' after substitute: Type" %in% out, TRUE) -test(11.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) +test(11.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) #628 Change j=list(xout=eval(...))'s eval to eval within scope of DT dat = data.table(x_one=1:10, x_two=1:10, y_one=1:10, y_two=1:10) f = function(vars) as.call(c(quote(list), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) @@ -488,8 +488,8 @@ test(101.01, dt$x_ratio, x_rat) test(101.02, dt$y_ratio, y_rat) test(101.03, length(grep("Argument.*substitute", out[["x"]], value=TRUE)), 1L) test(101.04, length(grep("Argument.*substitute", out[["y"]], value=TRUE)), 1L) -test(101.05, "Argument 'j' after substitute: `:=`(x_ratio, (x3 - x2) * (x2 - x1) * (x3 - x1)/sqrt(x1^2 + x2^2 + x3^2))" %in% out[["x"]], TRUE) -test(101.06, "Argument 'j' after substitute: `:=`(y_ratio, (y3 - y2) * (y2 - y1) * (y3 - y1)/sqrt(y1^2 + y2^2 + y3^2))" %in% out[["y"]], TRUE) +test(101.05, "Argument 'j' after substitute: `:=`(x_ratio, (x3 - x2) * (x2 - x1) * (x3 - x1)/sqrt(x1^2 + x2^2 + x3^2))" %in% out[["x"]], TRUE) +test(101.06, "Argument 'j' after substitute: `:=`(y_ratio, (y3 - y2) * (y2 - 
y1) * (y3 - y1)/sqrt(y1^2 + y2^2 + y3^2))" %in% out[["y"]], TRUE) daily_cor = function(data, x, y) { ## daily correlation of user input features data[, .(cor = cor(x, y)), keyby = date, @@ -499,7 +499,7 @@ daily_cor = function(data, x, y) { ## daily correlation of user input features out = capture.output(ans <- daily_cor(dt, "x0", "y2")) test(101.07, length(grep("Argument.*substitute", out, value=TRUE)), 2L) ## 'by' (or 'keyby') is not substituted here but it still goes via substitute2 because it is non-missing test(101.08, "Argument 'by' after substitute: date" %in% out, TRUE) -test(101.09, "Argument 'j' after substitute: .(cor = cor(x0, y2))" %in% out, TRUE) +test(101.09, "Argument 'j' after substitute: .(cor = cor(x0, y2))" %in% out, TRUE) group_cor = function(data, x, y, g) { ## group cor comparison of user input features cor_dt = data[, lapply(.SD, function(x) cor(x, Y)), keyby = .(group = GROUP), @@ -511,11 +511,11 @@ group_cor = function(data, x, y, g) { ## group cor comparison of user input feat out = capture.output(dt1 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp1")) test(101.10, length(grep("Argument.*substitute", out, value=TRUE)), 2L) test(101.11, "Argument 'by' after substitute: .(group = grp1)" %in% out, TRUE) -test(101.12, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +test(101.12, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) out = capture.output(dt2 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp2")) test(101.13, length(grep("Argument.*substitute", out, value=TRUE)), 2L) test(101.14, "Argument 'by' after substitute: .(group = grp2)" %in% out, TRUE) -test(101.15, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +test(101.15, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) stats_dt1 = as.data.table(list( x = c("x0", "x1", "x2"), min = c(-0.325967794724422, -0.126026585686073, -0.398950077203113), @@ -551,8 +551,8 @@ out = capture.output(ans <- cor_xy(xdt, ydt, c("x1", "x2"), "y10")) exp = as.data.table(list(symbol = 1:2, x1 = c(0.529292252112253, 0.0301956035638738), x2 = c(0.287076866252898, -0.335969587268599)), key="symbol") test(102.01, ans, exp) test(102.02, length(grep("Argument.*substitute", out, value=TRUE)), 2L) -test(102.03, "Argument 'j' after substitute: `:=`(y, y10)" %in% out, TRUE) -test(102.04, "Argument 'i' after substitute: ydt" %in% out, TRUE) +test(102.03, "Argument 'j' after substitute: `:=`(y, y10)" %in% out, TRUE) +test(102.04, "Argument 'i' after substitute: ydt" %in% out, TRUE) cor_xy2 = function(xdt, ydt, x, y) { ## cor between each pair of x and y rbindlist(lapply(y, function(yi) { xdt[ydt, y := Y, on = .(symbol, date), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 65c595d273..05fea1b7c6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2362,7 +2362,7 @@ test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda")) # setcolorder and merge check for dup column names, #2193(ii) setnames(DT2,"b","a") -test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): a. Please remove or rename") +test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): [a]. Please remove or rename") test(829, merge(DT1,DT2), error="y has some duplicated column name(s): [a]. Please remove or rename") test(830, merge(DT2,DT1), error="x has some duplicated column name(s): [a]. 
Please remove or rename") @@ -5678,7 +5678,7 @@ for (i in seq_along(dt)) { x = data.table(chr=c("Chr1", "Chr1", "Chr2", "Chr2", "Chr2"), start=c(5,10, 1, 25, 50), end=c(11,20,4,52,60)) y = data.table(chr=c("Chr1", "Chr1", "Chr2"), start=c(1, 15,1), end=c(4, 18, 55), val=1:3) # no by.x and by.y error -test(1371.1, foverlaps(x, y, type="any"), error="'y' must be keyed (i.e., sorted, and, marked as sorted).") +test(1371.1, foverlaps(x, y, type="any"), error="y must be keyed (i.e., sorted, and, marked as sorted).") setkey(y, chr, end, start) test(1371.2, foverlaps(x, y, by.y=1:3, type="any"), error="The first 3 columns of y's key must be identical to the columns specified in by.y.") setkey(y, chr, start, end) @@ -13827,7 +13827,7 @@ test(1967.30, foverlaps(x, y), error = 'must be integer/numeric type') x[ , end := as.integer(end)] test(1967.31, foverlaps(x, y, by.x = c('end', 'start')), - error = 'All entries in column end should be <= corresponding entries') + error = "All entries in column 'end' should be <= corresponding entries") y[ , end := as.character(end)] setkey(y, start, end) test(1967.32, foverlaps(x, y), @@ -13835,7 +13835,7 @@ test(1967.32, foverlaps(x, y), y[ , end := as.integer(end)] setkey(y, end, start) test(1967.33, foverlaps(x, y, by.x = c('start', 'end'), by.y = c('end', 'start')), - error = 'All entries in column end should be <= corresponding entries') + error = "All entries in column 'end' should be <= corresponding entries") ## data.table.R test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) @@ -14109,7 +14109,7 @@ test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*type 'raw'.*not.*supported") test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") -test(1984.09, DT[, sum(a), by=.(1,1:2)], error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting') +test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list are length(s) [1, 2]. 
Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") options('datatable.optimize' = Inf) test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), @@ -14124,7 +14124,7 @@ test(1984.16, DT[1, 1+3i] <- 4, error='j must be vector of') test(1984.17, dimnames(DT) <- 5, error = 'attempting to assign invalid object') test(1984.18, dimnames(DT) <- list(5, 5, 5), error = 'attempting to assign invalid object') test(1984.19, dimnames(DT) <- list(5, 5), error = 'data.tables do not have rownames') -test(1984.20, dimnames(DT) <- list(NULL, 5), error = "Can't assign 1 colnames") +test(1984.20, dimnames(DT) <- list(NULL, 5), error = "Can't assign 1 names") dimnames(DT) <- list(NULL, 1:5) test(1984.21, names(DT), paste0(1:5)) DT = data.table(a = 1:10) @@ -16058,12 +16058,12 @@ test(2074.23, capture.output(print(DT2, topn=1L, col.names='none')), # foverlaps x = data.table(start=NA_integer_, end=1L, key='start,end') y = copy(x) -test(2074.24, foverlaps(x, y), error="NA values in data.table 'x' start column") +test(2074.24, foverlaps(x, y), error="NA values in data.table x 'start' column") x[ , start := 0L] setkey(x, start, end) -test(2074.25, foverlaps(x, y), error="NA values in data.table 'y' start column") +test(2074.25, foverlaps(x, y), error="NA values in data.table y 'start' column") setkey(y, end, start) -test(2074.26, foverlaps(x, y), error="NA values in data.table 'y' end column") +test(2074.26, foverlaps(x, y), error="NA values in data.table y 'end' column") # cube test(2074.27, cube(DT, by=1L), error="Argument 'by' must be a character") @@ -17563,15 +17563,15 @@ test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NUL test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword iris.dt = data.table(datasets::iris) test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") -test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: 1") -test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: 1,2") +test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]") +test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]") test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim") test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list =3 must be same as max number of items after splitting column names =2") test(2183.00042, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function()1), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") test(2183.00043, melt(DTid, 
measure.vars=measurev(list(value.name=NULL, istr=interactive), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") test(2183.00044, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function(x)1), pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list =3 must be same as number of capture groups in pattern =2") -test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: value.name") +test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. test(2183.00060, melt(DTid, measure.vars=measurev(list(letter=myfac, value.name=NULL), pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) @@ -17623,9 +17623,9 @@ test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, patt test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") -test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") -test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") -test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: value.name") +test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]") +test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]") +test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: [value.name]") # measure with factor conversion. myfac = function(x)factor(x)#user-defined conversion function. 
test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) @@ -17642,15 +17642,15 @@ test(2183.65, melt(iris.days, measure.vars=measure(pattern="day")), error="patte test(2183.66, melt(iris.days, measure.vars=measure(value.name, pattern="(.*)")), error="value.name is the only group; fix by creating at least one more group") test(2183.67, melt(iris.days, measure.vars=measure(foo, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") test(2183.68, melt(iris.days, measure.vars=measure(value.name, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") -test(2183.69, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured columns should be uniquely named, problems: ff") -test(2183.70, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured columns should be uniquely named, problems: f_f") +test(2183.69, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured columns should be uniquely named, problems: [ff]") +test(2183.70, melt(data.table(f_f=1, f_f=2), measure.vars=measure(letter, number)), error="measured columns should be uniquely named, problems: [f_f]") test(2183.71, melt(iris.days, measure.vars=measure(value.name=as.integer, variable, pattern="day(.)[.](.*)")), error="value.name column class=integer after applying conversion function, but must be character") test(2183.72, melt(data.table(ff=1, ff=2, a=3, b=4), measure.vars=measure(letter, pattern="([ab])"), id.vars="ff"), data.table(ff=1, letter=c("a","b"), value=c(3,4)))#duplicate column names are fine if they are not matched by pattern. -test(2183.73, melt(DTid, measure.vars=measure(letter, multiple.keyword, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: multiple.keyword") +test(2183.73, melt(DTid, measure.vars=measure(letter, multiple.keyword, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: [multiple.keyword]") test(2183.74, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=as.integer, pattern="([ab])([12])")), error="multiple.keyword must be a character string") test(2183.75, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=NA_character_, pattern="([ab])([12])")), error="multiple.keyword must be a character string") test(2183.76, melt(DTid, measure.vars=measure(letter, number, multiple.keyword="", pattern="([ab])([12])")), error="multiple.keyword must be a character string with nchar>0") -test(2183.77, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: cols") +test(2183.77, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12])")), error="group names specified in ... 
conflict with measure argument names; please fix by changing group names: [cols]") test(2183.78, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="cols must be a character vector of column names") test(2183.79, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer)), error="pattern must be character string") test(2183.80, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") From 5774db8d9bf4f37b3fdc3755a47c838c613258fb Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 8 Jul 2021 07:20:03 -0700 Subject: [PATCH 305/588] cleanup knitr file after test (#5067) --- tests/knitr.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/knitr.R b/tests/knitr.R index 5ead0c4de5..0940f429ed 100644 --- a/tests/knitr.R +++ b/tests/knitr.R @@ -2,6 +2,7 @@ if (suppressPackageStartupMessages(requireNamespace("knitr", quietly = TRUE))) { require(knitr) knit("knitr.Rmd", quiet=TRUE) cat(readLines("knitr.md"), sep="\n") + file.remove("knitr.md") } else { cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") } From f84fb935114124748c08d6786012e0dc6435421d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 8 Jul 2021 07:51:23 -0700 Subject: [PATCH 306/588] fully migrate to templated messages (#5068) --- R/AllS4.R | 2 +- R/IDateTime.R | 14 +-- R/as.data.table.R | 16 +-- R/between.R | 8 +- R/bmerge.R | 4 +- R/data.table.R | 222 +++++++++++++++++++++--------------------- R/duplicated.R | 4 +- R/fcast.R | 24 ++--- R/fmelt.R | 28 +++--- R/foverlaps.R | 46 ++++----- R/frank.R | 12 +-- R/fread.R | 40 ++++---- R/fwrite.R | 14 +-- R/groupingsets.R | 40 ++++---- R/merge.R | 38 ++++---- R/onAttach.R | 10 +- R/onLoad.R | 20 ++-- R/openmp-utils.R | 2 +- R/print.data.table.R | 14 +-- R/programming.R | 16 +-- R/setkey.R | 54 +++++----- R/setops.R | 26 ++--- R/test.data.table.R | 19 ++-- R/timetaken.R | 2 +- R/transpose.R | 2 +- R/uniqlist.R | 2 +- R/utils.R | 10 +- R/xts.R | 6 +- inst/tests/tests.Rraw | 60 ++++++------ 29 files changed, 375 insertions(+), 380 deletions(-) diff --git a/R/AllS4.R b/R/AllS4.R index 1e6244fbcb..fc3db0fa09 100644 --- a/R/AllS4.R +++ b/R/AllS4.R @@ -1,5 +1,5 @@ ## Functions to let data.table play nicer with S4 -if ("package:data.table" %in% search()) stop("data.table package loaded. When developing don't load package") +if ("package:data.table" %in% search()) stopf("data.table package loaded. When developing don't load package") ## Allows data.table to be defined as an object of an S4 class, ## or even have data.table be a super class of an S4 class. diff --git a/R/IDateTime.R b/R/IDateTime.R index c84c173a72..9fd57b2b7a 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -93,27 +93,27 @@ round.IDate = function (x, digits=c("weeks", "months", "quarters", "years"), ... 
return(e1) # TODO: investigate Ops.IDate method a la Ops.difftime if (inherits(e1, "difftime") || inherits(e2, "difftime")) - stop("Internal error -- difftime objects may not be added to IDate, but Ops dispatch should have intervened to prevent this") # nocov + stopf("Internal error -- difftime objects may not be added to IDate, but Ops dispatch should have intervened to prevent this") # nocov if (isReallyReal(e1) || isReallyReal(e2)) { return(`+.Date`(e1, e2)) # IDate doesn't support fractional days; revert to base Date } if (inherits(e1, "Date") && inherits(e2, "Date")) - stop("binary + is not defined for \"IDate\" objects") + stopf("binary + is not defined for \"IDate\" objects") (setattr(as.integer(unclass(e1) + unclass(e2)), "class", c("IDate", "Date"))) # () wrap to return visibly } `-.IDate` = function (e1, e2) { if (!inherits(e1, "IDate")) { if (inherits(e1, 'Date')) return(base::`-.Date`(e1, e2)) - stop("can only subtract from \"IDate\" objects") + stopf("can only subtract from \"IDate\" objects") } if (storage.mode(e1) != "integer") - stop("Internal error: storage mode of IDate is somehow no longer integer") # nocov + stopf("Internal error: storage mode of IDate is somehow no longer integer") # nocov if (nargs() == 1L) - stop("unary - is not defined for \"IDate\" objects") + stopf("unary - is not defined for \"IDate\" objects") if (inherits(e2, "difftime")) - stop("Internal error -- difftime objects may not be subtracted from IDate, but Ops dispatch should have intervened to prevent this") # nocov + stopf("Internal error -- difftime objects may not be subtracted from IDate, but Ops dispatch should have intervened to prevent this") # nocov if ( isReallyReal(e2) ) { # IDate deliberately doesn't support fractional days so revert to base Date @@ -301,7 +301,7 @@ clip_msec = function(secs, action) { truncate = as.integer(secs), nearest = as.integer(round(secs)), ceil = as.integer(ceiling(secs)), - stop("Valid options for ms are 'truncate', 'nearest', and 'ceil'.") + stopf("Valid options for ms are 'truncate', 'nearest', and 'ceil'.") ) } diff --git a/R/as.data.table.R b/R/as.data.table.R index 2f1a336868..5a547149ea 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -83,15 +83,15 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) { as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, value.name="value", na.rm=TRUE, ...) 
{ dx = dim(x) if (length(dx) <= 2L) - stop("as.data.table.array method should only be called for arrays with 3+ dimensions; use the matrix method for 2-dimensional arrays") + stopf("as.data.table.array method should only be called for arrays with 3+ dimensions; use the matrix method for 2-dimensional arrays") if (!is.character(value.name) || length(value.name)!=1L || is.na(value.name) || !nzchar(value.name)) - stop("Argument 'value.name' must be scalar character, non-NA and at least one character") + stopf("Argument 'value.name' must be scalar character, non-NA and at least one character") if (!is.logical(sorted) || length(sorted)!=1L || is.na(sorted)) - stop("Argument 'sorted' must be scalar logical and non-NA") + stopf("Argument 'sorted' must be scalar logical and non-NA") if (!is.logical(na.rm) || length(na.rm)!=1L || is.na(na.rm)) - stop("Argument 'na.rm' must be scalar logical and non-NA") + stopf("Argument 'na.rm' must be scalar logical and non-NA") if (!missing(sorted) && !is.null(key)) - stop("Please provide either 'key' or 'sorted', but not both.") + stopf("Please provide either 'key' or 'sorted', but not both.") dnx = dimnames(x) # NULL dimnames will create integer keys, not character as in table method @@ -137,7 +137,7 @@ as.data.table.list = function(x, if (is.null(xi)) next # eachncol already initialized to 0 by integer() above if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE if ("POSIXlt" %chin% class(xi)) { - warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + warningf("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") xi = x[[i]] = as.POSIXct(xi) } else if (is.matrix(xi) || is.data.frame(xi)) { if (!is.data.table(xi)) { @@ -193,7 +193,7 @@ as.data.table.list = function(x, k = k+1L } } - if (any(vnames==".SD")) stop("A column may not be called .SD. That has special meaning.") + if (any(vnames==".SD")) stopf("A column may not be called .SD. That has special meaning.") if (check.names) vnames = make.names(vnames, unique=TRUE) setattr(ans, "names", vnames) setDT(ans, key=key) # copy ensured above; also, setDT handles naming @@ -207,7 +207,7 @@ as.data.table.list = function(x, # for now. This addresses #1078 and #1128 .resetclass = function(x, class) { if (length(class)!=1L) - stop("class must be length 1") # nocov + stopf("class must be length 1") # nocov cx = class(x) n = chmatch(class, cx) # chmatch accepts length(class)>1 but next line requires length(n)==1 unique( c("data.table", "data.frame", tail(cx, length(cx)-n)) ) diff --git a/R/between.R b/R/between.R index 0ece7f3ef5..42925637ad 100644 --- a/R/between.R +++ b/R/between.R @@ -1,6 +1,6 @@ # is x[i] in between lower[i] and upper[i] ? 
between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) { - if (is.logical(x)) stop("between has been passed an argument x of type logical") + if (is.logical(x)) stopf("between has been passed an argument x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is.px = function(x) inherits(x, "POSIXct") @@ -33,7 +33,7 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) } } if (is.i64(x)) { - if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov + if (!requireNamespace("bit64", quietly=TRUE)) stopf("trying to use integer64 class when 'bit64' package is not installed") # nocov if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) } @@ -45,8 +45,8 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) .Call(Cbetween, x, lower, upper, incbounds, NAbounds, check) } else { if (isTRUE(getOption("datatable.verbose"))) catf("optimised between not available for this data type, fallback to slow R routine\n") - if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stop("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") - if (check && any(lower>upper, na.rm=TRUE)) stop("Some lower>upper for this non-numeric and non-character type") + if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stopf("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") + if (check && any(lower>upper, na.rm=TRUE)) stopf("Some lower>upper for this non-numeric and non-character type") if (incbounds) x>=lower & x<=upper # this & is correct not && else x> lower & x< upper } diff --git a/R/bmerge.R b/R/bmerge.R index 6d5b30a244..ddaedc1b3d 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -150,7 +150,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos nqgrp = integer(0L) nqmaxgrp = 1L if (verbose) catf("Non-equi join operators detected ... \n") - if (roll != FALSE) stop("roll is not implemented for non-equi joins yet.") + if (roll != FALSE) stopf("roll is not implemented for non-equi joins yet.") if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()} # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well! xo = forderv(x, xcols, retGrp=TRUE) @@ -170,7 +170,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf. if (nqmaxgrp > 1L) { # got some non-equi join work to do - if ("_nqgrp_" %in% names(x)) stop("Column name '_nqgrp_' is reserved for non-equi joins.") + if ("_nqgrp_" %in% names(x)) stopf("Column name '_nqgrp_' is reserved for non-equi joins.") if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... 
");flush.console()} set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp) xo = forderv(nqx, c(ncol(nqx), xcols)) diff --git a/R/data.table.R b/R/data.table.R index 0faa04d871..c15d65f034 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -38,7 +38,7 @@ is.ff = function(x) inherits(x, "ff") # define this in data.table so that we do #} #NROW = function(x) { # if (is.data.frame(x) || is.data.table(x)) return(nrow(x)) -# if (is.list(x) && !is.ff(x)) stop("List is not a data.frame or data.table. Convert first before using NROW") # list may have different length elements, which data.table and data.frame's resolve. +# if (is.list(x) && !is.ff(x)) stopf("List is not a data.frame or data.table. Convert first before using NROW") # list may have different length elements, which data.table and data.frame's resolve. # if (is.array(x)) nrow(x) else length(x) #} @@ -60,7 +60,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #48 ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names, .named=nd$.named) # see comments inside as.data.table.list re copies if (!is.null(key)) { - if (!is.character(key)) stop("key argument of data.table() must be character") + if (!is.character(key)) stopf("key argument of data.table() must be character") if (length(key)==1L) { key = strsplit(key,split=",")[[1L]] # eg key="A,B"; a syntax only useful in key argument to data.table(), really. @@ -134,7 +134,7 @@ replace_dot_alias = function(e) { stopf("Object '%s' not found amongst %s", used, brackify(ref)) } } else { - stop(err$message, call.=FALSE) + stopf(err$message) } } @@ -154,8 +154,8 @@ replace_dot_alias = function(e) { return(ans) } if (!missing(verbose)) { - if (!is.integer(verbose) && !is.logical(verbose)) stop("verbose must be logical or integer") - if (length(verbose)!=1 || anyNA(verbose)) stop("verbose must be length 1 non-NA") + if (!is.integer(verbose) && !is.logical(verbose)) stopf("verbose must be logical or integer") + if (length(verbose)!=1 || anyNA(verbose)) stopf("verbose must be length 1 non-NA") # set the global verbose option because that is fetched from C code without having to pass it through oldverbose = options(datatable.verbose=verbose) on.exit(options(oldverbose)) @@ -163,7 +163,7 @@ replace_dot_alias = function(e) { .global$print="" missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector if (missingby || missing(j)) { - if (!missingby) warning("Ignoring by/keyby because 'j' is not supplied") + if (!missingby) warningf("Ignoring by/keyby because 'j' is not supplied") by = bysub = NULL keyby = FALSE } else { @@ -177,7 +177,7 @@ replace_dot_alias = function(e) { if (missing(keyby)) keyby = FALSE else if (!isTRUEorFALSE(keyby)) - stop("When by and keyby are both provided, keyby must be TRUE or FALSE") + stopf("When by and keyby are both provided, keyby must be TRUE or FALSE") } if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304 else if (verbose) catf("Argument '%s' after substitute: %s\n", "by", paste(deparse(bysub, width.cutoff=500L), collapse=" ")) @@ -202,32 +202,32 @@ replace_dot_alias = function(e) { if (!is.null(names(sys.call())) && # not relying on nargs() as it considers DT[,] to have 3 arguments, #3163 tryCatch(!is.symbol(tt_isub), error=function(e)TRUE) && # a symbol that inherits missingness from caller isn't missing for our 
purpose; test 1974 tryCatch(!is.symbol(tt_jsub), error=function(e)TRUE)) { - warning("i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future.") + warningf("i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future.") } return(x) } - if (!mult %chin% c("first","last","all")) stop("mult argument can only be 'first', 'last' or 'all'") + if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'") missingroll = missing(roll) - if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'") + if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'") if (is.character(roll)) { if (roll!="nearest") stopf("roll is '%s' (type character). Only valid character value is 'nearest'.", roll) } else { roll = if (isTRUE(roll)) +Inf else as.double(roll) } force(rollends) - if (!is.logical(rollends)) stop("rollends must be a logical vector") - if (length(rollends)>2L) stop("rollends must be length 1 or 2") + if (!is.logical(rollends)) stopf("rollends must be a logical vector") + if (length(rollends)>2L) stopf("rollends must be length 1 or 2") if (length(rollends)==1L) rollends=rep.int(rollends,2L) # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one. .unsafe.opt() #3585 missingnomatch = missing(nomatch) if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857 - if (!is.na(nomatch) && nomatch!=0L) stop("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") + if (!is.na(nomatch) && nomatch!=0L) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") nomatch = as.integer(nomatch) - if (!is.logical(which) || length(which)>1L) stop("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") + if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) - if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") - if (!with && missing(j)) stop("j must be provided when with=FALSE") + if (!is.na(nomatch) && is.na(which)) stopf("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") + if (!with && missing(j)) stopf("j must be provided when with=FALSE") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() @@ -267,7 +267,7 @@ replace_dot_alias = function(e) { if (length(av)) { for (..name in av) { name = substr(..name, 3L, nchar(..name)) - if (!nzchar(name)) stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") + if (!nzchar(name)) stopf("The symbol .. is invalid. The .. 
prefix must be followed by at least one character.") if (!exists(name, where=parent.frame())) { suggested = if (exists(..name, where=parent.frame())) gettextf(" Variable '..%s' does exist in calling scope though, so please just removed the .. prefix from that variable name in calling scope.", name) @@ -282,7 +282,7 @@ replace_dot_alias = function(e) { ..syms = av } } else if (is.name(jsub)) { - if (startsWith(as.character(jsub), "..")) stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov + if (startsWith(as.character(jsub), "..")) stopf("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov if (!with && !exists(as.character(jsub), where=parent.frame())) stopf("Variable '%s' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.", as.character(jsub)) } @@ -292,7 +292,7 @@ replace_dot_alias = function(e) { root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" } else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") { #2142 -- j can be {} and have length 1 - stop("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}") + stopf("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}") } } if (root=="eval" && !any(all.vars(jsub[[2L]]) %chin% names_x)) { @@ -310,9 +310,9 @@ replace_dot_alias = function(e) { if (root == ":=") { allow.cartesian=TRUE # (see #800) if (!missing(i) && keyby) - stop(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") + stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") if (!missingnomatch) { - warning("nomatch isn't relevant together with :=, ignoring nomatch") + warningf("nomatch isn't relevant together with :=, ignoring nomatch") nomatch=0L } } @@ -368,7 +368,7 @@ replace_dot_alias = function(e) { } if (isub %iscall% "!") { notjoin = TRUE - if (!missingnomatch) stop("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch."); + if (!missingnomatch) stopf("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch."); nomatch = 0L isub = isub[[2L]] # #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!" @@ -397,7 +397,7 @@ replace_dot_alias = function(e) { if (getOption("datatable.optimize")>=1L) assign("order", forder, ienv) i = tryCatch(eval(.massagei(isub), x, ienv), error=function(e) { if (grepl(":=.*defined for use in j.*only", e$message)) - stop("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number.") + stopf("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). 
Please double-check the syntax. Run traceback(), and debugger() to get a line number.") else .checkTypos(e, names_x) }) @@ -426,7 +426,7 @@ replace_dot_alias = function(e) { if (is.numeric(i) && ncol(i)==1L) { # #826 - subset DT on single integer vector stored as matrix i = as.integer(i) } else { - stop("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please report to data.table issue tracker if you'd like this, or add your comments to FR #657.") + stopf("i is invalid type (matrix). Perhaps in future a 2 column matrix could return a list of elements of DT (in the spirit of A[B] in FAQ 2.14). Please report to data.table issue tracker if you'd like this, or add your comments to FR #657.") } } if (is.logical(i)) { @@ -449,7 +449,7 @@ replace_dot_alias = function(e) { if (is.data.table(i)) { if (missing(on)) { if (!haskey(x)) { - stop("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + stopf("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") } } else if (identical(substitute(on), as.name(".NATURAL"))) { naturaljoin = TRUE @@ -457,7 +457,7 @@ replace_dot_alias = function(e) { if (naturaljoin) { # natural join #629 common_names = intersect(names_x, names(i)) len_common_names = length(common_names) - if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables") + if (!len_common_names) stopf("Attempting to do natural join but no common columns in provided tables") if (verbose) { which_cols_msg = if (len_common_names == length(x)) { catf("Joining but 'x' has no key, natural join using all 'x' columns") @@ -536,10 +536,10 @@ replace_dot_alias = function(e) { if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] } else { if (length(xo) && missing(on)) - stop("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov + stopf("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov # since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too. if (length(irows)) - stop("Internal error. irows has length in by=.EACHI") # nocov + stopf("Internal error. irows has length in by=.EACHI") # nocov } if (nqbyjoin) { irows = if (length(xo)) xo[irows] else irows @@ -590,7 +590,7 @@ replace_dot_alias = function(e) { } else { if (!missing(on)) { - stop("logical error. i is not a data.table, but 'on' argument is provided.") + stopf("logical error. i is not a data.table, but 'on' argument is provided.") } # TO DO: TODO: Incorporate which_ here on DT[!i] where i is logical. Should avoid i = !i (above) - inefficient. 
# i is not a data.table @@ -598,7 +598,7 @@ replace_dot_alias = function(e) { if (is.logical(i)) { if (is.na(which)) { # #4411 i filter not optimized to join: DT[A > 1, which = NA] ## we need this branch here, not below next to which=TRUE because irows=i=which(i) will filter out NAs: DT[A > 10, which = NA] will be incorrect - if (notjoin) stop("internal error: notjoin and which=NA (non-matches), huh? please provide reproducible example to issue tracker") # nocov + if (notjoin) stopf("internal error: notjoin and which=NA (non-matches), huh? please provide reproducible example to issue tracker") # nocov return(which(is.na(i) | !i)) } if (length(i)==1L # to avoid unname copy when length(i)==nrow (normal case we don't want to slow down) @@ -628,7 +628,7 @@ replace_dot_alias = function(e) { } } if (notjoin) { - if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov + if (byjoin || !is.integer(irows) || is.na(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov irows = irows[irows!=0L] if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()} i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) @@ -688,10 +688,10 @@ replace_dot_alias = function(e) { # TODO: make these both errors (or single long error in both cases) in next release. # i.e. using with=FALSE together with := at all will become an error. Eventually with will be removed. if (is.null(names(jsub)) && is.name(jsub[[2L]])) { - warning("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning.") + warningf("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning.") jsub[[2L]] = eval(jsub[[2L]], parent.frame(), parent.frame()) } else { - warning("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.") + warningf("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.") } with = TRUE } @@ -738,7 +738,7 @@ replace_dot_alias = function(e) { if (any(w <- (j>ncol(x)))) stopf("Item %d of j is %d which is outside the column number range [1,ncol=%d]", idx <- which.first(w), j[idx], ncol(x)) j = j[j!=0L] if (any(j<0L)) { - if (any(j>0L)) stop("j mixes positives and negatives") + if (any(j>0L)) stopf("j mixes positives and negatives") j = seq_along(x)[j] # all j are <0 here } # 3013 -- handle !FALSE in column subset in j via logical+with @@ -746,7 +746,7 @@ replace_dot_alias = function(e) { if (!length(j)) return(null.data.table()) return(.Call(CsubsetDT, x, irows, j)) } else { - stop("When with=FALSE, j-argument should be of type logical/character/integer indicating the columns to select.") # fix for #1440. + stopf("When with=FALSE, j-argument should be of type logical/character/integer indicating the columns to select.") # fix for #1440. 
} } else { # with=TRUE and byjoin could be TRUE bynames = NULL @@ -778,7 +778,7 @@ replace_dot_alias = function(e) { # or look for column names used in this by (since if none it wouldn't find column names anyway # when evaled within full x[irows]). Trouble is that colA%%2L is a call and should be within frame. tt = eval(bysub, parent.frame(), parent.frame()) - if (!is.character(tt)) stop("by=c(...), key(...) or names(...) must evaluate to 'character'") + if (!is.character(tt)) stopf("by=c(...), key(...) or names(...) must evaluate to 'character'") bysub=tt } else if (is.call(bysub) && !(bysub[[1L]] %chin% c("list", "as.list", "{", ".", ":"))) { # potential use of function, ex: by=month(date). catch it and wrap with "(", because we need to set "bysameorder" to FALSE as we don't know if the function will return ordered results just because "date" is ordered. Fixes #2670. @@ -832,7 +832,7 @@ replace_dot_alias = function(e) { } else byval = eval(bysub, x, parent.frame()) } else { # length 0 when i returns no rows - if (!is.integer(irows)) stop("Internal error: irows isn't integer") # nocov + if (!is.integer(irows)) stopf("Internal error: irows isn't integer") # nocov # Passing irows as i to x[] below has been troublesome in a rare edge case. # irows may contain NA, 0, negatives and >nrow(x) here. That's all ok. # But we may need i join column values to be retained (where those rows have no match), hence we tried eval(isub) @@ -877,11 +877,11 @@ replace_dot_alias = function(e) { bynames = names(byval) } } - if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)") + if (!is.list(byval)) stopf("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)") if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)() if (!bynull) for (jj in seq_len(length(byval))) { if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) { - stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type)) + stopf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type) } } tt = vapply_1i(byval,length) @@ -944,7 +944,7 @@ replace_dot_alias = function(e) { } if (!is.null(jvnames)) { if (length(nm) != length(jvnames)) - warning("j may not evaluate to the same number of columns for each group; if you're sure this warning is in error, please put the branching logic outside of [ for efficiency") + warningf("j may not evaluate to the same number of columns for each group; if you're sure this warning is in error, please put the branching logic outside of [ for efficiency") else if (any(idx <- nm != jvnames)) warningf('Different branches of j expression produced different auto-named columns: %s; using the most "last" names. 
If this was intentional (e.g., you know only one branch will ever be used in a given query because the branch is controlled by a function argument), please (1) pull this branch out of the call; (2) explicitly provide missing defaults for each branch in all cases; or (3) use the same name for each branch and re-name it in a follow-up call.', brackify(sprintf('%s!=%s', nm[idx], jvnames[idx]))) } @@ -1023,13 +1023,13 @@ replace_dot_alias = function(e) { } else if (is.numeric(.SDcols)) { .SDcols = as.integer(.SDcols) # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' - if (length(unique(sign(.SDcols))) > 1L) stop(".SDcols is numeric but has both +ve and -ve indices") + if (length(unique(sign(.SDcols))) > 1L) stopf(".SDcols is numeric but has both +ve and -ve indices") if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) stopf(".SDcols is numeric but out of bounds [1, %d] at: %s", ncol(x), brackify(which(idx))) ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols] ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols } else { - if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names") + if (!is.character(.SDcols)) stopf(".SDcols should be column numbers or names") if (!all(idx <- .SDcols %chin% names_x)) stopf("Some items of .SDcols are not column names: %s", brackify(.SDcols[!idx])) ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols @@ -1044,7 +1044,7 @@ replace_dot_alias = function(e) { # added 'mget' - fix for #994 if (any(c("get", "mget") %chin% av)){ if (verbose) - cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars))) + catf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars)) # get('varname') is too difficult to detect which columns are used in general # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. @@ -1073,7 +1073,7 @@ replace_dot_alias = function(e) { } # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols } else { - if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.") + if (!missing(.SDcols)) warningf("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.") allcols = c(names_x, xdotprefix, names_i, idotprefix) ansvars = sdvars = setdiff(intersect(av, allcols), bynames) if (verbose) catf("Detected that j uses these columns: %s\n",if (!length(ansvars)) "" else brackify(ansvars)) @@ -1089,7 +1089,7 @@ replace_dot_alias = function(e) { newnames = NULL suppPrint = identity if (length(av) && av[1L] == ":=") { - if (.Call(C_islocked, x)) stop(".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference.") + if (.Call(C_islocked, x)) stopf(".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. 
Use := in j directly to modify by group by reference.") suppPrint = function(x) { .global$print=address(x); x } # Suppress print when returns ok not on error, bug #2376. Thanks to: http://stackoverflow.com/a/13606880/403310 # All appropriate returns following this point are wrapped; i.e. return(suppPrint(x)). @@ -1097,7 +1097,7 @@ replace_dot_alias = function(e) { if (is.null(names(jsub))) { # regular LHS:=RHS usage, or `:=`(...) with no named arguments (an error) # `:=`(LHS,RHS) is valid though, but more because can't see how to detect that, than desire - if (length(jsub)!=3L) stop("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") + if (length(jsub)!=3L) stopf("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { @@ -1109,20 +1109,20 @@ replace_dot_alias = function(e) { } else { # `:=`(c2=1L,c3=2L,...) lhs = names(jsub)[-1L] - if (any(lhs=="")) stop("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") + if (any(lhs=="")) stopf("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") names(jsub)="" jsub[[1L]]=as.name("list") } av = all.vars(jsub,TRUE) - if (!is.atomic(lhs)) stop("LHS of := must be a symbol, or an atomic vector (column names or positions).") + if (!is.atomic(lhs)) stopf("LHS of := must be a symbol, or an atomic vector (column names or positions).") if (is.character(lhs)) { m = chmatch(lhs, names_x) } else if (is.numeric(lhs)) { m = as.integer(lhs) - if (any(m<1L | ncol(x)1L) { # TODO in future as warned in NEWS for 1.11.6: - # warning("length(rownames)>1 is deprecated. Please use rownames.value= instead") + # warningf("length(rownames)>1 is deprecated. Please use rownames.value= instead") if (length(rownames)!=nrow(x)) stopf("length(rownames)==%d but nrow(DT)==%d. The rownames argument specifies a single column name or number. Consider rownames.value= instead.", length(rownames), nrow(x)) rownames.value = rownames rownames = NULL } else if (length(rownames)==0L) { - stop("length(rownames)==0 but should be a single column name or number, or NULL") + stopf("length(rownames)==0 but should be a single column name or number, or NULL") } else { if (isTRUE(rownames)) { if (length(key(x))>1L) { @@ -2070,12 +2070,12 @@ tail.data.table = function(x, n=6L, ...) { else `[<-.data.frame`(x, i, j, value) return(setalloccol(x)) # over-allocate (again). Avoid all this by using :=. } - # TO DO: warning("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='") + # TO DO: warningf("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='") if (!missing(i)) { isub=substitute(i) i = eval(.massagei(isub), x, parent.frame()) if (is.matrix(i)) { - if (!missing(j)) stop("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j") + if (!missing(j)) stopf("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j") x = `[<-.data.frame`(x, i, value=value) return(setalloccol(x)) } @@ -2085,15 +2085,15 @@ tail.data.table = function(x, n=6L, ...) { # named 'value'". So, users have to use := for that. 
} else i = NULL # meaning (to C code) all rows, without allocating 1L:nrow(x) vector if (missing(j)) j=names(x) - if (!is.atomic(j)) stop("j must be an atomic vector, see ?is.atomic") - if (anyNA(j)) stop("NA in j") + if (!is.atomic(j)) stopf("j must be an atomic vector, see ?is.atomic") + if (anyNA(j)) stopf("NA in j") if (is.character(j)) { newnames = setdiff(j,names(x)) cols = as.integer(chmatch(j, c(names(x),newnames))) # We can now mix existing columns and new columns } else { - if (!is.numeric(j)) stop("j must be vector of column name or positions") - if (any(j>ncol(x))) stop("Attempt to assign to column position greater than ncol(x). Create the column by name, instead. This logic intends to catch (most likely) user errors.") + if (!is.numeric(j)) stopf("j must be vector of column name or positions") + if (any(j>ncol(x))) stopf("Attempt to assign to column position greater than ncol(x). Create the column by name, instead. This logic intends to catch (most likely) user errors.") cols = as.integer(j) # for convenience e.g. to convert 1 to 1L newnames = NULL } @@ -2165,7 +2165,7 @@ as.list.data.table = function(x, ...) { dimnames.data.table = function(x) { if (!cedta()) { if (!inherits(x, "data.frame")) - stop("data.table inherits from data.frame (from v1.5), but this data.table does not. Has it been created manually (e.g. by using 'structure' rather than 'data.table') or saved to disk using a prior version of data.table?") + stopf("data.table inherits from data.frame (from v1.5), but this data.table does not. Has it been created manually (e.g. by using 'structure' rather than 'data.table') or saved to disk using a prior version of data.table?") return(`dimnames.data.frame`(x)) } list(NULL, names(x)) @@ -2174,8 +2174,8 @@ dimnames.data.table = function(x) { "dimnames<-.data.table" = function (x, value) # so that can do colnames(dt)=<..> as well as names(dt)=<..> { if (!cedta()) return(`dimnames<-.data.frame`(x,value)) # nocov ; will drop key but names<-.data.table (below) is more common usage and does retain the key - if (!is.list(value) || length(value) != 2L) stop("attempting to assign invalid object to dimnames of a data.table") - if (!is.null(value[[1L]])) stop("data.tables do not have rownames") + if (!is.list(value) || length(value) != 2L) stopf("attempting to assign invalid object to dimnames of a data.table") + if (!is.null(value[[1L]])) stopf("data.tables do not have rownames") if (ncol(x) != length(value[[2L]])) stopf("Can't assign %d names to a %d-column data.table", length(value[[2L]]), ncol(x)) setnames(x,as.character(value[[2L]])) x # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change. @@ -2253,7 +2253,7 @@ subset.data.table = function (x, subset, select, ...) 
e = substitute(subset) r = eval(e, x, parent.frame()) if (!is.logical(r)) - stop("'subset' must evaluate to logical") + stopf("'subset' must evaluate to logical") r = r & !is.na(r) } @@ -2296,7 +2296,7 @@ na.omit.data.table = function (object, cols = seq_along(object), invert = FALSE, # compare to stats:::na.omit.data.frame if (!cedta()) return(NextMethod()) # nocov if ( !missing(invert) && is.na(as.logical(invert)) ) - stop("Argument 'invert' must be logical TRUE/FALSE") + stopf("Argument 'invert' must be logical TRUE/FALSE") cols = colnamesInt(object, cols, check_dups=FALSE) ix = .Call(Cdt_na, object, cols) # forgot about invert with no NA case, #2660 @@ -2337,22 +2337,22 @@ Ops.data.table = function(e1, e2 = NULL) } split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) { - if (!is.data.table(x)) stop("x argument must be a data.table") + if (!is.data.table(x)) stopf("x argument must be a data.table") stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by), is.logical(flatten)) # split data.frame way, using `f` and not `by` argument if (!missing(f)) { if (!length(f) && nrow(x)) - stop("group length is 0 but data nrow > 0") + stopf("group length is 0 but data nrow > 0") if (!missing(by)) - stop("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor") + stopf("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor") # same as split.data.frame - handling all exceptions, factor orders etc, in a single stream of processing was a nightmare in factor and drop consistency return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind])) } - if (missing(by)) stop("Either 'by' or 'f' argument must be supplied") + if (missing(by)) stopf("Either 'by' or 'f' argument must be supplied") # check reserved column names during processing - if (".ll.tech.split" %chin% names(x)) stop("Column '.ll.tech.split' is reserved for split.data.table processing") - if (".nm.tech.split" %chin% by) stop("Column '.nm.tech.split' is reserved for split.data.table processing") - if (!all(by %chin% names(x))) stop("Argument 'by' must refer to column names in x") + if (".ll.tech.split" %chin% names(x)) stopf("Column '.ll.tech.split' is reserved for split.data.table processing") + if (".nm.tech.split" %chin% by) stopf("Column '.nm.tech.split' is reserved for split.data.table processing") + if (!all(by %chin% names(x))) stopf("Argument 'by' must refer to column names in x") if (!all(by.atomic <- vapply_1b(by, function(.by) is.atomic(x[[.by]])))) stopf("Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic: %s", brackify(by[!by.atomic])) # list of data.tables (flatten) or list of lists of ... data.tables make.levels = function(x, cols, sorted) { @@ -2482,7 +2482,7 @@ copy = function(x) { shallow = function(x, cols=NULL) { if (!is.data.table(x)) - stop("x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table") + stopf("x is not a data.table. 
Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table") ans = .shallow(x, cols=cols, retain.key = TRUE) ans } @@ -2490,7 +2490,7 @@ shallow = function(x, cols=NULL) { setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose")) { name = substitute(DT) - if (identical(name, quote(`*tmp*`))) stop("setalloccol attempting to modify `*tmp*`") + if (identical(name, quote(`*tmp*`))) stopf("setalloccol attempting to modify `*tmp*`") ans = .Call(Calloccolwrapper, DT, eval(n), verbose) if (is.name(name)) { name = as.character(name) @@ -2525,7 +2525,7 @@ setattr = function(x,name,value) { ans = .Call(Csetattrib, x, name, value) # If name=="names" and this is the first time names are assigned (e.g. in data.table()), this will be grown by setalloccol very shortly afterwards in the caller. if (!is.null(ans)) { - warning("Input is a length=1 logical that points to the same address as R's global value. Therefore the attribute has not been set by reference, rather on a copy. You will need to assign the result back to a variable. See issue #1281.") + warningf("Input is a length=1 logical that points to the same address as R's global value. Therefore the attribute has not been set by reference, rather on a copy. You will need to assign the result back to a variable. See issue #1281.") x = ans } } @@ -2540,7 +2540,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { # But also more convenient than names(DT)[i]="newname" because we can also do setnames(DT,"oldname","newname") # without an onerous match() ourselves. old can be positions, too, but we encourage by name for robustness. # duplicates are permitted to be created without warning; e.g. in revdeps and for example, and setting spacer columns all with "" - if (!is.data.frame(x)) stop("x is not a data.table or data.frame") + if (!is.data.frame(x)) stopf("x is not a data.table or data.frame") ncol = length(x) if (length(names(x)) != ncol) stopf("x has %d columns but its names are length %d", ncol, length(names(x))) stopifnot(isTRUEorFALSE(skip_absent)) @@ -2562,8 +2562,8 @@ setnames = function(x,old,new,skip_absent=FALSE) { i = w } else { if (is.function(new)) new = if (is.numeric(old)) new(names(x)[old]) else new(old) - if (!is.character(new)) stop("'new' is not a character vector or a function") - # if (anyDuplicated(new)) warning("Some duplicates exist in 'new': ", brackify(new[duplicated(new)])) # dups allowed without warning; warn if and when the dup causes an ambiguity + if (!is.character(new)) stopf("'new' is not a character vector or a function") + # if (anyDuplicated(new)) warningf("Some duplicates exist in 'new': ", brackify(new[duplicated(new)])) # dups allowed without warning; warn if and when the dup causes an ambiguity if (anyNA(new)) stopf("NA in 'new' at positions %s", brackify(which(is.na(new)))) if (anyDuplicated(old)) stopf("Some duplicates exist in 'old': %s", brackify(old[duplicated(old)])) if (is.numeric(old)) i = old = seq_along(x)[old] # leave it to standard R to manipulate bounds and negative numbers @@ -2590,7 +2590,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { i = i[w] } if (!length(new)) return(invisible(x)) # no changes - if (length(i) != length(new)) stop("Internal error: length(i)!=length(new)") # nocov + if (length(i) != length(new)) stopf("Internal error: length(i)!=length(new)") # nocov } # update the key if the column name being change is in the key m = chmatch(names(x)[i], key(x)) @@ -2620,7 +2620,7 @@ 
setcolorder = function(x, neworder=key(x)) { if (is.character(neworder) && anyDuplicated(names(x))) stopf("x has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", brackify(unique(names(x)[duplicated(names(x))]))) - # if (!is.data.table(x)) stop("x is not a data.table") + # if (!is.data.table(x)) stopf("x is not a data.table") neworder = colnamesInt(x, neworder, check_dups=FALSE) # dups are now checked inside Csetcolorder below if (length(neworder) != length(x)) { #if shorter than length(x), pad by the missing @@ -2676,14 +2676,14 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { if (isFALSE(idcol)) { idcol = NULL } else if (!is.null(idcol)) { if (isTRUE(idcol)) idcol = ".id" - if (!is.character(idcol)) stop("idcol must be a logical or character vector of length 1. If logical TRUE the id column will named '.id'.") + if (!is.character(idcol)) stopf("idcol must be a logical or character vector of length 1. If logical TRUE the id column will named '.id'.") idcol = idcol[1L] } miss = missing(use.names) # more checking of use.names happens at C level; this is just minimal to massage 'check' to NA - if (identical(use.names, NA)) stop("use.names=NA invalid") # otherwise use.names=NA could creep in an usage equivalent to use.names='check' + if (identical(use.names, NA)) stopf("use.names=NA invalid") # otherwise use.names=NA could creep in an usage equivalent to use.names='check' if (identical(use.names,"check")) { - if (!miss) stop("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.") + if (!miss) stopf("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.") use.names = NA } ans = .Call(Crbindlist, l, use.names, fill, idcol) @@ -2698,12 +2698,12 @@ address = function(x) .Call(Caddress, eval(substitute(x), parent.frame())) ":=" = function(...) { # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227 - stop('Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").') + stopf('Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").') } setDF = function(x, rownames=NULL) { - if (!is.list(x)) stop("setDF only accepts data.table, data.frame or list of equal length as input") - if (anyDuplicated(rownames)) stop("rownames contains duplicates") + if (!is.list(x)) stopf("setDF only accepts data.table, data.frame or list of equal length as input") + if (anyDuplicated(rownames)) stopf("rownames contains duplicates") if (is.data.table(x)) { # copied from as.data.frame.data.table if (is.null(rownames)) { @@ -2728,7 +2728,7 @@ setDF = function(x, rownames=NULL) { n = vapply_1i(x, length) mn = max(n) if (any(n 1L && prod(vapply_1i(i, length)) > 1e4){ ## CJ would result in more than 1e4 rows. 
This would be inefficient, especially memory-wise #2635 @@ -3110,7 +3110,7 @@ isReallyReal = function(x) { } on = eval(onsub, parent.frame(2L), parent.frame(2L)) if (length(on) == 0L || !is.character(on)) - stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") + stopf("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") ## extract the operators and potential variable names from 'on'. ## split at backticks to take care about variable names like `col1<=`. pieces = strsplit(on, "(?=[`])", perl = TRUE) diff --git a/R/duplicated.R b/R/duplicated.R index 249a5470c5..4fc7c8d166 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -4,7 +4,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ .NotYetUsed("incomparables != FALSE") } if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0L)) # fix for bug #28 - if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE") + if (is.na(fromLast) || !is.logical(fromLast)) stopf("'fromLast' must be TRUE or FALSE") if (!length(by)) by = NULL #4594 query = .duplicated.helper(x, by) @@ -99,7 +99,7 @@ anyDuplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=s uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) { # na.rm, #1455 if (is.null(x)) return(0L) if (!is.atomic(x) && !is.data.frame(x)) - stop("x must be an atomic vector or data.frames/data.tables") + stopf("x must be an atomic vector or data.frames/data.tables") if (is.atomic(x)) { if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm)) x = as_list(x) diff --git a/R/fcast.R b/R/fcast.R index 7d62ab2f22..465ff665da 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -28,12 +28,12 @@ dcast <- function( check_formula = function(formula, varnames, valnames) { if (is.character(formula)) formula = as.formula(formula) if (!inherits(formula, "formula") || length(formula) != 3L) - stop("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.") # nocov; couldn't find a way to construct a test formula with length!=3L + stopf("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.") # nocov; couldn't find a way to construct a test formula with length!=3L vars = all.vars(formula) vars = vars[!vars %chin% c(".", "...")] allvars = c(vars, valnames) if (any(allvars %chin% varnames[duplicated(varnames)])) - stop('data.table to cast must have unique column names') + stopf('data.table to cast must have unique column names') deparse_formula(as.list(formula)[-1L], varnames, allvars) } @@ -73,7 +73,7 @@ aggregate_funs = function(funs, vals, sep="_", ...) { if (length(funs) != length(vals)) { if (length(vals) == 1L) vals = replicate(length(funs), vals) - else stop("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).") + else stopf("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).") } only_one_fun = length(unlist(funs)) == 1L dots = list(...) @@ -106,9 +106,9 @@ aggregate_funs = function(funs, vals, sep="_", ...) 
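The aggregate_funs() hunk just above keeps the rule that when both 'fun.aggregate' and 'value.var' are lists, a length-1 value.var is recycled to match the number of functions. A small usage sketch of the dcast() call that exercises that branch; the table, column values and printed column names are invented for illustration:

    library(data.table)
    DT = data.table(g = c("a", "a", "b"), x = c(1, 2, 3))
    # both arguments given as lists: list("x") is recycled across the two functions
    dcast(DT, g ~ ., fun.aggregate = list(sum, mean), value.var = list("x"))
    # -> one aggregated column per function (named roughly x_sum and x_mean)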
{ } dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) { - if (!is.data.table(data)) stop("'data' must be a data.table.") + if (!is.data.table(data)) stopf("'data' must be a data.table.") drop = as.logical(rep(drop, length.out=2L)) - if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE") + if (anyNA(drop)) stopf("'drop' must be logical TRUE/FALSE") # #2980 if explicitly providing fun.aggregate=length but not a value.var, # just use the last column (as guess(data) would do) because length will be # the same on all columns @@ -140,7 +140,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., rhsnames = tail(varnames, -length(lvars$lhs)) setattr(dat, 'names', c(varnames, valnames)) if (any(vapply_1b(dat[varnames], is.list))) { - stop("Columns specified in formula can not be of type list") + stopf("Columns specified in formula can not be of type list") } setDT(dat) @@ -151,23 +151,23 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys? dat = .Call(CsubsetDT, dat, idx, seq_along(dat)) } - if (!nrow(dat) || !ncol(dat)) stop("Can not cast an empty data.table") + if (!nrow(dat) || !ncol(dat)) stopf("Can not cast an empty data.table") fun.call = m[["fun.aggregate"]] fill.default = NULL if (is.null(fun.call)) { oo = forderv(dat, by=varnames, retGrp=TRUE) if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) { - message("Aggregate function missing, defaulting to 'length'") + messagef("Aggregate function missing, defaulting to 'length'") fun.call = quote(length) } } if (!is.null(fun.call)) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) - errmsg = "Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately." + errmsg = gettext("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. 
Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") if (is.null(fill)) { fill.default = suppressWarnings(dat[0L][, eval(fun.call)]) - # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stop(errmsg, call.=FALSE)) - if (nrow(fill.default) != 1L) stop(errmsg, call.=FALSE) + # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stopf(errmsg)) + if (nrow(fill.default) != 1L) stopf(errmsg) } dat = dat[, eval(fun.call), by=c(varnames)] } @@ -220,6 +220,6 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., # removed 'setcolorder()' here, #1153 setattr(ans, 'names', c(lhsnames, allcols)) setDT(ans); setattr(ans, 'sorted', lhsnames) - } else stop("Internal error -- empty rhsnames in dcast; please report") # nocov + } else stopf("Internal error -- empty rhsnames in dcast; please report") # nocov return (ans) } diff --git a/R/fmelt.R b/R/fmelt.R index 268e52462e..5038894ba0 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -25,11 +25,11 @@ patterns = function(..., cols=character(0L)) { L = list(...) p = unlist(L, use.names = any(nzchar(names(L)))) if (!is.character(p)) - stop("Input patterns must be of type character.") + stopf("Input patterns must be of type character.") matched = lapply(p, grep, cols) # replace with lengths when R 3.2.0 dependency arrives if (length(idx <- which(sapply(matched, length) == 0L))) - stop(domain = NA, sprintf(ngettext(length(idx), 'Pattern not found: [%s]', 'Patterns not found: [%s]'), brackify(p[idx]))) + stopf('Pattern(s) not found: [%s]', brackify(p[idx])) matched } @@ -71,13 +71,13 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name", group.desc="elements of fun.list"){ # 1. basic error checking. if (!missing(sep) && !missing(pattern)) { - stop("both sep and pattern arguments used; must use either sep or pattern (not both)") + stopf("both sep and pattern arguments used; must use either sep or pattern (not both)") } if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { - stop("multiple.keyword must be a character string with nchar>0") + stopf("multiple.keyword must be a character string with nchar>0") } if (!is.character(cols)) { - stop("cols must be a character vector of column names") + stopf("cols must be a character vector of column names") } prob.i <- if (is.null(names(fun.list))) { seq_along(fun.list) @@ -103,16 +103,16 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na # 2. compute initial group data table, used as variable_table attribute. 
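For the patterns() hunk above (R/fmelt.R), the ngettext()/sprintf() construction collapses into a single stopf() with a 'Pattern(s) not found' message. A brief usage sketch of patterns() as it is normally passed to melt(); the example table and names are invented:

    library(data.table)
    DT = data.table(id = 1:2, x1 = 1:2, x2 = 3:4, y1 = 5:6, y2 = 7:8)
    # two regular expressions -> two groups of measure columns melted side by side
    melt(DT, measure.vars = patterns("^x", "^y"), value.name = c("x", "y"))
    # a pattern matching no column names now errors through the new stopf(),
    # e.g. melt(DT, measure.vars = patterns("^z"))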
group.mat = if (!missing(pattern)) { if (!is.character(pattern)) { - stop("pattern must be character string") + stopf("pattern must be character string") } match.vec = regexpr(pattern, cols, perl=TRUE) measure.vec = which(0 < match.vec) if (length(measure.vec) == 0L) { - stop("pattern did not match any cols, so nothing would be melted; fix by changing pattern") + stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern") } start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] if (is.null(start)) { - stop("pattern must contain at least one capture group (parenthesized sub-pattern)") + stopf("pattern must contain at least one capture group (parenthesized sub-pattern)") } err.args.groups("number of capture groups in pattern", ncol(start)) end = attr(match.vec, "capture.length")[measure.vec,]+start-1L @@ -120,13 +120,13 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na substr(names.mat, start, end) } else { #pattern not specified, so split using sep. if (!is.character(sep)) { - stop("sep must be character string") + stopf("sep must be character string") } list.of.vectors = strsplit(cols, sep, fixed=TRUE) vector.lengths = sapply(list.of.vectors, length) n.groups = max(vector.lengths) if (n.groups == 1) { - stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") + stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } err.args.groups("max number of items after splitting column names", n.groups) measure.vec = which(vector.lengths==n.groups) @@ -158,7 +158,7 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na } group.uniq = unique(group.dt) if (nrow(group.uniq) < nrow(group.dt)) { - stop("number of unique groups after applying type conversion functions less than number of groups, change type conversion") + stopf("number of unique groups after applying type conversion functions less than number of groups, change type conversion") } # 4. compute measure.vars list or vector. if (multiple.keyword %in% names(fun.list)) {# multiple output columns. 
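The pattern branch above leans on regexpr(perl=TRUE) exposing the capture.start and capture.length attributes. A standalone sketch of that bookkeeping, mirroring (not reproducing) the code in measurev(), with invented column names:

    cols = c("sig_x_2019", "sig_x_2020", "id")   # invented column names
    pattern = "(.*)_([0-9]+)$"                   # two capture groups
    match.vec = regexpr(pattern, cols, perl=TRUE)
    measure.vec = which(0 < match.vec)           # columns that matched: 1, 2
    start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE]
    end = attr(match.vec, "capture.length")[measure.vec, ] + start - 1L
    names.mat = matrix(cols[measure.vec], nrow=nrow(start), ncol=ncol(start))
    substr(names.mat, start, end)                # per-column group values: "sig_x", "sig_x", "2019", "2020"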
@@ -190,7 +190,7 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", value.name = "value", ..., na.rm = FALSE, variable.factor = TRUE, value.factor = FALSE, verbose = getOption("datatable.verbose")) { - if (!is.data.table(data)) stop("'data' must be a data.table") + if (!is.data.table(data)) stopf("'data' must be a data.table") if (missing(id.vars)) id.vars=NULL if (missing(measure.vars)) measure.vars = NULL measure.sub = substitute(measure.vars) @@ -209,10 +209,10 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl } } else { if (length(value.name) > 1L) { - warning("'value.name' provided in both 'measure.vars' and 'value.name argument'; value provided in 'measure.vars' is given precedence.") + warningf("'value.name' provided in both 'measure.vars' and 'value.name argument'; value provided in 'measure.vars' is given precedence.") } if (anyNA(meas.nm) || !all(nzchar(meas.nm))) { - stop("Please provide a name to each element of 'measure.vars'.") + stopf("Please provide a name to each element of 'measure.vars'.") } value.name = meas.nm } diff --git a/R/foverlaps.R b/R/foverlaps.R index e663d0a3cb..58c7a75557 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -1,49 +1,49 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=key(y), maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=getOption("datatable.nomatch", NA), which=FALSE, verbose=getOption("datatable.verbose")) { - if (!is.data.table(y) || !is.data.table(x)) stop("y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.") + if (!is.data.table(y) || !is.data.table(x)) stopf("y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.") maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap) which = as.logical(which) .unsafe.opt() #3585 nomatch = if (is.null(nomatch)) 0L else as.integer(nomatch) if (!length(maxgap) || length(maxgap) != 1L || is.na(maxgap) || maxgap < 0L) - stop("maxgap must be a non-negative integer value of length 1") + stopf("maxgap must be a non-negative integer value of length 1") if (!length(minoverlap) || length(minoverlap) != 1L || is.na(minoverlap) || minoverlap < 1L) - stop("minoverlap must be a positive integer value of length 1") + stopf("minoverlap must be a positive integer value of length 1") if (!length(which) || length(which) != 1L || is.na(which)) - stop("which must be a logical vector of length 1. Either TRUE/FALSE") + stopf("which must be a logical vector of length 1. Either TRUE/FALSE") if (!length(nomatch) || length(nomatch) != 1L || (!is.na(nomatch) && nomatch!=0L)) - stop("nomatch must either be NA or NULL") + stopf("nomatch must either be NA or NULL") type = match.arg(type) mult = match.arg(mult) # if (maxgap > 0L || minoverlap > 1L) # for future implementation if (maxgap != 0L || minoverlap != 1L) - stop("maxgap and minoverlap arguments are not yet implemented.") + stopf("maxgap and minoverlap arguments are not yet implemented.") if (is.null(by.y)) - stop("y must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) first, see ?setkey. 
Also check the examples in ?foverlaps.") + stopf("y must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) first, see ?setkey. Also check the examples in ?foverlaps.") if (length(by.x) < 2L || length(by.y) < 2L) - stop("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.") + stopf("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.") if (is.numeric(by.x)) { if (any(by.x < 0L) || any(by.x > length(x))) - stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.x <= length(x)") + stopf("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.x <= length(x)") by.x = names(x)[by.x] } if (is.numeric(by.y)) { if (any(by.y < 0L) || any(by.y > length(y))) - stop("Invalid numeric value for 'by.y'; it should be a vector with values 1 <= by.y <= length(y)") + stopf("Invalid numeric value for 'by.y'; it should be a vector with values 1 <= by.y <= length(y)") by.y = names(y)[by.y] } if (!is.character(by.x)) - stop("A non-empty vector of column names or numbers is required for by.x") + stopf("A non-empty vector of column names or numbers is required for by.x") if (!is.character(by.y)) - stop("A non-empty vector of column names or numbers is required for by.y") + stopf("A non-empty vector of column names or numbers is required for by.y") if (!identical(by.y, key(y)[seq_along(by.y)])) stopf("The first %d columns of y's key must be identical to the columns specified in by.y.", length(by.y)) if (anyNA(chmatch(by.x, names(x)))) - stop("Elements listed in 'by.x' must be valid names in data.table x") + stopf("Elements listed in 'by.x' must be valid names in data.table x") if (anyDuplicated(by.x) || anyDuplicated(by.y)) - stop("Duplicate columns are not allowed in overlap joins. This may change in the future.") + stopf("Duplicate columns are not allowed in overlap joins. This may change in the future.") if (length(by.x) != length(by.y)) - stop("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.") + stopf("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.") if (any(dup.x<-duplicated(names(x)))) #1730 - handling join possible but would require workarounds on setcolorder further, it is really better just to rename dup column stopf("%s has some duplicated column name(s): %s. 
Please remove or rename the duplicate(s) and try again.", "x", brackify(unique(names(x)[dup.x]))) if (any(dup.y<-duplicated(names(y)))) @@ -53,7 +53,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k xval1 = x[[xintervals[1L]]]; xval2 = x[[xintervals[2L]]] yval1 = y[[yintervals[1L]]]; yval2 = y[[yintervals[2L]]] if (!storage.mode(xval1) %chin% c("double", "integer") || !storage.mode(xval2) %chin% c("double", "integer") || is.factor(xval1) || is.factor(xval2)) # adding factors to the bunch, #2645 - stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table x and must be integer/numeric type.") + stopf("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table x and must be integer/numeric type.") if ( isTRUEorNA(any(xval2 - xval1 < 0L)) ) { # better error messages as suggested by @msummersgill in #3007. Thanks for the code too. Placing this inside so that it only runs if the general condition is satisfied. Should error anyway here.. So doesn't matter even if runs all if-statements; takes about 0.2s for anyNA check on 200 million elements .. acceptable speed for stoppage, I think, at least for now. if ( anyNA(xval1) ) { @@ -63,7 +63,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k } else stopf("All entries in column '%s' should be <= corresponding entries in column '%s' in data.table x.", xintervals[1L], xintervals[2L]) } if (!storage.mode(yval1) %chin% c("double", "integer") || !storage.mode(yval2) %chin% c("double", "integer") || is.factor(yval1) || is.factor(yval2)) # adding factors to the bunch, #2645 - stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table y and must be integer/numeric type.") + stopf("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table y and must be integer/numeric type.") if ( isTRUEorNA(any(yval2 - yval1 < 0L) )) { if ( anyNA(yval1) ) { stopf("NA values in data.table %s '%s' column: '%s'. All rows with NA values in the range columns must be removed for foverlaps() to work.", "y", "start", yintervals[1L]) @@ -74,13 +74,13 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k # POSIXct interval cols error check posx_chk = sapply(list(xval1, xval2, yval1, yval2), inherits, 'POSIXct') if (any(posx_chk) && !all(posx_chk)) { - stop("Some interval cols are of type POSIXct while others are not. Please ensure all interval cols are (or are not) of POSIXct type") + stopf("Some interval cols are of type POSIXct while others are not. Please ensure all interval cols are (or are not) of POSIXct type") } # #1143, mismatched timezone getTZ = function(x) if (is.null(tz <- attr(x, "tzone", exact=TRUE))) "" else tz # "" == NULL AFAICT tzone_chk = c(getTZ(xval1), getTZ(xval2), getTZ(yval1), getTZ(yval2)) if (length(unique(tzone_chk)) > 1L) { - warning("POSIXct interval cols have mixed timezones. Overlaps are performed on the internal numerical representation of POSIXct objects (always in UTC epoch time), therefore printed values may give the impression that values don't overlap but their internal representations do Please ensure that POSIXct type interval cols have identical 'tzone' attributes to avoid confusion.") + warningf("POSIXct interval cols have mixed timezones. 
Overlaps are performed on the internal numerical representation of POSIXct objects (always in UTC epoch time), therefore printed values may give the impression that values don't overlap but their internal representations do Please ensure that POSIXct type interval cols have identical 'tzone' attributes to avoid confusion.") } ## see NOTES below: yclass = c(class(yval1), class(yval2)) @@ -161,13 +161,13 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k } # nocov start else if (maxgap == 0L && minoverlap > 1L) { - stop("Not yet implemented") + stopf("Not yet implemented") } else if (maxgap > 0L && minoverlap == 1L) { - stop("Not yet implemented") + stopf("Not yet implemented") } else if (maxgap > 0L && minoverlap > 1L) { if (maxgap > minoverlap) - warning("maxgap > minoverlap. maxgap will have no effect here.") - stop("Not yet implemented") + warningf("maxgap > minoverlap. maxgap will have no effect here.") + stopf("Not yet implemented") } # nocov end diff --git a/R/frank.R b/R/frank.R index 47e701c4cd..ba90a83b93 100644 --- a/R/frank.R +++ b/R/frank.R @@ -1,8 +1,8 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("average", "first", "last", "random", "max", "min", "dense")) { ties.method = match.arg(ties.method) - if (!length(na.last)) stop('length(na.last) = 0') + if (!length(na.last)) stopf('length(na.last) = 0') if (length(na.last) != 1L) { - warning("length(na.last) > 1, only the first element will be used") + warningf("length(na.last) > 1, only the first element will be used") na.last = na.last[1L] } keep = (na.last == "keep") @@ -14,13 +14,13 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a } if (is.atomic(x)) { if (!missing(cols) && !is.null(cols)) - stop("x is a single vector, non-NULL 'cols' doesn't make sense") + stopf("x is a single vector, non-NULL 'cols' doesn't make sense") cols = 1L x = as_list(x) } else { cols = colnamesInt(x, cols, check_dups=TRUE) if (!length(cols)) - stop("x is a list, 'cols' can not be 0-length") + stopf("x is a list, 'cols' can not be 0-length") } # need to unlock for #4429 x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list.. @@ -28,7 +28,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a cols = seq_along(cols) if (is.na(na.last)) { if ("..na_prefix.." %chin% names(x)) - stop("Input column '..na_prefix..' conflicts with data.table internal usage; please rename") + stopf("Input column '..na_prefix..' conflicts with data.table internal usage; please rename") set(x, j = "..na_prefix..", value = is_na(x, cols)) order = if (length(order) == 1L) c(1L, rep(order, length(cols))) else c(1L, order) cols = c(ncol(x), cols) @@ -43,7 +43,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a n = nrow(x) } if ('..stats_runif..' %chin% names(x)) - stop("Input column '..stats_runif..' conflicts with data.table internal usage; please rename") + stopf("Input column '..stats_runif..' 
conflicts with data.table internal usage; please rename") set(x, idx, '..stats_runif..', stats::runif(n)) order = if (length(order) == 1L) c(rep(order, length(cols)), 1L) else c(order, 1L) cols = c(cols, ncol(x)) diff --git a/R/fread.R b/R/fread.R index c724f22353..81ffb2a0df 100644 --- a/R/fread.R +++ b/R/fread.R @@ -7,7 +7,7 @@ showProgress=getOption("datatable.showProgress",interactive()), data.table=getOp nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") { - if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.") + if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stopf("Used more than one of the arguments input=, file=, text= and cmd=.") input_has_vars = length(all.vars(substitute(input)))>0L # see news for v1.11.6 if (is.null(sep)) sep="\n" # C level knows that \n means \r\n on Windows, for example else { @@ -19,7 +19,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L ) # handle encoding, #563 if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { - stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") + stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), @@ -50,19 +50,19 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } else if (is.null(cmd)) { if (!is.character(input) || length(input)!=1L) { - stop("input= must be a single character string containing a file name, a system command containing at least one space, a URL starting 'http[s]://', 'ftp[s]://' or 'file://', or, the input data itself containing at least one \\n or \\r") + stopf("input= must be a single character string containing a file name, a system command containing at least one space, a URL starting 'http[s]://', 'ftp[s]://' or 'file://', or, the input data itself containing at least one \\n or \\r") } if (input=="" || length(grep('\\n|\\r', input))) { # input is data itself containing at least one \n or \r } else { if (startsWith(input, " ")) { - stop("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") + stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") } str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 if (str7=="ftps://" || startsWith(input, "https://")) { # nocov start if (!requireNamespace("curl", quietly = TRUE)) - stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. 
Please install 'curl' using 'install.packages('curl')'.") # nocov tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) file = tmpFile @@ -106,7 +106,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { if (!requireNamespace("R.utils", quietly = TRUE)) - stop("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov + stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov FUN = if (is_gz) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download @@ -116,18 +116,18 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") input = file } - if (!missing(autostart)) warning("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n"); + if (!missing(autostart)) warningf("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n"); if (is.logical(colClasses)) { - if (!allNA(colClasses)) stop("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.") + if (!allNA(colClasses)) stopf("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. 
See ?fread.") colClasses = NULL } if (!is.null(colClasses) && is.atomic(colClasses)) { - if (!is.character(colClasses)) stop("colClasses is not type list or character vector") + if (!is.character(colClasses)) stopf("colClasses is not type list or character vector") if (!length(colClasses)) { colClasses=NULL; } else if (identical(colClasses, "NULL")) { colClasses = NULL - warning('colClasses="NULL" (quoted) is interpreted as colClasses=NULL (the default) as opposed to dropping every column.') + warningf('colClasses="NULL" (quoted) is interpreted as colClasses=NULL (the default) as opposed to dropping every column.') } else if (!is.null(names(colClasses))) { # names are column names; convert to list approach colClasses = tapply(names(colClasses), colClasses, c, simplify=FALSE) } @@ -154,11 +154,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (yaml) { if (!requireNamespace('yaml', quietly = TRUE)) - stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov + stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov # for tracking which YAML elements may be overridden by being declared explicitly call_args = names(match.call()) if (is.character(skip)) - warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.") + warningf("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.") # create connection to stream header lines from file: # https://stackoverflow.com/questions/9871307 f = base::file(input, 'r') @@ -194,7 +194,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (verbose) catf('Processed %d lines of YAML metadata with the following top-level fields: %s\n', n_read, brackify(yaml_names)) # process header first since it impacts how to handle colClasses if ('header' %chin% yaml_names) { - if ('header' %chin% call_args) message("User-supplied 'header' will override that found in metadata.") + if ('header' %chin% call_args) messagef("User-supplied 'header' will override that found in metadata.") else header = as.logical(yaml_header$header) } if ('schema' %chin% yaml_names) { @@ -212,7 +212,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") new_types = synonms[list(new_types)]$r_type new_names = sapply(yaml_header$schema$fields[!null_idx], `[[`, 'name') - if ('col.names' %chin% call_args) message("User-supplied column names in 'col.names' will override those found in YAML metadata.") + if ('col.names' %chin% call_args) messagef("User-supplied column names in 'col.names' will override those found in YAML metadata.") # resolve any conflicts with colClasses, if supplied; # colClasses (if present) is already in list form by now if ('colClasses' %chin% call_args) { @@ -242,21 +242,21 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } sep_syn = c('sep', 'delimiter') if (any(sep_idx <- sep_syn %chin% yaml_names)) { - if ('sep' %chin% call_args) message("User-supplied 'sep' 
will override that found in metadata.") + if ('sep' %chin% call_args) messagef("User-supplied 'sep' will override that found in metadata.") else sep = yaml_header[[ sep_syn[sep_idx][1L] ]] } quote_syn = c('quote', 'quoteChar', 'quote_char') if (any(quote_idx <- quote_syn %chin% yaml_names)) { - if ('quote' %chin% call_args) message("User-supplied 'quote' will override that found in metadata.") + if ('quote' %chin% call_args) messagef("User-supplied 'quote' will override that found in metadata.") else quote = yaml_header[[ quote_syn[quote_idx][1L] ]] } dec_syn = c('dec', 'decimal') if (any(dec_idx <- dec_syn %chin% yaml_names)) { - if ('dec' %chin% call_args) message("User-supplied 'dec' will override that found in metadata.") + if ('dec' %chin% call_args) messagef("User-supplied 'dec' will override that found in metadata.") else dec = yaml_header[[ dec_syn[dec_idx][1L] ]] } if ('na.strings' %chin% yaml_names) { - if ('na.strings' %chin% call_args) message("User-supplied 'na.strings' will override that found in metadata.") + if ('na.strings' %chin% call_args) messagef("User-supplied 'na.strings' will override that found in metadata.") else na.strings = yaml_header$na.strings } if (is.integer(skip)) skip = skip + n_read @@ -327,7 +327,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") setnames(ans, col.names) # setnames checks and errors automatically if (!is.null(key) && data.table) { if (!is.character(key)) - stop("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") + stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (length(key) == 1L) { key = strsplit(key, split = ",", fixed = TRUE)[[1L]] } @@ -336,7 +336,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (yaml) setattr(ans, 'yaml_metadata', yaml_header) if (!is.null(index) && data.table) { if (!all(vapply_1b(index, is.character))) - stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") + stopf("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (is.list(index)) { to_split = vapply_1i(index, length) == 1L if (any(to_split)) diff --git a/R/fwrite.R b/R/fwrite.R index ab2353464a..3f85ff1ea0 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -16,18 +16,18 @@ fwrite = function(x, file="", append=FALSE, quote="auto", encoding = "") { na = as.character(na[1L]) # fix for #1725 if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { - stop("Argument 'encoding' must be '', 'UTF-8' or 'native'.") + stopf("Argument 'encoding' must be '', 'UTF-8' or 'native'.") } if (missing(qmethod)) qmethod = qmethod[1L] if (missing(compress)) compress = compress[1L] if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] } - else if (length(dateTimeAs)>1L) stop("dateTimeAs must be a single string") + else if (length(dateTimeAs)>1L) stopf("dateTimeAs must be a single string") dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L - if (is.na(dateTimeAs)) stop("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") + if (is.na(dateTimeAs)) stopf("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") if (!missing(logical01) && !missing(logicalAsInt)) - stop("logicalAsInt has been renamed logical01. Use logical01 only, not both.") + stopf("logicalAsInt has been renamed logical01. 
Use logical01 only, not both.") if (!missing(logicalAsInt)) { - # TODO: warning("logicalAsInt has been renamed logical01 for consistency with fread. It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") + # TODO: warningf("logicalAsInt has been renamed logical01 for consistency with fread. It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") logical01 = logicalAsInt logicalAsInt=NULL } @@ -37,7 +37,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix - message("x being coerced from class: matrix to data.table") + messagef("x being coerced from class: matrix to data.table") x = as.data.table(x) } stopifnot(is.list(x), @@ -87,7 +87,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } yaml = if (!yaml) "" else { if (!requireNamespace('yaml', quietly=TRUE)) - stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov + stopf("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov schema_vec = sapply(x, class) # multi-class objects reduced to first class if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) diff --git a/R/groupingsets.R b/R/groupingsets.R index 6e7ce8131a..4c25b5b651 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -4,11 +4,11 @@ rollup = function(x, ...) { rollup.data.table = function(x, j, by, .SDcols, id = FALSE, ...) { # input data type basic validation if (!is.data.table(x)) - stop("Argument 'x' must be a data.table object") + stopf("Argument 'x' must be a data.table object") if (!is.character(by)) - stop("Argument 'by' must be a character vector of column names used in grouping.") + stopf("Argument 'by' must be a character vector of column names used in grouping.") if (!is.logical(id)) - stop("Argument 'id' must be a logical scalar.") + stopf("Argument 'id' must be a logical scalar.") # generate grouping sets for rollup sets = lapply(length(by):0L, function(i) by[0L:i]) # redirect to workhorse function @@ -22,13 +22,13 @@ cube = function(x, ...) { cube.data.table = function(x, j, by, .SDcols, id = FALSE, ...) { # input data type basic validation if (!is.data.table(x)) - stop("Argument 'x' must be a data.table object") + stopf("Argument 'x' must be a data.table object") if (!is.character(by)) - stop("Argument 'by' must be a character vector of column names used in grouping.") + stopf("Argument 'by' must be a character vector of column names used in grouping.") if (!is.logical(id)) - stop("Argument 'id' must be a logical scalar.") + stopf("Argument 'id' must be a logical scalar.") if (missing(j)) - stop("Argument 'j' is required") + stopf("Argument 'j' is required") # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) @@ -44,41 +44,41 @@ groupingsets = function(x, ...) { groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) 
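To make the two set-generating one-liners above concrete (rollup builds ever-shorter prefixes of 'by', cube enumerates its power set), here is a small worked example with an assumed by = c("a","b","c"); the final row-to-set mapping line is an assumption, since that part of groupingsets.R sits outside this hunk:

    by = c("a", "b", "c")
    # rollup: prefixes of 'by' down to the grand total
    lapply(length(by):0L, function(i) by[0L:i])
    # -> list(c("a","b","c"), c("a","b"), "a", character(0))

    # cube: an 8 x 3 logical matrix whose rows enumerate the power set of 'by'
    n = length(by)
    keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k))))
    sets = lapply((2L^n):1L, function(j) by[keepBool[j, ]])   # row -> grouping set (mapping assumed)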
{ # input data type basic validation if (!is.data.table(x)) - stop("Argument 'x' must be a data.table object") + stopf("Argument 'x' must be a data.table object") if (ncol(x) < 1L) - stop("Argument 'x' is a 0-column data.table; no measure to apply grouping over.") + stopf("Argument 'x' is a 0-column data.table; no measure to apply grouping over.") if (anyDuplicated(names(x)) > 0L) - stop("Input data.table must not contain duplicate column names.") + stopf("Input data.table must not contain duplicate column names.") if (!is.character(by)) - stop("Argument 'by' must be a character vector of column names used in grouping.") + stopf("Argument 'by' must be a character vector of column names used in grouping.") if (anyDuplicated(by) > 0L) - stop("Argument 'by' must have unique column names for grouping.") + stopf("Argument 'by' must have unique column names for grouping.") if (!is.list(sets) || !all(vapply_1b(sets, is.character))) - stop("Argument 'sets' must be a list of character vectors.") + stopf("Argument 'sets' must be a list of character vectors.") if (!is.logical(id)) - stop("Argument 'id' must be a logical scalar.") + stopf("Argument 'id' must be a logical scalar.") # logic constraints validation if (!all((sets.all.by <- unique(unlist(sets))) %chin% by)) stopf("All columns used in 'sets' argument must be in 'by' too. Columns used in 'sets' but not present in 'by': %s", brackify(setdiff(sets.all.by, by))) if (id && "grouping" %chin% names(x)) - stop("When using `id=TRUE` the 'x' data.table must not have a column named 'grouping'.") + stopf("When using `id=TRUE` the 'x' data.table must not have a column named 'grouping'.") if (any(vapply_1i(sets, anyDuplicated))) # anyDuplicated returns index of first duplicate, otherwise 0L - stop("Character vectors in 'sets' list must not have duplicated column names within a single grouping set.") + stopf("Character vectors in 'sets' list must not have duplicated column names within a single grouping set.") if (length(sets) > 1L && (idx<-anyDuplicated(lapply(sets, sort)))) warningf("'sets' contains a duplicate (i.e., equivalent up to sorting) element at index %d; as such, there will be duplicate rows in the output -- note that grouping by A,B and B,A will produce the same aggregations. Use `sets=unique(lapply(sets, sort))` to eliminate duplicates.", idx) # input arguments handling jj = if (!missing(jj)) jj else substitute(j) av = all.vars(jj, TRUE) if (":=" %chin% av) - stop("Expression passed to grouping sets function must not update by reference. Use ':=' on results of your grouping function.") + stopf("Expression passed to grouping sets function must not update by reference. 
Use ':=' on results of your grouping function.") if (missing(.SDcols)) .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL # 0 rows template data.table to keep colorder and type empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field - stop("When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'.") + stopf("When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'.") if (anyDuplicated(names(empty)) > 0L) - stop("There exists duplicated column names in the results, ensure the column passed/evaluated in `j` and those in `by` are not overlapping.") + stopf("There exists duplicated column names in the results, ensure the column passed/evaluated in `j` and those in `by` are not overlapping.") # adding grouping column to template - aggregation level identifier if (id) { set(empty, j = "grouping", value = integer()) @@ -88,7 +88,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) int64.cols = vapply_1b(empty, inherits, "integer64") int64.cols = names(int64.cols)[int64.cols] if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE)) - stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov + stopf("Using integer64 class columns require to have 'bit64' package installed.") # nocov int64.by.cols = intersect(int64.cols, by) # aggregate function called for each grouping set aggregate.set = function(by.set) { diff --git a/R/merge.R b/R/merge.R index 8ad01de420..fb0666d5e0 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,9 +1,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) { if (!sort %in% c(TRUE, FALSE)) - stop("Argument 'sort' should be logical TRUE/FALSE") + stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) - stop("Argument 'no.dups' should be logical TRUE/FALSE") + stopf("Argument 'no.dups' should be logical TRUE/FALSE") class_x = class(x) if (!is.data.table(y)) { y = as.data.table(y) @@ -13,31 +13,31 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } x0 = length(x)==0L y0 = length(y)==0L - if (x0 || y0) warning(domain=NA, sprintf( - ngettext( - x0+y0, - "You are trying to join data.tables where %s has 0 columns.", - "You are trying to join data.tables where %s have 0 columns." - ), - if (x0 && y0) "'x' and 'y'" else if (x0) "'x'" else "'y'" - )) + if (x0 || y0) { + if (x0 && y0) + warningf("Neither of the input data.tables to join have columns.") + else if (x0) + warningf("Input data.table '%s' has no columns.", "x") + else + warningf("Input data.table '%s' has no columns.", "y") + } nm_x = names(x) nm_y = names(y) - if (anyDuplicated(nm_x)) stop(gettextf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "x", brackify(nm_x[duplicated(nm_x)]))) - if (anyDuplicated(nm_y)) stop(gettextf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "y", brackify(nm_y[duplicated(nm_y)]))) + if (anyDuplicated(nm_x)) stopf("%s has some duplicated column name(s): %s. 
Please remove or rename the duplicate(s) and try again.", "x", brackify(nm_x[duplicated(nm_x)])) + if (anyDuplicated(nm_y)) stopf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "y", brackify(nm_y[duplicated(nm_y)])) ## set up 'by'/'by.x'/'by.y' if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) ) - stop("`by.x` and `by.y` must be of same length.") + stopf("`by.x` and `by.y` must be of same length.") if (!missing(by) && !missing(by.x)) - warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") + warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") if (!is.null(by.x)) { if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y)) - stop("A non-empty vector of column names is required for `by.x` and `by.y`.") + stopf("A non-empty vector of column names is required for `by.x` and `by.y`.") if (!all(by.x %chin% nm_x)) - stop("Elements listed in `by.x` must be valid column names in x.") + stopf("Elements listed in `by.x` must be valid column names in x.") if (!all(by.y %chin% nm_y)) - stop("Elements listed in `by.y` must be valid column names in y.") + stopf("Elements listed in `by.y` must be valid column names in y.") by = by.x names(by) = by.y } else { @@ -48,9 +48,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (is.null(by)) by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) - stop("A non-empty vector of column names for `by` is required.") + stopf("A non-empty vector of column names for `by` is required.") if (!all(by %chin% intersect(nm_x, nm_y))) - stop("Elements listed in `by` must be valid column names in x and y") + stopf("Elements listed in `by` must be valid column names in x and y") by = unname(by) by.x = by.y = by } diff --git a/R/onAttach.R b/R/onAttach.R index 110cab69d4..bbad945497 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -24,16 +24,16 @@ packageStartupMessagef(domain="R-data.table", "data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads). ", v, d, g, nth, appendLF=FALSE) else packageStartupMessagef(domain="R-data.table", "data.table %s using %d threads (see ?getDTthreads). ", v, nth, appendLF=FALSE) - packageStartupMessage(domain="R-data.table", "Latest news: r-datatable.com") + packageStartupMessagef(domain="R-data.table", "Latest news: r-datatable.com") # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. if (gettext(domain="R-data.table", "TRANSLATION CHECK") != "TRANSLATION CHECK") - packageStartupMessage(domain="R-data.table", "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") + packageStartupMessagef(domain="R-data.table", "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. 
This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) - packageStartupMessage(domain="R-data.table", "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") + packageStartupMessagef(domain="R-data.table", "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") if (!.Call(ChasOpenMP)) { - packageStartupMessage(domain="R-data.table", "**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) + packageStartupMessagef(domain="R-data.table", "**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) if (Sys.info()["sysname"] == "Darwin") - packageStartupMessage(domain="R-data.table", "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.\n**********") + packageStartupMessagef(domain="R-data.table", "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.\n**********") else packageStartupMessagef(domain="R-data.table", "This is %s. This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) } diff --git a/R/onLoad.R b/R/onLoad.R index 0dcfee82a8..9bf65c7907 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -9,7 +9,7 @@ val = getOption("datatable.nomatch") if (is.null(val)) return(invisible()) # not set is ideal (it's no longer set in .onLoad) if (identical(val, NA) || identical(val, NA_integer_)) return(invisible()) # set to default NA is ok for now; in future possible message/warning asking to remove - message("The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option.") + messagef("The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. 
To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option.") .pkg.store$.unsafe.done = TRUE } @@ -94,14 +94,14 @@ } if (!is.null(getOption("datatable.old.bywithoutby"))) - warning(domain="R-data.table", "Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") + warningf(domain="R-data.table", "Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") if (!is.null(getOption("datatable.old.unique.by.key"))) - warning(domain="R-data.table", "Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") + warningf(domain="R-data.table", "Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") # Test R behaviour that changed in v3.1 and is now depended on x = 1L:3L y = list(x) - if (address(x) != address(y[[1L]])) stop(domain="R-data.table", "Unexpected base R behaviour: list(x) has copied x") + if (address(x) != address(y[[1L]])) stopf(domain="R-data.table", "Unexpected base R behaviour: list(x) has copied x") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -109,7 +109,7 @@ names(DF) = c("A","B") add3 = address(DF$A) add4 = address(DF$B) - if (add1!=add3 || add2!=add4) stop(domain="R-data.table", "Unexpected base R behaviour: names<- has copied column contents") + if (add1!=add3 || add2!=add4) stopf(domain="R-data.table", "Unexpected base R behaviour: names<- has copied column contents") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -119,10 +119,10 @@ add4 = address(DF$a) add5 = address(DF$b) add6 = address(DF) - if (add2==add5) stop(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") - if (add1!=add4) stop(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") + if (add2==add5) stopf(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") + if (add1!=add4) stopf(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") - if (add3==add6) warning(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") + if (add3==add6) warningf(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") # R could feasibly in future not copy DF's vecsxp in this case. If that changes in R, we'd like to know via the warning # because tests will likely break too. The warning will quickly tell R-core and us why, so we can then update. @@ -131,11 +131,11 @@ invisible() } -getRversion = function(...) stop("Reminder to data.table developers: don't use getRversion() internally. Add a behaviour test to .onLoad instead.") +getRversion = function(...) stopf("Reminder to data.table developers: don't use getRversion() internally. Add a behaviour test to .onLoad instead.") # notranslate # 1) using getRversion() wasted time when R3.0.3beta was released without the changes we expected in getRversion()>"3.0.2". # 2) R-devel and ourselves may wish to tinker with R-devel, turning on and off features in the same version number. 
So it's better if data.table doesn't hard code expectations into the version number. # 3) The discipline of adding a feature test here helps fully understand the change. -# 4) Defining getRversion with a stop() here helps prevent new switches on getRversion() being added in future. Easily circumvented but the point is to issue the message above. +# 4) Defining getRversion with a stopf() here helps prevent new switches on getRversion() being added in future. Easily circumvented but the point is to issue the message above. .onUnload = function(libpath) { # fix for #474. the shared object name is different from package name diff --git a/R/openmp-utils.R b/R/openmp-utils.R index 1d21937b5f..f19120724b 100644 --- a/R/openmp-utils.R +++ b/R/openmp-utils.R @@ -1,6 +1,6 @@ setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, throttle=NULL) { if (!missing(percent)) { - if (!missing(threads)) stop("Provide either threads= or percent= but not both") + if (!missing(threads)) stopf("Provide either threads= or percent= but not both") if (length(percent)!=1) stopf("percent= is provided but is length %d", length(percent)) percent=as.integer(percent) if (is.na(percent) || percent<2L || percent>100L) stopf("percent==%d but should be a number between 2 and 100", percent) diff --git a/R/print.data.table.R b/R/print.data.table.R index e935c01da4..3f19cdc391 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -14,11 +14,11 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # class - should column class be printed underneath column name? (FALSE) # trunc.cols - should only the columns be printed that can fit in the console? (FALSE) if (!col.names %chin% c("auto", "top", "none")) - stop("Valid options for col.names are 'auto', 'top', and 'none'") + stopf("Valid options for col.names are 'auto', 'top', and 'none'") if (length(trunc.cols) != 1L || !is.logical(trunc.cols) || is.na(trunc.cols)) - stop("Valid options for trunc.cols are TRUE and FALSE") + stopf("Valid options for trunc.cols are TRUE and FALSE") if (col.names == "none" && class) - warning("Column classes will be suppressed when col.names is 'none'") + warningf("Column classes will be suppressed when col.names is 'none'") if (!shouldPrint(x)) { # := in [.data.table sets .global$print=address(x) to suppress the next print i.e., like <- does. See FAQ 2.22 and README item in v1.9.5 # The issue is distinguishing "> DT" (after a previous := in a function) from "> DT[,foo:=1]". To print.data.table(), there @@ -142,7 +142,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), format.data.table = function (x, ..., justify="none", timezone = FALSE) { if (is.atomic(x) && !is.null(x)) { - stop("Internal structure doesn't seem to be a list. Possibly corrupt data.table.") + stopf("Internal structure doesn't seem to be a list. 
Possibly corrupt data.table.") } format.item = function(x) { if (is.null(x)) # NULL item in a list column @@ -234,9 +234,9 @@ toprint_subset = function(x, cols_to_print) { trunc_cols_message = function(not_printed, abbs, class, col.names){ n = length(not_printed) if (class && col.names != "none") classes = paste0(" ", tail(abbs, n)) else classes = "" - cat(sprintf( - ngettext(n, "%d variable not shown: %s\n", "%d variables not shown: %s\n"), + catf( + "%d variable(s) not shown: %s\n", n, brackify(paste0(not_printed, classes)) - )) + ) } diff --git a/R/programming.R b/R/programming.R index c0b9574a9a..9050a7ee25 100644 --- a/R/programming.R +++ b/R/programming.R @@ -8,7 +8,7 @@ rm.AsIs = function(x) { } list2lang = function(x) { if (!is.list(x)) - stop("'x' must be a list") + stopf("'x' must be a list") if (is.AsIs(x)) return(rm.AsIs(x)) asis = vapply(x, is.AsIs, FALSE) @@ -34,7 +34,7 @@ list2lang = function(x) { } enlist = function(x) { if (!is.list(x)) - stop("'x' must be a list") + stopf("'x' must be a list") if (is.AsIs(x)) return(rm.AsIs(x)) as.call(c(quote(list), list2lang(x))) @@ -44,26 +44,26 @@ substitute2 = function(expr, env) { if (missing(expr)) return(substitute()) if (missing(env)) { - stop("'env' must not be missing") + stopf("'env' must not be missing") } else if (is.null(env)) { # null is fine, will be escaped few lines below } else if (is.environment(env)) { env = as.list(env, all.names=TRUE, sorted=TRUE) } else if (!is.list(env)) { - stop("'env' must be a list or an environment") + stopf("'env' must be a list or an environment") } if (!length(env)) { return(substitute(expr)) } env.names = names(env) if (is.null(env.names)) { - stop("'env' argument does not have names") + stopf("'env' argument does not have names") } else if (!all(nzchar(env.names))) { - stop("'env' argument has zero char names") + stopf("'env' argument has zero char names") } else if (anyNA(env.names)) { - stop("'env' argument has NA names") + stopf("'env' argument has NA names") } else if (anyDuplicated(env.names)) { - stop("'env' argument has duplicated names") + stopf("'env' argument has duplicated names") } # character to name/symbol, and list to list call env = list2lang(env) diff --git a/R/setkey.R b/R/setkey.R index cca6361cce..3bd3f782c4 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -1,6 +1,6 @@ setkey = function(x, ..., verbose=getOption("datatable.verbose"), physical=TRUE) { - if (is.character(x)) stop("x may no longer be the character name of the data.table. The possibility was undocumented and has been removed.") + if (is.character(x)) stopf("x may no longer be the character name of the data.table. The possibility was undocumented and has been removed.") cols = as.character(substitute(list(...))[-1L]) if (!length(cols)) { cols=colnames(x) } else if (identical(cols,"NULL")) cols=NULL @@ -20,7 +20,7 @@ setindexv = function(x, cols, verbose=getOption("datatable.verbose")) { # upgrade to error after Mar 2020. Has already been warning since 2012, and stronger warning in Mar 2019 (note in news for 1.12.2); #3399 "key<-" = function(x,value) { - warning("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.") + warningf("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.") setkeyv(x,value) # The returned value here from key<- is then copied by R before assigning to x, it seems. 
That's # why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<- @@ -42,16 +42,16 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU oldverbose = options(datatable.verbose=verbose) on.exit(options(oldverbose)) } - if (!is.data.table(x)) stop("x is not a data.table") - if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.") - if (physical && .Call(C_islocked, x)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") + if (!is.data.table(x)) stopf("x is not a data.table") + if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?setkey.") + if (physical && .Call(C_islocked, x)) stopf("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") if (!length(cols)) { - warning("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.") + warningf("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.") setattr(x,"sorted",NULL) return(invisible(x)) } - if (identical(cols,"")) stop("cols is the empty string. Use NULL to remove the key.") - if (!all(nzchar(cols))) stop("cols contains some blanks.") + if (identical(cols,"")) stopf("cols is the empty string. Use NULL to remove the key.") + if (!all(nzchar(cols))) stopf("cols contains some blanks.") cols = gsub("`", "", cols, fixed = TRUE) miss = !(cols %chin% colnames(x)) if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss])) @@ -76,12 +76,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU return(invisible(x)) } - if (".xi" %chin% names(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") + if (".xi" %chin% names(x)) stopf("x contains a column called '.xi'. Conflicts with internal use by data.table.") for (i in cols) { .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column if (!typeof(.xi) %chin% ORDERING_TYPES) stopf("Column '%s' is type '%s' which is not supported as a key column type, currently.", i, typeof(.xi)) } - if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov + if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov newkey = paste0(cols, collapse="__") if (!any(indices(x) == newkey)) { @@ -142,7 +142,7 @@ haskey = function(x) !is.null(key(x)) setreordervec = function(x, order) .Call(Creorder, x, order) # sort = sort.int = sort.list = order = is.unsorted = function(...) -# stop("Should never be called by data.table internals. Use is.sorted() on vectors, or forder() for lists and vectors.") +# stopf("Should never be called by data.table internals. Use is.sorted() on vectors, or forder() for lists and vectors.") # Nice idea, but users might use these in i or j e.g. blocking order caused tests 304 to fail. # Maybe just a grep through *.R for use of these function internally would be better (TO DO). 
@@ -160,7 +160,7 @@ is.sorted = function(x, by=NULL) { if (missing(by)) by = seq_along(x) # wouldn't make sense when x is a vector; hence by=seq_along(x) is not the argument default if (is.character(by)) by = chmatch(by, names(x)) } else { - if (!missing(by)) stop("x is vector but 'by' is supplied") + if (!missing(by)) stopf("x is vector but 'by' is supplied") } .Call(Cissorted, x, as.integer(by)) # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) @@ -170,7 +170,7 @@ ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) { if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), - if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense") + if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL } else { if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L)) @@ -202,12 +202,12 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) } x = eval(sub[[2L]], parent.frame(), parent.frame()) if (is.list(x)) { - if (length(x)==0L && is.data.frame(x)) stop("Attempting to order a 0-column data.table or data.frame.") + if (length(x)==0L && is.data.frame(x)) stopf("Attempting to order a 0-column data.table or data.frame.") sub[2L] = NULL # change list(DT, ...) to list(...) if (length(sub)==1L) { data = x } else { - if (!is.data.frame(x)) stop("The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.") + if (!is.data.frame(x)) stopf("The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.") asc = asc[-1L] data = eval(sub, x, parent.frame()) } @@ -224,7 +224,7 @@ fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FAL { containsNAs = FALSE if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) { - if (internal) stop("Internal code should not be being called on type double") + if (internal) stopf("Internal code should not be being called on type double") return(.Call(Cfsort, x, verbose)) } else { @@ -232,9 +232,9 @@ fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FAL # The only places internally we use fsort internally (3 calls, all on integer) have had internal=TRUE added for now. # TODO: implement integer and character in Cfsort and remove this branch and warning if (!internal){ - if (typeof(x)!="double") warning("Input is not a vector of type double. New parallel sort has only been done for double vectors so far. Using one thread.") - if (decreasing) warning("New parallel sort has not been implemented for decreasing=TRUE so far. Using one thread.") - if (containsNAs) warning("New parallel sort has not been implemented for vectors containing NA values so far. Using one thread.") + if (typeof(x)!="double") warningf("Input is not a vector of type double. New parallel sort has only been done for double vectors so far. Using one thread.") + if (decreasing) warningf("New parallel sort has not been implemented for decreasing=TRUE so far. Using one thread.") + if (containsNAs) warningf("New parallel sort has not been implemented for vectors containing NA values so far. 
Using one thread.") } orderArg = if (decreasing) -1 else 1 o = forderv(x, order=orderArg, na.last=na.last) @@ -246,7 +246,7 @@ setorder = function(x, ..., na.last=FALSE) # na.last=FALSE here, to be consistent with data.table's default # as opposed to DT[order(.)] where na.last=TRUE, to be consistent with base { - if (!is.data.frame(x)) stop("x must be a data.frame or data.table") + if (!is.data.frame(x)) stopf("x must be a data.frame or data.table") cols = substitute(list(...))[-1L] if (identical(as.character(cols),"NULL")) return(x) if (length(cols)) { @@ -272,25 +272,25 @@ setorder = function(x, ..., na.last=FALSE) setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) { if (is.null(cols)) return(x) - if (!is.data.frame(x)) stop("x must be a data.frame or data.table") + if (!is.data.frame(x)) stopf("x must be a data.frame or data.table") na.last = as.logical(na.last) - if (is.na(na.last) || !length(na.last)) stop('na.last must be logical TRUE/FALSE') - if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setorder.") + if (is.na(na.last) || !length(na.last)) stopf('na.last must be logical TRUE/FALSE') + if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?setorder.") if (!length(cols)) { - warning("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.") + warningf("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.") return(x) } - if (!all(nzchar(cols))) stop("cols contains some blanks.") # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well + if (!all(nzchar(cols))) stopf("cols contains some blanks.") # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well # remove backticks from cols cols = gsub("`", "", cols, fixed = TRUE) miss = !(cols %chin% colnames(x)) if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss])) - if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") + if (".xi" %chin% colnames(x)) stopf("x contains a column called '.xi'. Conflicts with internal use by data.table.") for (i in cols) { .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column if (!typeof(.xi) %chin% ORDERING_TYPES) stopf("Column '%s' is type '%s' which is not supported for ordering currently.", i, typeof(.xi)) } - if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov + if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 
'cols' should be character at this point in setkey; please report.") # nocov o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last) if (length(o)) { diff --git a/R/setops.R b/R/setops.R index 7f21603b59..042d0c5f93 100644 --- a/R/setops.R +++ b/R/setops.R @@ -1,12 +1,12 @@ # setdiff for data.tables, internal at the moment #547, used in not-join setdiff_ = function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE) { - if (!is.data.table(x) || !is.data.table(y)) stop("x and y must both be data.tables") + if (!is.data.table(x) || !is.data.table(y)) stopf("x and y must both be data.tables") # !ncol redundant since all 0-column data.tables have 0 rows if (!nrow(x)) return(x) by.x = colnamesInt(x, by.x, check_dups=TRUE) if (!nrow(y)) return(unique(x, by=by.x)) by.y = colnamesInt(y, by.y, check_dups=TRUE) - if (length(by.x) != length(by.y)) stop("length(by.x) != length(by.y)") + if (length(by.x) != length(by.y)) stopf("length(by.x) != length(by.y)") # factor in x should've factor/character in y, and viceversa for (a in seq_along(by.x)) { lc = by.y[a] @@ -36,13 +36,13 @@ funique = function(x) { } .set_ops_arg_check = function(x, y, all, .seqn = FALSE, block_list = TRUE) { - if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one") - if (!is.data.table(x) || !is.data.table(y)) stop("x and y must both be data.tables") - if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have the same column names") - if (!identical(names(x), names(y))) stop("x and y must have the same column order") + if (!is.logical(all) || length(all) != 1L) stopf("argument 'all' should be logical of length one") + if (!is.data.table(x) || !is.data.table(y)) stopf("x and y must both be data.tables") + if (!identical(sort(names(x)), sort(names(y)))) stopf("x and y must have the same column names") + if (!identical(names(x), names(y))) stopf("x and y must have the same column order") bad_types = c("raw", "complex", if (block_list) "list") found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof)) - if (any(found)) stop(domain=NA, sprintf(ngettext(sum(found), "unsupported column type found in x or y: %s", "unsupported column types found in x or y: %s"), brackify(bad_types[found]))) + if (any(found)) stopf("unsupported column type(s) found in x or y: %s", brackify(bad_types[found])) super = function(x) { # allow character->factor and integer->numeric because from v1.12.4 i's type is retained by joins, #3820 ans = class(x)[1L] @@ -52,7 +52,7 @@ funique = function(x) { w = which.first(sx!=sy) stopf("Item %d of x is '%s' but the corresponding item of y is '%s'.", w, class(x[[w]])[1L], class(y[[w]])[1L]) } - if (.seqn && ".seqn" %chin% names(x)) stop("None of the datasets should contain a column named '.seqn'") + if (.seqn && ".seqn" %chin% names(x)) stopf("None of the datasets should contain a column named '.seqn'") } fintersect = function(x, y, all=FALSE) { @@ -141,7 +141,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu targetTypes = vapply_1c(target, squashClass) currentTypes = vapply_1c(current, squashClass) if (length(targetTypes) != length(currentTypes)) - stop("Internal error: ncol(current)==ncol(target) was checked above") # nocov + stopf("Internal error: ncol(current)==ncol(target) was checked above") # nocov if (any( d<-(targetTypes != currentTypes))) { w = head(which(d),3L) return(paste0("Datasets have different column classes. 
First 3: ",paste( @@ -185,7 +185,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu if (ignore.row.order) { if (".seqn" %chin% names(target)) - stop("None of the datasets to compare should contain a column named '.seqn'") + stopf("None of the datasets to compare should contain a column named '.seqn'") bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list")) if (any(bad.type)) stopf("Datasets to compare with 'ignore.row.order' must not have unsupported column types: %s", brackify(names(bad.type)[bad.type])) @@ -203,7 +203,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu else if (!target_dup && current_dup) msg = c(msg, "Dataset 'current' has duplicate rows while 'target' doesn't") else { # both if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error - stop("Duplicate rows in datasets, numeric columns and ignore.row.order cannot be used with non 0 tolerance argument") + stopf("Duplicate rows in datasets, numeric columns and ignore.row.order cannot be used with non 0 tolerance argument") msg = c(msg, "Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0") tolerance = 0 } @@ -217,7 +217,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu # handling 'tolerance' for factor cols - those `msg` will be returned only when equality with tolerance will fail if (any(vapply_1b(target,is.factor)) && !identical(tolerance, 0)) { if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error - stop("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") + stopf("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") msg = c(msg, "Using factor columns together together with ignore.row.order, this force 'tolerance' argument to 0") tolerance = 0 } @@ -261,7 +261,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu x = target[[i]] y = current[[i]] if (xor(is.factor(x),is.factor(y))) - stop("Internal error: factor type mismatch should have been caught earlier") # nocov + stopf("Internal error: factor type mismatch should have been caught earlier") # nocov cols.r = TRUE if (is.factor(x)) { if (!identical(levels(x),levels(y))) { diff --git a/R/test.data.table.R b/R/test.data.table.R index d28319aa78..a8a19522f4 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -3,7 +3,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (exists("test.data.table", .GlobalEnv,inherits=FALSE)) { # package developer # nocov start - if ("package:data.table" %chin% search()) stop("data.table package is loaded. Unload or start a fresh R session.") + if ("package:data.table" %chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") subdir = file.path("inst","tests") # nocov end @@ -16,7 +16,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F stopifnot(is.character(script), length(script)==1L, !is.na(script), nzchar(script)) if (!grepl(".Rraw$", script)) - stop("script must end with '.Rraw'. If a file ending '.Rraw.bz2' exists, that will be found and used.") # nocov + stopf("script must end with '.Rraw'. 
If a file ending '.Rraw.bz2' exists, that will be found and used.") # nocov if (identical(script,"*.Rraw")) { # nocov start @@ -160,16 +160,11 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ntest = env$ntest if (nfail > 0L) { # nocov start - # domain=NA since it's already translated by then - stop(domain = NA, sprintf( - ngettext( - nfail, - "%d error out of %d. Search %s for test number %s", - "%d errors out of %d. Search %s for test numbers %s" - ), + stopf( + "%d error(s) out of %d. Search %s for test number(s) %s", nfail, ntest, names(fn), toString(env$whichfail) - )) - # important to stop() here, so that 'R CMD check' fails + ) + # important to stopf() here, so that 'R CMD check' fails # nocov end } @@ -204,7 +199,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # graphics::par(p) # grDevices::dev.off() # } else { - # warning("test.data.table runs with memory testing but did not collect any memory statistics.") + # warningf("test.data.table runs with memory testing but did not collect any memory statistics.") # } #} #if (memtest<-get("memtest", envir=env)) memtest.plot(get("inittime", envir=env)) diff --git a/R/timetaken.R b/R/timetaken.R index 453ad2234d..daa52c9f1f 100644 --- a/R/timetaken.R +++ b/R/timetaken.R @@ -1,6 +1,6 @@ timetaken = function(started.at) { - if (!inherits(started.at,"proc_time")) stop("Use started.at=proc.time() not Sys.time() (POSIXt and slow)") # nocov + if (!inherits(started.at,"proc_time")) stopf("Use started.at=proc.time() not Sys.time() (POSIXt and slow)") # nocov format = function(secs) { if (secs > 60.0) { secs = as.integer(secs) diff --git a/R/transpose.R b/R/transpose.R index 61dc56abb9..a326863e70 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -25,7 +25,7 @@ transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { if (!isTRUEorFALSE(names) && !is.character(names)) - stop("'names' must be TRUE/FALSE or a character vector.") + stopf("'names' must be TRUE/FALSE or a character vector.") ans = transpose(strsplit(as.character(x), ...), fill=fill, ignore.empty=FALSE) if (!missing(keep)) { keep = suppressWarnings(as.integer(keep)) diff --git a/R/uniqlist.R b/R/uniqlist.R index b0c1c9fdd0..2a610ab1a7 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -9,7 +9,7 @@ uniqlist = function (l, order = -1L) # TRUE has the last in a sequence of dups FALSE (so you can keep the last if that's required) # l = list(...) if (!is.list(l)) - stop("l not type list") + stopf("l not type list") if (!length(l)) return(list(0L)) ans = .Call(Cuniqlist, l, as.integer(order)) ans diff --git a/R/utils.R b/R/utils.R index 3a180dc951..b4ae4c9ee6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -15,10 +15,10 @@ isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x) allNA = function(x) .Call(C_allNAR, x) # helper for nan argument (e.g. 
nafill): TRUE -> treat NaN as NA nan_is_na = function(x) { - if (length(x) != 1L) stop("Argument 'nan' must be length 1") + if (length(x) != 1L) stopf("Argument 'nan' must be length 1") if (identical(x, NA) || identical(x, NA_real_)) return(TRUE) if (identical(x, NaN)) return(FALSE) - stop("Argument 'nan' must be NA or NaN") + stopf("Argument 'nan' must be NA or NaN") } if (base::getRversion() < "3.2.0") { # Apr 2015 @@ -36,7 +36,7 @@ if (!exists('endsWith', 'package:base', inherits=FALSE)) { which.first = function(x) { if (!is.logical(x)) { - stop("x not boolean") + stopf("x not boolean") } match(TRUE, x) } @@ -45,7 +45,7 @@ which.first = function(x) which.last = function(x) { if (!is.logical(x)) { - stop("x not boolean") + stopf("x not boolean") } length(x) - match(TRUE, rev(x)) + 1L } @@ -56,7 +56,7 @@ require_bit64_if_needed = function(DT) { # nocov start # a test was attempted to cover the requireNamespace() by using unloadNamespace() first, but that fails when nanotime is loaded because nanotime also uses bit64 if (!requireNamespace("bit64",quietly=TRUE)) { - warning("Some columns are type 'integer64' but package bit64 is not installed. Those columns will print as strange looking floating point data. There is no need to reload the data. Simply install.packages('bit64') to obtain the integer64 print method and print the data again.") + warningf("Some columns are type 'integer64' but package bit64 is not installed. Those columns will print as strange looking floating point data. There is no need to reload the data. Simply install.packages('bit64') to obtain the integer64 print method and print the data again.") } # nocov end } diff --git a/R/xts.R b/R/xts.R index 0a89bf3892..005f0f6024 100644 --- a/R/xts.R +++ b/R/xts.R @@ -1,7 +1,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { stopifnot(requireNamespace("xts"), !missing(x), xts::is.xts(x)) - if (length(keep.rownames) != 1L) stop("keep.rownames must be length 1") - if (is.na(keep.rownames)) stop("keep.rownames must not be NA") + if (length(keep.rownames) != 1L) stopf("keep.rownames must be length 1") + if (is.na(keep.rownames)) stopf("keep.rownames must not be NA") # as.data.frame.xts will handle copying, and # the error check above ensures as.data.frame.xts is applied r = setDT(as.data.frame(x, row.names=NULL)) @@ -17,7 +17,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { as.xts.data.table = function(x, ...) 
{ stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x)) - if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") + if (!xts::is.timeBased(x[[1L]])) stopf("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index if (!all(colsNumeric)) warningf("Following columns are not numeric and will be omitted: %s", brackify(names(colsNumeric)[!colsNumeric])) r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 05fea1b7c6..1a890b2566 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8395,13 +8395,13 @@ DT0 = data.table(NULL) DT1 = data.table(a=1) test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a")) test(1601.2, merge(DT1, DT0, by="a"), - warning="You are trying to join data.tables where 'y' has 0 columns.", + warning="Input data.table 'y' has no columns.", error="Elements listed in `by`") test(1601.3, merge(DT0, DT1, by="a"), - warning="You are trying to join data.tables where 'x' has 0 columns.", + warning="Input data.table 'x' has no columns.", error="Elements listed in `by`") test(1601.4, merge(DT0, DT0, by="a"), - warning="You are trying to join data.tables where 'x' and 'y' have 0 columns.", + warning="Neither of the input data.tables to join have columns.", error="Elements listed in `by`") # fix for #1549 @@ -8845,26 +8845,26 @@ test(1626.45, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,5)], all=TRUE)), 0L) dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = lapply(1:4, function(x) new.env())) x = dt[c(2:4,2L,2L)] y = dt[c(1:3,2L)] -test(1626.46, fintersect(x, y), error = "unsupported column type found in x or y: [list]") -test(1626.47, fintersect(x, y, all=TRUE), error = "unsupported column type found in x or y: [list]") -test(1626.48, fsetdiff(x, y), error = "unsupported column type found in x or y: [list]") -test(1626.49, fsetdiff(x, y, all=TRUE), error = "unsupported column type found in x or y: [list]") -test(1626.50, funion(x, y), error = "unsupported column type found in x or y: [list]") +test(1626.46, fintersect(x, y), error = "unsupported column type(s) found in x or y: [list]") +test(1626.47, fintersect(x, y, all=TRUE), error = "unsupported column type(s) found in x or y: [list]") +test(1626.48, fsetdiff(x, y), error = "unsupported column type(s) found in x or y: [list]") +test(1626.49, fsetdiff(x, y, all=TRUE), error = "unsupported column type(s) found in x or y: [list]") +test(1626.50, funion(x, y), error = "unsupported column type(s) found in x or y: [list]") test(1626.51, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)]) -test(1626.52, fsetequal(x, y), error = "unsupported column type found in x or y: [list]") -test(1626.53, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "unsupported column type found in x or y: [list]") +test(1626.52, fsetequal(x, y), error = "unsupported column type(s) found in x or y: [list]") +test(1626.53, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "unsupported column type(s) found in x or y: [list]") # unsupported type in set-ops: complex, raw dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = as.complex(1:4), V4 = as.raw(1:4), V5 = lapply(1:4, function(x) NULL)) x = dt[c(2:4,2L,2L)] y = dt[c(1:3,2L)] -test(1626.54, fintersect(x, y), error = "unsupported 
column types found in x or y: [raw, complex, list]") -test(1626.55, fintersect(x, y, all=TRUE), error = "unsupported column types found in x or y: [raw, complex, list]") -test(1626.56, fsetdiff(x, y), error = "unsupported column types found in x or y: [raw, complex, list]") -test(1626.57, fsetdiff(x, y, all=TRUE), error = "unsupported column types found in x or y: [raw, complex, list]") -test(1626.58, funion(x, y), error = "unsupported column types found in x or y: [raw, complex, list]") -test(1626.59, funion(x, y, all=TRUE), error = "unsupported column types found in x or y: [raw, complex]") # no 'list' here which is supported for `all=TRUE` -test(1626.60, fsetequal(x, y), error = "unsupported column types found in x or y: [raw, complex, list]") -test(1626.61, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "unsupported column types found in x or y: [raw, complex, list]") +test(1626.54, fintersect(x, y), error = "unsupported column type(s) found in x or y: [raw, complex, list]") +test(1626.55, fintersect(x, y, all=TRUE), error = "unsupported column type(s) found in x or y: [raw, complex, list]") +test(1626.56, fsetdiff(x, y), error = "unsupported column type(s) found in x or y: [raw, complex, list]") +test(1626.57, fsetdiff(x, y, all=TRUE), error = "unsupported column type(s) found in x or y: [raw, complex, list]") +test(1626.58, funion(x, y), error = "unsupported column type(s) found in x or y: [raw, complex, list]") +test(1626.59, funion(x, y, all=TRUE), error = "unsupported column type(s) found in x or y: [raw, complex]") # no 'list' here which is supported for `all=TRUE` +test(1626.60, fsetequal(x, y), error = "unsupported column type(s) found in x or y: [raw, complex, list]") +test(1626.61, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "unsupported column type(s) found in x or y: [raw, complex, list]") # supported types multi column test dt = data.table( V1 = 1:4, @@ -13310,9 +13310,9 @@ test(1952.2, d1[a==2, 2, which=TRUE], error="which==TRUE.*but j is also supplied # 3106 -- melt patterns don't match any columns (and more coverage tests) DT = data.table(id = 1:3, a1 = rnorm(3), a2 = rnorm(3)) test(1953.1, melt(DT, id.vars = 'id', measure.vars = patterns(a = 'a', b = 'b')), - error = 'Pattern not found') + error = 'Pattern(s) not found') test(1953.2, melt(DT, id.vars = 'id', measure.vars = patterns(a = 'a', b = 'b', c = 'c')), - error = 'Patterns not found') + error = 'Pattern(s) not found') test(1953.3, melt(DT, id.vars = 'id', measure.vars = patterns(1L)), error = 'Input patterns must be of type character') setDF(DT) @@ -16785,7 +16785,7 @@ DT = data.table(a = "aaaaaaaaaaaaa", d = "ddddddddddddd") test(2125.01, capture.output(print(DT, trunc.cols=TRUE))[3L], - "2 variables not shown: [c, d]") + "2 variable(s) not shown: [c, d]") # Printing with dots DT = data.table(a = vector("integer", 102L), b = "bbbbbbbbbbbbb", @@ -16804,7 +16804,7 @@ test(2125.02, capture.output(print(DT, trunc.cols=TRUE)), "100: 0 bbbbbbbbbbbbb ccccccccccccc", "101: 0 bbbbbbbbbbbbb ccccccccccccc", "102: 0 bbbbbbbbbbbbb ccccccccccccc", - "1 variable not shown: [d]")) + "1 variable(s) not shown: [d]")) test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)), c(" a b c", " 0 bbbbbbbbbbbbb ccccccccccccc", @@ -16818,22 +16818,22 @@ test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)), " 0 bbbbbbbbbbbbb ccccccccccccc", " 0 bbbbbbbbbbbbb ccccccccccccc", " 0 bbbbbbbbbbbbb ccccccccccccc", - "1 variable not shown: [d]" )) + "1 variable(s) not shown: [d]" )) # also testing 
#4266 -- getting width of row #s register right # TODO: understand why 2 variables truncated here. a,b,c combined have width # _exactly_ 40, but still wraps. If we set options(width=41) it won't truncate. # seems to be an issue with print.default. test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14L], - "2 variables not shown: [c , d ]") + "2 variable(s) not shown: [c , d ]") test(2125.05, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, row.names=FALSE))[c(1,14)], c(" a b c", - "1 variable not shown: [d ]" )) + "1 variable(s) not shown: [d ]" )) test(2125.06, capture.output(print(DT, trunc.cols=TRUE, col.names="none"))[c(1,12)], c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc", - "1 variable not shown: [d]" )) + "1 variable(s) not shown: [d]" )) test(2125.07, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, col.names="none"))[c(1,13)], c(" 1: 0 bbbbbbbbbbbbb", - "2 variables not shown: [c, d]" ), + "2 variable(s) not shown: [c, d]" ), warning = "Column classes will be suppressed when col.names is 'none'") options("width" = 20) DT = data.table(a = vector("integer", 2), @@ -16844,16 +16844,16 @@ test(2125.08, capture.output(print(DT, trunc.cols=TRUE)), c(" a b", "1: 0 bbbbbbbbbbbbb", "2: 0 bbbbbbbbbbbbb", - "2 variables not shown: [c, d]")) + "2 variable(s) not shown: [c, d]")) options("width" = 10) DT = data.table(a = "aaaaaaaaaaaaa", b = "bbbbbbbbbbbbb", c = "ccccccccccccc", d = "ddddddddddddd") test(2125.09, capture.output(print(DT, trunc.cols=TRUE)), - "4 variables not shown: [a, b, c, d]") + "4 variable(s) not shown: [a, b, c, d]") test(2125.10, capture.output(print(DT, trunc.cols=TRUE, class=TRUE)), - "4 variables not shown: [a , b , c , d ]") + "4 variable(s) not shown: [a , b , c , d ]") options(old_width) # segfault when i is NULL or zero-column, #4060 From 9bfae7df68cac320c634595251ede92d1ec3f1c1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 8 Jul 2021 09:00:57 -0600 Subject: [PATCH 307/588] follow-up to #5056: Rv vs RV caught thanks to 'no visible binding for global variable 'Rv' --- R/onLoad.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/onLoad.R b/R/onLoad.R index 9bf65c7907..d4be38f6ed 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -26,7 +26,7 @@ dll = if (.Platform$OS.type=="windows") "dll" else "so" # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. - stopf(domain="R-data.table", "The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, Rv, toupper(dll)) + stopf(domain="R-data.table", "The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. 
The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, RV, toupper(dll)) } builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { From 9eee1e06cfaec76b7d85f75e156ae28daca7b69d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 8 Jul 2021 09:30:40 -0600 Subject: [PATCH 308/588] follow-up #5067: R CMD check displayed difference to .Rout.save but still passed fully OK even under --as-cran (oddly) --- tests/knitr.R | 2 +- tests/knitr.Rout.save | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/knitr.R b/tests/knitr.R index 0940f429ed..eb9bfe1ae8 100644 --- a/tests/knitr.R +++ b/tests/knitr.R @@ -2,7 +2,7 @@ if (suppressPackageStartupMessages(requireNamespace("knitr", quietly = TRUE))) { require(knitr) knit("knitr.Rmd", quiet=TRUE) cat(readLines("knitr.md"), sep="\n") - file.remove("knitr.md") + invisible(file.remove("knitr.md")) } else { cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") } diff --git a/tests/knitr.Rout.save b/tests/knitr.Rout.save index 252480db76..f97eeb4a4f 100644 --- a/tests/knitr.Rout.save +++ b/tests/knitr.Rout.save @@ -19,6 +19,7 @@ Type 'q()' to quit R. + require(knitr) + knit("knitr.Rmd", quiet=TRUE) + cat(readLines("knitr.md"), sep="\n") ++ invisible(file.remove("knitr.md")) + } else { + cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") + } From dc81a5aed4fcf42851a10d893a8fbdd0cf8c9c04 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 8 Jul 2021 23:09:53 +0200 Subject: [PATCH 309/588] proper handling list-like objects in env arg #5057 (#5058) --- R/programming.R | 7 +++++-- inst/tests/programming.Rraw | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/programming.R b/R/programming.R index 9050a7ee25..da97e785ce 100644 --- a/R/programming.R +++ b/R/programming.R @@ -6,6 +6,9 @@ rm.AsIs = function(x) { oldClass(x) = cl[cl!="AsIs"] x } +only.list = function(x) { + identical(class(x), "list") +} list2lang = function(x) { if (!is.list(x)) stopf("'x' must be a list") @@ -21,7 +24,7 @@ list2lang = function(x) { x[to.name] = lapply(x[to.name], as.name) } if (isTRUE(getOption("datatable.enlist", TRUE))) { ## recursively enlist for nested lists, see note section in substitute2 manual - islt = vapply(x, is.list, FALSE) + islt = vapply(x, only.list, FALSE) #5057 nested DT that inherits from a list must not be turned into list call to.enlist = !asis & islt if (any(to.enlist)) { x[to.enlist] = lapply(x[to.enlist], enlist) @@ -49,7 +52,7 @@ substitute2 = function(expr, env) { # null is fine, will be escaped few lines below } else if (is.environment(env)) { env = as.list(env, all.names=TRUE, sorted=TRUE) - } else if (!is.list(env)) { + } else if (!only.list(env) && !(is.AsIs(env) && only.list(rm.AsIs(env)))) { stopf("'env' must be a list or an environment") } if (!length(env)) { diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index f2026259ce..bed7bf0db4 100644 --- 
a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -598,3 +598,8 @@ dt_fill(nadt, c("x1", "x2", "x3"), is.na, 0) test(103.01, nadt, data.table(x1 = c(1, 2, 0, Inf), x2 = c(2, 0, 3, Inf), x3 = c(0, 1, 2, 0))) dt_fill(nadt, c("x1", "x2", "x3"), is.infinite, 0) test(103.02, nadt, data.table(x1 = c(1, 2, 0, 0), x2 = c(2, 0, 3, 0), x3 = c(0, 1, 2, 0))) + +# providing data.table with character column to env = list(...) gives error? #5057 +test(201.1, substitute2(dt, env=list(dt = data.table(a=1:9, b=1:9))), data.table(a=1:9, b=1:9)) +test(201.2, substitute2(dt, env=list(dt = data.table(a=1:9, b=as.character(1:9)))), data.table(a=1:9, b=as.character(1:9))) +test(201.3, substitute2(dt, env=list(dt = data.table(a=1:2, b=as.character(1:2)))), data.table(a=1:2, b=as.character(1:2))) From 1780aae1edd36bb80a64d570efc5220c1a091cf4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 15 Jul 2021 12:30:21 -0600 Subject: [PATCH 310/588] .dev-only: ask=FALSE added to 3rd update.packages() in revdep.R --- .dev/revdep.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 5d80c32612..5dd7612986 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -130,7 +130,7 @@ for (p in deps) { } } cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), "\n") -update.packages(checkBuilt=TRUE) +update.packages(checkBuilt=TRUE, ask=FALSE) cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") cat("Previously installed packages were built using:\n") x = installed.packages() From 93d2feaf0532a92611b83051f1c2365cd8d5fafe Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 15 Jul 2021 15:14:19 -0600 Subject: [PATCH 311/588] .dev-only: log() now displays fail.log for convenience --- .dev/revdep.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 5dd7612986..fea041f535 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -327,7 +327,7 @@ inst = function() { system(paste0(R," CMD INSTALL ",last)) } -log = function(bioc=FALSE, fnam="~/fail.log") { +log = function(bioc=FALSE, fnam="~/fail.log", app="gedit") { x = c(.fail.cran, if (bioc) .fail.bioc) cat("Writing 00check.log for",length(x),"packages to",fnam,":\n") cat(paste(x,collapse=" "), "\n") @@ -351,6 +351,8 @@ log = function(bioc=FALSE, fnam="~/fail.log") { system(paste0("grep -H . ./",i,".Rcheck/00check.log >> ",fnam)) # the fail messages cat("\n\n", file=fnam, append=TRUE) } + system(paste(app, fnam)) + invisible() } inst() From 49d479e655316103243f655b54c5cecdd6bf462a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 15 Jul 2021 15:13:58 -0700 Subject: [PATCH 312/588] Set default domain="R-data.table" explicitly for translation functions (#5072) --- R/onAttach.R | 20 +++++++++----------- R/onLoad.R | 19 +++++++++---------- R/translation.R | 10 +++++----- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/R/onAttach.R b/R/onAttach.R index bbad945497..554d2599d6 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -18,24 +18,22 @@ } dev = as.integer(v[1L, 3L]) %% 2L == 1L # version number odd => dev if (!isTRUE(getOption("datatable.quiet"))) { # new option in v1.12.4, #3489 - # NB: we need to supply domain= for translation below since the below is technically not run in the data.table namespace nth = getDTthreads(verbose=FALSE) if (dev) - packageStartupMessagef(domain="R-data.table", "data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads). 
", v, d, g, nth, appendLF=FALSE) + packageStartupMessagef("data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads). ", v, d, g, nth, appendLF=FALSE) else - packageStartupMessagef(domain="R-data.table", "data.table %s using %d threads (see ?getDTthreads). ", v, nth, appendLF=FALSE) - packageStartupMessagef(domain="R-data.table", "Latest news: r-datatable.com") - # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. - if (gettext(domain="R-data.table", "TRANSLATION CHECK") != "TRANSLATION CHECK") - packageStartupMessagef(domain="R-data.table", "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") + packageStartupMessagef("data.table %s using %d threads (see ?getDTthreads). ", v, nth, appendLF=FALSE) + packageStartupMessagef("Latest news: r-datatable.com") + if (gettext("TRANSLATION CHECK") != "TRANSLATION CHECK") + packageStartupMessagef("**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) - packageStartupMessagef(domain="R-data.table", "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") + packageStartupMessagef("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") if (!.Call(ChasOpenMP)) { - packageStartupMessagef(domain="R-data.table", "**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) + packageStartupMessagef("**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) if (Sys.info()["sysname"] == "Darwin") - packageStartupMessagef(domain="R-data.table", "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.\n**********") + packageStartupMessagef("This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.\n**********") else - packageStartupMessagef(domain="R-data.table", "This is %s. 
This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) + packageStartupMessagef("This is %s. This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) } } } diff --git a/R/onLoad.R b/R/onLoad.R index d4be38f6ed..5e72fab47f 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -25,12 +25,11 @@ if (dllV != RV) { dll = if (.Platform$OS.type=="windows") "dll" else "so" # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 - # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. - stopf(domain="R-data.table", "The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, RV, toupper(dll)) + stopf("The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, RV, toupper(dll)) } builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { - stopf(domain="R-data.table", "This is R %s but data.table has been installed using R %s. The major version must match. Please reinstall data.table.", base::getRversion(), builtUsing) + stopf("This is R %s but data.table has been installed using R %s. The major version must match. Please reinstall data.table.", base::getRversion(), builtUsing) # the if(R>=4.0.0) in NAMESPACE when registering S3 methods rbind.data.table and cbind.data.table happens on install; #3968 } } @@ -94,14 +93,14 @@ } if (!is.null(getOption("datatable.old.bywithoutby"))) - warningf(domain="R-data.table", "Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") + warningf("Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. 
It is now ignored. Please use by=.EACHI instead and stop using this option.") if (!is.null(getOption("datatable.old.unique.by.key"))) - warningf(domain="R-data.table", "Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") + warningf("Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") # Test R behaviour that changed in v3.1 and is now depended on x = 1L:3L y = list(x) - if (address(x) != address(y[[1L]])) stopf(domain="R-data.table", "Unexpected base R behaviour: list(x) has copied x") + if (address(x) != address(y[[1L]])) stopf("Unexpected base R behaviour: list(x) has copied x") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -109,7 +108,7 @@ names(DF) = c("A","B") add3 = address(DF$A) add4 = address(DF$B) - if (add1!=add3 || add2!=add4) stopf(domain="R-data.table", "Unexpected base R behaviour: names<- has copied column contents") + if (add1!=add3 || add2!=add4) stopf("Unexpected base R behaviour: names<- has copied column contents") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -119,10 +118,10 @@ add4 = address(DF$a) add5 = address(DF$b) add6 = address(DF) - if (add2==add5) stopf(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") - if (add1!=add4) stopf(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") + if (add2==add5) stopf("Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") + if (add1!=add4) stopf("Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") - if (add3==add6) warningf(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") + if (add3==add6) warningf("Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") # R could feasibly in future not copy DF's vecsxp in this case. If that changes in R, we'd like to know via the warning # because tests will likely break too. The warning will quickly tell R-core and us why, so we can then update. diff --git a/R/translation.R b/R/translation.R index 192c425d85..66faa9fe84 100644 --- a/R/translation.R +++ b/R/translation.R @@ -1,21 +1,21 @@ # templated warning/error functions to smooth translation & development -catf = function(fmt, ..., sep=" ", domain=NULL) { +catf = function(fmt, ..., sep=" ", domain="R-data.table") { cat(gettextf(fmt, ..., domain=domain), sep=sep) } -stopf = function(fmt, ..., domain=NULL) { +stopf = function(fmt, ..., domain="R-data.table") { stop(gettextf(fmt, ..., domain=domain), domain=NA, call. = FALSE) } -warningf = function(fmt, ..., immediate.=FALSE, noBreaks.=FALSE, domain=NULL) { +warningf = function(fmt, ..., immediate.=FALSE, noBreaks.=FALSE, domain="R-data.table") { warning(gettextf(fmt, ..., domain=domain), domain=NA, call.=FALSE, immediate.=immediate., noBreaks.=noBreaks.) 
} -messagef = function(fmt, ..., appendLF=TRUE, domain=NULL) { +messagef = function(fmt, ..., appendLF=TRUE, domain="R-data.table") { message(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) } -packageStartupMessagef = function(fmt, ..., appendLF=TRUE, domain=NULL) { +packageStartupMessagef = function(fmt, ..., appendLF=TRUE, domain="R-data.table") { packageStartupMessage(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) } From 129366e518612f0ce30417c2078bae3e6e6420ac Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 16 Jul 2021 13:52:27 -0400 Subject: [PATCH 313/588] melt(na.rm=TRUE) should remove rows with missing list column (#5053) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 3 +++ man/melt.data.table.Rd | 4 +--- src/assign.c | 17 ++++++++++++----- src/data.table.h | 2 +- src/dogroups.c | 6 +++--- src/fmelt.c | 2 +- src/rbindlist.c | 4 ++-- 8 files changed, 25 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3eed2775fb..5d6a7da3ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -145,6 +145,8 @@ 25. `.SDcols=` is now documented in `?data.table` and it is now an error if the logical vector's length is not equal to the number of columns (consistent with `data.table`'s no-recycling policy; see new feature 1 in v1.12.2 Apr 2019), [#4115](https://github.com/Rdatatable/data.table/issues/4115). Thanks to @Henrik-P for reporting and Jan Gorecki for the PR. +26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1a890b2566..aa654a236c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3070,6 +3070,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) if (test_bit64) NA_integer64_ else NA)), # 'else NA' otherwise NULL is not removed when test_bit64 is FALSE measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) + DT_missing_l_2 = data.table(num_1=1, num_2=2, list_1=list(1), list_3=list(3)) + test(1035.0186, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", num=1, list=list(1))) + test(1035.0187, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=FALSE), data.table(char=c("1","2","3"), num=c(1,2,NA), list=list(1,NA,3))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index b31017356b..6dd74291d5 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -58,9 +58,7 @@ variables. When all \code{measure.vars} are not of the same type, they'll be coerced according to the hierarchy \code{list} > \code{character} > \code{numeric > integer > logical}. For example, if any of the measure variables is a -\code{list}, then entire value column will be coerced to a list. Note that, -if the type of \code{value} column is a list, \code{na.rm = TRUE} will have no -effect. +\code{list}, then entire value column will be coerced to a list. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt diff --git a/src/assign.c b/src/assign.c index af3768f81a..0dc38c9b0a 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1097,8 +1097,9 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con return memrecycle_message[0] ? memrecycle_message : NULL; } -void writeNA(SEXP v, const int from, const int n) +void writeNA(SEXP v, const int from, const int n, const bool listNA) // e.g. for use after allocVector() which does not initialize its result. +// listNA for #5053 { const int to = from-1+n; // writing to position 2147483647 in mind, 'i<=to' in loop conditions switch(TYPEOF(v)) { @@ -1133,8 +1134,14 @@ void writeNA(SEXP v, const int from, const int n) // If there's ever a way added to R API to pass NA_STRING to allocVector() to tell it to initialize with NA not "", would be great for (int i=from; i<=to; ++i) SET_STRING_ELT(v, i, NA_STRING); break; - case VECSXP: case EXPRSXP : - // although allocVector already initializes to R_NilValue, we use writeNA() in other places too, so we shouldn't skip this assign + case VECSXP: { + // See #5053 for comments and discussion re listNA + // although allocVector initializes to R_NilValue, we use writeNA() in other places too, so we shouldn't skip the R_NilValue assign + // ScalarLogical(NA_LOGICAL) returns R's internal constant R_LogicalNAValue (no alloc and no protect needed) + const SEXP na = listNA ? ScalarLogical(NA_LOGICAL) : R_NilValue; + for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, na); + } break; + case EXPRSXP : for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, R_NilValue); break; default : @@ -1149,7 +1156,7 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) // We guess that author of allocVector would have liked to initialize with NA but was prevented since memset // is restricted to one byte.
SEXP v = PROTECT(allocVector(type, n)); - writeNA(v, 0, n); + writeNA(v, 0, n, false); UNPROTECT(1); return(v); } @@ -1159,7 +1166,7 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { // TODO: remove allocNAVector above when usage in fastmean.c, fcast.c and fmelt.c can be adjusted; see comments in PR3724 SEXP v = PROTECT(allocVector(TYPEOF(x), n)); copyMostAttrib(x, v); - writeNA(v, 0, n); + writeNA(v, 0, n, false); UNPROTECT(1); return(v); } diff --git a/src/data.table.h b/src/data.table.h index 50d43a34a5..e897f6a8b4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -123,7 +123,7 @@ SEXP growVector(SEXP x, R_len_t newlen); // assign.c SEXP allocNAVector(SEXPTYPE type, R_len_t n); SEXP allocNAVectorLike(SEXP x, R_len_t n); -void writeNA(SEXP v, const int from, const int n); +void writeNA(SEXP v, const int from, const int n, const bool listNA); void savetl_init(), savetl(SEXP s), savetl_end(); int checkOverAlloc(SEXP x); diff --git a/src/dogroups.c b/src/dogroups.c index 5bb9983408..be5d675b23 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -204,7 +204,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } if (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)) { for (int j=0; j0 if (TYPEOF(source) != TYPEOF(target)) diff --git a/src/fmelt.c b/src/fmelt.c index 0ee7e70ef8..9bfe931d85 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -495,7 +495,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP thiscol = input_col_or_NULL(DT, data, thisvaluecols, i, j); if (thiscol == R_NilValue) { if (!data->narm) { - writeNA(target, j*data->nrow, data->nrow); + writeNA(target, j*data->nrow, data->nrow, true); // listNA=true #5053 } }else{ if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { diff --git a/src/rbindlist.c b/src/rbindlist.c index 5dab7fff51..5d0b6547e5 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -406,7 +406,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (!length(li)) continue; // NULL items in the list() of DT/DF; not if thisnrow==0 because we need to retain (unused) factor levels (#3508) int w = usenames ? colMap[i*ncol + j] : j; if (w==-1) { - writeNA(target, ansloc, thisnrow); + writeNA(target, ansloc, thisnrow, false); } else { SEXP thisCol = VECTOR_ELT(li, w); SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : (isString(thisCol) ? thisCol : VECTOR_ELT(coercedForFactor, i)); @@ -512,7 +512,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int w = usenames ? 
colMap[i*ncol + j] : j; SEXP thisCol; if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871 - writeNA(target, ansloc, thisnrow); // writeNA is integer64 aware and writes INT64_MIN + writeNA(target, ansloc, thisnrow, false); // writeNA is integer64 aware and writes INT64_MIN } else { if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) { // do an as.list() on the atomic column; #3528 From 0c579cd03d2ea56436649f2a1e7173a2344d33c0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Jul 2021 13:19:03 -0600 Subject: [PATCH 314/588] .dev-only: added flex for RcppCWB --- .dev/CRAN_Release.cmd | 1 + 1 file changed, 1 insertion(+) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 8be3ada82d..2cc09f0653 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -523,6 +523,7 @@ sudo apt-get -y install cargo # for gifski, a suggest of nasoi sudo apt-get -y install libgit2-dev # for gert sudo apt-get -y install cmake # for symengine for RxODE sudo apt-get -y install libxslt1-dev # for xslt +sudo apt-get -y install flex # for RcppCWB sudo R CMD javareconf # ENDIF From 2f12a7fa9305b59b43985ffe333e736cfaa696ce Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Jul 2021 15:00:16 -0600 Subject: [PATCH 315/588] .dev-only: auto remove packages from revdeplib that are no longer available on CRAN/Bioc; ensure all packages installed with same x.y version of R --- .dev/revdep.R | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index fea041f535..9dec806eec 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -130,20 +130,29 @@ for (p in deps) { } } cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), "\n") -update.packages(checkBuilt=TRUE, ask=FALSE) +update.packages(checkBuilt=TRUE, ask=FALSE) # won't rebuild packages which are no longer available on CRAN + +# The presence of packages here in revdeplib which no longer exist on CRAN could explain differences to CRAN. A revdep +# could be running tests using that package when available and failing which may be the very reason that package was removed from CRAN. +# When it is removed from revdeplib to match CRAN, then the revdep might then pass as it will skip its tests using that package. 
+x = installed.packages() +tt = match(rownames(x), rownames(avail)) +removed = rownames(x)[is.na(tt) & is.na(x[,"Priority"])] +cat("Removing",length(removed),"packages which are no longer available on CRAN/Bioc:", paste(removed, collapse=","), "\n") +stopifnot(all(x[removed,"LibPath"] == .libPaths()[1])) +oldn = nrow(x) +remove.packages(removed, .libPaths()[1]) +x = installed.packages() +stopifnot(nrow(x) == oldn-length(removed)) + +# Ensure all installed packages were built with this x.y release of R cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") cat("Previously installed packages were built using:\n") -x = installed.packages() -table(x[,"Built"], dnn=NULL) # manually inspect to ensure all built with this x.y release of R -if (FALSE) { # if not, run this manually replacing "4.0.0" appropriately - for (p in rownames(x)[x[,"Built"]=="4.0.0"]) { - install.packages(p) - } - # warnings may suggest many of them were removed from CRAN, so remove the remaining from revdeplib to be clean - x = installed.packages() - remove.packages(rownames(x)[x[,"Built"]=="4.0.0"]) - table(installed.packages()[,"Built"], dnn=NULL) # check again to make sure all built in current R-devel x.y version -} +print(tt <- table(x[,"Built"], dnn=NULL)) +minorR = paste(strsplit(as.character(getRversion()), split="[.]")[[1]][c(1,2)], collapse=".") +stopifnot(all(substring(names(tt),1,nchar(minorR)) == minorR)) +# if not (e.g. when using R-devel for revdep testing) perhaps run the following manually replacing "4.0.0" as appropriate +# for (p in rownames(x)[x[,"Built"]=="4.0.0"]) install.packages(p) # Remove the tar.gz no longer needed : for (p in deps) { From a66cc16184e66d2db5035d552908a056582efc80 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 22 Jul 2021 00:45:50 -0600 Subject: [PATCH 316/588] .dev-only: added wait=FALSE when displaying revdep.R:log() --- .dev/revdep.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 9dec806eec..33176e753e 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -360,7 +360,7 @@ log = function(bioc=FALSE, fnam="~/fail.log", app="gedit") { system(paste0("grep -H . 
./",i,".Rcheck/00check.log >> ",fnam)) # the fail messages cat("\n\n", file=fnam, append=TRUE) } - system(paste(app, fnam)) + system(paste(app, fnam), wait=FALSE) invisible() } From fc311137e23439dd238f2fb408a254dd8ac9f4b9 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 22 Jul 2021 11:17:27 -0600 Subject: [PATCH 317/588] GLCI-only: rel-cran now OK from 1 NOTE --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d36f99fbcd..c3027ca1c3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -203,7 +203,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (installed package size) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " (installed package size) but ", shQuote(l)) else q("no")' test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin From 79ad02a49b10fa4b14c00e8234946839693ceb73 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 26 Jul 2021 12:14:46 -0600 Subject: [PATCH 318/588] .dev-only: version comparison and links added to revdep.R:cran() --- .dev/revdep.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 33176e753e..da90f0c66e 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -271,11 +271,15 @@ cran = function() # reports CRAN status of the .cran.fail packages p = proc.time() db = setDT(tools::CRAN_check_results()) cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") - rel = unique(db$Flavor) - rel = sort(rel[grep("release",rel)]) - cat("R-release is used for revdep checking so comparing to CRAN results for R-release\n") - ans = db[Package %chin% .fail.cran & Flavor %chin% rel, Status, keyby=.(Package, Flavor)] - dcast(ans, Package~Flavor, value.var="Status", fill="")[.fail.cran,] + ans = db[Package %chin% .fail.cran, + .(ERROR=sum(Status=="ERROR"), + WARN =sum(Status=="WARN"), + cran =paste(unique(Version),collapse=";"), + local=as.character(packageVersion(.BY[[1]]))), + keyby=Package] + ans[local==cran, c("cran","local"):=""] + ans[, "right_click_in_bash":=paste0("https://cran.r-project.org/web/checks/check_results_",Package,".html")] + ans[] } run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { From f9ebb1ba1422969185ef55c255e89ab233d210ca Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Tue, 27 Jul 2021 07:55:29 +1200 Subject: [PATCH 319/588] First look for .datatable.aware in environment (#4909) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/cedta.R | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8ab2deaa0d..ffa85cb31a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,7 +64,8 @@ Authors@R: c( person("Ben","Schwen", role="ctb"), person("Tony","Fischetti", role="ctb"), person("Ofek","Shilon", role="ctb"), - person("Vadim","Khotilovich", role="ctb")) + person("Vadim","Khotilovich", role="ctb"), + person("Hadley","Wickham", role="ctb")) 
Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index 5d6a7da3ea..2e0cef06af 100644 --- a/NEWS.md +++ b/NEWS.md @@ -93,6 +93,8 @@ 13. `fifelse()` now coerces logical `NA` to other types and the `na` argument supports vectorized input, [#4277](https://github.com/Rdatatable/data.table/issues/4277) [#4286](https://github.com/Rdatatable/data.table/issues/4286) [#4287](https://github.com/Rdatatable/data.table/issues/4287). Thanks to @michaelchirico and @shrektan for reporting, and @shrektan for implementing. +14. `.datatable.aware` is now recognized in the calling environment in addition to the namespace of the calling package, [dtplyr#184](https://github.com/tidyverse/dtplyr/issues/184). Thanks to Hadley Wickham for the idea and PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/cedta.R b/R/cedta.R index 7ace210079..5d0a60c10f 100644 --- a/R/cedta.R +++ b/R/cedta.R @@ -18,11 +18,18 @@ cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdow # .datatable.aware = TRUE # which makes them data.table-aware optionally and possibly variably. # http://stackoverflow.com/a/13131555/403310 +# .datatable.aware is not in data.table's namespace and it is not intended to ever be added here. Otherwise +# package authors could set it using assignInNamespace and then not revert its value properly which would +# cause subsequent calls from other packages to fail. # cedta = Calling Environment Data.Table-Aware cedta = function(n=2L) { # Calling Environment Data Table Aware - ns = topenv(parent.frame(n)) + env = parent.frame(n) + if (isTRUE(env$.datatable.aware)) { # dtplyr 184 + return(TRUE) + } + ns = topenv(env) if (!isNamespace(ns)) { # e.g. DT queries at the prompt (.GlobalEnv) and knitr's eval(,envir=globalenv()) but not DF[...] inside knitr::kable v1.6 return(TRUE) @@ -47,4 +54,3 @@ cedta = function(n=2L) { } ans } - From 246a3a749d68d076ef1395230ba41a142c9e6bc6 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 26 Jul 2021 16:11:19 -0600 Subject: [PATCH 320/588] NEWS-only: added news item for #5043 (as.data.frame now removing indices) --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 2e0cef06af..c38808aab2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -149,6 +149,8 @@ 26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. +27. `as.data.frame(DT)` now removes any indices in addition to removing any key, [#5042(https://github.com/Rdatatable/data.table/issues/5042). 
When indices were left intact, a subsequent subset or reorder of the `data.frame` would not update the indices since they are treated just like any other `data.frame` attribute, causing incorrect results if the result is later converted back to `data.table` again. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From a7325a38d7cfe072164c477e053bdefb18bdd8f8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 26 Jul 2021 17:42:35 -0600 Subject: [PATCH 321/588] NEWS-only: typo in link --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c38808aab2..2d5ee8bbf7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -149,7 +149,7 @@ 26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. -27. `as.data.frame(DT)` now removes any indices in addition to removing any key, [#5042(https://github.com/Rdatatable/data.table/issues/5042). When indices were left intact, a subsequent subset or reorder of the `data.frame` would not update the indices since they are treated just like any other `data.frame` attribute, causing incorrect results if the result is later converted back to `data.table` again. +27. `as.data.frame(DT)` now removes any indices in addition to removing any key, [#5042](https://github.com/Rdatatable/data.table/issues/5042). When indices were left intact, a subsequent subset or reorder of the `data.frame` would not update the indices since they are treated just like any other `data.frame` attribute, causing incorrect results if the result is later converted back to `data.table` again. ## NOTES From 6fa5cabb3117803cba3323e87589c70cf9950dd3 Mon Sep 17 00:00:00 2001 From: Ofek Date: Tue, 27 Jul 2021 22:28:51 +0300 Subject: [PATCH 322/588] setDF deletes the index attribute (#4893) --- NEWS.md | 2 +- R/data.table.R | 4 +++- inst/tests/tests.Rraw | 8 ++++++++ man/setDF.Rd | 2 +- src/utils.c | 6 +++--- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2d5ee8bbf7..34542e8a95 100644 --- a/NEWS.md +++ b/NEWS.md @@ -149,7 +149,7 @@ 26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). 
Thanks to Toby Dylan Hocking for the PR. -27. `as.data.frame(DT)` now removes any indices in addition to removing any key, [#5042](https://github.com/Rdatatable/data.table/issues/5042). When indices were left intact, a subsequent subset or reorder of the `data.frame` would not update the indices since they are treated just like any other `data.frame` attribute, causing incorrect results if the result is later converted back to `data.table` again. +27. `as.data.frame(DT)`, `setDF(DT)` and `as.list(DT)` now remove the `"index"` attribute which contains any indices (a.k.a. secondary keys), as they already did for other `data.table`-only attributes such as the primary key stored in the `"sorted"` attribute. When indices were left intact, a subsequent subset or reorder of the `data.frame` by `data.frame`-code in base R or other packages would not update the indices, causing incorrect results if then converted back to `data.table`, [#4889](https://github.com/Rdatatable/data.table/issues/4889) [#5042](https://github.com/Rdatatable/data.table/issues/5042). Thanks @OfekShilon for the report and the PR. ## NOTES diff --git a/R/data.table.R b/R/data.table.R index c15d65f034..0a5a38785b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2140,7 +2140,7 @@ as.data.frame.data.table = function(x, ...) setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names setattr(ans,"class","data.frame") setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted - setattr(ans,"index",NULL) #5042 + setattr(ans,"index",NULL) #4889 #5042 setattr(ans,".internal.selfref",NULL) # leave tl intact, no harm, ans @@ -2157,6 +2157,7 @@ as.list.data.table = function(x, ...) { setattr(ans, "class", NULL) setattr(ans, "row.names", NULL) setattr(ans, "sorted", NULL) + setattr(ans, "index", NULL) #4889 #5042 setattr(ans,".internal.selfref", NULL) # needed to pass S4 tests for example ans } @@ -2716,6 +2717,7 @@ setDF = function(x, rownames=NULL) { setattr(x, "row.names", rn) setattr(x, "class", "data.frame") setattr(x, "sorted", NULL) + setattr(x, "index", NULL) #4889 #5042 setattr(x, ".internal.selfref", NULL) } else if (is.data.frame(x)) { if (!is.null(rownames)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index aa654a236c..b6093cf9cd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17802,3 +17802,11 @@ test(2199.1, as.data.table(as.list(1:2))[, .SD,.SDcols=(-1L)], data.table(V2=2 test(2199.2, as.data.table(as.list(1:2))[, .SD,.SDcols=(-(1L))], data.table(V2=2L)) test(2199.3, as.data.table(as.list(1:3))[, .SD,.SDcols=(-1L)], data.table(V2=2L, V3=3L)) test(2199.4, data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1], error="not found") + +# setDF now drops index attributes, #4889 +d = data.table(a=1:100, b=1:100) +setindex(d, a) +setDF(d) +d[1:50, "a"] = d[51:100, "a"] +setDT(d) +test(2200, nrow(d[a==99]), 2L) diff --git a/man/setDF.Rd b/man/setDF.Rd index f50c9ae491..57cba39433 100644 --- a/man/setDF.Rd +++ b/man/setDF.Rd @@ -15,7 +15,7 @@ setDF(x, rownames=NULL) } \details{ - All \code{data.table} attributes including any keys of the input data.table are stripped off. + All \code{data.table} attributes including any keys and indices of the input data.table are stripped off. When using \code{rownames}, recall that the row names of a \code{data.frame} must be unique. 
By default, the assigned set of row names is simply the sequence 1, \ldots, \code{nrow(x)} (or \code{length(x)} for \code{list}s). } diff --git a/src/utils.c b/src/utils.c index 0c4f04fa39..a1d9093b8d 100644 --- a/src/utils.c +++ b/src/utils.c @@ -381,11 +381,11 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { #include #endif SEXP dt_zlib_version() { - char out[51]; + char out[71]; #ifndef NOZLIB - snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); + snprintf(out, 70, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); #else - snprintf(out, 50, _("zlib header files were not found when data.table was compiled")); + snprintf(out, 70, _("zlib header files were not found when data.table was compiled")); #endif return ScalarString(mkChar(out)); } From 061315767e9a151a1be63e4ba0d50e84f8b76574 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 27 Jul 2021 16:26:22 -0600 Subject: [PATCH 323/588] GLCI-only: rel-cran remove 'installed package size' now Status:OK --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c3027ca1c3..3c6049a1dd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -203,7 +203,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " (installed package size) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " but ", shQuote(l)) else q("no")' test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin From 3ff515e35e3e42e80d968f9b94be7f44ab27bbf6 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 27 Jul 2021 19:42:06 -0600 Subject: [PATCH 324/588] tidy plyr::arrange (not dplyr) comments and test (#5082) --- R/data.table.R | 2 +- inst/tests/other.Rraw | 26 ++++++++++++-------------- inst/tests/tests.Rraw | 2 +- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 0a5a38785b..a5d9433947 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -150,7 +150,7 @@ replace_dot_alias = function(e) { else if (missing(drop)) `[.data.frame`(x,i,j) else `[.data.frame`(x,i,j,drop) # added is.data.table(ans) check to fix bug #81 - if (!missing(i) && is.data.table(ans)) setkey(ans, NULL) # See test 304 + if (!missing(i) && is.data.table(ans)) setkey(ans, NULL) # drops index too; tested by plyr::arrange test in other.Rraw return(ans) } if (!missing(verbose)) { diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 03d62b4389..41380dbd65 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,11 +1,8 @@ - -# Usage: require(data.table); test.data.table(with.other.packages=TRUE) - -if (exists("test.data.table",.GlobalEnv,inherits=FALSE)) { - warning("This is dev where with.other.packages should not be run. Instead, use a fresh R session with data.table installed. 
", - "Not doing so in dev can be the cause of both false errors and false passes.") +if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || + !"package:data.table" %in% search()) { + stop("Usage: R CMD INSTALL; require(data.table); test.data.table('other.Rraw')") + # running other.Raw in dev mode (i.e. when data.table is not installed) is not intended to work } -if (!"package:data.table" %in% search()) stop("data.table should be already attached. Usage: require(data.table); test.data.table(with.other.packages=TRUE)") test = data.table:::test INT = data.table:::INT @@ -57,9 +54,10 @@ if (loaded[["ggplot2"]]) { } if (loaded[["plyr"]]) { - # Test key is dropped when non-dt-aware packages (here, plyr) reorders rows of data.table. + # Test key and indices are dropped when non-dt-aware packages (here, plyr) reorders rows of data.table. DT = data.table(a=1:10,b=1:2,key="a") - test(2, arrange(DT,b), data.table(a=INT(1,3,5,7,9,2,4,6,8,10),b=INT(1,1,1,1,1,2,2,2,2,2), key=NULL)) + setindex(DT, b) + test(2, arrange(DT,b), data.table(a=INT(1,3,5,7,9,2,4,6,8,10),b=INT(1,1,1,1,1,2,2,2,2,2))) } if (FALSE) { # loaded[["reshape"]] @@ -75,7 +73,7 @@ if (FALSE) { # loaded[["reshape"]] if (loaded[["caret"]]) { # Fix for #476 # caret seems heavy (plyr, reshape2 and withr). win-builder halts at this point consistently, but we pass on Travis and locally. - # So I put the win-builder fail down to resource issues and moved this test into test.data.table(with.other.packages=TRUE). + # So I put the win-builder fail down to resource issues and moved this test into test.data.table("other.Rraw"). DT = data.table(x = rnorm(10), y = rnorm(10)) cv.ctrl = trainControl(method = 'repeatedcv', number = 5, repeats = 1) fit = train(y ~ x, data = DT, 'lm', trControl = cv.ctrl) @@ -93,8 +91,8 @@ if (loaded[["xts"]]) { test(5, last(x,10), x[91:100,]) # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. # But this might not be the case, depending on whether xts was already loaded before loading data.table. - # So to make this test relevant, in a fresh R session type: "require(xts);require(data.table);test.data.table(with.other.packages=TRUE)" - # rather than: "require(data.table);require(xts);test.data.table(with.other.packages=TRUE)" + # So to make this test relevant, in a fresh R session type: "require(xts);require(data.table);test.data.table('other.Rraw')" + # rather than: "require(data.table);require(xts);test.data.table('other.Rraw')" # Which was the main thrust of bug#2312 fixed in v1.8.3 } @@ -186,6 +184,6 @@ if (loaded[["parallel"]]) { } # example(":=", local=TRUE) triggered cedta==FALSE and then error, #2972 -test(14.1, {example(':=', package='data.table', local=TRUE); TRUE}) -test(14.2, {example('CJ', package='data.table', local=TRUE); TRUE}) +test(14.1, {example(':=', package='data.table', local=TRUE, echo=FALSE); TRUE}) +test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE}) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b6093cf9cd..e30cd255d7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -958,7 +958,7 @@ test(302, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,42L,6L,7L),key="a")) DT[J(2),b:=84L] test(303, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,84L,84L,7L),key="a")) -# Test 304 was testing compatibility with package:plyr. Moved to the ggplot2 block above to be moved to a separate test package. +# Test 304 was testing compatibility with package:plyr. #2671 moved it to other.Rraw. 
# Test that changing colnames keep key in sync. # TO DO: will have to do this for secondary keys, too, when implemented. From cf8c1c5658a0c08ff176f484107bb751a69d175b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 28 Jul 2021 15:31:32 -0600 Subject: [PATCH 325/588] don't use index when selfref detects prior copy by another package (#5084) --- NEWS.md | 4 +++- R/data.table.R | 2 +- inst/tests/other.Rraw | 14 +++++++++++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 34542e8a95..1745f4d153 100644 --- a/NEWS.md +++ b/NEWS.md @@ -149,7 +149,9 @@ 26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. -27. `as.data.frame(DT)`, `setDF(DT)` and `as.list(DT)` now remove the `"index"` attribute which contains any indices (a.k.a. secondary keys), as they already did for other `data.table`-only attributes such as the primary key stored in the `"sorted"` attribute. When indices were left intact, a subsequent subset or reorder of the `data.frame` by `data.frame`-code in base R or other packages would not update the indices, causing incorrect results if then converted back to `data.table`, [#4889](https://github.com/Rdatatable/data.table/issues/4889) [#5042](https://github.com/Rdatatable/data.table/issues/5042). Thanks @OfekShilon for the report and the PR. +27. `as.data.frame(DT)`, `setDF(DT)` and `as.list(DT)` now remove the `"index"` attribute which contains any indices (a.k.a. secondary keys), as they already did for other `data.table`-only attributes such as the primary key stored in the `"sorted"` attribute. When indices were left intact, a subsequent subset, assign, or reorder of the `data.frame` by `data.frame`-code in base R or other packages would not update the indices, causing incorrect results if then converted back to `data.table`, [#4889](https://github.com/Rdatatable/data.table/issues/4889). Thanks @OfekShilon for the report and the PR. + +28. `dplyr::arrange(DT)` uses `vctrs::vec_slice` which retains `data.table`'s class but uses C to bypass `[` method dispatch and does not adjust `data.table`'s attributes containing the index row numbers, [#5042](https://github.com/Rdatatable/data.table/issues/5042). `data.table`'s long-standing `.internal.selfref` mechanism to detect such operations by other packages was not being checked by `data.table` when using indexes, causing `data.table` filters and joins to use invalid indexes and return incorrect results after a `dplyr::arrange(DT)`. Thanks to @Waldi73 for reporting; @avimallu, @tlapak, @MichaelChirico, @jangorecki and @hadley for investigating and suggestions; and @mattdowle for the PR. The intended way to use `data.table` is `data.table::setkey(DT, col1, col2, ...)` which reorders `DT` by reference in parallel, sets the primary key for automatic use by subsequent `data.table` queries, and permits rowname-like usage such as `DT["foo",]` which returns the now-contiguous-in-memory block of rows where the first column of `DT`'s key contains `"foo"`. Multi-column-rownames (i.e. a primary key of more than one column) can be looked up using `DT[.("foo",20210728L), ]`. Using `==` in `i` is also optimized to use the key or indices, if you prefer using column names explicitly and `==`. An alternative to `setkey(DT)` is returning a new ordered result using `DT[order(col1, col2, ...), ]`. 
## NOTES diff --git a/R/data.table.R b/R/data.table.R index a5d9433947..efcc2104e5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2484,7 +2484,7 @@ copy = function(x) { shallow = function(x, cols=NULL) { if (!is.data.table(x)) stopf("x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table") - ans = .shallow(x, cols=cols, retain.key = TRUE) + ans = .shallow(x, cols=cols, retain.key=selfrefok(x)) # selfrefok for #5042 ans } diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 41380dbd65..3848a3077f 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -7,7 +7,7 @@ if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || test = data.table:::test INT = data.table:::INT -pkgs = c("ggplot2", "hexbin", "plyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "parallel") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "parallel") if (any(duplicated(pkgs))) stop("Packages defined to be loaded for integration tests in 'inst/tests/other.Rraw' contains duplicates.") is.require = function(pkg) suppressWarnings(suppressMessages(isTRUE(require(pkg, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE)))) @@ -54,10 +54,18 @@ if (loaded[["ggplot2"]]) { } if (loaded[["plyr"]]) { - # Test key and indices are dropped when non-dt-aware packages (here, plyr) reorders rows of data.table. + # Test key and indices are dropped when non-dt-aware packages reorders rows using `[` DT = data.table(a=1:10,b=1:2,key="a") setindex(DT, b) - test(2, arrange(DT,b), data.table(a=INT(1,3,5,7,9,2,4,6,8,10),b=INT(1,1,1,1,1,2,2,2,2,2))) + test(2.1, plyr::arrange(DT,b), data.table(a=INT(1,3,5,7,9,2,4,6,8,10),b=INT(1,1,1,1,1,2,2,2,2,2))) +} + +if (loaded[["dplyr"]]) { + # dplyr::arrange uses vctrs::vec_slice which is implemented in C and bypasses `[` dispatch; #5042 + DT = data.table(A=c("b","c","a"), B=10:12) + setindex(DT, A) + DT2 = dplyr::arrange(DT, A) + test(2.2, DT2[A=="c"], data.table(A="c", B=11L)) } if (FALSE) { # loaded[["reshape"]] From 7b82b4c0646e896ac26b6fb1025ea4ee29090c9b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 28 Jul 2021 21:54:30 -0600 Subject: [PATCH 326/588] Tidy other.Rraw CI (#5086) --- .ci/README.md | 2 +- .dev/.bash_aliases | 3 +++ .gitlab-ci.yml | 2 +- inst/tests/other.Rraw | 24 ++++++++++++++---------- inst/tests/tests-DESCRIPTION | 7 ------- tests/other.R | 17 ++++++++++++++--- 6 files changed, 33 insertions(+), 22 deletions(-) delete mode 100644 inst/tests/tests-DESCRIPTION diff --git a/.ci/README.md b/.ci/README.md index 72568fd844..3f303e34ac 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -7,7 +7,7 @@ On each Pull Request opened in GitHub we run Travis CI and Appveyor to provide p ### [GitLab CI](./../.gitlab-ci.yml) Test jobs: -- `test-rel-lin` - `r-release` on Linux, most comprehensive test environment, `-O3 -flto -fno-common -Wunused-result`, extra check for no compilation warnings, includes testing [_with other packages_](./../inst/tests/other.Rraw) ([extended suggests](./../inst/tests/tests-DESCRIPTION)) +- `test-rel-lin` - `r-release` on Linux, most comprehensive test environment, `-O3 -flto -fno-common -Wunused-result`, extra check for no compilation warnings, includes testing [_with other packages_](./../inst/tests/other.Rraw) - `test-rel-cran-lin` - `--as-cran` on Linux, `-g0`, extra check for final status of `R CMD check` where we allow one NOTE (_size of tarball_). 
- `test-dev-cran-lin` - `r-devel` and `--as-cran` on Linux, `--with-recommended-packages --enable-strict-barrier --disable-long-double`, tests for compilation warnings in pkg install and new NOTEs/Warnings in pkg check, and because it is R-devel it is marked as allow_failure - `test-rel-vanilla-lin` - `r-release` on Linux, no suggested deps, no OpenMP, `-O0`, tracks memory usage during tests diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 504df41504..3d46c94d6e 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -31,3 +31,6 @@ export R_PROFILE_USER='~/.Rprofile' export R_DEFAULT_INTERNET_TIMEOUT=3600 # increase from R's default 60, always not just in revdep testing, to help --as-cran +export TEST_DATA_TABLE_WITH_OTHER_PACKAGES=true +# R CMD check in dev should always run other.Rraw + diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3c6049a1dd..577d4f4b61 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -153,7 +153,7 @@ test-rel-lin: ## most comprehensive tests, force all suggests, also integration OPENBLAS_MAIN_FREE: "1" TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "TRUE" before_script: - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies(c("DESCRIPTION","inst/tests/tests-DESCRIPTION"), which="all"), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' + - Rscript -e 'source(".ci/ci.R"); eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(c(dcf.dependencies("DESCRIPTION", which="all"), pkgs), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' - *cp-src - rm -r bus - mkdir -p ~/.R diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 3848a3077f..89ce49b00f 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,3 +1,9 @@ +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "parallel") +# First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. +# So that these dependencies of other.Rraw are maintained in a single place. +# TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by +# users running test.data.table("other.Rraw"). 
+ if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || !"package:data.table" %in% search()) { stop("Usage: R CMD INSTALL; require(data.table); test.data.table('other.Rraw')") @@ -7,18 +13,16 @@ if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || test = data.table:::test INT = data.table:::INT -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "parallel") if (any(duplicated(pkgs))) stop("Packages defined to be loaded for integration tests in 'inst/tests/other.Rraw' contains duplicates.") -is.require = function(pkg) suppressWarnings(suppressMessages(isTRUE(require(pkg, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE)))) -loaded = sapply(pkgs, is.require) - -if (sum(!loaded)) { - if (as.logical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_", "TRUE"))) { - stop(sprintf("Package (extended) suggested but not available: %s\n\nThe (extended) suggested packages are required for a complete check of data.table integration tests.\nChecking can be attempted without them by setting the environment variable _R_CHECK_FORCE_SUGGESTS_ to a false value.\nList of extended suggested packages used for integration tests can be found in `system.file(file.path('tests','tests-DESCRIPTION'), package='data.table')`.", paste("'", names(loaded)[!loaded], "'", sep="", collapse=", "))) - } else { - invisible(sapply(names(loaded)[!loaded], function(s) cat("\n**** Other package",s,"is not installed. Tests using it will be skipped.\n"))) - } +f = function(pkg) suppressMessages(isTRUE(require(pkg, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE))) +loaded = sapply(pkgs, f) +if (any(!loaded)) { + stop("test.data.table('other.Rraw') is missing required package(s): ", paste(names(loaded)[!loaded], collapse=", "), ". If you can't install them and this is R CMD check, please set environment variable TEST_DATA_TABLE_WITH_OTHER_PACKAGES back to the default, false.") + # Would like to install them now for convenience but gitlab-ci.yml seems to install to bus/mirror-other-packages/cran. + # If that's a cache, that's nice, but we don't know at this point whether this script is being run by GLCI or by a user or in dev. + # We don't allow skipping (e.g. if _R_CHECK_FORCE_SUGGESTS_ is FALSE) to keep things simple and to keep things strict; i.e. + # if this script runs then we want to be sure it has fully passed. } cat("\n") diff --git a/inst/tests/tests-DESCRIPTION b/inst/tests/tests-DESCRIPTION deleted file mode 100644 index 35e3411ad0..0000000000 --- a/inst/tests/tests-DESCRIPTION +++ /dev/null @@ -1,7 +0,0 @@ -Package: test.data.table -Version: 0.1 -Type: Backend -Title: List of data.table dependencies used in integration tests -Authors@R: c(person("data.table team", role = c("aut", "cre", "cph"), email="mattjdowle@gmail.com")) -Description: Standalone R DESCRIPTION file which defines R dependencies for integration tests of data.table package. Integration tests are not part of main testing workflow. They are performed only when TEST_DATA_TABLE_WITH_OTHER_PACKAGES environment variable is set to true. This allows us to run those integration tests in our CI pipeline and not impose dependency chains on the user. 
-Suggests: ggplot2 (>= 0.9.0), reshape, hexbin, fastmatch, nlme, gdata, caret, rmarkdown, parallel diff --git a/tests/other.R b/tests/other.R index 79ba8ca8d8..46a0bf7762 100644 --- a/tests/other.R +++ b/tests/other.R @@ -1,4 +1,15 @@ require(data.table) -# integration tests for packages excluded from Suggests in 1.10.5 -# for list of used packages see inst/tests/tests-DESCRIPTION -if (as.logical(Sys.getenv("TEST_DATA_TABLE_WITH_OTHER_PACKAGES","FALSE"))) test.data.table(script="other.Rraw") +if (as.logical(Sys.getenv("TEST_DATA_TABLE_WITH_OTHER_PACKAGES","FALSE"))) { + + options(warn=1) + # test.data.table() turns on R's warnPartial* options and currently there + # are partial argument names used in base and other packages. Without the + # options(warn=1), other.Rout just contains "There were 16 warnings (use + # warnings() to see them)". However, a print(warnings()) after test.data.table() + # just results in NULL in other.Rout. Hence options(warn=1) because that + # worked to display the warnings, not because we want them displayed at the + # time per se. + + test.data.table(script="other.Rraw") +} + From 925235d080ff6e39d90df8b22923d3c2aa521041 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 28 Jul 2021 22:12:15 -0600 Subject: [PATCH 327/588] follow up #5086 --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 577d4f4b61..248f97a30d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,7 +24,7 @@ stages: paths: - bus -mirror-packages: ## mirror all recursive dependencies, source and win.binary of data.table suggests from inst/tests/tests-DESCRIPTION +mirror-packages: ## mirror all recursive dependencies, source and win.binary of data.table suggests from DESCRIPTION stage: dependencies tags: - linux @@ -40,7 +40,7 @@ mirror-packages: ## mirror all recursive dependencies, source and win.binary of - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -mirror-other-packages: ## mirror integration suggests from inst/tests/tests-DESCRIPTION +mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw stage: dependencies tags: - linux @@ -51,7 +51,7 @@ mirror-other-packages: ## mirror integration suggests from inst/tests/tests-DESC script: - echo 'source(".ci/ci.R")' >> .Rprofile - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - - Rscript -e 'mirror.packages(dcf.dependencies("inst/tests/tests-DESCRIPTION", "all"), repos=c(Sys.getenv("CRAN_MIRROR"), dcf.repos("inst/tests/tests-DESCRIPTION")), repodir="bus/mirror-other-packages/cran")' + - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' <<: *artifacts build: ## build data.table sources as tar.gz archive From f5fa5ecee0a2536999c87c0271cbfabd5b63657a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 30 Jul 2021 13:11:58 -0600 Subject: [PATCH 328/588] NEWS-only: update OpenBSD news item #5048 #5049 --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1745f4d153..1ce72afd52 100644 --- a/NEWS.md +++ b/NEWS.md @@ -177,7 +177,7 @@ 7. 
Grouping by a `list` column has its error message improved stating this is unsupported, [#4308](https://github.com/Rdatatable/data.table/issues/4308). Thanks @sindribaldur for filing, and @michaelchirico for the PR. Please add your vote and especially use cases to the [#1597](https://github.com/Rdatatable/data.table/issues/1597) feature request. -8. OpenBSD 6.9 released May 2021 apparently uses a 16 year old version of zlib (v1.2.3 from 2005) which induces `Compress gzip error: -9` from `fwrite()`, [#5048](https://github.com/Rdatatable/data.table/issues/5048). Thanks to Philippe Chataignon for investigating and for the PR which attempts a solution. +8. OpenBSD 6.9 released May 2021 uses a 16 year old version of zlib (v1.2.3 from 2005) plus cherry-picked bug fixes (i.e. a semi-fork of zlib) which induces `Compress gzip error: -9` from `fwrite()`, [#5048](https://github.com/Rdatatable/data.table/issues/5048). Thanks to Philippe Chataignon for investigating and fixing. Matt asked on OpenBSD's mailing list if zlib could be upgraded to 4 year old zlib 1.2.11 but forgot his tin hat: https://marc.info/?l=openbsd-misc&m=162455479311886&w=1. 9. `?"."`, `?".."`, `?".("`, and `?".()"` now point to `?data.table`, [#4385](https://github.com/Rdatatable/data.table/issues/4385) [#4407](https://github.com/Rdatatable/data.table/issues/4407). To help users find the documentation for these convenience features available inside `DT[...]`. Recall that `.` is an alias for `list`, and `..var` tells `data.table` to look for `var` in the calling environment as opposed to a column of the table. From dff60a83b7676d45b9164ac91fc241b6cc72baf0 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 3 Aug 2021 01:17:34 +0200 Subject: [PATCH 329/588] merge sort arg document better (#4427) --- NEWS.md | 2 ++ man/merge.Rd | 46 +++++++++++++++++----------------------------- man/setkey.Rd | 27 +++++++++++++++++---------- 3 files changed, 36 insertions(+), 39 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ce72afd52..eb426a6d66 100644 --- a/NEWS.md +++ b/NEWS.md @@ -185,6 +185,8 @@ 11. `melt()`'s internal C code is now more memory efficient, [#5054](https://github.com/Rdatatable/data.table/pull/5054). Thanks to Toby Dylan Hocking for the PR. +12. `?merge` and `?setkey` have been updated to clarify that the row order is retained when `sort=FALSE`, and why `NA`s are always first when `sort=TRUE`, [#2574](https://github.com/Rdatatable/data.table/issues/2574) [#2594](https://github.com/Rdatatable/data.table/issues/2594). Thanks to Davor Josipovic and Markus Bonsch for the reports, and Jan Gorecki for the PR. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/man/merge.Rd b/man/merge.Rd index fe0a03f7a0..6fcbc10866 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -4,7 +4,8 @@ \title{Merge two data.tables} \description{ Fast merge of two \code{data.table}s. The \code{data.table} method behaves -very similarly to that of \code{data.frame}s except that, by default, it attempts to merge +similarly to \code{data.frame} except that row order is specified, and by +default the columns to merge on are chosen: \itemize{ \item at first based on the shared key columns, and if there are none, @@ -13,7 +14,7 @@ very similarly to that of \code{data.frame}s except that, by default, it attempt \item then based on the common columns between the two \code{data.table}s. } -Set the \code{by}, or \code{by.x} and \code{by.y} arguments explicitly to override this default. 
+Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override this default. } \usage{ @@ -32,15 +33,17 @@ If \code{y} has no key columns, this defaults to the key of \code{x}.} \item{by.x, by.y}{Vectors of column names in \code{x} and \code{y} to merge on.} \item{all}{logical; \code{all = TRUE} is shorthand to save setting both \code{all.x = TRUE} and \code{all.y = TRUE}.} -\item{all.x}{logical; if \code{TRUE}, then extra rows will be added to the -output, one for each row in \code{x} that has no matching row in \code{y}. -These rows will have 'NA's in those columns that are usually filled with values -from \code{y}. The default is \code{FALSE}, so that only rows with data from both -\code{x} and \code{y} are included in the output.} +\item{all.x}{logical; if \code{TRUE}, rows from \code{x} which have no matching row +in \code{y} are included. These rows will have 'NA's in the columns that are usually +filled with values from \code{y}. The default is \code{FALSE} so that only rows with +data from both \code{x} and \code{y} are included in the output.} \item{all.y}{logical; analogous to \code{all.x} above.} -\item{sort}{logical. If \code{TRUE} (default), the merged \code{data.table} is -sorted by setting the key to the \code{by / by.x} columns. If \code{FALSE}, the -result is not sorted.} +\item{sort}{logical. If \code{TRUE} (default), the rows of the merged +\code{data.table} are sorted by setting the key to the \code{by / by.x} columns. If +\code{FALSE}, unlike base R's \code{merge} for which row order is unspecified, the +row order in \code{x} is retained (including retaining the position of missings when +\code{all.x=TRUE}), followed by \code{y} rows that don't match \code{x} (when \code{all.y=TRUE}) +retaining the order those appear in \code{y}.} \item{suffixes}{A \code{character(2)} specifying the suffixes to be used for making non-\code{by} column names unique. The suffix behaviour works in a similar fashion as the \code{\link{merge.data.frame}} method does.} @@ -54,27 +57,12 @@ as any \code{by.x}.} \details{ \code{\link{merge}} is a generic function in base R. It dispatches to either the \code{merge.data.frame} method or \code{merge.data.table} method depending on -the class of its first argument. Note that, unlike \code{SQL}, \code{NA} is +the class of its first argument. Note that, unlike \code{SQL} join, \code{NA} is matched against \code{NA} (and \code{NaN} against \code{NaN}) while merging. -In versions \code{<= v1.9.4}, if the specified columns in \code{by} were not the -key (or head of the key) of \code{x} or \code{y}, then a \code{\link{copy}} is -first re-keyed prior to performing the merge. This was less performant as well as memory -inefficient. The concept of secondary keys (implemented in \code{v1.9.4}) was -used to overcome this limitation from \code{v1.9.6}+. No deep copies are made -any more, thereby improving performance and memory efficiency. Also, there is better -control for providing the columns to merge on with the help of the newly implemented -\code{by.x} and \code{by.y} arguments. - For a more \code{data.table}-centric way of merging two \code{data.table}s, see \code{\link{[.data.table}}; e.g., \code{x[y, \dots]}. See FAQ 1.11 for a detailed comparison of \code{merge} and \code{x[y, \dots]}. - -If any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y}, -then this \code{data.table} method will add the \code{suffixes} to those column names. 
As of
-R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has
-been proposed (see r-devel thread \href{https://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html}{here})
-which is looking likely to be accepted for a future version of R.
 }

 \value{
@@ -84,7 +72,7 @@ set to \code{TRUE}.
 }

 \seealso{
-\code{\link{data.table}}, \code{\link{as.data.table}}, \code{\link{[.data.table}},
+\code{\link{data.table}}, \code{\link{setkey}}, \code{\link{[.data.table}},
 \code{\link{merge.data.frame}}
 }

@@ -125,14 +113,14 @@ merge(d4, d1)
 merge(d1, d4, all=TRUE)
 merge(d4, d1, all=TRUE)

-# new feature, no need to set keys anymore
+# setkey is automatic by default
 set.seed(1L)
 d1 <- data.table(a=sample(rep(1:3,each=2)), z=1:6)
 d2 <- data.table(a=2:0, z=10:12)
 merge(d1, d2, by="a")
 merge(d1, d2, by="a", all=TRUE)

-# new feature, using by.x and by.y arguments
+# using by.x and by.y
 setnames(d2, "a", "b")
 merge(d1, d2, by.x="a", by.y="b")
 merge(d1, d2, by.x="a", by.y="b", all=TRUE)
diff --git a/man/setkey.Rd b/man/setkey.Rd
index daf10c83ad..ca386c58d7 100644
--- a/man/setkey.Rd
+++ b/man/setkey.Rd
@@ -9,16 +9,23 @@
 \title{ Create key on a data.table }
 \description{
 \code{setkey} sorts a \code{data.table} and marks it as sorted with an
-attribute \code{sorted}. The sorted columns are the key. The key can be any
-number of columns. The columns are always sorted in \emph{ascending} order. The table
-is changed \emph{by reference} and \code{setkey} is very memory efficient.
-
-There are three reasons \code{setkey} is desirable: i) binary search and joins are faster
-when they detect they can use an existing key, ii) grouping by a leading subset of the key
-columns is faster because the groups are already gathered contiguously in RAM, iii)
-simpler shorter syntax; e.g. \code{DT["id",]} finds the group "id" in the first column
-of DT's key using binary search. It may be helpful to think of a key as
-super-charged rownames: multi-column and multi-type rownames.
+attribute \code{"sorted"}. The sorted columns are the key. The key can be any
+number of columns. The data is always sorted in \emph{ascending} order with \code{NA}s
+(if any) always first. The table is changed \emph{by reference} and there is
+no memory used for the key (other than marking which columns the data is sorted by).
+
+There are three reasons \code{setkey} is desirable:
+\itemize{
+  \item binary search and joins are faster when they detect they can use an existing key
+  \item grouping by a leading subset of the key columns is faster because the groups are already gathered contiguously in RAM
+  \item simpler shorter syntax; e.g. \code{DT["id",]} finds the group "id" in the first column of \code{DT}'s key using binary search. It may be helpful to think of a key as super-charged rownames: multi-column and multi-type.
+}
+
+\code{NA}s are always first because:
+\itemize{
+  \item \code{NA} is internally \code{INT_MIN} (a large negative number) in R. Keys and indexes are always in increasing order so if \code{NA}s are first, no special treatment or branch is needed in many \code{data.table} internals involving binary search. Placing \code{NA}s last is not an option, for reasons of speed, simplicity and robustness of internals at C level.
+  \item if any \code{NA}s are present then we believe it is better to display them up front (rather than hiding them at the end) to reduce the risk of not realizing \code{NA}s are present.
+} In \code{data.table} parlance, all \code{set*} functions change their input \emph{by reference}. That is, no copy is made at all other than for temporary From 6452c70fcb0cf1a5fe1cd52a3bd8afd459343c23 Mon Sep 17 00:00:00 2001 From: Bennet B Date: Wed, 4 Aug 2021 07:32:25 +0200 Subject: [PATCH 330/588] fixing "invalid permission" segfault from #5077 by only starting the amount of thread, memory was allocated for. (#5087) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ inst/tests/tests.Rraw | 14 ++++++++++++++ src/forder.c | 9 ++++++--- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ffa85cb31a..523a40f041 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,7 +65,8 @@ Authors@R: c( person("Tony","Fischetti", role="ctb"), person("Ofek","Shilon", role="ctb"), person("Vadim","Khotilovich", role="ctb"), - person("Hadley","Wickham", role="ctb")) + person("Hadley","Wickham", role="ctb"), + person("Bennet","Becker", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index eb426a6d66..a3f043eddc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -153,6 +153,8 @@ 28. `dplyr::arrange(DT)` uses `vctrs::vec_slice` which retains `data.table`'s class but uses C to bypass `[` method dispatch and does not adjust `data.table`'s attributes containing the index row numbers, [#5042](https://github.com/Rdatatable/data.table/issues/5042). `data.table`'s long-standing `.internal.selfref` mechanism to detect such operations by other packages was not being checked by `data.table` when using indexes, causing `data.table` filters and joins to use invalid indexes and return incorrect results after a `dplyr::arrange(DT)`. Thanks to @Waldi73 for reporting; @avimallu, @tlapak, @MichaelChirico, @jangorecki and @hadley for investigating and suggestions; and @mattdowle for the PR. The intended way to use `data.table` is `data.table::setkey(DT, col1, col2, ...)` which reorders `DT` by reference in parallel, sets the primary key for automatic use by subsequent `data.table` queries, and permits rowname-like usage such as `DT["foo",]` which returns the now-contiguous-in-memory block of rows where the first column of `DT`'s key contains `"foo"`. Multi-column-rownames (i.e. a primary key of more than one column) can be looked up using `DT[.("foo",20210728L), ]`. Using `==` in `i` is also optimized to use the key or indices, if you prefer using column names explicitly and `==`. An alternative to `setkey(DT)` is returning a new ordered result using `DT[order(col1, col2, ...), ]`. +29. A segfault occurred when `nrow/throttle < nthread`, [#5077](https://github.com/Rdatatable/data.table/issues/5077). With the default throttle of 1024 rows (see `?setDTthreads`), at least 64 threads would be needed to trigger the segfault since there needed to be more than 65,535 rows too. It occurred on a server with 256 logical cores where `data.table` uses 128 threads by default. Thanks to Bennet Becker for reporting, debugging at C level, and fixing. It also occurred when the throttle was increased so as to use fewer threads; e.g. at the limit `setDTthreads(throttle=nrow(DT))`. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. 
So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e30cd255d7..a39d8bfac9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17810,3 +17810,17 @@ setDF(d) d[1:50, "a"] = d[51:100, "a"] setDT(d) test(2200, nrow(d[a==99]), 2L) + +# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 +# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too +set.seed(1) +DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary +setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow +test(2201.1, nrow(DT[, .N, by=grp]), 255L) +test(2201.2, nrow(setkey(DT, grp)), 65536L) +set.seed(1) +DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun +test(2201.3, nrow(DT[, .N, by=grp]), 65536L) +test(2201.4, nrow(setkey(DT, grp)), 65536L) +setDTthreads() # restore default throttle + diff --git a/src/forder.c b/src/forder.c index 4ccdd549a9..e7676386e7 100644 --- a/src/forder.c +++ b/src/forder.c @@ -714,10 +714,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S Rprintf(_("nradix=%d\n"), nradix); #endif - nth = getDTthreads(nrow, true); // this nth is relied on in cleanup() + // global nth, TMP & UGRP + nth = getDTthreads(nrow, true); // this nth is relied on in cleanup(); throttle=true/false debated for #5077 TMP = (int *)malloc(nth*UINT16_MAX*sizeof(int)); // used by counting sort (my_n<=65536) in radix_r() UGRP = (uint8_t *)malloc(nth*256); // TODO: align TMP and UGRP to cache lines (and do the same for stack allocations too) if (!TMP || !UGRP /*|| TMP%64 || UGRP%64*/) STOP(_("Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d"), nth); + if (retgrp) { gs_thread = calloc(nth, sizeof(int *)); // thread private group size buffers gs_thread_alloc = calloc(nth, sizeof(int)); @@ -1222,8 +1224,9 @@ void radix_r(const int from, const int to, const int radix) { } else { // all groups are <=65535 and radix_r() will handle each one single-threaded. Therefore, this time // it does make sense to start a parallel team and there will be no nestedness here either. 
+ if (retgrp) { - #pragma omp parallel for ordered schedule(dynamic) num_threads(getDTthreads(ngrp, false)) + #pragma omp parallel for ordered schedule(dynamic) num_threads(MIN(nth, ngrp)) // #5077 for (int i=0; i Date: Wed, 4 Aug 2021 19:53:33 +0200 Subject: [PATCH 331/588] dll name uses underscore (#4442) --- .dev/cc.R | 10 +++++----- NAMESPACE | 2 +- NEWS.md | 4 ++++ R/onLoad.R | 9 +++------ src/Makevars.in | 6 +++--- src/Makevars.win | 2 +- src/init.c | 5 ++--- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index a908a6b3df..b35a24ae6c 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -50,7 +50,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys # Make sure library .so is not loaded (neither installed package nor from dev) dll = unlist(do.call("rbind",getLoadedDLLs())[,"path"]) - dll = grep("datatable.so",dll,value=TRUE) + dll = grep("data_table.so",dll,value=TRUE) sapply(dll, dyn.unload) gc() @@ -61,18 +61,18 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys if (clean) system("rm *.o *.so") OMP = if (omp) "" else "no-" if (debug) { - ret = system(sprintf("MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f%sopenmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o datatable.so *.c", CC, OMP)) + ret = system(sprintf("MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f%sopenmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data_table.so *.c", CC, OMP)) } else { - ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -fno-common' R CMD SHLIB -o datatable.so *.c", CC, OMP)) + ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) # TODO add -Wextra too? } if (ret) return() # clang -Weverything includes -pedantic and issues many more warnings than gcc - # system("R CMD SHLIB -o datatable.so *.c") + # system("R CMD SHLIB -o data_table.so *.c") if (any(sapply(objects(envir=.GlobalEnv),function(x){inherits(get(x,.GlobalEnv),"data.table")}))) { cat("ABOUT TO RELOAD .SO BUT THERE ARE DATA.TABLE OBJECTS IN .GLOBALENV SO FINALIZER MIGHT CRASH\n") } - dyn.load("datatable.so") + dyn.load("data_table.so") setwd(old) xx = getDLLRegisteredRoutines("datatable",TRUE) for (i in seq_along(xx$.Call)) diff --git a/NAMESPACE b/NAMESPACE index 277a6a2892..55d660a871 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -useDynLib(datatable, .registration=TRUE) +useDynLib("data_table", .registration=TRUE) ## For S4-ization import(methods) diff --git a/NEWS.md b/NEWS.md index a3f043eddc..4d58f97c43 100644 --- a/NEWS.md +++ b/NEWS.md @@ -189,6 +189,10 @@ 12. `?merge` and `?setkey` have been updated to clarify that the row order is retained when `sort=FALSE`, and why `NA`s are always first when `sort=TRUE`, [#2574](https://github.com/Rdatatable/data.table/issues/2574) [#2594](https://github.com/Rdatatable/data.table/issues/2594). Thanks to Davor Josipovic and Markus Bonsch for the reports, and Jan Gorecki for the PR. +13. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. + + > ... 
the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/onLoad.R b/R/onLoad.R index 5e72fab47f..9ad7051ffd 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -19,13 +19,12 @@ # Runs when loaded but not attached to search() path; e.g., when a package just Imports (not Depends on) data.table if (!exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # check when installed package is loaded but skip when developing the package with cc() - dllV = if (is.loaded("CdllVersion",PACKAGE="datatable")) .Call(CdllVersion) else "before 1.12.0" - # ^^ no dot as this is the name of the dll file, #3282 + dllV = if (is.loaded("CdllVersion",PACKAGE="data_table")) .Call(CdllVersion) else "before 1.12.0" RV = packageVersion("data.table") if (dllV != RV) { dll = if (.Platform$OS.type=="windows") "dll" else "so" # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 - stopf("The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, RV, toupper(dll)) + stopf("The data_table.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.", dll, dllV, RV, toupper(dll)) } builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { @@ -137,9 +136,7 @@ getRversion = function(...) stopf("Reminder to data.table developers: don't use # 4) Defining getRversion with a stopf() here helps prevent new switches on getRversion() being added in future. Easily circumvented but the point is to issue the message above. .onUnload = function(libpath) { - # fix for #474. 
the shared object name is different from package name - # So 'detach' doesn't find datatable.so, as it looks by default for data.table.so - library.dynam.unload("datatable", libpath) + library.dynam.unload("data_table", libpath) } # nocov end diff --git a/src/Makevars.in b/src/Makevars.in index b411786283..3e57d91193 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -8,7 +8,7 @@ PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ @zlib_libs@ # Note that -lz is now escaped via @zlib_libs@ when zlib is not installed all: $(SHLIB) - @echo PKG_CFLAGS = $(PKG_CFLAGS) + @echo PKG_CFLAGS = $(PKG_CFLAGS) @echo PKG_LIBS = $(PKG_LIBS) - if [ "$(SHLIB)" != "datatable$(SHLIB_EXT)" ]; then mv $(SHLIB) datatable$(SHLIB_EXT); fi - if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id datatable$(SHLIB_EXT) datatable$(SHLIB_EXT); fi + if [ "$(SHLIB)" != "data_table$(SHLIB_EXT)" ]; then mv $(SHLIB) data_table$(SHLIB_EXT); fi + if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id data_table$(SHLIB_EXT) data_table$(SHLIB_EXT); fi diff --git a/src/Makevars.win b/src/Makevars.win index 3ea29da12d..a878b2fd09 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -2,4 +2,4 @@ PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) -lz all: $(SHLIB) - mv $(SHLIB) datatable$(SHLIB_EXT) + mv $(SHLIB) data_table$(SHLIB_EXT) diff --git a/src/init.c b/src/init.c index 083649685e..49765ea482 100644 --- a/src/init.c +++ b/src/init.c @@ -245,8 +245,7 @@ static void setSizes() { // One place we need the largest sizeof is the working memory malloc in reorder.c } -void attribute_visible R_init_datatable(DllInfo *info) -// relies on pkg/src/Makevars to mv data.table.so to datatable.so +void attribute_visible R_init_data_table(DllInfo *info) { // C exported routines // must be also listed in inst/include/datatableAPI.h @@ -376,7 +375,7 @@ inline long long DtoLL(double x) { // under clang 3.9.1 -O3 and solaris-sparc but not solaris-x86 or gcc. // There is now a grep in CRAN_Release.cmd; use this union method instead. // int64_t may help rather than 'long long' (TODO: replace all long long with int64_t) - // The two types must be the same size. That is checked in R_init_datatable (above) + // The two types must be the same size. That is checked in R_init_data_table (above) // where sizeof(int64_t)==sizeof(double)==8 is checked. // Endianness should not matter because whether big or little, endianness is the same // inside this process, and the two types are the same size. 
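A minimal sketch (not part of the patch itself) of how to confirm the rename from an R session, assuming a build of data.table that includes this change; the `is.loaded()` call mirrors the check in the patched `onLoad.R`:

```r
library(data.table)
"data_table" %in% names(getLoadedDLLs())        # TRUE with this change; the entry used to be "datatable"
is.loaded("CdllVersion", PACKAGE="data_table")  # the same lookup the patched .onLoad() performs
```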
From 60a45533d0af76b143d531e0b50b00eed9e540ce Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 4 Aug 2021 12:05:32 -0600 Subject: [PATCH 332/588] fix tab in Makevars.in #4442 --- src/Makevars.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Makevars.in b/src/Makevars.in index 3e57d91193..fcfaceba99 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -8,7 +8,8 @@ PKG_LIBS = @PKG_LIBS@ @openmp_cflags@ @zlib_libs@ # Note that -lz is now escaped via @zlib_libs@ when zlib is not installed all: $(SHLIB) - @echo PKG_CFLAGS = $(PKG_CFLAGS) + @echo PKG_CFLAGS = $(PKG_CFLAGS) @echo PKG_LIBS = $(PKG_LIBS) if [ "$(SHLIB)" != "data_table$(SHLIB_EXT)" ]; then mv $(SHLIB) data_table$(SHLIB_EXT); fi if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id data_table$(SHLIB_EXT) data_table$(SHLIB_EXT); fi + From d6106424a03b462b7738dbb9eb4bc2b86c0cfb6f Mon Sep 17 00:00:00 2001 From: Kyle Haynes Date: Thu, 5 Aug 2021 09:05:27 +1000 Subject: [PATCH 333/588] Adding plike (#4129) --- DESCRIPTION | 3 ++- NAMESPACE | 2 +- NEWS.md | 2 ++ R/like.R | 8 +++++--- inst/tests/tests.Rraw | 9 +++++---- man/like.Rd | 6 +++++- 6 files changed, 20 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 523a40f041..ff8fe0ebf6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,7 +66,8 @@ Authors@R: c( person("Ofek","Shilon", role="ctb"), person("Vadim","Khotilovich", role="ctb"), person("Hadley","Wickham", role="ctb"), - person("Bennet","Becker", role="ctb")) + person("Bennet","Becker", role="ctb"), + person("Kyle","Haynes", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NAMESPACE b/NAMESPACE index 55d660a871..fbd4f8df21 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,7 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) -export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%") +export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%") export(timetaken) export(truelength, setalloccol, alloc.col, ":=") export(setattr, setnames, setcolorder, set, setDT, setDF) diff --git a/NEWS.md b/NEWS.md index 4d58f97c43..f944a2ffb8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -95,6 +95,8 @@ 14. `.datatable.aware` is now recognized in the calling environment in addition to the namespace of the calling package, [dtplyr#184](https://github.com/tidyverse/dtplyr/issues/184). Thanks to Hadley Wickham for the idea and PR. +15. New convenience function `%plike%` maps to `like(..., perl=TRUE)`, [#3702](https://github.com/Rdatatable/data.table/issues/3702). `%plike%` uses Perl-compatible regular expressions (PCRE) which extend TRE, and may be more efficient in some cases. Thanks @KyleHaynes for the suggestion and PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. 
An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/like.R b/R/like.R index dd2a8c5b59..b86faca8d3 100644 --- a/R/like.R +++ b/R/like.R @@ -1,15 +1,15 @@ # Intended for use with a data.table 'where' # Don't use * or % like SQL's like. Uses regexpr syntax - more powerful. # returns 'logical' so can be combined with other where clauses. -like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { +like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE, perl = FALSE) { if (is.factor(vector)) { # indexing by factors is equivalent to indexing by the numeric codes, see ?`[` #4748 - ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed)[vector] + ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed, perl = perl)[vector] ret[is.na(ret)] = FALSE ret } else { # most usually character, but integer and numerics will be silently coerced by grepl - grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed) + grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed, perl = perl) } } @@ -19,3 +19,5 @@ like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { # as grep -F or fgrep -- grep against a fixed pattern (no regex) # (more efficient where applicable) "%flike%" = function(vector, pattern) like(vector, pattern, fixed = TRUE) +# Perl-compatible regex +"%plike%" = function(vector, pattern) like(vector, pattern, perl = TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a39d8bfac9..49dd28509f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7366,7 +7366,7 @@ test(1530.2, which.first(x), which(x)[1L]) test(1530.3, which.last(1:5), error = "x not boolean") test(1530.4, which.last(x), tail(which(x), 1L)) -# test for like, %like%, %ilike%, %flike% +# test for like, %like%, %ilike%, %flike%, %plike% set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) @@ -7382,10 +7382,11 @@ test(1532.06, like(x, '()'), c(TRUE, TRUE, TRUE)) test(1532.07, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) test(1532.08, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) test(1532.09, x %flike% '()', c(FALSE, FALSE, TRUE)) -## %like% test for ordered factor with NA -x = c("A", "B", "C", NA_character_) +test(1532.10, like(x, "(?=h)(?=.*y)", perl = TRUE), c(FALSE, TRUE, FALSE)) +test(1532.11, x %plike% "(?=h)(?=.*y)", c(FALSE, TRUE, FALSE)) #3702 +x = c("A", "B", "C", NA_character_) # ordered factor with NA x = ordered(x, levels = rev(x)[-1L]) -test(1532.10, x %like% "A", c(TRUE, FALSE, FALSE, FALSE)) +test(1532.12, x %like% "A", c(TRUE, FALSE, FALSE, FALSE)) # coverage for setkey() to 100% dt1 = data.table(x=sample(5), y=1:5, key="y") diff --git a/man/like.Rd b/man/like.Rd index 4eadb98a81..81016d2843 100644 --- a/man/like.Rd +++ b/man/like.Rd @@ -3,6 +3,7 @@ \alias{\%like\%} \alias{\%ilike\%} \alias{\%flike\%} +\alias{\%plike\%} \title{ Convenience function for calling grep. } \description{ Intended for use in \code{i} in \code{\link[=data.table]{[.data.table}}, i.e., for subsetting/filtering. @@ -10,16 +11,18 @@ Syntax should be familiar to SQL users, with interpretation as regex. 
} \usage{ -like(vector, pattern, ignore.case = FALSE, fixed = FALSE) +like(vector, pattern, ignore.case = FALSE, fixed = FALSE, perl = FALSE) vector \%like\% pattern vector \%ilike\% pattern vector \%flike\% pattern +vector \%plike\% pattern } \arguments{ \item{vector}{ Either a \code{character} or a \code{factor} vector. } \item{pattern}{ Pattern to be matched } \item{ignore.case}{ \code{logical}; is \code{pattern} case-sensitive? } \item{fixed}{ \code{logical}; should \code{pattern} be interpreted as a literal string (i.e., ignoring regular expressions)? } + \item{perl}{ \code{logical}; is \code{pattern} Perl-compatible regular expression? } } \details{ Internally, \code{like} is essentially a wrapper around \code{\link[base:grep]{base::grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion). @@ -34,5 +37,6 @@ DT = data.table(Name=c("Mary","George","Martha"), Salary=c(2,3,4)) DT[Name \%like\% "^Mar"] DT[Name \%ilike\% "mar"] DT[Name \%flike\% "Mar"] +DT[Name \%plike\% "(?=Ma)(?=.*y)"] } \keyword{ data } From c2b5ca26944928e086a10a6f7737c29a59c69381 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 5 Aug 2021 02:40:38 +0200 Subject: [PATCH 334/588] fwrite allows sep="" (#5091) --- NEWS.md | 2 ++ R/fwrite.R | 2 +- inst/tests/tests.Rraw | 7 +++++++ src/fwrite.c | 24 +++++++++++++++--------- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index f944a2ffb8..b5b93e3546 100644 --- a/NEWS.md +++ b/NEWS.md @@ -97,6 +97,8 @@ 15. New convenience function `%plike%` maps to `like(..., perl=TRUE)`, [#3702](https://github.com/Rdatatable/data.table/issues/3702). `%plike%` uses Perl-compatible regular expressions (PCRE) which extend TRE, and may be more efficient in some cases. Thanks @KyleHaynes for the suggestion and PR. +16. `fwrite()` now accepts `sep=""`, [#4817](https://github.com/Rdatatable/data.table/issues/4817). The motivation is an example where the result of `paste0()` needs to be written to file but `paste0()` takes 40 minutes due to constructing a very large number of unique long strings in R's global character cache. Allowing `fwrite(, sep="")` avoids the `paste0` and saves 40 mins. Thanks to Jan Gorecki for the request, and Ben Schwen for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
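A minimal sketch of the two items above (not from the patch itself; the data mimics the `?like` example and the `fwrite()` output follows the new tests added below):

```r
library(data.table)

# %plike% is like(..., perl=TRUE), so PCRE-only features such as lookahead work
DT = data.table(Name=c("Mary","George","Martha"), Salary=2:4)
DT[Name %plike% "(?=Ma)(?=.*y)"]   # only "Mary": starts with "Ma" and also contains "y"

# sep="" writes the columns concatenated with no separator (no paste0() step needed);
# with the default file="" the lines are echoed to the console
fwrite(data.table(a="id", b=letters[1:3], c=1:3), sep="")
#> abc
#> ida1
#> idb2
#> idc3
```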
diff --git a/R/fwrite.R b/R/fwrite.R index 3f85ff1ea0..c822b05678 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -42,7 +42,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } stopifnot(is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), - is.character(sep) && length(sep)==1L && nchar(sep) == 1L, + is.character(sep) && length(sep)==1L && (nchar(sep) == 1L || sep == ""), is.character(sep2) && length(sep2)==3L && nchar(sep2[2L])==1L, is.character(dec) && length(dec)==1L && nchar(dec) == 1L, dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 49dd28509f..963c8c9718 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17825,3 +17825,10 @@ test(2201.3, nrow(DT[, .N, by=grp]), 65536L) test(2201.4, nrow(setkey(DT, grp)), 65536L) setDTthreads() # restore default throttle +# fwrite now allows sep="", #4817 +test(2202.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), + output = c("abc", paste0("id", letters[1:5], 1:5))) +test(2202.2, fwrite(data.frame(a="id", b=1:1e2), sep=""), + output = c("ab", paste0("id", 1:1e2))) +test(2202.3, fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), sep=""), + output=c("ab", "foo", "2", "3.01bar")) diff --git a/src/fwrite.c b/src/fwrite.c index 7bad0cd168..f7f4003181 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -35,6 +35,7 @@ // Globals for this file only. Written once to hold parameters passed from R level. static const char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files +static int sepLen; // 0 when sep="" for #4817, otherwise 1 static char sep2; // '|' within list columns. Used here to know if field should be quoted and in freadR.c to write sep2 in list columns static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 static int8_t doQuote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) @@ -590,6 +591,7 @@ void fwriteMain(fwriteMainArgs args) na = args.na; sep = args.sep; + sepLen = sep=='\0' ? 0 : 1; sep2 = args.sep2; dec = args.dec; scipen = args.scipen; @@ -635,10 +637,10 @@ void fwriteMain(fwriteMainArgs args) // could be console output) and writing column names to it. double t0 = wallclock(); - size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + 1/*sep*/); + size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + sepLen); if (args.doRowNames) { maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames, args.nrow)*2 : 1+(int)log10(args.nrow); // the width of the row number - maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + sepLen; } for (int j=0; j> column name) + headerLen += args.ncol*(sepLen+(doQuote!=0)*2) + eolLen + 3; // 3 in case doRowNames and doQuote (the first blank <<"",>> column name) } if (headerLen) { char *buff = malloc(headerLen); @@ -716,13 +718,15 @@ void fwriteMain(fwriteMainArgs args) if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv - *ch++ = sep; + *ch = sep; + ch += sepLen; } for (int j=0; j=1 because 0-columns was caught earlier. + ch -= sepLen; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. 
write_chars(args.eol, &ch); // overwrite last sep with eol instead } // compress buffer if gzip From 2bbd07d586421d8260ce80687d40c6ec8a291d03 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 5 Aug 2021 00:28:21 -0600 Subject: [PATCH 335/588] end of line whitespace to reduce diff in #3414 --- .ci/ci.R | 34 +++++++++++++++---------------- .ci/deploy.sh | 2 +- man/like.Rd | 2 +- vignettes/datatable-importing.Rmd | 4 ++-- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.ci/ci.R b/.ci/ci.R index b124843b2a..70e5fa27a2 100644 --- a/.ci/ci.R +++ b/.ci/ci.R @@ -5,10 +5,10 @@ ## added ver argument to produce R version independent urls ## https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17420 contrib.url <- -function (repos, type = getOption("pkgType"), ver) +function (repos, type = getOption("pkgType"), ver) { type <- utils:::resolvePkgType(type) - if (is.null(repos)) + if (is.null(repos)) return(NULL) if ("@CRAN@" %in% repos && interactive()) { cat(gettext("--- Please select a CRAN mirror for use in this session ---"), "\n", sep = "") @@ -17,12 +17,12 @@ function (repos, type = getOption("pkgType"), ver) m <- match("@CRAN@", repos) nm <- names(repos) repos[m] <- getOption("repos")["CRAN"] - if (is.null(nm)) + if (is.null(nm)) nm <- rep("", length(repos)) nm[m] <- "CRAN" names(repos) <- nm } - if ("@CRAN@" %in% repos) + if ("@CRAN@" %in% repos) stop("trying to use CRAN without setting a mirror") if(missing(ver)) { ver <- paste(R.version$major, strsplit(R.version$minor, ".", fixed=TRUE)[[1L]][1L], sep = ".") @@ -37,7 +37,7 @@ function (repos, type = getOption("pkgType"), ver) res <- switch( type, source = paste(gsub("/$", "", repos), "src", "contrib", sep = "/"), - mac.binary = paste(gsub("/$", "", repos), "bin", mac.path, "contrib", ver, sep = "/"), + mac.binary = paste(gsub("/$", "", repos), "bin", mac.path, "contrib", ver, sep = "/"), win.binary = paste(gsub("/$", "", repos), "bin", "windows", "contrib", ver, sep = "/") ) res @@ -45,7 +45,7 @@ function (repos, type = getOption("pkgType"), ver) ## returns dependencies for a package based on its DESCRIPTION file dcf.dependencies <- -function(file = "DESCRIPTION", +function(file = "DESCRIPTION", which = NA, except.priority = "base") { if (!is.character(file) || !length(file) || !all(file.exists(file))) @@ -71,7 +71,7 @@ function(file = "DESCRIPTION", }, which = which), use.names = FALSE) local.extract_dependency_package_names = function (x) { ## do not filter out R like tools:::.extract_dependency_package_names, used for web/$pkg/index.html - if (is.na(x)) + if (is.na(x)) return(character()) x <- unlist(strsplit(x, ",[[:space:]]*")) x <- sub("[[:space:]]*([[:alnum:].]+).*", "\\1", x) @@ -101,13 +101,13 @@ function(file = "DESCRIPTION") { ## download dependencies recursively for provided packages ## put all downloaded packages into local repository mirror.packages <- -function(pkgs, - which = c("Depends", "Imports", "LinkingTo"), - repos = getOption("repos"), - type = c("source", "mac.binary", "win.binary"), - repodir, - except.repodir = repodir, - except.priority = "base", +function(pkgs, + which = c("Depends", "Imports", "LinkingTo"), + repos = getOption("repos"), + type = c("source", "mac.binary", "win.binary"), + repodir, + except.repodir = repodir, + except.priority = "base", method, quiet = TRUE, binary.ver, @@ -161,7 +161,7 @@ function(pkgs, warning(sprintf("Packages binaries could not be found in provided reposistories for R version %s: %s", binary.ver, paste(newpkgs[!availpkgs], collapse = ", "))) newpkgs <- 
newpkgs[availpkgs] } - + pkgsext <- switch(type, "source" = "tar.gz", "mac.binary" = "tgz", @@ -171,8 +171,8 @@ function(pkgs, unlink(dlfiles[file.exists(dlfiles)]) ## repos argument is not used in download.packages, only as default for contriburl argument ## we provide contriburl to avoid interactive CRAN menu popup twice in mirror.packages - dp <- utils::download.packages(pkgs = newpkgs, destdir = destdir, - available = db, contriburl = repos.url, + dp <- utils::download.packages(pkgs = newpkgs, destdir = destdir, + available = db, contriburl = repos.url, type = type, method = method, quiet = quiet) tools::write_PACKAGES(dir = destdir, type = type, ...) dp diff --git a/.ci/deploy.sh b/.ci/deploy.sh index 6d0fd3590a..6f01ef136f 100644 --- a/.ci/deploy.sh +++ b/.ci/deploy.sh @@ -24,7 +24,7 @@ addToDrat(){ commit='Travis publish data.table: build $TRAVIS_COMMIT', \ addFiles=TRUE, fields='Revision')" git push --force upstream gh-pages 2>err.txt - + } addToDrat diff --git a/man/like.Rd b/man/like.Rd index 81016d2843..ed32340b96 100644 --- a/man/like.Rd +++ b/man/like.Rd @@ -7,7 +7,7 @@ \title{ Convenience function for calling grep. } \description{ Intended for use in \code{i} in \code{\link[=data.table]{[.data.table}}, i.e., for subsetting/filtering. - + Syntax should be familiar to SQL users, with interpretation as regex. } \usage{ diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index f59f910ff4..689e68903e 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -173,8 +173,8 @@ my.write = function (x) { } ``` -When using a package as a suggested dependency, you should not `import` it in the `NAMESPACE` file. Just mention it in the `DESCRIPTION` file. -When using `data.table` functions in package code (R/* files) you need to use the `data.table::` prefix because none of them are imported. +When using a package as a suggested dependency, you should not `import` it in the `NAMESPACE` file. Just mention it in the `DESCRIPTION` file. +When using `data.table` functions in package code (R/* files) you need to use the `data.table::` prefix because none of them are imported. When using `data.table` in package tests (e.g. tests/testthat/test* files), you need to declare `.datatable.aware=TRUE` in one of the R/* files. 
## `data.table` in `Imports` but nothing imported From 0820aabd551fb4a78e389e9f9213d4c718f2f6f6 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 5 Aug 2021 10:50:37 -0600 Subject: [PATCH 336/588] .dev-only: datatable to data_table in cc.R follow up to #4442 --- .dev/cc.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index b35a24ae6c..6c278e2693 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -41,7 +41,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys stopifnot(is.character(CC), length(CC)==1L, !is.na(CC), nzchar(CC)) gc() - xx = try(getDLLRegisteredRoutines("datatable",TRUE), silent=TRUE) + xx = try(getDLLRegisteredRoutines("data_table",TRUE), silent=TRUE) if (!inherits(xx, "try-error")) { remove(list=sapply(xx$.Call,'[[',"name"), pos=.GlobalEnv) remove(list=sapply(xx$.External,'[[',"name"), pos=.GlobalEnv) @@ -74,7 +74,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys } dyn.load("data_table.so") setwd(old) - xx = getDLLRegisteredRoutines("datatable",TRUE) + xx = getDLLRegisteredRoutines("data_table",TRUE) for (i in seq_along(xx$.Call)) assign(xx$.Call[[i]]$name, xx$.Call[[i]]$address, envir=.GlobalEnv) for (i in seq_along(xx$.External)) From 831013a8892a6d39e12bbd45aeb559ccd258bc0a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 5 Aug 2021 18:12:04 -0700 Subject: [PATCH 337/588] format_col/format_list_item printing generics for customization (#3414) --- NAMESPACE | 8 ++++ NEWS.md | 2 + R/print.data.table.R | 86 ++++++++++++++++++++++++----------------- inst/tests/tests.Rraw | 34 ++++++++++++++++ man/print.data.table.Rd | 35 +++++++++++++++++ 5 files changed, 130 insertions(+), 35 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index fbd4f8df21..999a834304 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -187,3 +187,11 @@ S3method(unique, ITime) S3method('[<-', IDate) S3method(edit, data.table) +# generics to support custom column formatters +export(format_col) +S3method(format_col, default) +S3method(format_col, POSIXct) +S3method(format_col, expression) +export(format_list_item) +S3method(format_list_item, default) + diff --git a/NEWS.md b/NEWS.md index b5b93e3546..f333ec0356 100644 --- a/NEWS.md +++ b/NEWS.md @@ -99,6 +99,8 @@ 16. `fwrite()` now accepts `sep=""`, [#4817](https://github.com/Rdatatable/data.table/issues/4817). The motivation is an example where the result of `paste0()` needs to be written to file but `paste0()` takes 40 minutes due to constructing a very large number of unique long strings in R's global character cache. Allowing `fwrite(, sep="")` avoids the `paste0` and saves 40 mins. Thanks to Jan Gorecki for the request, and Ben Schwen for the PR. +17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns; `format_list_item` is S3-generic for customizing how to print each row of a list column. 
Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), and @MichaelChirico for implementing. See `?print.data.table` for examples. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/print.data.table.R b/R/print.data.table.R index 3f19cdc391..023551074a 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -140,44 +140,11 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), invisible(x) } -format.data.table = function (x, ..., justify="none", timezone = FALSE) { +format.data.table = function (x, ..., justify="none") { if (is.atomic(x) && !is.null(x)) { stopf("Internal structure doesn't seem to be a list. Possibly corrupt data.table.") } - format.item = function(x) { - if (is.null(x)) # NULL item in a list column - "" - else if (is.atomic(x) || inherits(x,"formula")) # FR #2591 - format.data.table issue with columns of class "formula" - paste(c(format(head(x, 6L), justify=justify, ...), if (length(x) > 6L) "..."), collapse=",") # fix for #37 - format has to be added here... - else - paste0("<", class(x)[1L], paste_dims(x), ">") - } - # FR #2842 add timezone for posix timestamps - format.timezone = function(col) { # paste timezone to a time object - tz = attr(col,'tzone', exact=TRUE) - if (!is.null(tz)) { # date object with tz - nas = is.na(col) - col = paste0(as.character(col)," ",tz) # parse to character - col[nas] = NA_character_ - } - return(col) - } - # FR #1091 for pretty printing of character - # TODO: maybe instead of doing "this is...", we could do "this ... test"? - char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { - trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) - if (!is.character(x) || trunc.char <= 0L) return(x) - idx = which(nchar(x) > trunc.char) - x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") - x - } - do.call("cbind",lapply(x,function(col,...) { - if (!is.null(dim(col))) return("") - if(timezone) col = format.timezone(col) - if (is.list(col)) col = vapply_1c(col, format.item) - else col = format(char.trunc(col), justify=justify, ...) # added an else here to fix #37 - col - },...)) + do.call("cbind", lapply(x, format_col, ..., justify=justify)) } mimicsAutoPrint = c("knit_print.default") @@ -205,6 +172,55 @@ paste_dims = function(x) { paste0("[", paste(dims,collapse="x"), "]") } +format_col = function(x, ...) { + UseMethod("format_col") +} + +format_list_item = function(x, ...) { + UseMethod("format_list_item") +} + +format_col.default = function(x, ...) 
{ + if (!is.null(dim(x))) return("") + if (is.list(x)) return(vapply_1c(x, format_list_item, ...)) + format(char.trunc(x), ...) # relevant to #37 +} + +# #2842 -- different columns can have different tzone, so force usage in output +format_col.POSIXct = function(x, ..., timezone=FALSE) { + if (timezone) { + tz = attr(x,'tzone',exact=TRUE) + nas = is.na(x) + x = paste0(as.character(x)," ",tz) + is.na(x) = nas + } else { + x = format(x, usetz=FALSE) + } + x +} + +# #3011 -- expression columns can wrap to newlines which breaks printing +format_col.expression = function(x, ...) format(char.trunc(as.character(x)), ...) + +format_list_item.default = function(x, ...) { + if (is.null(x)) # NULL item in a list column + "" + else if (is.atomic(x) || inherits(x, "formula")) # FR #2591 - format.data.table issue with columns of class "formula" + paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... + else + paste0("<", class(x)[1L], paste_dims(x), ">") +} + +# FR #1091 for pretty printing of character +# TODO: maybe instead of doing "this is...", we could do "this ... test"? +char.trunc <- function(x, trunc.char = getOption("datatable.prettyprint.char")) { + trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) + if (!is.character(x) || trunc.char <= 0L) return(x) + idx = which(nchar(x) > trunc.char) + x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") + x +} + # to calculate widths of data.table for PR #4074 # gets the width of the data.table at each column # and compares it to the console width diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 963c8c9718..9a30d6b3d4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -34,6 +34,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table + format_col.default = data.table:::format_col.default + format_list_item.default = data.table:::format_list_item.default getdots = data.table:::getdots groupingsets.data.table = data.table:::groupingsets.data.table guess = data.table:::guess @@ -17002,6 +17004,38 @@ DT = data.table( s4class(x=2L, y="yes", z=1))) test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) +# format_col and format_list_item printing helpers/generics +## Use case: solve #2842 by defining format_col.POSIXct to have usetz = TRUE +DT = data.table( + t1 = as.POSIXct('2018-05-01 12:34:56', tz = 'UTC'), + t2 = as.POSIXct('2018-05-01 12:34:56', tz = 'Asia/Singapore') +) +test(2130.101, print(DT, timezone=TRUE), output='UTC') +test(2130.102, print(DT, timezone=FALSE), notOutput='UTC') + +# default expression printing can break format_col.default, #3011 +test(2130.11, print(data.table(e = expression(1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13))), output = '1 + 2 + 3') + +# format_col generic is used +format_col.complex = function(x, ...) 
sprintf('(%.1f, %.1fi)', Re(x), Im(x)) +registerS3method("format_col", "complex", format_col.complex) +# this registerS3method does seem to be necessary to work within the test.data.table() environment +# assigning the method using <<- probably works too, but we don't want to write to user's environment at all +x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) +test(2130.12, x, output = '(1.0, 3.0i)') +rm(format_col.complex) +registerS3method("format_col", "complex", format_col.default) +# otherwise it remains registered after test.data.table() and causes test 1610.1 to fail on the next run for example, and user display if they have complex data +# haven't found a way to unregister an S3 method (tried registering NULL but there's an error that NULL isn't a function) + +# format_list_item() generic is used +format_list_item.myclass <- function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">") +registerS3method("format_list_item", "myclass", format_list_item.myclass) +DT = data.table(row = 1:2, objs = list(structure(list(id = "foo"), class = "myclass"), structure(list(id = "bar"), class = "myclass"))) +test(2130.13, print(DT), output = "myclass:foo.*myclass:bar") +rm(format_list_item.myclass) +registerS3method("format_list_item", "myclass", format_list_item.default) + # .SD from grouping should be unlocked, part of #4159 x = data.table(a=1:3, b=4:6) test(2131.1, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.locked'), diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index db7337a381..234fcd8ff1 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -1,10 +1,18 @@ \name{print.data.table} \alias{print.data.table} +\alias{format_col} +\alias{format_col.default} +\alias{format_col.POSIXct} +\alias{format_col.expression} +\alias{format_list_item} +\alias{format_list_item.default} \title{ data.table Printing Options } \description{ \code{print.data.table} extends the functionalities of \code{print.data.frame}. Key enhancements include automatic output compression of many observations and concise column-wise \code{class} summary. + + \code{format_col} and \code{format_list_item} generics provide flexibility for end-users to define custom printing methods for generic classes. } \usage{ \method{print}{data.table}(x, @@ -17,6 +25,14 @@ trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, timezone=FALSE, \dots) + + format_col(x, \dots) + \method{format_col}{default}(x, \dots) + \method{format_col}{POSIXct}(x, \dots, timezone=FALSE) + \method{format_col}{expression}(x, \dots) + + format_list_item(x, \dots) + \method{format_list_item}{default}(x, \dots) } \arguments{ \item{x}{ A \code{data.table}. } @@ -31,8 +47,17 @@ \item{timezone}{ If \code{TRUE}, time columns of class POSIXct or POSIXlt will be printed with their timezones (if attribute is available). } \item{\dots}{ Other arguments ultimately passed to \code{format}. } } +\value{ + \code{print.data.table} returns \code{x} invisibly. + + \code{format_col} returns a \code{length(x)}-size \code{character} vector. + + \code{format_list_item} returns a length-1 \code{character} scalar. +} \details{ By default, with an eye to the typically large number of observations in a \code{data.table}, only the beginning and end of the object are displayed (specifically, \code{head(x, topn)} and \code{tail(x, topn)} are displayed unless \code{nrow(x) < nrows}, in which case all rows will print). 
+ + \code{format_col} is applied at a column level; for example, \code{format_col.POSIXct} is used to tag the time zones of \code{POSIXct} columns. \code{format_list_item} is applied to the elements (rows) of \code{list} columns; see Examples. } \seealso{\code{\link{print.default}}} \examples{ @@ -72,5 +97,15 @@ thing_61 = vector("complex", 3)) print(DT, trunc.cols=TRUE) options(old_width) + + # Formatting customization + format_col.complex = function(x, ...) sprintf('(\%.1f, \%.1fi)', Re(x), Im(x)) + x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) + print(x) + + iris = as.data.table(iris) + iris_agg = iris[ , .(reg = list(lm(Sepal.Length ~ Petal.Length))), by = Species] + format_list_item.lm = function(x, ...) sprintf('', format(x$call$formula)) + print(iris_agg) } From 08cbf44cc9aab783430b1b7824f402f81f0cabb3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 9 Aug 2021 14:33:32 -0600 Subject: [PATCH 338/588] whitespace-only: tabs in frank.c --- src/frank.c | 64 ++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/frank.c b/src/frank.c index 2e9e14bcf1..e8445998b3 100644 --- a/src/frank.c +++ b/src/frank.c @@ -62,38 +62,38 @@ SEXP dt_na(SEXP x, SEXP cols) { // is.na(some_list) returns TRUE only for elements which are // scalar NA. for (int j=0; j Date: Tue, 10 Aug 2021 01:09:40 +0200 Subject: [PATCH 339/588] added fread(file=URL) support (#5097) --- NEWS.md | 2 ++ R/fread.R | 66 ++++++++++++++++++++----------------------- R/utils.R | 10 +++++-- inst/tests/tests.Rraw | 34 ++++++++++++---------- src/init.c | 2 ++ src/utils.c | 27 ++++++++++++++++++ 6 files changed, 87 insertions(+), 54 deletions(-) diff --git a/NEWS.md b/NEWS.md index f333ec0356..3a9ffbca5f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -161,6 +161,8 @@ 29. A segfault occurred when `nrow/throttle < nthread`, [#5077](https://github.com/Rdatatable/data.table/issues/5077). With the default throttle of 1024 rows (see `?setDTthreads`), at least 64 threads would be needed to trigger the segfault since there needed to be more than 65,535 rows too. It occurred on a server with 256 logical cores where `data.table` uses 128 threads by default. Thanks to Bennet Becker for reporting, debugging at C level, and fixing. It also occurred when the throttle was increased so as to use fewer threads; e.g. at the limit `setDTthreads(throttle=nrow(DT))`. +30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. 
For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/fread.R b/R/fread.R index 81ffb2a0df..1bc0267b34 100644 --- a/R/fread.R +++ b/R/fread.R @@ -54,41 +54,15 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (input=="" || length(grep('\\n|\\r', input))) { # input is data itself containing at least one \n or \r - } else { - if (startsWith(input, " ")) { - stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") - } - str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 - if (str7=="ftps://" || startsWith(input, "https://")) { - # nocov start - if (!requireNamespace("curl", quietly = TRUE)) - stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) - file = tmpFile - on.exit(unlink(tmpFile), add=TRUE) - # nocov end - } - else if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { - # nocov start - method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") - # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) - download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" - file = tmpFile - on.exit(unlink(tmpFile), add=TRUE) - # nocov end - } - else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command - cmd = input - if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { - messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) - } - } - else { - file = input # filename + } else if (startsWith(input, " ")) { + stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") + } else if (length(grep(' ', input, fixed=TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command + cmd = input + if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { + messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. 
A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) } + } else { + file = input # filename, including URLS } } if (!is.null(cmd)) { @@ -97,6 +71,26 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) } if (!is.null(file)) { + if (!is.character(file) || length(file)!=1L) + stopf("file= must be a single character string containing a filename, or URL starting 'http[s]://', 'ftp[s]://' or 'file://'") + if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 + # nocov start + tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below + if (w<=2L) { # https: or ftps: + if (!requireNamespace("curl", quietly = TRUE)) + stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + + curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) + } else { + method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 + else getOption("download.file.method", default="auto") # http: or ftp: + download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + } + file = tmpFile + on.exit(unlink(tmpFile), add=TRUE) + # nocov end + } file_info = file.info(file) if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd()) if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # dir.exists() requires R v3.2+, #989 @@ -104,10 +98,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") warningf("File '%s' has size 0. Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame') return(if (data.table) data.table(NULL) else data.frame(NULL)) } - if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { + if (w <- endsWithAny(file, c(".gz",".bz2"))) { if (!requireNamespace("R.utils", quietly = TRUE)) stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. 
Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (is_gz) gzfile else bzfile + FUN = if (w==1L) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) diff --git a/R/utils.R b/R/utils.R index b4ae4c9ee6..575913d345 100644 --- a/R/utils.R +++ b/R/utils.R @@ -28,9 +28,13 @@ if (base::getRversion() < "3.2.0") { # Apr 2015 if (!exists('startsWith', 'package:base', inherits=FALSE)) { # R 3.3.0; Apr 2016 startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub } -if (!exists('endsWith', 'package:base', inherits=FALSE)) { - endsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} -} +# endsWith no longer used from #5097 so no need to backport; prevent usage to avoid dev delay until GLCI's R 3.1.0 test +endsWith = function(...) stop("Internal error: use endsWithAny instead of base::endsWith") + +startsWithAny = function(x,y) .Call(CstartsWithAny, x, y, TRUE) +endsWithAny = function(x,y) .Call(CstartsWithAny, x, y, FALSE) +# For fread.R #5097 we need if any of the prefixes match, which one, and can return early on the first match +# Hence short and simple ascii-only at C level # which.first which.first = function(x) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9a30d6b3d4..ea8abd7537 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -30,7 +30,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = data.table:::dcast.data.table - if (!exists('endsWith', 'package:base', inherits=FALSE)) endsWith = data.table:::endsWith + endsWith = data.table:::endsWith + endsWithAny = data.table:::endsWithAny forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table @@ -5887,10 +5888,11 @@ f = paste0("file://",testDir("russellCRLF.csv")) # simulates a http:// request as far as file.download() and unlink() goes, without internet # download.file() in fread() changes the input data from \r\n to \n, on Windows. test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) - +test(1378.25, fread(file = f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) f = paste("file://",testDir("russellCRCRLF.csv"),sep="") # actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode. 
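# Aside, not part of the patch: a minimal sketch of what #5097 above enables, mirroring the
# file= tests nearby. A URL can now be passed to file= as well as to input=/the first argument;
# a throwaway file:// URL keeps the sketch self-contained (https:// URLs additionally need the
# 'curl' package). Assumes data.table is attached, as it is in this test file.
tmp = tempfile(fileext=".csv")
writeLines(c("a,b", "1,2"), tmp)
u = paste0("file://", tmp)
fread(u)          # worked before #5097
fread(file = u)   # errored 'does not exist or is non-readable' before; now copied to tmpdir and read
unlink(tmp)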
test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +test(1378.35, fread(file = f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== options(datatable.fread.datatable = FALSE) @@ -13094,12 +13096,15 @@ test(1921.2, as.IDate(1000), as.IDate("1972-09-27")) f = tempfile() file.create(f) test(1922.1, fread(f), data.table(NULL), warning = 'File.*size 0') -test(1922.2, fread(file = f), data.table(NULL), warning = 'File.*size 0') +test(1922.21, fread(file = f), data.table(NULL), warning = 'File.*size 0') +test(1922.22, fread(file = 2L), error="file= must be a single character string containing") # trigger download for last instance of warning test(1922.3, fread(paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') +test(1922.35, fread(file = paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') test(1922.4, fread(f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') test(1922.5, fread(file = f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') test(1922.6, fread(paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') +test(1922.65, fread(file = paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') unlink(f) #fwrite creates a file or does nothing, as appropriate, also #2898 @@ -17790,18 +17795,17 @@ if (test_bit64) { test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } -# compatibility of endsWith backport with base::endsWith -if (exists('endsWith', 'package:base', inherits=FALSE)) { - DTendsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} - BSendsWith = base::endsWith - test(2194.1, DTendsWith('abcd', 'd'), BSendsWith('abcd', 'd')) - test(2194.2, DTendsWith(letters, 'e'), BSendsWith(letters, 'e')) - test(2194.3, DTendsWith(NA_character_, 'a'), BSendsWith(NA_character_, 'a')) - test(2194.4, DTendsWith(character(), 'a'), BSendsWith(character(), 'a')) - # file used in encoding tests - txt = readLines(testDir("issue_563_fread.txt")) - test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B')) -} +# endsWithAny added in #5097 for internal use replacing one use of base::endsWith (in fread.R) +test(2194.1, endsWithAny('abcd', 'd'), 1L) +test(2194.2, endsWithAny('ab.bz2', c('.gz','.bz2')), 2L) +test(2194.3, endsWithAny('ab.bz', c('.gz','.bz2')), FALSE) +test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths incorrect") +test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) +test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") +# file used in encoding tests +txt = readLines(testDir("issue_563_fread.txt")) +test(2194.7, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 +test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") # uniqueN(x, by=character()) was internal error, #4594 DT = data.table(idx=c(1L,2L,1L,3L), value="val") diff --git a/src/init.c b/src/init.c index 49765ea482..910d675194 100644 --- a/src/init.c +++ b/src/init.c @@ -127,6 +127,7 @@ SEXP islockedR(); SEXP allNAR(); SEXP test_dt_win_snprintf(); SEXP dt_zlib_version(); +SEXP startsWithAny(); // .Externals SEXP fastmean(); @@ -222,6 +223,7 @@ R_CallMethodDef callMethods[] = { {"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1}, {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) 
&substitute_call_arg_namesR, -1}, +{"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {NULL, NULL, 0} }; diff --git a/src/utils.c b/src/utils.c index a1d9093b8d..6320ccd221 100644 --- a/src/utils.c +++ b/src/utils.c @@ -389,3 +389,30 @@ SEXP dt_zlib_version() { #endif return ScalarString(mkChar(out)); } + +SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { + // for is_url in fread.R added in #5097 + // startsWith was added to R in 3.3.0 so we need something to support R 3.1.0 + // short and simple ascii-only + if (!isString(x) || !isString(y) || length(x)!=1 || length(y)<1 || !isLogical(start) || length(start)!=1 || LOGICAL(start)[0]==NA_LOGICAL) + error("Internal error: data.table's internal startsWithAny types or lengths incorrect"); + const char *xd = CHAR(STRING_ELT(x, 0)); + const int n=length(y); + if (LOGICAL(start)[0]) { + for (int i=0; i=ylen && strncmp(xd+xlen-ylen, yd, ylen)==0) + return ScalarInteger(i+1); + } + } + return ScalarLogical(false); +} + From c3d1100cf814ee16d003f8ab294dcf8e93d75d16 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 10 Aug 2021 03:55:07 -0400 Subject: [PATCH 340/588] Fwrite integer rownames (#5098) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 35 +++++++++++++++++++++-------------- src/fwrite.c | 20 ++++++++++++-------- src/fwrite.h | 3 ++- src/fwriteR.c | 14 +++++++++++++- 5 files changed, 50 insertions(+), 24 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3a9ffbca5f..8bb857cef3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -163,6 +163,8 @@ 30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. +31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would ignore the integer rownames and write the row numbers instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when `quote='auto'` (default) and the rownames are integers (either default or specific), they are no longer quoted. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ea8abd7537..0ccda6f46b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10706,23 +10706,30 @@ test(1733.2, fwrite(data.table(c(1.2,-8.0,pi,67.99),1:4),dec=",",sep=";"), # fwrite implied and actual row.names DT = data.table(foo=1:3,bar=c(1.2,9.8,-6.0)) -test(1734.1, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)), - capture.output(write.csv(DT,quote=FALSE))) -test(1734.2, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)), - capture.output(write.csv(DT))) -test(1734.3, fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted - output="\"\",foo,bar\n\"1\",1,1.2\n\"2\",2,9.8\n\"3\",3,-6") +test(1734.01, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)), + capture.output(write.csv(DT,quote=FALSE))) +test(1734.02, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)), + capture.output(write.csv(DT))) +test(1734.03, fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted + output="\"\",foo,bar\n1,1,1.2\n2,2,9.8\n3,3,-6") DF = as.data.frame(DT) -test(1734.4, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), - capture.output(write.csv(DF,quote=FALSE))) -test(1734.5, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), - capture.output(write.csv(DF))) +test(1734.04, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), + capture.output(write.csv(DF,quote=FALSE))) +test(1734.05, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), + capture.output(write.csv(DF))) rownames(DF)[2] = "someName" rownames(DF)[3] = "another" -test(1734.6, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), - capture.output(write.csv(DF,quote=FALSE))) -test(1734.7, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), - capture.output(write.csv(DF))) +test(1734.06, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), + capture.output(write.csv(DF,quote=FALSE))) +test(1734.07, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), + capture.output(write.csv(DF))) +rownames(DF) = c(10L, -20L, 30L) ## test for #4957 +test(1734.08, capture.output(fwrite(DF, row.names=TRUE, quote=TRUE)), + capture.output(write.csv(DF))) +test(1734.09, capture.output(fwrite(DF, row.names=TRUE, quote=FALSE)), + capture.output(write.csv(DF, quote=FALSE))) +test(1734.10, fwrite(DF, row.names=TRUE, quote='auto'), + output=c('"",foo,bar','10,1,1.2','-20,2,9.8','30,3,-6')) # list columns and sep2 set.seed(1) diff --git a/src/fwrite.c b/src/fwrite.c index f7f4003181..2d10d222fd 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -623,8 +623,8 @@ void fwriteMain(fwriteMainArgs args) DTPRINT(_("... ")); for (int j=args.ncol-10; j Date: Fri, 13 Aug 2021 23:18:27 +0200 Subject: [PATCH 341/588] Allow 'type.convert' argument in `tstrsplit` function to support a function/list of functions/named list. 
(#5099) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/transpose.R | 51 ++++++++++++++++++++++++++++++++++++++++--- inst/tests/tests.Rraw | 26 ++++++++++++++++++++++ man/tstrsplit.Rd | 22 ++++++++++++++++++- 5 files changed, 99 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ff8fe0ebf6..d4f8fbe69c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,8 @@ Authors@R: c( person("Vadim","Khotilovich", role="ctb"), person("Hadley","Wickham", role="ctb"), person("Bennet","Becker", role="ctb"), - person("Kyle","Haynes", role="ctb")) + person("Kyle","Haynes", role="ctb"), + person("Kamgang","B", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index 8bb857cef3..73e8fd5608 100644 --- a/NEWS.md +++ b/NEWS.md @@ -101,6 +101,8 @@ 17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns; `format_list_item` is S3-generic for customizing how to print each row of a list column. Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), and @MichaelChirico for implementing. See `?print.data.table` for examples. +18. `tstrsplit(,type.convert=)` now accepts a named list of functions to apply to each part, [#5094](https://github.com/Rdatatable/data.table/issues/5094). Thanks to @Kamgang-B for the request and implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
diff --git a/R/transpose.R b/R/transpose.R index a326863e70..e6f9da0381 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -30,12 +30,57 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { if (!missing(keep)) { keep = suppressWarnings(as.integer(keep)) chk = min(keep) >= min(1L, length(ans)) & max(keep) <= length(ans) - if (!isTRUE(chk)) # handles NA case too + if (!isTRUE(chk) || !length(keep)) stopf("'keep' should contain integer values between %d and %d.", min(1L, length(ans)), length(ans)) - ans = ans[keep] + } else { + keep = seq_along(ans) } + if (isFALSE(type.convert)) + ans = ans[keep] # Implementing #1094, but default FALSE - if(type.convert) ans = lapply(ans, type.convert, as.is = TRUE) + else if (isTRUE(type.convert)) + ans = lapply(ans[keep], type.convert, as.is=TRUE) + # Implementing and extending #5094 + else if (is.function(type.convert)) + ans = lapply(ans[keep], type.convert) + else if (is.list(type.convert)) { + if (all(vapply(type.convert, is.function, NA)) && (length(keep) == length(type.convert) || length(type.convert) == 1L)) + ans = mapply(function(idx, fun) fun(ans[[idx]]), keep, type.convert, SIMPLIFY=FALSE, USE.NAMES=FALSE) + else { + n = length(type.convert) + if(!n) stopf("The argument 'type.convert' does not support empty list.") + is_named = nzchar(names(type.convert)) + all_is_named = length(is_named) && all(is_named) # because all(is_named)=TRUE if is_named=NULL <-- names(type.convert)=NULL + last_item = deparse1(substitute(type.convert)[[n + 1L]]) + if (!all_is_named) { + if (!(sum(!is_named) == 1L && !is_named[n] && is.function(type.convert[[n]]))) + stopf("When the argument 'type.convert' contains an unnamed element, it is expected to be the last element and should be a function. More than one unnamed element is not allowed unless all elements are functions with length equal to %d (the length of the transpose list or 'keep' argument if it is specified).", length(keep)) + else { + fothers = type.convert[[n]] + type.convert = type.convert[-n] + } + } + indxs = unlist(type.convert, recursive=FALSE, use.names=FALSE) + bad_indxs = setdiff(indxs, keep) + if (!is.numeric(indxs) || anyNA(indxs) || anyDuplicated(indxs)) + stopf("When the argument 'type.convert' contains transpose list indices, it should be a named list of non-missing integer values (with no duplicate) except the last element that should be unnamed if it is a function.") + if (length(bad_indxs)) + stopf("When the argument 'type.convert' contains transpose list indices, they should be integer values contained in the argument 'keep' (if it is specified) or be between %d and %d (if it is not). 
But '%s' is/are not contained in '%s'.", 1L, length(keep), toString(bad_indxs), toString(keep)) + if (exists("fothers", inherits=FALSE)) { + others = setdiff(keep, indxs) + if (length(others)) + ans[others] = lapply(ans[others], fothers) + else + warningf("In the argument 'type.convert', '%s' was ignored because all elements in the transpose list or elements corrisponding to indices specified in the 'keep' argument have already been converted.", last_item) + } + for (fn in names(type.convert)) { + idx = type.convert[[fn]] + ans[idx] = lapply(ans[idx], function(x) match.fun(fn)(x)) + } + ans = ans[keep] + } + } else + stopf("The argument 'type.convert' should be TRUE/FALSE, a function, a list of functions, or a named list of pairs 'fun=indices' with optionally one unnamed element (a function) but an object of type '%s' was provided.", typeof(type.convert)) if (isFALSE(names)) return(ans) else if (isTRUE(names)) names = paste0("V", seq_along(ans)) if (length(names) != length(ans)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0ccda6f46b..fb3cb363da 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17877,3 +17877,29 @@ test(2202.2, fwrite(data.frame(a="id", b=1:1e2), sep=""), output = c("ab", paste0("id", 1:1e2))) test(2202.3, fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), sep=""), output=c("ab", "foo", "2", "3.01bar")) + +# tstrsplit(,type.convert=) now accepts functions, #5094 +w = c("Yes/F", "No/M") +x = c("Yes 2000-03-01 A/T", "No 2000-04-01 E/R") +y = c("1/1/2", "2/5/2.5") +z = c("Yes/1/2", "No/5/3.5") +test(2203.01, tstrsplit(z, "/"), list(c("Yes", "No"), c("1", "5"), c("2", "3.5"))) +test(2203.02, tstrsplit(z, "/", type.convert=TRUE), list(c("Yes", "No"), c(1L, 5L), c(2, 3.5))) +test(2203.03, tstrsplit(y, "/", type.convert=as.numeric), list(c(1, 2), c(1, 5), c(2, 2.5))) +test(2203.04, tstrsplit(w, "/", type.convert=as.factor), list(factor(c("Yes", "No")), factor(c("F", "M")))) +test(2203.05, tstrsplit(w, "/", type.convert=list(as.factor)), list(factor(c("Yes", "No")), factor(c("F", "M")))) +test(2203.06, tstrsplit(z, "/", type.convert=list(as.numeric=2:3)), list(c("Yes", "No"), c(1, 5), c(2, 3.5))) +test(2203.07, tstrsplit(x, " ", type.convert=as.IDate.default, keep=2L), list(c(as.IDate.default("2000-03-01"), as.IDate.default("2000-04-01")))) +test(2203.08, tstrsplit(z, "/", type.convert=list(as.factor=1L, as.integer=2L, as.numeric=3L)), list(factor(c("Yes", "No")), c(1L, 5L), c(2, 3.5))) +test(2203.09, tstrsplit(z, "/", type.convert=list(as.factor, as.integer, as.numeric)), list(factor(c("Yes", "No")), c(1L, 5L), c(2, 3.5))) +test(2203.10, tstrsplit(w, "/", type.convert=function(x) type.convert(x, as.is=FALSE)), list(factor(c("Yes", "No")), factor(c("F", "M")))) +test(2203.11, tstrsplit(z, "/", type.convert=TRUE, names=TRUE), list(V1=c("Yes", "No"), V2=c(1L, 5L), V3=c(2, 3.5))) +test(2203.12, tstrsplit(z, "/", type.convert=TRUE, names=c("A", "B", "C")), list(A=c("Yes", "No"), B=c(1L, 5L), C=c(2, 3.5))) +test(2203.13, tstrsplit(x, "[- ]", type.convert=list(as.integer=2L), keep=c(1:2, 5L), names=c("bin", "yr", "tz")), list(bin=c("Yes", "No"), yr=c(2000L, 2000L), tz=c("A/T", "E/R"))) +test(2203.14, tstrsplit(w, "/", type.convert=list(as.factor=1:2, as.numeric)), list(factor(c("Yes", "No")), factor(c("F", "M"))), warning="type.convert.+was ignored") +test(2203.15, tstrsplit(z, "/", type.convert=list(as.factor, as.numeric)), error="unnamed.+all.+functions.+transpose list or 'keep' argument") +test(2203.16, tstrsplit(z, "/", 
type.convert=list(as.integer=2L), keep=5L), error="keep.+contain integer.+between") +test(2203.17, tstrsplit(w, "/", type.convert="4"), error="TRUE/FALSE.+function.+named list") +test(2203.18, tstrsplit(w, "/", type.convert=c(TRUE, FALSE)), error="TRUE/FALSE.+function.+named list") +test(2203.19, tstrsplit(w, "/", keep=integer()), error="keep.+contain integer.+between", ignore.warning="no non-missing") +test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty list") diff --git a/man/tstrsplit.Rd b/man/tstrsplit.Rd index 8719371733..a73f5e9f2c 100644 --- a/man/tstrsplit.Rd +++ b/man/tstrsplit.Rd @@ -14,7 +14,7 @@ tstrsplit(x, \dots, fill=NA, type.convert=FALSE, keep, names=FALSE) \item{x}{The vector to split (and transpose).} \item{\dots}{ All the arguments to be passed to \code{\link[base]{strsplit}}. } \item{fill}{ Default is \code{NA}. It is used to fill shorter list elements so as to return each element of the transposed result of equal lengths. } - \item{type.convert}{\code{TRUE} calls \code{\link{type.convert}} with \code{as.is=TRUE} on the columns.} + \item{type.convert}{\code{TRUE} calls \code{\link{type.convert}} with \code{as.is=TRUE} on the columns. May also be a function, list of functions, or named list of functions to apply to each part; see examples. } \item{keep}{Specify indices corresponding to just those list elements to retain in the transposed result. Default is to return all.} \item{names}{\code{TRUE} auto names the list with \code{V1, V2} etc. Default (\code{FALSE}) is to return an unnamed list.} } @@ -42,6 +42,26 @@ tstrsplit(x, "", fixed=TRUE, keep=c(1,3,5), names=LETTERS[1:3]) DT = data.table(x=c("A/B", "A", "B"), y=1:3) DT[, c("c1") := tstrsplit(x, "/", fixed=TRUE, keep=1L)][] DT[, c("c1", "c2") := tstrsplit(x, "/", fixed=TRUE)][] + +# type.convert argument +DT = data.table( + w = c("Yes/F", "No/M"), + x = c("Yes 2000-03-01 A/T", "No 2000-04-01 E/R"), + y = c("1/1/2", "2/5/2.5"), + z = c("Yes/1/2", "No/5/3.5"), + v = c("Yes 10 30.5 2000-03-01 A/T", "No 20 10.2 2000-04-01 E/R")) + +# convert each element in the transpose list to type factor +DT[, tstrsplit(w, "/", type.convert=as.factor)] + +# convert part (some elements) of the transpose list and leave another part (if there is any) unchanged. +DT[, tstrsplit(z, "/", type.convert=list(as.numeric=2:3))] + +# convert some elements to the corresponding specified types and the remaining elements to a given type +DT[, tstrsplit(z, "/", type.convert=list(as.factor=1L, as.numeric))] + +# convert given elements to specific types and convert the remaining elements using 'type.convert' function. 
+DT[, tstrsplit(v, " ", type.convert=list(as.factor=1L, as.IDate=4L, function(x) type.convert(x, as.is=TRUE)))] } \seealso{ \code{\link{data.table}}, \code{\link{transpose}} From 01bf66765e5a8d6c10816441fe3470766d9590bc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 13 Aug 2021 18:58:38 -0600 Subject: [PATCH 342/588] update contributor name #5099 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d4f8fbe69c..b8439044c9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,7 +68,7 @@ Authors@R: c( person("Hadley","Wickham", role="ctb"), person("Bennet","Becker", role="ctb"), person("Kyle","Haynes", role="ctb"), - person("Kamgang","B", role="ctb")) + person("Boniface Christian","Kamgang", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown From 67a21bbf7a2ae99bb6b6c0e127bb9a28cfafdad1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 13 Aug 2021 19:08:06 -0600 Subject: [PATCH 343/588] #5099 follow up: deparse1 was new in R 4.0.0 so use deparse --- R/transpose.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/transpose.R b/R/transpose.R index e6f9da0381..115752c04e 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -51,7 +51,7 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { if(!n) stopf("The argument 'type.convert' does not support empty list.") is_named = nzchar(names(type.convert)) all_is_named = length(is_named) && all(is_named) # because all(is_named)=TRUE if is_named=NULL <-- names(type.convert)=NULL - last_item = deparse1(substitute(type.convert)[[n + 1L]]) + last_item = paste(deparse(substitute(type.convert)[[n + 1L]], width.cutoff=500L), collapse=" ") if (!all_is_named) { if (!(sum(!is_named) == 1L && !is_named[n] && is.function(type.convert[[n]]))) stopf("When the argument 'type.convert' contains an unnamed element, it is expected to be the last element and should be a function. More than one unnamed element is not allowed unless all elements are functions with length equal to %d (the length of the transpose list or 'keep' argument if it is specified).", length(keep)) From 86a383db2ae14a08d805a3ea3f7453e1ce58b758 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 14 Aug 2021 08:41:49 -0600 Subject: [PATCH 344/588] #5099 follow up: line length 100 in .Rd --- man/tstrsplit.Rd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/man/tstrsplit.Rd b/man/tstrsplit.Rd index a73f5e9f2c..123539b09c 100644 --- a/man/tstrsplit.Rd +++ b/man/tstrsplit.Rd @@ -54,17 +54,17 @@ DT = data.table( # convert each element in the transpose list to type factor DT[, tstrsplit(w, "/", type.convert=as.factor)] -# convert part (some elements) of the transpose list and leave another part (if there is any) unchanged. +# convert part and leave any others DT[, tstrsplit(z, "/", type.convert=list(as.numeric=2:3))] -# convert some elements to the corresponding specified types and the remaining elements to a given type +# convert part with one function and any others with another DT[, tstrsplit(z, "/", type.convert=list(as.factor=1L, as.numeric))] -# convert given elements to specific types and convert the remaining elements using 'type.convert' function. -DT[, tstrsplit(v, " ", type.convert=list(as.factor=1L, as.IDate=4L, function(x) type.convert(x, as.is=TRUE)))] +# convert the remaining using 'type.convert(x, as.is=TRUE)' (i.e. 
what type.convert=TRUE does) +DT[, tstrsplit(v, " ", type.convert=list(as.IDate=4L, function(x) type.convert(x, as.is=TRUE)))] } \seealso{ - \code{\link{data.table}}, \code{\link{transpose}} + \code{\link{data.table}}, \code{\link{transpose}}, \code{\link[utils]{type.convert}} } \keyword{ data } From 6555976d7fd2a314d5d620806ed4fcd2d50f8dab Mon Sep 17 00:00:00 2001 From: "Florian G. Pflug" Date: Mon, 16 Aug 2021 21:30:55 +0200 Subject: [PATCH 345/588] Step 1 in fixing OpenMP detection in configure to work in cases where "-fopenmp" is needed (#4707) --- configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index f2e98ec312..9cf4baf981 100755 --- a/configure +++ b/configure @@ -87,8 +87,8 @@ EOF if [ "$R_NO_OPENMP" = "1" ]; then # Compilation failed -- try forcing -fopenmp instead. - # TODO: doesn't R_NO_OPENMP need to be set to 0 before next line? - ${CC} ${CFLAGS} -fopenmp test-omp.c || R_NO_OPENMP=1 + R_NO_OPENMP=0 + "${CC}" "${CFLAGS}" -fopenmp test-omp.c || R_NO_OPENMP=1 # TODO: and then nothing seems to be done with this outcome else echo "R CMD SHLIB supports OpenMP without any extra hint" From 893e169adfaed9379d3d4dd3c29d7858b1ad9fef Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 17 Aug 2021 20:42:50 -0600 Subject: [PATCH 346/588] .dev-only: flags to compile R 3.1.0 using recent gcc and gfortran --- .dev/CRAN_Release.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 2cc09f0653..b010d175f4 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -252,7 +252,7 @@ cd ~/build wget http://cran.stat.ucla.edu/src/base/R-3/R-3.1.0.tar.gz tar xvf R-3.1.0.tar.gz cd R-3.1.0 -./configure --without-recommended-packages +CFLAGS="-fcommon" FFLAGS="-fallow-argument-mismatch" ./configure --without-recommended-packages make alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD From 69ce6917e53387565cf23f22b069420cbd93b47b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 17 Aug 2021 21:40:34 -0600 Subject: [PATCH 347/588] isolate tests.Rraw from variables created by user in .GlobalEnv (#5101) --- NEWS.md | 2 ++ R/test.data.table.R | 5 +++-- inst/tests/nafill.Rraw | 6 ++++-- inst/tests/other.Rraw | 4 +++- inst/tests/tests.Rraw | 6 ++++-- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 73e8fd5608..ef19ccd191 100644 --- a/NEWS.md +++ b/NEWS.md @@ -167,6 +167,8 @@ 31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would ignore the integer rownames and write the row numbers instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when `quote='auto'` (default) and the rownames are integers (either default or specific), they are no longer quoted. +32. `test.data.table()` would fail on test 1894 if the variable `z` was defined by the user, [#3705](https://github.com/Rdatatable/data.table/issues/3705). The test suite already ran in its own separate environment. That environment's parent is no longer `.GlobalEnv` to isolate it further. Thanks to Michael Chirico for reporting, and Matt Dowle for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. 
So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/test.data.table.R b/R/test.data.table.R index a8a19522f4..0c7fbeb23a 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,16 +1,18 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) - if (exists("test.data.table", .GlobalEnv,inherits=FALSE)) { + if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # package developer # nocov start if ("package:data.table" %chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") subdir = file.path("inst","tests") + env = new.env(parent=.GlobalEnv) # in dev cc() sources all functions in .GlobalEnv # nocov end } else { # i) R CMD check and ii) user running test.data.table() rootdir = getNamespaceInfo("data.table","path") subdir = "tests" + env = new.env(parent=parent.env(.GlobalEnv)) # when user runs test.data.table() we don't want their variables in .GlobalEnv affecting tests, #3705 } fulldir = file.path(rootdir, subdir) @@ -93,7 +95,6 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F cat("getDTthreads(verbose=TRUE):\n") # for tracing on CRAN; output to log before anything is attempted getDTthreads(verbose=TRUE) # includes the returned value in the verbose output (rather than dangling '[1] 4'); e.g. "data.table is using 4 threads" catf("test.data.table() running: %s\n", fn) # print fn to log before attempting anything on it (in case it is missing); on same line for slightly easier grep - env = new.env(parent=.GlobalEnv) assign("testDir", function(x) file.path(fulldir, x), envir=env) # are R's messages being translated to a foreign language? #3039, #630 diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index dcaa0f40d4..1e5107fb71 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -15,8 +15,10 @@ sugg = c( "nanotime" ) for (s in sugg) { - assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages(require(s, character.only=TRUE)))) - if (!loaded) cat("\n**** Suggested package",s,"is not installed. Tests using it will be skipped.\n\n") + assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( + library(s, character.only=TRUE, logical.return=TRUE, quietly=TRUE, warn.conflicts=FALSE, pos="package:base") # attach at the end for #5101 + ))) + if (!loaded) cat("\n**** Suggested package",s,"is not installed or has dependencies missing. 
Tests using it will be skipped.\n\n") } x = 1:10 diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 89ce49b00f..a346a9e56b 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -15,7 +15,9 @@ INT = data.table:::INT if (any(duplicated(pkgs))) stop("Packages defined to be loaded for integration tests in 'inst/tests/other.Rraw' contains duplicates.") -f = function(pkg) suppressMessages(isTRUE(require(pkg, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE))) +f = function(pkg) suppressWarnings(suppressMessages(isTRUE( + library(pkg, character.only=TRUE, logical.return=TRUE, quietly=TRUE, warn.conflicts=FALSE, pos="package:base") # attach at the end for #5101 +))) loaded = sapply(pkgs, f) if (any(!loaded)) { stop("test.data.table('other.Rraw') is missing required package(s): ", paste(names(loaded)[!loaded], collapse=", "), ". If you can't install them and this is R CMD check, please set environment variable TEST_DATA_TABLE_WITH_OTHER_PACKAGES back to the default, false.") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fb3cb363da..0578b5b30f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -96,8 +96,10 @@ sugg = c( # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though ) for (s in sugg) { - assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages(require(s, character.only=TRUE)))) - if (!loaded) cat("\n**** Suggested package",s,"is not installed. Tests using it will be skipped.\n\n") + assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( + library(s, character.only=TRUE, logical.return=TRUE, quietly=TRUE, warn.conflicts=FALSE, pos="package:base") # attach at the end for #5101 + ))) + if (!loaded) cat("\n**** Suggested package",s,"is not installed or has dependencies missing. Tests using it will be skipped.\n\n") } test_longdouble = isTRUE(capabilities()["long.double"]) && identical(as.integer(.Machine$longdouble.digits), 64L) From 82b5e1569d319f3ebe28dcab46cebc58af82ba03 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 17 Aug 2021 22:02:05 -0600 Subject: [PATCH 348/588] follow-up #5101: other.Rraw zoo before xts to be attached last --- inst/tests/other.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index a346a9e56b..bd9374db25 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,8 +1,9 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "xts", "gdata", "zoo", "nlme", "bit64", "knitr", "parallel") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. # So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by # users running test.data.table("other.Rraw"). 
+# zoo needs to be before xts for #5101 otherwise xts's dependency zoo gets attached at position 2 if xts is loaded first if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || !"package:data.table" %in% search()) { From f122f1dc362136268d46271aeffaca8e4823112f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 18 Aug 2021 00:24:38 -0600 Subject: [PATCH 349/588] Travis to GH-actions (#5102) --- .Rbuildignore | 1 - .github/.gitignore | 1 + .github/workflows/test-coverage.yaml | 48 ++++++++++++++++++++++++++++ .travis.yml | 48 ---------------------------- 4 files changed, 49 insertions(+), 49 deletions(-) create mode 100644 .github/.gitignore create mode 100644 .github/workflows/test-coverage.yaml delete mode 100644 .travis.yml diff --git a/.Rbuildignore b/.Rbuildignore index 9a939aae81..1e99a9004b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -15,7 +15,6 @@ ^\.graphics$ ^\.github$ -^\.travis\.yml$ ^\.appveyor\.yml$ ^\.gitlab-ci\.yml$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000000..2d19fc766d --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000000..ba1f94fded --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,48 @@ +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +name: test-coverage + +jobs: + test-coverage: + runs-on: macOS-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install dependencies + run: | + install.packages(c("remotes")) + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 8455e3dc88..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,48 +0,0 @@ -language: r -dist: bionic -cache: packages # to rebuild cache see tweet thread ending here https://twitter.com/jimhester_/status/1115718589804421121 -warnings_are_errors: true - -r: - - release - -os: - - linux - # - osx # Takes 13m (+9m linux = 22m total); #3357; #3326; #3331. When off it's to speed up dev cycle; CRAN_Release.cmd has a reminder to turn back on. 
- -brew_packages: - - llvm - -r_packages: - - drat # used in .ci/deploy.sh to publish tar.gz to github.io/Rdatatable/data.table - - covr - -before_install: - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then rm "/usr/local/bin/gfortran"; fi - -before_script: - - echo "Revision:" $TRAVIS_COMMIT >> ./DESCRIPTION - -after_success: - - test $TRAVIS_OS_NAME == "linux" && - travis_wait Rscript -e 'library(covr); codecov()' - - test $TRAVIS_OS_NAME == "linux" && - test $TRAVIS_REPO_SLUG == "Rdatatable/data.table" && - test $TRAVIS_PULL_REQUEST == "false" && - test $TRAVIS_BRANCH == "master" && - bash .ci/deploy.sh - -notifications: - email: - on_success: change - on_failure: change - -env: - global: - - PKG_CFLAGS="-O3 -Wall -pedantic" - - _R_CHECK_NO_STOP_ON_TEST_ERROR_=true - - _R_CHECK_CRAN_INCOMING_REMOTE_=false - # Block truncation of any error messages in R CMD check - - _R_CHECK_TESTS_NLINES_=0 - # drat using @jangorecki token - - secure: "CxDW++rsQApQWos+h1z/F76odysyD6AtXJrDwlCHlgqXeKJNRATR4wZDDR18SK+85jUqjoqOvpyrq+5kKuyg6AnA/zduaX2uYE5mcntEUiyzlG/jJUKbcJqt22nyAvFXP3VS60T2u4H6IIhVmr7dArdxLkv8W+pJvf2Tg6kx8Ws=" From 3f16fa60e40b11d7f4346e92d28ea015a3a1612d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 18 Aug 2021 01:21:00 -0600 Subject: [PATCH 350/588] follow up #5102: update badge as excuse to see if it triggers now that Actions tab is turned on --- .github/workflows/R-CMD-check.yaml | 85 ++++++++++++++++++++++++++++++ README.md | 2 +- 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/R-CMD-check.yaml diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000000..cea7ca2d44 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,85 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 
+# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.config.r }}-results + path: check diff --git a/README.md b/README.md index fcaa408b80..3764230531 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![CRAN status](https://cranchecks.info/badges/flavor/release/data.table)](https://cran.r-project.org/web/checks/check_results_data.table.html) -[![Travis build status](https://travis-ci.org/Rdatatable/data.table.svg?branch=master)](https://travis-ci.org/Rdatatable/data.table) +[![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/kayjdh5qtgymhoxr/branch/master?svg=true)](https://ci.appveyor.com/project/Rdatatable/data-table) [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/pipelines) From ebf0bd0e31b78af9e62cf66aa85fe17675b58c9d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 18 Aug 2021 01:51:36 -0600 Subject: [PATCH 351/588] follow 
up #5102: remove project .Rprofile; we have .dev/.Rprofile anyway --- .Rprofile | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .Rprofile diff --git a/.Rprofile b/.Rprofile deleted file mode 100644 index aea0049339..0000000000 --- a/.Rprofile +++ /dev/null @@ -1,2 +0,0 @@ -if (!nzchar(Sys.getenv("PROJ_PATH"))) Sys.setenv(PROJ_PATH=getwd()) -source(file.path(Sys.getenv("PROJ_PATH"), ".dev", "cc.R")) From 5f0af679fd4cb219ab389c8e22344eeb0f67aeb3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 18 Aug 2021 02:42:09 -0700 Subject: [PATCH 352/588] keep.rownames=key="str" works for as.data.table.data.frame (#4469) --- NEWS.md | 5 ++++- R/as.data.table.R | 6 ++++-- inst/tests/tests.Rraw | 4 ++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index ef19ccd191..e7f3acd890 100644 --- a/NEWS.md +++ b/NEWS.md @@ -103,6 +103,9 @@ 18. `tstrsplit(,type.convert=)` now accepts a named list of functions to apply to each part, [#5094](https://github.com/Rdatatable/data.table/issues/5094). Thanks to @Kamgang-B for the request and implementing. +19. `as.data.table(DF, keep.rownames=key='keyCol')` now works, [#4468](https://github.com/Rdatatable/data.table/issues/4468). Thanks to Michael Chirico for the idea and the PR. + + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. @@ -427,7 +430,7 @@ has a better chance of working on Mac. 19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. -20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8).r +20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). ## NOTES diff --git a/R/as.data.table.R b/R/as.data.table.R index 5a547149ea..feae0f5ace 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -214,8 +214,10 @@ as.data.table.list = function(x, } as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) 
{ - if (!identical(keep.rownames, FALSE)) { - # can specify col name to keep.rownames, #575 + if (!isFALSE(keep.rownames)) { + # can specify col name to keep.rownames, #575; if it's the same as key, + # kludge it to 'rn' since we only apply the new name afterwards, #4468 + if (is.character(keep.rownames) && identical(keep.rownames, key)) key='rn' ans = data.table(rn=rownames(x), x, keep.rownames=FALSE, key=key) if (is.character(keep.rownames)) setnames(ans, 'rn', keep.rownames[1L]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0578b5b30f..56ee3613a7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17905,3 +17905,7 @@ test(2203.17, tstrsplit(w, "/", type.convert="4"), error="TRUE/FALSE.+function.+ test(2203.18, tstrsplit(w, "/", type.convert=c(TRUE, FALSE)), error="TRUE/FALSE.+function.+named list") test(2203.19, tstrsplit(w, "/", keep=integer()), error="keep.+contain integer.+between", ignore.warning="no non-missing") test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty list") + +# set rownames as key directly in as.data.table, #4468 +test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), + setnames(setkey(as.data.table(mtcars, keep.rownames = TRUE), rn), 'rn', 'model')) From 1b878498474b7df3112f80c9f838480ec2d1fb4a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 18 Aug 2021 04:08:47 -0600 Subject: [PATCH 353/588] follow up #5102: reduce GHA jobs for faster PR dev cycle --- .github/workflows/R-CMD-check.yaml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index cea7ca2d44..b1e43e4cb0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -19,13 +19,16 @@ jobs: name: ${{ matrix.config.os }} (${{ matrix.config.r }}) strategy: - fail-fast: false + fail-fast: true matrix: config: - - {os: windows-latest, r: 'release'} - - {os: macOS-latest, r: 'release'} + # Rdatatable has full-strength GLCI which runs after merge. So we just need + # a few jobs (mainly test-coverage) to run in PRs so as to not slow down dev. 
+ - {os: windows-latest, r: 'release'} # don't need both AppVeyor and this but leave in place for now + # - {os: macOS-latest, r: 'release'} # test-coverage.yaml uses macOS - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } + # - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } + # GLCI covers R-devel; no need to delay PR dev in case of changes in R-devel in recent days env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From b2299a61b155425182f0307ddbfa953ae84c6702 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 18 Aug 2021 15:30:28 -0600 Subject: [PATCH 354/588] follow up #5102: use AppVeyor for Windows for now for concurrency --- .github/workflows/R-CMD-check.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index b1e43e4cb0..77662bdfa1 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -24,11 +24,11 @@ jobs: config: # Rdatatable has full-strength GLCI which runs after merge. So we just need # a few jobs (mainly test-coverage) to run in PRs so as to not slow down dev. - - {os: windows-latest, r: 'release'} # don't need both AppVeyor and this but leave in place for now - # - {os: macOS-latest, r: 'release'} # test-coverage.yaml uses macOS + # - {os: windows-latest, r: 'release'} # use AppVeyor in PRs since it runs concurrently to save elapsed time on each commit + # - {os: macOS-latest, r: 'release'} # test-coverage.yaml uses macOS which covers macOS and runs concurrently - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} # - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } - # GLCI covers R-devel; no need to delay PR dev in case of changes in R-devel in recent days + # GLCI covers R-devel; no need to delay contributors in dev due to changes in R-devel in recent days env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From cf51cae192334c764db209e70ad0a0021dfd3606 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 18 Aug 2021 23:54:53 +0200 Subject: [PATCH 355/588] add fread support for text= single line without \r or \n (#4703) --- NEWS.md | 2 ++ R/fread.R | 2 +- inst/tests/tests.Rraw | 2 ++ src/freadR.c | 25 ++++++++++++------------- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index e7f3acd890..6fa4e801fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -172,6 +172,8 @@ 32. `test.data.table()` would fail on test 1894 if the variable `z` was defined by the user, [#3705](https://github.com/Rdatatable/data.table/issues/3705). The test suite already ran in its own separate environment. That environment's parent is no longer `.GlobalEnv` to isolate it further. Thanks to Michael Chirico for reporting, and Matt Dowle for the PR. +33. 
`fread(text="a,b,c")` (where input data contains no `\n` but `text=` has been used) now works instead of error `file not found: a,b,c`, [#4689](https://github.com/Rdatatable/data.table/issues/4689). Thanks to @trainormg for reporting, and @ben-schwen for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/fread.R b/R/fread.R index 1bc0267b34..12f46b57ea 100644 --- a/R/fread.R +++ b/R/fread.R @@ -262,7 +262,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local tz="UTC" } - ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, + ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 56ee3613a7..f5dec62a41 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10417,6 +10417,7 @@ dir.create(d) test(1703.17, fread(text=c('a,b','1,2'), tmpdir=d), data.table(a=1L,b=2L)) test(1703.18, fread(text=c('a,b','1,2')), data.table(a=1L, b=2L)) unlink(d) +test(1703.19, fread(text="a b c"), data.table(a=logical(), b=logical(), c=logical())) # text= with no \n, #4689 # Ensure all.equal respects 'check.attributes' w.r.t. column names. 
As testthat::check_equivalent relies on this # as used by package popEpi in its tests @@ -17909,3 +17910,4 @@ test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty l # set rownames as key directly in as.data.table, #4468 test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), setnames(setkey(as.data.table(mtcars, keep.rownames = TRUE), rn), 'rn', 'model')) + diff --git a/src/freadR.c b/src/freadR.c index 842baf00a3..97fe691aa1 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -50,6 +50,7 @@ static bool oldNoDateTime = false; SEXP freadR( // params passed to freadMain SEXP inputArg, + SEXP isFileNameArg, SEXP sepArg, SEXP decArg, SEXP quoteArg, @@ -81,22 +82,20 @@ SEXP freadR( freadMainArgs args; ncol = 0; dtnrows = 0; - const char *ch, *ch2; + if (!isString(inputArg) || LENGTH(inputArg)!=1) error(_("Internal error: freadR input not a single character string: a filename or the data itself. Should have been caught at R level.")); // # nocov - ch = ch2 = (const char *)CHAR(STRING_ELT(inputArg,0)); - while (*ch2!='\n' && *ch2!='\r' && *ch2!='\0') ch2++; - args.input = (*ch2=='\0') ? R_ExpandFileName(ch) : ch; // for convenience so user doesn't have to call path.expand() - - ch = args.input; - while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++; - if (*ch!='\0' || args.input[0]=='\0') { - if (verbose) DTPRINT(_("Input contains a \\n or is \")\". Taking this to be text input (not a filename)\n")); - args.filename = NULL; - } else { - if (verbose) DTPRINT(_("Input contains no \\n. Taking this to be a filename to open\n")); - args.filename = args.input; + const char *ch = (const char *)CHAR(STRING_ELT(inputArg,0)); + if (!isLogical(isFileNameArg) || LENGTH(isFileNameArg)!=1 || LOGICAL(isFileNameArg)[0]==NA_LOGICAL) + error(_("Internal error: freadR isFileNameArg not TRUE or FALSE")); // # nocov + if (LOGICAL(isFileNameArg)[0]) { + if (verbose) DTPRINT(_("freadR.c has been passed a filename: %s\n"), ch); + args.filename = R_ExpandFileName(ch); // for convenience so user doesn't have to call path.expand() args.input = NULL; + } else { + if (verbose) DTPRINT(_("freadR.c has been passed the data as text input (not a filename)\n")); + args.filename = NULL; + args.input = ch; } if (!isString(sepArg) || LENGTH(sepArg)!=1 || strlen(CHAR(STRING_ELT(sepArg,0)))>1) From aa47b7c608302d3ca34ed19f122a937c87c943b8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 18 Aug 2021 16:46:23 -0600 Subject: [PATCH 356/588] follow up #5102: just 32bit AppVeyor to save dev cycle time in PRs before merge --- .appveyor.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 7cabbb9062..a283cd2a34 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -16,11 +16,12 @@ environment: global: CRAN: http://cloud.r-project.org WARNINGS_ARE_ERRORS: 1 - R_CHECK_ARGS: --no-manual + R_CHECK_ARGS: --no-manual --no-multiarch + R_ARCH: i386 # R_CHECK_ARGS specified in order to turn off --as-cran (on by default) as that can be slow - R_ARCH: x64 -# multiarch is on by default which runs tests on both 32bit R and 64bit R in one x64 job; i.e. very nice and convenient for all. 
-# The default for R_ARCH is i386, though, for which multiarch would just compile and test 32bit, hence setting R_ARCH to x64 +# multiarch is on by default which (when R_ARCH: x64) compiles and tests both 32bit and 64bit in one x64 job +# --no-multiarch so as to not run both 32bit and 64bit on every commit in PRs to save dev cycle time; GLCI after merge is full-strength +# GHA has MacOS 64bit (test-coverage) and Ubuntu 64bit, therefore picked 32bit for Windows GCC_PATH: mingw_64 # Default GCC_PATH appears to be gcc-4.6.3 which is now unsupported as from Rtools.exe v3.4. _R_CHECK_NO_STOP_ON_TEST_ERROR_: true From 51144d241d75273764f1b1e9e59717b247ee8d9c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 18 Aug 2021 15:58:48 -0700 Subject: [PATCH 357/588] Use metaprogramming instead of parse to build bysub (#4713) --- R/data.table.R | 18 +++++++++++------- inst/tests/tests.Rraw | 3 ++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index efcc2104e5..98c80c11f9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -787,15 +787,19 @@ replace_dot_alias = function(e) { } else if (bysub %iscall% ".") bysub[[1L]] = quote(list) if (mode(bysub) == "character") { - if (length(grep(",", bysub, fixed = TRUE))) { + if (any(grepl(",", bysub, fixed = TRUE))) { if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.", length(bysub)) - bysub = strsplit(bysub,split=",")[[1L]] + bysub = strsplit(bysub, split=",", fixed=TRUE)[[1L]] + } + bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138 + nzidx = nzchar(bysub) + # by='' means by=NULL, tests 592&596 + if (!all(nzidx)) { + if (length(bysub) > 1L) stop("At least one entry of by is empty") + bysub = NULL + } else { + bysub = as.call(c(list(quote(list)), lapply(bysub, as.name))) } - backtick_idx = grep("^[^`]+$",bysub) - if (length(backtick_idx)) bysub[backtick_idx] = paste0("`",bysub[backtick_idx],"`") - backslash_idx = grep("\\", bysub, fixed = TRUE) - if (length(backslash_idx)) bysub[backslash_idx] = gsub('\\', '\\\\', bysub[backslash_idx], fixed = TRUE) - bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]] bysubl = as.list.default(bysub) } if (any(c("eval","evalq","eval.parent","local","get","mget","dynGet") %chin% all.names(bysub))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f5dec62a41..ec9cc1e232 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1759,7 +1759,8 @@ DT = data.table(x=rep(1:3,each=3), y=c(1,3,6), v=1:9) test(589, DT[,sapply(.SD,sum)*.N], c(x=162, y=270, v=405)) test(590, DT[,sapply(.SD,sum)*.N,by=NULL], data.table(V1=c(162,270,405))) test(591, DT[,sapply(.SD,sum)*.N,by=character()], data.table(V1=c(162,270,405))) -test(592, DT[,sapply(.SD,sum)*.N,by=""], data.table(V1=c(162,270,405))) +test(592.1, DT[,sapply(.SD,sum)*.N,by=""], data.table(V1=c(162,270,405))) +test(592.2, DT[ , sapply(.SD, sum)*.N, by=c('x', '')], error='At least one entry of by is empty') test(593, DT[,lapply(.SD,sum)], data.table(x=18L, y=30, v=45L)) # bug fix #2263 in v1.8.3: now data.table result for consistency test(594, DT[,lapply(.SD,sum),by=NULL], data.table(x=18L, y=30, v=45L)) test(595, DT[,lapply(.SD,sum),by=character()], data.table(x=18L, y=30, v=45L)) From c7b255eaa342ff93b1bb8ffe4124533824d302d0 Mon Sep 17 00:00:00 2001 
From: Michael Chirico Date: Wed, 18 Aug 2021 17:59:24 -0700 Subject: [PATCH 358/588] na.omit respects missing nanotime values (#4752) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 5 +++++ src/assign.c | 6 +++--- src/between.c | 6 +++--- src/coalesce.c | 2 +- src/data.table.h | 1 - src/fifelse.c | 4 ++-- src/nafill.c | 2 +- src/shift.c | 2 +- src/utils.c | 40 +++++++++++++++++++--------------------- 10 files changed, 37 insertions(+), 33 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6fa4e801fb..38002f3dd8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -174,6 +174,8 @@ 33. `fread(text="a,b,c")` (where input data contains no `\n` but `text=` has been used) now works instead of error `file not found: a,b,c`, [#4689](https://github.com/Rdatatable/data.table/issues/4689). Thanks to @trainormg for reporting, and @ben-schwen for the PR. +34. `na.omit(DT)` did not remove `NA` in `nanotime` columns, [#4744](https://github.com/Rdatatable/data.table/issues/4744). Thanks Jean-Mathieu Vermosen for reporting, and Michael Chirico for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ec9cc1e232..b362096ff9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17912,3 +17912,8 @@ test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty l test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), setnames(setkey(as.data.table(mtcars, keep.rownames = TRUE), rn), 'rn', 'model')) +# na.omit works for nanotime, #4744 +if (test_nanotime) { + DT = data.table(time=nanotime(c(1,NA,3))) + test(2205, na.omit(DT), DT[c(1,3)]) +} diff --git a/src/assign.c b/src/assign.c index 0dc38c9b0a..c84ca276a5 100644 --- a/src/assign.c +++ b/src/assign.c @@ -710,8 +710,8 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con snprintf(targetDesc, 500, colnum==0 ? 
_("target vector") : _("column %d named '%s'"), colnum, colname); int protecti=0; const bool sourceIsFactor=isFactor(source), targetIsFactor=isFactor(target); - const bool sourceIsI64=isReal(source) && Rinherits(source, char_integer64); - const bool targetIsI64=isReal(target) && Rinherits(target, char_integer64); + const bool sourceIsI64=isReal(source) && INHERITS(source, char_integer64); + const bool targetIsI64=isReal(target) && INHERITS(target, char_integer64); if (sourceIsFactor || targetIsFactor) { if (!targetIsFactor) { if (!isString(target) && !isNewList(target)) @@ -1116,7 +1116,7 @@ void writeNA(SEXP v, const int from, const int n, const bool listNA) for (int i=from; i<=to; ++i) vd[i] = NA_INTEGER; } break; case REALSXP: { - if (Rinherits(v, char_integer64)) { // Rinherits covers nanotime too which inherits from integer64 via S4 extends + if (INHERITS(v, char_integer64)) { int64_t *vd = (int64_t *)REAL(v); for (int i=from; i<=to; ++i) vd[i] = NA_INTEGER64; } else { diff --git a/src/between.c b/src/between.c index c5d91b30c0..899ea1d94e 100644 --- a/src/between.c +++ b/src/between.c @@ -83,8 +83,8 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S } break; case REALSXP: - if (Rinherits(x, char_integer64)) { - if (!Rinherits(lower, char_integer64) || !Rinherits(upper, char_integer64)) + if (INHERITS(x, char_integer64)) { + if (!INHERITS(lower, char_integer64) || !INHERITS(upper, char_integer64)) error(_("x is integer64 but lower and/or upper are not.")); // e.g. between(int64, character, character) const int64_t *lp = (int64_t *)REAL(lower); const int64_t *up = (int64_t *)REAL(upper); @@ -111,7 +111,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S } if (verbose) Rprintf(_("between parallel processing of integer64 took %8.3fs\n"), omp_get_wtime()-tic); } else { - if (Rinherits(lower, char_integer64) || Rinherits(upper, char_integer64)) + if (INHERITS(lower, char_integer64) || INHERITS(upper, char_integer64)) error(_("x is not integer64 but lower and/or upper is integer64. Please align classes.")); const double *lp = REAL(lower); const double *up = REAL(upper); diff --git a/src/coalesce.c b/src/coalesce.c index 558d2d4da5..75ce2e2e00 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -74,7 +74,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) { } } break; case REALSXP: { - if (Rinherits(first, char_integer64)) { // Rinherits() is true for nanotime + if (INHERITS(first, char_integer64)) { int64_t *xP=(int64_t *)REAL(first), finalVal=NA_INTEGER64; int k=0; for (int j=0; j Date: Wed, 18 Aug 2021 19:14:54 -0600 Subject: [PATCH 359/588] follow up #5102: clarify comments that GHA does run jobs concurrently --- .github/workflows/R-CMD-check.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 77662bdfa1..a2e6d1b35a 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,13 +22,14 @@ jobs: fail-fast: true matrix: config: - # Rdatatable has full-strength GLCI which runs after merge. So we just need - # a few jobs (mainly test-coverage) to run in PRs so as to not slow down dev. - # - {os: windows-latest, r: 'release'} # use AppVeyor in PRs since it runs concurrently to save elapsed time on each commit - # - {os: macOS-latest, r: 'release'} # test-coverage.yaml uses macOS which covers macOS and runs concurrently + # Rdatatable has full-strength GLCI which runs after merge. 
So we just need a few + # jobs (mainly test-coverage) to run on every commit in PRs so as to not slow down dev. + # GHA does run these jobs concurrently but even so reducing the load seems like a good idea. + # - {os: windows-latest, r: 'release'} # currently using AppVeyor which runs 32bit in 5 min and works + # - {os: macOS-latest, r: 'release'} # test-coverage.yaml uses macOS - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} # - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } - # GLCI covers R-devel; no need to delay contributors in dev due to changes in R-devel in recent days + # GLCI covers R-devel; no need to delay contributors in dev due to changes in R-devel in recent days env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 3b82d51db4a0e8279669079b6cd1eac60fe48a48 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 19 Aug 2021 08:19:01 +0200 Subject: [PATCH 360/588] internal isRealReallyInt, closes #3966 (#3967) --- R/data.table.R | 4 ---- R/wrappers.R | 3 +++ inst/tests/tests.Rraw | 16 ++++++++++++++++ src/data.table.h | 1 + src/init.c | 1 + src/utils.c | 38 ++++++++++++++++---------------------- 6 files changed, 37 insertions(+), 26 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 98c80c11f9..c06c60e817 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2934,10 +2934,6 @@ gvar = function(x, na.rm=FALSE) .Call(Cgvar, x, na.rm) gsd = function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm) gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows) -isReallyReal = function(x) { - .Call(CisReallyReal, x) -} - .prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){ ## helper that decides, whether a fast binary search can be performed, if i is a call ## For details on the supported queries, see \code{\link{datatable-optimize}} diff --git a/R/wrappers.R b/R/wrappers.R index 0c226b9f30..dcf8ba08e5 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -12,4 +12,7 @@ colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, c testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) +isRealReallyInt = function(x) .Call(CisRealReallyIntR, x) +isReallyReal = function(x) .Call(CisReallyReal, x) + coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b362096ff9..2218ad2f16 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -44,6 +44,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { is_na = data.table:::is_na is.sorted = data.table:::is.sorted isReallyReal = data.table:::isReallyReal + isRealReallyInt = data.table:::isRealReallyInt is_utc = data.table:::is_utc melt.data.table = data.table:::melt.data.table # for test 1953.4 null.data.table = data.table:::null.data.table @@ -17917,3 +17918,18 @@ if (test_nanotime) { DT = data.table(time=nanotime(c(1,NA,3))) test(2205, na.omit(DT), DT[c(1,3)]) } + +# isRealReallyInt, #3966 +test(2206.01, isRealReallyInt(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE) +test(2206.02, isRealReallyInt(2147483648.0), FALSE) # >INT_MAX +test(2206.03, isRealReallyInt(-2147483648.0), FALSE) # <=INT_MIN since INT_MIN==NA_integer_ +test(2206.04, isRealReallyInt(c(5,-5,2147483648)), FALSE) # test real last position 
+test(2206.05, isRealReallyInt(NaN), FALSE) +test(2206.06, isRealReallyInt(+Inf), FALSE) +test(2206.07, isRealReallyInt(-Inf), FALSE) +test(2206.08, isRealReallyInt(0.1), FALSE) +test(2206.09, isRealReallyInt(numeric()), TRUE) +test(2206.10, isRealReallyInt(9L), FALSE) # must be type double +test(2206.11, isRealReallyInt(integer()), FALSE) + + diff --git a/src/data.table.h b/src/data.table.h index c7f3ca889e..2ba639a64a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -229,6 +229,7 @@ SEXP coalesce(SEXP x, SEXP inplace); // utils.c bool isRealReallyInt(SEXP x); +SEXP isRealReallyIntR(SEXP x); SEXP isReallyReal(SEXP x); bool allNA(SEXP x, bool errorForBadType); SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups); diff --git a/src/init.c b/src/init.c index 910d675194..eda1c607c4 100644 --- a/src/init.c +++ b/src/init.c @@ -183,6 +183,7 @@ R_CallMethodDef callMethods[] = { {"Ctranspose", (DL_FUNC) &transpose, -1}, {"CanyNA", (DL_FUNC) &anyNA, -1}, {"CisReallyReal", (DL_FUNC) &isReallyReal, -1}, +{"CisRealReallyIntR", (DL_FUNC) &isRealReallyIntR, -1}, {"Csetlevels", (DL_FUNC) &setlevels, -1}, {"Crleid", (DL_FUNC) &rleid, -1}, {"Cgmedian", (DL_FUNC) &gmedian, -1}, diff --git a/src/utils.c b/src/utils.c index 358397dc90..312f554f8f 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1,35 +1,29 @@ #include "data.table.h" -bool isRealReallyInt(SEXP x) { - if (!isReal(x)) return(false); +static R_xlen_t firstNonInt(SEXP x) { R_xlen_t n=xlength(x), i=0; - double *dx = REAL(x); + const double *dx = REAL(x); while (i Date: Thu, 19 Aug 2021 02:51:34 -0400 Subject: [PATCH 361/588] add support for complex values in value.var in dcast (#4863) --- NEWS.md | 1 + inst/tests/tests.Rraw | 5 ++++- src/fcast.c | 30 ++++++++++++++++++++++-------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 38002f3dd8..1836a4b3b4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -105,6 +105,7 @@ 19. `as.data.table(DF, keep.rownames=key='keyCol')` now works, [#4468](https://github.com/Rdatatable/data.table/issues/4468). Thanks to Michael Chirico for the idea and the PR. +20. `dcast()` now supports complex values in `value.var`, [#4855](https://github.com/Rdatatable/data.table/issues/4855). This extends earlier support for complex values in `formula`. Thanks Elio Campitelli for the request, and Michael Chirico for the PR. 
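For illustration, a minimal sketch of the new `dcast()` behaviour, mirroring test 2207 added by this same commit:

```R
DT = CJ(x=1:3, y=letters[1:2])
DT[, z := complex(real=1:6, imaginary=6:1)]
dcast(DT, x ~ y, value.var="z")
#    x    a    b
# 1: 1 1+6i 2+5i
# 2: 2 3+4i 4+3i
# 3: 3 5+2i 6+1i
```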
## BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2218ad2f16..2d5df0498d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17932,4 +17932,7 @@ test(2206.09, isRealReallyInt(numeric()), TRUE) test(2206.10, isRealReallyInt(9L), FALSE) # must be type double test(2206.11, isRealReallyInt(integer()), FALSE) - +# dcast supports complex value to cast, #4855 +DT = CJ(x=1:3, y=letters[1:2]) +DT[, z := complex(real=1:6, imaginary=6:1)] +test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i), b=c(2+5i, 4+3i, 6+1i), key='x')) diff --git a/src/fcast.c b/src/fcast.c index 8819f20019..46d7465e9c 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -18,22 +18,23 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil for (int i=0; i Date: Thu, 19 Aug 2021 10:25:42 +0200 Subject: [PATCH 362/588] Search registered S3 method for melt before redirecting to reshape2 (#4864) --- DESCRIPTION | 3 ++- NAMESPACE | 1 + NEWS.md | 2 ++ R/fmelt.R | 22 +++++++++++----------- inst/tests/tests.Rraw | 9 +++++++++ 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b8439044c9..69e1eb9147 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,7 +68,8 @@ Authors@R: c( person("Hadley","Wickham", role="ctb"), person("Bennet","Becker", role="ctb"), person("Kyle","Haynes", role="ctb"), - person("Boniface Christian","Kamgang", role="ctb")) + person("Boniface Christian","Kamgang", role="ctb"), + person("Odel","Marcelle", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NAMESPACE b/NAMESPACE index 999a834304..00cb51a4cb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -121,6 +121,7 @@ S3method(split, data.table) export(dcast, melt) S3method(dcast, data.table) S3method(melt, data.table) +S3method(melt, default) # exported for historical reasons -- if reshape2 is higher on search path, # dcast(DT) will not dispatch since reshape2::dcast is not generic. So users diff --git a/NEWS.md b/NEWS.md index 1836a4b3b4..48192c232b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,6 +107,8 @@ 20. `dcast()` now supports complex values in `value.var`, [#4855](https://github.com/Rdatatable/data.table/issues/4855). This extends earlier support for complex values in `formula`. Thanks Elio Campitelli for the request, and Michael Chirico for the PR. +21. `melt()` was pseudo generic in that `melt(DT)` would dispatch to the `melt.data.table` method but `melt(not-DT)` would explicitly redirect to `reshape2`. Now `melt()` is standard generic so that methods can be developed in other packages, [#4864](https://github.com/Rdatatable/data.table/pull/4864). Thanks to @odelmarcelle for suggesting and implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
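A minimal sketch of why the standard generic matters (the class `foo` and the method `melt.foo` below are hypothetical, invented only for illustration): a method registered for another class is now dispatched to, rather than falling through to the reshape2 redirection in `melt.default`.

```R
# hypothetical method, e.g. registered by a third-party package
melt.foo = function(data, ..., na.rm=FALSE, value.name="value") {
  data.table(variable=names(data$values), value=unlist(data$values))
}
x = structure(list(values=list(a=1, b=2)), class="foo")
melt(x)   # dispatches to melt.foo instead of warning and redirecting to reshape2
```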
diff --git a/R/fmelt.R b/R/fmelt.R index 5038894ba0..243480445b 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -4,18 +4,18 @@ # redirection as well melt = function(data, ..., na.rm = FALSE, value.name = "value") { - if (is.data.table(data)) { - UseMethod("melt", data) - # if data is not data.table and reshape2 is installed, this won't dispatch to reshape2's method; - # CRAN package edarf and others fail without the else branch + UseMethod("melt", data) +} + +melt.default = function(data, ..., na.rm = FALSE, value.name = "value") { + # if no registered method exists for data, attempts to redirect data to reshape2::melt; + # CRAN package edarf and others fail without the redirection # nocov start - } else { - data_name = deparse(substitute(data)) - ns = tryCatch(getNamespace("reshape2"), error=function(e) - stopf("The %1$s generic in data.table has been passed a %2$s, but data.table::%1$s currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(%3$s) or as.data.table(%3$s). If you intend to use a method from reshape2, try installing that package first, but do note that reshape2 is superseded and is no longer actively developed.", "melt", class(data)[1L], data_name)) - warningf("The %1$s generic in data.table has been passed a %2$s and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is superseded and is no longer actively developed, and this redirection is now deprecated. Please do this redirection yourself like reshape2::%1$s(%3$s). In the next version, this warning will become an error.", "melt", class(data)[1L], data_name) - ns$melt(data, ..., na.rm=na.rm, value.name=value.name) - } + data_name = deparse(substitute(data)) + ns = tryCatch(getNamespace("reshape2"), error=function(e) + stopf("The %1$s generic in data.table has been passed a %2$s, but data.table::%1$s currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(%3$s) or as.data.table(%3$s). If you intend to use a method from reshape2, try installing that package first, but do note that reshape2 is superseded and is no longer actively developed.", "melt", class(data)[1L], data_name)) + warningf("The %1$s generic in data.table has been passed a %2$s and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is superseded and is no longer actively developed, and this redirection is now deprecated. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace, i.e. reshape2::%1$s(%3$s). In the next version, this warning will become an error.", "melt", class(data)[1L], data_name) + ns$melt(data, ..., na.rm=na.rm, value.name=value.name) # nocov end } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2d5df0498d..fa2346dd7b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3256,6 +3256,15 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") x[ , r := as.raw(c(0, 1))] test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") + + # test dispatch for non-data.table objects. See #4864. Only possible to test the error message on CI. + test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + error="The melt generic in data.table has been passed a data.frame") + + # uncomment 1038.002 and comment 1308.001 if reshape2 is ever available on CI. 
+ # test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), + # warning="The melt generic in data.table has been passed a data.frame") + } # sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105 From 4cf92893742a0359ce3b3ddfd279a9e69627d52b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 19 Aug 2021 03:52:24 -0600 Subject: [PATCH 363/588] follow-up #4864: pass GLCI rel-cran --- inst/tests/tests.Rraw | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fa2346dd7b..02b9362822 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3257,14 +3257,18 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") - # test dispatch for non-data.table objects. See #4864. Only possible to test the error message on CI. - test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - error="The melt generic in data.table has been passed a data.frame") - - # uncomment 1038.002 and comment 1308.001 if reshape2 is ever available on CI. - # test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), - # warning="The melt generic in data.table has been passed a data.frame") - + # test dispatch for non-data.table objects, #4864. + if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { + test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + error="The melt generic in data.table has been passed a data.frame") + } else { + # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 + # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) + # 3) in dev locally I have reshape2 installed to run caret in other.Rraw + test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), + warning="The melt generic in data.table has been passed a data.frame") + } } # sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105 From be8ba59193c4cf5b3a033227cfd11f3d9e88eb0e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 19 Aug 2021 14:13:58 -0600 Subject: [PATCH 364/588] add DT() functional form data.table query (#5104) --- NAMESPACE | 1 + NEWS.md | 6 ++++++ R/data.table.R | 6 ++++-- man/data.table.Rd | 8 ++++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 00cb51a4cb..81c0fce689 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,6 +57,7 @@ export(setnafill) export(.Last.updated) export(fcoalesce) export(substitute2) +export(DT) # mtcars |> DT(i,j,by) #4872 S3method("[", data.table) S3method("[<-", data.table) diff --git a/NEWS.md b/NEWS.md index 48192c232b..769b98dafa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -109,6 +109,12 @@ 21. `melt()` was pseudo generic in that `melt(DT)` would dispatch to the `melt.data.table` method but `melt(not-DT)` would explicitly redirect to `reshape2`. Now `melt()` is standard generic so that methods can be developed in other packages, [#4864](https://github.com/Rdatatable/data.table/pull/4864). Thanks to @odelmarcelle for suggesting and implementing. +22. `DT(i, j, by, ...)` has been added, i.e. 
functional form of a `data.table` query, [#641](https://github.com/Rdatatable/data.table/issues/641) [#4872](https://github.com/Rdatatable/data.table/issues/4872). Thanks to Yike Lu and Elio Campitelli for filing requests, many others for comments and suggestions, and Matt Dowle for the PR. This enables the `data.table` general form query to be invoked on a `data.frame` without converting it to a `data.table` first. The class of the input object is retained. + + ```R + mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index c06c60e817..504eb49cc7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -846,10 +846,10 @@ replace_dot_alias = function(e) { if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars)) - xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] + xss = `[.data.table`(x,irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends) } else { if (verbose) catf("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '%s'\n", deparse(by)) - xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] + xss = `[.data.table`(x,irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends) } if (bysub %iscall% ':' && length(bysub)==3L) { byval = eval(bysub, setattr(as.list(seq_along(xss)), 'names', names(xss)), parent.frame()) @@ -1910,6 +1910,8 @@ replace_dot_alias = function(e) { setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } +DT = `[.data.table` #4872 + .optmean = function(expr) { # called by optimization of j inside [.data.table only. Outside for a small speed advantage. if (length(expr)==2L) # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE return(call(".External",quote(Cfastmean),expr[[2L]], FALSE)) diff --git a/man/data.table.Rd b/man/data.table.Rd index 7418d12118..d1ab11a924 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -5,6 +5,7 @@ \alias{Ops.data.table} \alias{is.na.data.table} \alias{[.data.table} +\alias{DT} \alias{.} \alias{.(} \alias{.()} @@ -217,6 +218,8 @@ The way to read this out loud is: "Take \code{DT}, subset rows by \code{i}, \emp # see ?assign to add/update/delete columns by reference using the same consistent interface } +A \code{data.table} query may be invoked on a \code{data.frame} using functional form \code{DT(...)}, see examples. The class of the input is retained. + A \code{data.table} is a \code{list} of vectors, just like a \code{data.frame}. However : \enumerate{ \item it never has or uses rownames. Rownames based indexing can be done by setting a \emph{key} of one or more columns or done \emph{ad-hoc} using the \code{on} argument (now preferred). 
@@ -431,6 +434,11 @@ dev.off() # using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v' DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b] +# functional query DT(...) +if (getRversion() >= "4.1.0") { # native pipe |> new in R 4.1.0 + mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) +} + # Support guide and links: # https://github.com/Rdatatable/data.table/wiki/Support From 53278d37b2119a4cb44e1265b4839e9542a9ca6e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 19 Aug 2021 15:27:16 -0600 Subject: [PATCH 365/588] #5104: dontrun in .Rd to pass R<4.1.0 --- man/data.table.Rd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index d1ab11a924..fd2bbd4508 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -435,8 +435,10 @@ dev.off() DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b] # functional query DT(...) -if (getRversion() >= "4.1.0") { # native pipe |> new in R 4.1.0 - mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) +\dontshow{ #dontrun to pass R CMD check prior to R 4.1.0 when |> was added + # an if getRVersion()>"4.1.0" still has its code parsed } +\dontrun{ +mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) } # Support guide and links: From 4d689015b71cc6a9a4d4060d552bfb271c3efe19 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 19 Aug 2021 23:38:26 -0600 Subject: [PATCH 366/588] rework gminmax (#5105) --- NEWS.md | 44 +++++++ inst/tests/tests.Rraw | 49 ++++---- src/gsumm.c | 275 +++++++++--------------------------------- 3 files changed, 131 insertions(+), 237 deletions(-) diff --git a/NEWS.md b/NEWS.md index 769b98dafa..5e666241af 100644 --- a/NEWS.md +++ b/NEWS.md @@ -185,6 +185,50 @@ 34. `na.omit(DT)` did not remove `NA` in `nanotime` columns, [#4744](https://github.com/Rdatatable/data.table/issues/4744). Thanks Jean-Mathieu Vermosen for reporting, and Michael Chirico for the PR. +35. `DT[, min(intCol, na.rm=TRUE), by=grp]` would return `Inf` for any groups containing all NAs, with a type change from `integer` to `numeric` to hold the `Inf`, and with warning. Now `NA` is returned for such all-NA groups, without warning or type change. This is almost-surely less surprising, more convenient, consistent, and efficient. There was no user request for this, likely because our desire to be consistent with base R in this regard was known (base R's `min(x, na.rm=TRUE)` returns `Inf` with warning for all-NA input). Matt Dowle made this change when reworking internals, [#5105](https://github.com/Rdatatable/data.table/pull/5105). The old behavior seemed so bad, and since there was a warning too, it seemed more appropriate to treat it as a bug. + + ```R + DT + # A B + # + # 1: a 1 + # 2: a NA + # 3: b 2 + # 4: b NA + + DT[, min(B,na.rm=TRUE), by=A] # no change in behavior (no all-NA groups yet) + # A V1 + # + # 1: a 1 + # 2: b 2 + + DT[3, B:=NA] # make an all-NA group + DT + # A B + # + # 1: a 1 + # 2: a NA + # 3: b NA + # 4: b NA + + DT[, min(B,na.rm=TRUE), by=A] # old result + # A V1 + # # V1's type changed to numeric (inconsistent) + # 1: a 1 + # 2: b Inf # Inf surprising + # Warning message: # warning inconvenient + # In gmin(B, na.rm = TRUE) : + # No non-missing values found in at least one group. 
Coercing to numeric + # type and returning 'Inf' for such groups to be consistent with base + + DT[, min(B,na.rm=TRUE), by=A] # new result + # A V1 + # # V1's type remains integer (consistent) + # 1: a 1 + # 2: b NA # NA because there are no non-NA, naturally + # no inconvenient warning + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 02b9362822..4f73a19085 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5178,8 +5178,8 @@ test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := INT(NA)] test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing") -test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing") +test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) @@ -5191,8 +5191,8 @@ test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_real_] test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing") -test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing") +test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) # for date (attribute check.. especially after issues/689 !!!) 
DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) @@ -5215,8 +5215,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_character_] test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing") -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing") +test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = @@ -8220,18 +8220,23 @@ test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) -# bug fix #1461 -dt = data.table(x=c(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -# make sure gforce is on -options(datatable.optimize=Inf) -ans1 = suppressWarnings(dt[, base::min(y, na.rm=TRUE), by=x]) -ans2 = suppressWarnings(dt[, base::max(y, na.rm=TRUE), by=x]) -test(1583.1, dt[, min(y, na.rm=TRUE), by=x], ans1, warning="No non-missing values found") -test(1583.2, dt[, max(y, na.rm=TRUE), by=x], ans2, warning="No non-missing values found") -ans3 = suppressWarnings(dt[, base::min(y), by=x]) -ans4 = suppressWarnings(dt[, base::max(y), by=x]) -test(1583.3, dt[, min(y), by=x], ans3) -test(1583.4, dt[, max(y), by=x], ans4) +# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level +# verbatim test from the original report: +options(datatable.optimize=Inf) # ensure gforce is on +DT = data.table( + C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), + C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), + Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) +test(1583.1, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], + data.table(C1=c("A","A","B","B","C","C"), + C2=c("a","b","b","c","c","d"), + agg=c(1,4,5,8,9,10))) +# extra test with a size-1 group containing one NaN too +DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) +test(1583.2, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # Fixed a minor bug in fread when blank.lines.skip=TRUE f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE) @@ -14700,8 +14705,8 @@ if (test_bit64) { test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows } DT = data.table(id = c(1L,1L,2L), v = as.raw(0:2)) -test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min") -test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce max") +test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min/max") +test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce min/max") 
test(2020.03, DT[, median(v), by=id], error="'raw' not supported by GForce median") test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head") test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce tail") @@ -15820,8 +15825,8 @@ test(2065.7, DT[1L, z_sum := 1i][1L, z_sum], 1i) # GForce for complex columns, part of #3690 DT = data.table(id=c(1L,1L,2L), v=c(1i, 2i, 3i)) -test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min") -test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined max") +test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min/max") +test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined min/max") test(2066.03, DT[, head(v, 1), by=id], data.table(id=1:2, V1=c(1, 3)*1i)) test(2066.04, DT[, tail(v, 1), by=id], data.table(id=1:2, V1=(2:3)*1i)) test(2066.05, DT[, v[2], by=id], data.table(id = 1:2, V1=c(2i, NA))) diff --git a/src/gsumm.c b/src/gsumm.c index 651f1c3385..b925b35fa6 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -713,267 +713,112 @@ SEXP gmean(SEXP x, SEXP narmArg) return(ans); } -// gmin -SEXP gmin(SEXP x, SEXP narm) +static SEXP gminmax(SEXP x, SEXP narm, const bool min) { if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); - if (!isVectorAtomic(x)) error(_("GForce min can only be applied to columns, not .SD or similar. To find min of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,min),by=,.SDcols=]'")); - if (inherits(x, "factor") && !inherits(x, "ordered")) error(_("min is not meaningful for factors.")); + if (!isVectorAtomic(x)) error(_("GForce min/max can only be applied to columns, not .SD or similar. To find min/max of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,min),by=,.SDcols=]'")); + if (inherits(x, "factor") && !inherits(x, "ordered")) error(_("min/max is not meaningful for factors.")); const int n = (irowslen == -1) ? length(x) : irowslen; //clock_t start = clock(); SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmin"); - int protecti=0; + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmin"); + // GForce guarantees each group has at least one value; i.e. we don't need to consider length-0 per group here switch(TYPEOF(x)) { - case LGLSXP: case INTSXP: - ans = PROTECT(allocVector(INTSXP, ngrp)); protecti++; + case LGLSXP: case INTSXP: { + ans = PROTECT(allocVector(INTSXP, ngrp)); + int *ansd = INTEGER(ans); + const int *xd = INTEGER(x); if (!LOGICAL(narm)[0]) { - for (int i=0; i Date: Fri, 20 Aug 2021 02:51:30 -0400 Subject: [PATCH 367/588] gmin/gmax properly support integer64 (#4445) --- NEWS.md | 4 ++- inst/tests/tests.Rraw | 16 +++++++++++ src/gsumm.c | 63 ++++++++++++++++++++++++++++++------------- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5e666241af..80bc92137c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -185,7 +185,7 @@ 34. `na.omit(DT)` did not remove `NA` in `nanotime` columns, [#4744](https://github.com/Rdatatable/data.table/issues/4744). Thanks Jean-Mathieu Vermosen for reporting, and Michael Chirico for the PR. -35. 
`DT[, min(intCol, na.rm=TRUE), by=grp]` would return `Inf` for any groups containing all NAs, with a type change from `integer` to `numeric` to hold the `Inf`, and with warning. Now `NA` is returned for such all-NA groups, without warning or type change. This is almost-surely less surprising, more convenient, consistent, and efficient. There was no user request for this, likely because our desire to be consistent with base R in this regard was known (base R's `min(x, na.rm=TRUE)` returns `Inf` with warning for all-NA input). Matt Dowle made this change when reworking internals, [#5105](https://github.com/Rdatatable/data.table/pull/5105). The old behavior seemed so bad, and since there was a warning too, it seemed more appropriate to treat it as a bug. +35. `DT[, min(intCol, na.rm=TRUE), by=grp]` would return `Inf` for any groups containing all NAs, with a type change from `integer` to `numeric` to hold the `Inf`, and with warning. Similarly `max` would return `-Inf`. Now `NA` is returned for such all-NA groups, without warning or type change. This is almost-surely less surprising, more convenient, consistent, and efficient. There was no user request for this, likely because our desire to be consistent with base R in this regard was known (`base::min(x, na.rm=TRUE)` returns `Inf` with warning for all-NA input). Matt Dowle made this change when reworking internals, [#5105](https://github.com/Rdatatable/data.table/pull/5105). The old behavior seemed so bad, and since there was a warning too, it seemed appropriate to treat it as a bug. ```R DT @@ -228,6 +228,8 @@ # 2: b NA # NA because there are no non-NA, naturally # no inconvenient warning ``` + +36. `DT[, min(int64Col), by=grp]` (and `max`) would return incorrect results for `bit64::integer64` columns, [#4444](https://github.com/Rdatatable/data.table/issues/4444). Thanks to @go-see for reporting, and Michael Chirico for the PR. 
## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4f73a19085..9da461e16c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17954,3 +17954,19 @@ test(2206.11, isRealReallyInt(integer()), FALSE) DT = CJ(x=1:3, y=letters[1:2]) DT[, z := complex(real=1:6, imaginary=6:1)] test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i), b=c(2+5i, 4+3i, 6+1i), key='x')) + +# gmin/gmax for integer64, #4444 +if (test_bit64) { + DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) + old = options(datatable.optimize=2L) + test(2208.1, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + # create an all-NA group + DT[, i64:=rev(i64)] + test(2208.7, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + options(old) +} + diff --git a/src/gsumm.c b/src/gsumm.c index b925b35fa6..0fe05d1299 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -742,7 +742,7 @@ static SEXP gminmax(SEXP x, SEXP narm, const bool min) for (int i=0; i Date: Fri, 20 Aug 2021 15:49:31 -0400 Subject: [PATCH 368/588] Fread sep block dec (#4495) --- NEWS.md | 23 +++++++++++++++++++++++ inst/tests/tests.Rraw | 7 +++++++ src/fread.c | 4 ++-- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 80bc92137c..7b04b26ced 100644 --- a/NEWS.md +++ b/NEWS.md @@ -231,6 +231,29 @@ 36. `DT[, min(int64Col), by=grp]` (and `max`) would return incorrect results for `bit64::integer64` columns, [#4444](https://github.com/Rdatatable/data.table/issues/4444). Thanks to @go-see for reporting, and Michael Chirico for the PR. +37. `fread(dec=',')` was able to guess `sep=','` and return an incorrect result, [#4483](https://github.com/Rdatatable/data.table/issues/4483). Thanks to Michael Chirico for reporting and fixing. It was already an error to provide both `sep=','` and `dec=','` manually. + + ```R + fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=',') # no problem + + # A B C + # + # 1: 1 0.4 a + # 2: 2 0.5 b + + fread('A|B,C\n1|0,4\n2|0,5\n', dec=',') + + # A|B C # old result guessed sep=',' despite dec=',' + # + # 1: 1|0 4 + # 2: 2|0 5 + + # A B,C # now detects sep='|' correctly + # + # 1: 1 0.4 + # 2: 2 0.5 + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. 
If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9da461e16c..a2f69ae819 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17970,3 +17970,10 @@ if (test_bit64) { options(old) } +# when user supplies dec=',' don't try sep=',', #4483 +test(2209.1, fread('A,B,C\n1,+,4\n2,-,5\n3,-,6\n', dec=','), data.table('A,B,C'=c('1,+,4', '2,-,5', '3,-,6'))) +test(2209.2, fread('A|B|C\n1|+|4\n2|-|5\n3|-|6\n', dec=','), data.table(A=1:3, B=c('+', '-', '-'), C=4:6)) # ok before +test(2209.3, fread('A|B,C\n1|+,4\n2|-,5\n3|-,6\n', dec=','), data.table(A=1:3, 'B,C'=c(.4, -.5, -.6))) +test(2209.4, fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=','), data.table(A=1:2, B=c(0.4,0.5), C=c("a","b"))) # ok before +test(2209.5, fread('A|B,C\n1|0,4\n2|0,5\n', dec=','), data.table(A=1:2, "B,C"=c(0.4,0.5))) + diff --git a/src/fread.c b/src/fread.c index d00cffcd1f..70597a8f8d 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1598,8 +1598,8 @@ int freadMain(freadMainArgs _args) { ch = pos; } else { int nseps; - char seps[]=",|;\t "; // default seps in order of preference. See ?fread. - // seps[] not *seps for writeability (http://stackoverflow.com/a/164258/403310) + char seps__[] = ",|;\t "; // default seps in order of preference; writeable http://stackoverflow.com/a/164258/403310 + char *seps = dec!=',' ? seps__ : seps__+1; // prevent guessing sep=',' when dec=',' #4483 char topSep=127; // which sep 'wins' top place (see top* below). By default 127 (ascii del) means no sep i.e. single-column input (1 field) if (args.sep == '\0') { if (verbose) DTPRINT(_(" Detecting sep automatically ...\n")); From 26394bcf97110834ae5b28598ccd5416a2a19f36 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 20 Aug 2021 23:11:48 +0200 Subject: [PATCH 369/588] add test for na.omit(, cols=empty) (#4360) --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a2f69ae819..ce0941de7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14172,6 +14172,7 @@ DT = data.table(a = 1:10) test(1984.22, na.omit(DT, invert = 'a'), error="'invert' must be logical") test(1984.23, na.omit(DT, cols = 'b'), error="specify non existing column*.*b") #test(1984.24, na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency +test(1984.242, na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A=c(1,NA,2))) #2514 ### idcol = TRUE behavior of rbindlist test(1984.25, rbindlist(list(DT[1L], DT[2L]), idcol = TRUE), data.table(.id=1:2, a=1:2)) test(1984.26, setalloccol(`*tmp*`), error='setalloccol attempting to modify `*tmp*`') From a87c9b0e2e4a87df0416110182e99b81a99d4bc6 Mon Sep 17 00:00:00 2001 From: JenspederM <37183160+JenspederM@users.noreply.github.com> Date: Sat, 21 Aug 2021 04:39:24 +0200 Subject: [PATCH 370/588] dots were not passed along to as.IDate/as.ITime in IDateTime method. (#4674) --- NEWS.md | 15 +++++++++++++++ R/IDateTime.R | 2 +- inst/tests/tests.Rraw | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7b04b26ced..f6696b71e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -254,6 +254,21 @@ # 2: 2 0.5 ``` +38. 
`IDateTime()` ignored the `tz=` and `format=` arguments because `...` was not passed through to submethods, [#2402](https://github.com/Rdatatable/data.table/pull/2402). Thanks to Frank Narf for reporting, and Jens Peder Meldgaard for the PR. + + ``` + IDateTime("20171002095500", format="%Y%m%d%H%M%S") + + # was : + # Error in charToDate(x) : + # character string is not in a standard unambiguous format + + # now : + # idate itime + # + # 1: 2017-10-02 09:55:00 + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/IDateTime.R b/R/IDateTime.R index 9fd57b2b7a..42a6b289a6 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -270,7 +270,7 @@ mean.ITime = seq.ITime = c.ITime = function(x, ...) as.ITime(NextMethod()) IDateTime = function(x, ...) UseMethod("IDateTime") IDateTime.default = function(x, ...) { - data.table(idate = as.IDate(x), itime = as.ITime(x)) + data.table(idate = as.IDate(x, ...), itime = as.ITime(x, ...)) } # POSIXt support diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ce0941de7d..6626e4f0af 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10236,6 +10236,7 @@ test(1689.2, as.IDate(date_tz), output="2016-01-13") date_tz = structure(1496275200.11903, class = c("POSIXct", "POSIXt"), tzone = "America/Los_Angeles") test(1689.3, as.character(as.IDate(date_tz)), "2017-05-31") test(1689.4, as.character(as.IDate(date_tz, tz="UTC")), "2017-06-01") +test(1689.5, IDateTime("20171002095512", format="%Y%m%d%H%M%S"), data.table(idate=as.IDate("2017-10-02"), itime=as.ITime("09:55:12"))) #2402 # fix for #1766 and #1704 A = data.table(i = 1:6, j = rep(1:2, 3), x = letters[1:6], key = "i") From 1bc9178f56832e23978c352042602f60a6af9e19 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 22 Aug 2021 10:42:44 -0600 Subject: [PATCH 371/588] ignore datatable.nomatch option with warning (#5108) --- NEWS.md | 8 ++++++++ R/data.table.R | 2 +- R/foverlaps.R | 2 +- R/onLoad.R | 9 ++++----- man/data.table.Rd | 4 ++-- man/foverlaps.Rd | 9 ++++----- vignettes/datatable-importing.Rmd | 2 +- 7 files changed, 21 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index f6696b71e7..451532fca4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -307,6 +307,14 @@ > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. 
If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. +14. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : + + ``` + The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option. + ``` + + The message is now upgraded to warning that the option is now ignored. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/data.table.R b/R/data.table.R index 504eb49cc7..f13a1c7eab 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -138,7 +138,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) +"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) diff --git a/R/foverlaps.R b/R/foverlaps.R index 58c7a75557..9a0cd55808 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -1,4 +1,4 @@ -foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=key(y), maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=getOption("datatable.nomatch", NA), which=FALSE, verbose=getOption("datatable.verbose")) { +foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=key(y), maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=NA, which=FALSE, verbose=getOption("datatable.verbose")) { if (!is.data.table(y) || !is.data.table(x)) stopf("y and x must both be data.tables. 
Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.") maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap) diff --git a/R/onLoad.R b/R/onLoad.R index 9ad7051ffd..1ee328e99f 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -1,16 +1,15 @@ # nocov start -# used to raise message (write to STDERR but not raise warning) once per session only -# in future this will be upgraded to warning, then error, until eventually removed after several years .pkg.store = new.env() .pkg.store$.unsafe.done = FALSE .unsafe.opt = function() { if (.pkg.store$.unsafe.done) return(invisible()) val = getOption("datatable.nomatch") - if (is.null(val)) return(invisible()) # not set is ideal (it's no longer set in .onLoad) - if (identical(val, NA) || identical(val, NA_integer_)) return(invisible()) # set to default NA is ok for now; in future possible message/warning asking to remove - messagef("The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option.") + if (is.null(val)) return(invisible()) # not defined (it hasn't been defined in .onLoad since v1.12.4) + warningf("Option 'datatable.nomatch' is defined but is now ignored. Please see note 11 in v1.12.4 NEWS (Oct 2019), and note 14 in v1.14.2.") + # leave this as warning for a long time .pkg.store$.unsafe.done = TRUE + invisible() } .Last.updated = vector("integer", 1L) # exported variable; number of rows updated by the last := or set(), #1885 diff --git a/man/data.table.Rd b/man/data.table.Rd index fd2bbd4508..7ec8cec3a8 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -26,7 +26,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE) \method{[}{data.table}(x, i, j, by, keyby, with = TRUE, - nomatch = getOption("datatable.nomatch", NA), + nomatch = NA, mult = "all", roll = FALSE, rollends = if (roll=="nearest") c(TRUE,TRUE) @@ -121,7 +121,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac When \code{j} is a character vector of column names, a numeric vector of column positions to select or of the form \code{startcol:endcol}, and the value returned is always a \code{data.table}. \code{with=FALSE} is not necessary anymore to select columns dynamically. Note that \code{x[, cols]} is equivalent to \code{x[, ..cols]} and to \code{x[, cols, with=FALSE]} and to \code{x[, .SD, .SDcols=cols]}.} - \item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. Use \code{options(datatable.nomatch=NULL)} to change the default value (used when \code{nomatch} is not supplied).} + \item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. 
} \item{mult}{ When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}) and \emph{multiple} rows in \code{x} match to the row in \code{i}, \code{mult} controls which are returned: \code{"all"} (default), \code{"first"} or \code{"last"}.} diff --git a/man/foverlaps.Rd b/man/foverlaps.Rd index e90d251338..c8f72117c6 100644 --- a/man/foverlaps.Rd +++ b/man/foverlaps.Rd @@ -20,7 +20,7 @@ foverlaps(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.y = key(y), maxgap = 0L, minoverlap = 1L, type = c("any", "within", "start", "end", "equal"), mult = c("all", "first", "last"), - nomatch = getOption("datatable.nomatch", NA), + nomatch = NA, which = FALSE, verbose = getOption("datatable.verbose")) } \arguments{ @@ -66,11 +66,10 @@ of the overlap. This will be updated once \code{maxgap} is implemented.} match in \code{y}, \code{nomatch=NA} (default) means \code{NA} is returned for \code{y}'s non-\code{by.y} columns for that row of \code{x}. \code{nomatch=NULL} (or \code{0} for backward compatibility) means no rows will be returned for that -row of \code{x}. Use \code{options(datatable.nomatch=NULL)} to change the default -value (used when \code{nomatch} is not supplied).} +row of \code{x}. } \item{which}{ When \code{TRUE}, if \code{mult="all"} returns a two column \code{data.table} with the first column corresponding to \code{x}'s row number -and the second corresponding to \code{y}'s. when \code{nomatch=NA}, no matches +and the second corresponding to \code{y}'s. When \code{nomatch=NA}, no matches return \code{NA} for \code{y}, and if \code{nomatch=NULL}, those rows where no match is found will be skipped; if \code{mult="first" or "last"}, a vector of length equal to the number of rows in \code{x} is returned, with no-match entries @@ -116,7 +115,7 @@ NB: When \code{which=TRUE}: \code{a)} \code{mult="first" or "last"} returns a containing row numbers of \code{x} and the second column with corresponding row numbers of \code{y}. -\code{nomatch=NA or 0} also influences whether non-matching rows are returned +\code{nomatch=NA|NULL} also influences whether non-matching rows are returned or not, as explained above. } diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index 689e68903e..41a3d629ae 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -132,7 +132,7 @@ If you don't mind having `id` and `grp` registered as variables globally in your Common practice by R packages is to provide customization options set by `options(name=val)` and fetched using `getOption("name", default)`. Function arguments often specify a call to `getOption()` so that the user knows (from `?fun` or `args(fun)`) the name of the option controlling the default for that parameter; e.g. `fun(..., verbose=getOption("datatable.verbose", FALSE))`. All `data.table` options start with `datatable.` so as to not conflict with options in other packages. A user simply calls `options(datatable.verbose=TRUE)` to turn on verbosity. This affects all calls to `fun()` other the ones which have been provided `verbose=` explicity; e.g. `fun(..., verbose=FALSE)`. -The option mechanism in R is _global_. Meaning that if a user sets a `data.table` option for their own use, that setting also affects code inside any package that is using `data.table` too. 
For an option like `datatable.verbose`, this is exactly the desired behavior since the desire is to trace and log all `data.table` operations from wherever they originate; turning on verbosity does not affect the results. Another unique-to-R and excellent-for-production option is R's `options(warn=2)` which turns all warnings into errors. Again, the desire is to affect any warning in any package so as to not missing any warnings in production. There are 6 `datatable.print.*` options and 3 optimization options which do not affect the result of operations, either. However, there is one `data.table` option that does and is now a concern: `datatable.nomatch`. This option changes the default join from outer to inner. [Aside, the default join is outer because outer is safer; it doesn't drop missing data silently; moreover it is consistent to base R way of matching by names and indices.] Some users prefer inner join to be the default and we provided this option for them. However, a user setting this option can unintentionally change the behavior of joins inside packages that use `data.table`. Accordingly, in v1.12.4, we have started the process to deprecate the `datatable.nomatch` option. It is the only `data.table` option with this concern. +The option mechanism in R is _global_. Meaning that if a user sets a `data.table` option for their own use, that setting also affects code inside any package that is using `data.table` too. For an option like `datatable.verbose`, this is exactly the desired behavior since the desire is to trace and log all `data.table` operations from wherever they originate; turning on verbosity does not affect the results. Another unique-to-R and excellent-for-production option is R's `options(warn=2)` which turns all warnings into errors. Again, the desire is to affect any warning in any package so as to not miss any warnings in production. There are 6 `datatable.print.*` options and 3 optimization options which do not affect the result of operations. However, there is one `data.table` option that does and is now a concern: `datatable.nomatch`. This option changes the default join from outer to inner. [Aside, the default join is outer because outer is safer; it doesn't drop missing data silently; moreover it is consistent to base R way of matching by names and indices.] Some users prefer inner join to be the default and we provided this option for them. However, a user setting this option can unintentionally change the behavior of joins inside packages that use `data.table`. Accordingly, in v1.12.4 (Oct 2019) a message was printed when the `datatable.nomatch` option was used, and from v1.14.2 it is now ignored with warning. It was the only `data.table` option with this concern. ## Troubleshooting From 30b6f0eb0b9d8896fb2cdaf2bb4ebdf05a3e7388 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 22 Aug 2021 21:55:45 +0200 Subject: [PATCH 372/588] int i and nomatch arg (#4353) --- NEWS.md | 18 +++++++++++++++ R/data.table.R | 39 +++++++++++++++++--------------- inst/tests/tests.Rraw | 32 +++++++++++++++++++++++++- src/bmerge.c | 3 +-- src/nqrecreateindices.c | 6 ++--- src/subset.c | 50 ++++++++++++++++++++++++++--------------- 6 files changed, 106 insertions(+), 42 deletions(-) diff --git a/NEWS.md b/NEWS.md index 451532fca4..f0581ccb52 100644 --- a/NEWS.md +++ b/NEWS.md @@ -114,6 +114,24 @@ ```R mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` + +23. 
`DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. + + ```R + DT = data.table(A=1:3) + DT[c(1L, NA, 3L, 5L)] # default nomatch=NA + # A + # + # 1: 1 + # 2: NA + # 3: 3 + # 4: NA + DT[c(1L, NA, 3L, 5L), nomatch=NULL] + # A + # + # 1: 1 + # 2: 3 + ``` ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index f13a1c7eab..10a1381297 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -221,12 +221,12 @@ replace_dot_alias = function(e) { # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one. .unsafe.opt() #3585 missingnomatch = missing(nomatch) - if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857 - if (!is.na(nomatch) && nomatch!=0L) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") - nomatch = as.integer(nomatch) + nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353 + if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857 + if (!is.na(nomatch) && !is.null(nomatch)) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) - if (!is.na(nomatch) && is.na(which)) stopf("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") + if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE @@ -313,7 +313,7 @@ replace_dot_alias = function(e) { stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") if (!missingnomatch) { warningf("nomatch isn't relevant together with :=, ignoring nomatch") - nomatch=0L + nomatch=NULL } } } @@ -369,7 +369,7 @@ replace_dot_alias = function(e) { if (isub %iscall% "!") { notjoin = TRUE if (!missingnomatch) stopf("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch."); - nomatch = 0L + nomatch = NULL isub = isub[[2L]] # #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!" 
if (isub %iscall% "(" && !is.name(isub[[2L]])) @@ -389,7 +389,7 @@ replace_dot_alias = function(e) { on = o$on ## the following two are ignored if i is not a data.table. ## Since we are converting i to data.table, it is important to set them properly. - nomatch = 0L + nomatch = NULL mult = "all" } else if (!is.name(isub)) { @@ -509,8 +509,7 @@ replace_dot_alias = function(e) { len__ = ans$lens allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time. indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix - # length of input nomatch (single 0 or NA) is 1 in both cases. - # When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N + # When no match, len__ is 0 for nomatch=NULL and 1 for nomatch=NA, so len__ isn't .N # If using secondary key of x, f__ will refer to xo if (is.na(which)) { w = if (notjoin) f__!=0L else is.na(f__) @@ -533,7 +532,7 @@ replace_dot_alias = function(e) { # Fix for #1092 and #1074 # TODO: implement better version of "any"/"all"/"which" to avoid # unnecessary construction of logical vectors - if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] + if (is.null(nomatch) && allLen1) irows = irows[irows != 0L] } else { if (length(xo) && missing(on)) stopf("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov @@ -553,10 +552,10 @@ replace_dot_alias = function(e) { } else { if (!byjoin) { #1287 and #1271 irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic - if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows + if (is.null(nomatch)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows } # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"? - # TODO: how about when nomatch=0L, can we avoid allocating then as well? + # TODO: how about when nomatch=NULL, can we avoid allocating then as well? } if (length(xo) && length(irows)) { irows = xo[irows] # TO DO: fsort here? @@ -619,8 +618,12 @@ replace_dot_alias = function(e) { else stopf("i evaluates to a logical vector length %d but there are %d rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.", length(i), nrow(x)) } else { irows = as.integer(i) # e.g. 
DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) - irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning) - # simplifies logic from here on: can assume positive subscripts (no zeros) + if (nomatch0) warning("Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)") + # warning only for this case where nomatch was ignored before v1.14.2; #3109 + irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), + is.null(jsub) || root!=":=", # allowOverMax (NA when selecting, error when assigning) + !is.null(nomatch)) # allowNA=false when nomatch=NULL #3109, true when nomatch=NA #3666 + # simplifies logic from here on: can assume positive subscripts (no zeros), for nomatch=NULL also no NAs # maintains Arun's fix for #2697 (test 1042) # efficient in C with more detailed helpful messages when user mixes positives and negatives # falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives @@ -628,7 +631,7 @@ replace_dot_alias = function(e) { } } if (notjoin) { - if (byjoin || !is.integer(irows) || is.na(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov + if (byjoin || !is.integer(irows) || !is.null(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov irows = irows[irows!=0L] if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()} i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) @@ -843,7 +846,7 @@ replace_dot_alias = function(e) { # in 1.8.3, but this failed test 876. # TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match). # TO DO: Make xss directly, rather than recursive call. - if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset + if (is.null(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars)) xss = `[.data.table`(x,irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends) @@ -1290,7 +1293,7 @@ replace_dot_alias = function(e) { } ans = vector("list", length(ansvars)) ii = rep.int(indices__, len__) # following #1991 fix - # TODO: if (allLen1 && allGrp1 && (is.na(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=0 should drop rows in i that have no match] + # TODO: if (allLen1 && allGrp1 && (!is.null(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=NULL should drop rows in i that have no match] # But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT # to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally. # That R level branch was taken out in PR #3213 @@ -1721,7 +1724,7 @@ replace_dot_alias = function(e) { } dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with - # nomatch=0L even now.. 
but not switching it on yet, will deal it separately. + # nomatch=NULL even now.. but not switching it on yet, will deal it separately. if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { if (!length(ansvars) && !use.I) { GForce = FALSE diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6626e4f0af..d477b13e7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2293,7 +2293,8 @@ test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6)) test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6)) test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L) -test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch") +test(811.1, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch") +test(811.2, DT[c("b","foo","c"),which=NA,nomatch=NULL], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch") # New notj for column names and positions when with=FALSE, #1384 DT = data.table(a=1:3,b=4:6,c=7:9) @@ -17979,3 +17980,32 @@ test(2209.3, fread('A|B,C\n1|+,4\n2|-,5\n3|-,6\n', dec=','), data.table(A=1:3, ' test(2209.4, fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=','), data.table(A=1:2, B=c(0.4,0.5), C=c("a","b"))) # ok before test(2209.5, fread('A|B,C\n1|0,4\n2|0,5\n', dec=','), data.table(A=1:2, "B,C"=c(0.4,0.5))) +# respect `nomatch=NULL` for integer i, #3109 #3666 +DT = data.table(x = 1:4) +test(2210.01, DT[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integer_))) # default nomatch=NA +test(2210.02, DT[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L)) +test(2210.03, DT[c(1L, 5L, NA_integer_), nomatch=0], data.table(x = 1L), warning="Please use nomatch=NULL") +test(2210.04, DT[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L) +test(2210.05, DT[c(1L, 5L, NA_integer_), x, nomatch=0], 1L, warning="Please use nomatch=NULL") +test(2210.06, DT[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx +test(2210.07, DT[c(1:4,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2210.08, DT[c(1:4,NA_integer_,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2210.09, DT[c(1:4,NA_integer_,-1L), nomatch=0], error="Cannot mix positives and negatives", warning="Please use nomatch=NULL") +test(2210.10, DT[c(-1L,NA_integer_), nomatch=NULL], error="Cannot mix negatives and NA") +test(2210.11, DT[c(-1L,NA_integer_), nomatch=0], error="Cannot mix negatives and NA", warning="Please use nomatch=NULL") +test(2210.12, DT[NA, nomatch=NULL], data.table(x=integer())) +test(2210.13, DT[NA, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL") +test(2210.14, DT[0L, nomatch=NULL], data.table(x=integer())) +test(2210.15, DT[0L, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL") +test(2210.16, DT[0:1, nomatch=NULL], data.table(x=1L)) +test(2210.17, DT[0:1, nomatch=0], data.table(x=1L), warning="Please use nomatch=NULL") +test(2210.18, DT[-1L, nomatch=NULL], data.table(x=2:4)) +test(2210.19, DT[-1L, nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") +test(2210.20, data.table()[1L, nomatch=NULL], data.table()) +test(2210.21, 
data.table()[1L, nomatch=0], data.table(), warning="Please use nomatch=NULL") +test(2210.22, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows") +test(2210.23, data.table()[-1L, nomatch=0], data.table(), warning=c("Please use nomatch=NULL","there are only 0 rows")) +test(2210.24, DT[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to nomatch +test(2210.25, DT[-c(1L,0L), nomatch=NULL], data.table(x=2:4)) +test(2210.26, DT[-c(1L,0L), nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") + diff --git a/src/bmerge.c b/src/bmerge.c index 83c5b167d2..fac7ee281f 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -85,8 +85,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP error(_("rollends must be a length 2 logical vector")); rollends = LOGICAL(rollendsArg); - // nomatch arg - nomatch = INTEGER(nomatchArg)[0]; + nomatch = isNull(nomatchArg) ? 0 : INTEGER(nomatchArg)[0]; // mult arg if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; diff --git a/src/nqrecreateindices.c b/src/nqrecreateindices.c index a07ce38c7b..a750c093eb 100644 --- a/src/nqrecreateindices.c +++ b/src/nqrecreateindices.c @@ -14,7 +14,7 @@ SEXP nqRecreateIndices(SEXP xo, SEXP len, SEXP indices, SEXP nArg, SEXP nomatch) const int *iindices = INTEGER(indices); const int *ilen = INTEGER(len); const int *ixo = INTEGER(xo); - const int *inomatch = INTEGER(nomatch); + const int inomatch = isNull(nomatch) ? 0 : INTEGER(nomatch)[0]; int *inewstarts = INTEGER(newstarts); for (int i=0; i= xn) { // NA_integer_ = INT_MIN is checked in init.c - // j >= xn needed for special nomatch=0L case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) - inewstarts[i] = inomatch[0]; + // j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) + inewstarts[i] = inomatch; j++; // newlen will be 1 for xo=NA and 0 for xo=0 .. but we need to increment by 1 for both } else { inewstarts[i] = tmp+1; diff --git a/src/subset.c b/src/subset.c index 2e62b6ec9c..a04fcc9ecb 100644 --- a/src/subset.c +++ b/src/subset.c @@ -124,39 +124,42 @@ const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_ou return NULL; } -SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) +SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP allowNAArg) { // called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros // + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time) // + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway) // allowOverMaxArg is false when := (test 1024), otherwise true for selecting + // allowNAArg is false when nomatch=NULL #3109 #3666 if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov if (!isInteger(maxArg) || length(maxArg)!=1) error(_("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton"), type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error(_("Internal error: allowOverMax must be TRUE/FALSE")); // # nocov - int max = INTEGER(maxArg)[0], n=LENGTH(idx); + const int max = INTEGER(maxArg)[0], n=LENGTH(idx); if (max<0) error(_("Internal error. 
max is %d, must be >= 0."), max); // # nocov includes NA which will print as INT_MIN - int *idxp = INTEGER(idx); + if (!isLogical(allowNAArg) || LENGTH(allowNAArg)!=1 || LOGICAL(allowNAArg)[0]==NA_LOGICAL) error(_("Internal error: allowNAArg must be TRUE/FALSE")); // # nocov + const bool allowNA = LOGICAL(allowNAArg)[0]; + const int *idxp = INTEGER(idx); bool stop = false; #pragma omp parallel for num_threads(getDTthreads(n, true)) - for (int i=0; imax) stop=true; + if ((elem<1 && (elem!=NA_INTEGER || !allowNA)) || elem>max) stop=true; } - if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA or in range [1-max] + if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA (if allowNA) or in range [1-max] // --------- - // else massage the input to a standard idx where all items are either NA or in range [1,max] ... + // else massage the input to a standard idx where all items are either in range [1,max], or NA (if allowNA) - int countNeg=0, countZero=0, countNA=0, firstOverMax=0; - for (int i=0; imax && firstOverMax==0) firstOverMax=i+1; + else if (elem>max && ++countOverMax && firstOverMax==0) firstOverMax=i+1; } if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) { error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max); @@ -186,13 +189,24 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) SEXP ans; if (countNeg==0) { - // just zeros to remove, or >max to convert to NA - ans = PROTECT(allocVector(INTSXP, n - countZero)); - int *ansp = INTEGER(ans); - for (int i=0, ansi=0; imax ? NA_INTEGER : elem; + if (allowNA) { + // remove zeros, convert >max to NA + ans = PROTECT(allocVector(INTSXP, n-countZero)); + int *ansp = INTEGER(ans); + for (int i=0, ansi=0; imax ? 
NA_INTEGER : elem; + } + } else { + // remove zeros, NA and >max + ans = PROTECT(allocVector(INTSXP, n-countZero-countNA-countOverMax)); + int *ansp = INTEGER(ans); + for (int i=0, ansi=0; imax) continue; + ansp[ansi++] = elem; + } } } else { // idx is all negative without any NA but perhaps some zeros @@ -265,7 +279,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md bool anyNA=false, orderedSubset=true; // true for when rows==null (meaning all rows) if (!isNull(rows) && check_idx(rows, nrow, &anyNA, &orderedSubset)!=NULL) { SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++; - rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE))); nprotect++; + rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE), ScalarLogical(TRUE))); nprotect++; const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset); if (err!=NULL) error(err); } From 897ac6d172e9dc3852130209a0eb9940677040b4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 22 Aug 2021 18:24:02 -0600 Subject: [PATCH 373/588] #4353: tweak to pass R 3.1 where is.na(NULL) issued a warning --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 10a1381297..96e9547fc7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -223,7 +223,7 @@ replace_dot_alias = function(e) { missingnomatch = missing(nomatch) nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353 if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857 - if (!is.na(nomatch) && !is.null(nomatch)) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") + if (!(is.null(nomatch) || (length(nomatch)==1L && is.na(nomatch)))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. 
Please change or remove either which or nomatch.") From b33dee6130d35c9e62bcdf2db336eafe46db60e9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 24 Aug 2021 02:02:27 +0200 Subject: [PATCH 374/588] Gforce edge case creates segfault (#5109) --- NEWS.md | 26 +- R/test.data.table.R | 2 +- inst/tests/tests.Rraw | 44 +++- src/gsumm.c | 535 ++++++++++++++---------------------------- 4 files changed, 220 insertions(+), 387 deletions(-) diff --git a/NEWS.md b/NEWS.md index f0581ccb52..c91e72b81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,7 +18,7 @@ DT = data.table(A=1:3, B=letters[1:3]) DT[A>3, .(ITEM='A>3', A, B)] # (1) DT[A>3][, .(ITEM='A>3', A, B)] # (2) - # the above are now equivalent as expected and return: + # the above are now equivalent as expected and return: Empty data.table (0 rows and 3 cols): ITEM,A,B # Previously, (2) returned : ITEM A B @@ -30,12 +30,12 @@ 2: In as.data.table.list(jval, .named = NULL) : Item 3 has 0 rows but longest item has 1; filled with NA ``` - + ```R DT = data.table(A=1:3, B=letters[1:3], key="A") DT[.(1:3, double()), B] # new result : - character(0) + character(0) # old result : [1] "a" "b" "c" Warning message: @@ -51,7 +51,7 @@ DT[, sum(colB), keyby="colA"] DT[, sum(colB), by="colA", keyby=TRUE] # same ``` - + 7. `fwrite()` gains a new `datatable.fwrite.sep` option to change the default separator, still `","` by default. Thanks to Tony Fischetti for the PR. As is good practice in R in general, we usually resist new global options for the reason that a user changing the option for their own code can inadvertently change the behaviour of any package using `data.table` too. However, in this case, the global option affects file output rather than code behaviour. In fact, the very reason the user may wish to change the default separator is that they know a different separator is more appropriate for their data being passed to the package using `fwrite` but cannot otherwise change the `fwrite` call within that package. 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. @@ -86,7 +86,7 @@ out_col_name = "sum_x" )] ``` - + 11. `DT[, if (...) .(a=1L) else .(a=1L, b=2L), by=group]` now returns a 1-column result with warning `j may not evaluate to the same number of columns for each group`, rather than error `'names' attribute [2] must be the same length as the vector`, [#4274](https://github.com/Rdatatable/data.table/issues/4274). Thanks to @robitalec for reporting, and Michael Chirico for the PR. 12. Typo checking in `i` available since 1.11.4 is extended to work in non-English sessions, [#4989](https://github.com/Rdatatable/data.table/issues/4989). Thanks to Michael Chirico for the PR. @@ -114,7 +114,7 @@ ```R mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` - + 23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. 
Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. ```R @@ -246,26 +246,26 @@ # 2: b NA # NA because there are no non-NA, naturally # no inconvenient warning ``` - + 36. `DT[, min(int64Col), by=grp]` (and `max`) would return incorrect results for `bit64::integer64` columns, [#4444](https://github.com/Rdatatable/data.table/issues/4444). Thanks to @go-see for reporting, and Michael Chirico for the PR. 37. `fread(dec=',')` was able to guess `sep=','` and return an incorrect result, [#4483](https://github.com/Rdatatable/data.table/issues/4483). Thanks to Michael Chirico for reporting and fixing. It was already an error to provide both `sep=','` and `dec=','` manually. ```R fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=',') # no problem - + # A B C # # 1: 1 0.4 a # 2: 2 0.5 b fread('A|B,C\n1|0,4\n2|0,5\n', dec=',') - + # A|B C # old result guessed sep=',' despite dec=',' # # 1: 1|0 4 # 2: 2|0 5 - + # A B,C # now detects sep='|' correctly # # 1: 1 0.4 @@ -276,9 +276,9 @@ ``` IDateTime("20171002095500", format="%Y%m%d%H%M%S") - + # was : - # Error in charToDate(x) : + # Error in charToDate(x) : # character string is not in a standard unambiguous format # now : @@ -287,6 +287,8 @@ # 1: 2017-10-02 09:55:00 ``` +39. `DT[i, sum(b), by=grp]` (and other optimized-by-group aggregates: `mean`, `var`, `sd`, `median`, `prod`, `min`, `max`, `first`, `last`, `head` and `tail`) could segfault if `i` contained row numbers and one or more were NA, [#1994](https://github.com/Rdatatable/data.table/issues/1994). Thanks to Arun Srinivasan for reporting, and Benjamin Schwendinger for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/R/test.data.table.R b/R/test.data.table.R index 0c7fbeb23a..65a62fd0b5 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -178,7 +178,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) print(DT, class=FALSE) - catf("All %d tests (last %s) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) + catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) ## this chunk requires to include new suggested deps: graphics, grDevices #memtest.plot = function(.inittime) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d477b13e7d..4d782e4e92 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3257,7 +3257,7 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") x[ , r := as.raw(c(0, 1))] test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") - + # test dispatch for non-data.table objects, #4864. if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), @@ -6759,7 +6759,7 @@ if (test_xts) { " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", " 9: 1970-01-10 9", "10: 1970-01-11 10")) options(old) - + # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) @@ -14711,9 +14711,9 @@ DT = data.table(id = c(1L,1L,2L), v = as.raw(0:2)) test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min/max") test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce min/max") test(2020.03, DT[, median(v), by=id], error="'raw' not supported by GForce median") -test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head") -test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce tail") -test(2020.06, DT[, v[1], by=id], error="'raw' not supported by GForce subset") +test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head/tail/first/last/`[`") +test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce head/tail/first/last/`[`") +test(2020.06, DT[, v[1], by=id], error="'raw' not supported by GForce head/tail/first/last/`[`") test(2020.07, DT[, sd(v), by=id], error="'raw' not supported by GForce sd") test(2020.08, DT[, var(v), by=id], error="'raw' not supported by GForce var") test(2020.09, DT[, prod(v), by=id], error="'raw' not supported by GForce prod") @@ -17062,7 +17062,7 @@ registerS3method("format_col", "complex", format_col.complex) x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) test(2130.12, x, output = '(1.0, 3.0i)') rm(format_col.complex) -registerS3method("format_col", "complex", format_col.default) +registerS3method("format_col", "complex", format_col.default) # otherwise it remains registered after test.data.table() and causes test 1610.1 to fail on the next run for example, and user display if they have complex data # haven't found a way to unregister an S3 method (tried registering NULL but there's an error that NULL isn't a function) @@ -17779,7 +17779,7 @@ test(2188.12, 
fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, as.Date("2020-01-01")), test(2188.13, fifelse(TRUE, 1L, 2.0, "a"), error="'na' is of type character but 'no' is double. Please") # smart error message test(2188.14, fifelse(TRUE, NA, 2, as.Date("2019-07-07")), error="'no' has different class than 'na'. Please") test(2188.15, fifelse(TRUE, NA, factor('a'), factor('a', levels = c('a','b'))), error="'no' and 'na' are both type factor but their levels are different") -test(2188.16, fifelse(c(NA, NA), 1L, 2L, NULL), c(NA_integer_, NA_integer_)) # NULL `na` is treated as NA +test(2188.16, fifelse(c(NA, NA), 1L, 2L, NULL), c(NA_integer_, NA_integer_)) # NULL `na` is treated as NA # rolling join expected output on non-matching join column has been fixed #1913 DT = data.table(ID=1:5, A=c(1.3, 1.7, 2.4, 0.9, 0.6)) @@ -17821,7 +17821,7 @@ if (test_bit64) { DT[a==1, a:=12] DT[a==2, a:=as.integer64(13)] test(2193.1, DT, data.table(a = as.integer64(c(12,13,3:10)))) - + # X[Y,,by=.EACHI] when Y contains integer64 also fixed in 1.12.4, #3779 X = data.table(x=1:3) Y = data.table(x=1:2, y=as.integer64(c(10,20))) @@ -17899,7 +17899,7 @@ setDTthreads() # restore default throttle # fwrite now allows sep="", #4817 test(2202.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), output = c("abc", paste0("id", letters[1:5], 1:5))) -test(2202.2, fwrite(data.frame(a="id", b=1:1e2), sep=""), +test(2202.2, fwrite(data.frame(a="id", b=1:1e2), sep=""), output = c("ab", paste0("id", 1:1e2))) test(2202.3, fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), sep=""), output=c("ab", "foo", "2", "3.01bar")) @@ -18009,3 +18009,29 @@ test(2210.24, DT[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to no test(2210.25, DT[-c(1L,0L), nomatch=NULL], data.table(x=2:4)) test(2210.26, DT[-c(1L,0L), nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") +# NA in i would segfault gforce, #1994 +DT = data.table(a=1L, b=2, c="a", grp=1L) +i = c(1L,NA,NA,NA) # 3 NA to trigger segfault in var (min 3 obs) otherwise just c(1L,NA) is enough to trigger the others +funs = c("sum","mean","var","sd","median","prod","min","max","`[`","first","last","head","tail") +EVAL = function(...) { + e = paste0(...) + # cat(e,"\n") # uncomment to check the queries tested + eval(parse(text=e)) +} +testnum = 2211.0 +for (col in c("a","b","c")) { + testnum = testnum+0.1 + for (fi in seq_along(funs)) { + if (col=="c" && fi<=6L) next # first 6 funs don't support type character + f = funs[fi] + testnum = testnum+0.001 + test(testnum, EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i + EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first + if (fi<=8L) { + testnum = testnum+0.001 + test(testnum, EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), + EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) + } + } +} + diff --git a/src/gsumm.c b/src/gsumm.c index 0fe05d1299..f806b1e3c8 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -235,7 +235,7 @@ void *gather(SEXP x, bool *anyNA) } else { const int *my_x = irows + b*batchSize; for (int i=0; i1) + const bool nosubset = irowslen == -1; + const int n = nosubset ? 
length(x) : irowslen; SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gtail"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, first?"gfirst":"glast"); + const bool gnth = w>1; // const bool to avoid fetching grpsize[i] when not needed switch(TYPEOF(x)) { case LGLSXP: { const int *ix = LOGICAL(x); ans = PROTECT(allocVector(LGLSXP, ngrp)); int *ians = LOGICAL(ans); for (int i=0; igrpsize[i]) { ians[i]=NA_LOGICAL; continue; } + int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - ians[i] = ix[k]; + ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_LOGICAL : ix[irows[k]-1]); } } break; @@ -921,10 +925,10 @@ SEXP glast(SEXP x) { ans = PROTECT(allocVector(INTSXP, ngrp)); int *ians = INTEGER(ans); for (int i=0; igrpsize[i]) { ians[i]=NA_INTEGER; continue; } + int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - ians[i] = ix[k]; + ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_INTEGER : ix[irows[k]-1]); } } break; @@ -933,10 +937,10 @@ SEXP glast(SEXP x) { ans = PROTECT(allocVector(REALSXP, ngrp)); double *dans = REAL(ans); for (int i=0; igrpsize[i]) { dans[i]=NA_REAL; continue; } + int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - dans[i] = dx[k]; + dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_REAL : dx[irows[k]-1]); } } break; @@ -945,348 +949,146 @@ SEXP glast(SEXP x) { ans = PROTECT(allocVector(CPLXSXP, ngrp)); Rcomplex *dans = COMPLEX(ans); for (int i=0; igrpsize[i]) { dans[i]=NA_CPLX; continue; } + int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - dans[i] = dx[k]; + dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_CPLX : dx[irows[k]-1]); } } break; - case STRSXP: + case STRSXP: { + const SEXP *sx = STRING_PTR(x); ans = PROTECT(allocVector(STRSXP, ngrp)); for (int i=0; igrpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } + int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - SET_STRING_ELT(ans, i, STRING_ELT(x, k)); + SET_STRING_ELT(ans, i, nosubset ? sx[k] : (irows[k]==NA_INTEGER ? NA_STRING : sx[irows[k]-1])); } - break; - case VECSXP: + } break; + case VECSXP: { + const SEXP *vx = SEXPPTR_RO(x); ans = PROTECT(allocVector(VECSXP, ngrp)); for (int i=0; igrpsize[i]) { SET_VECTOR_ELT(ans, i, ScalarLogical(NA_LOGICAL)); continue; } + int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k)); + SET_VECTOR_ELT(ans, i, nosubset ? vx[k] : (irows[k]==NA_INTEGER ? ScalarLogical(NA_LOGICAL) : vx[irows[k]-1])); } - break; + } break; default: - error(_("Type '%s' not supported by GForce tail (gtail). Either add the prefix utils::tail(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + error(_("Type '%s' not supported by GForce head/tail/first/last/`[`. Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); } copyMostAttrib(x, ans); UNPROTECT(1); return(ans); } +SEXP glast(SEXP x) { + return gfirstlast(x, false, 1); +} + SEXP gfirst(SEXP x) { - const int n = (irowslen == -1) ? 
length(x) : irowslen; - SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "ghead"); - switch(TYPEOF(x)) { - case LGLSXP: { - int const *ix = LOGICAL(x); - ans = PROTECT(allocVector(LGLSXP, ngrp)); - int *ians = LOGICAL(ans); - for (int i=0; i grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; } - int k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - ians[i] = ix[k]; - } - } - break; - case INTSXP: { - const int *ix = INTEGER(x); - ans = PROTECT(allocVector(INTSXP, ngrp)); - int *ians = INTEGER(ans); - for (int i=0; i grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; } - int k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - ians[i] = ix[k]; - } - } - break; - case REALSXP: { - const double *dx = REAL(x); - ans = PROTECT(allocVector(REALSXP, ngrp)); - double *dans = REAL(ans); - for (int i=0; i grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; } - int k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - dans[i] = dx[k]; - } - } - break; - case CPLXSXP: { - const Rcomplex *dx = COMPLEX(x); - ans = PROTECT(allocVector(CPLXSXP, ngrp)); - Rcomplex *dans = COMPLEX(ans); - for (int i=0; i grpsize[i]) { dans[i].r = NA_REAL; dans[i].i = NA_REAL; continue; } - int k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - dans[i] = dx[k]; - } - } break; - case STRSXP: - ans = PROTECT(allocVector(STRSXP, ngrp)); - for (int i=0; i grpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } - int k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - SET_STRING_ELT(ans, i, STRING_ELT(x, k)); - } - break; - case VECSXP: - ans = PROTECT(allocVector(VECSXP, ngrp)); - for (int i=0; i grpsize[i]) { SET_VECTOR_ELT(ans, i, R_NilValue); continue; } - int k = ff[i]+val-2; - if (isunsorted) k = oo[k]-1; - k = (irowslen == -1) ? k : irows[k]-1; - SET_VECTOR_ELT(ans, i, VECTOR_ELT(x, k)); - } - break; - default: - error(_("Type '%s' not supported by GForce subset `[` (gnthvalue). Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); - } - copyMostAttrib(x, ans); - UNPROTECT(1); - return(ans); + return gfirstlast(x, true, INTEGER(valArg)[0]); } // TODO: gwhich.min, gwhich.max // implemented this similar to gmedian to balance well between speed and memory usage. There's one extra allocation on maximum groups and that's it.. and that helps speed things up extremely since we don't have to collect x's values for each group for each step (mean, residuals, mean again and then variance). -SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD) +static SEXP gvarsd1(SEXP x, SEXP narmArg, bool isSD) { - if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); if (!isVectorAtomic(x)) error(_("GForce var/sd can only be applied to columns, not .SD or similar. For the full covariance matrix of all items in a list such as .SD, either add the prefix stats::var(.SD) (or stats::sd(.SD)) or turn off GForce optimization using options(datatable.optimize=1). 
Alternatively, if you only need the diagonal elements, 'DT[,lapply(.SD,var),by=,.SDcols=]' is the optimized way to do this.")); if (inherits(x, "factor")) error(_("var/sd is not meaningful for factors.")); const int n = (irowslen == -1) ? length(x) : irowslen; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gvar"); SEXP sub, ans = PROTECT(allocVector(REALSXP, ngrp)); + double *ansd = REAL(ans); + const bool nosubset = irowslen==-1; + const bool narm = LOGICAL(narmArg)[0]; switch(TYPEOF(x)) { - case LGLSXP: case INTSXP: + case LGLSXP: case INTSXP: { sub = PROTECT(allocVector(INTSXP, maxgrpn)); // allocate once upfront - if (!LOGICAL(narm)[0]) { - for (int i=0; i DBL_MAX) REAL(ans)[i] = R_PosInf; - else if (s[i] < -DBL_MAX) REAL(ans)[i] = R_NegInf; - else REAL(ans)[i] = (double)s[i]; - } + s[thisgrp] *= elem; // no under/overflow here, s is long double (like base) + }} break; - case REALSXP: + case REALSXP: { + const double *xd = REAL(x); for (int i=0; i DBL_MAX) REAL(ans)[i] = R_PosInf; - else if (s[i] < -DBL_MAX) REAL(ans)[i] = R_NegInf; - else REAL(ans)[i] = (double)s[i]; - } + const double elem = nosubset ? xd[i] : (irows[i]==NA_INTEGER ? NA_REAL : xd[irows[i]-1]); + if (ISNAN(elem)) { + if (!narm) s[thisgrp] = NA_REAL; + continue; + } + s[thisgrp] *= elem; + }} break; default: free(s); error(_("Type '%s' not supported by GForce prod (gprod). Either add the prefix base::prod(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); } + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } free(s); copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); return(ans); } + From 7f3fba9be406071ae5d1ff283f1f9f067efd6593 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 25 Aug 2021 10:05:33 +0200 Subject: [PATCH 375/588] added ghead/gtail support for n>1 (#5089) --- NEWS.md | 2 + R/data.table.R | 29 ++++++-- inst/tests/tests.Rraw | 45 ++++++++----- src/gsumm.c | 149 ++++++++++++++++++++---------------------- 4 files changed, 128 insertions(+), 97 deletions(-) diff --git a/NEWS.md b/NEWS.md index c91e72b81c..4c9647cdfb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,6 +133,8 @@ # 2: 3 ``` +24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
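A minimal sketch of the `n>1` optimization described in item 24 above (illustration only, not part of the patch hunks; the exact verbose wording is assumed to contain the `GForce`/`ghead` marker that the new 1579.4x tests grep for):

    ```R
    library(data.table)
    DT = data.table(grp = rep(c("a", "b"), each = 5L), val = 1:10)
    old = options(datatable.optimize = 2L)                # level 2+ enables GForce
    res = DT[, head(.SD, 2L), by = grp, verbose = TRUE]   # verbose output should report the ghead() GForce path
    # the optimized result should match the non-optimized utils::head() route
    stopifnot(identical(res, DT[, utils::head(.SD, 2L), by = grp]))
    options(old)
    ```

Group sizes smaller than `n` are simply truncated, matching the `utils::head`/`utils::tail` behaviour the tests compare against (test 1579.404 uses `n=5` against a smallest group of 2 for exactly this reason).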
diff --git a/R/data.table.R b/R/data.table.R index 96e9547fc7..4dfa9c276a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -809,8 +809,8 @@ replace_dot_alias = function(e) { # when the 'by' expression includes get/mget/eval, all.vars cannot be trusted to infer all used columns, #4981 allbyvars = NULL else - allbyvars = intersect(all.vars(bysub), names_x) - + allbyvars = intersect(all.vars(bysub), names_x) + orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__) bysameorder = byindex = FALSE if (!bysub %iscall% ":" && ##Fix #4285 @@ -1740,13 +1740,13 @@ replace_dot_alias = function(e) { # is.symbol() is for #1369, #1974 and #2949 if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 - if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE) + if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: # 1) head/tail(x, 1) or 2) x[n], n>0 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) + ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) } if (jsub[[1L]]=="list") { GForce = TRUE @@ -1762,6 +1762,8 @@ replace_dot_alias = function(e) { if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } @@ -1841,6 +1843,25 @@ replace_dot_alias = function(e) { ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971. 
gi = if (length(o__)) o__[f__] else f__ g = lapply(grpcols, function(i) groups[[i]][gi]) + + # adding ghead/gtail(n) support for n > 1 #5060 #523 + q3 = 0 + if (!is.symbol(jsub)) { + headTail_arg = function(q) { + if (length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && + (q1 <- q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 + else 0 + } + if (jsub[[1L]] == "list"){ + q3 = max(sapply(jsub, headTail_arg)) + } else if (length(jsub)==3L) { + q3 = headTail_arg(jsub) + } + } + if (q3 > 0) { + grplens = pmin.int(q3, len__) + g = lapply(g, rep.int, times=grplens) + } ans = c(g, ans) } else { ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d782e4e92..32b16e471f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8116,21 +8116,36 @@ test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -# GForce _doesn't_ work when n > 1 -test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE') +# 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] -test(1579.23, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) -test(1579.24, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) -test(1579.25, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.26, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.27, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.28, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.29, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.30, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) - -ans = capture.output(dt[, .SD[2], by=x, verbose=TRUE]) -test(1579.31, any(grepl("GForce optimized", ans)), TRUE) +test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) + +# gforce head/tail for n>1, #5060 +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +options(datatable.optimize=2L) +test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405, 
DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") options(datatable.optimize = Inf) @@ -14695,11 +14710,11 @@ DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) test(2018.1, DT[, head(.SD), a, verbose=TRUE], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)), output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'", - "GForce is on, left j unchanged")) + "GForce optimized j to 'list(ghead(b, n = 6L), ghead(d, n = 6L))'")) test(2018.2, DT[, head(b), a, verbose=TRUE], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)), output=c("lapply optimization is on, j unchanged as 'head(b)'", - "GForce is on, left j unchanged")) + "GForce optimized j to 'ghead(b, n = 6L)'")) test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1))) test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12))) # gforce tests coverage diff --git a/src/gsumm.c b/src/gsumm.c index f806b1e3c8..7470f9f527 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -900,81 +900,72 @@ SEXP gmedian(SEXP x, SEXP narmArg) { return ans; } -static SEXP gfirstlast(SEXP x, const bool first, const int w) { +static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) { // w: which item (1 other than for gnthvalue when could be >1) + // headw: select 1:w of each group when first=true, and (n-w+1):n when first=false (i.e. tail) const bool nosubset = irowslen == -1; + const bool issorted = !isunsorted; // make a const-bool for use inside loops const int n = nosubset ? length(x) : irowslen; - SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, first?"gfirst":"glast"); - const bool gnth = w>1; // const bool to avoid fetching grpsize[i] when not needed - switch(TYPEOF(x)) { - case LGLSXP: { - const int *ix = LOGICAL(x); - ans = PROTECT(allocVector(LGLSXP, ngrp)); - int *ians = LOGICAL(ans); - for (int i=0; igrpsize[i]) { ians[i]=NA_LOGICAL; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_LOGICAL : ix[irows[k]-1]); - } - } - break; - case INTSXP: { - const int *ix = INTEGER(x); - ans = PROTECT(allocVector(INTSXP, ngrp)); - int *ians = INTEGER(ans); + if (w==1 && headw) error(_("Internal error: gfirstlast headw should only be true when w>1")); + int anslen = ngrp; + if (headw) { + anslen = 0; for (int i=0; igrpsize[i]) { ians[i]=NA_INTEGER; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? 
NA_INTEGER : ix[irows[k]-1]); + anslen += MIN(w, grpsize[i]); } } - break; - case REALSXP: { - const double *dx = REAL(x); - ans = PROTECT(allocVector(REALSXP, ngrp)); - double *dans = REAL(ans); - for (int i=0; igrpsize[i]) { dans[i]=NA_REAL; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_REAL : dx[irows[k]-1]); - } + SEXP ans = PROTECT(allocVector(TYPEOF(x), anslen)); + int ansi = 0; + #define DO(CTYPE, RTYPE, RNA, ASSIGN) { \ + const CTYPE *xd = (const CTYPE *)RTYPE(x); \ + if (headw) { \ + /* returning more than 1 per group; w>1 */ \ + for (int i=0; i1 && first) { \ + /* gnthvalue */ \ + for (int i=0; igrpn) { const CTYPE val=RNA; ASSIGN; continue; } \ + const int j = ff[i]-1+w-1; \ + const int k = issorted ? j : oo[j]-1; \ + const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]); \ + ASSIGN; \ + } \ + } else { \ + /* w>1 && !first not supported because -i in R means everything-but-i and gnthvalue */ \ + /* currently takes n>0 only. However, we could still support n'th from the end, somehow */ \ + error(_("Internal error: unanticipated case in gfirstlast first=%d w=%d headw=%d"), \ + first, w, headw); \ + } \ } - break; - case CPLXSXP: { - const Rcomplex *dx = COMPLEX(x); - ans = PROTECT(allocVector(CPLXSXP, ngrp)); - Rcomplex *dans = COMPLEX(ans); - for (int i=0; igrpsize[i]) { dans[i]=NA_CPLX; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_CPLX : dx[irows[k]-1]); - } - } break; - case STRSXP: { - const SEXP *sx = STRING_PTR(x); - ans = PROTECT(allocVector(STRSXP, ngrp)); - for (int i=0; igrpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - SET_STRING_ELT(ans, i, nosubset ? sx[k] : (irows[k]==NA_INTEGER ? NA_STRING : sx[irows[k]-1])); - } - } break; - case VECSXP: { - const SEXP *vx = SEXPPTR_RO(x); - ans = PROTECT(allocVector(VECSXP, ngrp)); - for (int i=0; igrpsize[i]) { SET_VECTOR_ELT(ans, i, ScalarLogical(NA_LOGICAL)); continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - SET_VECTOR_ELT(ans, i, nosubset ? vx[k] : (irows[k]==NA_INTEGER ? ScalarLogical(NA_LOGICAL) : vx[irows[k]-1])); - } - } break; + switch(TYPEOF(x)) { + case LGLSXP: { int *ansd=LOGICAL(ans); DO(int, LOGICAL, NA_LOGICAL, ansd[ansi++]=val) } break; + case INTSXP: { int *ansd=INTEGER(ans); DO(int, INTEGER, NA_INTEGER, ansd[ansi++]=val) } break; + case REALSXP: if (INHERITS(x, char_integer64)) { + int64_t *ansd=(int64_t *)REAL(ans); DO(int64_t, REAL, NA_INTEGER64, ansd[ansi++]=val) } + else { double *ansd=REAL(ans); DO(double, REAL, NA_REAL, ansd[ansi++]=val) } break; + case CPLXSXP: { Rcomplex *ansd=COMPLEX(ans); DO(Rcomplex, COMPLEX, NA_CPLX, ansd[ansi++]=val) } break; + case STRSXP: DO(SEXP, STRING_PTR, NA_STRING, SET_STRING_ELT(ans,ansi++,val)) break; + case VECSXP: DO(SEXP, SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(ans,ansi++,val)) break; default: error(_("Type '%s' not supported by GForce head/tail/first/last/`[`. Either add the prefix utils::head(.) 
or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); } @@ -984,26 +975,28 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w) { } SEXP glast(SEXP x) { - return gfirstlast(x, false, 1); + return gfirstlast(x, false, 1, false); } SEXP gfirst(SEXP x) { - return gfirstlast(x, true, 1); + return gfirstlast(x, true, 1, false); } -SEXP gtail(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, gtail is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return gfirstlast(x, false, 1); +SEXP gtail(SEXP x, SEXP nArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov + const int n=INTEGER(nArg)[0]; + return n==1 ? glast(x) : gfirstlast(x, false, n, true); } -SEXP ghead(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, ghead is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return gfirstlast(x, true, 1); +SEXP ghead(SEXP x, SEXP nArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov + const int n=INTEGER(nArg)[0]; + return n==1 ? gfirst(x) : gfirstlast(x, true, n, true); } -SEXP gnthvalue(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<=0) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return gfirstlast(x, true, INTEGER(valArg)[0]); +SEXP gnthvalue(SEXP x, SEXP nArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. 
please report to data.table issue tracker.")); // # nocov + return gfirstlast(x, true, INTEGER(nArg)[0], false); } // TODO: gwhich.min, gwhich.max From 76cfe11044ed0f773b1e9f6dd4d9dfab2126f74f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 05:03:47 -0600 Subject: [PATCH 376/588] #4864: contributor name correction --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 69e1eb9147..fea7936d52 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -69,7 +69,7 @@ Authors@R: c( person("Bennet","Becker", role="ctb"), person("Kyle","Haynes", role="ctb"), person("Boniface Christian","Kamgang", role="ctb"), - person("Odel","Marcelle", role="ctb")) + person("Olivier","Delmarcell", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown From b13b3329ce2a3811a1187204110203c148f3f750 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 26 Aug 2021 09:20:33 -0600 Subject: [PATCH 377/588] DT() follow up (#5113) --- NEWS.md | 4 +++- R/data.table.R | 29 ++++++++++++++++-------- R/test.data.table.R | 14 +++++++----- inst/tests/tests.Rraw | 51 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4c9647cdfb..194fa27e9e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -109,11 +109,13 @@ 21. `melt()` was pseudo generic in that `melt(DT)` would dispatch to the `melt.data.table` method but `melt(not-DT)` would explicitly redirect to `reshape2`. Now `melt()` is standard generic so that methods can be developed in other packages, [#4864](https://github.com/Rdatatable/data.table/pull/4864). Thanks to @odelmarcelle for suggesting and implementing. -22. `DT(i, j, by, ...)` has been added, i.e. functional form of a `data.table` query, [#641](https://github.com/Rdatatable/data.table/issues/641) [#4872](https://github.com/Rdatatable/data.table/issues/4872). Thanks to Yike Lu and Elio Campitelli for filing requests, many others for comments and suggestions, and Matt Dowle for the PR. This enables the `data.table` general form query to be invoked on a `data.frame` without converting it to a `data.table` first. The class of the input object is retained. +22. `DT(i, j, by, ...)` has been added, i.e. functional form of a `data.table` query, [#641](https://github.com/Rdatatable/data.table/issues/641) [#4872](https://github.com/Rdatatable/data.table/issues/4872). Thanks to Yike Lu and Elio Campitelli for filing requests, many others for comments and suggestions, and Matt Dowle for the PR. This enables the `data.table` general form query to be invoked on a `data.frame` without converting it to a `data.table` first. The class of the input object is retained. Thanks to Mark Fairbanks and Boniface Kamgang for testing and reporting problems that have been fixed before release, [#5106](https://github.com/Rdatatable/data.table/issues/5106) [#5107](https://github.com/Rdatatable/data.table/issues/5107). ```R mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` + + When `data.table` queries (either `[...]` or `|> DT(...)`) receive a `data.table`, the operations maintain `data.table`'s attributes such as its key and any indices. For example, if a `data.table` is reordered by `data.table`, or a key column has a value changed by `:=` in `data.table`, its key and indices will either be dropped or reordered appropriately. 
Some `data.table` operations automatically add and store an index on a `data.table` for reuse in future queries, if `options(datatable.auto.index=TRUE)`, which is `TRUE` by default. `data.table`'s are also over-allocated, which means there are spare column pointer slots allocated in advance so that a `data.table` in the `.GlobalEnv` can have a column added to it truly by reference, like an in-memory database with multiple client sessions connecting to one server R process, as a `data.table` video has shown in the past. But because R and other packages don't maintain `data.table`'s attributes or over-allocation (e.g. a subset or reorder by R or another package will create invalid `data.table` attributes) `data.table` cannot use these attributes when it detects that base R or another package has touched the `data.table` in the meantime, even if the attributes may sometimes still be valid. So, please realize that, `DT()` on a `data.table` should realize better speed and memory usage than `DT()` on a `data.frame`. `DT()` on a `data.frame` may still be useful to use `data.table`'s syntax (e.g. sub-queries within group: `|> DT(i, .SD[sub-query], by=grp)`) without needing to convert to a `data.table` first. 23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. diff --git a/R/data.table.R b/R/data.table.R index 4dfa9c276a..8718f3e44e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -446,7 +446,7 @@ replace_dot_alias = function(e) { i = as.data.table(i) } - if (is.data.table(i)) { + if (is.data.frame(i)) { if (missing(on)) { if (!haskey(x)) { stopf("When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") @@ -1160,7 +1160,8 @@ replace_dot_alias = function(e) { # ok=-1 which will trigger setalloccol with verbose in the next # branch, which again calls _selfrefok and returns the message then if ((ok<-selfrefok(x, verbose=FALSE))==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R] - warningf("Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.") + if (is.data.table(x)) warningf("Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. 
At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.") + # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() @@ -1325,13 +1326,12 @@ replace_dot_alias = function(e) { if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE setattr(ans,"sorted",head(key(x),keylen)) } - setattr(ans, "class", class(x)) # fix for #64 - setattr(ans, "row.names", .set_row_names(nrow(ans))) + setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64 + setattr(ans, "row.names", .set_row_names(length(ans[[1L]]))) setalloccol(ans) } - if (!with || missing(j)) return(ans) - + if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013 SDenv$.SDall = ans SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall SDenv$.N = nrow(ans) @@ -1544,6 +1544,7 @@ replace_dot_alias = function(e) { # TODO add: if (max(len__)==nrow) stopf("There is no need to deep copy x in this case") # TODO move down to dogroup.c, too. SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset + if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012 if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns] SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall } @@ -1934,7 +1935,17 @@ replace_dot_alias = function(e) { setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } -DT = `[.data.table` #4872 +DT = function(x, ...) { #4872 + old = getOption("datatable.optimize") + if (!is.data.table(x) && old>2L) { + options(datatable.optimize=2L) + # GForce still on; building and storing indices in .prepareFastSubset off; see long paragraph in news item 22 of v1.14.2 + } + ans = `[.data.table`(x, ...) + options(datatable.optimize=old) + .global$print = "" # functional form should always print; #5106 + ans +} .optmean = function(expr) { # called by optimization of j inside [.data.table only. Outside for a small speed advantage. if (length(expr)==2L) # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE @@ -2512,8 +2523,8 @@ copy = function(x) { } shallow = function(x, cols=NULL) { - if (!is.data.table(x)) - stopf("x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table") + if (!is.data.frame(x)) + stopf("x is not a data.table|frame. 
Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table|frame") ans = .shallow(x, cols=cols, retain.key=selfrefok(x)) # selfrefok for #5042 ans } diff --git a/R/test.data.table.R b/R/test.data.table.R index 65a62fd0b5..b64dfe119d 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -407,8 +407,8 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no y = try(y,TRUE) if (identical(x,y)) return(invisible(TRUE)) all.equal.result = TRUE - if (is.data.table(x) && is.data.table(y)) { - if (!selfrefok(x) || !selfrefok(y)) { + if (is.data.frame(x) && is.data.frame(y)) { + if ((is.data.table(x) && !selfrefok(x)) || (is.data.table(y) && !selfrefok(y))) { # nocov start catf("Test %s ran without errors but selfrefok(%s) is FALSE\n", numStr, if (selfrefok(x)) "y" else "x") fail = TRUE @@ -417,12 +417,14 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no xc=copy(x) yc=copy(y) # so we don't affect the original data which may be used in the next test # drop unused levels in factors - if (length(x)) for (i in which(vapply_1b(x,is.factor))) {.xi=x[[i]];xc[,(i):=factor(.xi)]} - if (length(y)) for (i in which(vapply_1b(y,is.factor))) {.yi=y[[i]];yc[,(i):=factor(.yi)]} - setattr(xc,"row.names",NULL) # for test 165+, i.e. x may have row names set from inheritance but y won't, consider these equal - setattr(yc,"row.names",NULL) + if (length(x)) for (i in which(vapply_1b(x,is.factor))) {.xi=x[[i]];xc[[i]]<-factor(.xi)} + if (length(y)) for (i in which(vapply_1b(y,is.factor))) {.yi=y[[i]];yc[[i]]<-factor(.yi)} + if (is.data.table(xc)) setattr(xc,"row.names",NULL) # for test 165+, i.e. x may have row names set from inheritance but y won't, consider these equal + if (is.data.table(yc)) setattr(yc,"row.names",NULL) setattr(xc,"index",NULL) # too onerous to create test RHS with the correct index as well, just check result setattr(yc,"index",NULL) + setattr(xc,".internal.selfref",NULL) # test 2212 + setattr(yc,".internal.selfref",NULL) if (identical(xc,yc) && identical(key(x),key(y))) return(invisible(TRUE)) # check key on original x and y because := above might have cleared it on xc or yc if (isTRUE(all.equal.result<-all.equal(xc,yc,check.environment=FALSE)) && identical(key(x),key(y)) && # ^^ to pass tests 2022.[1-4] in R-devel from 5 Dec 2020, #4835 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 32b16e471f..a7d292bdf6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7,6 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") + DTfun = DT # just in dev-mode, DT() gets overwritten in .GlobalEnv by DT objects here in tests.Rraw; we restore DT() in test 2212 } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -639,7 +640,7 @@ test(211, ncol(TESTDT), 2L) DT = data.table(a=1:6,key="a") test(212, DT[J(3)]$a, 3L) # correct class c("data.table","data.frame") class(DT) = "data.table" # incorrect class, but as from 1.8.1 it works. By accident when moving from colnames() to names(), it was dimnames() doing the check, but rather than add a check that identical(class(DT),c("data.frame","data.table")) at the top of [.data.table, we'll leave it flexible to user (user might not want to inherit from data.frame for some reason). 
-test(213, DT[J(3)]$a, 3L) +test(213, DT[J(3)]$a, error="x is not a data.table|frame") # from v1.14.2, data.table must inherit from data.frame (internals are too hard to reason if a data.table may not be data.frame too) # setkey now auto coerces double and character for convenience, and # to solve bug #953 @@ -14194,7 +14195,7 @@ test(1984.242, na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A= test(1984.25, rbindlist(list(DT[1L], DT[2L]), idcol = TRUE), data.table(.id=1:2, a=1:2)) test(1984.26, setalloccol(`*tmp*`), error='setalloccol attempting to modify `*tmp*`') DF = as.data.frame(DT) -test(1984.27, shallow(DF), error='x is not a data.table') +test(1984.27, shallow(DF), DF) # shallow (which is not exported) works on DF from v1.14.2 test(1984.28, split.data.table(DF), error='argument must be a data.table') test(1984.29, split(DT, by='a', f='a'), error="passing 'f' argument together with 'by' is not allowed") test(1984.30, split(DT), error="Either 'by' or 'f' argument must be supplied") @@ -18050,3 +18051,49 @@ for (col in c("a","b","c")) { } } +# DT() functional form, #4872 #5106 #5107 +if (base::getRversion() >= "4.1.0") { + # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 + if (exists("DTfun")) DT=DTfun # just in dev-mode restore DT() in .GlobalEnv as DT object overwrote it in tests above + droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below + test(2212.011, EVAL("mtcars |> DT(mpg>20, .(mean_hp=round(mean(hp),2)), by=cyl)"), + data.frame(cyl=c(6,4), mean_hp=c(110.0, 82.64))) + test(2212.012, EVAL("mtcars |> DT(mpg>15, .SD[hp>mean(hp)], by=cyl)"), + droprn(mtcars[c(10,11,30,3,9,21,27,28,32,29), c(2,1,3:11)])) + test(2212.013, EVAL("mtcars |> DT(mpg>20, .SD[hp>mean(hp)])"), + droprn(mtcars[ mtcars$mpg>20 & mtcars$hp>mean(mtcars$hp[mtcars$mpg>20]), ])) + D = copy(mtcars) + test(2212.02, EVAL("D |> DT(,.SD)"), D) + test(2212.03, EVAL("D |> DT(, .SD, .SDcols=5:8)"), D[,5:8]) + test(2212.04, EVAL("D |> DT(, 5:8)"), droprn(D[,5:8])) + test(2212.05, EVAL("D |> DT(, lapply(.SD, sum))"), as.data.frame(lapply(D,sum))) + test(2212.06, EVAL("D |> DT(, .SD, keyby=cyl) |> setkey(NULL)"), droprn(D[order(D$cyl),c(2,1,3:11)])) + test(2212.07, EVAL("D |> DT(1:20, .SD)"), droprn(D[1:20,])) + test(2212.08, EVAL("D |> DT(, .SD, by=cyl, .SDcols=5:8)"), droprn(D[unlist(tapply(1:32, D$cyl, c)[c(2,1,3)]), c(2,5:8)])) + test(2212.09, EVAL("D |> DT(1:20, .SD, .SDcols=5:8)"), droprn(D[1:20, 5:8])) + test(2212.10, EVAL("D |> DT(1:20, .SD, by=cyl, .SDcols=5:8)"), droprn(D[unlist(tapply(1:20, D$cyl[1:20], c)[c(2,1,3)]), c(2,5:8)])) + test(2212.11, EVAL("D |> DT(1:20, lapply(.SD, sum))"), as.data.frame(lapply(D[1:20,],sum))) + test(2212.12, droprn(EVAL("D |> DT(1:20, c(N=.N, lapply(.SD, sum)), by=cyl)")[c(1,3),c("cyl","N","carb")]), data.frame(cyl=c(6,8), N=c(6L,8L), carb=c(18,27))) + test(2212.13, EVAL("D |> DT(cyl==4)"), droprn(D[D$cyl==4,])) + test(2212.14, EVAL("D |> DT(cyl==4 & vs==0)"), droprn(D[D$cyl==4 & D$vs==0,])) + test(2212.15, EVAL("D |> DT(cyl==4 & vs>0)"), droprn(D[D$cyl==4 & D$vs>0,])) + test(2212.16, EVAL("D |> DT(cyl>=4)"), droprn(D[D$cyl>=4,])) + test(2212.17, EVAL("D |> DT(cyl!=4)"), droprn(D[D$cyl!=4,])) + test(2212.18, EVAL("D |> DT(cyl!=4 & vs!=0)"), droprn(D[D$cyl!=4 & D$vs!=0,])) + test(2212.19, EVAL("iris |> DT(Sepal.Length==5.0 & Species=='setosa')"), droprn(iris[iris$Sepal.Length==5.0 & iris$Species=="setosa",])) + test(2212.20, EVAL("iris |> DT(Sepal.Length==5.0)"), 
droprn(iris[iris$Sepal.Length==5.0,])) + test(2212.21, EVAL("iris |> DT(Species=='setosa')"), droprn(iris[iris$Species=='setosa',])) + test(2212.22, EVAL("D |> DT(, cyl)"), droprn(D[,"cyl"])) + test(2212.23, EVAL("D |> DT(1:2, cyl)"), droprn(D[1:2, "cyl"])) + test(2212.24, EVAL("D |> DT(, list(cyl))"), droprn(D[,"cyl",drop=FALSE])) + test(2212.25, EVAL("D |> DT(1:2, .(cyl))"), droprn(D[1:2, "cyl", drop=FALSE])) + test(2212.26, EVAL("D |> DT(, z:=sum(cyl))"), cbind(D, z=sum(D$cyl))) + test(2212.27, EVAL("D |> DT(, z:=round(mean(mpg),2), by=cyl)"), cbind(D, z=c("6"=19.74, "4"=26.66, "8"=15.10)[as.character(D$cyl)])) + test(2212.28, EVAL("D |> DT(1:3, z:=5, by=cyl)"), cbind(D, z=c(5,5,5,rep(NA,nrow(D)-3)))) + test(2212.29, EVAL("D |> DT(1:3, z:=NULL)"), error="When deleting columns, i should not be provided") + test(2212.30, EVAL("D |> DT(data.table(cyl=4), on='cyl')"), droprn(D[D$cyl==4,])) + test(2212.31, EVAL("D |> DT(data.frame(cyl=4), on='cyl')"), droprn(D[D$cyl==4,])) + test(2212.32, EVAL("D |> DT(.(4), on='cyl')"), droprn(D[D$cyl==4,])) + test(2212.33, EVAL("iris |> DT('setosa', on='Species')"), {tt=droprn(iris[iris$Species=="setosa",]); tt$Species=as.character(tt$Species); tt}) +} + From 457fe7b4213a4c2c4fc9ab0bb698591b842c6009 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 26 Aug 2021 09:38:22 -0700 Subject: [PATCH 378/588] bugfix for melt when measure.vars is a named list of length=1 (#5112) --- NEWS.md | 2 +- R/fmelt.R | 8 ++++---- inst/tests/tests.Rraw | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 194fa27e9e..c4b4861b3a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,7 +56,7 @@ 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. -9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to @tdhock for implementing. +9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). 
It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to Toby Dylon Hocking for implementing. Thanks to @keatingw for testing before release, requesting `measure()` accept single groups too [#5065](https://github.com/Rdatatable/data.table/issues/5065), and Toby for implementing. 10. A new interface for _programming on data.table_ has been added, closing [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing. diff --git a/R/fmelt.R b/R/fmelt.R index 243480445b..83963bebcd 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -60,7 +60,7 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { stopf("each ... argument to measure must be a function with at least one argument, problem: %s", names(fun.list)[[fun.i]]) } fun.list[[fun.i]] = fun - } + } measurev.args = c( list(fun.list), L[formal.i.vec], @@ -185,7 +185,7 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na } else {# single output column. 
structure(measure.vec, variable_table=group.dt) } -} +} melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", value.name = "value", ..., na.rm = FALSE, variable.factor = TRUE, value.factor = FALSE, @@ -200,11 +200,11 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl measure.vars = eval.result } } - if (is.list(measure.vars) && length(measure.vars) > 1L) { + if (is.list(measure.vars)) { meas.nm = names(measure.vars) if (is.null(meas.nm)) { # user-provided or default stub - if (length(value.name) == 1L) { + if (length(value.name) == 1L && length(measure.vars) > 1L) { value.name = paste0(value.name, seq_along(measure.vars)) } } else { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a7d292bdf6..9a08da8b0c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17643,7 +17643,8 @@ DTid = data.table(DT.wide, id=1) exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) -test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 +test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) +test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2))) # measure.vars named list length=1, #5065 ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues From f337b11ead3d17642296a76c4ff5b76c3b126135 Mon Sep 17 00:00:00 2001 From: dracodoc Date: Thu, 26 Aug 2021 14:58:25 -0400 Subject: [PATCH 379/588] setcolorder gains before= and after= (#4691) --- NEWS.md | 2 ++ R/data.table.R | 14 ++++++++++---- inst/tests/tests.Rraw | 18 +++++++++++++----- man/setcolorder.Rd | 3 ++- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index c4b4861b3a..0ddd69e8ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -137,6 +137,8 @@ 24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. +25. `setcolorder()` gains `before=` and `after=`, [#4358](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.
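As a quick illustration of news item 25 above: the new `before=`/`after=` arguments move the columns given in `neworder` relative to a single anchor column, and the two arguments are mutually exclusive. A minimal sketch, assuming this patch is applied (the column names are invented; the same calls are exercised by the new 498.x tests below):

```R
library(data.table)
DT = data.table(a = 1, b = 2, c = 3)
setcolorder(DT, "a", after = "c")    # column order becomes b, c, a
setcolorder(DT, "a", before = "b")   # back to a, b, c
setcolorder(DT, 1L, after = 3L)      # column numbers work too: b, c, a again
```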
diff --git a/R/data.table.R b/R/data.table.R index 8718f3e44e..d70e677615 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2658,15 +2658,21 @@ setnames = function(x,old,new,skip_absent=FALSE) { invisible(x) } -setcolorder = function(x, neworder=key(x)) +setcolorder = function(x, neworder=key(x), before=NULL, after=NULL) # before/after #4358 { if (is.character(neworder) && anyDuplicated(names(x))) stopf("x has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", brackify(unique(names(x)[duplicated(names(x))]))) - # if (!is.data.table(x)) stopf("x is not a data.table") + if (!is.null(before) && !is.null(after)) + stopf("Provide either before= or after= but not both") + if (length(before)>1 || length(after)>1) + stopf("before=/after= accept a single column name or number, not more than one") neworder = colnamesInt(x, neworder, check_dups=FALSE) # dups are now checked inside Csetcolorder below + if (length(before)) + neworder = c(setdiff(seq_len(colnamesInt(x, before) - 1L), neworder), neworder) + if (length(after)) + neworder = c(setdiff(seq_len(colnamesInt(x, after)), neworder), neworder) if (length(neworder) != length(x)) { - #if shorter than length(x), pad by the missing - # elements (checks below will catch other mistakes) + # pad by the missing elements (checks inside Csetcolorder catch other mistakes) neworder = c(neworder, setdiff(seq_along(x), neworder)) } .Call(Csetcolorder, x, neworder) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9a08da8b0c..86c5c95f63 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1529,14 +1529,22 @@ DT = data.table(a=1:2,b=3:4,c=5:6) test(495.1, setcolorder(DT,c(2,1,3)), data.table(b=3:4,a=1:2,c=5:6)) test(495.2, setcolorder(DT,c(2,1,3)), data.table(a=1:2,b=3:4,c=5:6)) test(496, setcolorder(DT,c("c","a","b")), data.table(c=5:6,a=1:2,b=3:4)) -test(497, setcolorder(DT,c("d","a","b")), error="specify non existing column*.*d") +test(497.01, setcolorder(DT,c("d","a","b")), error="specify non existing column*.*d") DT = data.table(a = 1:3, b = 2:4, c = 3:5) -test(498.1, names(setcolorder(DT, "b")), c("b", "a", "c")) -test(498.2, names(setcolorder(DT, c(2, 3))), c("a", "c", "b")) -test(498.3, setcolorder(DT, 1:4), error = "specify non existing column*.*4") +test(497.02, names(setcolorder(DT, "b")), c("b", "a", "c")) +test(497.03, names(setcolorder(DT, c(2, 3))), c("a", "c", "b")) +test(497.04, setcolorder(DT, 1:4), error = "specify non existing column*.*4") # Test where neworder=NULL, thus ordered by key and index columns DT = data.table(a = 1:3, b = 2:4, c = 3:5, d = 4:6, key="b") -test(498.4, names(setcolorder(DT)), c("b", "a", "c", "d")) +test(497.05, names(setcolorder(DT)), c("b", "a", "c", "d")) +# new arguments before= and after=, #4358 +DT = data.table(a=1, b=2, c=3) +test(498.01, setcolorder(DT, "a", after="c"), data.table(b=2, c=3, a=1)) +test(498.02, setcolorder(DT, "a", before="b"), data.table(a=1, b=2, c=3)) +test(498.03, setcolorder(DT, 1, after=3), data.table(b=2, c=3, a=1)) +test(498.04, setcolorder(DT, 3, before=1), data.table(a=1, b=2, c=3)) +test(498.05, setcolorder(DT, 1, before=1, after=1), error="Provide either before= or after= but not both") +test(498.06, setcolorder(DT, 1, before=1:2), error="before=/after= accept a single column name or number, not more than one") # test first group listens to nomatch when j uses join inherited scope. 
x <- data.table(x=c(1,3,8),x1=10:12, key="x") diff --git a/man/setcolorder.Rd b/man/setcolorder.Rd index 48d17c0ad9..71c6cd87fb 100644 --- a/man/setcolorder.Rd +++ b/man/setcolorder.Rd @@ -9,11 +9,12 @@ } \usage{ -setcolorder(x, neworder=key(x)) +setcolorder(x, neworder=key(x), before=NULL, after=NULL) } \arguments{ \item{x}{ A \code{data.table}. } \item{neworder}{ Character vector of the new column name ordering. May also be column numbers. If \code{length(neworder) < length(x)}, the specified columns are moved in order to the "front" of \code{x}. By default, \code{setcolorder} without a specified \code{neworder} moves the key columns in order to the "front" of \code{x}. } + \item{before, after}{ If one of them (not both) was provided with a column name or number, \code{neworder} will be inserted before or after that column. } } \details{ To reorder \code{data.table} columns, the idiomatic way is to use \code{setcolorder(x, neworder)}, instead of doing \code{x <- x[, neworder, with=FALSE]}. This is because the latter makes an entire copy of the \code{data.table}, which maybe unnecessary in most situations. \code{setcolorder} also allows column numbers instead of names for \code{neworder} argument, although we recommend using names as a good programming practice. From 01df48c19ae926f60785b8c9ee47cd3eb1eef837 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 26 Aug 2021 12:04:16 -0700 Subject: [PATCH 380/588] Update NEWS.md (#5114) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0ddd69e8ca..03dbe3d4b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,7 +56,7 @@ 8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. -9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to Toby Dylon Hocking for implementing. Thanks to @keatingw for testing before release, requesting `measure()` accept single groups too [#5065](https://github.com/Rdatatable/data.table/issues/5065), and Toby for implementing. +9. 
`melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to Toby Dylan Hocking for implementing. Thanks to @keatingw for testing before release, requesting `measure()` accept single groups too [#5065](https://github.com/Rdatatable/data.table/issues/5065), and Toby for implementing. 10. A new interface for _programming on data.table_ has been added, closing [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing. From 9e6e45301ea89227414a4f6df1ffc679c5c7ef1c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 26 Aug 2021 20:27:33 -0400 Subject: [PATCH 381/588] identifying / unifying similar messages for streamlined translation (#4299) --- inst/tests/nafill.Rraw | 18 +++++----- inst/tests/tests.Rraw | 69 ++++++++++++++++++------------------- src/assign.c | 11 +++--- src/between.c | 8 ++--- src/bmerge.c | 4 ++- src/cj.c | 2 +- src/coalesce.c | 2 +- src/dogroups.c | 2 +- src/fifelse.c | 4 +-- src/fmelt.c | 2 +- src/forder.c | 8 ++--- src/frank.c | 8 ++--- src/fread.c | 2 +- src/frollR.c | 14 ++++---- src/fsort.c | 6 ++-- src/gsumm.c | 77 ++++++++++++++++++++++++++---------------- src/init.c | 22 ++++++------ src/nafill.c | 2 +- src/openmp-utils.c | 3 +- src/shift.c | 2 +- src/subset.c | 8 ++--- src/uniqlist.c | 17 +++++----- src/utils.c | 10 +++--- 23 files changed, 163 insertions(+), 138 deletions(-) diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index 1e5107fb71..e8ea3d7eec 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -132,15 +132,15 @@ test(4.03, colnamesInt(dt, 1), 1L) test(4.04, colnamesInt(dt, c("a","d")), c(1L, 3L)) test(4.05, colnamesInt(dt, c(1L, 3L)), c(1L, 3L)) test(4.06, colnamesInt(dt, c(1, 3)), c(1L, 3L)) -test(4.07, colnamesInt(dt, c("a", "e")), error="specify non existing column*.*e") -test(4.08, colnamesInt(dt, c(1L, 4L)), error="specify non existing column*.*4") -test(4.09, colnamesInt(dt, c(1, 4)), error="specify non existing column*.*4") -test(4.10, colnamesInt(dt, c("a", NA)), error="specify non existing column*.*NA") -test(4.11, colnamesInt(dt, c(1L, NA)), error="specify non existing column") -test(4.12, colnamesInt(dt, c(1, NA)), error="specify non existing column") -test(4.13, colnamesInt(dt, c("a","d","a"), 
check_dups=TRUE), error="specify duplicated column") -test(4.14, colnamesInt(dt, c(1L, 3L, 1L), check_dups=TRUE), error="specify duplicated column") -test(4.15, colnamesInt(dt, c(1, 3, 1), check_dups=TRUE), error="specify duplicated column") +test(4.07, colnamesInt(dt, c("a", "e")), error="received non-existing column*.*e") +test(4.08, colnamesInt(dt, c(1L, 4L)), error="received non-existing column*.*4") +test(4.09, colnamesInt(dt, c(1, 4)), error="received non-existing column*.*4") +test(4.10, colnamesInt(dt, c("a", NA)), error="received non-existing column*.*NA") +test(4.11, colnamesInt(dt, c(1L, NA)), error="received non-existing column") +test(4.12, colnamesInt(dt, c(1, NA)), error="received non-existing column") +test(4.13, colnamesInt(dt, c("a","d","a"), check_dups=TRUE), error="received duplicate column(s)") +test(4.14, colnamesInt(dt, c(1L, 3L, 1L), check_dups=TRUE), error="received duplicate column(s)") +test(4.15, colnamesInt(dt, c(1, 3, 1), check_dups=TRUE), error="received duplicate column(s)") test(4.16, colnamesInt(dt, list("a")), error="must be character or numeric") test(4.17, colnamesInt(dt, NA), error="must be character or numeric") test(4.18, colnamesInt(dt, character()), integer()) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 86c5c95f63..a9eb68ab09 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1529,11 +1529,11 @@ DT = data.table(a=1:2,b=3:4,c=5:6) test(495.1, setcolorder(DT,c(2,1,3)), data.table(b=3:4,a=1:2,c=5:6)) test(495.2, setcolorder(DT,c(2,1,3)), data.table(a=1:2,b=3:4,c=5:6)) test(496, setcolorder(DT,c("c","a","b")), data.table(c=5:6,a=1:2,b=3:4)) -test(497.01, setcolorder(DT,c("d","a","b")), error="specify non existing column*.*d") +test(497.01, setcolorder(DT,c("d","a","b")), error="received non-existing column*.*d") DT = data.table(a = 1:3, b = 2:4, c = 3:5) test(497.02, names(setcolorder(DT, "b")), c("b", "a", "c")) test(497.03, names(setcolorder(DT, c(2, 3))), c("a", "c", "b")) -test(497.04, setcolorder(DT, 1:4), error = "specify non existing column*.*4") +test(497.04, setcolorder(DT, 1:4), error = "received non-existing column*.*4") # Test where neworder=NULL, thus ordered by key and index columns DT = data.table(a = 1:3, b = 2:4, c = 3:5, d = 4:6, key="b") test(497.05, names(setcolorder(DT)), c("b", "a", "c", "d")) @@ -2867,10 +2867,10 @@ test(988, unique(dt, by='B'), dt[!duplicated(df[, 'B'])]) test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])]) test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])]) test(991, unique(dt, by=NULL), dt[!duplicated(df)]) -test(991.1, unique(dt, by=4), error="specify non existing column*.*4") +test(991.1, unique(dt, by=4), error="received non-existing column*.*4") test(991.2, unique(dt, by=c(1,3.1)), error="is type 'double' and one or more items in it are not whole integers") test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])]) -test(991.4, unique(dt, by=c('C','D','E')), error="specify non existing column*.*D") +test(991.4, unique(dt, by=c('C','D','E')), error="received non-existing column*.*D") # :=NULL on factor column in empty data.table, #114 DT = data.table(A = integer(), B = factor()) @@ -2916,7 +2916,7 @@ test(996.07, CJ(1:2, list(1:2, 3), 4:5, sorted = FALSE), data.table(V1 = rep(1:2, each = 4L), V2 = rep(rep(list(1:2, 3), each = 2L), 2L), V3 = rep(4:5, 4L))) test(996.08, CJ(expression(1)), error = "element 1 is non-atomic") -test(996.09, CJ(expression(2), 3, sorted = FALSE), error = "Type 'expression' not supported") +test(996.09, 
CJ(expression(2), 3, sorted = FALSE), error = "Type 'expression' is not supported") ## complex input support (can't handle sorted yet) test(996.10, CJ(z = 0:1 + (0:1)*1i, b = 1:3, sorted = FALSE), data.table(z = rep(0:1, each=3L) + rep(0:1, each=3L)*1i, b = rep(1:3, 2))) @@ -4285,7 +4285,7 @@ setNumericRounding(old_rounding) DT = data.table(id=INT(1,2,1), val1=3:1, val2=3:1, val3=list(2:3,4:6,7:10)) # 5380 test(1199.1, DT[, sum(.SD), by=id, .SDcols=2:3], data.table(id=1:2, V1=INT(8,4))) #875 made the .SD case work test(1199.2, DT[, sum(.SD), by=id], error="data.*frame.*numeric") # this is R's error message so use flexible string pattern to insulate from minor changes in R, #4769 -test(1199.3, DT[, sum(val3), by=id], error="Type 'list' not supported by GForce sum [(]gsum[)]. Either.*or turn off") +test(1199.3, DT[, sum(val3), by=id], error="Type 'list' is not supported by GForce sum [(]gsum[)]. Either.*or turn off") # Selection of columns, copy column to maintain the same as R <= 3.0.2, in Rdevel, for now # Otherwise e.g. setkey changes the original columns too. TO DO: could allow shallow copy, perhaps. @@ -5571,7 +5571,7 @@ setDF(Y) test(1364.18, setdiff_(X, Y), error = 'x and y must both be data.tables') setDT(Y) test(1364.19, setdiff_(X[0L], Y), X[0L]) -test(1364.20, setdiff_(X, Y, by.x = 'f'), error = 'specify non existing column*.*f') +test(1364.20, setdiff_(X, Y, by.x = 'f'), error = 'received non-existing column*.*f') #test(1364.21, setdiff_(X, Y, by.x = c('f', 'g')), error = 'by.x values [f, g] not present') # now only first no existing column is printed for efficiency test(1364.22, setdiff_(X, Y[0L], by.x = 'a'), data.table(a = c(1, 3, 2), b = factor(c(1L, 3L, 2L)), @@ -6669,13 +6669,13 @@ test(1464.03, rleidv(DT, "b"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L)) test(1464.04, rleid(DT$b), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L)) test(1464.05, rleidv(DT, "c"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L)) test(1464.06, rleid(DT$c), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L)) -test(1464.07, rleid(as.raw(c(3L, 1L, 2L))), error="Type 'raw' not supported") -test(1464.08, rleidv(DT, 0), error="specify non existing column*.*0") -test(1464.09, rleidv(DT, 5), error="specify non existing column*.*5") +test(1464.07, rleid(as.raw(c(3L, 1L, 2L))), error="Type 'raw' is not supported") +test(1464.08, rleidv(DT, 0), error="received non-existing column*.*0") +test(1464.09, rleidv(DT, 5), error="received non-existing column*.*5") test(1464.10, rleidv(DT, 1:4), 1:nrow(DT)) set.seed(1) DT = data.table( sample(1:2,20,replace=TRUE), sample(1:2,20,replace=TRUE), sample(1:2,20, replace=TRUE)) -test(1464.11, rleidv(DT, 1:4), error="specify non existing column*.*4") +test(1464.11, rleidv(DT, 1:4), error="received non-existing column*.*4") test(1464.12, rleidv(DT, 1:2), ans<-INT(1,2,3,4,5,6,6,6,7,8,8,9,10,11,12,13,14,15,16,17)) test(1464.13, rleidv(DT, 2:1), ans) test(1464.14, rleidv(DT, c(3,1)), INT(1,1,2,2,3,4,5,5,6,7,8,9,10,11,12,13,14,15,16,17)) @@ -8038,8 +8038,8 @@ test(1574.4, X["bar", on="c"], X[2L]) # missed previously # fix for #1376 X = data.table(a=1:3,b=4:6,c=c("foo","bar","baz")) Y = data.table(A=2:4, B=5:7) -test(1575.1, X[Y, on=c(A="a")], error="specify non existing column*.*A") # does not report 'x' or 'i' anymore after switch to colnamesInt -test(1575.2, X[Y, on=c(a="a")], error="specify non existing column*.*a") +test(1575.1, X[Y, on=c(A="a")], error="received non-existing column*.*A") # does not report 'x' or 'i' anymore after switch to colnamesInt +test(1575.2, X[Y, on=c(a="a")], 
error="received non-existing column*.*a") # work around for issue introduced in v1.9.4, #1396 X = data.table(x=5:1, y=6:10) @@ -13491,10 +13491,11 @@ DT = data.table(x = c(1, 1, 3, 2), key = 'x') test(1962.003, duplicated(DT, fromLast = NA), error = 'must be TRUE or FALSE') test(1962.004, duplicated(DT, by = -1L), - error = 'specify non existing column*.*-1') + error = 'received non-existing column*.*-1') test(1962.005, duplicated(DT, by = 'y'), - error = 'specify non existing column*.*y') -test(1962.006, duplicated(data.table(NULL)), logical(0L)) + error = 'received non-existing column*.*y') +test(1962.0061, duplicated(data.table(NULL)), logical(0L)) +test(1962.0062, duplicated(data.table(a = 1L), by = character()), FALSE) test(1962.007, unique(DT, incomparables = TRUE), error = 'not used (yet)') @@ -13763,7 +13764,7 @@ test(1963.07, shift(DT, -1:1), c(NA, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L))) ## some coverage tests for good measure test(1963.08, shift(DT$x, type = 'some_other_type'), error='should be one of.*lag.*lead') -test(1963.09, shift(as.raw(0:1)), error = 'Unsupported type') +test(1963.09, shift(as.raw(0:1)), error = "Type 'raw' is not supported") test(1963.10, shift(DT, -1:1, type="shift", give.names = TRUE), # new type="shift" #3223 ans <- list(`x_shift_-1` = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), x_shift_0 = 1:10, @@ -13961,7 +13962,7 @@ test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10)) test(1967.622, setnames(x, 1:2, c("a",NA)), error = "NA in 'new' at positions [2]") test(1967.63, setcolorder(x, c(1, 1)), error = 'Item 2 of order (1) is either NA, out of range [1,2], or is duplicated. The new order must be a strict permutation of 1:n') test(1967.64, setcolorder(x, 1+3i), error = 'must be character or numeric') -test(1967.65, setcolorder(x, 300), error = 'specify non existing column*.*300') +test(1967.65, setcolorder(x, 300), error = 'received non-existing column*.*300') test(1967.66, rbindlist(list(x), idcol = FALSE), rbindlist(list(x))) test(1967.67, rbindlist(list(x), idcol = 1+3i), error = 'idcol must be a logical') @@ -14196,7 +14197,7 @@ dimnames(DT) <- list(NULL, 1:5) test(1984.21, names(DT), paste0(1:5)) DT = data.table(a = 1:10) test(1984.22, na.omit(DT, invert = 'a'), error="'invert' must be logical") -test(1984.23, na.omit(DT, cols = 'b'), error="specify non existing column*.*b") +test(1984.23, na.omit(DT, cols = 'b'), error="received non-existing column*.*b") #test(1984.24, na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency test(1984.242, na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A=c(1,NA,2))) #2514 ### idcol = TRUE behavior of rbindlist @@ -14732,15 +14733,15 @@ if (test_bit64) { test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows } DT = data.table(id = c(1L,1L,2L), v = as.raw(0:2)) -test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min/max") -test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce min/max") -test(2020.03, DT[, median(v), by=id], error="'raw' not supported by GForce median") -test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head/tail/first/last/`[`") -test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce head/tail/first/last/`[`") -test(2020.06, DT[, v[1], by=id], error="'raw' not supported by GForce head/tail/first/last/`[`") -test(2020.07, DT[, sd(v), by=id], error="'raw' 
not supported by GForce sd") -test(2020.08, DT[, var(v), by=id], error="'raw' not supported by GForce var") -test(2020.09, DT[, prod(v), by=id], error="'raw' not supported by GForce prod") +test(2020.01, DT[, min(v), by=id], error="'raw' is not supported by GForce min") +test(2020.02, DT[, max(v), by=id], error="'raw' is not supported by GForce max") +test(2020.03, DT[, median(v), by=id], error="'raw' is not supported by GForce median") +test(2020.04, DT[, head(v, 1), by=id], error="'raw' is not supported by GForce head/tail/first/last/`[`") +test(2020.05, DT[, tail(v, 1), by=id], error="'raw' is not supported by GForce head/tail/first/last/`[`") +test(2020.06, DT[, v[1], by=id], error="'raw' is not supported by GForce head/tail/first/last/`[`") +test(2020.07, DT[, sd(v), by=id], error="'raw' is not supported by GForce sd") +test(2020.08, DT[, var(v), by=id], error="'raw' is not supported by GForce var") +test(2020.09, DT[, prod(v), by=id], error="'raw' is not supported by GForce prod") DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA)) test(2020.10, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs @@ -15722,7 +15723,7 @@ test(2060.208, fcoalesce(fkt, 'b'), error='Item 1 is a factor but item 2 is not test(2060.209, fcoalesce(str, factor('b')), error='Item 2 is a factor but item 1 is not a factor. When factors are involved, all items must be factor') test(2060.212, fcoalesce(list(1), list(2)), error="The first argument is a list, data.table or data.frame. In this case there should be no other arguments provided.") test(2060.213, fcoalesce(bool, c(TRUE, FALSE)), error="Item 2 is length 2 but the first item is length 3. Only singletons are recycled") -test(2060.214, fcoalesce(as.raw(0), as.raw(1)), error="Unsupported type: raw") +test(2060.214, fcoalesce(as.raw(0), as.raw(1)), error="Type 'raw' is not supported") test(2060.215, fcoalesce(bool, list()), bool) test(2060.216, fcoalesce(structure(c(1:2,NA,4L), class=c("a")), c(NA,NA,3L,4L)),, error="Item 2 has a different class than item 1") # different classes of x arg #3660 @@ -15949,7 +15950,7 @@ DT1 = data.table(a = sample(3L, 15L, TRUE) + .1, b=sample(c(TRUE, FALSE, NA), 15 DT2 = data.table(a = sample(3L, 6L, TRUE) + .1, b=sample(c(TRUE, FALSE, NA), 6L, TRUE)) test(2069.32, DT1[DT2, .(y = sum(b, na.rm=TRUE)), by=.EACHI, on=c(a = 'a', b="b")]$y, rep(0L, 6L)) DT = data.table(z = 1i) -test(2069.33, DT[DT, on = 'z'], error = "Type 'complex' not supported for joining/merging") +test(2069.33, DT[DT, on = 'z'], error = "Type 'complex' is not supported for joining/merging") # forder verbose message when !isReallyReal Date, #1738 DT = data.table(d=sample(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days"), 20, replace=TRUE)) @@ -16019,7 +16020,7 @@ test(2072.036, fifelse(test_vec_na, 1+0i, 0+0i), as.complex(out_vec_na)) test(2072.037, fifelse(test_vec_na, rep(1+0i,12L), 0+0i), as.complex(out_vec_na)) test(2072.038, fifelse(test_vec_na, rep(1+0i,12L), rep(0+0i,12L)), as.complex(out_vec_na)) test(2072.039, fifelse(test_vec_na, 1+0i, rep(0+0i,12L)), as.complex(out_vec_na)) -test(2072.040, fifelse(test_vec, as.raw(0), as.raw(1)), error="Type raw is not supported.") +test(2072.040, fifelse(test_vec, as.raw(0), as.raw(1)), error="Type 'raw' is not supported") test(2072.041, fifelse(TRUE,1,as.Date("2019-07-07")), error="'yes' has different class than 'no'. Please") test(2072.042, fifelse(TRUE,1L,factor(letters[1])), error="'yes' has different class than 'no'. 
Please") test(2072.043, fifelse(TRUE, list(1:5), list(5:1)), list(1:5)) @@ -16958,7 +16959,7 @@ test(2127.13, fcase(test_vec1, 1+0i, test_vec2, 0+0i, default=2+0i), as.complex( test(2127.14, fcase(test_vec1, list(1), test_vec2, list(0),default=list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0)) test(2127.15, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5))) test(2127.16, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]),default=factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3])) -test(2127.17, fcase(test_vec1, as.raw(1), test_vec2, as.raw(0)), error="Type raw is not supported.") +test(2127.17, fcase(test_vec1, as.raw(1), test_vec2, as.raw(0)), error="Type 'raw' is not supported") test(2127.18, fcase(test_vec1, factor("a", levels=letters[1]), test_vec2, factor("b", levels=letters[1:3])), error="Argument #2 and argument #4 are both factor but their levels are different.") test(2127.19, fcase(test_vec1, factor("a", levels=letters[1:2]), test_vec2, factor("b", levels=letters[1:2]),default=factor("c", levels=letters[1:3])), error="Resulting value and 'default' are both type factor but their levels are different.") test(2127.20, fcase(test_vec1, 1L:10L, test_vec2, 3L:12L, test_vec2), error="Received 5 inputs; please supply an even number of arguments in ..., consisting of logical condition, resulting value pairs (in that order). Note that the default argument must be named explicitly, e.g., default=0") @@ -17659,7 +17660,7 @@ test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, measurev = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. test(2183.00001, melt(DT.wide, measure.vars=measurev()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) measurev = list("foo", "bar")#measurev below should not use this since it is not a function. -test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword iris.dt = data.table(datasets::iris) @@ -17682,7 +17683,7 @@ test(2183.00060, melt(DTid, measure.vars=measurev(list(letter=myfac, value.name= measure = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. test(2183.01, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) measure = list("foo", "bar")#measure below should not use this since it is not a function. 
-test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.03, melt(DTid, measure.vars=structure(list(a=c(NA,"a2"),b=c("b1","b2")), variable_table=data.table(number=as.complex(1:2)))), error="variable_table does not support column type 'complex' for column 'number'") test(2183.04, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2183.05, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword diff --git a/src/assign.c b/src/assign.c index c84ca276a5..d0faf337c8 100644 --- a/src/assign.c +++ b/src/assign.c @@ -121,9 +121,9 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) { if (verbose) Rprintf(_(".internal.selfref ptr is NULL. This is expected and normal for a data.table loaded from disk. Please remember to always setDT() immediately after loading to prevent unexpected behavior. If this table was not loaded from disk or you've already run setDT(), please report to data.table issue tracker.\n")); return -1; } - if (!isNull(p)) error(_("Internal error: .internal.selfref ptr is not NULL or R_NilValue")); // # nocov + if (!isNull(p)) error(_("Internal error: .internal.selfref ptr is neither NULL nor R_NilValue")); // # nocov tag = R_ExternalPtrTag(v); - if (!(isNull(tag) || isString(tag))) error(_("Internal error: .internal.selfref tag isn't NULL or a character vector")); // # nocov + if (!(isNull(tag) || isString(tag))) error(_("Internal error: .internal.selfref tag is neither NULL nor a character vector")); // # nocov names = getAttrib(x, R_NamesSymbol); if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734 SET_TRUELENGTH(names, LENGTH(names)); @@ -246,7 +246,8 @@ int checkOverAlloc(SEXP x) } SEXP alloccolwrapper(SEXP dt, SEXP overAllocArg, SEXP verbose) { - if (!isLogical(verbose) || length(verbose)!=1) error(_("verbose must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(verbose)) + error(_("%s must be TRUE or FALSE"), "verbose"); int overAlloc = checkOverAlloc(overAllocArg); SEXP ans = PROTECT(alloccol(dt, length(dt)+overAlloc, LOGICAL(verbose)[0])); @@ -311,7 +312,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; if (isNull(names)) error(_("dt passed to assign has no names")); if (length(names)!=oldncol) - error(_("Internal error in assign: length of names (%d) is not length of dt (%d)"),length(names),oldncol); // # nocov + error(_("Internal error: length of names (%d) is not length of dt (%d)"), length(names), oldncol); // # nocov if (isNull(dt)) { error(_("data.table is NULL; malformed. A null data.table should be an empty list. 
typeof() should always return 'list' for data.table.")); // # nocov // Not possible to test because R won't permit attributes be attached to NULL (which is good and we like); warning from R 3.4.0+ tested by 944.5 @@ -336,7 +337,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) const int *rowsd = INTEGER(rows); for (int i=0; inrow) - error(_("i[%d] is %d which is out of range [1,nrow=%d]."),i+1,rowsd[i],nrow); // set() reaches here (test 2005.2); := reaches the same error in subset.c first + error(_("i[%d] is %d which is out of range [1,nrow=%d]"), i+1, rowsd[i], nrow); // set() reaches here (test 2005.2); := reaches the same error in subset.c first if (rowsd[i]>=1) numToDo++; } if (verbose) Rprintf(_("Assigning to %d row subset of %d rows\n"), numToDo, nrow); diff --git a/src/between.c b/src/between.c index 899ea1d94e..818fa0cbb4 100644 --- a/src/between.c +++ b/src/between.c @@ -12,14 +12,14 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S error(_("Incompatible vector lengths: length(x)==%d length(lower)==%d length(upper)==%d. Each should be either length 1 or the length of the longest."), nx, nl, nu); } const int longestBound = MAX(nl, nu); // just for when check=TRUE - if (!isLogical(incbounds) || LOGICAL(incbounds)[0]==NA_LOGICAL) - error(_("incbounds must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(incbounds)) + error(_("%s must be TRUE or FALSE"), "incbounds"); const bool open = !LOGICAL(incbounds)[0]; if (!isLogical(NAboundsArg) || LOGICAL(NAboundsArg)[0]==FALSE) error(_("NAbounds must be TRUE or NA")); const bool NAbounds = LOGICAL(NAboundsArg)[0]==TRUE; - if (!isLogical(checkArg) || LOGICAL(checkArg)[0]==NA_LOGICAL) - error(_("check must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(checkArg)) + error(_("%s must be TRUE or FALSE"), "check"); const bool check = LOGICAL(checkArg)[0]; const bool verbose = GetVerbose(); diff --git a/src/bmerge.c b/src/bmerge.c index fac7ee281f..44ac7b569c 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -67,7 +67,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1)); if (iN && it!=xt) error(_("typeof x.%s (%s) != typeof i.%s (%s)"), CHAR(STRING_ELT(getAttrib(xdt,R_NamesSymbol),xcols[col]-1)), type2char(xt), CHAR(STRING_ELT(getAttrib(idt,R_NamesSymbol),icols[col]-1)), type2char(it)); if (iN && it!=LGLSXP && it!=INTSXP && it!=REALSXP && it!=STRSXP) - error(_("Type '%s' not supported for joining/merging"), type2char(it)); + error(_("Type '%s' is not supported for joining/merging"), type2char(it)); } // rollArg, rollendsArg @@ -368,6 +368,8 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg } break; // supported types were checked up front to avoid handling an error here in (future) parallel region + default: + error(_("Type '%s' is not supported for joining/merging"), type2char(TYPEOF(xc))); } if (xlowleach = (int *)R_alloc(data->lvalues, sizeof(int)); data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); diff --git a/src/forder.c b/src/forder.c index e7676386e7..a2ddf022a6 100644 --- a/src/forder.c +++ b/src/forder.c @@ -455,11 +455,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S STOP(_("Column %d is length %d which differs from length of column 1 (%d), are you attempting to order by a list column?\n"), INTEGER(by)[i], length(VECTOR_ELT(DT, INTEGER(by)[i]-1)), nrow); if 
(TYPEOF(VECTOR_ELT(DT, by_i-1)) == CPLXSXP) n_cplx++; } - if (!isLogical(retGrpArg) || LENGTH(retGrpArg)!=1 || INTEGER(retGrpArg)[0]==NA_LOGICAL) - STOP(_("retGrp must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(retGrpArg)) + STOP(_("%s must be TRUE or FALSE"), "retGrp"); retgrp = LOGICAL(retGrpArg)[0]==TRUE; - if (!isLogical(sortGroupsArg) || LENGTH(sortGroupsArg)!=1 || INTEGER(sortGroupsArg)[0]==NA_LOGICAL ) - STOP(_("sort must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(sortGroupsArg)) + STOP(_("%s must be TRUE or FALSE"), "sort"); sortType = LOGICAL(sortGroupsArg)[0]==TRUE; // if sortType is 1, it is later flipped between +1/-1 according to ascArg. Otherwise ascArg is ignored when sortType==0 if (!retgrp && !sortType) STOP(_("At least one of retGrp= or sort= must be TRUE")); diff --git a/src/frank.c b/src/frank.c index e8445998b3..7148bdd387 100644 --- a/src/frank.c +++ b/src/frank.c @@ -6,8 +6,8 @@ SEXP dt_na(SEXP x, SEXP cols) { int n=0, elem; - if (!isNewList(x)) error(_("Internal error. Argument 'x' to Cdt_na is type '%s' not 'list'"), type2char(TYPEOF(x))); // # nocov - if (!isInteger(cols)) error(_("Internal error. Argument 'cols' to Cdt_na is type '%s' not 'integer'"), type2char(TYPEOF(cols))); // # nocov + if (!isNewList(x)) error(_("Internal error. Argument '%s' to %s is type '%s' not '%s'"), "x", "Cdt_na", type2char(TYPEOF(x)), "list"); // # nocov + if (!isInteger(cols)) error(_("Internal error. Argument '%s' to %s is type '%s' not '%s'"), "cols", "Cdt_na", type2char(TYPEOF(cols)), "integer"); // # nocov for (int i=0; iLENGTH(x)) @@ -184,8 +184,8 @@ SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) { // internal version of anyNA for data.tables SEXP anyNA(SEXP x, SEXP cols) { int n=0; - if (!isNewList(x)) error(_("Internal error. Argument 'x' to CanyNA is type '%s' not 'list'"), type2char(TYPEOF(x))); // #nocov - if (!isInteger(cols)) error(_("Internal error. Argument 'cols' to CanyNA is type '%s' not 'integer'"), type2char(TYPEOF(cols))); // # nocov + if (!isNewList(x)) error(_("Internal error. Argument '%s' to %s is type '%s' not '%s'"), "x", "CanyNA", type2char(TYPEOF(x)), "list"); // #nocov + if (!isInteger(cols)) error(_("Internal error. 
Argument '%s' to %s is type '%s' not '%s'"), "cols", "CanyNA", type2char(TYPEOF(cols)), "integer"); // # nocov for (int i=0; iLENGTH(x)) diff --git a/src/fread.c b/src/fread.c index 70597a8f8d..7c7836250a 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1355,7 +1355,7 @@ int freadMain(freadMainArgs _args) { const char* fnam = args.filename; #ifndef WIN32 int fd = open(fnam, O_RDONLY); - if (fd==-1) STOP(_("file not found: %s"),fnam); + if (fd==-1) STOP(_("File not found: %s"),fnam); struct stat stat_buf; if (fstat(fd, &stat_buf) == -1) { close(fd); // # nocov diff --git a/src/frollR.c b/src/frollR.c index 8a1b0ef295..644b863439 100644 --- a/src/frollR.c +++ b/src/frollR.c @@ -36,8 +36,8 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX if (xlength(k) == 0) // check that window is non zero length error(_("n must be non 0 length")); - if (!isLogical(adaptive) || length(adaptive) != 1 || LOGICAL(adaptive)[0] == NA_LOGICAL) - error(_("adaptive must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(adaptive)) + error(_("%s must be TRUE or FALSE"), "adaptive"); bool badaptive = LOGICAL(adaptive)[0]; R_len_t nk = 0; // number of rolling windows, for adaptive might be atomic to be wrapped into list, 0 for clang -Wall @@ -91,7 +91,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX } if (!IS_TRUE_OR_FALSE(narm)) - error(_("na.rm must be TRUE or FALSE")); + error(_("%s must be TRUE or FALSE"), "na.rm"); if (!isLogical(hasna) || length(hasna)!=1) error(_("hasNA must be TRUE, FALSE or NA")); @@ -106,7 +106,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else if (!strcmp(CHAR(STRING_ELT(align, 0)), "left")) ialign = -1; else - error(_("Internal error: invalid align argument in rolling function, should have been caught before. please report to data.table issue tracker.")); // # nocov + error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "align", "rolling"); // # nocov if (badaptive && ialign!=1) error(_("using adaptive TRUE and align argument different than 'right' is not implemented")); @@ -138,7 +138,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "sum")) { sfun = SUM; } else { - error(_("Internal error: invalid fun argument in rolling function, should have been caught before. please report to data.table issue tracker.")); // # nocov + error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "fun", "rolling"); // # nocov } if (length(fill) != 1) @@ -160,7 +160,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else if (!strcmp(CHAR(STRING_ELT(algo, 0)), "exact")) ialgo = 1; // exact = 1 else - error(_("Internal error: invalid algo argument in rolling function, should have been caught before. please report to data.table issue tracker.")); // # nocov + error(_("Internal error: invalid %s argument in %s function should have been caught earlier. 
Please report to the data.table issue tracker."), "algo", "rolling"); // # nocov int* iik = NULL; if (!badaptive) { @@ -250,7 +250,7 @@ SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho) { } else if (!strcmp(CHAR(STRING_ELT(align, 0)), "left")) { ialign = -1; } else { - error(_("Internal error: invalid align argument in rolling function, should have been caught before. please report to data.table issue tracker.")); // # nocov + error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "align", "rolling"); // # nocov } if (length(fill) != 1) diff --git a/src/fsort.c b/src/fsort.c index c50f8bc3eb..6dbb85d550 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -101,10 +101,10 @@ int qsort_cmp(const void *a, const void *b) { SEXP fsort(SEXP x, SEXP verboseArg) { double t[10]; t[0] = wallclock(); - if (!isLogical(verboseArg) || LENGTH(verboseArg)!=1 || LOGICAL(verboseArg)[0]==NA_LOGICAL) - error(_("verbose must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(verboseArg)) + error(_("%s must be TRUE or FALSE"), "verbose"); Rboolean verbose = LOGICAL(verboseArg)[0]; - if (!isNumeric(x)) error(_("x must be a vector of type 'double' currently")); + if (!isNumeric(x)) error(_("x must be a vector of type double currently")); // TODO: not only detect if already sorted, but if it is, just return x to save the duplicate SEXP ansVec = PROTECT(allocVector(REALSXP, xlength(x))); diff --git a/src/gsumm.c b/src/gsumm.c index 7470f9f527..be3b0f7855 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -41,7 +41,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { double started = wallclock(); const bool verbose = GetVerbose(); if (TYPEOF(env) != ENVSXP) error(_("env is not an environment")); - // The type of jsub is pretty flexbile in R, so leave checking to eval() below. + // The type of jsub is pretty flexible in R, so leave checking to eval() below. if (!isInteger(o)) error(_("%s is not an integer vector"), "o"); if (!isInteger(f)) error(_("%s is not an integer vector"), "f"); if (!isInteger(l)) error(_("%s is not an integer vector"), "l"); @@ -339,14 +339,17 @@ void *gather(SEXP x, bool *anyNA) SEXP gsum(SEXP x, SEXP narmArg) { - if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); const bool narm = LOGICAL(narmArg)[0]; - if (inherits(x, "factor")) error(_("sum is not meaningful for factors.")); + if (inherits(x, "factor")) + error(_("%s is not meaningful for factors."), "sum"); const int n = (irowslen == -1) ? length(x) : irowslen; double started = wallclock(); const bool verbose=GetVerbose(); - if (verbose) Rprintf(_("This gsum took (narm=%s) ... "), narm?"TRUE":"FALSE"); - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); + if (verbose) Rprintf(_("This gsum (narm=%s) took ... "), narm?"TRUE":"FALSE"); + if (nrow != n) + error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); bool anyNA=false; SEXP ans; switch(TYPEOF(x)) { @@ -561,7 +564,7 @@ SEXP gsum(SEXP x, SEXP narmArg) } } break; default: - error(_("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + error(_("Type '%s' is not supported by GForce %s. 
Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)), "sum (gsum)", "base::sum(.)"); } copyMostAttrib(x, ans); if (verbose) { Rprintf(_("%.3fs\n"), wallclock()-started); } @@ -571,8 +574,10 @@ SEXP gsum(SEXP x, SEXP narmArg) SEXP gmean(SEXP x, SEXP narmArg) { - if (inherits(x, "factor")) error(_("mean is not meaningful for factors.")); - if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (inherits(x, "factor")) + error(_("%s is not meaningful for factors."), "mean"); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); const bool narm = LOGICAL(narmArg)[0]; const int n = (irowslen == -1) ? length(x) : irowslen; double started = wallclock(); @@ -715,15 +720,18 @@ SEXP gmean(SEXP x, SEXP narmArg) static SEXP gminmax(SEXP x, SEXP narm, const bool min) { - if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(narm)) + error(_("%s must be TRUE or FALSE"), "na.rm"); if (!isVectorAtomic(x)) error(_("GForce min/max can only be applied to columns, not .SD or similar. To find min/max of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,min),by=,.SDcols=]'")); - if (inherits(x, "factor") && !inherits(x, "ordered")) error(_("min/max is not meaningful for factors.")); - const int n = (irowslen == -1) ? length(x) : irowslen; + if (inherits(x, "factor") && !inherits(x, "ordered")) + error(_("%s is not meaningful for factors."), min?"min":"max"); + const bool nosubset = irowslen==-1; + const int n = nosubset ? length(x) : irowslen; //clock_t start = clock(); SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmin"); + if (nrow != n) + error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gminmax"); // GForce guarantees each group has at least one value; i.e. we don't need to consider length-0 per group here - const bool nosubset = irowslen==-1; switch(TYPEOF(x)) { case LGLSXP: case INTSXP: { ans = PROTECT(allocVector(INTSXP, ngrp)); @@ -829,7 +837,8 @@ static SEXP gminmax(SEXP x, SEXP narm, const bool min) error(_("Type 'complex' has no well-defined min/max")); break; default: - error(_("Type '%s' not supported by GForce min/max. Either add the prefix base::min(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), + type2char(TYPEOF(x)), min?"min (gmin)":"max (gmax)", min?"base::min(.)":"base::max(.)"); } copyMostAttrib(x, ans); // all but names,dim and dimnames. And if so, we want a copy here, not keepattr's SET_ATTRIB. UNPROTECT(1); // ans @@ -849,12 +858,15 @@ SEXP gmax(SEXP x, SEXP narm) // gmedian, always returns numeric type (to avoid as.numeric() wrap..) SEXP gmedian(SEXP x, SEXP narmArg) { - if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); if (!isVectorAtomic(x)) error(_("GForce median can only be applied to columns, not .SD or similar. 
To find median of all items in a list such as .SD, either add the prefix stats::median(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,median),by=,.SDcols=]'")); - if (inherits(x, "factor")) error(_("median is not meaningful for factors.")); + if (inherits(x, "factor")) + error(_("%s is not meaningful for factors."), "median"); const bool isInt64 = INHERITS(x, char_integer64), narm = LOGICAL(narmArg)[0]; const int n = (irowslen == -1) ? length(x) : irowslen; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmedian"); + if (nrow != n) + error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmedian"); SEXP ans = PROTECT(allocVector(REALSXP, ngrp)); double *ansd = REAL(ans); const bool nosubset = irowslen==-1; @@ -892,7 +904,7 @@ SEXP gmedian(SEXP x, SEXP narmArg) { }} break; default: - error(_("Type '%s' not supported by GForce median (gmedian). Either add the prefix stats::median(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)), "median (gmedian)", "stats::median(.)"); } if (!isInt64) copyMostAttrib(x, ans); // else the integer64 class needs to be dropped since double is always returned by gmedian @@ -967,7 +979,7 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) case STRSXP: DO(SEXP, STRING_PTR, NA_STRING, SET_STRING_ELT(ans,ansi++,val)) break; case VECSXP: DO(SEXP, SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(ans,ansi++,val)) break; default: - error(_("Type '%s' not supported by GForce head/tail/first/last/`[`. Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + error(_("Type '%s' is not supported by GForce head/tail/first/last/`[`. Either add the namespace prefix (e.g. utils::head(.)) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); } copyMostAttrib(x, ans); UNPROTECT(1); @@ -1003,11 +1015,14 @@ SEXP gnthvalue(SEXP x, SEXP nArg) { // implemented this similar to gmedian to balance well between speed and memory usage. There's one extra allocation on maximum groups and that's it.. and that helps speed things up extremely since we don't have to collect x's values for each group for each step (mean, residuals, mean again and then variance). static SEXP gvarsd1(SEXP x, SEXP narmArg, bool isSD) { - if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); if (!isVectorAtomic(x)) error(_("GForce var/sd can only be applied to columns, not .SD or similar. For the full covariance matrix of all items in a list such as .SD, either add the prefix stats::var(.SD) (or stats::sd(.SD)) or turn off GForce optimization using options(datatable.optimize=1). Alternatively, if you only need the diagonal elements, 'DT[,lapply(.SD,var),by=,.SDcols=]' is the optimized way to do this.")); - if (inherits(x, "factor")) error(_("var/sd is not meaningful for factors.")); + if (inherits(x, "factor")) + error(_("%s is not meaningful for factors."), isSD ? "sd" : "var"); const int n = (irowslen == -1) ? 
length(x) : irowslen; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gvar"); + if (nrow != n) + error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gvar"); SEXP sub, ans = PROTECT(allocVector(REALSXP, ngrp)); double *ansd = REAL(ans); const bool nosubset = irowslen==-1; @@ -1076,8 +1091,8 @@ static SEXP gvarsd1(SEXP x, SEXP narmArg, bool isSD) }} break; default: - error(_("Type '%s' not supported by GForce %s. Either add the prefix stats::var(.) or turn off GForce optimization using options(datatable.optimize=1)"), - type2char(TYPEOF(x)), isSD?"sd (gsd)":"var (gvar)"); + error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), + type2char(TYPEOF(x)), isSD?"sd (gsd)":"var (gvar)", isSD?"stats::sd(.)":"stats::var(.)"); } // no copyMostAttrib(x, ans) since class (e.g. Date) unlikely applicable to sd/var UNPROTECT(2); // ans,sub @@ -1093,15 +1108,19 @@ SEXP gsd(SEXP x, SEXP narm) { } SEXP gprod(SEXP x, SEXP narmArg) { - if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); const bool narm=LOGICAL(narmArg)[0]; - if (!isVectorAtomic(x)) error(_("GForce prod can only be applied to columns, not .SD or similar. To multiply all items in a list such as .SD, either add the prefix base::prod(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,prod),by=,.SDcols=]'")); - if (inherits(x, "factor")) error(_("prod is not meaningful for factors.")); + if (!isVectorAtomic(x)) + error(_("GForce prod can only be applied to columns, not .SD or similar. To multiply all items in a list such as .SD, either add the prefix base::prod(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,prod),by=,.SDcols=]'")); + if (inherits(x, "factor")) + error(_("%s is not meaningful for factors."), "prod"); const bool nosubset = irowslen==-1; const int n = nosubset ? length(x) : irowslen; //clock_t start = clock(); SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); + if (nrow != n) + error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); long double *s = malloc(ngrp * sizeof(long double)); if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; diff --git a/src/init.c b/src/init.c index eda1c607c4..56bf66d419 100644 --- a/src/init.c +++ b/src/init.c @@ -258,22 +258,22 @@ void attribute_visible R_init_data_table(DllInfo *info) R_registerRoutines(info, NULL, callMethods, NULL, externalMethods); R_useDynamicSymbols(info, FALSE); setSizes(); - const char *msg = "... failed. Please forward this message to maintainer('data.table')."; + const char *msg = _("... failed. 
Please forward this message to maintainer('data.table')."); if ((int)NA_INTEGER != (int)INT_MIN) error(_("Checking NA_INTEGER [%d] == INT_MIN [%d] %s"), NA_INTEGER, INT_MIN, msg); if ((int)NA_INTEGER != (int)NA_LOGICAL) error(_("Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s"), NA_INTEGER, NA_LOGICAL, msg); - if (sizeof(int) != 4) error(_("Checking sizeof(int) [%d] is 4 %s"), sizeof(int), msg); - if (sizeof(double) != 8) error(_("Checking sizeof(double) [%d] is 8 %s"), sizeof(double), msg); // 8 on both 32bit and 64bit + if (sizeof(int) != 4) error(_("Checking sizeof(%s) [%d] is %d %s"), "int", sizeof(int), 4, msg); + if (sizeof(double) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "double", sizeof(double), 8, msg); // 8 on both 32bit and 64bit // alignof not available in C99: if (alignof(double) != 8) error(_("Checking alignof(double) [%d] is 8 %s"), alignof(double), msg); // 8 on both 32bit and 64bit - if (sizeof(long long) != 8) error(_("Checking sizeof(long long) [%d] is 8 %s"), sizeof(long long), msg); + if (sizeof(long long) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "long long", sizeof(long long), 8, msg); if (sizeof(char *) != 4 && sizeof(char *) != 8) error(_("Checking sizeof(pointer) [%d] is 4 or 8 %s"), sizeof(char *), msg); if (sizeof(SEXP) != sizeof(char *)) error(_("Checking sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s"), sizeof(SEXP), sizeof(char *), msg); - if (sizeof(uint64_t) != 8) error(_("Checking sizeof(uint64_t) [%d] is 8 %s"), sizeof(uint64_t), msg); - if (sizeof(int64_t) != 8) error(_("Checking sizeof(int64_t) [%d] is 8 %s"), sizeof(int64_t), msg); - if (sizeof(signed char) != 1) error(_("Checking sizeof(signed char) [%d] is 1 %s"), sizeof(signed char), msg); - if (sizeof(int8_t) != 1) error(_("Checking sizeof(int8_t) [%d] is 1 %s"), sizeof(int8_t), msg); - if (sizeof(uint8_t) != 1) error(_("Checking sizeof(uint8_t) [%d] is 1 %s"), sizeof(uint8_t), msg); - if (sizeof(int16_t) != 2) error(_("Checking sizeof(int16_t) [%d] is 2 %s"), sizeof(int16_t), msg); - if (sizeof(uint16_t) != 2) error(_("Checking sizeof(uint16_t) [%d] is 2 %s"), sizeof(uint16_t), msg); + if (sizeof(uint64_t) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint64_t", sizeof(uint64_t), 8, msg); + if (sizeof(int64_t) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "int64_t", sizeof(int64_t), 8, msg); + if (sizeof(signed char) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "signed char", sizeof(signed char), 1, msg); + if (sizeof(int8_t) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "int8_t", sizeof(int8_t), 1, msg); + if (sizeof(uint8_t) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint8_t", sizeof(uint8_t), 1, msg); + if (sizeof(int16_t) != 2) error(_("Checking sizeof(%s) [%d] is %d %s"), "int16_t", sizeof(int16_t), 2, msg); + if (sizeof(uint16_t) != 2) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint16_t", sizeof(uint16_t), 2 ,msg); SEXP tmp = PROTECT(allocVector(INTSXP,2)); if (LENGTH(tmp)!=2) error(_("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s"), LENGTH(tmp), msg); diff --git a/src/nafill.c b/src/nafill.c index b393db1c16..03aa6d091b 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -167,7 +167,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S else if (!strcmp(CHAR(STRING_ELT(type, 0)), "nocb")) itype = 2; else - error(_("Internal error: invalid type argument in nafillR function, should have been caught before. 
Please report to data.table issue tracker.")); // # nocov + error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "type", "nafillR"); // # nocov bool hasFill = !isLogical(fill) || LOGICAL(fill)[0]!=NA_LOGICAL; bool *isInt64 = (bool *)R_alloc(nx, sizeof(bool)); diff --git a/src/openmp-utils.c b/src/openmp-utils.c index 22e562506d..c9003ee07b 100644 --- a/src/openmp-utils.c +++ b/src/openmp-utils.c @@ -75,7 +75,8 @@ static const char *mygetenv(const char *name, const char *unset) { } SEXP getDTthreads_R(SEXP verbose) { - if (!isLogical(verbose) || LENGTH(verbose)!=1 || INTEGER(verbose)[0]==NA_LOGICAL) error(_("'verbose' must be TRUE or FALSE")); + if(!IS_TRUE_OR_FALSE(verbose)) + error(_("%s must be TRUE or FALSE"), "verbose"); if (LOGICAL(verbose)[0]) { #ifndef _OPENMP Rprintf(_("This installation of data.table has not been compiled with OpenMP support.\n")); diff --git a/src/shift.c b/src/shift.c index f70fcf5c1f..9ff0449628 100644 --- a/src/shift.c +++ b/src/shift.c @@ -167,7 +167,7 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) } break; default : - error(_("Unsupported type '%s'"), type2char(TYPEOF(elem))); + error(_("Type '%s' is not supported"), type2char(TYPEOF(elem))); } } diff --git a/src/subset.c b/src/subset.c index a04fcc9ecb..2158451798 100644 --- a/src/subset.c +++ b/src/subset.c @@ -107,7 +107,7 @@ const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_ou // error if any negatives, zeros or >max since they should have been dealt with by convertNegAndZeroIdx() called ealier at R level. // single cache efficient sweep with prefetch, so very low priority to go parallel { - if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov + if (!isInteger(idx)) error(_("Internal error. Argument '%s' to %s is type '%s' not '%s'"), "idx", "check_idx", type2char(TYPEOF(idx)), "integer"); // # nocov bool anyLess=false, anyNA=false; int last = INT32_MIN; int *idxp = INTEGER(idx), n=LENGTH(idx); @@ -271,7 +271,7 @@ static void checkCol(SEXP col, int colNum, int nrow, SEXP x) SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md and man/cdt.Rd int nprotect=0; - if (!isNewList(x)) error(_("Internal error. Argument 'x' to CsubsetDT is type '%s' not 'list'"), type2char(TYPEOF(rows))); // # nocov + if (!isNewList(x)) error(_("Internal error. Argument '%s' to %s is type '%s' not '%s'"), "x", "CsubsetDT", type2char(TYPEOF(rows)), "list"); // # nocov if (!length(x)) return(x); // return empty list const int nrow = length(VECTOR_ELT(x,0)); @@ -284,10 +284,10 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md if (err!=NULL) error(err); } - if (!isInteger(cols)) error(_("Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'"), type2char(TYPEOF(cols))); // # nocov + if (!isInteger(cols)) error(_("Internal error. 
Argument '%s' to %s is type '%s' not '%s'"), "cols", "Csubset", type2char(TYPEOF(cols)), "integer"); // # nocov for (int i=0; iLENGTH(x)) error(_("Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]"), i+1, this, LENGTH(x)); + if (this<1 || this>LENGTH(x)) error(_("Item %d of cols is %d which is outside the range [1,ncol(x)=%d]"), i+1, this, LENGTH(x)); } int overAlloc = checkOverAlloc(GetOption(install("datatable.alloccol"), R_NilValue)); diff --git a/src/uniqlist.c b/src/uniqlist.c index d79f7587e0..48da706a2c 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -95,7 +95,7 @@ SEXP uniqlist(SEXP l, SEXP order) } } break; default : - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + error(_("Type '%s' is not supported"), type2char(TYPEOF(v))); // # nocov } } else { // ncol>1 @@ -127,7 +127,7 @@ SEXP uniqlist(SEXP l, SEXP order) } break; default : - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + error(_("Type '%s' is not supported"), type2char(TYPEOF(v))); // # nocov } } if (!b) { @@ -171,7 +171,7 @@ SEXP rleid(SEXP l, SEXP cols) { int *icols = INTEGER(cols); for (int i=0; incol) error(_("Item %d of cols is %d which is outside range of l [1,length(l)=%d]"), i+1, elem, ncol); + if (elem<1 || elem>ncol) error(_("Item %d of cols is %d which is outside the range [1,length(l)=%d]"), i+1, elem, ncol); } for (int i=1; i0 but ngrps==0"), nrows); // # nocov R_len_t resetctr=0, rlen = length(resetvals) ? INTEGER(resetvals)[0] : 0; - if (!isInteger(cols) || ncols == 0) error(_("cols must be an integer vector of positive length")); + if (!isInteger(cols) || ncols == 0) error(_("cols must be an integer vector with length >= 1")); // mult arg enum {ALL, FIRST, LAST} mult = ALL; if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; @@ -318,7 +318,7 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul dtwiddle(xd[thisi]) >= dtwiddle(xd[previ]); } break; default: - error(_("Type '%s' not supported"), type2char(TYPEOF(v))); // # nocov + error(_("Type '%s' is not supported"), type2char(TYPEOF(v))); // # nocov } } if (b) break; @@ -350,7 +350,8 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul SEXP uniqueNlogical(SEXP x, SEXP narmArg) { // single pass; short-circuit and return as soon as all 3 values are found if (!isLogical(x)) error(_("x is not a logical vector")); - if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error(_("na.rm must be TRUE or FALSE")); + if (!IS_TRUE_OR_FALSE(narmArg)) + error(_("%s must be TRUE or FALSE"), "na.rm"); bool narm = LOGICAL(narmArg)[0]==1; const R_xlen_t n = xlength(x); if (n==0) diff --git a/src/utils.c b/src/utils.c index 312f554f8f..e499aced06 100644 --- a/src/utils.c +++ b/src/utils.c @@ -92,7 +92,7 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) { if (!isNewList(x)) error(_("'x' argument must be data.table compatible")); if (!IS_TRUE_OR_FALSE(check_dups)) - error(_("'check_dups' argument must be TRUE or FALSE")); + error(_("%s must be TRUE or FALSE"), "check_dups"); int protecti = 0; R_len_t nx = length(x); R_len_t nc = length(cols); @@ -114,7 +114,7 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) { int *icols = INTEGER(ricols); for (int i=0; inx) || (icols[i]<1)) - error(_("argument specifying columns specify non existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also + error(_("argument specifying columns received non-existing column(s): cols[%d]=%d"), i+1, icols[i]); // 
handles NAs also } } else if (isString(cols)) { SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++; @@ -124,13 +124,13 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) { int *icols = INTEGER(ricols); for (int i=0; i Date: Fri, 27 Aug 2021 07:33:17 -0700 Subject: [PATCH 382/588] fix floating point parsing precision in some rare cases (#4463) --- NEWS.md | 2 + inst/tests/tests.Rraw | 3 + src/fread.c | 11 +- src/fread.h | 2 +- src/freadLookups.h | 303 +----------------------------------------- 5 files changed, 14 insertions(+), 307 deletions(-) diff --git a/NEWS.md b/NEWS.md index 03dbe3d4b5..6a7c3ba948 100644 --- a/NEWS.md +++ b/NEWS.md @@ -295,6 +295,8 @@ 39. `DT[i, sum(b), by=grp]` (and other optimized-by-group aggregates: `mean`, `var`, `sd`, `median`, `prod`, `min`, `max`, `first`, `last`, `head` and `tail`) could segfault if `i` contained row numbers and one or more were NA, [#1994](https://github.com/Rdatatable/data.table/issues/1994). Thanks to Arun Srinivasan for reporting, and Benjamin Schwendinger for the PR. +40. `identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)` is now TRUE, [#4461](https://github.com/Rdatatable/data.table/issues/4461). `fread` was using `*10^-n` rather than `/10^n` resulting in `0.80606673659999994` vs `0.80606673660000006`. `fread()` now matches R's parser and `read.table` identically in this respect. Thanks to Gabe Becker for requesting consistency, and Michael Chirico for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a9eb68ab09..1d39b2d817 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18107,3 +18107,6 @@ if (base::getRversion() >= "4.1.0") { test(2212.33, EVAL("iris |> DT('setosa', on='Species')"), {tt=droprn(iris[iris$Species=="setosa",]); tt$Species=as.character(tt$Species); tt}) } +# precision powers of 10^(-n), #4461 +test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) + diff --git a/src/fread.c b/src/fread.c index 7c7836250a..e0a32d3e14 100644 --- a/src/fread.c +++ b/src/fread.c @@ -652,8 +652,8 @@ static void StrtoI64(FieldParseContext *ctx) // TODO: review ERANGE checks and tests; that range outside [1.7e-308,1.7e+308] coerces to [0.0,Inf] /* f = "~/data.table/src/freadLookups.h" -cat("const long double pow10lookup[601] = {\n", file=f, append=FALSE) -for (i in (-300):(299)) cat("1.0E",i,"L,\n", sep="", file=f, append=TRUE) +cat("const long double pow10lookup[301] = {\n", file=f, append=FALSE) +for (i in 0:299) cat("1.0E",i,"L,\n", sep="", file=f, append=TRUE) cat("1.0E300L\n};\n", file=f, append=TRUE) */ @@ -780,12 +780,13 @@ static void parse_double_regular_core(const char **pch, double *target) // fail to be encoded by the compiler, even though the values can actually // be stored correctly. int_fast8_t extra = e < 0 ? e + 300 : e - 300; - r *= pow10lookup[extra + 300]; + r = extra<0 ? r/pow10lookup[-extra] : r*pow10lookup[extra]; e -= extra; } - e += 300; // lookup table is arranged from -300 (0) to +300 (600) - r *= pow10lookup[e]; + // pow10lookup[301] contains 10^(0:300). Storing negative powers there too + // avoids this ternary but is slightly less accurate in some cases, #4461 + r = e < 0 ? r/pow10lookup[-e] : r*pow10lookup[e]; *target = (double)(neg? 
-r : r); *pch = ch; return; diff --git a/src/fread.h b/src/fread.h index c0e9669d01..446da18e4b 100644 --- a/src/fread.h +++ b/src/fread.h @@ -37,7 +37,7 @@ typedef enum { extern int8_t typeSize[NUMTYPE]; extern const char typeName[NUMTYPE][10]; -extern const long double pow10lookup[601]; +extern const long double pow10lookup[301]; extern const uint8_t hexdigits[256]; diff --git a/src/freadLookups.h b/src/freadLookups.h index 80c4861014..103d644da4 100644 --- a/src/freadLookups.h +++ b/src/freadLookups.h @@ -142,307 +142,8 @@ const int32_t cumDaysCycleYears[401] = { 9496, 9862, 10227, 10592, 146097// total days in 400 years }; -const long double pow10lookup[601] = { -1.0E-300L, -1.0E-299L, -1.0E-298L, -1.0E-297L, -1.0E-296L, -1.0E-295L, -1.0E-294L, -1.0E-293L, -1.0E-292L, -1.0E-291L, -1.0E-290L, -1.0E-289L, -1.0E-288L, -1.0E-287L, -1.0E-286L, -1.0E-285L, -1.0E-284L, -1.0E-283L, -1.0E-282L, -1.0E-281L, -1.0E-280L, -1.0E-279L, -1.0E-278L, -1.0E-277L, -1.0E-276L, -1.0E-275L, -1.0E-274L, -1.0E-273L, -1.0E-272L, -1.0E-271L, -1.0E-270L, -1.0E-269L, -1.0E-268L, -1.0E-267L, -1.0E-266L, -1.0E-265L, -1.0E-264L, -1.0E-263L, -1.0E-262L, -1.0E-261L, -1.0E-260L, -1.0E-259L, -1.0E-258L, -1.0E-257L, -1.0E-256L, -1.0E-255L, -1.0E-254L, -1.0E-253L, -1.0E-252L, -1.0E-251L, -1.0E-250L, -1.0E-249L, -1.0E-248L, -1.0E-247L, -1.0E-246L, -1.0E-245L, -1.0E-244L, -1.0E-243L, -1.0E-242L, -1.0E-241L, -1.0E-240L, -1.0E-239L, -1.0E-238L, -1.0E-237L, -1.0E-236L, -1.0E-235L, -1.0E-234L, -1.0E-233L, -1.0E-232L, -1.0E-231L, -1.0E-230L, -1.0E-229L, -1.0E-228L, -1.0E-227L, -1.0E-226L, -1.0E-225L, -1.0E-224L, -1.0E-223L, -1.0E-222L, -1.0E-221L, -1.0E-220L, -1.0E-219L, -1.0E-218L, -1.0E-217L, -1.0E-216L, -1.0E-215L, -1.0E-214L, -1.0E-213L, -1.0E-212L, -1.0E-211L, -1.0E-210L, -1.0E-209L, -1.0E-208L, -1.0E-207L, -1.0E-206L, -1.0E-205L, -1.0E-204L, -1.0E-203L, -1.0E-202L, -1.0E-201L, -1.0E-200L, -1.0E-199L, -1.0E-198L, -1.0E-197L, -1.0E-196L, -1.0E-195L, -1.0E-194L, -1.0E-193L, -1.0E-192L, -1.0E-191L, -1.0E-190L, -1.0E-189L, -1.0E-188L, -1.0E-187L, -1.0E-186L, -1.0E-185L, -1.0E-184L, -1.0E-183L, -1.0E-182L, -1.0E-181L, -1.0E-180L, -1.0E-179L, -1.0E-178L, -1.0E-177L, -1.0E-176L, -1.0E-175L, -1.0E-174L, -1.0E-173L, -1.0E-172L, -1.0E-171L, -1.0E-170L, -1.0E-169L, -1.0E-168L, -1.0E-167L, -1.0E-166L, -1.0E-165L, -1.0E-164L, -1.0E-163L, -1.0E-162L, -1.0E-161L, -1.0E-160L, -1.0E-159L, -1.0E-158L, -1.0E-157L, -1.0E-156L, -1.0E-155L, -1.0E-154L, -1.0E-153L, -1.0E-152L, -1.0E-151L, -1.0E-150L, -1.0E-149L, -1.0E-148L, -1.0E-147L, -1.0E-146L, -1.0E-145L, -1.0E-144L, -1.0E-143L, -1.0E-142L, -1.0E-141L, -1.0E-140L, -1.0E-139L, -1.0E-138L, -1.0E-137L, -1.0E-136L, -1.0E-135L, -1.0E-134L, -1.0E-133L, -1.0E-132L, -1.0E-131L, -1.0E-130L, -1.0E-129L, -1.0E-128L, -1.0E-127L, -1.0E-126L, -1.0E-125L, -1.0E-124L, -1.0E-123L, -1.0E-122L, -1.0E-121L, -1.0E-120L, -1.0E-119L, -1.0E-118L, -1.0E-117L, -1.0E-116L, -1.0E-115L, -1.0E-114L, -1.0E-113L, -1.0E-112L, -1.0E-111L, -1.0E-110L, -1.0E-109L, -1.0E-108L, -1.0E-107L, -1.0E-106L, -1.0E-105L, -1.0E-104L, -1.0E-103L, -1.0E-102L, -1.0E-101L, -1.0E-100L, -1.0E-99L, -1.0E-98L, -1.0E-97L, -1.0E-96L, -1.0E-95L, -1.0E-94L, -1.0E-93L, -1.0E-92L, -1.0E-91L, -1.0E-90L, -1.0E-89L, -1.0E-88L, -1.0E-87L, -1.0E-86L, -1.0E-85L, -1.0E-84L, -1.0E-83L, -1.0E-82L, -1.0E-81L, -1.0E-80L, -1.0E-79L, -1.0E-78L, -1.0E-77L, -1.0E-76L, -1.0E-75L, -1.0E-74L, -1.0E-73L, -1.0E-72L, -1.0E-71L, -1.0E-70L, -1.0E-69L, -1.0E-68L, -1.0E-67L, -1.0E-66L, -1.0E-65L, -1.0E-64L, -1.0E-63L, -1.0E-62L, -1.0E-61L, -1.0E-60L, -1.0E-59L, -1.0E-58L, 
-1.0E-57L, -1.0E-56L, -1.0E-55L, -1.0E-54L, -1.0E-53L, -1.0E-52L, -1.0E-51L, -1.0E-50L, -1.0E-49L, -1.0E-48L, -1.0E-47L, -1.0E-46L, -1.0E-45L, -1.0E-44L, -1.0E-43L, -1.0E-42L, -1.0E-41L, -1.0E-40L, -1.0E-39L, -1.0E-38L, -1.0E-37L, -1.0E-36L, -1.0E-35L, -1.0E-34L, -1.0E-33L, -1.0E-32L, -1.0E-31L, -1.0E-30L, -1.0E-29L, -1.0E-28L, -1.0E-27L, -1.0E-26L, -1.0E-25L, -1.0E-24L, -1.0E-23L, -1.0E-22L, -1.0E-21L, -1.0E-20L, -1.0E-19L, -1.0E-18L, -1.0E-17L, -1.0E-16L, -1.0E-15L, -1.0E-14L, -1.0E-13L, -1.0E-12L, -1.0E-11L, -1.0E-10L, -1.0E-9L, -1.0E-8L, -1.0E-7L, -1.0E-6L, -1.0E-5L, -1.0E-4L, -1.0E-3L, -1.0E-2L, -1.0E-1L, + +const long double pow10lookup[301] = { 1.0E0L, 1.0E1L, 1.0E2L, From 6b0c45bba96c2b796d174d3a16dfa1424b4f9f5c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 27 Aug 2021 11:55:34 -0600 Subject: [PATCH 383/588] news-only: more detail for #4463 --- NEWS.md | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 6a7c3ba948..621c8966bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -295,7 +295,33 @@ 39. `DT[i, sum(b), by=grp]` (and other optimized-by-group aggregates: `mean`, `var`, `sd`, `median`, `prod`, `min`, `max`, `first`, `last`, `head` and `tail`) could segfault if `i` contained row numbers and one or more were NA, [#1994](https://github.com/Rdatatable/data.table/issues/1994). Thanks to Arun Srinivasan for reporting, and Benjamin Schwendinger for the PR. -40. `identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)` is now TRUE, [#4461](https://github.com/Rdatatable/data.table/issues/4461). `fread` was using `*10^-n` rather than `/10^n` resulting in `0.80606673659999994` vs `0.80606673660000006`. `fread()` now matches R's parser and `read.table` identically in this respect. Thanks to Gabe Becker for requesting consistency, and Michael Chirico for the PR. +40. `identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)` is now TRUE, [#4461](https://github.com/Rdatatable/data.table/issues/4461). This is one of 13 numbers in the set of 100,000 between 0.80606 and 0.80607 in 0.0000000001 increments that were not already identical. In all 13 cases R's parser (same as `read.table`) vs `fread` straddled the true number by the same amount. `fread` now uses `/10^n` rather than `*10^-n` to match R identically in all cases. Thanks to Gabe Becker for requesting consistency, and Michael Chirico for the PR. 
+ + ```R + for (i in 0:99999) { + s = sprintf("0.80606%05d", i) + r = eval(parse(text=s)) + f = fread(text=paste0("A\n",s,"\n"))$A + if (!identical(r, f)) + cat(s, sprintf("%1.17f", c(r, f, r)), "\n") + } + input eval & read.table fread before fread now + 0.8060603509 0.80606035089999994 0.80606035090000006 0.80606035089999994 + 0.8060614740 0.80606147399999994 0.80606147400000006 0.80606147399999994 + 0.8060623757 0.80606237569999994 0.80606237570000006 0.80606237569999994 + 0.8060629084 0.80606290839999994 0.80606290840000006 0.80606290839999994 + 0.8060632774 0.80606327739999994 0.80606327740000006 0.80606327739999994 + 0.8060638101 0.80606381009999994 0.80606381010000006 0.80606381009999994 + 0.8060647118 0.80606471179999994 0.80606471180000006 0.80606471179999994 + 0.8060658349 0.80606583489999994 0.80606583490000006 0.80606583489999994 + 0.8060667366 0.80606673659999994 0.80606673660000006 0.80606673659999994 + 0.8060672693 0.80606726929999994 0.80606726930000006 0.80606726929999994 + 0.8060676383 0.80606763829999994 0.80606763830000006 0.80606763829999994 + 0.8060681710 0.80606817099999994 0.80606817100000006 0.80606817099999994 + 0.8060690727 0.80606907269999994 0.80606907270000006 0.80606907269999994 + + # remaining 99,987 out of 100,000 were already identical + ``` ## NOTES From 490d46028e08cb38ad96c2c5aa672ca8de6b49f2 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 27 Aug 2021 12:20:10 -0600 Subject: [PATCH 384/588] news-only: tweak news item detail for #4463 --- NEWS.md | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index 621c8966bf..8824204aa7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -114,7 +114,7 @@ ```R mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` - + When `data.table` queries (either `[...]` or `|> DT(...)`) receive a `data.table`, the operations maintain `data.table`'s attributes such as its key and any indices. For example, if a `data.table` is reordered by `data.table`, or a key column has a value changed by `:=` in `data.table`, its key and indices will either be dropped or reordered appropriately. Some `data.table` operations automatically add and store an index on a `data.table` for reuse in future queries, if `options(datatable.auto.index=TRUE)`, which is `TRUE` by default. `data.table`'s are also over-allocated, which means there are spare column pointer slots allocated in advance so that a `data.table` in the `.GlobalEnv` can have a column added to it truly by reference, like an in-memory database with multiple client sessions connecting to one server R process, as a `data.table` video has shown in the past. But because R and other packages don't maintain `data.table`'s attributes or over-allocation (e.g. a subset or reorder by R or another package will create invalid `data.table` attributes) `data.table` cannot use these attributes when it detects that base R or another package has touched the `data.table` in the meantime, even if the attributes may sometimes still be valid. So, please realize that, `DT()` on a `data.table` should realize better speed and memory usage than `DT()` on a `data.frame`. `DT()` on a `data.frame` may still be useful to use `data.table`'s syntax (e.g. sub-queries within group: `|> DT(i, .SD[sub-query], by=grp)`) without needing to convert to a `data.table` first. 23. 
`DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. @@ -295,32 +295,32 @@ 39. `DT[i, sum(b), by=grp]` (and other optimized-by-group aggregates: `mean`, `var`, `sd`, `median`, `prod`, `min`, `max`, `first`, `last`, `head` and `tail`) could segfault if `i` contained row numbers and one or more were NA, [#1994](https://github.com/Rdatatable/data.table/issues/1994). Thanks to Arun Srinivasan for reporting, and Benjamin Schwendinger for the PR. -40. `identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)` is now TRUE, [#4461](https://github.com/Rdatatable/data.table/issues/4461). This is one of 13 numbers in the set of 100,000 between 0.80606 and 0.80607 in 0.0000000001 increments that were not already identical. In all 13 cases R's parser (same as `read.table`) vs `fread` straddled the true number by the same amount. `fread` now uses `/10^n` rather than `*10^-n` to match R identically in all cases. Thanks to Gabe Becker for requesting consistency, and Michael Chirico for the PR. +40. `identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)` is now TRUE, [#4461](https://github.com/Rdatatable/data.table/issues/4461). This is one of 13 numbers in the set of 100,000 between 0.80606 and 0.80607 in 0.0000000001 increments that were not already identical. In all 13 cases R's parser (same as `read.table`) and `fread` straddled the true value by a very similar small amount. `fread` now uses `/10^n` rather than `*10^-n` to match R identically in all cases. Thanks to Gabe Becker for requesting consistency, and Michael Chirico for the PR. 
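  Why the division form matches R: `10^n` (here `10^10`) is exactly representable as a double, so `mantissa/10^n` is a single correctly-rounded operation, whereas `10^-n` is not exactly representable, so `mantissa*10^-n` adds a second rounding that can land one bit away. A minimal sketch of that idea in plain double arithmetic (illustrative only; `fread`'s C code path uses its own long double `pow10lookup` table, as the patch above shows):

  ```R
  m = 8060667366                # the digits of 0.8060667366 as an exact integer mantissa
  sprintf("%.17f", m / 10^10)   # 10^10 is exact, so one correct rounding; matches R's parser
  sprintf("%.17f", m * 10^-10)  # 10^-10 is inexact, so the extra rounding can flip the last bit
  ```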
```R for (i in 0:99999) { s = sprintf("0.80606%05d", i) - r = eval(parse(text=s)) + r = eval(parse(text=s)) f = fread(text=paste0("A\n",s,"\n"))$A if (!identical(r, f)) - cat(s, sprintf("%1.17f", c(r, f, r)), "\n") + cat(s, sprintf("%1.18f", c(r, f, r)), "\n") } - input eval & read.table fread before fread now - 0.8060603509 0.80606035089999994 0.80606035090000006 0.80606035089999994 - 0.8060614740 0.80606147399999994 0.80606147400000006 0.80606147399999994 - 0.8060623757 0.80606237569999994 0.80606237570000006 0.80606237569999994 - 0.8060629084 0.80606290839999994 0.80606290840000006 0.80606290839999994 - 0.8060632774 0.80606327739999994 0.80606327740000006 0.80606327739999994 - 0.8060638101 0.80606381009999994 0.80606381010000006 0.80606381009999994 - 0.8060647118 0.80606471179999994 0.80606471180000006 0.80606471179999994 - 0.8060658349 0.80606583489999994 0.80606583490000006 0.80606583489999994 - 0.8060667366 0.80606673659999994 0.80606673660000006 0.80606673659999994 - 0.8060672693 0.80606726929999994 0.80606726930000006 0.80606726929999994 - 0.8060676383 0.80606763829999994 0.80606763830000006 0.80606763829999994 - 0.8060681710 0.80606817099999994 0.80606817100000006 0.80606817099999994 - 0.8060690727 0.80606907269999994 0.80606907270000006 0.80606907269999994 - - # remaining 99,987 out of 100,000 were already identical + # input eval & read.table fread before fread now + # 0.8060603509 0.806060350899999944 0.806060350900000055 0.806060350899999944 + # 0.8060614740 0.806061473999999945 0.806061474000000056 0.806061473999999945 + # 0.8060623757 0.806062375699999945 0.806062375700000056 0.806062375699999945 + # 0.8060629084 0.806062908399999944 0.806062908400000055 0.806062908399999944 + # 0.8060632774 0.806063277399999945 0.806063277400000056 0.806063277399999945 + # 0.8060638101 0.806063810099999944 0.806063810100000055 0.806063810099999944 + # 0.8060647118 0.806064711799999944 0.806064711800000055 0.806064711799999944 + # 0.8060658349 0.806065834899999945 0.806065834900000056 0.806065834899999945 + # 0.8060667366 0.806066736599999945 0.806066736600000056 0.806066736599999945 + # 0.8060672693 0.806067269299999944 0.806067269300000055 0.806067269299999944 + # 0.8060676383 0.806067638299999945 0.806067638300000056 0.806067638299999945 + # 0.8060681710 0.806068170999999944 0.806068171000000055 0.806068170999999944 + # 0.8060690727 0.806069072699999944 0.806069072700000055 0.806069072699999944 + # + # remaining 99,987 of these 100,000 were already identical ``` ## NOTES From a96e2b2cba8ddb7ec5abf8183553d60afc996ba0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 27 Aug 2021 17:11:26 -0600 Subject: [PATCH 385/588] added min.IDate and max.IDate (#5117) --- NAMESPACE | 2 ++ NEWS.md | 28 ++++++++++++++++++++++++++++ R/IDateTime.R | 7 ++++++- inst/tests/tests.Rraw | 14 +++++++++++++- 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 81c0fce689..ae54e95d11 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -187,6 +187,8 @@ S3method(seq, ITime) S3method(unique, IDate) S3method(unique, ITime) S3method('[<-', IDate) +S3method('min', IDate) +S3method('max', IDate) S3method(edit, data.table) # generics to support custom column formatters diff --git a/NEWS.md b/NEWS.md index 8824204aa7..a6fbba8fbf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -253,6 +253,34 @@ # no inconvenient warning ``` + On the same basis, `min` and `max` methods for empty `IDate` input now return `NA_integer_` of class `IDate`, rather than `NA_double_` of class `IDate` together with base R's warning 
`no non-missing arguments to min; returning Inf`, [#2256](https://github.com/Rdatatable/data.table/issues/2256]. The type change and warning would cause an error in grouping, see example below. Since `NA` was returned before it seems clear that still returning `NA` but of the correct type and with no warning is appropriate, backwards compatible, and a bug fix. Thanks to Frank Narf for reporting, and Matt Dowle for fixing. + + ```R + DT + # d g + # + # 1: 2020-01-01 a + # 2: 2020-01-02 a + # 3: 2019-12-31 b + + DT[, min(d[d>"2020-01-01"]), by=g] + + # was: + + # Error in `[.data.table`(DT, , min(d[d > "2020-01-01"]), by = g) : + # Column 1 of result for group 2 is type 'double' but expecting type 'integer'. Column types must be consistent for each group. + # In addition: Warning message: + # In min.default(integer(0), na.rm = FALSE) : + # no non-missing arguments to min; returning Inf + + # now : + + # g V1 + # + # 1: a 2020-01-02 + # 2: b + ``` + 36. `DT[, min(int64Col), by=grp]` (and `max`) would return incorrect results for `bit64::integer64` columns, [#4444](https://github.com/Rdatatable/data.table/issues/4444). Thanks to @go-see for reporting, and Michael Chirico for the PR. 37. `fread(dec=',')` was able to guess `sep=','` and return an incorrect result, [#4483](https://github.com/Rdatatable/data.table/issues/4483). Thanks to Michael Chirico for reporting and fixing. It was already an error to provide both `sep=','` and `dec=','` manually. diff --git a/R/IDateTime.R b/R/IDateTime.R index 42a6b289a6..33d04b87c4 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -71,6 +71,11 @@ unique.IDate = x } +# define min and max to avoid base R's Inf with warning on empty, #2256 +min.IDate = max.IDate = function(x, ...) { + as.IDate(if (!length(x)) NA else NextMethod()) +} + # fix for #1315 as.list.IDate = function(x, ...) NextMethod() @@ -304,7 +309,7 @@ clip_msec = function(secs, action) { stopf("Valid options for ms are 'truncate', 'nearest', and 'ceil'.") ) } - + ################################################################### # Date - time extraction functions # Adapted from Hadley Wickham's routines cited below to ensure diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1d39b2d817..e5a1fb04fc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3422,10 +3422,22 @@ test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the # Tests for #2531. 
`:=` loses POSIXct or ITime attribute: # first test from this SO post: http://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table +set.seed(1) dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] dt[, min.group.date := as.IDate(min(date)), by = group] -test(1084, class(dt$min.group.date), c("IDate", "Date")) +test(1084.1, class(dt$min.group.date), c("IDate", "Date")) + +# min.IDate on empty input NA, #2256 +# non-optimized grouping first: +test(1084.2, dt[, min(date[date>"1999-12-01"]), by=group], data.table(group=1:2, V1=as.IDate(c("1999-12-14",NA)))) +test(1084.3, dt[, max(date[date<"1997-08-01"]), by=group], data.table(group=1:2, V1=as.IDate(c(NA,"1997-07-19")))) +dt[group==2, date:=NA] # make group 2 an all-NA group +# GForce grouping with na.rm=FALSE|TRUE on the all-NA group +test(1084.4, dt[, min(date, na.rm=TRUE), by=group], data.table(group=1:2, V1=as.IDate(c("1997-12-06",NA)))) +test(1084.5, dt[, min(date), by=group], data.table(group=1:2, V1=as.IDate(c("1997-12-06",NA)))) +test(1084.6, dt[, max(date, na.rm=TRUE), by=group], data.table(group=1:2, V1=as.IDate(c("1999-12-14",NA)))) +test(1084.7, dt[, max(date), by=group], data.table(group=1:2, V1=as.IDate(c("1999-12-14",NA)))) dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] From 5c6bee3355fe6258e8e68e51d9b0e4b58a7d0bb9 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 27 Aug 2021 18:29:32 -0600 Subject: [PATCH 386/588] dcast(empty) returns empty (#5118) --- NEWS.md | 2 ++ R/fcast.R | 1 - inst/tests/tests.Rraw | 3 +-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index a6fbba8fbf..dcd11d7dc3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -350,6 +350,8 @@ # # remaining 99,987 of these 100,000 were already identical ``` + +41. `dcast(empty-DT)` now returns an empty `data.table` rather than error `Cannot cast an empty data.table`, [#1215](https://github.com/Rdatatable/data.table/issues/1215). Thanks to Damian Betebenner for reporting, and Matt Dowle for fixing. ## NOTES diff --git a/R/fcast.R b/R/fcast.R index 465ff665da..efe18cf72c 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -151,7 +151,6 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys? 
dat = .Call(CsubsetDT, dat, idx, seq_along(dat)) } - if (!nrow(dat) || !ncol(dat)) stopf("Can not cast an empty data.table") fun.call = m[["fun.aggregate"]] fill.default = NULL if (is.null(fun.call)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e5a1fb04fc..769703c7e5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13717,8 +13717,7 @@ test(1962.086, dcast(DT, a ~ a, drop = NA), DT = data.table(a = c(1, 1, 2, 2), b = list(1, 2, 3, 4), c = c(4, 4, 2, 2)) test(1962.087, dcast(DT, a ~ b, value.var = 'b'), error = 'Columns specified in formula can not be of type list') -test(1962.088, dcast(DT[0L, ], a ~ c, value.var = 'b'), - error = 'Can not cast an empty data.table') +test(1962.088, dcast(DT[0L, ], a ~ c, value.var = 'b'), data.table(a=numeric(), key="a")) #1215 test(1962.089, dcast(DT, a ~ c, value.var = 'b'), data.table(a = c(1, 2), `2` = c(0L, 2L), `4` = c(2L, 0L), key = 'a'), message = 'Aggregate function missing') From 42b492e1cbd46416bcd2599bbcede9a8d5a7d644 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 30 Aug 2021 23:21:21 +0200 Subject: [PATCH 387/588] fast droplevels.data.table (#5116) --- NAMESPACE | 2 ++ NEWS.md | 2 ++ R/fdroplevels.r | 25 ++++++++++++++++++++++ inst/tests/tests.Rraw | 12 +++++++++++ man/fdroplevels.rd | 48 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 89 insertions(+) create mode 100644 R/fdroplevels.r create mode 100644 man/fdroplevels.rd diff --git a/NAMESPACE b/NAMESPACE index ae54e95d11..260b7a7af4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -199,3 +199,5 @@ S3method(format_col, expression) export(format_list_item) S3method(format_list_item, default) +export(fdroplevels) +S3method(droplevels, data.table) diff --git a/NEWS.md b/NEWS.md index dcd11d7dc3..f6c834f07a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -139,6 +139,8 @@ 25 `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. +26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/fdroplevels.r b/R/fdroplevels.r new file mode 100644 index 0000000000..5c53ee42fb --- /dev/null +++ b/R/fdroplevels.r @@ -0,0 +1,25 @@ +# 647 fast droplevels.data.table method +fdroplevels = function(x, exclude = if (anyNA(levels(x))) NULL else NA, ...) 
{ + stopifnot(inherits(x, "factor")) + lev = which(tabulate(x, length(levels(x))) & (!match(levels(x), exclude, 0L))) + ans = match(as.integer(x), lev) + setattr(ans, 'levels', levels(x)[lev]) + setattr(ans, 'class', 'factor') + return(ans) +} + +droplevels.data.table = function(x, except = NULL, exclude, in.place = FALSE, ...){ + stopifnot(length(x) > 0L, is.logical(in.place)) + ix = vapply(x, is.factor, NA) + if(!is.null(except)){ + stopifnot(is.numeric(except), except <= length(x)) + ix[except] = FALSE + } + if(!sum(ix)) return(x) + if(!in.place) x = copy(x) + for(nx in names(ix)[ix==TRUE]){ + if (missing(exclude)) set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]])) + else set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]], exclude = exclude)) + } + return(x) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 769703c7e5..d4681d5471 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18121,3 +18121,15 @@ if (base::getRversion() >= "4.1.0") { # precision powers of 10^(-n), #4461 test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) +# droplevels.data.table method, and fdroplevels, #647 +x = factor(letters[1:10]) +DT = data.table(a = x)[1:5] +test(2214.1, fdroplevels(factor()), droplevels(factor())) +test(2214.2, fdroplevels(x[1:5]), droplevels(x[1:5])) +test(2214.3, fdroplevels(x[1:5], c("b", "d")), droplevels(x[1:5], c("b", "d"))) +test(2214.4, fdroplevels(x[1:5], letters[1:5]), droplevels(x[1:5], letters[1:5])) +test(2214.5, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) +test(2214.6, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c("b", "d"))) +test(2214.7, droplevels(DT, 1)[["a"]], x[1:5]) +test(2214.8, droplevels(DT, in.place=TRUE), DT) + diff --git a/man/fdroplevels.rd b/man/fdroplevels.rd new file mode 100644 index 0000000000..98334f0110 --- /dev/null +++ b/man/fdroplevels.rd @@ -0,0 +1,48 @@ +\name{fdroplevels} +\alias{fdroplevels} +\alias{droplevels} +\alias{droplevels.data.table} +\title{Fast droplevels} +\description{ + Similar to \code{base::droplevels} but \emph{much faster}. +} + +\usage{ +fdroplevels(x, exclude = if (anyNA(levels(x))) NULL else NA, \dots) + +\method{droplevels}{data.table}(x, except = NULL, exclude, in.place = FALSE, \dots) +} +\arguments{ + \item{x}{ \code{factor} or \code{data.table} where unused levels should be dropped. } + \item{exclude}{ A \code{character} vector of factor levels which are dropped no matter of presented or not. } + \item{except}{ An \code{integer} vector of indices of data.table columns which are not modified by dropping levels. } + \item{in.place}{ logical (default is \code{FALSE}). If \code{TRUE} levels of factors of \code{data.table} are modified in-place. } + \item{\dots}{ further arguments passed to methods } +} + +\value{ + \code{fdroplevels} returns a \code{factor}. + + \code{droplevels} returns a \code{data.table} where levels are dropped at factor columns. 
+} + +\examples{ +# on vectors +x = factor(letters[1:10]) +fdroplevels(x[1:5]) +# exclude levels from drop +fdroplevels(x[1:5], exclude = c("a", "c")) + +# on data.table +DT = data.table(a = factor(1:10), b = factor(letters[1:10])) +droplevels(head(DT))[["b"]] +# exclude levels +droplevels(head(DT), exclude = c("b", "c"))[["b"]] +# except columns from drop +droplevels(head(DT), except = 2)[["b"]] +droplevels(head(DT), except = 1)[["b"]] +} +\seealso{ + \code{\link{data.table}}, \code{\link{duplicated}}, \code{\link{unique}} +} +\keyword{ data } From 9ec150a1abbce3763bf3d674325b046c5e0f134c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 30 Aug 2021 16:35:58 -0600 Subject: [PATCH 388/588] DT[fctr] now works (#5120) --- NEWS.md | 4 +++- R/data.table.R | 2 +- inst/tests/tests.Rraw | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index f6c834f07a..7e6c71f9a4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -352,9 +352,11 @@ # # remaining 99,987 of these 100,000 were already identical ``` - + 41. `dcast(empty-DT)` now returns an empty `data.table` rather than error `Cannot cast an empty data.table`, [#1215](https://github.com/Rdatatable/data.table/issues/1215). Thanks to Damian Betebenner for reporting, and Matt Dowle for fixing. +42. `DT[factor("id")]` now works rather than error `i has evaluated to type integer. Expecting logical, integer or double`, [#1632](https://github.com/Rdatatable/data.table/issues/1632). `DT["id"]` has worked forever by automatically converting to `DT[.("id")]` for convenience, and joins have worked forever between char/fact, fact/char and fact/fact even when levels mismatch, so it was unfortunate that `DT[factor("id")]` managed to escape the simple automatic conversion to `DT[.(factor("id"))]` which is now in place. Thanks to @aushev for reporting, and Matt Dowle for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/data.table.R b/R/data.table.R index d70e677615..87f6eef10e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -436,7 +436,7 @@ replace_dot_alias = function(e) { } } if (is.null(i)) return( null.data.table() ) - if (is.character(i)) { + if (is.character(i) || is.factor(i)) { isnull_inames = TRUE i = data.table(V1=i) # for user convenience; e.g. 
DT["foo"] without needing DT[.("foo")] } else if (identical(class(i),"list") && length(i)==1L && is.data.frame(i[[1L]])) { i = as.data.table(i[[1L]]) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d4681d5471..5c83983a8e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18133,3 +18133,9 @@ test(2214.6, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c test(2214.7, droplevels(DT, 1)[["a"]], x[1:5]) test(2214.8, droplevels(DT, in.place=TRUE), DT) +# factor i should be just like character i and work, #1632 +DT = data.table(A=letters[1:3], B=4:6, key="A") +test(2215.1, DT["b", B], 5L) # has worked forever +test(2215.2, DT[factor("b"), B], 5L) # now works too, joining fact/fact, char/fact and fact/char have plenty of tests + + From 1a6bcdc321be9559755a59bf1b5030fb0fdd8819 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 30 Aug 2021 16:57:21 -0600 Subject: [PATCH 389/588] #5116: pass R 3.1.0 --- inst/tests/tests.Rraw | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5c83983a8e..9af8653531 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18126,10 +18126,13 @@ x = factor(letters[1:10]) DT = data.table(a = x)[1:5] test(2214.1, fdroplevels(factor()), droplevels(factor())) test(2214.2, fdroplevels(x[1:5]), droplevels(x[1:5])) -test(2214.3, fdroplevels(x[1:5], c("b", "d")), droplevels(x[1:5], c("b", "d"))) -test(2214.4, fdroplevels(x[1:5], letters[1:5]), droplevels(x[1:5], letters[1:5])) -test(2214.5, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) -test(2214.6, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c("b", "d"))) +if (base::getRversion() >= "3.4.0") { + # bug fix in R 3.4.0: "droplevels(f) now keeps levels when present." + test(2214.3, fdroplevels(x[1:5], c("b", "d")), droplevels(x[1:5], c("b", "d"))) + test(2214.4, fdroplevels(x[1:5], letters[1:5]), droplevels(x[1:5], letters[1:5])) + test(2214.5, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c("b", "d"))) +} +test(2214.6, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) test(2214.7, droplevels(DT, 1)[["a"]], x[1:5]) test(2214.8, droplevels(DT, in.place=TRUE), DT) From 891fba90531b13a6bfb8907958fa69b19a661274 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 30 Aug 2021 17:00:40 -0600 Subject: [PATCH 390/588] #5116: .r to .R --- R/{fdroplevels.r => fdroplevels.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{fdroplevels.r => fdroplevels.R} (100%) diff --git a/R/fdroplevels.r b/R/fdroplevels.R similarity index 100% rename from R/fdroplevels.r rename to R/fdroplevels.R From 01921723d285e6b83bccfddfb8cb6ad05e5b12dc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 30 Aug 2021 17:02:30 -0600 Subject: [PATCH 391/588] #5116: .rd to .Rd --- man/{fdroplevels.rd => fdroplevels.Rd} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename man/{fdroplevels.rd => fdroplevels.Rd} (100%) diff --git a/man/fdroplevels.rd b/man/fdroplevels.Rd similarity index 100% rename from man/fdroplevels.rd rename to man/fdroplevels.Rd From 7f0ce147eef1dd5cd5ff05dffc3c72f472bcde51 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 30 Aug 2021 23:43:10 -0600 Subject: [PATCH 392/588] NEWS-only: formatting --- NEWS.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7e6c71f9a4..30866667b9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -137,7 +137,7 @@ 24. 
`DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. -25 `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. +25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. 26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. @@ -255,7 +255,7 @@ # no inconvenient warning ``` - On the same basis, `min` and `max` methods for empty `IDate` input now return `NA_integer_` of class `IDate`, rather than `NA_double_` of class `IDate` together with base R's warning `no non-missing arguments to min; returning Inf`, [#2256](https://github.com/Rdatatable/data.table/issues/2256]. The type change and warning would cause an error in grouping, see example below. Since `NA` was returned before it seems clear that still returning `NA` but of the correct type and with no warning is appropriate, backwards compatible, and a bug fix. Thanks to Frank Narf for reporting, and Matt Dowle for fixing. + On the same basis, `min` and `max` methods for empty `IDate` input now return `NA_integer_` of class `IDate`, rather than `NA_double_` of class `IDate` together with base R's warning `no non-missing arguments to min; returning Inf`, [#2256](https://github.com/Rdatatable/data.table/issues/2256). The type change and warning would cause an error in grouping, see example below. Since `NA` was returned before it seems clear that still returning `NA` but of the correct type and with no warning is appropriate, backwards compatible, and a bug fix. Thanks to Frank Narf for reporting, and Matt Dowle for fixing. ```R DT @@ -270,7 +270,8 @@ # was: # Error in `[.data.table`(DT, , min(d[d > "2020-01-01"]), by = g) : - # Column 1 of result for group 2 is type 'double' but expecting type 'integer'. Column types must be consistent for each group. + # Column 1 of result for group 2 is type 'double' but expecting type + # 'integer'. Column types must be consistent for each group. # In addition: Warning message: # In min.default(integer(0), na.rm = FALSE) : # no non-missing arguments to min; returning Inf @@ -398,7 +399,10 @@ 14. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : ``` - The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option. + The option 'datatable.nomatch' is being used and is not set to the default NA. This option + is still honored for now but will be deprecated in future. 
Please see NEWS for 1.12.4 for + detailed information and motivation. To specify inner join, please specify `nomatch=NULL` + explicitly in your calls rather than changing the default using this option. ``` The message is now upgraded to warning that the option is now ignored. From 21f8f05e14f1127da330be8e21d5081a7bc73fb4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 2 Sep 2021 22:38:43 +0200 Subject: [PATCH 393/588] fix contributor name (#5123) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index fea7936d52..3f2fd8936d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -61,7 +61,7 @@ Authors@R: c( person("Vaclav","Tlapak", role="ctb"), person("Kevin","Ushey", role="ctb"), person("Dirk","Eddelbuettel", role="ctb"), - person("Ben","Schwen", role="ctb"), + person("Benjamin","Schwendinger", role="ctb"), person("Tony","Fischetti", role="ctb"), person("Ofek","Shilon", role="ctb"), person("Vadim","Khotilovich", role="ctb"), From b7c43cd75a08cd769f5b9d1a220df35b47f77f47 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 7 Sep 2021 13:46:52 -0600 Subject: [PATCH 394/588] .dev-only: NA status on CRAN to account for --- .dev/revdep.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index da90f0c66e..b8c17cc65d 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -269,11 +269,11 @@ cran = function() # reports CRAN status of the .cran.fail packages } require(data.table) p = proc.time() - db = setDT(tools::CRAN_check_results()) + db <<- setDT(tools::CRAN_check_results()) cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") ans = db[Package %chin% .fail.cran, - .(ERROR=sum(Status=="ERROR"), - WARN =sum(Status=="WARN"), + .(ERROR=sum(Status=="ERROR", na.rm=TRUE), + WARN =sum(Status=="WARN", na.rm=TRUE), cran =paste(unique(Version),collapse=";"), local=as.character(packageVersion(.BY[[1]]))), keyby=Package] From b82eb68bb8adaf27e5104222dd0480ea6b7fbd9f Mon Sep 17 00:00:00 2001 From: Marco Colombo Date: Wed, 22 Sep 2021 18:36:44 +0200 Subject: [PATCH 395/588] Clarify comments in special-symbols help. 
(#5166) --- man/special-symbols.Rd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 9bfa72fceb..1f4e1615c0 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -43,10 +43,10 @@ X DT[.N] # last row, only special symbol allowed in 'i' DT[, .N] # total number of rows in DT DT[, .N, by=x] # number of rows in each group -DT[, .SD, .SDcols=x:y] # select columns 'x' and 'y' +DT[, .SD, .SDcols=x:y] # select columns 'x' through 'y' DT[, .SD[1]] # first row of all columns -DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x' -DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group +DT[, .SD[1], by=x] # first row of all columns for each group in 'x' +DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum all columns by group DT[, .I[1], by=x] # row number in DT corresponding to each group DT[, .N, by=rleid(v)] # get count of consecutive runs of 'v' DT[, c(.(y=max(y)), lapply(.SD, min)), From 1e32776ddf557d49a7882d901f5917af46fe45a1 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 22 Sep 2021 23:08:33 +0200 Subject: [PATCH 396/588] Segfault of is.sorted on list/vector containing only NA_character_ (#5170) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 9 +++++++++ src/forder.c | 1 + 3 files changed, 12 insertions(+) diff --git a/NEWS.md b/NEWS.md index 30866667b9..e14e59b590 100644 --- a/NEWS.md +++ b/NEWS.md @@ -358,6 +358,8 @@ 42. `DT[factor("id")]` now works rather than error `i has evaluated to type integer. Expecting logical, integer or double`, [#1632](https://github.com/Rdatatable/data.table/issues/1632). `DT["id"]` has worked forever by automatically converting to `DT[.("id")]` for convenience, and joins have worked forever between char/fact, fact/char and fact/fact even when levels mismatch, so it was unfortunate that `DT[factor("id")]` managed to escape the simple automatic conversion to `DT[.(factor("id"))]` which is now in place. Thanks to @aushev for reporting, and Matt Dowle for the fix. +43. All-NA character key columns could segfault, [#5070](https://github.com/Rdatatable/data.table/issues/5070). Thanks to @JorisChau for reporting and Benjamin Schwendinger for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9af8653531..f921ed0a40 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4155,6 +4155,10 @@ DT = data.table(A=c(utf8_strings, latin1_strings), B=1:4) test(1162.21, is.sorted(DT), FALSE) setkey(DT) test(1162.22, is.sorted(DT), TRUE) +# Issue #5070 +DT = data.table(x2 = rep(NA_character_, 2)) +test(1162.23, is.sorted(DT)) +test(1162.24, is.sorted(rep(NA_character_, 2))) # FR #351 - last on length=0 arguments x <- character(0) @@ -18141,4 +18145,9 @@ DT = data.table(A=letters[1:3], B=4:6, key="A") test(2215.1, DT["b", B], 5L) # has worked forever test(2215.2, DT[factor("b"), B], 5L) # now works too, joining fact/fact, char/fact and fact/char have plenty of tests +# segfault on merge keyed all-NA_character_ due to is.sorted, #5070 +DT1 = data.table(x1 = rep(letters[1:4], each=3), x2=NA_character_, key="x2") +DT2 = data.table(x1 = letters[1:3]) +test(2216.1, DT1[DT2, on="x1"][,.(x1,x2)], DT1[1:9]) # segfault in v1.14.0 +test(2216.2, merge(DT1, DT2, by="x1")[,.(x1,x2)], setkey(DT1[1:9], x1)) # ok before but included for completeness verbatim from issue diff --git a/src/forder.c b/src/forder.c index a2ddf022a6..6e8a77ecf5 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1290,6 +1290,7 @@ SEXP issorted(SEXP x, SEXP by) SEXP *xd = STRING_PTR(x); i = 0; while (i Date: Wed, 22 Sep 2021 18:10:31 -0600 Subject: [PATCH 397/588] clang 13 omp (#5172) --- DESCRIPTION | 2 +- NEWS.md | 9 ++++++++- src/data.table.h | 2 +- src/frollR.c | 2 +- src/init.c | 2 +- src/types.c | 2 +- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3f2fd8936d..5e412260b4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.14.1 +Version: 1.14.3 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/NEWS.md b/NEWS.md index e14e59b590..dbc4571bad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.14.1](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.3](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES @@ -410,6 +410,13 @@ The message is now upgraded to warning that the option is now ignored. +# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) + +## NOTES + +1. clang 13.0.0 (Sep 2021) requires the system header `omp.h` to be included before R's headers, [#5122](https://github.com/Rdatatable/data.table/issues/5122). Many thanks to Prof Ripley for testing and providing a patch file. 
+ + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) ## POTENTIALLY BREAKING CHANGES diff --git a/src/data.table.h b/src/data.table.h index 2ba639a64a..6bafdc52af 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,3 +1,4 @@ +#include "myomp.h" // first for clang-13-omp, #5122 #include "dt_stdio.h" // PRId64 and PRIu64 #include #include @@ -10,7 +11,6 @@ #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT #include // for uint64_t rather than unsigned long long #include -#include "myomp.h" #include "types.h" #include "po.h" #ifdef WIN32 // positional specifiers (%n$) used in translations; #4402 diff --git a/src/frollR.c b/src/frollR.c index 644b863439..74cc7dd4ef 100644 --- a/src/frollR.c +++ b/src/frollR.c @@ -1,5 +1,5 @@ +#include "data.table.h" // first (before Rdefines.h) for clang-13-omp, #5122 #include -#include "data.table.h" SEXP coerceToRealListR(SEXP obj) { // accept atomic/list of integer/logical/real returns list of real diff --git a/src/init.c b/src/init.c index 56bf66d419..0c1774508b 100644 --- a/src/init.c +++ b/src/init.c @@ -425,6 +425,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.14.1"))); + return(ScalarString(mkChar("1.14.3"))); } diff --git a/src/types.c b/src/types.c index 18f1993dc2..6e9020bb59 100644 --- a/src/types.c +++ b/src/types.c @@ -1,5 +1,5 @@ +#include "data.table.h" // first (before Rdefines.h) for clang-13-omp, #5122 #include -#include "data.table.h" /* * find end of a string, used to append verbose messages or warnings From e78363f9f9a95132a67ecf8938b9304039ed108e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Sep 2021 19:24:11 -0600 Subject: [PATCH 398/588] as.character(hexmode) -> format(hexmode) in test to pass rdevel (#5174) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f921ed0a40..175149f3e1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1902,7 +1902,7 @@ test(640, capture.output(print(DT,class=FALSE)), c(" a b c","1: 8 xy DT=data.table(a=letters,b=1:26) test(641, tail(capture.output(print(DT[1:20], class=FALSE)),2), c("19: s 19","20: t 20")) test(642, tail(capture.output(print(DT[1:21], class=FALSE, nrows=100)),2), c("21: u 21"," a b")) -DT=data.table(a=as.character(as.hexmode(1:500)), b=1:500) +DT=data.table(a=format(as.hexmode(1:500)), b=1:500) test(643, capture.output(print(DT, class=FALSE)), c(" a b"," 1: 001 1"," 2: 002 2"," 3: 003 3"," 4: 004 4"," 5: 005 5"," --- ","496: 1f0 496","497: 1f1 497","498: 1f2 498","499: 1f3 499","500: 1f4 500")) # Test inconsistent length of columns error. 
From a3398949a9a0f687e61a6988312d19285b14721a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Sep 2021 19:34:01 -0600 Subject: [PATCH 399/588] another as.character(hexmode)->format(hexmode) in test to pass rdevel (#5175) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 175149f3e1..5b880c9434 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1082,7 +1082,7 @@ test(353, DT[2,f:="c"], data.table(f=factor(c("a","c","a","b")),x=1:4)) test(354, DT[3,f:=factor("foo")], data.table(f=factor(c("a","c","foo","b")),x=1:4)) # Test growVector logic when adding levels (don't need to grow levels for character cols) -newlevels = as.character(as.hexmode(1:2000)) +newlevels = format(as.hexmode(1:2000)) DT = data.table(f=factor("000"),x=1:2010) test(355, DT[11:2010,f:=newlevels], data.table(f=factor(c(rep("000",10),newlevels)),x=1:2010)) From 0437a551daeff888b55f2d9b636973a99f0e35c6 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 22 Sep 2021 22:04:07 -0600 Subject: [PATCH 400/588] URL-follows from --as-cran testing of 1.14.2 --- Makefile | 6 +++--- NEWS.md | 4 ++-- README.md | 2 +- vignettes/datatable-faq.Rmd | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 2be00d3b74..50a919440e 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.1.tar.gz + $(RM) data.table_1.14.3.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.1.tar.gz + $(R) CMD INSTALL data.table_1.14.3.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.1.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.3.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index dbc4571bad..1e3eace743 100644 --- a/NEWS.md +++ b/NEWS.md @@ -423,7 +423,7 @@ 1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. 
- At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://www.rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. @@ -1321,7 +1321,7 @@ has a better chance of working on Mac. ## NOTES -1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. +1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). 
This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. 2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. diff --git a/README.md b/README.md index 3764230531..5b3a7d38a1 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/kayjdh5qtgymhoxr/branch/master?svg=true)](https://ci.appveyor.com/project/Rdatatable/data-table) [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) -[![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/pipelines) +[![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/-/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) [![CRAN usage](https://jangorecki.gitlab.io/rdeps/data.table/CRAN_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) [![BioC usage](https://jangorecki.gitlab.io/rdeps/data.table/BioC_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index f66f9611f1..cf358724ba 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -98,7 +98,7 @@ We _have_ proposed enhancements to R wherever possible, too. One of these was ac > `unique()` and `match()` are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c. -A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://r.789695.n4.nabble.com/suggestion-how-to-use-memcpy-in-duplicate-c-td2019184.html). +A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://stat.ethz.ch/pipermail/r-devel/2010-April/057249.html). 
A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 : From 8041e489d94744665c7117b3cd15042ff7c8e16c Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 Sep 2021 15:09:17 -0600 Subject: [PATCH 401/588] another URL-follow --- vignettes/datatable-intro.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index c5da5d87d8..3a5eda34cd 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -38,7 +38,7 @@ Briefly, if you are interested in reducing *programming* and *compute* time trem ## Data {#data} -In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transporation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. +In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transporation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/tidyverse/nycflights13)). The data is available only for Jan-Oct'14. We can use `data.table`'s fast-and-friendly file reader `fread` to load `flights` directly as follows: From 36a4dab2f429beac735f519b57f7c77f0f2287fa Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 24 Sep 2021 13:10:44 -0600 Subject: [PATCH 402/588] DT() mimics calling [.data.table in that frame (#5176) --- NAMESPACE | 1 + R/data.table.R | 6 +++- inst/tests/tests.Rraw | 64 +++++++++++++++++++++++++------------------ 3 files changed, 43 insertions(+), 28 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 260b7a7af4..157d39a081 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -60,6 +60,7 @@ export(substitute2) export(DT) # mtcars |> DT(i,j,by) #4872 S3method("[", data.table) +export("[.data.table") # so that functional DT() finds it; PR#5176 S3method("[<-", data.table) # S3method("[[", data.table) # S3method("[[<-", data.table) diff --git a/R/data.table.R b/R/data.table.R index 87f6eef10e..9c36b10bf8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1941,7 +1941,11 @@ DT = function(x, ...) { #4872 options(datatable.optimize=2L) # GForce still on; building and storing indices in .prepareFastSubset off; see long paragraph in news item 22 of v1.14.2 } - ans = `[.data.table`(x, ...) 
+ fun = match.call() + fun[[1L]] = as.name("[.data.table") # hence now exporting [.data.table method otherwise R CMD check can't find it in tests 2212.* + ans = eval(fun, envir=parent.frame(), # for issue 2 in #5129 so that eval(.massagei(isub), x, ienv) finds objects in calling + # env, and likely other places inside [.data.table that look at the calling env + enclos=parent.frame()) # including enclos= too as it has often been needed in the past options(datatable.optimize=old) .global$print = "" # functional form should always print; #5106 ans diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5b880c9434..27c5cbb4df 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18076,7 +18076,7 @@ for (col in c("a","b","c")) { } } -# DT() functional form, #4872 #5106 #5107 +# DT() functional form, #4872 #5106 #5107 #5129 if (base::getRversion() >= "4.1.0") { # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 if (exists("DTfun")) DT=DTfun # just in dev-mode restore DT() in .GlobalEnv as DT object overwrote it in tests above @@ -18088,38 +18088,48 @@ if (base::getRversion() >= "4.1.0") { test(2212.013, EVAL("mtcars |> DT(mpg>20, .SD[hp>mean(hp)])"), droprn(mtcars[ mtcars$mpg>20 & mtcars$hp>mean(mtcars$hp[mtcars$mpg>20]), ])) D = copy(mtcars) - test(2212.02, EVAL("D |> DT(,.SD)"), D) - test(2212.03, EVAL("D |> DT(, .SD, .SDcols=5:8)"), D[,5:8]) - test(2212.04, EVAL("D |> DT(, 5:8)"), droprn(D[,5:8])) - test(2212.05, EVAL("D |> DT(, lapply(.SD, sum))"), as.data.frame(lapply(D,sum))) - test(2212.06, EVAL("D |> DT(, .SD, keyby=cyl) |> setkey(NULL)"), droprn(D[order(D$cyl),c(2,1,3:11)])) - test(2212.07, EVAL("D |> DT(1:20, .SD)"), droprn(D[1:20,])) - test(2212.08, EVAL("D |> DT(, .SD, by=cyl, .SDcols=5:8)"), droprn(D[unlist(tapply(1:32, D$cyl, c)[c(2,1,3)]), c(2,5:8)])) - test(2212.09, EVAL("D |> DT(1:20, .SD, .SDcols=5:8)"), droprn(D[1:20, 5:8])) - test(2212.10, EVAL("D |> DT(1:20, .SD, by=cyl, .SDcols=5:8)"), droprn(D[unlist(tapply(1:20, D$cyl[1:20], c)[c(2,1,3)]), c(2,5:8)])) - test(2212.11, EVAL("D |> DT(1:20, lapply(.SD, sum))"), as.data.frame(lapply(D[1:20,],sum))) + test(2212.02, EVAL("D |> DT(,.SD)"), mtcars) + test(2212.03, EVAL("D |> DT(, .SD, .SDcols=5:8)"), mtcars[,5:8]) + test(2212.04, EVAL("D |> DT(, 5:8)"), droprn(mtcars[,5:8])) + test(2212.05, EVAL("D |> DT(, lapply(.SD, sum))"), as.data.frame(lapply(mtcars,sum))) + test(2212.06, EVAL("D |> DT(, .SD, keyby=cyl) |> setkey(NULL)"), droprn(mtcars[order(mtcars$cyl),c(2,1,3:11)])) + test(2212.07, EVAL("D |> DT(1:20, .SD)"), droprn(mtcars[1:20,])) + test(2212.08, EVAL("D |> DT(, .SD, by=cyl, .SDcols=5:8)"), droprn(mtcars[unlist(tapply(1:32, mtcars$cyl, c)[c(2,1,3)]), c(2,5:8)])) + test(2212.09, EVAL("D |> DT(1:20, .SD, .SDcols=5:8)"), droprn(mtcars[1:20, 5:8])) + test(2212.10, EVAL("D |> DT(1:20, .SD, by=cyl, .SDcols=5:8)"), droprn(mtcars[unlist(tapply(1:20, mtcars$cyl[1:20], c)[c(2,1,3)]), c(2,5:8)])) + test(2212.11, EVAL("D |> DT(1:20, lapply(.SD, sum))"), as.data.frame(lapply(mtcars[1:20,],sum))) test(2212.12, droprn(EVAL("D |> DT(1:20, c(N=.N, lapply(.SD, sum)), by=cyl)")[c(1,3),c("cyl","N","carb")]), data.frame(cyl=c(6,8), N=c(6L,8L), carb=c(18,27))) - test(2212.13, EVAL("D |> DT(cyl==4)"), droprn(D[D$cyl==4,])) - test(2212.14, EVAL("D |> DT(cyl==4 & vs==0)"), droprn(D[D$cyl==4 & D$vs==0,])) - test(2212.15, EVAL("D |> DT(cyl==4 & vs>0)"), droprn(D[D$cyl==4 & D$vs>0,])) - test(2212.16, EVAL("D |> DT(cyl>=4)"), droprn(D[D$cyl>=4,])) - test(2212.17, EVAL("D |> DT(cyl!=4)"), 
droprn(D[D$cyl!=4,])) - test(2212.18, EVAL("D |> DT(cyl!=4 & vs!=0)"), droprn(D[D$cyl!=4 & D$vs!=0,])) + test(2212.13, EVAL("D |> DT(cyl==4)"), droprn(mtcars[mtcars$cyl==4,])) + test(2212.14, EVAL("D |> DT(cyl==4 & vs==0)"), droprn(mtcars[mtcars$cyl==4 & mtcars$vs==0,])) + test(2212.15, EVAL("D |> DT(cyl==4 & vs>0)"), droprn(mtcars[mtcars$cyl==4 & mtcars$vs>0,])) + test(2212.16, EVAL("D |> DT(cyl>=4)"), droprn(mtcars[mtcars$cyl>=4,])) + test(2212.17, EVAL("D |> DT(cyl!=4)"), droprn(mtcars[mtcars$cyl!=4,])) + test(2212.18, EVAL("D |> DT(cyl!=4 & vs!=0)"), droprn(mtcars[mtcars$cyl!=4 & mtcars$vs!=0,])) test(2212.19, EVAL("iris |> DT(Sepal.Length==5.0 & Species=='setosa')"), droprn(iris[iris$Sepal.Length==5.0 & iris$Species=="setosa",])) test(2212.20, EVAL("iris |> DT(Sepal.Length==5.0)"), droprn(iris[iris$Sepal.Length==5.0,])) test(2212.21, EVAL("iris |> DT(Species=='setosa')"), droprn(iris[iris$Species=='setosa',])) - test(2212.22, EVAL("D |> DT(, cyl)"), droprn(D[,"cyl"])) - test(2212.23, EVAL("D |> DT(1:2, cyl)"), droprn(D[1:2, "cyl"])) - test(2212.24, EVAL("D |> DT(, list(cyl))"), droprn(D[,"cyl",drop=FALSE])) - test(2212.25, EVAL("D |> DT(1:2, .(cyl))"), droprn(D[1:2, "cyl", drop=FALSE])) - test(2212.26, EVAL("D |> DT(, z:=sum(cyl))"), cbind(D, z=sum(D$cyl))) - test(2212.27, EVAL("D |> DT(, z:=round(mean(mpg),2), by=cyl)"), cbind(D, z=c("6"=19.74, "4"=26.66, "8"=15.10)[as.character(D$cyl)])) - test(2212.28, EVAL("D |> DT(1:3, z:=5, by=cyl)"), cbind(D, z=c(5,5,5,rep(NA,nrow(D)-3)))) + test(2212.22, EVAL("D |> DT(, cyl)"), droprn(mtcars[,"cyl"])) + test(2212.23, EVAL("D |> DT(1:2, cyl)"), droprn(mtcars[1:2, "cyl"])) + test(2212.24, EVAL("D |> DT(, list(cyl))"), droprn(mtcars[,"cyl",drop=FALSE])) + test(2212.25, EVAL("D |> DT(1:2, .(cyl))"), droprn(mtcars[1:2, "cyl", drop=FALSE])) + test(2212.26, EVAL("D |> DT(, z:=sum(cyl))"), cbind(mtcars, z=sum(mtcars$cyl))) + D = copy(mtcars) # D was changed by := so recopy mtcars; TODO: remove this line when #5129 is fully closed + test(2212.27, EVAL("D |> DT(, z:=round(mean(mpg),2), by=cyl)"), cbind(mtcars, z=c("6"=19.74, "4"=26.66, "8"=15.10)[as.character(mtcars$cyl)])) + D = copy(mtcars) # D was changed by := so recopy mtcars; TODO: remove this line when #5129 is fully closed + test(2212.28, EVAL("D |> DT(1:3, z:=5, by=cyl)"), cbind(mtcars, z=c(5,5,5,rep(NA,nrow(mtcars)-3)))) + D = copy(mtcars) # D was changed by := so recopy mtcars; TODO: remove this line when #5129 is fully closed test(2212.29, EVAL("D |> DT(1:3, z:=NULL)"), error="When deleting columns, i should not be provided") - test(2212.30, EVAL("D |> DT(data.table(cyl=4), on='cyl')"), droprn(D[D$cyl==4,])) - test(2212.31, EVAL("D |> DT(data.frame(cyl=4), on='cyl')"), droprn(D[D$cyl==4,])) - test(2212.32, EVAL("D |> DT(.(4), on='cyl')"), droprn(D[D$cyl==4,])) + test(2212.30, EVAL("D |> DT(data.table(cyl=4), on='cyl')"), droprn(mtcars[mtcars$cyl==4,])) + test(2212.31, EVAL("D |> DT(data.frame(cyl=4), on='cyl')"), droprn(mtcars[mtcars$cyl==4,])) + test(2212.32, EVAL("D |> DT(.(4), on='cyl')"), droprn(mtcars[mtcars$cyl==4,])) test(2212.33, EVAL("iris |> DT('setosa', on='Species')"), {tt=droprn(iris[iris$Species=="setosa",]); tt$Species=as.character(tt$Species); tt}) + filter = mtcars # mask stats::filter + dt = df = D = as.data.table(filter) # mask stats::D + test(2212.50, EVAL("df |> DT(df[, .I[which.max(mpg)], by=cyl]$V1)"), ans<-dt[c(4,20,25)]) + test(2212.51, EVAL("dt |> DT(dt[, .I[which.max(mpg)], by=cyl]$V1)"), ans) + test(2212.52, EVAL("D |> DT(D[, .I[which.max(mpg)], by=cyl]$V1)"), 
ans) + test(2212.53, EVAL("filter |> DT(filter[, .I[which.max(mpg)], by=cyl]$V1)"), error="unused.*argument.*by.*cyl") # R's [.data.frame error on filter[...] + test(2212.54, EVAL("filter |> DT((filter |> DT(, .I[which.max(mpg)], by=cyl))$V1)"), as.data.frame(ans)) } # precision powers of 10^(-n), #4461 From 552f421da4a89afe79ad68de477456147ac5c401 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 24 Sep 2021 23:08:05 +0200 Subject: [PATCH 403/588] ensure no NSE in env arg, #4994 #4995 (#5132) --- inst/tests/programming.Rraw | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index bed7bf0db4..429545dcb7 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -603,3 +603,15 @@ test(103.02, nadt, data.table(x1 = c(1, 2, 0, 0), x2 = c(2, 0, 3, 0), x3 = c(0, test(201.1, substitute2(dt, env=list(dt = data.table(a=1:9, b=1:9))), data.table(a=1:9, b=1:9)) test(201.2, substitute2(dt, env=list(dt = data.table(a=1:9, b=as.character(1:9)))), data.table(a=1:9, b=as.character(1:9))) test(201.3, substitute2(dt, env=list(dt = data.table(a=1:2, b=as.character(1:2)))), data.table(a=1:2, b=as.character(1:2))) + +# ensure env argument is a standard evaluation argument #4994 #4995 +dt = data.table(x=1:2, y=2:1) +jpar = list(.j=list("y")) +test(202.1, dt[, .j, env=jpar], data.table(y=2:1)) +f = function(d, params) { + d[, .j, env=params] +} +test(202.2, f(dt, params=jpar), data.table(y=2:1)) +"." = function(...) list(.j=list("x")) +test(202.3, dt[, .j, env=.(.j=list("y"))], data.table(x=1:2)) +rm(list=".") From 2b2e91f01cb5e733b3b620d42860c1403ef616bb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 24 Sep 2021 23:45:46 +0200 Subject: [PATCH 404/588] shift type="cyclic" (#5134) --- NEWS.md | 14 +++++ R/shift.R | 6 +- inst/tests/tests.Rraw | 131 +++++++++++++++++++++++++++++------------- man/shift.Rd | 10 ++-- src/shift.c | 99 +++++++++++++++++++------------ 5 files changed, 176 insertions(+), 84 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1e3eace743..237c960790 100644 --- a/NEWS.md +++ b/NEWS.md @@ -141,6 +141,20 @@ 26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. +27. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. + + ```R + shift(1:5, n=-1:1, type="cyclic") + # [[1]] + # [1] 2 3 4 5 1 + # + # [[2]] + # [1] 1 2 3 4 5 + # + # [[3]] + # [1] 5 1 2 3 4 + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
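The bug-fix entry above mentions an "invalid key" in passing. Below is a minimal illustrative sketch — editor commentary, not part of the patch — of how such a state can be constructed and observed using only exported data.table functions (`setattr`, `key`); the table and column names are hypothetical.

```R
# illustrative only: an 'invalid key' is when a table carries the sorted-by-key
# marker but its rows are not actually ordered by those columns
library(data.table)
DT = data.table(id = c(2L, 1L, 3L), v = 1:3)
setattr(DT, "sorted", "id")   # force the key marker without sorting -- never do this in real code
key(DT)                       # "id": the marker claims the table is keyed
is.unsorted(DT$id)            # TRUE: the data is not sorted, so keyed subsets such as DT[.(1L)] can return wrong rows
```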
diff --git a/R/shift.R b/R/shift.R index c73d8b0840..064eea20cf 100644 --- a/R/shift.R +++ b/R/shift.R @@ -1,5 +1,7 @@ -shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FALSE) { +shift = function(x, n=1L, fill, type=c("lag", "lead", "shift", "cyclic"), give.names=FALSE) { type = match.arg(type) + if (type == "cyclic" && !missing(fill)) warning("Provided argument fill=", fill, " will be ignored since type='shift'.") + if (missing(fill)) fill = NA stopifnot(is.numeric(n)) ans = .Call(Cshift, x, as.integer(n), fill, type) if (give.names && is.list(ans)) { @@ -9,7 +11,7 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA else nx = paste0("V", if (is.atomic(x)) 1L else seq_along(x)) } else nx = names(x) - if (type!="shift") { + if (!(type %chin% c("shift", "cyclic"))) { # flip type for negative n, #3223 neg = (n<0L) if (type=="lead" && length(unique(sign(n))) == 3L) neg[ n==0L ] = TRUE # lead_0 should be named lag_0 for consistency (if mixing signs of n, #3832) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 27c5cbb4df..c1481a0841 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6603,63 +6603,88 @@ test(1462.3, DT[, sum(unlist(mget(cols, as.environment(-1)))), by=x], DT[, sum(u # test for 'shift' x=1:5 y=factor(x) -test(1463.01, shift(x,1L), as.integer(c(NA, 1:4))) -test(1463.02, shift(x,1:2), list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3)))) -test(1463.03, shift(x,1L, 0L), as.integer(c(0L, 1:4))) -test(1463.04, shift(x,1L, type="lead"), as.integer(c(2:5, NA))) -test(1463.05, shift(x,1:2, type="lead"), list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA)))) -test(1463.06, shift(x,1L, 0L, type="lead"), as.integer(c(2:5, 0L))) -test(1463.07, shift(y,1L), factor(c(NA,1:4), levels=1:5)) -test(1463.08, shift(y,1L, type="lead"), factor(c(2:5, NA), levels=1:5)) +test(1463.01, shift(x,1L), as.integer(c(NA, 1:4))) +test(1463.02, shift(x,1:2), list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3)))) +test(1463.03, shift(x,1L, 0L), as.integer(c(0L, 1:4))) +test(1463.04, shift(x,1L, type="lead"), as.integer(c(2:5, NA))) +test(1463.05, shift(x,1:2, type="lead"), list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA)))) +test(1463.06, shift(x,1L, 0L,type="lead"), as.integer(c(2:5, 0L))) +test(1463.07, shift(y,1L), factor(c(NA,1:4), levels=1:5)) +test(1463.08, shift(y,1L, type="lead"), factor(c(2:5, NA), levels=1:5)) +test(1463.09, shift(x,1L, type="cyclic"), as.integer(c(5, 1:4))) +test(1463.10, shift(x,1:2, type="cyclic"), list(as.integer(c(5, 1:4)), as.integer(c(4:5, 1:3)))) +test(1463.11, shift(x,-1L, type="cyclic"), as.integer(c(2:5, 1))) +test(1463.12, shift(x,-(1:2),type="cyclic"), list(as.integer(c(2:5, 1)), as.integer(c(3:5,1:2)))) x=as.numeric(x) -test(1463.09, shift(x,1L), as.numeric(c(NA, 1:4))) -test(1463.10, shift(x,1:2), list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3)))) -test(1463.11, shift(x,1L, 0L), as.numeric(c(0L, 1:4))) -test(1463.12, shift(x,1L, type="lead"), as.numeric(c(2:5, NA))) -test(1463.13, shift(x,1:2, type="lead"), list(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA)))) -test(1463.14, shift(x,1L, 0L, type="lead"), as.numeric(c(2:5, 0L))) +test(1463.13, shift(x,1L), as.numeric(c(NA, 1:4))) +test(1463.14, shift(x,1:2), list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3)))) +test(1463.15, shift(x,1L, 0L), as.numeric(c(0L, 1:4))) +test(1463.16, shift(x,1L, type="lead"), as.numeric(c(2:5, NA))) +test(1463.17, shift(x,1:2, type="lead"), list(as.numeric(c(2:5, NA)), 
as.numeric(c(3:5, NA, NA)))) +test(1463.18, shift(x,1L, 0L,type="lead"), as.numeric(c(2:5, 0L))) +test(1463.19, shift(x,1L, type="cyclic"), as.numeric(c(5, 1:4))) +test(1463.20, shift(x,1:2, type="cyclic"), list(as.numeric(c(5, 1:4)), as.numeric(c(4:5, 1:3)))) +test(1463.21, shift(x,-1L, type="cyclic"), as.numeric(c(2:5, 1))) +test(1463.22, shift(x,-(1:2),type="cyclic"), list(as.numeric(c(2:5, 1)), as.numeric(c(3:5,1:2)))) + if (test_bit64) { x=as.integer64(x) - test(1463.15, shift(x,1L), as.integer64(c(NA, 1:4))) - test(1463.16, shift(x,1:2), list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3)))) - test(1463.17, shift(x,1L, 0L), as.integer64(c(0L, 1:4))) - test(1463.18, shift(x,1L, type="lead"), as.integer64(c(2:5, NA))) - test(1463.19, shift(x,1:2, type="lead"), list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA)))) - test(1463.20, shift(x,1L, 0L, type="lead"), as.integer64(c(2:5, 0L))) + test(1463.23, shift(x,1L), as.integer64(c(NA, 1:4))) + test(1463.24, shift(x,1:2), list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3)))) + test(1463.25, shift(x,1L, 0L), as.integer64(c(0L, 1:4))) + test(1463.26, shift(x,1L, type="lead"), as.integer64(c(2:5, NA))) + test(1463.27, shift(x,1:2, type="lead"), list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA)))) + test(1463.28, shift(x,1L, 0L, type="lead"), as.integer64(c(2:5, 0L))) + test(1463.29, shift(x,1L, type="cyclic"), as.integer64(c(5, 1:4))) + test(1463.30, shift(x,1:2, type="cyclic"), list(as.integer64(c(5, 1:4)), as.integer64(c(4:5, 1:3)))) + test(1463.31, shift(x,-1L, type="cyclic"), as.integer64(c(2:5, 1))) + test(1463.32, shift(x,-(1:2), type="cyclic"), list(as.integer64(c(2:5, 1)), as.integer64(c(3:5,1:2)))) } x=as.character(x) -test(1463.21, shift(x,1L), as.character(c(NA, 1:4))) -test(1463.22, shift(x,1:2), list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3)))) -test(1463.23, shift(x,1L, 0L), as.character(c(0L, 1:4))) -test(1463.24, shift(x,1L, type="lead"), as.character(c(2:5, NA))) -test(1463.25, shift(x,1:2, type="lead"), list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA)))) -test(1463.26, shift(x,1L, 0L, type="lead"), as.character(c(2:5, 0L))) +test(1463.33, shift(x,1L), as.character(c(NA, 1:4))) +test(1463.34, shift(x,1:2), list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3)))) +test(1463.35, shift(x,1L, 0L), as.character(c(0L, 1:4))) +test(1463.36, shift(x,1L, type="lead"), as.character(c(2:5, NA))) +test(1463.37, shift(x,1:2, type="lead"), list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA)))) +test(1463.38, shift(x,1L, 0L, type="lead"), as.character(c(2:5, 0L))) +test(1463.39, shift(x,1L, type="cyclic"), as.character(c(5, 1:4))) +test(1463.40, shift(x,1:2, type="cyclic"), list(as.character(c(5, 1:4)), as.character(c(4:5, 1:3)))) +test(1463.41, shift(x,-1L, type="cyclic"), as.character(c(2:5, 1))) +test(1463.42, shift(x,-(1:2), type="cyclic"), list(as.character(c(2:5, 1)), as.character(c(3:5,1:2)))) x=c(TRUE,FALSE,TRUE,FALSE,TRUE) -test(1463.27, shift(x,1L), c(NA, x[-5L])) -test(1463.28, shift(x,1:2), list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)]))) -test(1463.29, shift(x,1L, 0L), c(FALSE, x[-5L])) -test(1463.30, shift(x,1L, type="lead"), c(x[-1L], NA)) -test(1463.31, shift(x,1:2, type="lead"), list(c(x[-1L],NA), c(x[-(1:2)],NA,NA))) -test(1463.32, shift(x,1L, 0L, type="lead"), c(x[-(1)], FALSE)) +test(1463.43, shift(x,1L), c(NA, x[-5L])) +test(1463.44, shift(x,1:2), list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)]))) +test(1463.45, shift(x,1L, 0L), c(FALSE, x[-5L])) +test(1463.46, 
shift(x,1L, type="lead"), c(x[-1L], NA)) +test(1463.47, shift(x,1:2, type="lead"), list(c(x[-1L],NA), c(x[-(1:2)],NA,NA))) +test(1463.48, shift(x,1L, 0L, type="lead"), c(x[-(1)], FALSE)) +test(1463.49, shift(x,1L, type="cyclic"), c(x[5L], x[-5L])) +test(1463.50, shift(x,1:2, type="cyclic"), list(c(x[5L], x[-5L]), c(x[4L:5L], x[-4L:-5L]))) +test(1463.51, shift(x,-1L, type="cyclic"), c(x[-1L], x[1L])) +test(1463.52, shift(x,-(1:2), type="cyclic"), list(c(x[-1L], x[1L]), c(x[-1L:-2L], x[1L:2L]))) # for list of list, #1595 x = data.table(foo = c(list(c("a","b","c")), list(c("b","c")), list(c("a","b")), list(c("a"))), id = c(1,1,2,2)) -test(1463.33, x[, shift(list(foo)), by=id], +test(1463.53, x[, shift(list(foo)), by=id], data.table(id=c(1,1,2,2), V1=list(NA, c("a", "b", "c"), NA, c("a", "b")))) -test(1463.34, x[, shift(list(foo), type="lead", fill=NA_integer_), by=id], +test(1463.54, x[, shift(list(foo), type="lead", fill=NA_integer_), by=id], data.table(id=c(1,1,2,2), V1=list(c("b", "c"), NA_integer_, c("a"), NA_integer_))) +test(1463.55, x[, shift(list(foo), 1, type="cyclic"), by=id], + data.table(id=c(1,1,2,2), V1=list(c("b","c"), c("a","b","c"), c("a"), c("a","b")))) +test(1463.56, x[, shift(list(foo), -1, type="cyclic"), by=id], + data.table(id=c(1,1,2,2), V1=list(c("b","c"), c("a","b","c"), c("a"), c("a","b")))) # Fix for #1009 segfault in shift val = runif(1) -test(1463.35, shift(val, 2L), NA_real_) -test(1463.36, shift(val, 2L, type="lead"), NA_real_) +test(1463.57, shift(val, 2L), NA_real_) +test(1463.58, shift(val, 2L, type="lead"), NA_real_) -test(1463.37, shift(1:5, 1L, fill=c(1:2)), error="fill must be a vector of length") -test(1463.38, shift(mean), error="type 'closure' passed to shift(). Must be a vector, list, data.frame or data.table") +test(1463.59, shift(1:5, 1L, fill=c(1:2)), error="fill must be a vector of length") +test(1463.60, shift(mean), error="type 'closure' passed to shift(). Must be a vector, list, data.frame or data.table") # add tests for date and factor? 
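The new `type="cyclic"` tests above all operate on plain vectors. As a sketch of the grouped use case that motivated the feature — illustrative commentary, not part of the patch; the table and column names are hypothetical and it assumes `shift(type="cyclic")` behaves as described in this patch's NEWS entry:

```R
# illustrative only: cyclic shift within groups; values pushed out of each
# group re-enter that group at the opposite end instead of becoming NA
library(data.table)
DT = data.table(g = c(1L, 1L, 1L, 2L, 2L), v = 1:5)
DT[, w := shift(v, 1L, type="cyclic"), by=g]
DT
#    g v w
# 1: 1 1 3   <- last value of group 1 wraps to the front
# 2: 1 2 1
# 3: 1 3 2
# 4: 2 4 5   <- last value of group 2 wraps to the front
# 5: 2 5 4
```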
@@ -6668,13 +6693,26 @@ x = 1:10 nm = c("x_lag_1", "x_lag_2") ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8))) setattr(ans, 'names', nm) -test(1463.39, shift(x, 1:2, give.names=TRUE), ans) +test(1463.61, shift(x, 1:2, give.names=TRUE), ans) if (test_nanotime) { - test(1463.40, shift(nanotime(1:4) ), c(nanotime::nanotime(NA), nanotime::nanotime(1:3))); - test(1463.41, shift(nanotime(1:4), fill=0L), c(nanotime::nanotime(0L), nanotime::nanotime(1:3))); + x=nanotime(1:4) + test(1463.62, shift(x ), c(nanotime::nanotime(NA), x[1:3])); + test(1463.63, shift(x, fill=0L), c(nanotime::nanotime(0L), x[1:3])); + test(1463.64, shift(x, 1, type="cyclic"), c(x[4L], x[-4L])); + test(1463.65, shift(x, -1, type="cyclic"), c(x[-1L], x[1L])); } +# shift circular +x = 1:5 +test(1463.66, shift(x, 5, type="cyclic"), x) +test(1463.67, shift(x, -5, type="cyclic"), x) +test(1463.68, shift(x, 6, type="cyclic"), shift(x, 1, type="cyclic")) +test(1463.69, shift(x, -6, type="cyclic"), shift(x, -1, type="cyclic")) +# test warning +test(1463.70, shift(x, 1, fill=1, type="cyclic"), c(5L, 1L:4L), warning="Provided argument fill=1 will be ignored since type='shift'.") + + # FR #686 DT = data.table(a=rep(c("A", "B", "C", "A", "B"), c(2,2,3,1,2)), foo=1:10) # Seemingly superfluous 'foo' is needed to test fix for #1942 @@ -13801,6 +13839,11 @@ test(1963.16, shift(DT, -3L, type="lag"), shift(DT, 3L, type="lead")) DT <- data.table(a = 1:3, b = 2:4) test(1963.17, DT[ , shift(.SD, 0:1, give.names = TRUE, type = "lead")], data.table(a_lead_0 = 1:3, a_lead_1 = c(2L, 3L, NA), b_lead_0 = 2:4, b_lead_1 = c(3L, 4L, NA))) +DT = data.table(x = 1:10, y = 10:1) +test(1963.18, shift(DT, 1L, type="cyclic"), list(c(10L, 1L:9L), c(1L, 10L:2L))) +test(1963.19, shift(DT, -1, type="cyclic"), list(c(2L:10L, 1L), c(9L:1L, 10L))) +test(1963.20, shift(DT, 3L, type="cyclic"), shift(DT, -7L, type="cyclic")) +test(1963.21, shift(DT, -3L, type="cyclic"), shift(DT, 7L, type="cyclic")) # 0 column data.table should not have rownames, #3149 M0 = matrix(1:6, nrow=3, ncol=2, dimnames=list(rows=paste0("id",1:3), cols=c("v1","v2"))) @@ -15889,6 +15932,10 @@ test(2067.1, shift(z), c(NA, z[1:2])) test(2067.2, shift(z, type = 'lead'), c(z[2:3], NA)) test(2067.3, shift(z, fill = 1i), c(1i, z[1:2])) test(2067.4, shift(list(z, 1:3)), list(c(NA, z[1:2]), c(NA, 1:2))) +test(2067.5, shift(z, n=1, type = 'cyclic'), c(z[3], z[1:2])) +test(2067.6, shift(z, n=-1, type = 'cyclic'), c(z[2:3], z[1])) +test(2067.7, shift(list(z, 1L:3L), n=1, type = 'cyclic'), list(c(z[3], z[1:2]), c(3L, 1:2))) +test(2067.8, shift(list(z, 1L:3L), n=-1, type = 'cyclic'), list(c(z[2:3], z[1]), c(2:3, 1L))) # support for ordering tables with complex columns, #1444 DT = data.table(a = 2:1, z = complex(0, 0:1)) @@ -16173,7 +16220,9 @@ test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of co # shift naming test(2074.34, shift(list(a=1:5, b=6:10), give.names=TRUE), list(a_lag_1=c(NA, 1:4), b_lag_1=c(NA, 6:9))) +test(2074.345, shift(list(a=1:5, b=6:10), type="cyclic", give.names=TRUE), list(a_cyclic_1=c(5L, 1:4), b_cyclic_1=c(10L, 6:9))) test(2074.35, shift(1:5, 1:2, give.names=TRUE), list(V1_lag_1=c(NA, 1:4), V1_lag_2=c(NA, NA, 1:3))) +test(2074.355, shift(1:5, 1:2, type="cyclic", give.names=TRUE), list(V1_cyclic_1=c(5L, 1:4), V1_cyclic_2=c(4L:5L, 1:3))) # bmerge.c x = data.table(a='a') diff --git a/man/shift.Rd b/man/shift.Rd index c710ab3687..219b8f3d8c 100644 --- a/man/shift.Rd +++ b/man/shift.Rd @@ -10,14 +10,14 @@ } \usage{ -shift(x, n=1L, fill=NA, type=c("lag", 
"lead", "shift"), give.names=FALSE) +shift(x, n=1L, fill, type=c("lag", "lead", "shift", "cyclic"), give.names=FALSE) } \arguments{ \item{x}{ A vector, list, data.frame or data.table. } \item{n}{ integer vector denoting the offset by which to lead or lag the input. To create multiple lead/lag vectors, provide multiple values to \code{n}; negative values of \code{n} will "flip" the value of \code{type}, i.e., \code{n=-1} and \code{type='lead'} is the same as \code{n=1} and \code{type='lag'}. } - \item{fill}{ Value to use for padding when the window goes beyond the input length. } - \item{type}{ default is \code{"lag"} (look "backwards"). The other possible values \code{"lead"} (look "forwards") and \code{"shift"} (behave same as \code{"lag"} except given names). } - \item{give.names}{default is \code{FALSE} which returns an unnamed list. When \code{TRUE}, names are automatically generated corresponding to \code{type} and \code{n}. If answer is an atomic vector, then the argument is ignored. } + \item{fill}{ default is \code{NA}. Value to use for padding when the window goes beyond the input length. } + \item{type}{ default is \code{"lag"} (look "backwards"). The other possible values \code{"lead"} (look "forwards"), \code{"shift"} (behave same as \code{"lag"} except given names) and \code{"cyclic"} where pushed out values are re-introduced at the front/back. } + \item{give.names}{ default is \code{FALSE} which returns an unnamed list. When \code{TRUE}, names are automatically generated corresponding to \code{type} and \code{n}. If answer is an atomic vector, then the argument is ignored. } } \details{ \code{shift} accepts vectors, lists, data.frames or data.tables. It always returns a list except when the input is a \code{vector} and \code{length(n) == 1} in which case a \code{vector} is returned, for convenience. This is so that it can be used conveniently within data.table's syntax. For example, \code{DT[, (cols) := shift(.SD, 1L), by=id]} would lag every column of \code{.SD} by 1 for each group and \code{DT[, newcol := colA + shift(colB)]} would assign the sum of two \emph{vectors} to \code{newcol}. @@ -40,6 +40,8 @@ shift(x, n=1:2, fill=0, type="lag") # getting a window by using positive and negative n: shift(x, n = -1:1) shift(x, n = -1:1, type = "shift", give.names = TRUE) +# cyclic shift where pad uses pushed out values +shift(x, n = -1:1, type = "cyclic") # on data.tables DT = data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5]) diff --git a/src/shift.c b/src/shift.c index 9ff0449628..11346648d5 100644 --- a/src/shift.c +++ b/src/shift.c @@ -4,7 +4,7 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) { int nprotect=0; - enum {LAG, LEAD/*, SHIFT, CYCLIC*/} stype = LAG; // currently SHIFT maps to LAG and CYCLIC is unimplemented (see comments in #1708) + enum {LAG, LEAD/*, SHIFT*/,CYCLIC} stype = LAG; // currently SHIFT maps to LAG (see comments in #1708) if (!xlength(obj)) return(obj); // NULL, list() SEXP x; if (isVectorAtomic(obj)) { @@ -23,6 +23,7 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) if (!strcmp(CHAR(STRING_ELT(type, 0)), "lag")) stype = LAG; else if (!strcmp(CHAR(STRING_ELT(type, 0)), "lead")) stype = LEAD; else if (!strcmp(CHAR(STRING_ELT(type, 0)), "shift")) stype = LAG; // when we get rid of nested if branches we can use SHIFT, for now it maps to LAG + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "cyclic")) stype = CYCLIC; else error(_("Internal error: invalid type for shift(), should have been caught before. 
please report to data.table issue tracker")); // # nocov int nx = length(x), nk = length(k); @@ -30,6 +31,8 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) const int *kd = INTEGER(k); for (int i=0; i= 0) || (stype == LEAD && kd[j] < 0)) { - // LAG when type = 'lag' and n >= 0 _or_ type = 'lead' and n < 0 - if (tailk > 0) memmove(itmp+thisk, INTEGER(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + // LAG when type %in% c('lag','cyclic') and n >= 0 _or_ type = 'lead' and n < 0 + if (tailk > 0) memmove(itmp+thisk, ielem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(itmp, ielem+tailk, thisk*size); + } else for (int m=0; m=0 _or_ type = 'lag', n<0 - if (tailk > 0) memmove(itmp, INTEGER(elem)+thisk, tailk*size); - for (int m=xrows-thisk; m=0 _or_ type %in% c('lag','cyclic'), n<0 + if (tailk > 0) memmove(itmp, ielem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(itmp+tailk, ielem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) memmove(dtmp+thisk, REAL(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(dtmp+thisk, delem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(dtmp, delem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(dtmp, REAL(elem)+thisk, tailk*size); - for (int m=tailk; m 0) memmove(dtmp, delem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(dtmp+tailk, delem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) memmove(ctmp+thisk, COMPLEX(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(ctmp+thisk, celem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ctmp, celem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(ctmp, COMPLEX(elem)+thisk, tailk*size); - for (int m=tailk; m 0) memmove(ctmp, celem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ctmp+tailk, celem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) memmove(ltmp+thisk, LOGICAL(elem), tailk*size); - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(ltmp+thisk, lelem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ltmp, lelem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(ltmp, LOGICAL(elem)+thisk, tailk*size); - for (int m=tailk; m 0) memmove(ltmp, lelem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(ltmp+tailk, lelem, thisk*size); + } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { - for (int m=0; m= 0) || (stype == LEAD && kd[j] < 0)) { + for (int m=0; m Date: Mon, 27 Sep 2021 22:10:09 +0200 Subject: [PATCH 405/588] Check attributes of list column elements in anySpecialStatic (#5178) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 5 +++++ src/dogroups.c | 8 +++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 237c960790..afb5a735c4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -374,6 +374,8 @@ 43. All-NA character key columns could segfault, [#5070](https://github.com/Rdatatable/data.table/issues/5070). Thanks to @JorisChau for reporting and Benjamin Schwendinger for the fix. +44. In v1.13.2 a version of an old bug was reintroduced where during a grouping operation list columns could retain a pointer to the last group. 
This affected only attributes of list elements and only if those were updated during the grouping operation, [#4963](https://github.com/Rdatatable/data.table/issues/4963). Thanks to @fujiaxiang for reporting and @avimallu and Václav Tlapák for investigating and the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c1481a0841..8495e9e808 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18210,3 +18210,8 @@ DT2 = data.table(x1 = letters[1:3]) test(2216.1, DT1[DT2, on="x1"][,.(x1,x2)], DT1[1:9]) # segfault in v1.14.0 test(2216.2, merge(DT1, DT2, by="x1")[,.(x1,x2)], setkey(DT1[1:9], x1)) # ok before but included for completeness verbatim from issue +# copy attributes assigned to elements of list columns in grouping #4963 +DT1 = data.table(id=1:3, grp=c('a', 'a', 'b'), value=4:6) +DT2 = data.table(grp = c('a', 'b'), agg = list(c('1' = 4, '2' = 5), c('3' = 6))) +test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) + diff --git a/src/dogroups.c b/src/dogroups.c index d76889932a..5ddd1f672c 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -39,6 +39,7 @@ static bool anySpecialStatic(SEXP x) { // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. // Currently the marker is negative truelength. These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. + SEXP attribs, list_el; const int n = length(x); // use length() not LENGTH() because LENGTH() on NULL is segfault in R<3.5 where we still define USE_RINTERNALS // (see data.table.h), and isNewList() is true for NULL @@ -50,8 +51,13 @@ static bool anySpecialStatic(SEXP x) { if (TRUELENGTH(x)<0) return true; // test 2158 for (int i=0; i Date: Tue, 28 Sep 2021 19:09:45 -0600 Subject: [PATCH 406/588] applied suggestion from Kurt Kornik (#5183) --- NEWS.md | 4 +++- R/merge.R | 4 ++-- inst/tests/tests.Rraw | 6 +++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index afb5a735c4..b83869e50d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -425,8 +425,10 @@ The message is now upgraded to warning that the option is now ignored. +15. 
Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). -# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) + +# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) ## NOTES diff --git a/R/merge.R b/R/merge.R index fb0666d5e0..683a6d08a4 100644 --- a/R/merge.R +++ b/R/merge.R @@ -43,9 +43,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } else { if (is.null(by)) by = intersect(key(x), key(y)) - if (is.null(by)) + if (!length(by)) # was is.null() before PR#5183 changed to !length() by = key(x) - if (is.null(by)) + if (!length(by)) by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) stopf("A non-empty vector of column names for `by` is required.") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8495e9e808..bfa901d315 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13576,7 +13576,7 @@ test(1962.014, merge(DT1, DT2), data.table(a = integer(0), V = character(0))) setkey(DT1, a) test(1962.015, merge(DT1, DT2), - data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a')) + ans<-data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a')) test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')), error = 'must be of same length') test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'), @@ -13586,8 +13586,8 @@ test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'), error = 'Elements listed in `by.x`') test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'), error = 'Elements listed in `by.y`') -test(1962.020, merge(DT1, DT2, by = character(0L)), - error = 'non-empty vector of column names') +test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183 +test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge() test(1962.021, merge(DT1, DT2, by = 'z'), error = 'must be valid column names in x and y') From 8f0afcb44827f1df3ed231b87fda7768db047537 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 29 Sep 2021 00:20:49 -0400 Subject: [PATCH 407/588] fix alignment of chunk gates (#5182) --- vignettes/datatable-reference-semantics.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 2f3457056c..33da89bb92 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -92,7 +92,7 @@ It can be used in `j` in two ways: # when you have only one column to assign to you # can drop the quotes and list(), for convenience DT[, colA := valA] - ``` + ``` (b) The functional form @@ -367,4 +367,3 @@ However we could improve this functionality further by *shallow* copying instead So far we have seen a whole lot in `j`, and how to combine it with `by` and little of `i`. Let's turn our attention back to `i` in the next vignette *"Keys and fast binary search based subset"* to perform *blazing fast subsets* by *keying data.tables*. 
*** - From cd818080606591ac5348005732db0cadfb4e597c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 5 Oct 2021 19:49:35 +0200 Subject: [PATCH 408/588] droplevels for empty table and ordered factors (#5185) --- NEWS.md | 2 +- R/fdroplevels.R | 5 +++-- inst/tests/tests.Rraw | 23 +++++++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index b83869e50d..18b5c5c49b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -139,7 +139,7 @@ 25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. -26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. +26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. 27. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. diff --git a/R/fdroplevels.R b/R/fdroplevels.R index 5c53ee42fb..c7025dda0e 100644 --- a/R/fdroplevels.R +++ b/R/fdroplevels.R @@ -4,12 +4,13 @@ fdroplevels = function(x, exclude = if (anyNA(levels(x))) NULL else NA, ...) { lev = which(tabulate(x, length(levels(x))) & (!match(levels(x), exclude, 0L))) ans = match(as.integer(x), lev) setattr(ans, 'levels', levels(x)[lev]) - setattr(ans, 'class', 'factor') + setattr(ans, 'class', class(x)) return(ans) } droplevels.data.table = function(x, except = NULL, exclude, in.place = FALSE, ...){ - stopifnot(length(x) > 0L, is.logical(in.place)) + stopifnot(is.logical(in.place)) + if (nrow(x)==0L) return(x) ix = vapply(x, is.factor, NA) if(!is.null(except)){ stopifnot(is.numeric(except), except <= length(x)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bfa901d315..619ab67304 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18187,17 +18187,24 @@ test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) # droplevels.data.table method, and fdroplevels, #647 x = factor(letters[1:10]) DT = data.table(a = x)[1:5] -test(2214.1, fdroplevels(factor()), droplevels(factor())) -test(2214.2, fdroplevels(x[1:5]), droplevels(x[1:5])) +test(2214.01, fdroplevels(factor()), droplevels(factor())) +test(2214.02, fdroplevels(x[1:5]), droplevels(x[1:5])) if (base::getRversion() >= "3.4.0") { # bug fix in R 3.4.0: "droplevels(f) now keeps levels when present." 
- test(2214.3, fdroplevels(x[1:5], c("b", "d")), droplevels(x[1:5], c("b", "d"))) - test(2214.4, fdroplevels(x[1:5], letters[1:5]), droplevels(x[1:5], letters[1:5])) - test(2214.5, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c("b", "d"))) + test(2214.03, fdroplevels(x[1:5], c("b", "d")), droplevels(x[1:5], c("b", "d"))) + test(2214.04, fdroplevels(x[1:5], letters[1:5]), droplevels(x[1:5], letters[1:5])) + test(2214.05, droplevels(DT, exclude=c("b", "d"))[["a"]], droplevels(DT[1:5,a], c("b", "d"))) } -test(2214.6, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) -test(2214.7, droplevels(DT, 1)[["a"]], x[1:5]) -test(2214.8, droplevels(DT, in.place=TRUE), DT) +test(2214.06, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) +test(2214.07, droplevels(DT, 1)[["a"]], x[1:5]) +test(2214.08, droplevels(DT, in.place=TRUE), DT) +# support ordered factors in fdroplevels +o = factor(letters[1:10], ordered=TRUE) +test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) +# edge case for empty table #5184 +test(2214.10, droplevels(DT[0]), DT[0]) +test(2214.11, droplevels(data.table()), data.table()) + # factor i should be just like character i and work, #1632 DT = data.table(A=letters[1:3], B=4:6, key="A") From 5f9df4d6118483c08501335ba427bc65476c8fbc Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 8 Oct 2021 07:18:54 +0200 Subject: [PATCH 409/588] shift fix type coercion for integer64 (#5189) --- NEWS.md | 21 +++++++++++++++++ inst/tests/nafill.Rraw | 7 +++--- inst/tests/tests.Rraw | 29 ++++++++++++++++++++++- src/assign.c | 53 +++++++++++++++++++++++++++--------------- src/data.table.h | 1 + src/init.c | 2 ++ src/shift.c | 29 +---------------------- src/utils.c | 2 -- 8 files changed, 91 insertions(+), 53 deletions(-) diff --git a/NEWS.md b/NEWS.md index 18b5c5c49b..d0a8e8f5f7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -376,6 +376,27 @@ 44. In v1.13.2 a version of an old bug was reintroduced where during a grouping operation list columns could retain a pointer to the last group. This affected only attributes of list elements and only if those were updated during the grouping operation, [#4963](https://github.com/Rdatatable/data.table/issues/4963). Thanks to @fujiaxiang for reporting and @avimallu and Václav Tlapák for investigating and the PR. +45. `shift(xInt64, fill=0)` and `shift(xInt64, fill=as.integer64(0))` (but not `shift(xInt64, fill=0L)`) would error with `INTEGER() can only be applied to a 'integer', not a 'double'` where `xInt64` conveys `bit64::integer64`, `0` is type `double` and `0L` is type integer, [#4865](https://github.com/Rdatatable/data.table/issues/4865). Thanks to @peterlittlejohn for reporting and Benjamin Schwendinger for the PR. + +46. `DT[i, strCol:=classVal]` did not coerce using the `as.character` method for the class, resulting in either an unexpected string value or an error such as `To assign integer64 to a target of type character, please use as.character() for clarity`. Discovered during work on the previous issue, [#5189](https://github.com/Rdatatable/data.table/pull/5189). + + ```R + DT + # A + # + # 1: a + # 2: b + # 3: c + DT[2, A:=as.IDate("2021-02-03")] + DT[3, A:=bit64::as.integer64("4611686018427387906")] + DT + # A + # + # 1: a + # 2: 2021-02-03 # was 18661 + # 3: 4611686018427387906 # was error 'please use as.character' + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. 
Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index e8ea3d7eec..d2ee592ccc 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -281,7 +281,7 @@ if (test_bit64) { x = as.integer64(1L) test(10.81, coerceAs(x, 1), 1, output="double[integer64] into double[numeric]") test(10.82, coerceAs(x, 1L), 1L, output="double[integer64] into integer[integer]") - test(10.83, coerceAs(x, "1"), error="please use as.character", output="double[integer64] into character[character]") # not yet implemented + test(10.83, coerceAs(x, "1"), "1", output="double[integer64] into character[character]") test(10.84, coerceAs(1, x), x, output="double[numeric] into double[integer64]") test(10.85, coerceAs(1L, x), x, output="integer[integer] into double[integer64]") test(10.86, coerceAs("1", x), x, output="character[character] into double[integer64]", warning="Coercing.*character") @@ -294,14 +294,15 @@ if (test_nanotime) { x = nanotime(1L) test(10.91, coerceAs(x, 1), 1, output="double[nanotime] into double[numeric]") test(10.92, coerceAs(x, 1L), 1L, output="double[nanotime] into integer[integer]") - test(10.93, coerceAs(x, "1"), error="please use as.character", output="double[nanotime] into character[character]") # not yet implemented + test(10.93, substring(coerceAs(x, "1"),1,11) %in% c("1","1970-01-01T"), output="double[nanotime] into character[character]") + # ^ https://github.com/eddelbuettel/nanotime/issues/92; %in% so as not to break if nanotime adds as.character method test(10.94, coerceAs(1, x), x, output="double[numeric] into double[nanotime]") test(10.95, coerceAs(1L, x), x, output="integer[integer] into double[nanotime]") test(10.96, coerceAs("1", x), x, output="character[character] into double[nanotime]", warning="Coercing.*character") } options(datatable.verbose=FALSE) test(11.01, coerceAs(list(a=1), 1), error="is not atomic") -test(11.02, coerceAs(1, list(a=1)), error="is not atomic") +test(11.02, coerceAs(1, list(a=1)), list(1)) test(11.03, coerceAs(sum, 1), error="is not atomic") test(11.04, coerceAs(quote(1+1), 1), error="is not atomic") test(11.05, coerceAs(as.name("x"), 1), error="is not atomic") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 619ab67304..c349b99daf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14611,7 +14611,7 @@ if (test_bit64) { warning="-1.*integer64.*position 1 taken as 0 when assigning.*raw.*column 3 named 'c'") test(2005.66, DT[2:3, f:=as.integer64(c(NA,"2147483648"))]$f, as.complex(c(-42,NA,2147483648))) 
DT[,h:=LETTERS[1:3]] - test(2005.67, DT[2:3, h:=as.integer64(1:2)], error="To assign integer64 to.*type character, please use as.character.") + test(2005.67, DT[2:3, h:=as.integer64(1:2)]$h, c("A","1","2")) # PR#5189 } # rbindlist raw type, #2819 @@ -18222,3 +18222,30 @@ DT1 = data.table(id=1:3, grp=c('a', 'a', 'b'), value=4:6) DT2 = data.table(grp = c('a', 'b'), agg = list(c('1' = 4, '2' = 5), c('3' = 6))) test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) +# shift integer64 when fill isn't integer32, #4865 +testnum = 2218 +funs = c(as.integer, as.double, as.complex, as.character, if (test_bit64) as.integer64) +# when test_bit64==FALSE these all passed before; now passes with test_bit64==TRUE too +for (f1 in funs) { + DT = data.table(x=f1(1:4)) + for (f2 in funs) { + testnum = testnum + 0.01 + test(testnum, DT[, shift(x)], f1(c(NA, 1:3))) + testnum = testnum + 0.01 + w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" + test(testnum, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) + testnum = testnum + 0.01 + if (identical(f1,as.character) && identical(f2,as.complex)) { + # one special case due to as.complex(0)=="0+0i"!="0" + test(testnum, DT[, shift(x, fill="0")], f1(0:3)) + } else { + test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) + } + } +} + +# subassign coerce a class to character, part of PR#5189 +DT = data.table(A=letters[1:3]) +test(2219.1, DT[2, A:=as.IDate("2021-02-03")], data.table(A=c("a","2021-02-03","c"))) +if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data.table(A=c("a","2021-02-03","4611686018427387906"))) + diff --git a/src/assign.c b/src/assign.c index d0faf337c8..7fb09fa71e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -830,7 +830,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } } } else if (isString(source) && !isString(target) && !isNewList(target)) { - warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc); + warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), targetIsI64?"integer64":type2char(TYPEOF(target)), targetDesc); // this "Coercing ..." warning first to give context in case coerceVector warns 'NAs introduced by coercion' // and also because 'character' to integer/double coercion is often a user mistake (e.g. wrong target column, or wrong // variable on RHS) which they are more likely to appreciate than find inconvenient @@ -856,7 +856,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // inside BODY that cater for 'where' or not. Maybe there's a way to merge the two macros in future. 
// The idea is to do these range checks without calling coerceVector() (which allocates) -#define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO) {{ \ +#define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO, FMTVAL) {{ \ const STYPE *sd = (const STYPE *)RFUN(source); \ for (int i=0; i255, "d", "taken as 0") + case INTSXP: CHECK_RANGE(int, INTEGER, val<0 || val>255, "d", "taken as 0", val) case REALSXP: if (sourceIsI64) - CHECK_RANGE(int64_t, REAL, val<0 || val>255, PRId64, "taken as 0") - else CHECK_RANGE(double, REAL, !R_FINITE(val) || val<0.0 || val>256.0 || (int)val!=val, "f", "either truncated (precision lost) or taken as 0") + CHECK_RANGE(int64_t, REAL, val<0 || val>255, PRId64, "taken as 0", val) + else CHECK_RANGE(double, REAL, !R_FINITE(val) || val<0.0 || val>256.0 || (int)val!=val, "f", "either truncated (precision lost) or taken as 0", val) } break; case INTSXP: - if (TYPEOF(source)==REALSXP) { - if (sourceIsI64) - CHECK_RANGE(int64_t, REAL, val!=NA_INTEGER64 && (val<=NA_INTEGER || val>INT_MAX), PRId64, "out-of-range (NA)") - else CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)") + switch (TYPEOF(source)) { + case REALSXP: if (sourceIsI64) + CHECK_RANGE(int64_t, REAL, val!=NA_INTEGER64 && (val<=NA_INTEGER || val>INT_MAX), PRId64, "out-of-range (NA)", val) + else CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)", val) + case CPLXSXP: CHECK_RANGE(Rcomplex, COMPLEX, !((ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)) && + (ISNAN(val.r) || (R_FINITE(val.r) && (int)val.r==val.r))), "f", "either imaginary part discarded or real part truncated (precision lost)", val.r) } break; case REALSXP: - if (targetIsI64 && isReal(source) && !sourceIsI64) { - CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)") + switch (TYPEOF(source)) { + case REALSXP: if (targetIsI64 && !sourceIsI64) + CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)", val) + break; + case CPLXSXP: if (targetIsI64) + CHECK_RANGE(Rcomplex, COMPLEX, !((ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)) && + (ISNAN(val.r) || (R_FINITE(val.r) && (int64_t)val.r==val.r))), "f", "either imaginary part discarded or real part truncated (precision lost)", val.r) + else CHECK_RANGE(Rcomplex, COMPLEX, !(ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)), "f", "imaginary part discarded", val.i) } } } @@ -992,6 +1000,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (sourceIsI64) BODY(int64_t, REAL, int, (val==NA_INTEGER64||val>INT_MAX||val<=NA_INTEGER) ? NA_INTEGER : (int)val, td[i]=cval) else BODY(double, REAL, int, ISNAN(val) ? NA_INTEGER : (int)val, td[i]=cval) + case CPLXSXP: BODY(Rcomplex, COMPLEX, int, ISNAN(val.r) ? NA_INTEGER : (int)val.r, td[i]=cval) default: COERCE_ERROR("integer"); // test 2005.4 } } break; @@ -1008,6 +1017,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con memcpy(td, (int64_t *)REAL(source), slen*sizeof(int64_t)); break; } else BODY(int64_t, REAL, int64_t, val, td[i]=cval) } else BODY(double, REAL, int64_t, R_FINITE(val) ? val : NA_INTEGER64, td[i]=cval) + case CPLXSXP: BODY(Rcomplex, COMPLEX, int64_t, ISNAN(val.r) ? 
NA_INTEGER64 : (int64_t)val.r, td[i]=cval) default: COERCE_ERROR("integer64"); } } else { @@ -1022,6 +1032,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con memcpy(td, (double *)REAL(source), slen*sizeof(double)); break; } else BODY(double, REAL, double, val, td[i]=cval) } else BODY(int64_t, REAL, double, val==NA_INTEGER64 ? NA_REAL : val, td[i]=cval) + case CPLXSXP: BODY(Rcomplex, COMPLEX, double, val.r, td[i]=cval) default: COERCE_ERROR("double"); } } @@ -1060,9 +1071,13 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } break; } - if (sourceIsI64) - error(_("To assign integer64 to a target of type character, please use as.character() for clarity.")); // TODO: handle that here as well - source = PROTECT(coerceVector(source, STRSXP)); protecti++; + if (OBJECT(source) && getAttrib(source, R_ClassSymbol)!=R_NilValue) { + // otherwise coerceVector doesn't call the as.character method for Date, IDate, integer64, nanotime, etc; PR#5189 + // this if() is to save the overhead of the R call eval() when we know there can be no method + source = PROTECT(eval(PROTECT(lang2(sym_as_character, source)), R_GlobalEnv)); protecti+=2; + } else { + source = PROTECT(coerceVector(source, STRSXP)); protecti++; + } } BODY(SEXP, STRING_PTR, SEXP, val, SET_STRING_ELT(target, off+i, cval)) } diff --git a/src/data.table.h b/src/data.table.h index 6bafdc52af..a7f52b5e09 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -103,6 +103,7 @@ extern SEXP sym_datatable_locked; extern SEXP sym_tzone; extern SEXP sym_old_fread_datetime_character; extern SEXP sym_variable_table; +extern SEXP sym_as_character; extern double NA_INT64_D; extern long long NA_INT64_LL; extern Rcomplex NA_CPLX; // initialized in init.c; see there for comments diff --git a/src/init.c b/src/init.c index 0c1774508b..38b0de1e2f 100644 --- a/src/init.c +++ b/src/init.c @@ -36,6 +36,7 @@ SEXP sym_datatable_locked; SEXP sym_tzone; SEXP sym_old_fread_datetime_character; SEXP sym_variable_table; +SEXP sym_as_character; double NA_INT64_D; long long NA_INT64_LL; Rcomplex NA_CPLX; @@ -366,6 +367,7 @@ void attribute_visible R_init_data_table(DllInfo *info) sym_tzone = install("tzone"); sym_old_fread_datetime_character = install("datatable.old.fread.datetime.character"); sym_variable_table = install("variable_table"); + sym_as_character = install("as.character"); initDTthreads(); avoid_openmp_hang_within_fork(); diff --git a/src/shift.c b/src/shift.c index 11346648d5..dba598fe50 100644 --- a/src/shift.c +++ b/src/shift.c @@ -38,11 +38,10 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) SEXP elem = VECTOR_ELT(x, i); size_t size = SIZEOF(elem); R_xlen_t xrows = xlength(elem); + SEXP thisfill = PROTECT(coerceAs(fill, elem, ScalarLogical(0))); nprotect++; // #4865 use coerceAs for type coercion switch (TYPEOF(elem)) { case INTSXP : { - SEXP thisfill = PROTECT(coerceVector(fill, INTSXP)); const int ifill = INTEGER(thisfill)[0]; - UNPROTECT(1); for (int j=0; j Date: Fri, 8 Oct 2021 08:02:26 +0200 Subject: [PATCH 410/588] improve coverage for gfuns (#5192) --- inst/tests/tests.Rraw | 13 +++++++++++++ src/gsumm.c | 15 +++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c349b99daf..6255f4f843 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18249,3 +18249,16 @@ DT = data.table(A=letters[1:3]) test(2219.1, DT[2, A:=as.IDate("2021-02-03")], data.table(A=c("a","2021-02-03","c"))) if (test_bit64) 
test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data.table(A=c("a","2021-02-03","4611686018427387906"))) +# gforce improve coverage +DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) +options(datatable.optimize = 2L) +funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") +testnum = 2220 +for (fun in funs) { + testnum = testnum + 0.01 + test(testnum, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + testnum = testnum + 0.01 + test(testnum, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) +} +test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") + diff --git a/src/gsumm.c b/src/gsumm.c index be3b0f7855..5bb2620243 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -348,8 +348,7 @@ SEXP gsum(SEXP x, SEXP narmArg) double started = wallclock(); const bool verbose=GetVerbose(); if (verbose) Rprintf(_("This gsum (narm=%s) took ... "), narm?"TRUE":"FALSE"); - if (nrow != n) - error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gsum"); bool anyNA=false; SEXP ans; switch(TYPEOF(x)) { @@ -729,8 +728,7 @@ static SEXP gminmax(SEXP x, SEXP narm, const bool min) const int n = nosubset ? length(x) : irowslen; //clock_t start = clock(); SEXP ans; - if (nrow != n) - error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gminmax"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gminmax"); // GForce guarantees each group has at least one value; i.e. we don't need to consider length-0 per group here switch(TYPEOF(x)) { case LGLSXP: case INTSXP: { @@ -865,8 +863,7 @@ SEXP gmedian(SEXP x, SEXP narmArg) { error(_("%s is not meaningful for factors."), "median"); const bool isInt64 = INHERITS(x, char_integer64), narm = LOGICAL(narmArg)[0]; const int n = (irowslen == -1) ? length(x) : irowslen; - if (nrow != n) - error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmedian"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmedian"); SEXP ans = PROTECT(allocVector(REALSXP, ngrp)); double *ansd = REAL(ans); const bool nosubset = irowslen==-1; @@ -1021,8 +1018,7 @@ static SEXP gvarsd1(SEXP x, SEXP narmArg, bool isSD) if (inherits(x, "factor")) error(_("%s is not meaningful for factors."), isSD ? "sd" : "var"); const int n = (irowslen == -1) ? length(x) : irowslen; - if (nrow != n) - error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gvar"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gvar"); SEXP sub, ans = PROTECT(allocVector(REALSXP, ngrp)); double *ansd = REAL(ans); const bool nosubset = irowslen==-1; @@ -1119,8 +1115,7 @@ SEXP gprod(SEXP x, SEXP narmArg) { const int n = nosubset ? 
length(x) : irowslen; //clock_t start = clock(); SEXP ans; - if (nrow != n) - error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); long double *s = malloc(ngrp * sizeof(long double)); if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); for (int i=0; i Date: Fri, 8 Oct 2021 00:50:04 -0700 Subject: [PATCH 411/588] exclude `...` from tables() search (#5199) --- NEWS.md | 2 ++ R/tables.R | 16 ++++++++++------ inst/tests/tests.Rraw | 6 +++++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index d0a8e8f5f7..cc98ebbe31 100644 --- a/NEWS.md +++ b/NEWS.md @@ -396,6 +396,8 @@ # 2: 2021-02-03 # was 18661 # 3: 4611686018427387906 # was error 'please use as.character' ``` + +47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. ## NOTES diff --git a/R/tables.R b/R/tables.R index 99c59f0c4d..adb82066b5 100644 --- a/R/tables.R +++ b/R/tables.R @@ -2,12 +2,13 @@ MB = NCOL = NROW = NULL tables = function(mb=TRUE, order.col="NAME", width=80, - env=parent.frame(), silent=FALSE, index=FALSE) + env=parent.frame(), silent=FALSE, index=FALSE) { # Prints name, size and colnames of all data.tables in the calling environment by default - all_obj = objects(envir=env, all.names=TRUE) - is_DT = which(vapply_1b(all_obj, function(x) is.data.table(get(x, envir=env)))) - if (!length(is_DT)) { + # include "hidden" objects (starting with .) via all.names=TRUE, but exclude ... specifically, #5197 + all_obj = grep("...", objects(envir=env, all.names=TRUE, sorted=order.col == "NAME"), invert=TRUE, fixed=TRUE, value=TRUE) + is_DT = vapply_1b(mget(all_obj, envir=env), is.data.table) + if (!any(is_DT)) { if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } @@ -23,8 +24,11 @@ tables = function(mb=TRUE, order.col="NAME", width=80, KEY = list(key(DT)), INDICES = if (index) list(indices(DT))) })) - if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) - info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names + # objects() above handled the sorting for order.col=="NAME" + if (order.col != "NAME") { + if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) + info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names + } if (!silent) { # prettier printing on console pretty_format = function(x, width) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6255f4f843..4ede2544c4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7490,7 +7490,8 @@ test(1536, duplicated(dt, incomparables=TRUE), error = "argument 'incomparables test(1537 , names(melt(dt, id.vars=1L, variable.name = "x", value.name="x")), c("x", "x.1", "x.2"), output = "Duplicate column names") # test for tables() -test(1538, tables(), output = "Total:") +test(1538.1, tables(), output = "Total:") +test(1538.2, !is.unsorted(tables(order.col="NROW")$NROW)) # uniqueN not support list-of-list: reverted #1224 d1 <- data.table(a = 1:4, l = 
list(list(letters[1:2]),list(Sys.time()),list(1:10),list(letters[1:2]))) @@ -18262,3 +18263,6 @@ for (fun in funs) { } test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +# tables() error when called from inside a function(...), #5197 +test(2221, (function(...) tables())(), output = "No objects of class data.table exist") + From a7e04bd48407c402b1cdef2dcd5d0d0d47187056 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 8 Oct 2021 05:49:42 -0600 Subject: [PATCH 412/588] #5199 follow-up to pass R 3.1.0 and to suppress new test 1538.2 output --- R/tables.R | 4 ++-- inst/tests/tests.Rraw | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/tables.R b/R/tables.R index adb82066b5..5196935eda 100644 --- a/R/tables.R +++ b/R/tables.R @@ -6,7 +6,8 @@ tables = function(mb=TRUE, order.col="NAME", width=80, { # Prints name, size and colnames of all data.tables in the calling environment by default # include "hidden" objects (starting with .) via all.names=TRUE, but exclude ... specifically, #5197 - all_obj = grep("...", objects(envir=env, all.names=TRUE, sorted=order.col == "NAME"), invert=TRUE, fixed=TRUE, value=TRUE) + all_obj = grep("...", ls(envir=env, all.names=TRUE), invert=TRUE, fixed=TRUE, value=TRUE) + if (order.col=="NAME") all_obj=sort(all_obj) # neither ls() nor objects() had sorted arg in R 3.1.0 is_DT = vapply_1b(mget(all_obj, envir=env), is.data.table) if (!any(is_DT)) { if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) @@ -24,7 +25,6 @@ tables = function(mb=TRUE, order.col="NAME", width=80, KEY = list(key(DT)), INDICES = if (index) list(indices(DT))) })) - # objects() above handled the sorting for order.col=="NAME" if (order.col != "NAME") { if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4ede2544c4..6bb1def697 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7490,8 +7490,8 @@ test(1536, duplicated(dt, incomparables=TRUE), error = "argument 'incomparables test(1537 , names(melt(dt, id.vars=1L, variable.name = "x", value.name="x")), c("x", "x.1", "x.2"), output = "Duplicate column names") # test for tables() -test(1538.1, tables(), output = "Total:") -test(1538.2, !is.unsorted(tables(order.col="NROW")$NROW)) +test(1538.1, tables(), output="Total:") +test(1538.2, !is.unsorted(tables(order.col="NROW")$NROW), output="Total:") # uniqueN not support list-of-list: reverted #1224 d1 <- data.table(a = 1:4, l = list(list(letters[1:2]),list(Sys.time()),list(1:10),list(letters[1:2]))) From 96cdef6d03f3fb296778cc7a5550f704d011e083 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 8 Oct 2021 06:01:03 -0600 Subject: [PATCH 413/588] vignette title mismatch thanks to warning in GLCI integration log --- vignettes/datatable-faq.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index cf358724ba..4b0645e6b6 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -6,7 +6,7 @@ output: toc: true number_sections: true vignette: > - %\VignetteIndexEntry{Frequently asked questions} + %\VignetteIndexEntry{Frequently Asked Questions about data.table} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- From 
086fb7d75d9817f7369abc48229eef4665e5dd66 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sat, 9 Oct 2021 14:54:03 -0600 Subject: [PATCH 414/588] .dev-only: cran() result ordered the same as status() --- .dev/revdep.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index b8c17cc65d..8747d287e1 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -279,7 +279,7 @@ cran = function() # reports CRAN status of the .cran.fail packages keyby=Package] ans[local==cran, c("cran","local"):=""] ans[, "right_click_in_bash":=paste0("https://cran.r-project.org/web/checks/check_results_",Package,".html")] - ans[] + setkey(ans, Package)[.fail.cran,] } run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { From de7c29aa536b3da0f71ac741709d1ffab594a3ce Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sat, 9 Oct 2021 15:14:34 -0600 Subject: [PATCH 415/588] .dev-only: 20min timeout added to revdep.R --- .dev/revdep.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 8747d287e1..46f9e2a376 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -327,7 +327,7 @@ run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { } if (!identical(pkgs,"_ALL_")) for (i in pkgs) system(paste0("rm -rf ./",i,".Rcheck")) SUGG = paste0("_R_CHECK_FORCE_SUGGESTS_=",tolower(R_CHECK_FORCE_SUGGESTS)) - cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 ",SUGG," parallel --max-procs 50% ",R," CMD check") + cmd = paste0("ls -1 *.tar.gz ", filter, "| TZ='UTC' OMP_THREAD_LIMIT=2 ",SUGG," parallel --max-procs 50% --timeout 1200 ",R," CMD check") # TZ='UTC' because some packages have failed locally for me but not on CRAN or for their maintainer, due to sensitivity of tests to timezone if (as.integer(system("ps -e | grep perfbar | wc -l", intern=TRUE)) < 1) system("perfbar",wait=FALSE) system("touch /tmp/started.flag ; rm -f /tmp/finished.flag") From 92827d92a426d0b974d0c9d4fd9b2627b15263a0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 10 Oct 2021 00:17:13 -0700 Subject: [PATCH 416/588] run plot() in tempdir() (#5190) --- R/test.data.table.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index b64dfe119d..2e8aca38ea 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -116,6 +116,9 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("filename", fn, envir=env) assign("inittime", as.integer(Sys.time()), envir=env) # keep measures from various test.data.table runs assign("showProgress", showProgress, envir=env) + + owd = setwd(tempdir()) # ensure writeable directory; e.g. 
tests that plot may write .pdf here depending on device option and/or batch mode; #5190 + on.exit(setwd(owd)) err = try(sys.source(fn, envir=env), silent=silent) From a41ae4a7759386f6f3994adf2bdd102dd059a667 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 11 Oct 2021 13:03:46 -0600 Subject: [PATCH 417/588] restore supporting data.table not inheriting from data.frame (#5210) --- R/data.table.R | 4 +++- inst/tests/tests.Rraw | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 9c36b10bf8..91f7b52110 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2527,8 +2527,10 @@ copy = function(x) { } shallow = function(x, cols=NULL) { - if (!is.data.frame(x)) + if (!is.data.frame(x) && !is.data.table(x)) { + # ^^ some revdeps do class(x)="data.table" without inheriting from data.frame, PR#5210 stopf("x is not a data.table|frame. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table|frame") + } ans = .shallow(x, cols=cols, retain.key=selfrefok(x)) # selfrefok for #5042 ans } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6bb1def697..c814b9f397 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -640,7 +640,7 @@ test(211, ncol(TESTDT), 2L) DT = data.table(a=1:6,key="a") test(212, DT[J(3)]$a, 3L) # correct class c("data.table","data.frame") class(DT) = "data.table" # incorrect class, but as from 1.8.1 it works. By accident when moving from colnames() to names(), it was dimnames() doing the check, but rather than add a check that identical(class(DT),c("data.frame","data.table")) at the top of [.data.table, we'll leave it flexible to user (user might not want to inherit from data.frame for some reason). -test(213, DT[J(3)]$a, error="x is not a data.table|frame") # from v1.14.2, data.table must inherit from data.frame (internals are too hard to reason if a data.table may not be data.frame too) +test(213, DT[J(3)]$a, 3L) # setkey now auto coerces double and character for convenience, and # to solve bug #953 @@ -18266,3 +18266,8 @@ test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to c # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") +# some revdeps do class(x)="data.table" without inheriting from data.frame, PR#5210 +DT = data.table(A=1:3) +class(DT) = "data.table" +test(2222, print(DT), output="A.*3") + From 36279f4b6d41a226de779e14fe7e723db615efd0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 11 Oct 2021 14:53:19 -0600 Subject: [PATCH 418/588] retain nomatch=FALSE for backwards compatibility (#5214) --- R/data.table.R | 4 ++-- inst/tests/tests.Rraw | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 91f7b52110..3fe2ba8e1f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -221,8 +221,8 @@ replace_dot_alias = function(e) { # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one. 
.unsafe.opt() #3585 missingnomatch = missing(nomatch) - nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353 - if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857 + nomatch0 = identical(nomatch,0) || identical(nomatch,0L) || identical(nomatch, FALSE) # for warning with row-numbers in i; #4353 + if (nomatch0) nomatch=NULL # retain nomatch=0|FALSE backwards compatibility, #857 #5214 if (!(is.null(nomatch) || (length(nomatch)==1L && is.na(nomatch)))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c814b9f397..18fb5c82a5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18271,3 +18271,7 @@ DT = data.table(A=1:3) class(DT) = "data.table" test(2222, print(DT), output="A.*3") +# retain nomatch=FALSE backwards compatibility, #5214 +DT = data.table(A=1:3, key="A") +test(2223, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) + From 4e7804901f0aa7110732c27fa7c19220836f40a2 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 12 Oct 2021 02:21:39 -0600 Subject: [PATCH 419/588] restore nomatch=NA_character_ (#5216) --- R/data.table.R | 5 ++++- inst/tests/tests.Rraw | 5 +++-- src/bmerge.c | 10 +++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 3fe2ba8e1f..b8c1132f6a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -223,7 +223,10 @@ replace_dot_alias = function(e) { missingnomatch = missing(nomatch) nomatch0 = identical(nomatch,0) || identical(nomatch,0L) || identical(nomatch, FALSE) # for warning with row-numbers in i; #4353 if (nomatch0) nomatch=NULL # retain nomatch=0|FALSE backwards compatibility, #857 #5214 - if (!(is.null(nomatch) || (length(nomatch)==1L && is.na(nomatch)))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") + if (!is.null(nomatch)) { + if (!(length(nomatch)==1L && is.na(nomatch))) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") + nomatch=NA # convert NA_character_ to NA-logical, PR#5216 + } if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. 
Please change or remove either which or nomatch.") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 18fb5c82a5..ef4ed3a614 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18271,7 +18271,8 @@ DT = data.table(A=1:3) class(DT) = "data.table" test(2222, print(DT), output="A.*3") -# retain nomatch=FALSE backwards compatibility, #5214 +# retain nomatch=FALSE backwards compatibility #5214, and nomatch=NA_character_ PR#5216 DT = data.table(A=1:3, key="A") -test(2223, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) +test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) +test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) diff --git a/src/bmerge.c b/src/bmerge.c index 44ac7b569c..1011c84aab 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -85,7 +85,15 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP error(_("rollends must be a length 2 logical vector")); rollends = LOGICAL(rollendsArg); - nomatch = isNull(nomatchArg) ? 0 : INTEGER(nomatchArg)[0]; + if (isNull(nomatchArg)) { + nomatch=0; + } else { + if (length(nomatchArg)!=1 || (!isLogical(nomatchArg) && !isInteger(nomatchArg))) + error(_("Internal error: nomatchArg must be NULL or length-1 logical/integer")); // # nocov + nomatch = INTEGER(nomatchArg)[0]; + if (nomatch!=NA_INTEGER && nomatch!=0) + error(_("Internal error: nomatchArg must be NULL, NA, NA_integer_ or 0L")); // # nocov + } // mult arg if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; From b78483c2c9b523c6638cdff86da9f4efd33b1be9 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 12 Oct 2021 02:44:04 -0600 Subject: [PATCH 420/588] .dev-only: include running (i.e. killed) revdeps in cran() output --- .dev/revdep.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 46f9e2a376..6f4b1f11e2 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -200,7 +200,9 @@ status0 = function(bioc=FALSE) { if (length(ns)) paste0("NOT STARTED : ",paste(sort(names(x)[head(ns,20)]),collapse=" "), if(length(ns)>20)paste(" +",length(ns)-20,"more"), "\n"), "\n" ) - assign(if (bioc) ".fail.bioc" else ".fail.cran", c(sort(names(x)[e]), sort(names(x)[w])), envir=.GlobalEnv) + assign(if (bioc) ".fail.bioc" else ".fail.cran", c(sort(names(x)[e]), sort(names(x)[w])), envir=.GlobalEnv) + assign(if (bioc) ".running.bioc" else ".running.cran", sort(names(x)[r]), envir=.GlobalEnv) + # if parallel finished then 'running' means killed; we want to see if status on CRAN (using cran()) shows FAIL with a log showing kill signal (or similar) due to taking too long invisible() } @@ -263,15 +265,16 @@ status = function(bioc=FALSE) { cran = function() # reports CRAN status of the .cran.fail packages { - if (!length(.fail.cran)) { - cat("No CRAN revdeps in error or warning status\n") + x = c(.fail.cran, .running.cran) + if (!length(x)) { + cat("No CRAN revdeps in error, warning or running status\n") return(invisible()) } require(data.table) p = proc.time() db <<- setDT(tools::CRAN_check_results()) cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") - ans = db[Package %chin% .fail.cran, + ans = db[Package %chin% x, .(ERROR=sum(Status=="ERROR", na.rm=TRUE), WARN =sum(Status=="WARN", na.rm=TRUE), cran =paste(unique(Version),collapse=";"), @@ -279,7 +282,7 @@ cran = function() # reports CRAN status of the .cran.fail packages keyby=Package] ans[local==cran, c("cran","local"):=""] ans[, 
"right_click_in_bash":=paste0("https://cran.r-project.org/web/checks/check_results_",Package,".html")] - setkey(ans, Package)[.fail.cran,] + setkey(ans, Package)[x,] } run = function(pkgs=NULL, R_CHECK_FORCE_SUGGESTS=TRUE, choose=NULL) { From d89c1d6cd8f5b20dd8be55eb78b2aa07a16475e2 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 12 Oct 2021 10:14:37 -0700 Subject: [PATCH 421/588] Internal: fmelt.c clean up checkVars() (#5215) --- src/fmelt.c | 161 ++++++++++++++++++++++------------------------------ 1 file changed, 69 insertions(+), 92 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index c40fb1d7ce..a09bec56c7 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -138,128 +138,105 @@ static SEXP unlist_(SEXP xint) { return(ans); } -bool invalid_measure(int i, int ncol) { - return (i<=0 && i!=NA_INTEGER) || i>ncol; +bool is_default_measure(SEXP vec) { + return (isInteger(vec) || isNumeric(vec) || isLogical(vec)) && !isFactor(vec); +} + +// maybe unlist, then unique, then set_diff. +SEXP uniq_diff(SEXP int_or_list, int ncol, bool is_measure) { + SEXP int_vec = PROTECT(isNewList(int_or_list) ? unlist_(int_or_list) : int_or_list); + SEXP is_duplicated = PROTECT(duplicated(int_vec, FALSE)); + int n_unique_cols = 0; + for (int i=0; i ncol) - error(_("One or more values in 'id.vars' is invalid.")); - else if (!LOGICAL(booltmp)[i]) targetcols++; - else continue; - } - unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; - u = 0; - for (int i=0; i ncol) - error(_("One or more values in 'id.vars' is invalid.")); - } - idcols = PROTECT(tmp); protecti++; - switch(TYPEOF(measure)) { - case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0)); protecti++; break; - case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; - case INTSXP : tmp2 = measure; break; - case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; - default : error(_("Unknown 'measure.vars' type %s, must be character or integer vector"), type2char(TYPEOF(measure))); - } - tmp = tmp2; - if (isNewList(measure)) { - tmp = PROTECT(unlist_(tmp2)); protecti++; - } - for (int i=0; i Date: Tue, 12 Oct 2021 10:27:19 -0700 Subject: [PATCH 422/588] Update datatable-reshape.Rmd (#5212) Fix typos in reshape vignette. --- vignettes/datatable-reshape.Rmd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index 9c55cdbd0a..3f94392fc6 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -110,7 +110,7 @@ DT.m1 In the previous section, we saw how to get from wide form to long form. Let's see the reverse operation in this section. -#### - How can we get back to the original data table `DT` from `DT.m`? +#### - How can we get back to the original data table `DT` from `DT.m1`? That is, we'd like to collect all *child* observations corresponding to each `family_id, age_mother` together under the same row. We can accomplish it using `dcast` as follows: @@ -126,7 +126,7 @@ dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob") * `dcast` also tries to preserve attributes in result wherever possible. -#### - Starting from `DT.m`, how can we get the number of children in each family? +#### - Starting from `DT.m1`, how can we get the number of children in each family? You can also pass a function to aggregate by in `dcast` with the argument `fun.aggregate`. This is particularly essential when the formula provided does not identify single observation for each cell. 
@@ -327,4 +327,3 @@ You can also provide *multiple functions* to `fun.aggregate` to `dcast` for *dat # *** - From 47c0e51d159e9d47461ebab3aeced37072e65fdb Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 14 Oct 2021 18:22:52 -0700 Subject: [PATCH 423/588] Helpful error message for melt(measure.vars=list with unknown column) (#5211) --- inst/tests/tests.Rraw | 8 +++++++- src/fmelt.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ef4ed3a614..471bac6ad5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3186,6 +3186,12 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.25, melt(dt, id.vars=NULL, measure.vars=-1), error="One or more values in 'measure.vars'") test(1035.26, melt(dt, id.vars=5, measure.vars=-1), error="One or more values in 'id.vars'") test(1035.27, melt(dt, id.vars=1, measure.vars=-1), error="One or more values in 'measure.vars'") + test(1035.28, melt(dt, measure.vars=list("a")), error="One or more values in 'measure.vars'") + test(1035.29, melt(dt, measure.vars=NA_integer_, id.vars="y"), error="One or more values in 'measure.vars'") + test(1035.291, melt(dt, measure.vars=NA_integer_, id.vars=NULL), error="One or more values in 'measure.vars'") + test(1035.30, melt(dt, id.vars=NA_integer_), error="One or more values in 'id.vars'") + test(1035.31, melt(dt, measure.vars=NA_character_), error="One or more values in 'measure.vars'") + test(1035.32, melt(dt, id.vars=NA_character_), error="One or more values in 'id.vars'") if (test_R.utils) { # dup names in variable used to generate malformed factor error and/or segfault, #1754; was test 1570 @@ -18239,7 +18245,7 @@ for (f1 in funs) { if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" test(testnum, DT[, shift(x, fill="0")], f1(0:3)) - } else { + } else { test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) } } diff --git a/src/fmelt.c b/src/fmelt.c index a09bec56c7..9990da2fcd 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -97,6 +97,23 @@ static const char *concat(SEXP vec, SEXP idx) { return ans; } +// input: character vector of column names (maybe missing), output: +// integer vector of column indices with NA_INTEGER in the positions +// with missing inputs, and -1 in the positions with column names not +// found. Column names not found will eventually cause error via +// uniq_diff(). +SEXP chmatch_na(SEXP x, SEXP table){ + SEXP ans; + PROTECT(ans = chmatch(x, table, -1)); + for(int i=0; i Date: Fri, 15 Oct 2021 01:38:13 -0700 Subject: [PATCH 424/588] remove experimental tag from IDateTime (#5223) --- man/IDateTime.Rd | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index 03e464c360..876b28b161 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -46,8 +46,20 @@ \title{ Integer based date class } \description{ - Date and time classes with integer storage for fast sorting and - grouping. Still experimental! + Classes (\code{IDate} and \code{ITime}) with \emph{integer} storage + for fast sorting and grouping. + + \code{IDate} inherits from the base class \code{Date}; the main + difference is that the latter uses double storage, allowing e.g. for + fractional dates at the cost of storage & sorting inefficiency. 
+ + Using \code{IDate}, if sub-day granularity is needed, use a second + \code{ITime} column. \code{IDateTime()} facilitates building such + paired columns. + + Lastly, there are date-time helpers for extracting parts of dates as + integers, for example the year (\code{year()}), month + (\code{month()}), or day in the month (\code{mday()}); see Usage and Exampels. } \usage{ as.IDate(x, \dots) @@ -92,7 +104,9 @@ year(x) \item{tz}{time zone (see \code{strptime}).} \item{date}{date object convertible with \code{as.IDate}.} \item{time}{time-of-day object convertible with \code{as.ITime}.} - \item{digits}{really \code{units}; one of the units listed for rounding. May be abbreviated.} + \item{digits}{really \code{units}; one of the units listed for + rounding. May be abbreviated. Named \code{digits} for consistency with + the S3 generic.} \item{units}{one of the units listed for truncating. May be abbreviated.} \item{ms}{ For \code{as.ITime} methods, what should be done with sub-second fractions of input? Valid values are \code{'truncate'} (floor), \code{'nearest'} (round), and \code{'ceil'} (ceiling). See Details. } } @@ -100,7 +114,13 @@ year(x) \code{IDate} is a date class derived from \code{Date}. It has the same internal representation as the \code{Date} class, except the storage mode is integer. \code{IDate} is a relatively simple wrapper, and it -should work in almost all situations as a replacement for \code{Date}. +should work in almost all situations as a replacement for +\code{Date}. The main limitations of integer storage are (1) fractional + dates are not supported (use \code{IDateTime()} instead) and (2) the + range of supported dates is bounded by \code{.Machine$integer.max} + dates away from January 1, 1970 (a rather impractical limitation as + these dates are roughly 6 million years in the future/past, but + consider this your caveat). Functions that use \code{Date} objects generally work for \code{IDate} objects. This package provides specific methods for @@ -113,11 +133,10 @@ hours. Because \code{ITime} is stored in seconds, you can add it to a \code{POSIXct} object, but you should not add it to a \code{Date} object. -Conversions to and from \code{Date} and \code{POSIXct} formats are provided. +We also provide S3 methods to convert to and from \code{Date} and \code{POSIXct}. -\code{ITime} does not account for time zones. When converting -\code{ITime} and \code{IDate} to POSIXct with \code{as.POSIXct}, a time -zone may be specified. +\code{ITime} is time zone-agnostic. When converting \code{ITime} and +\code{IDate} to POSIXct with \code{as.POSIXct}, a time zone may be specified. Inputs like \code{'2018-05-15 12:34:56.789'} are ambiguous from the perspective of an \code{ITime} object -- the method of coercion of the 789 milliseconds is controlled by the \code{ms} argument to relevant methods. The default behavior (\code{ms = 'truncate'}) is to use \code{as.integer}, which has the effect of truncating anything after the decimal. Alternatives are to round to the nearest integer (\code{ms = 'nearest'}) or to round up (\code{ms = 'ceil'}). 
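A minimal sketch of the paired IDate/ITime workflow that the IDateTime.Rd text in PATCH 424 above describes; it is illustrative only (not part of the patch), assumes data.table is attached, and the object names `x` and `dt` are invented for the example.

```r
library(data.table)
x = as.POSIXct("2021-10-15 09:30:00", tz="UTC")
dt = IDateTime(x)   # one row, two integer-backed columns: idate (IDate) and itime (ITime)
dt
c(year(dt$idate), month(dt$idate), mday(dt$idate))   # 2021 10 15
```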
From d9da63edc4950033676aefa75a1510f331ea0102 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 15 Oct 2021 01:51:35 -0700 Subject: [PATCH 425/588] use seq(), not seq.Date() in tests (#5222) --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 471bac6ad5..05c39eafde 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9956,7 +9956,7 @@ test(1658.34, fwrite(data.table(id=c("A","B","C"), v=c(1.1,0.0,9.9))), output="i test(1658.35, fwrite(data.table(id=1:3,bool=c(TRUE,NA,FALSE)),na="NA",logical01=TRUE), output="\"id\",\"bool\"\n1,1\n2,NA\n3,0") # POSIXct -test(1658.36, fwrite(data.table(D = as.POSIXct(seq.Date(as.Date("2038-01-19"), as.Date("2038-01-20"), by = "day")))), +test(1658.36, fwrite(data.table(D = as.POSIXct(seq(as.Date("2038-01-19"), as.Date("2038-01-20"), by = "day")))), output="D\n2038-01-19T00:00:00Z\n2038-01-20T00:00:00Z") # input is of class matrix @@ -10883,13 +10883,13 @@ test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,29328 if (FALSE) { # Full range takes too long for CRAN. - dts = seq.Date(as.Date("0000-03-01"),as.Date("9999-12-31"),by="day") + dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") dtsCh = as.character(dts) # 36s dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) } else { # test on CRAN a reduced but important range - dts = seq.Date(as.Date("1899-12-31"),as.Date("2100-01-01"),by="day") + dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") dtsCh = as.character(dts) test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) } From 1b4f6c5d955d27792d75f8a67b54c2e65d3b0fcb Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 15 Oct 2021 01:54:07 -0700 Subject: [PATCH 426/588] suggest running bug reports with verbose=TRUE (#5195) --- .../issue_template.md} | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) rename .github/{ISSUE_TEMPLATE.md => ISSUE_TEMPLATE/issue_template.md} (86%) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/issue_template.md similarity index 86% rename from .github/ISSUE_TEMPLATE.md rename to .github/ISSUE_TEMPLATE/issue_template.md index 3facaa4dea..09857e4e6f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -1,3 +1,8 @@ +--- +name: Bug report or feature request +about: Report a bug or describe a new requested feature +--- + Click preview tab ^^^ above! By continuing to file this new issue / feature request, I confirm I have : @@ -10,6 +15,6 @@ By continuing to file this new issue / feature request, I confirm I have : #### Thanks! Please remove the text above and include the two items below. -`#` [`Minimal reproducible example`](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) +`#` [`Minimal reproducible example`](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example); please be sure to set `verbose=TRUE` where possible! 
`#` `Output of sessionInfo()` From 6265d3093f2a55affa5a2a8004f4dd05520df43f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 15 Oct 2021 11:00:27 +0200 Subject: [PATCH 427/588] add feature (#5196) --- NEWS.md | 1 + inst/tests/tests.Rraw | 8 ++++++-- src/fread.c | 5 +++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index cc98ebbe31..8829b856fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -155,6 +155,7 @@ # [1] 5 1 2 3 4 ``` +28. `fread()` now also supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 05c39eafde..9867e133b5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7855,10 +7855,14 @@ read_table = function(str, ...) { test(1552.1, fread(str, na.strings="#N/A"), read_table(str, na.strings="#N/A")) test(1552.2, fread(str, na.strings=c("#N/A", "-999")), read_table(str, na.strings=c("#N/A", "-999"))) test(1552.3, fread(str, na.strings=c("#N/A", "-999", "+1")), read_table(str, na.strings=c("#N/A", "-999", "+1"))) -test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")), - error="NAstring <<1>> is recognized as type boolean.*not permitted") +test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")), read_table(str, na.strings=c("#N/A", "-999", "+1", "1"))) # enabled by FR #2927 test(1552.5, fread(str, na.strings=c("#N/A", "-999", "FALSE")), error="NAstring <>.*boolean.*not permitted") test(1552.6, fread("A\n1.0\n2\n-", na.strings=c("-")), data.table(A=c(1.0, 2.0, NA))) +test(1552.7, fread(str, na.strings=c("#N/A", "-999", "+1", "1"), logical01=TRUE), + error="NAstring <<1>> and logical01=TRUE.*not permitted") +str = "a,b,c\n0,1,2\n1,0,2" +test(1552.8, fread(str, na.strings = "0"), data.table(a=c(NA,1L), b=c(1L,NA), c=c(2L,2L))) +test(1552.9, fread(str, na.strings = c("0","1")), data.table(a=c(NA,NA), b=c(NA,NA), c=c(2L,2L))) # FR #1177: 'quote' option of 'print.data.table' DT1 <- data.table(s1=paste(" ",LETTERS[1:5],sep=""),s2=LETTERS[1:5]) diff --git a/src/fread.c b/src/fread.c index e0a32d3e14..be50976849 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1287,9 +1287,10 @@ int freadMain(freadMainArgs _args) { STOP(_("freadMain: NAstring <<%s>> has whitespace at the beginning or end"), ch); if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || - strcmp(ch,"True")==0 || strcmp(ch,"False")==0 || - strcmp(ch,"1")==0 || strcmp(ch,"0")==0) + strcmp(ch,"True")==0 || strcmp(ch,"False")==0) STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); + if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) + 
STOP(_("freadMain: NAstring <<%s>> and logical01=%s, this is not permitted."), ch, args.logical01 ? "TRUE" : "FALSE"); char *end; errno = 0; (void)strtod(ch, &end); // careful not to let "" get to here (see continue above) as strtod considers "" numeric From 06958dd67f78405459457f1f6d688818e01f21d3 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 15 Oct 2021 11:34:07 +0200 Subject: [PATCH 428/588] Raw support setkey as column (not key) (#5180) --- NEWS.md | 5 ++++- inst/tests/tests.Rraw | 12 +++++++++--- src/reorder.c | 14 ++++++++++---- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8829b856fb..211cd7110f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -155,7 +155,10 @@ # [1] 5 1 2 3 4 ``` -28. `fread()` now also supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. +28. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. + +29. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
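A minimal sketch of the behaviour described in NEWS item 29 just above (PATCH 428): keying a table that contains a raw value column. It mirrors the tests added later in this patch and is not itself part of the diff; `DT`, `id` and `payload` are example names.

```r
library(data.table)
DT = data.table(id=2:1, payload=as.raw(0:1))
setkey(DT, id)            # previously errored ("Item 2 of list is type 'raw'"); now sorts by 'id'
DT                        # id=1:2, payload=as.raw(1:0)
DT[, payload[.N], by=id]  # raw values can also be returned per group
```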
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9867e133b5..927babad7c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15952,11 +15952,17 @@ test(2067.8, shift(list(z, 1L:3L), n=-1, type = 'cyclic'), list(c(z[2:3], z[1]), DT = data.table(a = 2:1, z = complex(0, 0:1)) test(2068.1, setkey(copy(DT), a), data.table(a=1:2, z=complex(0, 1:0), key='a')) test(2068.2, DT[ , abs(z), by=a], data.table(a=2:1, V1=c(0, 1))) -# raw continues not to be supported +# support for ordering tables with raw columns, #5100 DT = data.table(ID=2:1, r=as.raw(0:1)) -test(2068.3, setkey(DT, ID), error="Item 2 of list is type 'raw'") +test(2068.3, setkey(copy(DT), ID), data.table(ID=1:2, r=as.raw(1:0), key='ID')) +DT = data.table(x=c(1, 2, 1), y=raw(3)) +test(2068.4, setkey(copy(DT), x), data.table(x=c(1,1,2), y=raw(3), key='x')) +test(2068.5, DT[, y[.N], x], data.table(x=c(1,2), V1=raw(2))) +# expression continue to be not supported +DT = data.table(ID=2:1, r=expression(1, 2)) +test(2068.6, setkey(DT, ID), error="Item 2 of list is type 'expression'") # setreordervec triggers !isNewList branch for coverage -test(2068.4, setreordervec(DT$r, order(DT$ID)), error="reorder accepts vectors but this non-VECSXP") +test(2068.7, setreordervec(DT$r, order(DT$ID)), error="reorder accepts vectors but this non-VECSXP") # forderv (and downstream functions) handles complex vector input, part of #3690 DT = data.table( diff --git a/src/reorder.c b/src/reorder.c index c2deea8ae9..debdb02172 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -13,7 +13,7 @@ SEXP reorder(SEXP x, SEXP order) ncol = length(x); for (int i=0; i Date: Fri, 15 Oct 2021 11:40:55 +0200 Subject: [PATCH 429/588] shift cyclic: add benchmark to NEWS (#5177) --- NEWS.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/NEWS.md b/NEWS.md index 211cd7110f..afbf154d71 100644 --- a/NEWS.md +++ b/NEWS.md @@ -144,6 +144,7 @@ 27. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. ```R + # Usage shift(1:5, n=-1:1, type="cyclic") # [[1]] # [1] 2 3 4 5 1 @@ -153,6 +154,19 @@ # # [[3]] # [1] 5 1 2 3 4 + + # Benchmark + x = sample(1e9) # 3.7 GB + microbenchmark::microbenchmark( + shift(x, 1, type="cyclic"), + c(tail(x, 1), head(x,-1)), + times = 10L, + unit = "s" + ) + # Unit: seconds + # expr min lq mean median uq max neval + # shift(x, 1, type = "cyclic") 1.57 1.67 1.71 1.68 1.70 2.03 10 + # c(tail(x, 1), head(x, -1)) 6.96 7.16 7.49 7.32 7.64 8.60 10 ``` 28. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. 
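A minimal sketch of the `fread()` `na.strings` behaviour summarised in NEWS item 28 just above (PATCH 427), following test 1552.8 added in that patch; the inline csv text is an example and not part of any patch here.

```r
library(data.table)
fread("a,b,c\n0,1,2\n1,0,2", na.strings="0")
# returns data.table(a=c(NA,1L), b=c(1L,NA), c=c(2L,2L))
# combining "0"/"1" in na.strings with logical01=TRUE remains disallowed (errors)
```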
From 7db8c3b3d1a82f0c5941158b911cf06ce7b6d18e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 20 Oct 2021 08:56:23 -0700 Subject: [PATCH 430/588] some tidying of cc() (#5228) --- .dev/cc.R | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index 6c278e2693..bdf2c6ed15 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -50,7 +50,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys # Make sure library .so is not loaded (neither installed package nor from dev) dll = unlist(do.call("rbind",getLoadedDLLs())[,"path"]) - dll = grep("data_table.so",dll,value=TRUE) + dll = grep("data_table.so", dll, fixed=TRUE, value=TRUE) sapply(dll, dyn.unload) gc() @@ -69,21 +69,24 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys if (ret) return() # clang -Weverything includes -pedantic and issues many more warnings than gcc # system("R CMD SHLIB -o data_table.so *.c") - if (any(sapply(objects(envir=.GlobalEnv),function(x){inherits(get(x,.GlobalEnv),"data.table")}))) { - cat("ABOUT TO RELOAD .SO BUT THERE ARE DATA.TABLE OBJECTS IN .GLOBALENV SO FINALIZER MIGHT CRASH\n") + for (obj in ls(.GlobalEnv)) { + if (inherits(.GlobalEnv[[obj]], "data.table")) { + cat("ABOUT TO RELOAD .SO BUT THERE ARE DATA.TABLE OBJECTS IN .GLOBALENV SO FINALIZER MIGHT CRASH\n") + break + } } dyn.load("data_table.so") setwd(old) xx = getDLLRegisteredRoutines("data_table",TRUE) - for (i in seq_along(xx$.Call)) - assign(xx$.Call[[i]]$name, xx$.Call[[i]]$address, envir=.GlobalEnv) - for (i in seq_along(xx$.External)) - assign(xx$.External[[i]]$name, xx$.External[[i]]$address, envir=.GlobalEnv) - sourceDir(paste0(path,"/R")) - if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table","cbind.data.table"), envir=.GlobalEnv) # 3968 follow up - assign("testDir", function(x)paste0(path,"/inst/tests/",x), envir=.GlobalEnv) + for (Call xx$.Call) + .GlobalEnv[[Call$name]] = Call$address + for (Extern in xx$.External) + .GlobalEnv[[Extern$name]] = Extern$address + sourceDir(file.path(path, "R")) + if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table", "cbind.data.table"), envir=.GlobalEnv) # 3968 follow up + .GlobalEnv$testDir = function(x) file.path(path,"inst/tests",x) .onLoad() - if (is.logical(test) && isTRUE(test)) test.data.table() else if (is.character(test)) test.data.table(script=test) + if (isTRUE(test)) test.data.table() else if (is.character(test)) test.data.table(script=test) gc() invisible() } From 740b3332c2e674b5074d6659bc912587318ad73a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 20 Oct 2021 18:05:05 +0200 Subject: [PATCH 431/588] groupingsets: new metaprogramming together with named by (#5227) --- R/groupingsets.R | 1 + inst/tests/tests.Rraw | 3 +++ 2 files changed, 4 insertions(+) diff --git a/R/groupingsets.R b/R/groupingsets.R index 4c25b5b651..96940497c8 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -73,6 +73,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stopf("Expression passed to grouping sets function must not update by reference. 
Use ':=' on results of your grouping function.") if (missing(.SDcols)) .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL + if (length(names(by))) by = unname(by) # 0 rows template data.table to keep colorder and type empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 927babad7c..26f531455b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18292,3 +18292,6 @@ DT = data.table(A=1:3, key="A") test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) +# groupingsets by named by argument +test(2224.1, groupingsets(data.table(iris), j = sum(Sepal.Length), by = c('Sp'='Species'), sets = list('Species')), data.table(Species = factor(c("setosa", "versicolor", "virginica")), V1=c(250.3, 296.8, 329.4))) +test(2224.2, groupingsets(data.table(iris), j = mean(Sepal.Length), by = c('Sp'='Species'), sets = list('Species')), groupingsets(data.table(iris), j = mean(Sepal.Length), by = c('Species'), sets = list('Species'))) From fa76197dcbf2a8fced04d769e9ae47521f9d4d30 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 20 Oct 2021 11:01:07 -0600 Subject: [PATCH 432/588] #5228 follow up --- .dev/cc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/cc.R b/.dev/cc.R index bdf2c6ed15..43d848283b 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -78,7 +78,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys dyn.load("data_table.so") setwd(old) xx = getDLLRegisteredRoutines("data_table",TRUE) - for (Call xx$.Call) + for (Call in xx$.Call) .GlobalEnv[[Call$name]] = Call$address for (Extern in xx$.External) .GlobalEnv[[Extern$name]] = Extern$address From e88826e94127991754a8821ac8149d15f1f9c3c2 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 20 Oct 2021 21:03:36 +0200 Subject: [PATCH 433/588] gshift as gforce optimized shift (#5205) --- NEWS.md | 34 ++++++++++++++ R/data.table.R | 24 +++++++++- inst/tests/test2224.Rdata | Bin 0 -> 1580 bytes inst/tests/tests.Rraw | 65 ++++++++++++++++++++++++--- src/gsumm.c | 90 ++++++++++++++++++++++++++++++++++++++ src/init.c | 2 + 6 files changed, 208 insertions(+), 7 deletions(-) create mode 100644 inst/tests/test2224.Rdata diff --git a/NEWS.md b/NEWS.md index afbf154d71..5faf40723f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -173,6 +173,40 @@ 29. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. +30. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. 
+ + ```R + N = 1e7 + DT = data.table(x=sample(N), y=sample(1e6,N,TRUE)) + shift_no_opt = shift # different name not optimised as a way to compare + microbenchmark( + DT[, c(NA, head(x,-1)), y], + DT[, shift_no_opt(x, 1, type="lag"), y], + DT[, shift(x, 1, type="lag"), y], + times=10L, unit="s") + # Unit: seconds + # expr min lq mean median uq max neval + # DT[, c(NA, head(x, -1)), y] 8.7620 9.0240 9.1870 9.2800 9.3700 9.4110 10 + # DT[, shift_no_opt(x, 1, type = "lag"), y] 20.5500 20.9000 21.1600 21.3200 21.4400 21.5200 10 + # DT[, shift(x, 1, type = "lag"), y] 0.4865 0.5238 0.5463 0.5446 0.5725 0.5982 10 + ``` + + Example from [stackoverflow](https://stackoverflow.com/questions/35179911/shift-in-data-table-v1-9-6-is-slow-for-many-groups) + ```R + set.seed(1) + mg = data.table(expand.grid(year=2012:2016, id=1:1000), + value=rnorm(5000)) + microbenchmark(v1.9.4 = mg[, c(value[-1], NA), by=id], + v1.9.6 = mg[, shift_no_opt(value, n=1, type="lead"), by=id], + v1.14.4 = mg[, shift(value, n=1, type="lead"), by=id], + unit="ms") + # Unit: milliseconds + # expr min lq mean median uq max neval + # v1.9.4 3.6600 3.8250 4.4930 4.1720 4.9490 11.700 100 + # v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040 100 + # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index b8c1132f6a..e020ea3e3d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1745,6 +1745,10 @@ replace_dot_alias = function(e) { if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) + if (length(q)>=2L && q[[1L]] == "shift") { + q_named = match.call(shift, q) + if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) + } # add gshift support # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... 
^^ # otherwise there must be three arguments, and only in two cases: @@ -1848,6 +1852,17 @@ replace_dot_alias = function(e) { gi = if (length(o__)) o__[f__] else f__ g = lapply(grpcols, function(i) groups[[i]][gi]) + # returns all rows instead of one per group + nrow_funs = c("gshift") + .is_nrows = function(q) { + if (!is.call(q)) return(FALSE) + if (q[[1L]] == "list") { + any(vapply(q, .is_nrows, FALSE)) + } else { + q[[1L]] %chin% nrow_funs + } + } + # adding ghead/gtail(n) support for n > 1 #5060 #523 q3 = 0 if (!is.symbol(jsub)) { @@ -1865,6 +1880,8 @@ replace_dot_alias = function(e) { if (q3 > 0) { grplens = pmin.int(q3, len__) g = lapply(g, rep.int, times=grplens) + } else if (.is_nrows(jsub)) { + g = lapply(g, rep.int, times=len__) } ans = c(g, ans) } else { @@ -2970,7 +2987,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { # (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function # (3) define the gfun = function() R wrapper gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", - "median", "min", "max", "var", "sd", ".N") # added .N for #334 + "median", "min", "max", "var", "sd", ".N", "shift") # added .N for #334 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment @@ -2984,6 +3001,11 @@ gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) gmax = function(x, na.rm=FALSE) .Call(Cgmax, x, na.rm) gvar = function(x, na.rm=FALSE) .Call(Cgvar, x, na.rm) gsd = function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm) +gshift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift", "cyclic")) { + type = match.arg(type) + stopifnot(is.numeric(n)) + .Call(Cgshift, x, as.integer(n), fill, type) +} gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows) .prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){ diff --git a/inst/tests/test2224.Rdata b/inst/tests/test2224.Rdata new file mode 100644 index 0000000000000000000000000000000000000000..9c6423b9fb04ede6cae2fb77c7a59a091a0afada GIT binary patch literal 1580 zcmV+{2GjW;iwFP!000001MOYSj@&j3rhf+AP0=49MbYPIi(Z>UpP;8)x5*}0AlU%f zUV7?2OJAgUqy>7PsqHa6GKUl?TbkH265y3AQXfB36vr9tI=}z+^2cW{p8-G#bx}ZB zHN(1WeiTr{b<CW|6>CBhTT*1tDxCC?PT|8pT z#U0J}NZQhC-gD>#$<>E@^T)az)>vH07o%u!N zS!}+9SG4%HHt2=6t@A`@mFEHy*9Xo|itX}tvB~_vSHK(P7=t&Or*TNh7v$Wu#+c1t zl({}}m>gGTe${n{;TZMvt!WkZ{RO+-ZcB9>y4=U-0Z3$JOPOKi@|A`*n-^_fsqR;-6pt z`>AW#I}$J2$Kn~Q zr@ZIthw`-M@l?LCcz*79O8F6bZnXTB&O7$>yrZ7PGuI_Q?{lg1Yst@Cm*VKJHJ)qx ze7D9;<7?HI>`gsK)_Nq#t^Qc^bE)yv>hsh*mFlFl-daEN=O)kVT%SW$P(Qzg{r3Xb z&A;b&_$vl~+xerCuIZ^<@dou3Zh z0P`g<^zFsfzSq7a{P$_EhyBr)h)*zThA*c__BH%a{#35%Q~vZGgg9z`hsGr|PFmdP z`p`Iq?z_QW&PS)s&BxIAQ++L;V#hIczS8@t=Xz=V+&YrFheUAxfUrHQvKJvV= zN%bN$KPVnuPkW7zhOgP1>p$e@9+Q6td-=J?Y56(aFW2m)%Ew$E{r0T+IJXat>(cNi zoUi=cv*0*hYCeY6AF5CNqJ3ySQTrOcwqBEK_mn>qFH=5hyi%Xjx%Rd7q4-i9Q`aBL zGc>6#bU&aKEYW&TuH}* zhhq*mHD9`(!zFWGTj1(Upnu* zIGnk-7R*!dg>j2sA2==)o63YW178jgNRAOZY%ZLNuhWJX^>Mf$#8~3%>i94@u5jXW zoSH9P&*73e3HOH%x86VrWl^+!S=2RbDwJ(DP(v%Jp_Non)wXrbin8uxRVS+tvIDEq z`RT|y6Z@Am0hI~sHk*2Ht?sNw{;V&)!rYeCz!2>=v~`=$rsJ_})r77oc;Y z$Jf`j<~E(dx-;ndcd~q`x<4*{1h$oQ9`ASiwx#_j@Ql86ui=yTmTz9YyLotk{jba7 z&wW}O*IiW;S)G-n=#b&@j@ z{z3o3OY?c`3?(Hst25S}kal?1&d^9oy%hOa`ci+nrTKG_wD$SgjDl+s)0~p@rF%(# 
z(#}}PBUvEx8u?KD!@3LK8TE4po8*MHbDA?UUVfYzgX%`c>!h@i$ZI=u(7*7VIdXPxW5ky@&lv^4sb5ySFQ%a@DBZ-@LzVzruRn>uq}*SG`X07Y^=l e@EQjXICzDFcQ_dS`uYyZ!S;V1Vb}HvZ~y?unN$D( literal 0 HcmV?d00001 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 26f531455b..6382a13a85 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18243,21 +18243,39 @@ test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) testnum = 2218 funs = c(as.integer, as.double, as.complex, as.character, if (test_bit64) as.integer64) # when test_bit64==FALSE these all passed before; now passes with test_bit64==TRUE too +# add grouping tests for #5205 +g = rep(c(1,2), each=2) +options(datatable.optimize = 2L) for (f1 in funs) { - DT = data.table(x=f1(1:4)) + DT = data.table(x=f1(1:4), g=g) for (f2 in funs) { - testnum = testnum + 0.01 + testnum = testnum + 0.001 test(testnum, DT[, shift(x)], f1(c(NA, 1:3))) - testnum = testnum + 0.01 + testnum = testnum + 0.001 w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" test(testnum, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) - testnum = testnum + 0.01 + testnum = testnum + 0.001 if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" test(testnum, DT[, shift(x, fill="0")], f1(0:3)) } else { test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) } + + testnum = testnum + 0.001 + test(testnum, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) + testnum = testnum + 0.001 + w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" + f = f2(NA) + test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) + testnum = testnum + 0.001 + if (identical(f1,as.character) && identical(f2,as.complex)) { + # one special case due to as.complex(0)=="0+0i"!="0" + test(testnum, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) + } else { + f = f2(0) + test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) + } } } @@ -18292,6 +18310,41 @@ DT = data.table(A=1:3, key="A") test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) +# gshift, #5205 +options(datatable.optimize = 2L) +set.seed(123) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE), + l = sample(c(TRUE, FALSE, NA), 20, TRUE), + r = as.raw(sample(1:5, 20, TRUE))) +load(testDir("test2224.Rdata")) # ans array +if (test_bit64) { + DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))] +} else { + ans = ans[, -match("i64",colnames(ans))] +} +test(2224.01, sapply(names(DT)[-1], function(col) { + sapply(list(1, 5, -1, -5, c(1,2), c(-1,1)), function(n) list( + # fill is tested by group in tests 2218.*; see comments in #5205 + EVAL(sprintf("DT[, shift(%s, %d, type='lag'), by=x]$V1", col, n)), + EVAL(sprintf("DT[, shift(%s, %d, type='lead'), by=x]$V1", col, n)), + EVAL(sprintf("DT[, shift(%s, %d, type='shift'), by=x]$V1", col, n)), + EVAL(sprintf("DT[, shift(%s, %d, type='cyclic'), by=x]$V1", col, n)) + )) +}), ans) +a = 1:2 # fill argument with length > 1 which is not a 
call +test(2224.02, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") +DT = data.table(x=pairlist(1), g=1) +# unsupported type as argument +test(2224.03, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") + # groupingsets by named by argument -test(2224.1, groupingsets(data.table(iris), j = sum(Sepal.Length), by = c('Sp'='Species'), sets = list('Species')), data.table(Species = factor(c("setosa", "versicolor", "virginica")), V1=c(250.3, 296.8, 329.4))) -test(2224.2, groupingsets(data.table(iris), j = mean(Sepal.Length), by = c('Sp'='Species'), sets = list('Species')), groupingsets(data.table(iris), j = mean(Sepal.Length), by = c('Species'), sets = list('Species'))) +test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), + data.table(Species=factor(c("setosa", "versicolor", "virginica")), V1=c(250.3, 296.8, 329.4))) +test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), + groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) + diff --git a/src/gsumm.c b/src/gsumm.c index 5bb2620243..4964de8b6e 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1162,3 +1162,93 @@ SEXP gprod(SEXP x, SEXP narmArg) { return(ans); } +SEXP gshift(SEXP x, SEXP nArg, SEXP fillArg, SEXP typeArg) { + const bool nosubset = irowslen == -1; + const bool issorted = !isunsorted; + const int n = nosubset ? length(x) : irowslen; + if (nrow != n) error(_("Internal error: nrow [%d] != length(x) [%d] in %s"), nrow, n, "gshift"); + + int nprotect=0; + enum {LAG, LEAD/*, SHIFT*/,CYCLIC} stype = LAG; + if (!(length(fillArg) == 1)) + error(_("fill must be a vector of length 1")); + + if (!isString(typeArg) || length(typeArg) != 1) + error(_("Internal error: invalid type for gshift(), should have been caught before. please report to data.table issue tracker")); // # nocov + if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "lag")) stype = LAG; + else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "lead")) stype = LEAD; + else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "shift")) stype = LAG; + else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "cyclic")) stype = CYCLIC; + else error(_("Internal error: invalid type for gshift(), should have been caught before. please report to data.table issue tracker")); // # nocov + + bool lag; + const bool cycle = stype == CYCLIC; + + R_xlen_t nx = xlength(x), nk = length(nArg); + if (!isInteger(nArg)) error(_("Internal error: n must be integer")); // # nocov + const int *kd = INTEGER(nArg); + for (int i=0; i grpn -> jend = jstart */ \ + if (lag) { \ + const int o = ff[i]-1+(grpn-thisn); \ + for (int j=0; j Date: Tue, 16 Nov 2021 12:14:17 +0100 Subject: [PATCH 434/588] Fix gprod for integer64 (#5231) --- NEWS.md | 4 +++- inst/tests/tests.Rraw | 8 +++++++ src/gsumm.c | 55 +++++++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5faf40723f..e89778c544 100644 --- a/NEWS.md +++ b/NEWS.md @@ -448,9 +448,11 @@ # 2: 2021-02-03 # was 18661 # 3: 4611686018427387906 # was error 'please use as.character' ``` - + 47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. +48. 
`DT[, prod(int64Col), by=grp]` produced wrong results for `bit64::integer64` due to incorrect optimization, [#5225](https://github.com/Rdatatable/data.table/issues/5225). Thanks to Benjamin Schwendinger for reporting and fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a85..0bd814862a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18348,3 +18348,11 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) +# make gprod work for bit64, #5225 +if (test_bit64) { + test(2226.1, base::prod(2147483647L,2L), 4294967294) # just to illustrate that base returns double + DT = data.table(x=c(lim.integer64(), 2, 1, NA, NA, -2, 4), g=INT(1,2,1,2,1,2,3,3)) + test(2226.2, DT[, prod(x), g], data.table(g=1:3, V1=as.integer64(c(NA,NA,-8L)))) + test(2226.3, DT[, prod(x,na.rm=TRUE), g], data.table(g=1:3, V1=as.integer64(c(NA,"9223372036854775807",-8L)))) +} + diff --git a/src/gsumm.c b/src/gsumm.c index 4964de8b6e..2a2c0cdd4e 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1114,13 +1114,10 @@ SEXP gprod(SEXP x, SEXP narmArg) { const bool nosubset = irowslen==-1; const int n = nosubset ? length(x) : irowslen; //clock_t start = clock(); - SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); long double *s = malloc(ngrp * sizeof(long double)); if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; - else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; - else ansd[i] = (double)s[i]; + SEXP ans = PROTECT(allocVector(REALSXP, ngrp)); + if (INHERITS(x, char_integer64)) { + int64_t *ansd = (int64_t *)REAL(ans); + for (int i=0; iINT64_MAX || s[i]<=INT64_MIN) ? 
NA_INTEGER64 : (int64_t)s[i]; + } + } else { + double *ansd = REAL(ans); + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } } free(s); copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); - return(ans); + return ans; } SEXP gshift(SEXP x, SEXP nArg, SEXP fillArg, SEXP typeArg) { From 45e7da8c76d94e2c9a33ce0ffea2db31d98480bf Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 16 Nov 2021 13:16:24 +0100 Subject: [PATCH 435/588] fread doc - remove unhosted files (#5242) --- man/fread.Rd | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/man/fread.Rd b/man/fread.Rd index c7b7da8566..cc96062dec 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -271,42 +271,13 @@ system.time(DT <- fread("testbig.csv")) all(mapply(all.equal, DF, DT)) - -# Real data example (Airline data) -# https://stat-computing.org/dataexpo/2009/the-data.html - -download.file("https://stat-computing.org/dataexpo/2009/2008.csv.bz2", - destfile="2008.csv.bz2") -# 109MB (compressed) - -system("bunzip2 2008.csv.bz2") -# 658MB (7,009,728 rows x 29 columns) - -colClasses = sapply(read.csv("2008.csv",nrows=100,stringsAsFactors=FALSE),class) -# 4 character, 24 integer, 1 logical. Incorrect. - -colClasses = sapply(read.csv("2008.csv",nrows=200,stringsAsFactors=FALSE),class) -# 5 character, 24 integer. Correct. Might have missed data only using 100 rows -# since read.table assumes colClasses is correct. - -system.time(DF <- read.table("2008.csv", header=TRUE, sep=",", - quote="",stringsAsFactors=FALSE,comment.char="",nrows=7009730, - colClasses=colClasses)) -# 24.4 secs - -system.time(DT <- fread("2008.csv")) -# 1.9 secs - -table(sapply(DT,class)) -# 5 character and 24 integer columns. Correct without needing to worry about colClasses -# issue above. - - # Reads URLs directly : fread("https://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat") # Decompresses .gz and .bz2 automatically : -fread("https://stat-computing.org/dataexpo/2009/1987.csv.bz2") +fread("https://github.com/Rdatatable/data.table/raw/1.14.0/inst/tests/ch11b.dat.bz2") + +fread("https://github.com/Rdatatable/data.table/raw/1.14.0/inst/tests/issue_785_fread.txt.gz") } } From 96860f2dad8053ce391d1b8a1db90e6e6d8432fb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 18 Nov 2021 03:18:02 +0100 Subject: [PATCH 436/588] fread(file, nrows=0) file with header does not determine types (#5253) --- NEWS.md | 2 +- inst/tests/tests.Rraw | 31 ++++++++++++++++++++++++++----- src/fread.c | 4 ++-- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index e89778c544..3bd0d56148 100644 --- a/NEWS.md +++ b/NEWS.md @@ -213,7 +213,7 @@ 2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could incorrectly display an extra column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the bug report and @MichaelChirico for the PR. -3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR. +3. 
`fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686), [#4029](https://github.com/Rdatatable/data.table/issues/4029). Thanks to @hongyuanjia and @michaelpaulhirsch for reporting, and Benjamin Schwendinger for the PR. 4. Passing `.SD` to `frankv()` with `ties.method='random'` or with `na.last=NA` failed with `.SD is locked`, [#4429](https://github.com/Rdatatable/data.table/issues/4429). Thanks @smarches for the report. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0bd814862a..06e7f12d31 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13487,13 +13487,34 @@ test(1958.01, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encod test(1958.02, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) test(1958.03, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) test(1958.04, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') -test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747 -test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical())) +test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=integer(), B=integer(), C=integer())) #2747 +test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=integer(), B=integer(), C=integer())) test(1958.07, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 test(1958.08, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) -# 4686 -test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical())) -test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=logical(), B=logical(), C=logical())) +test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=integer(), B=integer(), C=integer())) # nrows=0 vs 0L, 4686 +test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=integer(), B=integer(), C=integer())) +# nrows=0 should perform a full sample to get the empty column types right as documented, #4029 +test(1958.11, fread('A,B,C,D\n1,CHAR,"CHAR",3.1', nrows=0L), data.table(A=integer(), B=character(), C=character(), D=numeric())) +# .. one different type in the middle of under 100 +txt = paste(c("A,B\n1,2\n", rep("3,4\n",48), "3,4.1\n", rep("5,6\n",48)), collapse="") +test(1958.12, fread(text=txt, nrows=0L), data.table(A=integer(), B=numeric())) +test(1958.13, fread(text=txt, nrows=0L, skip=1L), data.table(V1=integer(), V2=numeric())) +test(1958.14, fread(text=txt, nrows=1L), data.table(A=1L, B=2L)) # B integer not numeric because sample is min(nrows,100) when nrows>=1 +test(1958.15, fread(text=txt, nrows=1L, skip=1L), data.table(V1=1L, V2=2L)) +test(1958.16, fread(text=txt, nrows=2L), data.table(A=c(1L,3L), B=c(2L,4L))) +test(1958.17, fread(text=txt, nrows=2L, skip=1L), data.table(V1=c(1L,3L), V2=c(2L,4L))) +# .. 
one different type on line 148 when there are just under 200 lines +txt = paste(c("A,B\n1,2\n", rep("3,4\n",148), "3,4.1\n", rep("5,6\n",48)), collapse="") +test(1958.18, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=numeric()), + output="Sampled 149 rows.*at 2 jump points") +# .. one different type within sample for large number of lines +txt = paste(c("A,B\n1,2\n", rep("3,4\n",5000), "3,4.1\n", rep("5,6\n",5000)), collapse="") +test(1958.19, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=numeric()), + output="Sampled 1049 rows.*at 11 jump points") +# .. one different type out of sample for large number of lines +txt = paste(c("A,B\n1,2\n", rep("3,4\n",5100), "3,4.1\n", rep("5,6\n",4900)), collapse="") +test(1958.20, fread(text=txt, nrows=0L, verbose=TRUE), data.table(A=integer(), B=integer()), + output="Sampled 1049 rows.*at 11 jump points") # Skip should work with all types of newlines #3006 eols = c("\n", "\r\n", "\r", "\n\r") diff --git a/src/fread.c b/src/fread.c index be50976849..2aa796037c 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1579,7 +1579,7 @@ int freadMain(freadMainArgs _args) { int ncol; // Detected number of columns in the file const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names) - int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well. + int jumpLines = nrowLimit==0 ? 100 : (int)umin(100, nrowLimit); // how many lines from each jump point to use. If nrows>0 is supplied, nJumps is later set to 1. #4029 { if (verbose) DTPRINT(_("[06] Detect separator, quoting rule, and ncolumns\n")); @@ -1812,7 +1812,7 @@ int freadMain(freadMainArgs _args) { (uint64_t)sz, (uint64_t)jump0size, (uint64_t)(sz/(2*jump0size))); } nJumps++; // the extra sample at the very end (up to eof) is sampled and format checked but not jumped to when reading - if (nrowLimit0) nJumps=1; // when nrows>0 supplied by user, no jumps (not even at the end) and single threaded sampleLines = 0; double sumLen=0.0, sumLenSq=0.0; From d8dc315eb66bc29c49ac0344550f976d16b153ee Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 19 Nov 2021 04:27:48 +0100 Subject: [PATCH 437/588] set operations for DT containing x and y as column names (#5256) --- NEWS.md | 2 ++ R/setops.R | 10 ++++++---- inst/tests/tests.Rraw | 5 +++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3bd0d56148..b6556230a9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -453,6 +453,8 @@ 48. `DT[, prod(int64Col), by=grp]` produced wrong results for `bit64::integer64` due to incorrect optimization, [#5225](https://github.com/Rdatatable/data.table/issues/5225). Thanks to Benjamin Schwendinger for reporting and fixing. +49. `fintersect(..., all=TRUE)` and `fsetdiff(..., all=TRUE)` could return incorrect results when the inputs had columns named `x` and `y`, [#5255](https://github.com/Rdatatable/data.table/issues/5255). Thanks @Fpadt for the report, and @ben-schwen for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. 
So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/setops.R b/R/setops.R index 042d0c5f93..1034b0f0fa 100644 --- a/R/setops.R +++ b/R/setops.R @@ -59,8 +59,9 @@ fintersect = function(x, y, all=FALSE) { .set_ops_arg_check(x, y, all, .seqn = TRUE) if (!nrow(x) || !nrow(y)) return(x[0L]) if (all) { - x = shallow(x)[, ".seqn" := rowidv(x)] - y = shallow(y)[, ".seqn" := rowidv(y)] + .seqn_id = NULL # to avoid 'no visible binding for global variable' note from R CMD check + x = shallow(x)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=x)] + y = shallow(y)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=y)] jn.on = c(".seqn",setdiff(names(y),".seqn")) # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) y[x, .SD, .SDcols=setdiff(names(y),".seqn"), nomatch=NULL, on=jn.on] @@ -75,8 +76,9 @@ fsetdiff = function(x, y, all=FALSE) { if (!nrow(x)) return(x) if (!nrow(y)) return(if (!all) funique(x) else x) if (all) { - x = shallow(x)[, ".seqn" := rowidv(x)] - y = shallow(y)[, ".seqn" := rowidv(y)] + .seqn_id = NULL # to avoid 'no visible binding for global variable' note from R CMD check + x = shallow(x)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=x)] + y = shallow(y)[, ".seqn" := rowidv(.seqn_id), env=list(.seqn_id=y)] jn.on = c(".seqn",setdiff(names(x),".seqn")) x[!y, .SD, .SDcols=setdiff(names(x),".seqn"), on=jn.on] } else { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 06e7f12d31..ef06748e05 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18377,3 +18377,8 @@ if (test_bit64) { test(2226.3, DT[, prod(x,na.rm=TRUE), g], data.table(g=1:3, V1=as.integer64(c(NA,"9223372036854775807",-8L)))) } +# set ops when DT has column names x and y, #5255 +DT = data.table(x=c(1,2,2,2), y=LETTERS[c(1,2,2,3)]) +test(2227.1, fintersect(DT, DT, all=TRUE), DT) +test(2227.2, fsetdiff(DT, DT, all=TRUE), DT[0]) + From 49223840532cc5a79b07ab320529f67107115e1f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 23 Nov 2021 03:25:40 +0100 Subject: [PATCH 438/588] rbindlist support fill=TRUE with use.names=FALSE and use it in merge.R ToDo of #678 (#5263) --- NEWS.md | 45 +++++++++++++++++++++++++++++++++++++++++++ R/merge.R | 11 +---------- inst/tests/tests.Rraw | 19 ++++++++++++++++-- man/rbindlist.Rd | 2 +- src/rbindlist.c | 3 +-- 5 files changed, 65 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index b6556230a9..59c9404d3a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -206,6 +206,51 @@ # v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040 100 # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` + +31. 
`rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.` + + ```R + DT1 + # A B + # + # 1: 1 5 + # 2: 2 6 + + DT2 + # foo + # + # 1: 3 + # 2: 4 + + rbind(DT1, DT2, fill=TRUE) # no change + # A B foo + # + # 1: 1 5 NA + # 2: 2 6 NA + # 3: NA NA 3 + # 4: NA NA 4 + + rbind(DT1, DT2, fill=TRUE, use.names=FALSE) + + # was: + # A B foo + # + # 1: 1 5 NA + # 2: 2 6 NA + # 3: NA NA 3 + # 4: NA NA 4 + # Warning message: + # In rbindlist(l, use.names, fill, idcol) : + # use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE. + + # now: + # A B + # + # 1: 1 5 + # 2: 2 6 + # 3: 3 NA + # 4: 4 NA + ``` ## BUG FIXES diff --git a/R/merge.R b/R/merge.R index 683a6d08a4..f237bcbf32 100644 --- a/R/merge.R +++ b/R/merge.R @@ -78,16 +78,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) { - yy = y[missingyidx] - othercolsx = setdiff(nm_x, by) - if (length(othercolsx)) { - tmp = rep.int(NA_integer_, length(missingyidx)) - # TO DO: use set() here instead.. - yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) - } - # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist - # takes care of #24 without having to save names. This is how it should be, IMHO. - dt = rbind(dt, yy, use.names=FALSE) + dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE) } } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ef06748e05..6cae6fe5c6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1863,6 +1863,8 @@ test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list # Test merge with common names and all.y=TRUE, #2011 DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a") DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a") +DT3 = data.table(a=c(2), total=c(5), key="a") +DT4 = data.table(a=c(3), total=c(1), key="a") # 629+630 worked before anyway. 631+632 test the bug fix. adf=as.data.frame adt=as.data.table @@ -1875,6 +1877,16 @@ test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a" test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a")) test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a)) +# ensure merge(x,y,all.y) does not alter input y ... +# .. i subset y with 1:nrow(y) +test(631.2, merge(DT1[c(1,3)],DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=NA_real_,total.y=c(5,1,2),key="a")) +test(631.3, DT2, data.table(a=c(2,3,5), total=c(5,1,2), key="a")) +# .. nrow(y)=1, i subset y with 1 and no match with x +test(631.4, merge(DT1,DT3,all.y=TRUE), data.table(a=c(2),total.x=NA_real_,total.y=c(5),key="a")) +test(631.5, DT3, data.table(a=c(2), total=c(5), key="a")) +# .. 
nrow(y)=1, i subset y with 1 and match with x +test(631.6, merge(DT1,DT4,all.y=TRUE), data.table(a=c(3),total.x=c(1),total.y=c(1),key="a")) +test(631.7, DT4, data.table(a=c(3), total=c(1), key="a")) test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a")) test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) @@ -14577,8 +14589,11 @@ test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logica test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE") test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE), - data.table(a=c(1:2,NA,NA), b=c(NA,NA,3:4)), - warning="use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE") + data.table(a=c(1:4))) +test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), + data.table(a=c(1:4), c=INT(5,6,NA,NA))) +test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE), + data.table(a=c(1:4), V1=INT(NA,NA,5,6))) # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 192fb5135f..2ba39a2a98 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -13,7 +13,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) \arguments{ \item{l}{ A list containing \code{data.table}, \code{data.frame} or \code{list} objects. \code{\dots} is the same but you pass the objects by name separately. } \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} - \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.} + \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} } \details{ diff --git a/src/rbindlist.c b/src/rbindlist.c index 5d0b6547e5..3669028835 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -12,8 +12,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; const bool fill = LOGICAL(fillArg)[0]; - if (fill && usenames!=TRUE) { - if (usenames==FALSE) warning(_("use.names= cannot be FALSE when fill is TRUE. 
Setting use.names=TRUE.")); // else no warning if usenames==NA (default) + if (fill && usenames==NA_LOGICAL) { usenames=TRUE; } const bool idcol = !isNull(idcolArg); From 68872b4ea8e27c7b29d064506caf8652a6cb0fd6 Mon Sep 17 00:00:00 2001 From: JoshOBrien Date: Fri, 26 Nov 2021 00:06:32 -0800 Subject: [PATCH 439/588] Add format_list_item() method for simple feature geometry columns (#5224) --- DESCRIPTION | 3 +- NEWS.md | 12 ++++---- R/print.data.table.R | 27 ++++++++++++++---- R/test.data.table.R | 2 +- inst/tests/other.Rraw | 7 ++++- inst/tests/tests.Rraw | 61 ++++++++++++++++++++++++++--------------- man/print.data.table.Rd | 2 +- 7 files changed, 76 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5e412260b4..88dfd46140 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -69,7 +69,8 @@ Authors@R: c( person("Bennet","Becker", role="ctb"), person("Kyle","Haynes", role="ctb"), person("Boniface Christian","Kamgang", role="ctb"), - person("Olivier","Delmarcell", role="ctb")) + person("Olivier","Delmarcell", role="ctb"), + person("Josh","O'Brien", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index 59c9404d3a..3121d893d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -99,7 +99,7 @@ 16. `fwrite()` now accepts `sep=""`, [#4817](https://github.com/Rdatatable/data.table/issues/4817). The motivation is an example where the result of `paste0()` needs to be written to file but `paste0()` takes 40 minutes due to constructing a very large number of unique long strings in R's global character cache. Allowing `fwrite(, sep="")` avoids the `paste0` and saves 40 mins. Thanks to Jan Gorecki for the request, and Ben Schwen for the PR. -17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns; `format_list_item` is S3-generic for customizing how to print each row of a list column. Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), and @MichaelChirico for implementing. See `?print.data.table` for examples. +17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns and by default defers to the S3 `format` method for the column's class if one exists; e.g. `format.sfc` for geometry columns from the `sf` package, [#2273](https://github.com/Rdatatable/data.table/issues/2273). Similarly, `format_list_item` is S3-generic for customizing how to print each row of list columns (which lack a format method at a column level) and also by default defers to the S3 `format` method for that item's class if one exists. 
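As a quick illustration of the column-level dispatch described above (a minimal sketch only; the class name `degC` and its `format.degC` method are invented here purely for illustration, the real motivating case being `sf:::format.sfc` for geometry columns):

```R
library(data.table)
# invented class purely for illustration
format.degC = function(x, ...) sprintf("%.1fC", unclass(x))
DT = data.table(id = 1:2, temp = structure(c(21.5, 23), class = "degC"))
print(DT)  # the temp column is rendered via format.degC, e.g. "21.5C" "23.0C"
```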
Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), @JoshOBrien for improving the output for geometry columns, and @MichaelChirico for implementing. See `?print.data.table` for examples. 18. `tstrsplit(,type.convert=)` now accepts a named list of functions to apply to each part, [#5094](https://github.com/Rdatatable/data.table/issues/5094). Thanks to @Kamgang-B for the request and implementing. @@ -206,7 +206,7 @@ # v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040 100 # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` - + 31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.` ```R @@ -215,13 +215,13 @@ # # 1: 1 5 # 2: 2 6 - + DT2 # foo # # 1: 3 # 2: 4 - + rbind(DT1, DT2, fill=TRUE) # no change # A B foo # @@ -231,7 +231,7 @@ # 4: NA NA 4 rbind(DT1, DT2, fill=TRUE, use.names=FALSE) - + # was: # A B foo # @@ -242,7 +242,7 @@ # Warning message: # In rbindlist(l, use.names, fill, idcol) : # use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE. - + # now: # A B # diff --git a/R/print.data.table.R b/R/print.data.table.R index 023551074a..16950fd110 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -180,10 +180,20 @@ format_list_item = function(x, ...) { UseMethod("format_list_item") } +has_format_method = function(x) { + f = function(y) !is.null(getS3method("format", class=y, optional=TRUE)) + any(sapply(class(x), f)) +} + format_col.default = function(x, ...) { - if (!is.null(dim(x))) return("") - if (is.list(x)) return(vapply_1c(x, format_list_item, ...)) - format(char.trunc(x), ...) # relevant to #37 + if (!is.null(dim(x))) + "" + else if (has_format_method(x) && length(formatted<-format(x, ...))==length(x)) + formatted #PR5224 motivated by package sf where column class is c("sfc_MULTIPOLYGON","sfc") and sf:::format.sfc exists + else if (is.list(x)) + vapply_1c(x, format_list_item, ...) + else + format(char.trunc(x), ...) # relevant to #37 } # #2842 -- different columns can have different tzone, so force usage in output @@ -206,14 +216,19 @@ format_list_item.default = function(x, ...) { if (is.null(x)) # NULL item in a list column "" else if (is.atomic(x) || inherits(x, "formula")) # FR #2591 - format.data.table issue with columns of class "formula" - paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... - else + paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... + else if (has_format_method(x) && length(formatted<-format(x, ...))==1L) { + # the column's class does not have a format method (otherwise it would have been used by format_col and this + # format_list_item would not be reached) but this particular list item does have a format method so use it + formatted + } else { paste0("<", class(x)[1L], paste_dims(x), ">") + } } # FR #1091 for pretty printing of character # TODO: maybe instead of doing "this is...", we could do "this ... 
test"? -char.trunc <- function(x, trunc.char = getOption("datatable.prettyprint.char")) { +char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) idx = which(nchar(x) > trunc.char) diff --git a/R/test.data.table.R b/R/test.data.table.R index 2e8aca38ea..8ecbe304e6 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -116,7 +116,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("filename", fn, envir=env) assign("inittime", as.integer(Sys.time()), envir=env) # keep measures from various test.data.table runs assign("showProgress", showProgress, envir=env) - + owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190 on.exit(setwd(owd)) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index bd9374db25..11b00cc546 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,4 +1,4 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. # So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by @@ -202,3 +202,8 @@ if (loaded[["parallel"]]) { test(14.1, {example(':=', package='data.table', local=TRUE, echo=FALSE); TRUE}) test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE}) +if (loaded[["sf"]]) { #2273 + DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"))) + test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4") +} + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6cae6fe5c6..a3f11e6762 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21,6 +21,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { allNA = data.table:::allNA any_na = data.table:::any_na as.data.table.array = data.table:::as.data.table.array + as.data.table.default = data.table:::as.data.table.default as.IDate.default = data.table:::as.IDate.default as.ITime.default = data.table:::as.ITime.default binary = data.table:::binary @@ -8801,16 +8802,16 @@ test(1613.605, all.equal(data.table(a=1), try(stop('this wont work'), silent = T L1 = list(a = data.table(1), b = setattr("foo1613", "tbl", data.table(1))) L2 = list(a = 1, b = setattr("foo1613", "tbl", 1)) test(1613.606, all(grepl("target is data.table, current is numeric", all.equal(L1, L2)))) -as.data.table.foo1613 = function(x) { # test as.data.table coerce of 'current' argument +registerS3method("as.data.table", "foo1613", function(x) { # test as.data.table coerce of 'current' argument if (!length(x)) warning("empty foo1613") - as.data.table(unclass(foo1613)) -} -registerS3method("as.data.table", "foo1613", as.data.table.foo1613) + as.data.table(unclass(x)) +}) foo1613 = structure(list(NULL), class="foo1613") test(1613.607, all.equal(data.table(), foo1613, check.attributes=FALSE)) foo1613 = structure(list(), class="foo1613") test(1613.608, all.equal(data.table(), foo1613, 
check.attributes=FALSE), warning="empty") -rm(as.data.table.foo1613, foo1613) +registerS3method("as.data.table", "foo1613", as.data.table.default) +# search below in this file for "registerS3method" for comments about it DT1 <- data.table(a = 1:4, b = letters[1:4], .seqn = 5L) DT2 <- data.table(a = 4:1, b = letters[4:1], .seqn = 5L) @@ -17196,25 +17197,41 @@ test(2130.102, print(DT, timezone=FALSE), notOutput='UTC') # default expression printing can break format_col.default, #3011 test(2130.11, print(data.table(e = expression(1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13))), output = '1 + 2 + 3') -# format_col generic is used -format_col.complex = function(x, ...) sprintf('(%.1f, %.1fi)', Re(x), Im(x)) -registerS3method("format_col", "complex", format_col.complex) +# format_col and format_list_item generics, #2273 for package sf +registerS3method("format_col", "complex", function(x, ...) sprintf('(%.1f, %.1fi)', Re(x), Im(x))) # this registerS3method does seem to be necessary to work within the test.data.table() environment -# assigning the method using <<- probably works too, but we don't want to write to user's environment at all -x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) -test(2130.12, x, output = '(1.0, 3.0i)') -rm(format_col.complex) +# assigning the method in .GlobalEnv might work too, but we don't want to write to user's environment at all (and is disallowed by CRAN policy) +x = data.table(z = c(1+3i, 2-1i, pi+2.718i)) +test(2130.12, x, output="(1.0, 3.0i)") registerS3method("format_col", "complex", format_col.default) -# otherwise it remains registered after test.data.table() and causes test 1610.1 to fail on the next run for example, and user display if they have complex data -# haven't found a way to unregister an S3 method (tried registering NULL but there's an error that NULL isn't a function) - -# format_list_item() generic is used -format_list_item.myclass <- function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">") -registerS3method("format_list_item", "myclass", format_list_item.myclass) -DT = data.table(row = 1:2, objs = list(structure(list(id = "foo"), class = "myclass"), structure(list(id = "bar"), class = "myclass"))) -test(2130.13, print(DT), output = "myclass:foo.*myclass:bar") -rm(format_list_item.myclass) -registerS3method("format_list_item", "myclass", format_list_item.default) +# haven't found a way to unregister an S3 method; tried registering NULL but that's an error that NULL isn't a function +# so registering the default method is the only known solution to clean up since the registered method persists after test.data.table() finishes and +# then i) test 1610.1 fails if test.data.table() is rerun, ii) user display of complex data would be affected +# did try wrapping with on.exit(,add=TRUE) but perhaps because this is a script that is sys.source'd, it ran straight away + +# format method for column takes predecedence over format method for each list item +registerS3method("format", "myclass2130", function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">")) +DT = data.table(row=1:2, objs=list(structure(list(id="foo"), class="myclass2130"), structure(list(id="bar"), class="myclass2130"))) +test(2130.13, print(DT), output="myclass2130:foo.*myclass2130:bar") +setattr(DT$objs, "class", "foo2130") +registerS3method("format", "foo2130", function(x, ...) "All hail foo") +test(2130.14, print(DT), output="myclass2130:foo.*myclass2130:bar") # because length 1 from format but needs to be length(x) +registerS3method("format", "foo2130", function(x, ...) 
rep("All hail foo",length(x))) +test(2130.15, print(DT), output="All hail foo") # e.g. sf:::format.sfc rather than sf:::format.sfg on each item +setattr(DT$objs, "class", "bar2130_with_no_method") +test(2130.16, print(DT), output="myclass2130:foo.*myclass2130:bar") +registerS3method("format", "myclass2130", format.default) +registerS3method("format", "foo2130", format.default) + +DT = data.table(num = 1:2, + formula = list(as.formula("mpg~cyl")), + model = list(lm(mpg~cyl, mtcars)), + shallow = list(1:3, 4:6), + nested = list(list(1:3), list(4:6))) +test(2130.17, capture.output(DT), + c(" num formula model shallow nested", + "1: 1 mpg ~ cyl 1,2,3 ", + "2: 2 mpg ~ cyl 4,5,6 ")) # .SD from grouping should be unlocked, part of #4159 x = data.table(a=1:3, b=4:6) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index 234fcd8ff1..fdaf84de20 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -57,7 +57,7 @@ \details{ By default, with an eye to the typically large number of observations in a \code{data.table}, only the beginning and end of the object are displayed (specifically, \code{head(x, topn)} and \code{tail(x, topn)} are displayed unless \code{nrow(x) < nrows}, in which case all rows will print). - \code{format_col} is applied at a column level; for example, \code{format_col.POSIXct} is used to tag the time zones of \code{POSIXct} columns. \code{format_list_item} is applied to the elements (rows) of \code{list} columns; see Examples. + \code{format_col} is applied at a column level; for example, \code{format_col.POSIXct} is used to tag the time zones of \code{POSIXct} columns. \code{format_list_item} is applied to the elements (rows) of \code{list} columns; see Examples. The default \code{format_col} method uses \code{\link[utils]{getS3method}} to test if a \code{format} method exists for the column, and if so uses it. Otherwise, the default \code{format_list_item} method uses the S3 format method (if one exists) for each item of a \code{list} column. } \seealso{\code{\link{print.default}}} \examples{ From 22b51c510a614e9095b7cd6de57cf87aae0d8914 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 29 Nov 2021 11:26:40 -0700 Subject: [PATCH 440/588] print.class and print.keys now TRUE by default (#5275) --- .dev/cc.R | 2 - NEWS.md | 2 + R/onLoad.R | 4 +- R/test.data.table.R | 3 +- man/print.data.table.Rd | 4 +- tests/autoprint.Rout.save | 122 ++++++++++++++++++++++---------------- tests/knitr.Rout.save | 33 ++++++----- 7 files changed, 96 insertions(+), 74 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index 43d848283b..bc15b6765f 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -22,8 +22,6 @@ # c # test and step between R and C -options(datatable.print.class = TRUE) - sourceDir = function(path=getwd(), trace = TRUE, ...) { # copied verbatim from example(source) in base R for (nm in list.files(path, pattern = "\\.[RrSsQq]$")) { diff --git a/NEWS.md b/NEWS.md index 3121d893d8..eb1027e9a8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -551,6 +551,8 @@ 15. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). +16. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. 
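A short sketch of what the new defaults change at the console (output alignment shown approximately):

```R
library(data.table)
DT = data.table(id = 1:2, val = c(1.5, 2.5), key = "id")
print(DT)   # new defaults: the key and the column classes are displayed
# Key: <id>
#       id   val
#    <int> <num>
# 1:     1   1.5
# 2:     2   2.5
options(datatable.print.class = FALSE, datatable.print.keys = FALSE)
print(DT)   # previous default appearance
#    id val
# 1:  1 1.5
# 2:  2 2.5
```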
+ # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) diff --git a/R/onLoad.R b/R/onLoad.R index 1ee328e99f..b4ebeafdf2 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -73,10 +73,10 @@ "datatable.optimize"="Inf", # datatable. "datatable.print.nrows"="100L", # datatable. "datatable.print.topn"="5L", # datatable. - "datatable.print.class"="FALSE", # for print.data.table + "datatable.print.class"="TRUE", # for print.data.table "datatable.print.rownames"="TRUE", # for print.data.table "datatable.print.colnames"="'auto'", # for print.data.table - "datatable.print.keys"="FALSE", # for print.data.table + "datatable.print.keys"="TRUE", # for print.data.table "datatable.print.trunc.cols"="FALSE", # for print.data.table "datatable.allow.cartesian"="FALSE", # datatable. "datatable.dfdispatchwarn"="TRUE", # not a function argument diff --git a/R/test.data.table.R b/R/test.data.table.R index 8ecbe304e6..298fc34c13 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -81,7 +81,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F scipen = 0L, # fwrite now respects scipen datatable.optimize = Inf, datatable.alloccol = 1024L, - datatable.print.class = FALSE, # this is TRUE in cc.R and we like TRUE. But output= tests need to be updated (they assume FALSE currently) + datatable.print.class = FALSE, # output= tests were written when default was FALSE + datatable.print.keys = FALSE, # output= tests were written when default was FALSE datatable.print.trunc.cols = FALSE, #4552 datatable.rbindlist.check = NULL, datatable.integer64 = "integer64", diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index fdaf84de20..b4929e7899 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -18,10 +18,10 @@ \method{print}{data.table}(x, topn=getOption("datatable.print.topn"), # default: 5 nrows=getOption("datatable.print.nrows"), # default: 100 - class=getOption("datatable.print.class"), # default: FALSE + class=getOption("datatable.print.class"), # default: TRUE row.names=getOption("datatable.print.rownames"), # default: TRUE col.names=getOption("datatable.print.colnames"), # default: "auto" - print.keys=getOption("datatable.print.keys"), # default: FALSE + print.keys=getOption("datatable.print.keys"), # default: TRUE trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, timezone=FALSE, \dots) diff --git a/tests/autoprint.Rout.save b/tests/autoprint.Rout.save index 60ae5056f2..a2879ff158 100644 --- a/tests/autoprint.Rout.save +++ b/tests/autoprint.Rout.save @@ -1,6 +1,6 @@ -R version 3.1.1 (2014-07-10) -- "Sock it to Me" -Copyright (C) 2014 The R Foundation for Statistical Computing +R version 4.1.1 (2021-08-10) -- "Kick Things" +Copyright (C) 2021 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. @@ -21,104 +21,122 @@ Loading required package: data.table > # Since this tests autoprinting at the console, it needs to use the .Rout.save mechanism in R CMD check > DT = data.table(a=1:2) # Should print at console? > DT # yes - a -1: 1 -2: 2 + a + +1: 1 +2: 2 > DT[1] # yes - a -1: 1 + a + +1: 1 > DT[2,a:=3L] # no > DT # yes - a -1: 1 -2: 3 + a + +1: 1 +2: 3 > DT[FALSE,a:=3L] # no > DT[a==4L,a:=5L] # no > DT[a %in% 4:8, a:=5L] # no > DT # yes - a -1: 1 -2: 3 +Index: + a + +1: 1 +2: 3 > print(DT[2,a:=4L]) # no > print(DT) # yes - a -1: 1 -2: 4 + a + +1: 1 +2: 4 > if (TRUE) DT[2,a:=5L] # no. 
used to print before v1.9.5 > if (TRUE) if (TRUE) DT[2,a:=6L] # no. used to print before v1.9.5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > DT # no (from v1.9.5+). := suppresses next auto print (can't distinguish just "DT" symbol alone at the prompt) > DT # yes. 2nd time needed, or solutions below - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > DT[] # yes. guaranteed print - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > print(DT) # no. only DT[] is guaranteed print from v1.9.6 and R 3.2.0 > (function(){DT[2,a:=5L][];NULL})() # print NULL NULL > DT # yes. i) function needs to add [] after last one, so that "DT" alone is guaranteed anyway - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > (function(){DT[2,a:=5L];DT[];NULL})() # print NULL NULL > DT # yes. ii) or as a separate DT[] after the last := inside the function - a -1: 1 -2: 5 + a + +1: 1 +2: 5 > DT2 = data.table(b=3:4) # no > (function(){DT[2,a:=6L];DT2[1,b:=7L];NULL})() NULL > DT # yes. last := was on DT2 not DT - a -1: 1 -2: 6 + a + +1: 1 +2: 6 > {DT[2,a:=6L];invisible()} # no > print(DT) # no > (function(){print(DT[2,a:=7L]);print(DT);invisible()})() # yes*2 - a -1: 1 -2: 7 - a -1: 1 -2: 7 + a + +1: 1 +2: 7 + a + +1: 1 +2: 7 > {print(DT[2,a:=8L]);print(DT);invisible()} # yes*1 Not within function so as at prompt - a -1: 1 -2: 8 + a + +1: 1 +2: 8 > DT[1][,a:=9L] # no (was too tricky to detect that DT[1] is a new object). Simple rule is that := always doesn't print > DT[2,a:=10L][1] # yes - a -1: 1 + a + +1: 1 > DT[1,a:=10L][1,a:=10L] # no > DT[,a:=as.integer(a)] # no > DT[1,a:=as.integer(a)] # no > DT[1,a:=10L][] # yes. ...[] == oops, forgot print(...) - a -1: 10 -2: 10 + a + +1: 10 +2: 10 > > # Test that error in := doesn't suppress next valid print, bug #2376 > tryCatch(DT[,foo:=ColumnNameTypo], error=function(e) e$message) # error: not found. [1] "object 'ColumnNameTypo' not found" > DT # yes - a -1: 10 -2: 10 + a + +1: 10 +2: 10 > DT # yes - a -1: 10 -2: 10 + a + +1: 10 +2: 10 > > > proc.time() user system elapsed - 3.14 0.10 3.22 + 0.723 0.637 0.217 diff --git a/tests/knitr.Rout.save b/tests/knitr.Rout.save index f97eeb4a4f..3d4b0cf72d 100644 --- a/tests/knitr.Rout.save +++ b/tests/knitr.Rout.save @@ -1,6 +1,6 @@ -R version 3.1.1 (2014-07-10) -- "Sock it to Me" -Copyright (C) 2014 The R Foundation for Statistical Computing +R version 4.1.1 (2021-08-10) -- "Kick Things" +Copyright (C) 2021 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. @@ -33,10 +33,11 @@ DT # yes ``` ``` -## x y -## 1: 1 4 -## 2: 2 5 -## 3: 3 6 +## x y +## +## 1: 1 4 +## 2: 2 5 +## 3: 3 6 ``` ```r @@ -45,10 +46,11 @@ print(DT[, z := 10:12]) # yes ``` ``` -## x y z -## 1: 1 4 10 -## 2: 2 5 11 -## 3: 3 6 12 +## x y z +## +## 1: 1 4 10 +## 2: 2 5 11 +## 3: 3 6 12 ``` ```r @@ -57,10 +59,11 @@ DT # yes ``` ``` -## x y z a -## 1: 1 4 10 1 -## 2: 2 5 11 1 -## 3: 3 6 12 1 +## x y z a +## +## 1: 1 4 10 1 +## 2: 2 5 11 1 +## 3: 3 6 12 1 ``` Some text. @@ -68,4 +71,4 @@ Some text. 
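For readers unfamiliar with the auto-printing convention these saved outputs exercise, a minimal sketch of the behaviour documented in the comments above (nothing new in this commit beyond the display defaults):

```R
library(data.table)
DT = data.table(a = 1:2)
DT[1, a := 9L]   # := suppresses the next automatic print at the console
DT[]             # appending [] guarantees the result is printed
```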
> > proc.time() user system elapsed - 3.116 0.128 3.257 + 0.742 0.666 0.261 From 32ed90cbdda9a9c99bbbadc20fd559477fd557d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Mon, 29 Nov 2021 23:37:21 +0100 Subject: [PATCH 441/588] Fwrite rounding (#5249) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 ++++++- src/fwrite.c | 14 ++++++++------ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index eb1027e9a8..3abcdaed4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -500,6 +500,8 @@ 49. `fintersect(..., all=TRUE)` and `fsetdiff(..., all=TRUE)` could return incorrect results when the inputs had columns named `x` and `y`, [#5255](https://github.com/Rdatatable/data.table/issues/5255). Thanks @Fpadt for the report, and @ben-schwen for the fix. +50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a3f11e6762..0b77d98fae 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10982,7 +10982,7 @@ setattr(DT[[4]], "tzone", NULL) setattr(DT[[5]], "tzone", NULL) # format() now supports digits = 0, to display nsmall decimal places. 
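A small sketch of the rounding problem being fixed, using the first value from new test 2228 below; `tz="UTC"` is added here only to make the example self-contained:

```R
library(data.table)
DT = data.table(t = as.POSIXct(1678296152.99999952316284179688, origin = "1970-01-01", tz = "UTC"))
fwrite(DT)   # writes to the console when no file is given
# t
# 2023-03-08T17:22:33Z
```

Before the fix, a value this close to (but just under) a whole second could be written with a malformed fractional part such as `2023-03-08T17:22:32.:00Z`.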
-options(digits.secs=0) +old=options(digits.secs=0) test(1741.3, x1<-capture.output(fwrite(DT,dateTimeAs="write.csv")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=3) @@ -10993,6 +10993,7 @@ test(1741.5, x3<-capture.output(fwrite(DT,dateTimeAs="write.csv")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) # check that extra digits made it into output test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3))) +options(old) # fread should properly handle NA in colClasses argument #1910 test(1743.01, sapply(fread("a,b\n3,a", colClasses=c(NA, "factor")), class), c(a="integer", b="factor")) @@ -18414,3 +18415,7 @@ DT = data.table(x=c(1,2,2,2), y=LETTERS[c(1,2,2,3)]) test(2227.1, fintersect(DT, DT, all=TRUE), DT) test(2227.2, fsetdiff(DT, DT, all=TRUE), DT[0]) +# fwrite POSIXct rounding, #5238 +DT = data.table(as.POSIXct(c(1678296152.99999952316284179688, -118944658.0000004, -.00000004), origin='1970-01-01 00:00:00')) +test(2228, fwrite(DT), output="2023-03-08T17:22:33Z.*1966-03-26T07:49:02Z.*1970-01-01T00:00:00Z") + diff --git a/src/fwrite.c b/src/fwrite.c index 2d10d222fd..4922dd8b78 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -417,19 +417,21 @@ void writePOSIXct(double *col, int64_t row, char **pch) write_chars(na, &ch); } else { int64_t xi, d, t; - if (x>=0) { - xi = floor(x); + xi = floor(x); + int m = ((x-xi)*10000000); // 7th digit used to round up if 9 + m += (m%10); // 9 is numerical accuracy, 8 or less then we truncate to last microsecond + m /= 10; + int carry = m / 1000000; // Need to know if we rounded up to a whole second + m -= carry * 1000000; + xi += carry; + if (xi>=0) { d = xi / 86400; t = xi % 86400; } else { // before 1970-01-01T00:00:00Z - xi = floor(x); d = (xi+1)/86400 - 1; t = xi - d*86400; // xi and d are both negative here; t becomes the positive number of seconds into the day } - int m = ((x-xi)*10000000); // 7th digit used to round up if 9 - m += (m%10); // 9 is numerical accuracy, 8 or less then we truncate to last microsecond - m /= 10; write_date(d, &ch); *ch++ = 'T'; ch -= squashDateTime; From f83123f3c63b95e58810dfce4a19c69e357ab857 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 30 Nov 2021 07:23:31 +0100 Subject: [PATCH 442/588] fread improve containing header guess (#5257) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 +++++-- src/fread.c | 18 ++++++++++++------ src/fread.h | 3 ++- src/freadR.c | 6 +++--- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3abcdaed4d..f5fe08f923 100644 --- a/NEWS.md +++ b/NEWS.md @@ -251,6 +251,8 @@ # 3: 3 NA # 4: 4 NA ``` + +32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. 
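A sketch of the improved guess, using the same input as new test 1870.1 below: row 1 has a string in the first field, but that column is blank in every sampled row, so row 1 is no longer assumed to be column names:

```R
library(data.table)
fread("A,100,200\n,300,400\n,500,600")
#        V1    V2    V3
# 1:      A   100   200
# 2:          300   400
# 3:          500   600
```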
## BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0b77d98fae..8982342bc4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12498,9 +12498,12 @@ if (test_R.utils) { } # better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526 -test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(A=NA, "100"=c(300L,500L), "200"=c(400L,600L))) -test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L))) +test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) +test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) +test(1870.4, fread("A,B,200\n,300,400\n,500,600"), data.table(A=NA, B=c(300L,500L), "200"=c(400L,600L))) +test(1870.5, fread("A,B,\n,,\n,500,600"), data.table(A=NA, B=c(NA,500L), V3=c(NA,600L))) +test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(NA,300L,500L), V3=c(NA,400L,600L))) # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" diff --git a/src/fread.c b/src/fread.c index 2aa796037c..04df88d9c2 100644 --- a/src/fread.c +++ b/src/fread.c @@ -66,8 +66,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL; static lenOff *colNames = NULL; static freadMainArgs args = {0}; // global for use by DTPRINT; static implies ={0} but include the ={0} anyway just in case for valgrind #4639 -const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; -int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; +const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; +int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043 // So we assign them through below init function. 
@@ -1076,6 +1076,12 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx) *target = NA_FLOAT64; } +static void parse_empty(FieldParseContext *ctx) +{ + int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)]; + *target = NA_BOOL8; +} + /* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */ static void parse_bool_numeric(FieldParseContext *ctx) { @@ -1152,7 +1158,8 @@ static void parse_bool_lowercase(FieldParseContext *ctx) */ typedef void (*reader_fun_t)(FieldParseContext *ctx); static reader_fun_t fun[NUMTYPE] = { - (reader_fun_t) &Field, + (reader_fun_t) &Field, // CT_DROP + (reader_fun_t) &parse_empty, // CT_EMPTY (reader_fun_t) &parse_bool_numeric, (reader_fun_t) &parse_bool_uppercase, (reader_fun_t) &parse_bool_titlecase, @@ -1167,7 +1174,7 @@ static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &Field }; -static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static int disabled_parsers[NUMTYPE] = {0}; static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) { // used in sampling column types and whether column names are present @@ -1883,8 +1890,7 @@ int freadMain(freadMainArgs _args) { bool bumped=false; detect_types(&ch, tmpType, ncol, &bumped); if (sampleLines>0) for (int j=0; jCT_EMPTY) { args.header=true; if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"), j+1, typeName[type[j]], sampleLines); diff --git a/src/fread.h b/src/fread.h index 446da18e4b..7035615a55 100644 --- a/src/fread.h +++ b/src/fread.h @@ -20,7 +20,8 @@ typedef enum { NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule - CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. + CT_EMPTY, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. 
EMPTY to help column heading guess, #5257 + CT_BOOL8_N, // int8_t CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, diff --git a/src/freadR.c b/src/freadR.c index 97fe691aa1..82992aba33 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -24,9 +24,9 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards -static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; -static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; -static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; +static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; +static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; +static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; static colType readInt64As=CT_INT64; static SEXP selectSxp; static SEXP dropSxp; From 0dfc5a8634ac63ce1de3478983dcfcbaf225983e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 30 Nov 2021 10:11:15 +0100 Subject: [PATCH 443/588] fread support .zip and .tar (#5240) --- NEWS.md | 2 ++ R/fread.R | 24 ++++++++++++++++++++++-- inst/tests/multi-file.zip | Bin 0 -> 974 bytes inst/tests/russellCRLF.tar | Bin 0 -> 10240 bytes inst/tests/russellCRLF.zip | Bin 0 -> 467 bytes inst/tests/tests.Rraw | 18 ++++++++++++++++++ 6 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 inst/tests/multi-file.zip create mode 100644 inst/tests/russellCRLF.tar create mode 100644 inst/tests/russellCRLF.zip diff --git a/NEWS.md b/NEWS.md index f5fe08f923..c226f84fd1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -254,6 +254,8 @@ 32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. +33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` attempts to infer the correct filetype from its magic bits. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). 
Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/fread.R b/R/fread.R index 12f46b57ea..f8b025d9c3 100644 --- a/R/fread.R +++ b/R/fread.R @@ -98,10 +98,30 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") warningf("File '%s' has size 0. Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame') return(if (data.table) data.table(NULL) else data.frame(NULL)) } - if (w <- endsWithAny(file, c(".gz",".bz2"))) { + + # support zip and tar files #3834 + zip_signature = charToRaw("PK\x03\x04") + file_signature = readBin(file, raw(), 8L) + + if ((w <- endsWithAny(file, c(".zip", ".tar"))) || identical(head(file_signature, 4L), zip_signature)) { + FUN = if (w==2L) untar else unzip + fnames = FUN(file, list=TRUE) + if (is.data.frame(fnames)) fnames = fnames[,1L] + if (length(fnames) > 1L) + stopf("Compressed files containing more than 1 file are currently not supported.") + FUN(file, exdir=tmpdir) + decompFile = file.path(tmpdir, fnames) + file = decompFile + on.exit(unlink(decompFile), add=TRUE) + } + + gz_signature = as.raw(c(0x1F, 0x8B)) + bz2_signature = as.raw(c(0x42, 0x5A, 0x68)) + gzsig = FALSE + if ((w <- endsWithAny(file, c(".gz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) { if (!requireNamespace("R.utils", quietly = TRUE)) stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (w==1L) gzfile else bzfile + FUN = if (w==1L || gzsig) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) diff --git a/inst/tests/multi-file.zip b/inst/tests/multi-file.zip new file mode 100644 index 0000000000000000000000000000000000000000..6bf27e2a42e515a690cebb1a05d874a428e64f57 GIT binary patch literal 974 zcmWIWW@Zs#-~htjhHpU(Q1F0-fkBW#fuX3hxHvT@$2rJ3$j41DxwtGegqMN+b=A*= zsKu3uAiA`In}Lxf_t88CmK-3_dwFkOivbV&hrcTK7!8#l*W5f==iOfR;=;KX4q8Hc zS968buChyZ4!9f=Yoo}x-{R-(%y;V!|A@5zvOf8;+4H))^8SDK^nU%kW#64`TkZVi zFB`{>@n1jYN7;KVIhU|paAT%#!GSwc?g9bJ zb~;Zsb>0xjZs52k_8LpodRg(nB;UyDgzHjH0Xo;_lrr>sTI*kZrEc2IVfJp0$Ogge z&&gXRrl>ShZ`~Un-o8q@!mgZBoh%aBZaz@D7OG|`0 zO~t=Ue=Xd<)2fy^z?+@ph2869MMee&P$&mr4gXz08GhvOM+^PRxjIh{}2N+^O>t#g{Gd8U8MVHWrf?NuoeX||EahBefqdP_I~WIhnuTj#I^5Q z>3?^V?Yv!?va6mKR_&CSK-{}iL+My{`jZPwmd)n>c{8% zs)|nNCT0t6%+sh6$UI@}awQ^j;wI4!y_*~&Q?I3;PS{?1S;#HZ@;1XG=_D3qFYTLd z8+Js?mt6ak@>Jm9Qsd2xic3vfpB~P7pPjQ!PgMP8h*s*kwoR!M&T?72of*50NoEqS zLs+Kj?SebzN7rs$c*A^0ZT#}JfjXPGo}D=Ldd*QK?f2i;cx2bV&Th1fY+A7OUA=vF z@wR<0(jVPrOAVXWJ#S0ed(X6`QGa~gP z#|tR+!@!b85R2rzfNngp)u70QfhCO%KnKB%CoZZ3yjj^m1~38P9w7aP3B&^c7uJ#h literal 0 HcmV?d00001 diff --git a/inst/tests/russellCRLF.tar b/inst/tests/russellCRLF.tar new file mode 100644 index 0000000000000000000000000000000000000000..6d508dcce02c2d7042ca9a5a3d8595f215923df0 GIT binary patch literal 10240 zcmeH{J5Iwu5I|icaRzTCGlJm*+5Vw%K1Sfy)__y{5z8J|4cGtF(_xDX%hVgm=nYZFMH3 zcgAM9NZYBFPM`c 
zwTp?47QCNm>mUjW>!(7GtcPfjji4iuwIl05rMP&6B(iQ~19&^pPJr}8){kriCB9f2 zXsjGu86m)6&Q5~YL{@&sO|0u#>V2geS%byFxL7KpXF1f!2nRl~9t<^SGsgAc0RIjP2!}7l6%dH7)G$Y>CslGKF?FJ=p}smg5UQVFoWX<~18jf|umLu}2G{@_U;}J`4X^<=zy{a=8~7IkpVVJ+5C8xG literal 0 HcmV?d00001 diff --git a/inst/tests/russellCRLF.zip b/inst/tests/russellCRLF.zip new file mode 100644 index 0000000000000000000000000000000000000000..c5060f1fed4209762c01eb602d31e904c1daa7f7 GIT binary patch literal 467 zcmWIWW@Zs#U|`^2=xg{El%v@fBhSddu#1I(fuBK!p{TUDI5j87ImpLNFS)oZG=!6Z z`E}LLg!nm?iKP|X42&$fkLEG3{HNaf_vz#I*!!`+9&WCB5!b$JrT^Vcw)1vn%C34|Sh;hZ zcd3;9vAP{!KF`-b`ug_lUxjm@C(c^+`{SQF+w%PQs~?~5t13F7o0u)QF;AmPAoGN= z%aw@CiJL?_^loy9Oud$VI$?Y5Wg)jr%i9c(q?1^by|iz-ZP*blUvlkF%2R=ZON}=( zDlRo`eR??SeRj?^JyG?WAzG>D+BT(5ILl@6c4q80CYedR4q=(9w+rr=A6>h3;SKX0 zweic>2I_3$dUoQ}>orG}wBLVU z?>*C&PED%2DEE1G+6U6LW?UgH0gM3#24KK5ENKL>Fv6V`67FbW9^lQ&22#!l MgrPt>17rdN0J7r1)Bpeg literal 0 HcmV?d00001 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8982342bc4..1c5704bfce 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18422,3 +18422,21 @@ test(2227.2, fsetdiff(DT, DT, all=TRUE), DT[0]) DT = data.table(as.POSIXct(c(1678296152.99999952316284179688, -118944658.0000004, -.00000004), origin='1970-01-01 00:00:00')) test(2228, fwrite(DT), output="2023-03-08T17:22:33Z.*1966-03-26T07:49:02Z.*1970-01-01T00:00:00Z") +# automatically infer known files signatures and attempt auto-unzip #3834 +DT = fread(testDir("russellCRLF.csv")) +test(2229.1, fread(testDir("russellCRLF.zip")), DT) +test(2229.2, fread(testDir("russellCRLF.tar")), DT) +# guess binary file type +f = tempfile() +file.copy(testDir("russellCRLF.zip"), f, overwrite=TRUE) +test(2229.3, fread(f), DT) +if (test_R.utils) { + file.copy(testDir("ch11b.dat.bz2"), f, overwrite=TRUE) + test(2229.4, fread(f, logical01=FALSE)[,1], data.table(V1 = 1:100)) + file.copy(testDir("issue_785_fread.txt.gz"), f, overwrite=TRUE) + test(2229.5, fread(f, logical01=FALSE)[,25], data.table(Sv3 = c(10,14,14,15))) +} +unlink(f) +# not supporting multi file zips yet +test(2229.6, fread(testDir("multi-file.zip")), error="Compressed files containing more than 1 file are currently not supported.") + From 154fcf9878892a642a08feab2b94b771f9d4f69c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 30 Nov 2021 19:55:04 +0100 Subject: [PATCH 444/588] Alias for functional assignment by reference `:=`(...) (#5261) --- NAMESPACE | 2 +- NEWS.md | 14 ++++++++++++- R/data.table.R | 12 +++++++---- inst/tests/tests.Rraw | 48 +++++++++++++++++++++++++++++-------------- man/assign.Rd | 11 +++++++++- 5 files changed, 65 insertions(+), 22 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 157d39a081..207d50593a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,7 +10,7 @@ export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%") export(timetaken) -export(truelength, setalloccol, alloc.col, ":=") +export(truelength, setalloccol, alloc.col, ":=", let) export(setattr, setnames, setcolorder, set, setDT, setDF) export(setorder, setorderv) export(setNumericRounding, getNumericRounding) diff --git a/NEWS.md b/NEWS.md index c226f84fd1..2e09efbc3a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -254,7 +254,19 @@ 32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. 
This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. -33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` attempts to infer the correct filetype from its magic bits. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. +33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. + +34. `DT[, let(...)]` is a new alias for the functional form of `:=`; i.e. `DT[, ':='(...)]`, [#3795](https://github.com/Rdatatable/data.table/issues/3795). Thanks to Elio Campitelli for requesting, and Benjamin Schwendinger for the PR. + + ```R + DT = data.table(A=1:2) + DT[, let(B=3:4, C=letters[1:2])] + DT + # A B C + # + # 1: 1 3 a + # 2: 2 4 b + ``` ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index e020ea3e3d..cf952e306e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -310,7 +310,9 @@ replace_dot_alias = function(e) { as.character(jsub[[1L]])[1L] } else "" } - if (root == ":=") { + if (root == ":=" || root == "let") { # let(...) as alias for :=(...) (#3795) + if (root == "let") + jsub[[1L]] = as.symbol(":=") allow.cartesian=TRUE # (see #800) if (!missing(i) && keyby) stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") @@ -1107,7 +1109,7 @@ replace_dot_alias = function(e) { if (is.null(names(jsub))) { # regular LHS:=RHS usage, or `:=`(...) with no named arguments (an error) # `:=`(LHS,RHS) is valid though, but more because can't see how to detect that, than desire - if (length(jsub)!=3L) stopf("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") + if (length(jsub)!=3L) stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named.", if (root == "let") "let" else "`:=`") lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { @@ -1119,7 +1121,7 @@ replace_dot_alias = function(e) { } else { # `:=`(c2=1L,c3=2L,...) lhs = names(jsub)[-1L] - if (any(lhs=="")) stopf("In `:=`(col1=val1, col2=val2, ...) form, all arguments must be named.") + if (any(lhs=="")) stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named.", if (root == "let") "let" else "`:=`") names(jsub)="" jsub[[1L]]=as.name("list") } @@ -2772,9 +2774,11 @@ address = function(x) .Call(Caddress, eval(substitute(x), parent.frame())) ":=" = function(...) { # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227 - stopf('Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").') + stopf('Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) are defined for use in j, once only and in particular ways. See help(":=").') } +let = function(...) `:=`(...) 
+ setDF = function(x, rownames=NULL) { if (!is.list(x)) stopf("setDF only accepts data.table, data.frame or list of equal length as input") if (anyDuplicated(rownames)) stopf("rownames contains duplicates") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1c5704bfce..e489f73461 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1170,7 +1170,7 @@ test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error=base_message # test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557. test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3))) -test(383, DT[,{z=10L;b:=z},by=a], error=":= and `:=`(...) are defined for use in j, once only and in particular ways") +test(383, DT[,{z=10L;b:=z},by=a], error="defined for use in j, once only and in particular ways") test(384, DT[,{mySD=copy(.SD);mySD[1,b:=99L];mySD},by=a], data.table(a=rep(1:3,1:3),b=c(99L,99L,4L,99L,6L,6L))) # somehow missed testing := on logical subset with mixed TRUE/FALSE, reported by Muhammad Waliji @@ -2165,9 +2165,13 @@ test(738, DT[,c("c2", "c1"):=list(c1+1L, NULL)], data.table(c2=2:3)) # `:=`(c1=v1,v2=v2,...) is now valid , #2254 DT = data.table( c1=1:3 ) -test(739, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) -test(740, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named") -test(741, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch +test(739.1, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) +test(739.2, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named") +test(739.3, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch +DT = data.table( c1=1:3 ) +test(739.4, DT[,let(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) +test(739.5, DT[,let(4:6,c3=7:9)], error="all arguments must be named") +test(739.6, DT[,let(4:6,7:9,10:12)], error="all arguments must be named") # that out of bounds LHS is caught, root cause of #2254 test(742, DT[,3:6:=1L], error="outside.*range") @@ -2181,12 +2185,14 @@ test(746, DT["a",c("new1","new2"):=list(4L, 5L)], data.table(a=letters[c(1:3,3L)],new1=INT(4,NA,NA,NA),new2=INT(5,NA,NA,NA),key="a")) test(747.1, DT[,new1:=4:6], error="Supplied 3 items to be assigned to 4 items of column 'new1'") test(747.2, DT[,new1:=INT(4,5,6,4)], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a")) -test(748, DT[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) +test(748.1, copy(DT)[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) +test(748.2, copy(DT)[c("c","b"),let(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) # and multiple LHS by group, #1710 DT = data.table(a=rep(6:8,1:3),b=1:6) test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8,1:3),b=1:6,c=rep(1:3,1:3),d=INT(rep(c(1,5,15),1:3)),e=rep(6:8,1:3)*10L)) -test(750, DT[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) +test(750.1, copy(DT)[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) +test(750.2, 
copy(DT)[a<8,let(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) # varname holding colnames, by group, linked from #2120. DT = data.table(a=rep(1:3,1:3),b=1:6) @@ -2284,7 +2290,8 @@ test(783, DT[,.I,by=a]$I, 1:8) test(784, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=INT(2,4,6,8),key="a")) test(785, DT[J(2:4),.I,by=a%%2L], data.table(a=rep(0:1,c(4,2)),I=INT(3,4,7,8,5,6))) test(786, DT[J(c(3,2,4)),list(.I,.GRP),by=.EACHI], data.table(a=rep(c(3L,2L,4L),each=2),I=INT(5,6,3,4,7,8),GRP=rep(1:3,each=2L))) -test(787, DT[J(3:2),`:=`(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) +test(787.1, copy(DT)[J(3:2),`:=`(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) +test(787.2, copy(DT)[J(3:2),let(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) # New not-join (a.k.a. not-select, since not just for data.table i but integer, logical and character too) DT = data.table(A=rep(1:3,each=2),B=1:6,key="A") @@ -2789,7 +2796,8 @@ test(950, fread('A,B,C\n1,+,4\n2,-,5\n3,-,6\n'), data.table(A=1:3,B=c("+","-","- # catching misuse of `:=` x = data.table(a=1:5) -test(951, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways") +test(951.1, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways") +test(951.2, x[,{b=a+3; let(c=b)}], error="defined for use in j, once only and in particular ways") # fread colClasses input = 'A,B,C\n01,foo,3.140\n002,bar,6.28000\n' @@ -2840,7 +2848,8 @@ test(978.3, fread(input, skip=9), data.table(E=9:10, F=11:12)) # mixed add and update in same `:=` bug/crash, #2528 and #2778 DT = data.table(x=rep(1:2, c(3,2)), y=6:10) DT[, z:=.GRP, by=x] # first assignment -test(979, DT[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add +test(979.1, copy(DT)[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add +test(979.2, copy(DT)[, let(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # and example from http://stackoverflow.com/a/14732348/403310 : dt1 = fread("Date,Time,A,B 01/01/2013,08:00,10,30 @@ -2854,13 +2863,18 @@ dt2 = fread("Date,A,B,C 02/01/2013,200,400,2") setkey(dt1, "Date") setkey(dt2, "Date") -test(980, dt1[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], +test(980.1, copy(dt1)[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], + data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) +test(980.2, copy(dt1)[dt2, let(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) DT = data.table(A=1:2,B=3:4,C=5:6) -test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'") +test(981.1, copy(DT)[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'") +test(981.2, copy(DT)[,let(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'") DT = data.table(A=1:2,B=3:4,C=5:6) -test(982, DT[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], +test(982.1, copy(DT)[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], 
data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # Also note that G is not yet iterative. In future: c(12,14) +test(982.2, copy(DT)[, let(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], + data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # rbindlist binding factors, #2650 test(983, rbindlist(list(data.table(factor(c("A","A","B","C","A"))), data.table(factor(c("B","F","A","G"))))), data.table(V1=factor(c("A","A","B","C","A","B","F","A","G")))) @@ -3949,7 +3963,8 @@ test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(200 DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) ans = copy(DT) -test(1143.4, DT[,`:=`(lx=tail(x,1L)), by=y], ans[, lx := x]) +test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], ans[, lx := x]) +test(1143.5, copy(DT)[,let(lx=tail(x,1L)), by=y], ans[, lx := x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 @@ -16842,7 +16857,9 @@ DT = data.table(id=1:9, grp=rep(1:3,each=3), val=c("a","b","c", "a","b","c", "a" test(2114.5, as.character(DT[, valfactor1 := factor(val), by = grp]$valfactor1), ans<-rep(c("a","b","c"),3)) test(2114.6, as.character(DT[, valfactor2 := factor(val), by = id]$valfactor2), ans) DT = data.table(x = rep(letters[c(3, 1, 2)], each = 2)) -test(2114.7, DT[, `:=`(g=.GRP, f=factor(.GRP)), by = x], +test(2114.7, copy(DT)[, `:=`(g=.GRP, f=factor(.GRP)), by = x], + data.table(x=rep(c("c","a","b"),each=2), g=rep(1:3,each=2), f=factor(rep(as.character(1:3),each=2)))) +test(2114.8, copy(DT)[, let(g=.GRP, f=factor(.GRP)), by = x], data.table(x=rep(c("c","a","b"),each=2), g=rep(1:3,each=2), f=factor(rep(as.character(1:3),each=2)))) # extra tests from #996 for completeness; no warning no-alloc coerce here of 0 and 1 numerics @@ -17989,7 +18006,8 @@ if (test_bit64) { # X[Y,,by=.EACHI] when Y contains integer64 also fixed in 1.12.4, #3779 X = data.table(x=1:3) Y = data.table(x=1:2, y=as.integer64(c(10,20))) - test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) + test(2193.2, copy(X)[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) + test(2193.3, copy(X)[Y, let(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } # endsWithAny added in #5097 for internal use replacing one use of base::endsWith (in fread.R) diff --git a/man/assign.Rd b/man/assign.Rd index f622755606..bb87a5221b 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -1,23 +1,31 @@ \name{:=} \alias{:=} \alias{set} +\alias{let} \title{ Assignment by reference } \description{ Fast add, remove and update subsets of columns, by reference. \code{:=} operator can be used in two ways: \code{LHS := RHS} form, and \code{Functional form}. See \code{Usage}. \code{set} is a low-overhead loop-able version of \code{:=}. It is particularly useful for repetitively updating rows of certain columns by reference (using a for-loop). See \code{Examples}. It can not perform grouping operations. + \code{let} is an alias for the functional form and behaves exactly like \code{`:=`}. + } \usage{ # 1. LHS := RHS form # DT[i, LHS := RHS, by = ...] # DT[i, c("LHS1", "LHS2") := list(RHS1, RHS2), by = ...] -# 2. Functional form +# 2a. Functional form with `:=` # DT[i, `:=`(LHS1 = RHS1, # LHS2 = RHS2, # ...), by = ...] +# 2b. Functional form with let +# DT[i, let(LHS1 = RHS1, +# LHS2 = RHS2, +# ...), by = ...] 
+ set(x, i = NULL, j, value) } \arguments{ @@ -42,6 +50,7 @@ set(x, i = NULL, j, value) DT[i, (colvector) := val] # same (NOW PREFERRED) shorthand syntax. The parens are enough to stop the LHS being a symbol; same as c(colvector). DT[i, colC := mean(colB), by = colA] # update (or add) column called "colC" by reference by group. A major feature of `:=`. DT[,`:=`(new1 = sum(colB), new2 = sum(colC))] # Functional form + DT[, let(new1 = sum(colB), new2 = sum(colC))] # New alias for functional form. } The \code{\link{.Last.updated}} variable contains the number of rows updated by the most recent \code{:=} or \code{set} calls, which may be useful, for example, in production settings for testing assumptions about the number of rows affected by a statement; see \code{\link{.Last.updated}} for details. From ccda29bd50cf28b5aa09ff2fbeb278420c5a3690 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 1 Dec 2021 00:08:39 +0100 Subject: [PATCH 445/588] add incomparables arg to merge.data.table and warn about passing arguments to ... (#5233) --- NEWS.md | 2 ++ R/merge.R | 23 +++++++++++++++++++++-- inst/tests/tests.Rraw | 13 +++++++++++++ man/merge.Rd | 9 ++++++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2e09efbc3a..61759c24d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -518,6 +518,8 @@ 50. `fwrite()` could produce not-ISO-compliant timestamps such as `2023-03-08T17:22:32.:00Z` when under a whole second by less than numerical tolerance of one microsecond, [#5238](https://github.com/Rdatatable/data.table/issues/5238). Thanks to @avraam-inside for the report and Václav Tlapák for the fix. +51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/merge.R b/R/merge.R index f237bcbf32..cbc9b9e291 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,5 +1,5 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, - all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), ...) 
{ + all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) @@ -14,7 +14,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL x0 = length(x)==0L y0 = length(y)==0L if (x0 || y0) { - if (x0 && y0) + if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) warningf("Input data.table '%s' has no columns.", "x") @@ -54,6 +54,15 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = unname(by) by.x = by.y = by } + + # warn about unused arguments #2587 + if (length(list(...))) { + ell = as.list(substitute(list(...)))[-1L] + for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) + unnamed_n = length(ell) - sum(names(ell) != "") + if (unnamed_n) + warningf("Passed %d unknown and unnamed arguments.", unnamed_n) + } # with i. prefix in v1.9.3, this goes away. Left here for now ... ## sidestep the auto-increment column number feature-leading-to-bug by ## ensuring no names end in ".1", see unit test @@ -72,6 +81,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } + # implement incomparables argument #2587 + if (!is.null(incomparables)) { + # %fin% to be replaced when #5232 is implemented/closed + "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table + xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) + yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) + # subset both so later steps still work + x = x[xind] + y = y[yind] + } dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] # includes JIS columns (with a i. 
prefix if conflict with x names) if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e489f73461..ffa0a95ac3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18458,3 +18458,16 @@ unlink(f) # not supporting multi file zips yet test(2229.6, fread(testDir("multi-file.zip")), error="Compressed files containing more than 1 file are currently not supported.") +# merge.data.table ignored incomparables argument without warning, #2587 +x = data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5) +y = data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5) +DT = as.data.table(x) +test(2230.1, setDF(merge(DT, y, by="k2", incomparables=NA)), merge(x, y, by="k2", incomparables=NA)) +test(2230.2, setDF(merge(DT, y, by="k2", incomparables=c(NA,4))), merge(x, y, by="k2", incomparables=c(NA,4))) +test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))), merge(x, y, by="k2", incomparables=c(4,5))) +test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5))) +test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5))) +test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.") +test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L), + merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments.")) + diff --git a/man/merge.Rd b/man/merge.Rd index 6fcbc10866..d8246668c3 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE -\dots) +incomparables = NULL, \dots) } \arguments{ @@ -51,6 +51,7 @@ fashion as the \code{\link{merge.data.frame}} method does.} non-\code{by.y} column names in \code{y} when they have the same column name as any \code{by.x}.} \item{allow.cartesian}{See \code{allow.cartesian} in \code{\link{[.data.table}}.} +\item{incomparables}{values which cannot be matched and therefore are excluded from by columns.} \item{\dots}{Not used at this time.} } @@ -125,6 +126,12 @@ setnames(d2, "a", "b") merge(d1, d2, by.x="a", by.y="b") merge(d1, d2, by.x="a", by.y="b", all=TRUE) merge(d2, d1, by.x="b", by.y="a") + +# using incomparables values +d1 <- data.table(a=c(1,2,NA,NA,3,1), z=1:6) +d2 <- data.table(a=c(1,2,NA), z=10:12) +merge(d1, d2, by="a") +merge(d1, d2, by="a", incomparables=NA) } \keyword{ data } From e2fee06c7b2b02d9fdf5f5c0b27753edd1554644 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 1 Dec 2021 13:58:20 -0700 Subject: [PATCH 446/588] glci-only: remove other.Rraw from rel-lin closes #5274 --- .gitlab-ci.yml | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 248f97a30d..5dbb2c2e91 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,19 +40,19 @@ mirror-packages: ## mirror all recursive dependencies, source and win.binary of - Rscript -e 'sapply(simplify=FALSE, 
setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw - stage: dependencies - tags: - - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev - cache: - paths: - - bus/$CI_BUILD_NAME/cran - script: - - echo 'source(".ci/ci.R")' >> .Rprofile - - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' - <<: *artifacts +# mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw; off now #5274 +# stage: dependencies +# tags: +# - linux +# image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev +# cache: +# paths: +# - bus/$CI_BUILD_NAME/cran +# script: +# - echo 'source(".ci/ci.R")' >> .Rprofile +# - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib +# - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' +# <<: *artifacts build: ## build data.table sources as tar.gz archive stage: build @@ -144,16 +144,17 @@ build: ## build data.table sources as tar.gz archive test-rel-lin: ## most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common -Wunused-result <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-builder - needs: ["mirror-packages","mirror-other-packages","build"] + needs: ["mirror-packages","build"] # "mirror-other-packages" variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" - TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "TRUE" + TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "FALSE" #5274 before_script: - - Rscript -e 'source(".ci/ci.R"); eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(c(dcf.dependencies("DESCRIPTION", which="all"), pkgs), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' + # - Rscript -e 'source(".ci/ci.R"); eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(c(dcf.dependencies("DESCRIPTION", which="all"), pkgs), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' + # removing pkgs from other.Rraw leaves dcf.dependencies("DESCRIPTION", which="all") but 'needs: mirror-packages' does that iiuc, let's see if it passes ... 
- *cp-src - rm -r bus - mkdir -p ~/.R From b28b496ee487b6bf21c559bf0867223dd59f03e1 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 1 Dec 2021 14:56:53 -0700 Subject: [PATCH 447/588] glci-only: install.packages for DESCRIPTION-dependencies is needed then as it failed without, #5274 --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5dbb2c2e91..3a031e9c1f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -153,8 +153,8 @@ test-rel-lin: ## most comprehensive tests, force all suggests, also integration OPENBLAS_MAIN_FREE: "1" TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "FALSE" #5274 before_script: - # - Rscript -e 'source(".ci/ci.R"); eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(c(dcf.dependencies("DESCRIPTION", which="all"), pkgs), quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' - # removing pkgs from other.Rraw leaves dcf.dependencies("DESCRIPTION", which="all") but 'needs: mirror-packages' does that iiuc, let's see if it passes ... + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), quiet=TRUE)' ## does seem to be needed despite 'needs mirror-packages' + ## - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(pkgs, quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' - *cp-src - rm -r bus - mkdir -p ~/.R From 3581f129a1b59d0c573ee2ea520415946af91b42 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 2 Dec 2021 11:04:58 -0700 Subject: [PATCH 448/588] GLCI-only: update R-rel.exe link, #5198 --- .gitlab-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3a031e9c1f..8688b0b47f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -96,7 +96,8 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.0/R-4.1.0-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/release.html; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + # https://cloud.r-project.org/bin/windows/base/ contains "A stable link which will redirect to the current Windows binary release is ..." 
which is used in the line above .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win From ef4d24c4ef888a745000de78d070a7a9809fed08 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 2 Dec 2021 11:07:21 -0700 Subject: [PATCH 449/588] GLCI-only: update R-rel.exe link (missed /base), #5198 --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8688b0b47f..a558aa3d3e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -96,7 +96,7 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/release.html; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/release.html; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait # https://cloud.r-project.org/bin/windows/base/ contains "A stable link which will redirect to the current Windows binary release is ..." which is used in the line above .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait From 9e53b697db6c2734afcf6aba36b3277964a0af09 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 2 Dec 2021 13:48:11 -0700 Subject: [PATCH 450/588] GLCI-only: use link to R-4.1.2-win.exe, #5198 --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a558aa3d3e..300adb2734 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -96,8 +96,8 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/release.html; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait - # https://cloud.r-project.org/bin/windows/base/ contains "A stable link which will redirect to the current Windows binary release is ..." 
which is used in the line above + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.1.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win From 198f45245318f2fd5d8388c3a7a56b7ddb1bd70a Mon Sep 17 00:00:00 2001 From: Ethan Smith <24379655+ethanbsmith@users.noreply.github.com> Date: Thu, 2 Dec 2021 16:31:52 -0700 Subject: [PATCH 451/588] 5268_as_xts_support_non_numerics (#5276) --- NEWS.md | 2 ++ R/xts.R | 13 ++++++++----- inst/tests/tests.Rraw | 5 +++++ man/as.xts.data.table.Rd | 7 ++++--- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 61759c24d8..cb49520d74 100644 --- a/NEWS.md +++ b/NEWS.md @@ -520,6 +520,8 @@ 51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. +52. `as.xts.data.table` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/xts.R b/R/xts.R index 005f0f6024..234f36cac6 100644 --- a/R/xts.R +++ b/R/xts.R @@ -15,11 +15,14 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { r[] } -as.xts.data.table = function(x, ...) { +as.xts.data.table = function(x, numeric.only = TRUE, ...) 
{ stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x)) if (!xts::is.timeBased(x[[1L]])) stopf("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") - colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index - if (!all(colsNumeric)) warningf("Following columns are not numeric and will be omitted: %s", brackify(names(colsNumeric)[!colsNumeric])) - r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]]) - return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]])) + r <- x[, -1L]# exclude first col, xts index + if (numeric.only) { + colsNumeric = vapply_1b(r, is.numeric) + if (!all(colsNumeric)) warningf("Following columns are not numeric and will be omitted: %s", brackify(names(colsNumeric)[!colsNumeric])) + r <- r[, .SD, .SDcols = names(colsNumeric)[colsNumeric]] + } + return(xts::xts(as.matrix(r), order.by = if (inherits(x[[1L]], "IDate")) as.Date(x[[1L]]) else x[[1L]])) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ffa0a95ac3..5325f6f6d2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6861,6 +6861,11 @@ if (test_xts) { M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) + # non-numeric xts coredata, #5268 + x = xts::xts(x=c(TRUE,FALSE), order.by=Sys.Date()+(1:2)) + colnames(x) = "value" # perhaps relates to #4897 + test(1465.20, identical(x, as.xts(as.data.table(x), numeric.only=FALSE))) + Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) } diff --git a/man/as.xts.data.table.Rd b/man/as.xts.data.table.Rd index 1f42cceab0..1328229edb 100644 --- a/man/as.xts.data.table.Rd +++ b/man/as.xts.data.table.Rd @@ -2,13 +2,14 @@ \alias{as.xts.data.table} \title{Efficient data.table to xts conversion} \description{ - Efficient conversion of data.table to xts, data.table must have \emph{POSIXct} or \emph{Date} type in first column. + Efficient conversion of data.table to xts, data.table must have a time based type in first column. See ?xts::timeBased for supported types } \usage{ -as.xts.data.table(x, \dots) +as.xts.data.table(x, numeric.only = TRUE, \dots) } \arguments{ -\item{x}{data.table to convert to xts, must have \emph{POSIXct} or \emph{Date} in the first column. All others non-numeric columns will be omitted with warning.} +\item{x}{data.table to convert to xts, must have a time based first column. As xts objects are indexed matrixes, all columns must be of the same type. If columns of multiple types are selected, standard as.matrix rules are applied during the conversion. 
} +\item{numeric.only}{If TRUE, only include numeric columns in the conversion and all non-numeric columns will be omitted with warning} \item{\dots}{ignored, just for consistency with generic method.} } \seealso{ \code{\link{as.data.table.xts}} } From 2ef323c2ae1abc3204c82af60c769356daf161b6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 3 Dec 2021 19:58:43 +0100 Subject: [PATCH 452/588] replace looped subsetVector with subsetDT (#5264) --- R/data.table.R | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index cf952e306e..ef514b8e02 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1303,18 +1303,8 @@ replace_dot_alias = function(e) { # But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT # to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally. # That R level branch was taken out in PR #3213 - - # TO DO: use CsubsetDT twice here and then remove this entire R level branch - for (s in seq_along(icols)) { - target = icolsAns[s] - source = icols[s] - ans[[target]] = .Call(CsubsetVector,i[[source]],ii) # i.e. i[[source]][ii] - } - for (s in seq_along(xcols)) { - target = xcolsAns[s] - source = xcols[s] - ans[[target]] = .Call(CsubsetVector,x[[source]],irows) # i.e. x[[source]][irows], but guaranteed new memory even for singleton logicals from R 3.1.0 - } + ans[icolsAns] = .Call(CsubsetDT, i, ii, icols) + ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols) setattr(ans, "names", ansvars) if (haskey(x)) { keylen = which.first(!key(x) %chin% ansvars)-1L From efee370fecaf77d5a8b0df0e3af66aa70ace40ec Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 3 Dec 2021 21:36:47 +0100 Subject: [PATCH 453/588] Gforce weighted.mean (#5246) --- NEWS.md | 2 ++ R/data.table.R | 26 ++++++++++----- inst/tests/tests.Rraw | 74 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index cb49520d74..94259eb061 100644 --- a/NEWS.md +++ b/NEWS.md @@ -267,6 +267,8 @@ # 1: 1 3 a # 2: 2 4 b ``` + +35. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index ef514b8e02..13ce66e233 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1737,14 +1737,13 @@ replace_dot_alias = function(e) { if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) + # ^^ base::startWith errors on NULL unfortunately if (length(q)>=2L && q[[1L]] == "shift") { q_named = match.call(shift, q) if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) - } # add gshift support - # ^^ base::startWith errors on NULL unfortunately - # head-tail uses default value n=6 which as of now should not go gforce ... 
^^ - # otherwise there must be three arguments, and only in two cases: - # 1) head/tail(x, 1) or 2) x[n], n>0 + } + if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE) #3977 + # otherwise there must be three arguments length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) } @@ -1759,13 +1758,13 @@ replace_dot_alias = function(e) { for (ii in seq_along(jsub)[-1L]) { if (dotN(jsub[[ii]])) next; # For #334 jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) - if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 + if (length(jsub[[ii]])==3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars) && exists(jsub[[ii]][[3L]], parent.frame())) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + if (length(jsub)==3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars) && exists(jsub[[3L]], parent.frame())) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); @@ -2981,7 +2980,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { # (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function # (3) define the gfun = function() R wrapper gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", - "median", "min", "max", "var", "sd", ".N", "shift") # added .N for #334 + "median", "min", "max", "var", "sd", ".N", "shift", "weighted.mean") # added .N for #334 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment @@ -2989,6 +2988,17 @@ gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) +gweighted.mean = function(x, w, na.rm=FALSE) { + if (missing(w)) gmean(x, na.rm) + else { + if (na.rm) { # take those indices out of the equation by setting them to 0 + ix <- is.na(x) + x[ix] <- 0 + w[ix] <- 0 + } + gsum((w!=0)*x*w, na.rm=FALSE)/gsum(w, na.rm=FALSE) + } +} gprod = function(x, na.rm=FALSE) .Call(Cgprod, x, na.rm) gmedian = function(x, na.rm=FALSE) .Call(Cgmedian, x, na.rm) gmin = function(x, na.rm=FALSE) .Call(Cgmin, x, na.rm) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5325f6f6d2..54f60d31ca 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18476,3 +18476,77 @@ test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unkno test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L), merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. 
`by` argument will be ignored.", "Passed 1 unknown and unnamed arguments.")) +# weighted.mean GForce optimized, #3977 +old = options(datatable.optimize=1L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") +test(2231.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") +test(2231.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2231.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2231.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2231.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE") +test(2231.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE") +test(2231.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2231.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE") +test(2231.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2231.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2231.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, 
each=4L)) +test(2231.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# same as previous test cases but now GForce optimized +options(datatable.optimize=2L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.31, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to") +test(2231.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to") +test(2231.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2231.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2231.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2231.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j to") +test(2231.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to") +test(2231.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2231.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to") +test(2231.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2231.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2231.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, 
each=4L)) +test(2231.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +options(old) + From 0404ed84be95eab81c704df1988717d22557bcf9 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 3 Dec 2021 13:58:37 -0700 Subject: [PATCH 454/588] #5276 moved news item to new feature section not bug --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 94259eb061..ea55888f23 100644 --- a/NEWS.md +++ b/NEWS.md @@ -270,6 +270,8 @@ 35. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. +36. `as.xts.data.table()` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. @@ -522,8 +524,6 @@ 51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. -52. `as.xts.data.table` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. - ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. 
Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From 6f3b7c1a4d681cbd1506aa114de7bc4e4f0760f9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 3 Dec 2021 13:46:36 -0800 Subject: [PATCH 455/588] cols argument for unique.data.table (#5244) --- NEWS.md | 2 ++ R/duplicated.R | 5 ++++- inst/tests/tests.Rraw | 9 +++++++++ man/duplicated.Rd | 11 +++++++++-- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index ea55888f23..859a8fc1b0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -272,6 +272,8 @@ 36. `as.xts.data.table()` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. +37. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/duplicated.R b/R/duplicated.R index 4fc7c8d166..901d6e3c01 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -23,7 +23,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ res } -unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { +unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), cols=NULL, ...) 
{ if (!cedta()) return(NextMethod("unique")) # nocov if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") @@ -31,6 +31,9 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (nrow(x) <= 1L) return(x) if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) + if (!is.null(cols)) { + x = .shallow(x, c(by, cols), retain.key=TRUE) + } # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't # as efficient as forderv returning empty o when input is already ordered diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 54f60d31ca..a2e9fa6e04 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18550,3 +18550,12 @@ test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.tabl test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") options(old) +# cols argument for unique.data.table, #5243 +DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1) +test(2232.1, unique(DT, by='g', cols='v1'), DT[1:26, !'v2']) +test(2232.2, unique(DT, by='g', cols='v2'), DT[1:26, !'v1']) +## no duplicates +test(2232.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) +## invalid columns fail as expected +test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") + diff --git a/man/duplicated.Rd b/man/duplicated.Rd index a9c333beb5..daf7c39d58 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -28,7 +28,8 @@ memory efficient. \usage{ \method{duplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) -\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) +\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, +by=seq_along(x), cols=NULL, \dots) \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) @@ -46,6 +47,8 @@ correspond to \code{duplicated = FALSE}.} of columns from \code{x} to use for uniqueness checks. By default all columns are being used. That was changed recently for consistency to data.frame methods. In version \code{< 1.9.8} default was \code{key(x)}.} +\item{cols}{Columns (in addition to \code{by}) from \code{x} to include in the + resulting \code{data.table}.} \item{na.rm}{Logical (default is \code{FALSE}). Should missing values (including \code{NaN}) be removed?} } @@ -59,7 +62,11 @@ handle cases where limitations in floating point representation is undesirable. \code{v1.9.4} introduces \code{anyDuplicated} method for data.tables and is similar to base in functionality. It also implements the logical argument -\code{fromLast} for all three functions, with default value \code{FALSE}. +\code{fromLast} for all three functions, with default value +\code{FALSE}. + +Note: When \code{cols} is specified, the resulting table will have +columns \code{c(by, cols)}, in that order. } \value{ \code{duplicated} returns a logical vector of length \code{nrow(x)} From 9619ad5fcde4fc951f8aed8b34bdc5bd61dd6ef5 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 3 Dec 2021 15:18:10 -0700 Subject: [PATCH 456/588] #5246-follow-up: remove exists() as it failed in older R versions with 'invalid first argument'; removing it passed all tests so it wasn't covered. 
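As a quick illustration of the `cols=` argument introduced for `unique.data.table()` in PATCH 455 above: a minimal sketch, not itself part of any patch; the table and column names are made up for the example.

```R
library(data.table)
DT = data.table(g = rep(letters[1:3], 2L), v1 = 1:6, v2 = 6:1)
unique(DT, by = "g", cols = "v1")   # result keeps only the columns g and v1
# same result as unique(DT, by = "g")[, c("g", "v1")],
# but without subsetting the unneeded v2 column first
```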
--- R/data.table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 13ce66e233..f6eee8dfd0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1758,13 +1758,13 @@ replace_dot_alias = function(e) { for (ii in seq_along(jsub)[-1L]) { if (dotN(jsub[[ii]])) next; # For #334 jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) - if (length(jsub[[ii]])==3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars) && exists(jsub[[ii]][[3L]], parent.frame())) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 + if (length(jsub[[ii]])==3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars)) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)==3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars) && exists(jsub[[3L]], parent.frame())) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + if (length(jsub)==3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); From c3df3bc7f096cd1884b4bab0bbdabd7361117b3a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 9 Dec 2021 01:34:27 +0100 Subject: [PATCH 457/588] `:=` works with GForce (#5245) --- NEWS.md | 2 + R/data.table.R | 19 ++++- inst/tests/tests.Rraw | 177 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 170 insertions(+), 28 deletions(-) diff --git a/NEWS.md b/NEWS.md index 859a8fc1b0..00a9b5be1b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -274,6 +274,8 @@ 37. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. +38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index f6eee8dfd0..198c31ee3b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1721,13 +1721,15 @@ replace_dot_alias = function(e) { dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. 
Although joins could work with # nomatch=NULL even now.. but not switching it on yet, will deal it separately. - if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { + if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { if (!length(ansvars) && !use.I) { GForce = FALSE - if ( (is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N") ) { + if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { GForce = TRUE if (verbose) catf("GForce optimized j to '%s'\n",deparse(jsub, width.cutoff=200L, nlines=1L)) } + } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N + GForce = FALSE } else { # Apply GForce .gforce_ok = function(q) { @@ -1758,13 +1760,13 @@ replace_dot_alias = function(e) { for (ii in seq_along(jsub)[-1L]) { if (dotN(jsub[[ii]])) next; # For #334 jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) - if (length(jsub[[ii]])==3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars)) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 + if (length(jsub[[ii]])>=3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars)) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)==3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + if (length(jsub)>=3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); @@ -1896,6 +1898,15 @@ replace_dot_alias = function(e) { # Grouping by by: i is by val, icols NULL, o__ may be subset of x, f__ points to o__ (or x if !length o__) # TO DO: setkey could mark the key whether it is unique or not. if (!is.null(lhs)) { + if (GForce) { # GForce should work with := #1414 + vlen = length(ans[[1L]]) + # replicate vals if GForce returns 1 value per group + jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep.int, times=len__) else tail(ans, -length(g)) + jrows = if (!is.null(irows) && length(irows)!=length(o__)) irows else { if (length(o__)==0L) NULL else o__} + # unwrap single column jvals for assign + if (length(jvals)==1L) jvals = jvals[[1L]] + .Call(Cassign, x, jrows, lhs, newnames, jvals) + } if (any(names_x[cols] %chin% key(x))) setkey(x,NULL) # fixes #1479. 
Take care of secondary indices, TODO: cleaner way of doing this diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a2e9fa6e04..e510b3292c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1809,25 +1809,41 @@ test(610.3, chorder(x), base::order(x)) test(610.4, unique(x[chgroup(x)]), unique(x)) # := by group +options(datatable.optimize=0L) +DT = data.table(a=1:3,b=(1:9)/10) +test(611.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +setkey(DT,a) +test(611.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(611.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +options(datatable.optimize=2L) DT = data.table(a=1:3,b=(1:9)/10) -test(611, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +test(612.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) -test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +test(612.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(612.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(612.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # Assign to subset ok (NA initialized in the other items) ok : test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) -# Combining := by group with i -test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(617, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] -test(618, copy(DT)[a>3,r:=sum(b)], ans) -test(619, copy(DT)[J(-1),r:=sum(b)], ans) -test(620.1, copy(DT)[NA,r:=sum(b)], ans) -test(620.2, copy(DT)[0,r:=sum(b)], ans) -test(620.3, copy(DT)[NULL,r:=sum(b)], null.data.table()) +options(datatable.optimize=0L) +test(618.1, copy(DT)[a>3,r:=sum(b)], ans) +test(618.2, copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3, copy(DT)[NA,r:=sum(b)], ans) +test(618.4, copy(DT)[0,r:=sum(b)], ans) +test(618.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) +options(datatable.optimize=2L) +test(619.1, copy(DT)[a>3,r:=sum(b)], ans) +test(619.2, copy(DT)[J(-1),r:=sum(b)], ans) +test(619.3, copy(DT)[NA,r:=sum(b)], ans) +test(619.4, copy(DT)[0,r:=sum(b)], ans) +test(619.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) DT = data.table(x=letters, key="x") test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained @@ -1835,7 +1851,10 @@ test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) -test(623, DT[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +options(datatable.optimize=0L) +test(623.1, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +options(datatable.optimize=2L) +test(623.2, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Tests on POSIXct attributes @@ -1902,12 +1921,20 @@ test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. 
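The tests below compare grouped `:=` with `datatable.optimize` at 0 and 2. As a minimal sketch of what PATCH 457 enables (not itself part of any patch; object names are illustrative and the verbose line is paraphrased):

```R
library(data.table)
dt_ex = data.table(a = 1:3, b = (1:9)/10)
options(datatable.optimize = 2L)
dt_ex[, v := sum(b), by = a, verbose = TRUE]  # verbose reports something like: GForce optimized j to 'gsum(b)'
dt_ex$v  # 1.2 1.5 1.8 repeated: one sum per group, recycled to each group's size and assigned by reference
```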
+options(datatable.optimize=0L) set.seed(1) DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(637, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(638, key(DT[J(43L),a:=99L]), NULL) +test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(637.2, key(DT[J(43L),a:=99L]), NULL) setkey(DT,a) -test(639, key(DT[,a:=99L,by=a]), NULL) +test(637.3, key(DT[,a:=99L,by=a]), NULL) +options(datatable.optimize=2L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(638.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(638.3, key(DT[,a:=99L,by=a]), NULL) # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) @@ -1999,18 +2026,32 @@ test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") # Test := keyby does setkey, #2065 +options(datatable.optimize=0L) DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670, DT[,z:=sum(y),keyby=x], ans) +test(670.1, DT[,z:=sum(y),keyby=x], ans) DT = data.table(x=1:2, y=1:6) -test(671, DT[,z:=sum(y),keyby="x"], ans) +test(670.2, DT[,z:=sum(y),keyby="x"], ans) DT = data.table(x=1:2, y=1:6) -test(672, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), +test(670.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") DT = data.table(x=1:2, y=1:6) -test(673, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +DT = data.table(x=1:2, y=1:6) +test(670.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +options(datatable.optimize=2L) +DT = data.table(x=1:2, y=1:6) +ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") +test(671.1, DT[,z:=sum(y),keyby=x], ans) +DT = data.table(x=1:2, y=1:6) +test(671.2, DT[,z:=sum(y),keyby="x"], ans) +DT = data.table(x=1:2, y=1:6) +test(671.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), + warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. 
To avoid this warning, use by= instead, or provide existing column names to keyby=") DT = data.table(x=1:2, y=1:6) -test(674, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +test(671.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +DT = data.table(x=1:2, y=1:6) +test(671.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # Test new .() DT = data.table(x=1:2, y=1:6, key="x") @@ -2195,13 +2236,22 @@ test(750.1, copy(DT)[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.tabl test(750.2, copy(DT)[a<8,let(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) # varname holding colnames, by group, linked from #2120. +options(datatable.optimize=0L) +DT = data.table(a=rep(1:3,1:3),b=1:6) +colname = "newcol" +test(751.1, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +options(datatable.optimize=2L) DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" -test(751, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +test(751.2, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 +options(datatable.optimize=0L) DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +test(752.1, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +options(datatable.optimize=2L) +DT = data.table(a=rep(1:3,2:4),b=1:9) +test(752.2, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) @@ -3871,7 +3921,10 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) +options(datatable.optimize=0L) test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +options(datatable.optimize=2L) +test(1133.75, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3962,9 +4015,12 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -ans = copy(DT) -test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], ans[, lx := x]) -test(1143.5, copy(DT)[,let(lx=tail(x,1L)), by=y], ans[, lx := x]) +options(datatable.optimize=0L) +test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +options(datatable.optimize=2L) +test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, 
lx:=x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 @@ -14264,9 +14320,14 @@ x <- as.array(1:5) test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 +options(datatable.optimize=0L) DT = data.table( id = 1:5 , val = letters[1:5] ) test(1981.1, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +options(datatable.optimize=Inf) +DT = data.table( id = 1:5 , val = letters[1:5] ) +test(1981.3, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") +test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # print of DT with many columns reordered them, #3306. DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print @@ -18559,3 +18620,71 @@ test(2232.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) ## invalid columns fail as expected test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") +# support := with GForce #1414 +options(datatable.optimize = 2L) +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.01, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to") +# GForce returning full length +test(2233.02, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") +# GForce neither returning 1 per group nor full length +test(2233.03, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +# compare to non GForce version +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.04, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.05, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# with key and grouping by key +DT = data.table(a=1:3,b=(1:9)/10, key="a") +test(2233.06, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10, key="a"), output="GForce optimized j to") +test(2233.07, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10, key="a"), output="GForce optimized j to") +test(2233.08, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10, key="a") +test(2233.09, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.10, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# with key and grouping by nonkey +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.11, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="c"), output="GForce optimized j to") +test(2233.12, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:9)/10, key="c"), output="GForce optimized j to") +test(2233.13, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.14, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.15, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# with key and keyby by 
nonkey +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.16, copy(DT)[, v := min(b), keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:3)/10, key="a"), output="GForce optimized j to") +test(2233.17, copy(DT)[, v := head(b, 3L), keyby=a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, c=(3:1), v=(1:9)/10, key="a"), output="GForce optimized j to") +test(2233.18, copy(DT)[, v := head(b, 2L), keyby=a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10,c=(3:1),key="c") +test(2233.19, copy(DT)[, v := min(b), keyby=a, verbose=TRUE], copy(DT)[, v := base::min(b), keyby=a], output="GForce optimized j to") +test(2233.20, copy(DT)[, v := head(b, 3L), keyby=a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), keyby=a], output="GForce optimized j to") +# with irows +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.21, DT[a==2, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=c(NA,0.2,NA)), output="GForce optimized j to") +test(2233.22, DT[a!=4, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") +test(2233.23, DT[a!=4, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.24, copy(DT)[a==2, v := min(b), a, verbose=TRUE], copy(DT)[a==2, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.25, copy(DT)[a!=4, v := head(b, 3L), a, verbose=TRUE], copy(DT)[a!=4, v := utils::head(b, 3L), a], output="GForce optimized j to") + +# multiple assignments +DT = data.table(a=1:3,b=(1:9)/10) +test(2233.26, DT[, c("v1","v2") := .(min(b), max(b)), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=(1:3)/10, v2=(7:9)/10), output="GForce optimized j to") +test(2233.27, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v1=(1:9)/10, v2=(1:9)/10), output="GForce optimized j to") +test(2233.28, DT[, c("v1","v2") := .(head(b,3L), tail(b,2L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v2'.") +test(2233.29, DT[, c("v1","v2") := .(head(b,2L), tail(b,3L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v1'.") +test(2233.30, DT[, c("v1","v2") := .(head(b,2L), tail(b,2L)), a], error="Supplied 6 items to be assigned to 9 items of column 'v1'.") +test(2233.31, DT[, c("v1","v2") := .(min(b), max(b)), a, verbose=TRUE], DT[, c("v1","v2") := .(base::min(b), base::max(b)), a ], output="GForce optimized j to") +test(2233.32, DT[, c("v1","v2") := .(head(b,3L), tail(b,3L)), a, verbose=TRUE], DT[, c("v1","v2") := .(utils::head(b,3L), utils::tail(b,3L)), a], output="GForce optimized j to") + +# gforce needs to evaluate variable arguments before calling C part (part of test 101.17 in programming.Rraw) +set.seed(108) +yn = c(1, 5, 10, 20) +ycols = paste0("y", yn) +ydt = data.table(symbol = rep(1:3, each = 100)) +ydt[, date := seq_len(.N), by = symbol] +ydt[, ret := rnorm(.N)] +f = shift +test(2233.33, copy(ydt)[, (ycols) := shift(ret, yn, type = "lead"), by = symbol, verbose=TRUE], copy(ydt)[, (ycols) := f(ret, yn, type = "lead"), by = symbol], output="GForce optimized j to") + From 75b5d00b403b58284cbdee13b760776142f38cee Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 8 Dec 2021 19:25:34 -0700 Subject: [PATCH 458/588] #5245 rep.int to rep to pass R<=3.5.0 --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 198c31ee3b..dcb18ad1f9 100644 --- 
a/R/data.table.R +++ b/R/data.table.R @@ -1901,7 +1901,7 @@ replace_dot_alias = function(e) { if (GForce) { # GForce should work with := #1414 vlen = length(ans[[1L]]) # replicate vals if GForce returns 1 value per group - jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep.int, times=len__) else tail(ans, -length(g)) + jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep, times=len__) else tail(ans, -length(g)) # see comment in #4245 for why rep instead of rep.int jrows = if (!is.null(irows) && length(irows)!=length(o__)) irows else { if (length(o__)==0L) NULL else o__} # unwrap single column jvals for assign if (length(jvals)==1L) jvals = jvals[[1L]] From 8b257b8c2f638567c2f7166f6bc3e086cf4195c2 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 9 Dec 2021 07:37:31 +0100 Subject: [PATCH 459/588] enable use of by=.I (#5235) --- NEWS.md | 16 ++++++++++++++++ R/data.table.R | 6 ++++++ inst/tests/tests.Rraw | 14 ++++++++++++++ man/special-symbols.Rd | 10 +++++++--- 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 00a9b5be1b..aad41a0ccb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -276,6 +276,22 @@ 38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. +39. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. + + ```R + DT + # V1 V2 + # + # 1: 3 5 + # 2: 4 6 + + DT[, sum(.SD), by=.I] + # I V1 + # + # 1: 1 8 + # 2: 2 10 + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index dcb18ad1f9..e671a208df 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -766,6 +766,12 @@ replace_dot_alias = function(e) { # may evaluate to NULL | character() | "" | list(), likely a result of a user expression where no-grouping is one case being loop'd through bysubl = as.list.default(bysub) bysuborig = bysub + if (".I" %in% bysubl) { #1732 + if (!is.symbol(bysub) && (length(bysubl)!=2L || !is.symbol(bysubl[[2L]]) || !(bysubl[[1L]] %chin% c(".","c","list")))) + stopf("'by' contains .I but only the following are currently supported: by=.I, by=.(.I), by=c(.I), by=list(.I)") + bysub = if (is.null(irows)) seq_len(nrow(x)) else irows + bysuborig = as.symbol("I") + } if (is.name(bysub) && !(bysub %chin% names_x)) { # TO DO: names(x),names(i),and i. and x. 
prefixes bysub = eval(bysub, parent.frame(), parent.frame()) # fix for # 5106 - http://stackoverflow.com/questions/19983423/why-by-on-a-vector-not-from-a-data-table-column-is-very-slow diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e510b3292c..fc7e14f753 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18688,3 +18688,17 @@ ydt[, ret := rnorm(.N)] f = shift test(2233.33, copy(ydt)[, (ycols) := shift(ret, yn, type = "lead"), by = symbol, verbose=TRUE], copy(ydt)[, (ycols) := f(ret, yn, type = "lead"), by = symbol], output="GForce optimized j to") +# support by=.I; #1732 +DT = data.table(V1=1:5, V2=3:7, V3=5:1) +test(2234.1, DT[, min(.SD), by=.I], setnames(DT[, min(.SD), by=1:nrow(DT)], "nrow", "I")) +test(2234.2, DT[, min(.SD), by=.I], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +# works also with i +test(2234.3, DT[c(1,3,5), min(.SD), by=.I], data.table(I=c(1L, 3L, 5L), V1=c(1L, 3L, 1L))) +test(2234.4, DT[c(4, NA), min(.SD), by=.I], data.table(I=c(4L, NA), V1=c(2L, NA))) +# other writing styles of by=.I +test(2234.5, DT[, min(.SD), by=.(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.6, DT[, min(.SD), by=list(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.7, DT[, min(.SD), by=c(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, 1L))) +test(2234.8, DT[, min(.SD), by=.I%%2L], error="by.*contains .I.*supported") # would be nice to support in future; i.e. by odd/even rows, and by=(.I+1L)%/%2L for pairs of rows; i.e. any expression of .I +test(2234.9, DT[, min(.SD), by=somefun(.I)], error="by.*contains .I.*supported") + diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 1f4e1615c0..c96cbef5c4 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -10,7 +10,7 @@ \alias{.NGRP} \title{ Special symbols } \description{ - \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. + \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. } \details{ @@ -22,13 +22,13 @@ \item{\code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}).} \item{\code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable.} \item{\code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. 
It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}.} - \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}.} + \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. If used in \code{by} it corresponds to applying a function rowwise. } \item{\code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc.} \item{\code{.NGRP} is an integer, length 1, containing the number of groups. } } \code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. - + Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. } \seealso{ @@ -58,5 +58,9 @@ X[, DT[.BY, y, on="x"], by=x] # join within each group # .N can be different in i and j DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2}, {cat(sprintf('in j, .N is \%d\n', .N)); mean(a)}] + +# .I can be different in j and by, enabling rowwise operations in by +DT[, .(.I, min(.SD[,-1]))] +DT[, .(min(.SD[,-1])), by=.I] } \keyword{ data } From e26fb859f7ebf4f5210ef36a24b92cc389f7269e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 15 Dec 2021 17:40:36 -0700 Subject: [PATCH 460/588] .dev-only: revdep tweak to the special case cplexAPI which is no longer on CRAN --- .dev/revdep.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 6f4b1f11e2..3cf5b82498 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -96,9 +96,10 @@ update.packages(ask=FALSE, checkBuilt=TRUE) avail = available.packages() # includes CRAN and Bioc, from getOption("repos") set above -avail = avail[-match("cplexAPI",rownames(avail)),] +avail = avail[!rownames(avail) %in% "cplexAPI", ] # cplexAPI is suggested by revdeps ivmte and prioritizr. I haven't succeeded to install IBM ILOG CPLEX which requires a license, # so consider cplexAPI not available when resolving missing suggests at the end of status(). 
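To illustrate the `by=.I` feature from PATCH 459 above with a concrete call (a minimal sketch, not itself part of any patch; the table mirrors the new tests 2234.* above):

```R
library(data.table)
DT = data.table(V1 = 1:5, V2 = 3:7, V3 = 5:1)
DT[, min(.SD), by = .I]   # row-wise minimum; .I shows up as a column named I
# same result as by = 1:nrow(DT), except the grouping column is named I rather than nrow
```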
+# Update: cplexAPI was removed from CRAN on 5 Nov 2021 so this is now redundant, but leave it in place for future use deps = tools::package_dependencies("data.table", db = available.packages(repos=getOption("repos")["CRAN"]), # just CRAN revdeps though (not Bioc) from October 2020 From 473d8b8f363aee58b1c6468f1c1e1a02e363b412 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 15 Dec 2021 23:54:33 -0700 Subject: [PATCH 461/588] .dev-only: revdep tweak for LOMAR which fails to install due to its dependency TDA having install errors on CRAN --- .dev/revdep.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index 3cf5b82498..10af35b553 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -279,7 +279,7 @@ cran = function() # reports CRAN status of the .cran.fail packages .(ERROR=sum(Status=="ERROR", na.rm=TRUE), WARN =sum(Status=="WARN", na.rm=TRUE), cran =paste(unique(Version),collapse=";"), - local=as.character(packageVersion(.BY[[1]]))), + local=as.character(tryCatch(packageVersion(.BY[[1]]), error=function(e)"error"))), keyby=Package] ans[local==cran, c("cran","local"):=""] ans[, "right_click_in_bash":=paste0("https://cran.r-project.org/web/checks/check_results_",Package,".html")] From 71c7e6d45d8bb4662b11849c7511a4a83f5dd6bf Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 20 Dec 2021 17:43:02 -0700 Subject: [PATCH 462/588] GLCI-only: remove knitr/rmarkdown from suggests for dev-win #5294 --- .ci/ci.R | 11 +++++++++-- .gitlab-ci.yml | 8 +++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.ci/ci.R b/.ci/ci.R index 70e5fa27a2..a165de8189 100644 --- a/.ci/ci.R +++ b/.ci/ci.R @@ -47,7 +47,8 @@ function (repos, type = getOption("pkgType"), ver) dcf.dependencies <- function(file = "DESCRIPTION", which = NA, - except.priority = "base") { + except.priority = "base", + exclude = NULL) { if (!is.character(file) || !length(file) || !all(file.exists(file))) stop("file argument must be character of filepath(s) to existing DESCRIPTION file(s)") if (!is.character(except.priority)) @@ -79,7 +80,13 @@ function(file = "DESCRIPTION", } x <- unlist(lapply(x, local.extract_dependency_package_names)) except <- if (length(except.priority)) c("R", unlist(tools:::.get_standard_package_names()[except.priority], use.names = FALSE)) - setdiff(x, except) + x = setdiff(x, except) + if (length(exclude)) { # to exclude knitr/rmarkdown, 5294 + if (!is.character(exclude) || anyDuplicated(exclude)) + stop("exclude may be NULL or a character vector containing no duplicates") + x = setdiff(x, exclude) + } + x } ## returns additional repositories for dependency packages based on its DESCRIPTION file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 300adb2734..ade90bc6ca 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -74,8 +74,6 @@ build: ## build data.table sources as tar.gz archive .test-install-deps: &install-deps - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' -.test-install-deps-win: &install-deps-win - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" .test-cp-src: &cp-src - cp $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) . 
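For reference, the `exclude=` argument just added to `dcf.dependencies()` in `.ci/ci.R` is consumed by the Windows jobs edited below; a sketch of the intended call, not itself part of any patch:

```R
source(".ci/ci.R")
deps = dcf.dependencies("DESCRIPTION", which = "most", exclude = c("knitr", "rmarkdown"))
stopifnot(!any(c("knitr", "rmarkdown") %in% deps))  # excluded names are removed via setdiff()
install.packages(deps, quiet = TRUE)
```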
@@ -249,7 +247,7 @@ test-rel-win: ## R-release on Windows, test and build binaries - *install-r-rel-win - *install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - *install-deps-win + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" - *cp-src-win - rm.exe -r bus script: @@ -268,7 +266,7 @@ test-dev-win: ## R-devel on Windows - *install-r-devel-win - *install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - *install-deps-win + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c("knitr","rmarkdown")), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus script: @@ -287,7 +285,7 @@ test-old-win: ## R-oldrel on Windows - *install-r-oldrel-win - *install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - *install-deps-win + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c("knitr","rmarkdown")), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus script: From 46ccfdf74af5fd56dc8ab99c761b36c147f6d979 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 20 Dec 2021 18:19:24 -0700 Subject: [PATCH 463/588] GLCI-only: remove knitr/rmarkdown from suggests for dev-win (fix 1) #5294 --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ade90bc6ca..16b75e1a1a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -266,7 +266,7 @@ test-dev-win: ## R-devel on Windows - *install-r-devel-win - *install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c("knitr","rmarkdown")), quiet=TRUE)" ## exclude= for #5294 + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus script: @@ -285,7 +285,7 @@ test-old-win: ## R-oldrel on Windows - *install-r-oldrel-win - *install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c("knitr","rmarkdown")), quiet=TRUE)" ## exclude= for #5294 + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus script: From eed712ef45fd9198de6aa1ac1b672a7347253d18 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 20 Dec 2021 19:49:56 -0700 Subject: [PATCH 464/588] GLCI-only: rtools42 for dev-win #5294 --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 16b75e1a1a..1cb76bedc1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -258,14 +258,14 @@ test-rel-win: ## R-release on Windows, test and build binaries - *rm-src-win - *mv-bin-win -test-dev-win: ## R-devel on Windows +test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related to UCRT and Rtools42 <<: *test-win variables: R_VERSION: "$R_DEVEL_VERSION" before_script: - *install-r-devel-win - - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" + - curl.exe -s -o ../rtools.exe https://www.r-project.org/nosvn/winutf8/ucrt3/rtools42-4911-4926.exe; 
Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus From c2e60fef4db122749718a7e57a6dfaf5839ed3e3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 15 Mar 2022 05:47:35 -0600 Subject: [PATCH 465/588] gforce := follow up (#5348) --- NEWS.md | 2 +- R/data.table.R | 7 ++-- inst/tests/test2233-43.Rdata | Bin 0 -> 351 bytes inst/tests/tests.Rraw | 63 +++++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 inst/tests/test2233-43.Rdata diff --git a/NEWS.md b/NEWS.md index aad41a0ccb..aa3341dc53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -274,7 +274,7 @@ 37. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. -38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. +38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. Thanks to @clerousset, @dcaseykc, @OfekShilon, and @SeanShao98 for testing dev and filing detailed bug reports which were fixed before release and their tests added to the test suite. 39. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. 
diff --git a/R/data.table.R b/R/data.table.R index e671a208df..68205350bb 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1908,9 +1908,10 @@ replace_dot_alias = function(e) { vlen = length(ans[[1L]]) # replicate vals if GForce returns 1 value per group jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep, times=len__) else tail(ans, -length(g)) # see comment in #4245 for why rep instead of rep.int - jrows = if (!is.null(irows) && length(irows)!=length(o__)) irows else { if (length(o__)==0L) NULL else o__} - # unwrap single column jvals for assign - if (length(jvals)==1L) jvals = jvals[[1L]] + jrows = vecseq(f__,len__,NULL) + if (length(o__)) jrows = o__[jrows] + if (length(irows)) jrows = irows[jrows] + if (length(jvals)==1L) jvals = jvals[[1L]] # unwrap single column jvals for assign .Call(Cassign, x, jrows, lhs, newnames, jvals) } if (any(names_x[cols] %chin% key(x))) diff --git a/inst/tests/test2233-43.Rdata b/inst/tests/test2233-43.Rdata new file mode 100644 index 0000000000000000000000000000000000000000..6f8456cc0a37bd5bbc2230ee2df25beaf75b24f9 GIT binary patch literal 351 zcmV-l0igaLiwFP!000001I1I#PJ=)YUUq2(Qk6zydg!6P07-cP`{NTB54|<3Kte(> z0jyUaSzkb$g;}~RRC`jAZ|9qz{dNYHhx?81Y#c%eC8og$Wk5{|i$P3cgJnF;VZ8z> z+;hNXl@?oSF!UKh#tj8bsdX!jTHaVmvhI){Qp--w@`O}ZaHWhi01VMm9QO=hp>JVM zjSmqQLgVg(RL@^qdHw>Oh1G9Q)oXNgdqkHU-l6=&5=IXwXgSyj5z*SRvzIqdwkBq#@353*`KkB+obX)sPvt=Y xFSu9mrzC7eq6`!@fw32-MVO^L@$xW1, by=id2]$V1), FALSE) + test(testnum+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) + testnum = 2233.40 +} +# test from #5337 +n=4; k=2 +mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) +ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] +options(datatable.optimize=0) +test(2233.41, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce FALSE") +options(datatable.optimize=Inf) +test(2233.42, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce.*gsum") +# test from #5345 +set.seed(1) +DT = data.table( + t = sample(c(1:3), size=15, replace=TRUE), + id = sample(LETTERS[1:3], size=15, replace=TRUE), + v1 = sample(1:10, size=15, replace=TRUE), + v2 = 1 +) +load(testDir("test2233-43.Rdata")) # ans +setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk +old = options(datatable.verbose=TRUE) +testnum = 2233.43 +for (opt in c(0,Inf)) { + options(datatable.optimize=opt) + out = if (opt) "GForce.*gsum" else "GForce FALSE" + test(testnum, + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output=out) + testnum = 2233.44 +} +options(old) + # support by=.I; #1732 DT = data.table(V1=1:5, V2=3:7, V3=5:1) test(2234.1, DT[, min(.SD), by=.I], setnames(DT[, min(.SD), by=1:nrow(DT)], "nrow", "I")) From 1fb423b6bf6cd17b69bf189008359f6acffa26e3 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 15 Mar 2022 14:00:29 +0100 Subject: [PATCH 466/588] only assign copied value in := if not null (#5289) --- NEWS.md | 2 ++ R/data.table.R | 2 +- inst/tests/tests.Rraw | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index aa3341dc53..a3f2fe70cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -544,6 +544,8 @@ 51. `merge.data.table()` silently ignored the `incomparables` argument, [#2587](https://github.com/Rdatatable/data.table/issues/2587). 
It is now implemented and any other ignored arguments (e.g. misspellings) are now warned about. Thanks to @GBsuperman for the report and @ben-schwen for the fix. +52. `DT[, c('z','x') := {x=NULL; list(2,NULL)}]` now removes column `x` as expected rather than incorrectly assigning `2` to `x` as well as `z`, [#5284](https://github.com/Rdatatable/data.table/issues/5284). The `x=NULL` is superfluous while the `list(2,NULL)` is the final value of `{}` whose items correspond to `c('z','x')`. Thanks @eutwt for the report, and @ben-schwen for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/data.table.R b/R/data.table.R index 68205350bb..e273e925e1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1377,7 +1377,7 @@ replace_dot_alias = function(e) { } else if (address(jval) == address(SDenv$.SD)) { jval = copy(jval) } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { - for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) + for (jidx in jcpy) { if(!is.null(jval[[jidx]])) jval[[jidx]] = copy(jval[[jidx]]) } } else if (jsub %iscall% 'get') { jval = copy(jval) # fix for #1212 } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 84f221f7dd..7ac83b66b5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18765,3 +18765,7 @@ test(2234.7, DT[, min(.SD), by=c(.I)], data.table(I=1L:5L, V1=c(1L, 2L, 3L, 2L, test(2234.8, DT[, min(.SD), by=.I%%2L], error="by.*contains .I.*supported") # would be nice to support in future; i.e. by odd/even rows, and by=(.I+1L)%/%2L for pairs of rows; i.e. 
any expression of .I test(2234.9, DT[, min(.SD), by=somefun(.I)], error="by.*contains .I.*supported") +# copying values of j could lead to recycling if j is a list containing NULL #5284 +DT = data.table(x = 1) +test(2235.1, copy(DT)[, c("z", "x") := {x = NULL; list(2, NULL)}], data.table(z = 2)) +test(2235.2, copy(DT)[, c("z", "x") := {list(2, NULL)}], data.table(z = 2)) From 0ed63cd0e165fd680c2a3f7c2ef0d602bb07d233 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 15 Mar 2022 14:55:51 +0100 Subject: [PATCH 467/588] add yearquarter, move date calculation from POSIXlt to C (#5300) --- NAMESPACE | 2 +- NEWS.md | 2 + R/IDateTime.R | 20 ++++-- inst/tests/tests.Rraw | 46 ++++++++++--- man/IDateTime.Rd | 23 +++++-- src/idatetime.c | 154 ++++++++++++++++++++++++++++++++++++++++++ src/init.c | 2 + 7 files changed, 225 insertions(+), 24 deletions(-) create mode 100644 src/idatetime.c diff --git a/NAMESPACE b/NAMESPACE index 207d50593a..ad306b4ce8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -148,7 +148,7 @@ if (getRversion() >= "3.6.0") { # IDateTime support: export(as.IDate,as.ITime,IDateTime) -export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year) +export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year,yearmon,yearqtr) S3method("[", ITime) S3method("+", IDate) diff --git a/NEWS.md b/NEWS.md index a3f2fe70cd..bf83288464 100644 --- a/NEWS.md +++ b/NEWS.md @@ -292,6 +292,8 @@ # 2: 2 10 ``` +40. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
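[editorial aside] A minimal R sketch of the new date-part extractors introduced by this patch, assuming a build containing the changes below is installed; all function names come from the patch itself, and the expected values follow from the `year + (month-1)/12` and `year + (quarter-1)/4` definitions documented in man/IDateTime.Rd further down:

```r
library(data.table)
x = as.IDate("2016-08-03")
month(x)    # 8L  -- now computed in C from the integer date, without POSIXlt
quarter(x)  # 3L
yearmon(x)  # 2016 + 7/12 (~2016.583), i.e. year + (month-1)/12
yearqtr(x)  # 2016.5,                  i.e. year + (quarter-1)/4
```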
diff --git a/R/IDateTime.R b/R/IDateTime.R index 33d04b87c4..4e6adf55e3 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -338,10 +338,10 @@ hour = function(x) { if (inherits(x, 'ITime')) return(as.integer(x) %/% 3600L %% 24L) as.POSIXlt(x)$hour } -yday = function(x) as.POSIXlt(x)$yday + 1L -wday = function(x) (unclass(as.IDate(x)) + 4L) %% 7L + 1L -mday = function(x) as.POSIXlt(x)$mday -week = function(x) yday(x) %/% 7L + 1L +yday = function(x) convertDate(as.IDate(x), "yday") +wday = function(x) convertDate(as.IDate(x), "wday") +mday = function(x) convertDate(as.IDate(x), "mday") +week = function(x) convertDate(as.IDate(x), "week") isoweek = function(x) { # ISO 8601-conformant week, as described at # https://en.wikipedia.org/wiki/ISO_week_date @@ -356,7 +356,13 @@ isoweek = function(x) { 1L + (nearest_thurs - year_start) %/% 7L } -month = function(x) as.POSIXlt(x)$mon + 1L -quarter = function(x) as.POSIXlt(x)$mon %/% 3L + 1L -year = function(x) as.POSIXlt(x)$year + 1900L +month = function(x) convertDate(as.IDate(x), "month") +quarter = function(x) convertDate(as.IDate(x), "quarter") +year = function(x) convertDate(as.IDate(x), "year") +yearmon = function(x) convertDate(as.IDate(x), "yearmon") +yearqtr = function(x) convertDate(as.IDate(x), "yearqtr") +convertDate = function(x, type) { + type = match.arg(type, c("yday", "wday", "mday", "week", "month", "quarter", "year", "yearmon", "yearqtr")) + .Call(CconvertDate, x, type) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7ac83b66b5..a1b751859c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -86,6 +86,19 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { last = data.table::last # xts first = data.table::first # xts, S4Vectors copy = data.table::copy # bit64 v4; bit64 offered to rename though so this is just in case bit64 unoffers + second = data.table::second # lubridate #1135 + minute = data.table::minute # lubridate + hour = data.table::hour # lubridate + yday = data.table::yday # lubridate + wday = data.table::wday # lubridate + mday = data.table::mday # lubridate + week = data.table::week # lubridate + isoweek = data.table::isoweek # lubridate + month = data.table::month # lubridate + quarter = data.table::quarter # lubridate + year = data.table::year # lubridate + yearmon = data.table::yearmon # zoo + yearqtr = data.table::yearqtr # zoo } # Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN @@ -10449,15 +10462,17 @@ test(1692, capture.output(as.data.table(structure(57600L, class = "ITime"))), # testing all time part extraction routines (subsumes #874) t <- "2016-08-03 01:02:03.45" -test(1693.1, second(t), 3L) -test(1693.2, minute(t), 2L) -test(1693.3, hour(t), 1L) -test(1693.4, yday(t), 216L) -test(1693.5, wday(t), 4L) -test(1693.6, week(t), 31L) -test(1693.7, month(t), 8L) -test(1693.8, quarter(t), 3L) -test(1693.9, year(t), 2016L) +test(1693.01, second(t), 3L) +test(1693.02, minute(t), 2L) +test(1693.03, hour(t), 1L) +test(1693.04, yday(t), 216L) +test(1693.05, wday(t), 4L) +test(1693.06, week(t), 31L) +test(1693.07, month(t), 8L) +test(1693.08, quarter(t), 3L) +test(1693.09, year(t), 2016L) +test(1693.10, yearmon(t), 2016+7/12) +test(1693.11, yearqtr(t), 2016.5) # fix for #1740 - sub-assigning NAs for factors dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9) @@ -18769,3 +18784,16 @@ test(2234.9, DT[, min(.SD), by=somefun(.I)], error="by.*contains .I.*supported") DT = data.table(x = 1) test(2235.1, copy(DT)[, c("z", "x") := 
{x = NULL; list(2, NULL)}], data.table(z = 2)) test(2235.2, copy(DT)[, c("z", "x") := {list(2, NULL)}], data.table(z = 2)) + +# move IDate from POSIXlt to C, add yearquarter; #649 +x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01") +test(2236.1, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L)) +test(2236.2, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L)) +test(2236.3, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L)) +test(2236.4, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L)) +test(2236.5, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L)) +test(2236.6, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L)) +test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L)) +test(2236.8, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12)) +test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100)) + diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index 876b28b161..6854f59ae9 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -41,6 +41,8 @@ \alias{month} \alias{quarter} \alias{year} +\alias{yearmon} +\alias{yearqtr} \alias{IDate-class} \alias{ITime-class} @@ -93,6 +95,8 @@ isoweek(x) month(x) quarter(x) year(x) +yearmon(x) +yearqtr(x) } @@ -164,11 +168,11 @@ functions \code{weekdays}, \code{months}, and \code{quarters} can also be used, but these return character values, so they must be converted to factors for use with data.table. \code{isoweek} is ISO 8601-consistent. -The \code{round} method for IDate's is useful for grouping and plotting. +The \code{round} method for IDate's is useful for grouping and plotting. It can round to weeks, months, quarters, and years. Similarly, the \code{round} and \code{trunc} methods for ITime's are useful for grouping and plotting. -They can round or truncate to hours and minutes. -Note for ITime's with 30 seconds, rounding is inconsistent due to rounding off a 5. +They can round or truncate to hours and minutes. +Note for ITime's with 30 seconds, rounding is inconsistent due to rounding off a 5. See 'Details' in \code{\link{round}} for more information. } @@ -188,9 +192,14 @@ See 'Details' in \code{\link{round}} for more information. and \code{year} return integer values for second, minute, hour, day of year, day of week, day of month, week, month, quarter, and year, respectively. - - These values are all taken directly from the \code{POSIXlt} representation - of \code{x}, with the notable difference that while \code{yday}, \code{wday}, + \code{yearmon} and \code{yearqtr} return double values representing + respectively `year + (month-1) / 12` and `year + (quarter-1) / 4`. + + \code{second}, \code{minute}, \code{hour} are taken directly from + the \code{POSIXlt} representation. + All other values are computed from the underlying integer representation + and comparable with the values of their \code{POSIXlt} representation + of \code{x}, with the notable difference that while \code{yday}, \code{wday}, and \code{mon} are all 0-based, here they are 1-based. 
} @@ -253,7 +262,7 @@ round(seqdates, "months") (seqtimes <- seq(as.ITime("07:00"), as.ITime("08:00"), by = 20)) round(seqtimes, "hours") trunc(seqtimes, "hours") - + } \keyword{utilities} diff --git a/src/idatetime.c b/src/idatetime.c new file mode 100644 index 0000000000..c70df3b053 --- /dev/null +++ b/src/idatetime.c @@ -0,0 +1,154 @@ +#include "data.table.h" + +#define YEARS400 146097 +#define YEARS100 36524 +#define YEARS4 1461 +#define YEARS1 365 + +typedef enum { YDAY, WDAY, MDAY, WEEK, MONTH, QUARTER, YEAR, YEARMON, YEARQTR} datetype; + +static inline bool isLeapYear(int year) { + return (year % 100 != 0 || year % 400 == 0) && year % 4 == 0; +} + +void convertSingleDate(int x, datetype type, void *out) +{ + static const char months[] = {31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 31, 29}; + static const int quarter[] = {31, 91, 92, 92, 60}; + + if (type == WDAY) { + int wday = (x + 4) % 7; + if (wday < 0) wday += 7; + *(int *)out = ++wday; + return; + } + + int days = x - 11017; + + int years400 = days / YEARS400; + days %= YEARS400; + if (days < 0) { + days += YEARS400; + years400--; + } + + int years100 = days / YEARS100; + days %= YEARS100; + + int years4 = days / YEARS4; + days %= YEARS4; + + int years1 = days / YEARS1; + days %= YEARS1; + + int year = 2000 + years1 + 4*years4 + 100*years100 + 400*years400; + if (days > 305) + ++year; + + if (type == YEAR) { + *(int *)out = year; + return; + } + + int leap = !years1 && (years4 || !years100); + + if (type == YDAY || type == WEEK) { + int yday = days + 31 + 28 + leap; + if (yday >= YEARS1 + leap) + yday -= YEARS1 + leap; + *(int *)out = ++yday; + if (type == WEEK) + *(int *)out = (*(int *)out / 7) + 1; + return; + } + + if (type == MONTH || type == YEARMON) { + int i; + if (days==0 && !leap && isLeapYear(year)) { + i = 1; + } else { + i = 2; + while (months[i-2] <= days) { + days -= months[i-2]; + i++; + } + } + if (i >= 12) + i -= 12; + + if (type == MONTH) { + *(int *)out = i + 1; + } else { + *(double *)out = year + i / 12.0; + } + return; + } + + if (type == MDAY) { + if (days==0 && !leap && isLeapYear(year)) { + *(int *)out = 29; + return; + } + int i = 0; + while (months[i] <= days) { + days -= months[i]; + i++; + } + *(int *)out = ++days; + return; + } + + if (type == QUARTER || type == YEARQTR) { + int i = 0; + while (quarter[i] <= days) { + days -= quarter[i]; + i++; + } + if (i >= 4) + i -= 4; + if (type == QUARTER) { + *(int *)out = i + 1; + } else { + *(double *)out = year + (i / 4.0); + } + return; + } +} + +SEXP convertDate(SEXP x, SEXP type) +{ + if (!isInteger(x)) error(_("x must be an integer vector")); + const int *ix = INTEGER(x); + const int n = length(x); + if (!isString(type) || length(type) != 1) + error(_("Internal error: invalid type for convertDate(), should have been caught before. 
please report to data.table issue tracker")); // # nocov + datetype ctype; + bool ansint = true; + if (!strcmp(CHAR(STRING_ELT(type, 0)), "yday")) ctype = YDAY; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "wday")) ctype = WDAY; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "mday")) ctype = MDAY; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "week")) ctype = WEEK; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "month")) ctype = MONTH; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "quarter")) ctype = QUARTER; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "year")) ctype = YEAR; + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "yearmon")) { ctype = YEARMON; ansint = false; } + else if (!strcmp(CHAR(STRING_ELT(type, 0)), "yearqtr")) { ctype = YEARQTR; ansint = false; } + else error(_("Internal error: invalid type for convertDate, should have been caught before. please report to data.table issue tracker")); // # nocov + + SEXP ans; + if (ansint) { + ans = PROTECT(allocVector(INTSXP, n)); + int *ansp = INTEGER(ans); + for (int i=0; i < n; ++i) { + convertSingleDate(ix[i], ctype, &ansp[i]); + } + } else { + ans = PROTECT(allocVector(REALSXP, n)); + double *ansp = REAL(ans); + for (int i=0; i < n; ++i) { + convertSingleDate(ix[i], ctype, &ansp[i]); + } + } + UNPROTECT(1); + return ans; +} diff --git a/src/init.c b/src/init.c index 814ada375d..fd43b956e5 100644 --- a/src/init.c +++ b/src/init.c @@ -130,6 +130,7 @@ SEXP allNAR(); SEXP test_dt_win_snprintf(); SEXP dt_zlib_version(); SEXP startsWithAny(); +SEXP convertDate(); // .Externals SEXP fastmean(); @@ -228,6 +229,7 @@ R_CallMethodDef callMethods[] = { {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, +{"CconvertDate", (DL_FUNC)&convertDate, -1}, {NULL, NULL, 0} }; From 2f675318268ab483da16e4ad49e5d66c9ef7d3b4 Mon Sep 17 00:00:00 2001 From: Dereck de Mezquita <44912288+dereckdemezquita@users.noreply.github.com> Date: Tue, 15 Mar 2022 20:50:43 -0500 Subject: [PATCH 468/588] as.data.frame but keep row names (#5320) --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/data.table.R | 10 ++-------- inst/tests/tests.Rraw | 7 +++++++ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 88dfd46140..924bdeb2dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -70,7 +70,8 @@ Authors@R: c( person("Kyle","Haynes", role="ctb"), person("Boniface Christian","Kamgang", role="ctb"), person("Olivier","Delmarcell", role="ctb"), - person("Josh","O'Brien", role="ctb")) + person("Josh","O'Brien", role="ctb"), + person("Dereck","de Mezquita", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NEWS.md b/NEWS.md index bf83288464..19fc575e0d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -548,6 +548,8 @@ 52. `DT[, c('z','x') := {x=NULL; list(2,NULL)}]` now removes column `x` as expected rather than incorrectly assigning `2` to `x` as well as `z`, [#5284](https://github.com/Rdatatable/data.table/issues/5284). The `x=NULL` is superfluous while the `list(2,NULL)` is the final value of `{}` whose items correspond to `c('z','x')`. Thanks @eutwt for the report, and @ben-schwen for the fix. +53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). 
Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/data.table.R b/R/data.table.R index e273e925e1..473cf6e766 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2208,15 +2208,9 @@ tail.data.table = function(x, n=6L, ...) { set(x,j=name,value=value) # important i is missing here } -as.data.frame.data.table = function(x, ...) +as.data.frame.data.table = function(x, row.names = NULL, ...) { - ans = copy(x) - setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names - setattr(ans,"class","data.frame") - setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted - setattr(ans,"index",NULL) #4889 #5042 - setattr(ans,".internal.selfref",NULL) - # leave tl intact, no harm, + ans = setDF(copy(x), rownames = row.names) # issue #5319 ans } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a1b751859c..f453b96208 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18797,3 +18797,10 @@ test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, test(2236.8, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12)) test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100)) +# as.data.table() no longer ignores row.names=, #5319 +dt = data.table(a=1:2, b=3:4) +df = structure(list(a=1:2, b=3:4), row.names=c("x", "y"), class="data.frame") +test(2237.1, as.data.frame(dt, row.names=c("x", "y")), df) +df = data.frame(a=1:2, b=3:4) +test(2237.2, as.data.frame(dt, row.names=NULL), df) + From e9a323de01a17af70d5316016606fa8d35b25023 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 16 Mar 2022 10:00:23 -0600 Subject: [PATCH 469/588] GLCI-only: rtools42-5038-5046.exe and R-4.1.3.exe version number update; #5198 #5294 --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1cb76bedc1..cd9e15c7f4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,7 +94,7 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe 
https://cloud.r-project.org/bin/windows/base/R-4.1.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.1.3-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait @@ -264,7 +264,7 @@ test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related t R_VERSION: "$R_DEVEL_VERSION" before_script: - *install-r-devel-win - - curl.exe -s -o ../rtools.exe https://www.r-project.org/nosvn/winutf8/ucrt3/rtools42-4911-4926.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - curl.exe -s -o ../rtools.exe https://www.r-project.org/nosvn/winutf8/ucrt3/rtools42-5038-5046.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win From 8edbd67260038d5ab4b87870c1cb7e37c1b437ee Mon Sep 17 00:00:00 2001 From: Florian Kohrt Date: Tue, 19 Jul 2022 04:11:48 +0200 Subject: [PATCH 470/588] Fix which vignette is referred to (#5420) `:=` is introduced in the vignette "Reference semantics", not in the introductory vignette to the package --- vignettes/datatable-keys-fast-subset.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index 917a904136..465052d941 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -138,7 +138,7 @@ head(flights) * Alternatively you can pass a character vector of column names to the function `setkeyv()`. This is particularly useful while designing functions to pass columns to set key on as function arguments. -* Note that we did not have to assign the result back to a variable. This is because like the `:=` function we saw in the *"Introduction to data.table"* vignette, `setkey()` and `setkeyv()` modify the input *data.table* *by reference*. They return the result invisibly. +* Note that we did not have to assign the result back to a variable. This is because like the `:=` function we saw in the *"Reference semantics"* vignette, `setkey()` and `setkeyv()` modify the input *data.table* *by reference*. They return the result invisibly. * The *data.table* is now reordered (or sorted) by the column we provided - `origin`. Since we reorder by reference, we only require additional memory of one column of length equal to the number of rows in the *data.table*, and is therefore very memory efficient. 
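[editorial aside] The vignette paragraph corrected above describes `setkey()`/`setkeyv()` modifying a data.table by reference and returning it invisibly; a minimal sketch of that behaviour (the toy `DT` here is illustrative, not taken from the vignette's flights data):

```r
library(data.table)
DT = data.table(origin = c("JFK", "LGA", "JFK"), dep_delay = c(2L, -1L, 10L))
setkey(DT, origin)     # no assignment back: DT is reordered in place, result returned invisibly
setkeyv(DT, "origin")  # equivalent form taking a character vector of column names
key(DT)                # "origin"
```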
From 9689f9e8e060edc558c17e587fc9192948860f15 Mon Sep 17 00:00:00 2001 From: Michael Czekanski <38838492+mczek@users.noreply.github.com> Date: Tue, 19 Jul 2022 01:51:29 -0400 Subject: [PATCH 471/588] Implementation of %notin% (#4931) --- .gitlab-ci.yml | 12 ++++++------ DESCRIPTION | 3 ++- NAMESPACE | 2 +- NEWS.md | 2 ++ R/notin.R | 7 +++++++ inst/tests/tests.Rraw | 10 ++++++++++ man/notin.Rd | 33 +++++++++++++++++++++++++++++++++ src/data.table.h | 3 +++ src/init.c | 2 ++ src/negate.c | 22 ++++++++++++++++++++++ 10 files changed, 88 insertions(+), 8 deletions(-) create mode 100644 R/notin.R create mode 100644 man/notin.Rd create mode 100644 src/negate.c diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cd9e15c7f4..759b51b23b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,15 +94,15 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.1.3-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.2.1-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win - - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.5/R-4.0.5-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5253-5107-signed.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait .test-template: &test stage: test @@ -246,7 +246,7 @@ test-rel-win: ## R-release on Windows, test and build binaries before_script: - *install-r-rel-win - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" - *cp-src-win - rm.exe -r bus @@ -264,7 +264,7 @@ test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related t R_VERSION: "$R_DEVEL_VERSION" before_script: - *install-r-devel-win - - curl.exe -s -o ../rtools.exe https://www.r-project.org/nosvn/winutf8/ucrt3/rtools42-5038-5046.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - 
*install-rtools-win - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win @@ -284,7 +284,7 @@ test-old-win: ## R-oldrel on Windows before_script: - *install-r-oldrel-win - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus diff --git a/DESCRIPTION b/DESCRIPTION index 924bdeb2dc..586ef0f308 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -71,7 +71,8 @@ Authors@R: c( person("Boniface Christian","Kamgang", role="ctb"), person("Olivier","Delmarcell", role="ctb"), person("Josh","O'Brien", role="ctb"), - person("Dereck","de Mezquita", role="ctb")) + person("Dereck","de Mezquita", role="ctb"), + person("Michael","Czekanski", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown diff --git a/NAMESPACE b/NAMESPACE index ad306b4ce8..44676f9f5b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,7 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) -export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%") +export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%") export(timetaken) export(truelength, setalloccol, alloc.col, ":=", let) export(setattr, setnames, setcolorder, set, setDT, setDF) diff --git a/NEWS.md b/NEWS.md index 19fc575e0d..039eee5ce4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -294,6 +294,8 @@ 40. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. +41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
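[editorial aside] A minimal sketch of the `%notin%` operator described in NEWS item 41 above; it is equivalent to `!(x %in% table)` but computes the result directly instead of allocating an intermediate negated vector (names below come from the patch):

```r
library(data.table)
c(1L, 5L, 11L) %notin% 1:10   # FALSE FALSE TRUE
"a" %notin% c("b", "c")       # TRUE; when both sides are character the C routine Cnotchin is used
NA %notin% c(1:5, NA)         # FALSE, consistent with match()'s NA handling
```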
diff --git a/R/notin.R b/R/notin.R new file mode 100644 index 0000000000..ba5cef5025 --- /dev/null +++ b/R/notin.R @@ -0,0 +1,7 @@ +"%notin%" = function(x, table) { + if (is.character(x) && is.character(table)) { + .Call(Cnotchin, x, table) + } else { + match(x, table, nomatch = 0L) == 0L + } +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f453b96208..338620ba0f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18804,3 +18804,13 @@ test(2237.1, as.data.frame(dt, row.names=c("x", "y")), df) df = data.frame(a=1:2, b=3:4) test(2237.2, as.data.frame(dt, row.names=NULL), df) +# Test new feature %notin%, #4152 +test(2238.1, 11 %notin% 1:10, TRUE) +test(2238.2, "a" %notin% c(), TRUE) +test(2238.3, "a" %notin% c("a", "b", "c"), FALSE) +test(2238.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) +test(2238.5, "a" %notin% character(), TRUE) +test(2238.6, "a" %notin% integer(), TRUE) +test(2238.7, "a" %notin% NULL, TRUE) +test(2238.8, NA %notin% 1:5, TRUE) +test(2238.9, NA %notin% c(1:5, NA), FALSE) diff --git a/man/notin.Rd b/man/notin.Rd new file mode 100644 index 0000000000..d84bb2024d --- /dev/null +++ b/man/notin.Rd @@ -0,0 +1,33 @@ +\name{notin} +\alias{\%notin\%} + +\title{ +Convenience operator for checking if an example is not in a set of elements +} + +\description{ +Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[=base]{in}}. +} + +\usage{ +x \%notin\% table +} + +\arguments{ + \item{x}{ Vector or \code{NULL}: the values to be matched. } + \item{table}{ Vector or \code{NULL}: the values to be matched against. } +} + + +\value{ + Logical vector, \code{TRUE} for each element of \code{x} \emph{absent} from \code{table}, and \code{FALSE} for each element of \code{x} \emph{present} in \code{table}. 
+} + +\seealso{ \code{\link[base]{match}}, \code{\link[data.table]{chmatch}} } + + +\examples{ + 11 \%notin\% 1:10 # TRUE + "a" \%notin\% c("a", "b") # FALSE +} + diff --git a/src/data.table.h b/src/data.table.h index a7f52b5e09..b966e86c08 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -259,3 +259,6 @@ int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...); // programming.c SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); + +//negate.c +SEXP notchin(SEXP x, SEXP table); diff --git a/src/init.c b/src/init.c index fd43b956e5..284c30b4fd 100644 --- a/src/init.c +++ b/src/init.c @@ -131,6 +131,7 @@ SEXP test_dt_win_snprintf(); SEXP dt_zlib_version(); SEXP startsWithAny(); SEXP convertDate(); +SEXP notchin(); // .Externals SEXP fastmean(); @@ -230,6 +231,7 @@ R_CallMethodDef callMethods[] = { {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, +{"Cnotchin", (DL_FUNC)¬chin, -1}, {NULL, NULL, 0} }; diff --git a/src/negate.c b/src/negate.c new file mode 100644 index 0000000000..4db3767ff8 --- /dev/null +++ b/src/negate.c @@ -0,0 +1,22 @@ +#include "data.table.h" + +void negateByRef(SEXP x) { + if(TYPEOF(x) != LGLSXP) { + error("not logical or integer vector"); // # nocov + } + const int n = length(x); + Rboolean *ansd = (Rboolean *)LOGICAL(x); + for(int i=0; i Date: Tue, 19 Jul 2022 18:50:14 +0200 Subject: [PATCH 472/588] change appveyor to x64 since 32bit windows support dropped in R 4.2 (#5398) --- .appveyor.yml | 9 ++------- inst/tests/tests.Rraw | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index a283cd2a34..ea5051668f 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -16,18 +16,13 @@ environment: global: CRAN: http://cloud.r-project.org WARNINGS_ARE_ERRORS: 1 - R_CHECK_ARGS: --no-manual --no-multiarch - R_ARCH: i386 + R_CHECK_ARGS: --no-manual # R_CHECK_ARGS specified in order to turn off --as-cran (on by default) as that can be slow -# multiarch is on by default which (when R_ARCH: x64) compiles and tests both 32bit and 64bit in one x64 job -# --no-multiarch so as to not run both 32bit and 64bit on every commit in PRs to save dev cycle time; GLCI after merge is full-strength -# GHA has MacOS 64bit (test-coverage) and Ubuntu 64bit, therefore picked 32bit for Windows - GCC_PATH: mingw_64 -# Default GCC_PATH appears to be gcc-4.6.3 which is now unsupported as from Rtools.exe v3.4. _R_CHECK_NO_STOP_ON_TEST_ERROR_: true # continue tests even if some script failed _R_CHECK_TESTS_NLINES_: 0 # Block truncation of any error messages in R CMD check +# R is 64-bit only on Windows from 4.2.0 (prior default was build and test both 32bit and 64bit) so we no longer use R_ARCH to pick one to reduce CI time in PRs matrix: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 338620ba0f..e05f522814 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14600,11 +14600,11 @@ oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS") Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0") -test(1997.09, setDTthreads(), old, warning="Ignoring invalid.*Please remove any.*not a digit") +test(1997.09, setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") new = getDTthreads() # old above at (1) may not have been default. new now is. 
test(1997.10, getDTthreads(), new) Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="1") -test(1997.11, setDTthreads(), new, warning="Ignoring invalid.*integer between 2 and 100") +test(1997.11, setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") test(1997.12, getDTthreads(), new) Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") test(1997.13, setDTthreads(), new) From 4b16a7ff36f223f1ab24b0cad070ef4e400ff581 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 20 Jul 2022 04:25:11 +0200 Subject: [PATCH 473/588] Attempt workaround of CRAN Windows hang on update.dev.pkg.Rd example (#5421) --- .appveyor.yml | 5 +++-- NAMESPACE | 3 +-- NEWS.md | 2 ++ R/devel.R | 4 ++-- R/onAttach.R | 2 +- README.md | 4 ++-- man/data.table.Rd | 4 ++-- man/{update.dev.pkg.Rd => update_dev_pkg.Rd} | 12 ++++++------ po/R-data.table.pot | 2 +- po/R-zh_CN.po | 4 ++-- 10 files changed, 22 insertions(+), 20 deletions(-) rename man/{update.dev.pkg.Rd => update_dev_pkg.Rd} (69%) diff --git a/.appveyor.yml b/.appveyor.yml index ea5051668f..0f9cdf9e6c 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -16,8 +16,9 @@ environment: global: CRAN: http://cloud.r-project.org WARNINGS_ARE_ERRORS: 1 - R_CHECK_ARGS: --no-manual -# R_CHECK_ARGS specified in order to turn off --as-cran (on by default) as that can be slow + R_CHECK_ARGS: --as-cran --no-manual +# --no-manual to avoid error 'pdflatex is not available' +# --as-cran no longer a lot slower (now takes under 6 mins with and without); logs show _R_CHECK_CRAN_INCOMING_=FALSE which could take 5+ mins _R_CHECK_NO_STOP_ON_TEST_ERROR_: true # continue tests even if some script failed _R_CHECK_TESTS_NLINES_: 0 diff --git a/NAMESPACE b/NAMESPACE index 44676f9f5b..c22782440a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -131,8 +131,7 @@ S3method(melt, default) export(melt.data.table, dcast.data.table) import(utils) -S3method(update, dev.pkg) -export(update.dev.pkg) +export(update_dev_pkg) S3method(tail, data.table) S3method(head, data.table) import(stats) diff --git a/NEWS.md b/NEWS.md index 039eee5ce4..4f4a2f417a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -605,6 +605,8 @@ 16. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. +17. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has also passed all tests. As such we don't expect any backwards compatibility concerns. + # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) diff --git a/R/devel.R b/R/devel.R index 9461633ec0..8bd7a1466a 100644 --- a/R/devel.R +++ b/R/devel.R @@ -17,7 +17,7 @@ dcf.repo = function(pkg, repo, field, type) { dcf[dcf[,"Package"]==pkg, field][[1L]] } -update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { +update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) 
{ # this works for any package, not just data.table pkg = object # perform package upgrade when new Revision present @@ -32,7 +32,7 @@ update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i pkg, field, contrib.url(repo, type=type)) # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) - # update.dev.pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 + # update_dev_pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 on.exit({ if (upg) { unloadNamespace(pkg) ## hopefully will release dll lock on Windows diff --git a/R/onAttach.R b/R/onAttach.R index 554d2599d6..9b71a6615c 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -27,7 +27,7 @@ if (gettext("TRANSLATION CHECK") != "TRANSLATION CHECK") packageStartupMessagef("**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) - packageStartupMessagef("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") + packageStartupMessagef("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update_dev_pkg()\n**********") if (!.Call(ChasOpenMP)) { packageStartupMessagef("**********\nThis installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", appendLF=FALSE) if (Sys.info()["sysname"] == "Darwin") diff --git a/README.md b/README.md index 5b3a7d38a1..47fcf46fc2 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,8 @@ ```r install.packages("data.table") -# latest development version: -data.table::update.dev.pkg() +# latest development version that has passed all tests: +data.table::update_dev_pkg() ``` See [the Installation wiki](https://github.com/Rdatatable/data.table/wiki/Installation) for more details. diff --git a/man/data.table.Rd b/man/data.table.Rd index 7ec8cec3a8..ecc79e2a54 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -453,8 +453,8 @@ if (interactive()) { # keep up to date with latest stable version on CRAN update.packages() - # get the latest devel version - update.dev.pkg() + # get the latest devel version that has passed all tests + update_dev_pkg() # read more at: # https://github.com/Rdatatable/data.table/wiki/Installation } diff --git a/man/update.dev.pkg.Rd b/man/update_dev_pkg.Rd similarity index 69% rename from man/update.dev.pkg.Rd rename to man/update_dev_pkg.Rd index 72b6e7b166..3db5b98316 100644 --- a/man/update.dev.pkg.Rd +++ b/man/update_dev_pkg.Rd @@ -1,11 +1,10 @@ -\name{update.dev.pkg} -\alias{update} -\alias{update.dev.pkg} +\name{update_dev_pkg} +\alias{update_dev_pkg} \title{Perform update of development version of a package} \description{ - It will download and install package from devel repository only when new commit is available there, otherwise only PACKAGES file is transferred. Defaults are set to update \code{data.table}, other packages can be used as well. 
Their repository has to include git commit information in PACKAGES file. + Downloads and installs latest development version only when a new commit is available which has also passed all tests. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. } -\usage{\method{update}{dev.pkg}(object="data.table", +\usage{update_dev_pkg(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } @@ -30,7 +29,8 @@ NULL. } \examples{ - # data.table::update.dev.pkg() +\dontshow{ # using if(FALSE) because \dontrun could still be run by --run-dontrun; #5421 } + if (FALSE) data.table::update_dev_pkg() } \seealso{ \code{\link{data.table}} diff --git a/po/R-data.table.pot b/po/R-data.table.pot index ad00f12772..2fe5f0b7cd 100644 --- a/po/R-data.table.pot +++ b/po/R-data.table.pot @@ -1567,7 +1567,7 @@ msgstr "" msgid "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********" msgstr "" -msgid "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********" +msgid "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update_dev_pkg()\n**********" msgstr "" msgid "**********" diff --git a/po/R-zh_CN.po b/po/R-zh_CN.po index 7e78584fd7..105b94145a 100644 --- a/po/R-zh_CN.po +++ b/po/R-zh_CN.po @@ -2161,11 +2161,11 @@ msgstr "" msgid "" "**********\n" "This development version of data.table was built more than 4 weeks ago. " -"Please update: data.table::update.dev.pkg()\n" +"Please update: data.table::update_dev_pkg()\n" "**********" msgstr "" "**********这个data.table的开发版本是在4个多星期之前构建的。请更新版本:data." -"table::update.dev.pkg()\n" +"table::update_dev_pkg()\n" "**********" msgid "**********" From 8473d88cf52c478955ec3368f5122c391fec2c96 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 19 Jul 2022 19:42:59 -0700 Subject: [PATCH 474/588] fix compile-time warning on windows rtools42 (#5395) --- src/fread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 04df88d9c2..b13fea706b 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1,3 +1,5 @@ +#include "fread.h" +// include fread.h should happen before include time.h to avoid compilation warning on windows about re-defining __USE_MINGW_ANSI_STDIO, PR#5395. 
#if defined(CLOCK_REALTIME) && !defined(DISABLE_CLOCK_REALTIME) #define HAS_CLOCK_REALTIME #endif @@ -24,7 +26,6 @@ #include // ceil, sqrt, isfinite #endif #include -#include "fread.h" #include "freadLookups.h" // Private globals to save passing all of them through to highly iterated field processors From fda7fd9dbcac939df33769193eebd44731ee59b9 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 19 Jul 2022 21:36:13 -0600 Subject: [PATCH 475/588] GLCI-only: bump R_*_VERSION variables --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 759b51b23b..3440ffe666 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,9 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. - R_REL_VERSION: "4.1" - R_DEVEL_VERSION: "4.2" - R_OLDREL_VERSION: "4.0" + R_REL_VERSION: "4.2" + R_DEVEL_VERSION: "4.3" + R_OLDREL_VERSION: "4.1" stages: - dependencies From c4a2085e35689a108d67dacb2f8261e4964d7e12 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 19 Jul 2022 22:56:49 -0600 Subject: [PATCH 476/588] GLCI-only: rtools40 for oldrel-win (R 4.1) --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3440ffe666..419741f6c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -283,8 +283,9 @@ test-old-win: ## R-oldrel on Windows R_VERSION: "$R_OLDREL_VERSION" before_script: - *install-r-oldrel-win - - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait + ## rtools42 doesn't support 32bit so oldrel-win (currently R 4.1) needs rtools40. Can use install-rtools-win again here when oldrel is R 4.2+ + - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus From 5eb46e817a99585c3811b4e848e9c6fa7f418564 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 5 Oct 2022 22:02:09 -0600 Subject: [PATCH 477/588] gcc-12 always-false in fread.c (#5476) --- .dev/CRAN_Release.cmd | 4 ++-- DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 9 ++++++++- src/fread.c | 37 +++++++++++++++++++------------------ src/init.c | 2 +- 6 files changed, 34 insertions(+), 26 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index b010d175f4..a3e7600f4a 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -185,7 +185,7 @@ grep -En "for\s*[(]\s*[a-zA-Z0-9_]+\s*=" src/*.c | grep -Fv "#loop_counter_not_l cd .. 
R -cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html +cc(test=TRUE, clean=TRUE, CC="gcc-12") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html saf = options()$stringsAsFactors options(stringsAsFactors=!saf) # check tests (that might be run by user) are insensitive to option, #2718 test.data.table() @@ -306,7 +306,7 @@ cd ~/build/R-devel-strict-clang make cd ~/build/R-devel-strict-gcc -# gcc-10 (in dev currently) failed to build R, so using regular gcc-9 (9.3.0 as per focal/Pop!_OS 20.04) +# gcc-10 failed to build R-devel at some point, so using regular gcc-9 (9.3.0 as per focal/Pop!_OS 20.04) ./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-9 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make diff --git a/DESCRIPTION b/DESCRIPTION index 586ef0f308..4f5c45640d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.14.3 +Version: 1.14.5 Title: Extension of `data.frame` Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), diff --git a/Makefile b/Makefile index 50a919440e..34eedef4cb 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.3.tar.gz + $(RM) data.table_1.14.5.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.3.tar.gz + $(R) CMD INSTALL data.table_1.14.5.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.3.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.5.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index 4f4a2f417a..a636b6eb45 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ **Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** -# data.table [v1.14.3](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.5](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES @@ -608,6 +608,13 @@ 17. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has also passed all tests. As such we don't expect any backwards compatibility concerns. +# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) + +## NOTES + +1. gcc 12.1 (May 2022) now detects and warns about an always-false condition in `fread` which caused a small efficiency saving never to be invoked. Thanks to CRAN for testing latest versions of compilers. 
+ + # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) ## NOTES diff --git a/src/fread.c b/src/fread.c index b13fea706b..f028154cb5 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1284,25 +1284,22 @@ int freadMain(freadMainArgs _args) { while (*nastr) { if (**nastr == '\0') { blank_is_a_NAstring = true; - // if blank is the only one, as is the default, clear NAstrings so that doesn't have to be checked - if (nastr==NAstrings && nastr+1==NULL) NAstrings=NULL; - nastr++; - continue; + } else { + const char *ch = *nastr; + size_t nchar = strlen(ch); + if (isspace(ch[0]) || isspace(ch[nchar-1])) + STOP(_("freadMain: NAstring <<%s>> has whitespace at the beginning or end"), ch); + if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || + strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || + strcmp(ch,"True")==0 || strcmp(ch,"False")==0) + STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); + if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) + STOP(_("freadMain: NAstring <<%s>> and logical01=%s, this is not permitted."), ch, args.logical01 ? "TRUE" : "FALSE"); + char *end; + errno = 0; + (void)strtod(ch, &end); // careful not to let "" get to here (see continue above) as strtod considers "" numeric + if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; } - const char *ch = *nastr; - size_t nchar = strlen(ch); - if (isspace(ch[0]) || isspace(ch[nchar-1])) - STOP(_("freadMain: NAstring <<%s>> has whitespace at the beginning or end"), ch); - if (strcmp(ch,"T")==0 || strcmp(ch,"F")==0 || - strcmp(ch,"TRUE")==0 || strcmp(ch,"FALSE")==0 || - strcmp(ch,"True")==0 || strcmp(ch,"False")==0) - STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); - if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) - STOP(_("freadMain: NAstring <<%s>> and logical01=%s, this is not permitted."), ch, args.logical01 ? "TRUE" : "FALSE"); - char *end; - errno = 0; - (void)strtod(ch, &end); // careful not to let "" get to here (see continue above) as strtod considers "" numeric - if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; nastr++; } disabled_parsers[CT_BOOL8_N] = !args.logical01; @@ -1325,6 +1322,10 @@ int freadMain(freadMainArgs _args) { DTPRINT(_(" show progress = %d\n"), args.showProgress); DTPRINT(_(" 0/1 column will be read as %s\n"), args.logical01? 
"boolean" : "integer"); } + if (*NAstrings==NULL || // user sets na.strings=NULL + (**NAstrings=='\0' && *(NAstrings+1)==NULL)) { // user sets na.strings="" + NAstrings=NULL; // clear NAstrings to save end_NA_string() dealing with these cases (blank_is_a_NAstring was set to true above) + } stripWhite = args.stripWhite; skipEmptyLines = args.skipEmptyLines; diff --git a/src/init.c b/src/init.c index 284c30b4fd..c85fa6122a 100644 --- a/src/init.c +++ b/src/init.c @@ -433,6 +433,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion() { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.14.3"))); + return(ScalarString(mkChar("1.14.5"))); } From 0b46acc9b270ba6a944c769700797374ebb763d9 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 5 Oct 2022 22:20:09 -0600 Subject: [PATCH 478/588] backport #5421 update.dev.pkg name change to 1.14.4 --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index a636b6eb45..f63e206734 100644 --- a/NEWS.md +++ b/NEWS.md @@ -605,8 +605,6 @@ 16. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. -17. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has also passed all tests. As such we don't expect any backwards compatibility concerns. - # data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) @@ -614,6 +612,8 @@ 1. gcc 12.1 (May 2022) now detects and warns about an always-false condition in `fread` which caused a small efficiency saving never to be invoked. Thanks to CRAN for testing latest versions of compilers. +2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. 
+ # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) From feccf046f7f1c767edcf0e197055a359e0b11194 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 6 Oct 2022 00:50:59 -0600 Subject: [PATCH 479/588] GLCI-only: rtools42 has a new version suffix --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 419741f6c5..96189c8c31 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -102,7 +102,7 @@ build: ## build data.table sources as tar.gz archive - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5253-5107-signed.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait .test-template: &test stage: test From f0e6da5dceaedfd322abe63209a2062517ccd0e1 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 6 Oct 2022 08:02:38 +0100 Subject: [PATCH 480/588] fix cran note (#5445) --- man/fwrite.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 870acaac75..ba6eb4751c 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -64,7 +64,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writting raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } } \details{ -\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://www.h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. +\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. To save space, \code{fwrite} prefers to write wide numeric values in scientific notation -- e.g. 
\code{10000000000} takes up much more space than \code{1e+10}. Most file readers (e.g. \code{\link{fread}}) understand scientific notation, so there's no fidelity loss. Like in base R, users can control this by specifying the \code{scipen} argument, which follows the same rules as \code{\link[base]{options}('scipen')}. \code{fwrite} will see how much space a value will take to write in scientific vs. decimal notation, and will only write in scientific notation if the latter is more than \code{scipen} characters wider. For \code{10000000000}, then, \code{1e+10} will be written whenever \code{scipen<6}. From 7f9ff44c60b778c1252ceeb484c565326e83033d Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 6 Oct 2022 16:05:04 -0600 Subject: [PATCH 481/588] pass -Wstrict-prototypes for CRAN (#5477) --- .dev/cc.R | 3 +- NEWS.md | 4 +- src/assign.c | 4 +- src/data.table.h | 83 ++++++++++++++++++++++++++++++++++--- src/forder.c | 10 ++--- src/fread.c | 2 +- src/fwrite.c | 53 ++++++++++++------------ src/fwrite.h | 35 ++++++++-------- src/fwriteR.c | 10 ++--- src/gsumm.c | 62 ++++++++++++++-------------- src/init.c | 101 ++------------------------------------------- src/openmp-utils.c | 8 ++-- src/snprintf.c | 2 +- src/utils.c | 2 +- 14 files changed, 181 insertions(+), 198 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index bc15b6765f..2d60e4200a 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -61,7 +61,8 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys if (debug) { ret = system(sprintf("MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f%sopenmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data_table.so *.c", CC, OMP)) } else { - ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) + ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -Wstrict-prototypes\\ -isystem\\ /usr/share/R/include\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) + # the -isystem suppresses strict-prototypes warnings from R's headers. Look at the output to see what -I is and pass the same path to -isystem. # TODO add -Wextra too? } if (ret) return() diff --git a/NEWS.md b/NEWS.md index f63e206734..ee808e2acd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -610,10 +610,12 @@ ## NOTES -1. gcc 12.1 (May 2022) now detects and warns about an always-false condition in `fread` which caused a small efficiency saving never to be invoked. Thanks to CRAN for testing latest versions of compilers. +1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked. Thanks to CRAN for testing latest versions of compilers. 2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. +3. Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN. 
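   As context for readers not steeped in C: `-Wstrict-prototypes` flags old-style declarations whose parameter list is empty, because `()` in pre-C23 C means "unspecified arguments" rather than "no arguments". The sketch below is illustrative only (not a verbatim excerpt), reusing one of the declarations actually touched in this patch; the fix is simply to write `(void)`.

   ```c
   /* Illustrative sketch of what the flag warns about. */
   void savetl_init();       /* gcc: warning: function declaration isn't a prototype [-Wstrict-prototypes] */
   void savetl_init(void);   /* explicit prototype taking no arguments: no warning */
   ```

   This is why the diffs in this patch change declarations of the form `f()` to `f(void)` and switch the `fwrite` writer declarations to a plain function typedef.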
+ # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) diff --git a/src/assign.c b/src/assign.c index 7fb09fa71e..f48f71e733 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1190,7 +1190,7 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { static SEXP *saveds=NULL; static R_len_t *savedtl=NULL, nalloc=0, nsaved=0; -void savetl_init() { +void savetl_init(void) { if (nsaved || nalloc || saveds || savedtl) { error(_("Internal error: savetl_init checks failed (%d %d %p %p). please report to data.table issue tracker."), nsaved, nalloc, saveds, savedtl); // # nocov } @@ -1231,7 +1231,7 @@ void savetl(SEXP s) nsaved++; } -void savetl_end() { +void savetl_end(void) { // Can get called if nothing has been saved yet (nsaved==0), or even if _init() hasn't been called yet (pointers NULL). Such // as to clear up before error. Also, it might be that nothing needed to be saved anyway. for (int i=0; i> approach to cleanup() on error. */ -static void free_ustr() { +static void free_ustr(void) { for(int i=0; i + #include // for SEXP in writeList() prototype #include "po.h" #define STOP error #define DTPRINT Rprintf #endif -typedef void (*writer_fun_t)(const void *, int64_t, char **); +typedef void writer_fun_t(const void *, int64_t, char **); // in the order of writer_fun_t in fwriteR.c -void writeBool8(); -void writeBool32(); -void writeBool32AsString(); -void writeInt32(); -void writeInt64(); -void writeFloat64(); -void writeComplex(); -void writeITime(); -void writeDateInt32(); -void writeDateFloat64(); -void writePOSIXct(); -void writeNanotime(); -void writeString(); -void writeCategString(); -void writeList(); +writer_fun_t writeBool8; +writer_fun_t writeBool32; +writer_fun_t writeBool32AsString; +writer_fun_t writeInt32; +writer_fun_t writeInt64; +writer_fun_t writeFloat64; +writer_fun_t writeComplex; +writer_fun_t writeITime; +writer_fun_t writeDateInt32; +writer_fun_t writeDateFloat64; +writer_fun_t writePOSIXct; +writer_fun_t writeNanotime; +writer_fun_t writeString; +writer_fun_t writeCategString; +writer_fun_t writeList; void write_chars(const char *source, char **dest); @@ -75,7 +76,7 @@ typedef struct fwriteMainArgs int64_t nrow; // a vector of pointers to all-same-length column vectors const void **columns; - writer_fun_t *funs; // a vector of writer_fun_t function pointers + writer_fun_t **funs; // a vector of writer_fun_t function pointers // length ncol vector containing which fun[] to use for each column // one byte to use 8 times less cache lines than a vector of function pointers would do diff --git a/src/fwriteR.c b/src/fwriteR.c index a36e443156..f64768d70b 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -19,7 +19,7 @@ static const char *sep2start, *sep2end; // if there are no list columns, set sep2=='\0' // Non-agnostic helpers ... -const char *getString(SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c +const char *getString(const SEXP *col, int64_t row) { // TODO: inline for use in fwrite.c SEXP x = col[row]; return x==NA_STRING ? NULL : ENCODED_CHAR(x); } @@ -53,7 +53,7 @@ const char *getCategString(SEXP col, int64_t row) { return x==NA_INTEGER ? 
NULL : ENCODED_CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } -writer_fun_t funs[] = { +writer_fun_t *funs[] = { &writeBool8, &writeBool32, &writeBool32AsString, @@ -73,8 +73,8 @@ writer_fun_t funs[] = { static int32_t whichWriter(SEXP); -void writeList(SEXP *col, int64_t row, char **pch) { - SEXP v = col[row]; +void writeList(const void *col, int64_t row, char **pch) { + SEXP v = ((const SEXP *)col)[row]; int32_t wf = whichWriter(v); if (TYPEOF(v)==VECSXP || wf==INT32_MIN || isFactor(v)) { error(_("Internal error: getMaxListItemLen should have caught this up front.")); // # nocov @@ -82,7 +82,7 @@ void writeList(SEXP *col, int64_t row, char **pch) { char *ch = *pch; write_chars(sep2start, &ch); const void *data = DATAPTR_RO(v); - writer_fun_t fun = funs[wf]; + writer_fun_t *fun = funs[wf]; for (int j=0; j16) shift=nb/2; // TODO: when we have stress-test off mode, do this - mask = (1<>shift) + 1; + bitshift = nb/2; // /2 so that high and low can be uint16_t, and no limit (even for nb=4) to stress-test. + // bitshift=MAX(nb-8,0); if (bitshift>16) bitshift=nb/2; // TODO: when we have stress-test off mode, do this + mask = (1<>bitshift) + 1; grp = (int *)R_alloc(nrow, sizeof(int)); // TODO: use malloc and made this local as not needed globally when all functions here use gather // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc @@ -86,8 +86,8 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { // TODO: enable stress-test mode in tests only (#3205) which can be turned off by default in release to decrease overhead on small data // if that is established to be biting (it may be fine). if (nBatch<1 || batchSize<1 || lastBatchSize<1) { - error(_("Internal error: nrow=%d ngrp=%d nbit=%d shift=%d highSize=%d nBatch=%d batchSize=%d lastBatchSize=%d\n"), // # nocov - nrow, ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize); // # nocov + error(_("Internal error: nrow=%d ngrp=%d nbit=%d bitshift=%d highSize=%d nBatch=%d batchSize=%d lastBatchSize=%d\n"), // # nocov + nrow, ngrp, nb, bitshift, highSize, nBatch, batchSize, lastBatchSize); // # nocov } // initial population of g: #pragma omp parallel for num_threads(getDTthreads(ngrp, false)) @@ -108,9 +108,9 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *restrict op = INTEGER(o); // o is a permutation of 1:nrow int nb = nbit(nrow-1); - int shift = MAX(nb-8, 0); // TODO: experiment nb/2. Here it doesn't have to be /2 currently. - int highSize = ((nrow-1)>>shift) + 1; - //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d shift=%d nBatch=%d\n"), highSize, nb, shift, nBatch); + int bitshift = MAX(nb-8, 0); // TODO: experiment nb/2. Here it doesn't have to be /2 currently. 
+ int highSize = ((nrow-1)>>bitshift) + 1; + //Rprintf(_("When assigning grp[o] = g, highSize=%d nb=%d bitshift=%d nBatch=%d\n"), highSize, nb, bitshift, nBatch); int *counts = calloc(nBatch*highSize, sizeof(int)); // TODO: cache-line align and make highSize a multiple of 64 int *TMP = malloc(nrow*2l*sizeof(int)); // must multiple the long int otherwise overflow may happen, #4295 if (!counts || !TMP ) error(_("Internal error: Failed to allocate counts or TMP when assigning g in gforce")); @@ -120,7 +120,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *my_o = op + b*batchSize; int *restrict my_counts = counts + b*highSize; for (int i=0; i> shift; + const int w = (my_o[i]-1) >> bitshift; my_counts[w]++; } for (int i=0, cum=0; i> shift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too + const int w = (my_o[i]-1) >> bitshift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too int *p = my_tmp + 2*my_counts[w]++; *p++ = my_o[i]-1; *p = my_g[i]; @@ -172,7 +172,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { const int *my_pg = gp + b*batchSize; const int howMany = b==nBatch-1 ? lastBatchSize : batchSize; for (int i=0; i> shift; + const int w = my_pg[i] >> bitshift; my_counts[w]++; my_high[i] = (uint16_t)w; // reduce 4 bytes to 2 } @@ -185,7 +185,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize; memcpy(my_tmpcounts, my_counts, highSize*sizeof(int)); for (int i=0; i> shift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too + const int w = my_pg[i] >> bitshift; // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too my_low[my_tmpcounts[w]++] = (uint16_t)(my_pg[i] & mask); } // counts is now cumulated within batch (with ending values) and we leave it that way @@ -362,7 +362,7 @@ SEXP gsum(SEXP x, SEXP narmArg) if (!anyNA) { #pragma omp parallel for num_threads(getDTthreads(highSize, false)) //schedule(dynamic,1) for (int h=0; h b ? a : b; } -void initDTthreads() { +void initDTthreads(void) { // called at package startup from init.c // also called by setDTthreads(threads=NULL) (default) to reread environment variables; see setDTthreads below // No verbosity here in this setter. Verbosity is in getDTthreads(verbose=TRUE) @@ -169,16 +169,16 @@ SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent, SEXP thro static int pre_fork_DTthreads = 0; -void when_fork() { +void when_fork(void) { pre_fork_DTthreads = DTthreads; DTthreads = 1; } -void after_fork() { +void after_fork(void) { if (RestoreAfterFork) DTthreads = pre_fork_DTthreads; } -void avoid_openmp_hang_within_fork() { +void avoid_openmp_hang_within_fork(void) { // Called once on loading data.table from init.c #ifdef _OPENMP pthread_atfork(&when_fork, &after_fork, NULL); diff --git a/src/snprintf.c b/src/snprintf.c index 94199af707..6b8098c6f2 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -184,7 +184,7 @@ int dt_win_snprintf(char *dest, const size_t n, const char *fmt, ...) 
return nc; } -SEXP test_dt_win_snprintf() +SEXP test_dt_win_snprintf(void) { char buff[50]; diff --git a/src/utils.c b/src/utils.c index 9d6f5d7592..fa10fd97ca 100644 --- a/src/utils.c +++ b/src/utils.c @@ -370,7 +370,7 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { #ifndef NOZLIB #include #endif -SEXP dt_zlib_version() { +SEXP dt_zlib_version(void) { char out[71]; #ifndef NOZLIB snprintf(out, 70, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); From 4a540f4cd3bbcdde41bff80b9259bb26eca17d09 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 6 Oct 2022 16:25:32 -0600 Subject: [PATCH 482/588] comment-only: added PR number #5477 --- .dev/cc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/cc.R b/.dev/cc.R index 2d60e4200a..a092aba351 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -62,7 +62,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys ret = system(sprintf("MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f%sopenmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data_table.so *.c", CC, OMP)) } else { ret = system(sprintf("MAKEFLAGS='-j CC=%s CFLAGS=-f%sopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic\\ -Wstrict-prototypes\\ -isystem\\ /usr/share/R/include\\ -fno-common' R CMD SHLIB -o data_table.so *.c", CC, OMP)) - # the -isystem suppresses strict-prototypes warnings from R's headers. Look at the output to see what -I is and pass the same path to -isystem. + # the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem. # TODO add -Wextra too? } if (ret) return() From 067478406bee13a7b1b3e42a15628244c538c044 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 6 Oct 2022 22:41:51 -0600 Subject: [PATCH 483/588] #5476 follow up thanks @michaelchirico: redundant ternary in error string construction, and comment --- src/fread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fread.c b/src/fread.c index 415787b5a6..510c118736 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1294,10 +1294,10 @@ int freadMain(freadMainArgs _args) { strcmp(ch,"True")==0 || strcmp(ch,"False")==0) STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) - STOP(_("freadMain: NAstring <<%s>> and logical01=%s, this is not permitted."), ch, args.logical01 ? 
"TRUE" : "FALSE"); + STOP(_("freadMain: NAstring <<%s>> and logical01=TRUE, this is not permitted."), ch); char *end; errno = 0; - (void)strtod(ch, &end); // careful not to let "" get to here (see continue above) as strtod considers "" numeric + (void)strtod(ch, &end); // careful not to let "" get to here as strtod considers "" numeric if (errno==0 && (size_t)(end - ch) == nchar) any_number_like_NAstrings = true; } nastr++; From 15a4d74e234b2924f582702bb30c12bd55cd157c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 7 Oct 2022 23:14:53 -0600 Subject: [PATCH 484/588] fwrite dateTimeAs='write.csv' test 1741 massaged to pass R-devel (#5480) --- .dev/CRAN_Release.cmd | 1 + NEWS.md | 28 ++++++++++++++++++++++++++-- inst/tests/tests.Rraw | 25 ++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index a3e7600f4a..6980ff06a3 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -295,6 +295,7 @@ tar xvf R-devel.tar.gz mv R-devel R-devel-strict-clang tar xvf R-devel.tar.gz +sudo apt-get -y build-dep r-base cd R-devel # may be used for revdep testing: .dev/revdep.R. # important to change directory name before building not after because the path is baked into the build, iiuc ./configure CFLAGS="-O0 -Wall -pedantic" diff --git a/NEWS.md b/NEWS.md index ee808e2acd..336920c30a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -610,11 +610,35 @@ ## NOTES -1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked. Thanks to CRAN for testing latest versions of compilers. +1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked, [#5476](https://github.com/Rdatatable/data.table/pull/5476). Thanks to CRAN for testing latest versions of compilers. 2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. -3. Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN. +3. Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN, [#5477](https://github.com/Rdatatable/data.table/pull/5477). + +4. `write.csv` in R-devel no longer responds to `getOption("digits.secs")` for `POSIXct`, [#5478](https://github.com/Rdatatable/data.table/issues/5478). This caused our tests of `fwrite(, dateTimeAs="write.csv")` to fail on CRAN's daily checks using latest daily R-devel. While R-devel discussion continues, and currently it seems like the change is intended with further changes possible, this `data.table` release massages our tests to pass on latest R-devel. The idea is to try to get out of the way of R-devel changes in this regard until the new behavior of `write.csv` is released and confirmed. Package updates are not accepted on CRAN if they do not pass the latest daily version of R-devel, even if R-devel changes after the package update is submitted. 
If the change to `write.csv()` stands, then a future release of `data.table` will be needed to make `fwrite(, dateTimeAs="write.csv")` match `write.csv()` output again in that future version of R onwards. If you use an older version of `data.table` than said future one in the said future version of R, then `fwrite(, dateTimeAs="write.csv")` may not match `write.csv()` if you are using `getOption("digits.secs")` too. However, you can always check that your installation of `data.table` works in your version of R on your platform by simply running `test.data.table()` yourself. Doing so would detect such a situation for you: test 1741 would fail in this case. `test.data.table()` runs the entire suite of tests and is always available to you locally. This way you do not need to rely on our statements about which combinations of versions of R and `data.table` on which platforms we have tested and support; just run `test.data.table()` yourself. Having said that, because test 1741 has been relaxed in this release in order to be accepted on CRAN to pass latest R-devel, this won't be true for this particular release in regard to this particular test. + + ```R + $ R --vanilla + R version 4.2.1 (2022-06-23) -- "Funny-Looking Kid" + > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) + > options(digits.secs=0) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45 + > options(digits.secs=3) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45.012 + + $ Rdevel --vanilla + R Under development (unstable) (2022-10-06 r83040) -- "Unsuffered Consequences" + > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) + > options(digits.secs=0) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45.012 + ``` # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e05f522814..04336f3d5d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11073,15 +11073,34 @@ setattr(DT[[4]], "tzone", NULL) setattr(DT[[5]], "tzone", NULL) # format() now supports digits = 0, to display nsmall decimal places. +# Oct 2022: R-devel changed write.csv behavior to no longer respect digits.secs, #5478. +# For now we'll get out of the way while R-devel discussion is ongoing so that 1.14.4 can +# be submitted to CRAN. +# These tests test fwrite(, dateTimeAs="write.csv") whose +# very point is to match write.csv. Rather than turn off these tests, we'll for now +# continue to test that at least fwrite continues to work as intended. Otherwise +# coverage will drop and we could miss a plain old crash or error bug. +# Note that tzone has been removed above so these tests output the POSIXct in the +# R session's timezone because here dateTimeAs="write.csv" and that's what write.csv does. +# This is the reason `y` can't be fixed strings because depending on the timezone of the +# session which is running test.data.table, the results will be different. +# data.table's fwrite achieves local timezone writing (when dateTimeAs="write.csv") via +# an R call to format.POSIXct in fwriteR.c. By default fwrite writes datetime in UTC for +# consistent and reproducible research, which is different to write.csv. +# TODO: revisit when R-devel has settled w.r.t. write.csv behavior. 
+format_rows_as_csv = function(DT, digits) apply(sapply(DT, format, digits=digits), 1L, paste0, collapse=",") old=options(digits.secs=0) test(1741.3, x1<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + c("A,B,C,D,E", format_rows_as_csv(DT, digits=0L))) + # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=3) test(1741.4, x2<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + c("A,B,C,D,E", format_rows_as_csv(DT, digits=3L))) + # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=6) test(1741.5, x3<-capture.output(fwrite(DT,dateTimeAs="write.csv")), - capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + c("A,B,C,D,E", format_rows_as_csv(DT, digits=6L))) + # capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) # check that extra digits made it into output test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3))) options(old) From 5cd193b2a1974ea50487f377b6d8797d726a5fcc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 8 Oct 2022 13:53:28 -0600 Subject: [PATCH 485/588] url follows from R CMD check --as-cran with xml2 installed (#5482) --- NEWS.md | 6 +++--- README.md | 2 +- vignettes/datatable-programming.Rmd | 2 +- vignettes/datatable-sd-usage.Rmd | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 336920c30a..92efe6af5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -334,7 +334,7 @@ 18. `as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR. -19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. +19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. 20. `uniqueN(DT, by=character())` is now equivalent to `uniqueN(DT)` rather than internal error `'by' is either not integer or is length 0`, [#4594](https://github.com/Rdatatable/data.table/issues/4594). Thanks Marco Colombo for the report, and Michael Chirico for the PR. Similarly for `unique()`, `duplicated()` and `anyDuplicated()`. @@ -700,7 +700,7 @@ ## NOTES -1. 
Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). +1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/behind-the-scenes-of-cran/). 2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. @@ -1609,7 +1609,7 @@ has a better chance of working on Mac. 4. `fread()` has always accepted literal data; e.g. `fread("A,B\n1,2\n3,4")`. It now gains explicit `text=`; e.g. `fread(text="A,B\n1,2\n3,4")`. Unlike the first general purpose `input=` argument, the `text=` argument accepts multi-line input; e.g. `fread(text=c("A,B","1,2","3,4"))`, [#1423](https://github.com/Rdatatable/data.table/issues/1423). Thanks to Douglas Clark for the request and Hugh Parsonage for the PR. -5. `fread()` has always accepted system commands; e.g. `fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. 
This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app in not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvald's advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://www.datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. +5. `fread()` has always accepted system commands; e.g. `fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app in not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvald's advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". 
If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. 6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. Current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x` and is more akin to a bug fix due to the inconsistency with `data.table()`. diff --git a/README.md b/README.md index 47fcf46fc2..46bbfed1e8 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species] ## Community -`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://www.r-pkg.org/starred) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). +`data.table` is widely used by the R community. It is being directly used by hundreds of CRAN and Bioconductor packages, and indirectly by thousands. It is one of the [top most starred](https://medium.datadriveninvestor.com/most-starred-and-forked-github-repos-for-r-in-data-science-fb87a54d2a6a) R packages on GitHub, and was highly rated by the [Depsy project](http://depsy.org/package/r/data.table). If you need help, the `data.table` community is active on [StackOverflow](https://stackoverflow.com/questions/tagged/data.table). ### Stay up-to-date diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 46008e7045..bf481f06f3 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -102,7 +102,7 @@ my_subset = function(data, col, val) { my_subset(iris, Species, "setosa") ``` -Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. 
Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. +Here, we used the base R `substitute` function to transform the call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col`, and `val` with their original names (or values) from their parent environment. The benefits of this approach to the previous ones should be clear. Note that because we operate at the level of language objects, and don't have to resort to string manipulation, we refer to this as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cran.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter for the sake of better understanding this powerful and unique feature. #### Use third party packages diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index fda2c4751f..f84fd6ea63 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -34,7 +34,7 @@ The simpler usage of `.SD` is for column subsetting (i.e., when `.SDcols` is spe ## Loading and Previewing Lahman Data -To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](http://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. +To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. ```{r download_lahman} load('Teams.RData') @@ -46,7 +46,7 @@ setDT(Pitching) Pitching ``` -Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](http://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. +Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. 
# `.SD` on Ungrouped Data From 6f2358b36f9ab03fd06dcf23b35c825c4dc70984 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 9 Oct 2022 01:49:10 -0600 Subject: [PATCH 486/588] NEWS-only: move #5183 and #4442 down to 1.14.4 to reflect patch-1.14 branch --- NEWS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 92efe6af5b..0ad3307d8c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -586,11 +586,7 @@ 12. `?merge` and `?setkey` have been updated to clarify that the row order is retained when `sort=FALSE`, and why `NA`s are always first when `sort=TRUE`, [#2574](https://github.com/Rdatatable/data.table/issues/2574) [#2594](https://github.com/Rdatatable/data.table/issues/2594). Thanks to Davor Josipovic and Markus Bonsch for the reports, and Jan Gorecki for the PR. -13. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. - - > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. - -14. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : +13. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : ``` The option 'datatable.nomatch' is being used and is not set to the default NA. This option @@ -601,9 +597,7 @@ The message is now upgraded to warning that the option is now ignored. -15. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). - -16. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. +14. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. # data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) @@ -640,6 +634,12 @@ "1",2022-10-01 01:23:45.012 ``` +5. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). + +6. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. + + > ... 
the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. + # data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) From 052f8da64c16a099b2889ac8906a2f24ce6f3a9e Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 12 Oct 2022 04:31:18 +0100 Subject: [PATCH 487/588] outdated benchmark link (#5465) --- NEWS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0ad3307d8c..74edb99612 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,5 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -**Benchmarks are regularly updated: [here](https://h2oai.github.io/db-benchmark/)** - # data.table [v1.14.5](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES From 19b7866112614db53eb3e909c097407d91cd6738 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 14 Oct 2022 20:41:45 +0100 Subject: [PATCH 488/588] memrecycle no snprintf overhead (#5463) --- NEWS.md | 2 + src/assign.c | 130 ++++++++++++++++++++++++++------------------------- 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/NEWS.md b/NEWS.md index 74edb99612..f17c8205a7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -597,6 +597,8 @@ 14. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. +15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). + # data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) diff --git a/src/assign.c b/src/assign.c index f48f71e733..61f38a5548 100644 --- a/src/assign.c +++ b/src/assign.c @@ -684,6 +684,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) #define MSGSIZE 1000 static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects +const char *targetDesc(const int colnum, const char *colname) { + static char str[501]; // #5463 + snprintf(str, 500, colnum==0 ? _("target vector") : _("column %d named '%s'"), colnum, colname); + return str; +} + const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname) // like memcpy but recycles single-item source // 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer() @@ -707,8 +713,6 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (colname==NULL) error(_("Internal error: memrecycle has received NULL colname")); // # nocov *memrecycle_message = '\0'; - static char targetDesc[501]; // from 1.14.1 coerceAs reuses memrecycle for a target vector, PR#4491 - snprintf(targetDesc, 500, colnum==0 ? 
_("target vector") : _("column %d named '%s'"), colnum, colname); int protecti=0; const bool sourceIsFactor=isFactor(source), targetIsFactor=isFactor(target); const bool sourceIsI64=isReal(source) && INHERITS(source, char_integer64); @@ -730,7 +734,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel) { - error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc, val, nlevel); + error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc(colnum, colname), val, nlevel); } } } else { @@ -738,7 +742,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con for (int i=0; inlevel)) { - error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc, val, nlevel); + error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc(colnum, colname), val, nlevel); } } } @@ -830,19 +834,19 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } } } else if (isString(source) && !isString(target) && !isNewList(target)) { - warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), targetIsI64?"integer64":type2char(TYPEOF(target)), targetDesc); + warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), targetIsI64?"integer64":type2char(TYPEOF(target)), targetDesc(colnum, colname)); // this "Coercing ..." warning first to give context in case coerceVector warns 'NAs introduced by coercion' // and also because 'character' to integer/double coercion is often a user mistake (e.g. wrong target column, or wrong // variable on RHS) which they are more likely to appreciate than find inconvenient source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; } else if (isNewList(source) && !isNewList(target)) { if (targetIsI64) { - error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc); + error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc(colnum, colname)); // because R's coerceVector doesn't know about integer64 } // as in base R; e.g. let as.double(list(1,2,3)) work but not as.double(list(1,c(2,4),3)) // relied on by NNS, simstudy and table.express; tests 1294.* - warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc); + warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc(colnum, colname)); source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; } else if ((TYPEOF(target)!=TYPEOF(source) || targetIsI64!=sourceIsI64) && !isNewList(target)) { if (GetVerbose()>=3) { @@ -850,27 +854,27 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con Rprintf(_("Zero-copy coerce when assigning '%s' to '%s' %s.\n"), sourceIsI64 ? "integer64" : type2char(TYPEOF(source)), targetIsI64 ? "integer64" : type2char(TYPEOF(target)), - targetDesc); + targetDesc(colnum, colname)); } // The following checks are up front here, otherwise we'd need them twice in the two branches // inside BODY that cater for 'where' or not. Maybe there's a way to merge the two macros in future. 
// The idea is to do these range checks without calling coerceVector() (which allocates) -#define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO, FMTVAL) {{ \ - const STYPE *sd = (const STYPE *)RFUN(source); \ - for (int i=0; i0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls) From 0895fa247afcf6b38044bd5f56c0d209691ddb31 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 17 Oct 2022 16:43:32 -0600 Subject: [PATCH 489/588] NEWS-only: publish date for 1.14.4 added to title; see patch-14.1 branch for tags --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index f17c8205a7..57214ade88 100644 --- a/NEWS.md +++ b/NEWS.md @@ -600,7 +600,7 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). -# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) +# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) ## NOTES From 9acf6d974d297bbfeb5a6eda3de76db9accc7acf Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 30 Oct 2022 17:55:33 -0400 Subject: [PATCH 490/588] DESCRIPTION-only: move contributor list to the bottom so it can grow at the bottom while all other fields are easy to see straight away at the top --- DESCRIPTION | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4f5c45640d..d3a7f9b448 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,16 @@ Package: data.table Version: 1.14.5 Title: Extension of `data.frame` +Depends: R (>= 3.1.0) +Imports: methods +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown +SystemRequirements: zlib +Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. +License: MPL-2.0 | file LICENSE +URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table +BugReports: https://github.com/Rdatatable/data.table/issues +VignetteBuilder: knitr +ByteCompile: TRUE Authors@R: c( person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), person("Arun","Srinivasan", role="aut", email="asrini@pm.me"), @@ -73,13 +83,4 @@ Authors@R: c( person("Josh","O'Brien", role="ctb"), person("Dereck","de Mezquita", role="ctb"), person("Michael","Czekanski", role="ctb")) -Depends: R (>= 3.1.0) -Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown -SystemRequirements: zlib -Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
-License: MPL-2.0 | file LICENSE -URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table -BugReports: https://github.com/Rdatatable/data.table/issues -VignetteBuilder: knitr -ByteCompile: TRUE + From 16a2c1c60ef2f2564b89b9b5fbb2e91bafd05e4f Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 31 Oct 2022 13:45:21 -0400 Subject: [PATCH 491/588] DESCRIPTION-only: GLCI requires last line to end abruptly; i.e. no eol at the end of last line. Longer comment: https://github.com/Rdatatable/data.table/commit/9acf6d974d297bbfeb5a6eda3de76db9accc7acf#r88447165 --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d3a7f9b448..55754ba976 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -82,5 +82,5 @@ Authors@R: c( person("Olivier","Delmarcell", role="ctb"), person("Josh","O'Brien", role="ctb"), person("Dereck","de Mezquita", role="ctb"), - person("Michael","Czekanski", role="ctb")) - + person("Michael","Czekanski", role="ctb") + ) From cd02c08698bc475aefcf9ab7c4c011b126b1639e Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 1 Nov 2022 15:15:15 +0000 Subject: [PATCH 492/588] =?UTF-8?q?ensure=20that=20DCF=20does=20not=20have?= =?UTF-8?q?=20empty=20line=20at=20the=20end=20before=20appending=20?= =?UTF-8?q?=E2=80=A6=20(#5508)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 96189c8c31..e4a5fb230b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -63,6 +63,7 @@ build: ## build data.table sources as tar.gz archive before_script: - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus + - sed -i '/^[[:space:]]*$/d' ./DESCRIPTION ## make last line end abruptly; i.e. without a final \n - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION script: - R CMD build . From 3f190612ead213de237ec54116d72274c0035b3b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 1 Nov 2022 20:15:10 -0600 Subject: [PATCH 493/588] Strengthen CRAN_Release.cmd to find UBSAN errors in data.table.Rcheck (#5509) --- .dev/CRAN_Release.cmd | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 6980ff06a3..b26fac02db 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -289,11 +289,11 @@ cd ~/build wget -N https://stat.ethz.ch/R/daily/R-devel.tar.gz rm -rf R-devel rm -rf R-devel-strict-* -tar xvf R-devel.tar.gz +tar xf R-devel.tar.gz mv R-devel R-devel-strict-gcc -tar xvf R-devel.tar.gz +tar xf R-devel.tar.gz mv R-devel R-devel-strict-clang -tar xvf R-devel.tar.gz +tar xf R-devel.tar.gz sudo apt-get -y build-dep r-base cd R-devel # may be used for revdep testing: .dev/revdep.R. @@ -302,8 +302,12 @@ cd R-devel # may be used for revdep testing: .dev/revdep.R. 
make # use latest available `apt-cache search gcc-` or `clang-` +# wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - +# sudo add-apt-repository 'deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-15 main' +# sudo apt-get install clang-15 + cd ~/build/R-devel-strict-clang -./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +./configure --without-recommended-packages --disable-byte-compiled-packages --enable-strict-barrier --disable-long-double CC="clang-15 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-sanitize=alignment -fno-omit-frame-pointer" CFLAGS="-g -O3 -Wall -pedantic" make cd ~/build/R-devel-strict-gcc @@ -329,15 +333,22 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-gcc CMD INSTALL data.table_1.14.1.tar.gz -Rdevel-strict-clang CMD INSTALL data.table_1.14.1.tar.gz -# Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here -Rdevel-strict-gcc -Rdevel-strict-clang # repeat below with clang and gcc +Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.5.tar.gz +# Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be +# passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R +# So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.5.tar.gz +# Use the (failed) output to get the list of currently needed packages and install them +Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested options(repos = "http://cloud.r-project.org") -install.packages(c("bit64","xts","nanotime","R.utils","yaml")) # minimum packages needed to not skip any tests in test.data.table() -# install.packages(c("curl","knitr")) # for `R CMD check` when not strict. Too slow to install when strict +install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", "yaml", "knitr", "rmarkdown")) +# Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check +q("no") +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.5.tar.gz +# UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. 
+find data.table.Rcheck -name "*.Rout" -exec grep -H "runtime error" {} \; + require(data.table) test.data.table(script="*.Rraw") # 7 mins (vs 1min normally) under UBSAN, ASAN and --strict-barrier # without the fix in PR#3515, the --disable-long-double lumped into this build does now work and correctly reproduces the noLD problem From 74d2cab78e3b09e3f8f78cc910dfb1dd1e64a98a Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 2 Nov 2022 00:01:02 -0400 Subject: [PATCH 494/588] CRAN_Release.cmd-only: added Ncpus thanks comment from Michael in #5509 --- .dev/CRAN_Release.cmd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index b26fac02db..7d042ad754 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -342,12 +342,13 @@ Rdevel-strict-[gcc|clang] CMD check data.table_1.14.5.tar.gz Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested options(repos = "http://cloud.r-project.org") -install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", "yaml", "knitr", "rmarkdown")) +install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", "yaml", "knitr", "rmarkdown", "markdown"), + Ncpus=4) # Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check q("no") Rdevel-strict-[gcc|clang] CMD check data.table_1.14.5.tar.gz # UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. -find data.table.Rcheck -name "*.Rout" -exec grep -H "runtime error" {} \; +find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; require(data.table) test.data.table(script="*.Rraw") # 7 mins (vs 1min normally) under UBSAN, ASAN and --strict-barrier From 9273a5897a012a2a5969dfb90a96bd1efa310784 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 2 Nov 2022 00:57:48 -0400 Subject: [PATCH 495/588] relax test 1962.098 to pass R-devel changes to POSIXlt --- inst/tests/tests.Rraw | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 04336f3d5d..65112984b9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13935,10 +13935,7 @@ y = as.ITime('543210', format = '%S%M%H') test(1962.095, y, structure(37974L, class = "ITime")) test(1962.096, capture.output(print(y)), '[1] "10:32:54"') test(1962.097, rep(y, 2L), structure(c(37974L, 37974L), class = "ITime")) -test(1962.098, as.POSIXlt(y, date = '2018-12-01', tz = 'UTC'), - structure(list(sec = 54, min = 32L, hour = 10L, mday = 1L, mon = 11L, - year = 118L, wday = 6L, yday = 334L, isdst = 0L), - class = c("POSIXlt", "POSIXt"), tzone = "UTC")) +test(1962.098, format(as.POSIXlt(y, date='2018-12-01', tz='UTC'), usetz=TRUE), "2018-12-01 10:32:54 UTC") test(1962.099, as.POSIXct(x, y), structure(1533119574, tzone = "UTC", class = c("POSIXct", "POSIXt"))) From a4c2b01720afa94dae69344c9889167122790e91 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 2 Nov 2022 01:03:27 -0400 Subject: [PATCH 496/588] fread.c remove unused variable 'resj' closes #5511 --- src/fread.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fread.c b/src/fread.c index 510c118736..fec31f7302 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2536,9 +2536,8 @@ int freadMain(freadMainArgs _args) { rowSize1 = rowSize4 = rowSize8 = 0; nStringCols = 0; nNonStringCols = 0; - for (int j=0, resj=-1; j Date: Tue, 8 Nov 2022 17:41:25 
-0500 Subject: [PATCH 497/588] test.data.table(memtest=TRUE) (#5515) --- NEWS.md | 2 + R/test.data.table.R | 103 ++++++++++++++++++----------------------- inst/tests/tests.Rraw | 9 ++-- man/test.data.table.Rd | 4 +- 4 files changed, 55 insertions(+), 63 deletions(-) diff --git a/NEWS.md b/NEWS.md index 57214ade88..15bf7e8eab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -599,6 +599,8 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). +16. `test.data.table()` no longer creates `DT` in `.GlobalEnv` and gains `memtest=` for use on Linux to report which tests use the most memory. + # data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) diff --git a/R/test.data.table.R b/R/test.data.table.R index 298fc34c13..1c212f58b2 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,8 +1,12 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent) { +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, + memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0)) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) + memtest = as.integer(memtest) + stopifnot(length(memtest)==1L, memtest %in% 0:2) if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # package developer # nocov start + dev = TRUE if ("package:data.table" %chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") subdir = file.path("inst","tests") @@ -10,6 +14,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov end } else { # i) R CMD check and ii) user running test.data.table() + dev = FALSE rootdir = getNamespaceInfo("data.table","path") subdir = "tests" env = new.env(parent=parent.env(.GlobalEnv)) # when user runs test.data.table() we don't want their variables in .GlobalEnv affecting tests, #3705 @@ -112,14 +117,18 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("whichfail", NULL, envir=env) assign("started.at", proc.time(), envir=env) assign("lasttime", proc.time()[3L], envir=env) # used by test() to attribute time inbetween tests to the next test - assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L ), envir=env) # test timings aggregated to integer id - assign("memtest", as.logical(Sys.getenv("TEST_DATA_TABLE_MEMTEST", "FALSE")), envir=env) + assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L, RSS=0.0 ), envir=env) # test timings aggregated to integer id + assign("memtest", memtest, envir=env) assign("filename", fn, envir=env) - assign("inittime", as.integer(Sys.time()), envir=env) # keep measures from various test.data.table runs assign("showProgress", showProgress, envir=env) owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190 on.exit(setwd(owd)) + + if (memtest) { + catf("\n***\n*** memtest=%d. This should be the first task in a fresh R session for best results. 
Ctrl-C now if not.\n***\n\n", memtest) + if (is.na(ps_mem())) stopf("memtest intended for Linux. Step through ps_mem() to see what went wrong.") + } err = try(sys.source(fn, envir=env), silent=silent) @@ -174,42 +183,27 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F } # There aren't any errors, so we can use up 11 lines for the timings table - timings = env$timings - DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT - if ((x<-sum(timings[["nTest"]])) != ntest) { - warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov + nTest = RSS = NULL # to avoid 'no visible binding' note + timings = env$timings[nTest>0] + if (!memtest) { + ans = head(timings[if (dev) -1L else TRUE][order(-time)], 10L)[,RSS:=NULL] # exclude id 1 in dev as that includes JIT + if ((x<-sum(timings[["nTest"]])) != ntest) { + warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov + } + catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-ans[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) + print(ans, class=FALSE) + } else { + y = head(order(-diff(timings$RSS)), 10L) + ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here + catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") + print(ans, class=FALSE) + plot(timings$RSS, main=basename(fn), ylab="RSS (MB)") } - catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) - print(DT, class=FALSE) catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) - - ## this chunk requires to include new suggested deps: graphics, grDevices - #memtest.plot = function(.inittime) { - # if (!all(requireNamespace(c("graphics","grDevices"), quietly=TRUE))) return(invisible()) - # inittime=PS_rss=GC_used=GC_max_used=NULL - # m = fread("memtest.csv")[inittime==.inittime] - # if (nrow(m)) { - # ps_na = allNA(m[["PS_rss"]]) # OS with no 'ps -o rss R' support - # grDevices::png("memtest.png") - # p = graphics::par(mfrow=c(if (ps_na) 2 else 3, 2)) - # if (!ps_na) { - # m[, graphics::plot(test, PS_rss, pch=18, xlab="test num", ylab="mem MB", main="ps -o rss R")] - # m[, graphics::plot(timestamp, PS_rss, type="l", xlab="timestamp", ylab="mem MB", main="ps -o rss R")] - # } - # m[, graphics::plot(test, GC_used, pch=18, xlab="test num", ylab="mem MB", main="gc used")] - # m[, graphics::plot(timestamp, GC_used, type="l", xlab="timestamp", ylab="mem MB", main="gc used")] - # m[, graphics::plot(test, GC_max_used, pch=18, xlab="test num", ylab="mem MB", main="gc max used")] - # m[, graphics::plot(timestamp, GC_max_used, type="l", xlab="timestamp", ylab="mem MB", main="gc max used")] - # graphics::par(p) - # grDevices::dev.off() - # } else { - # warningf("test.data.table runs with memory testing but did not collect any memory statistics.") - # } - #} - #if (memtest<-get("memtest", envir=env)) memtest.plot(get("inittime", envir=env)) - - invisible(nfail==0L) + ans = nfail==0L + attr(ans, "timings") = timings # as attr to not upset callers who expect a TRUE/FALSE result + invisible(ans) } # nocov start @@ -235,17 +229,16 @@ INT = function(...) 
{ as.integer(c(...)) } # utility used in tests.Rraw ps_mem = function() { # nocov start - cmd = sprintf("ps -o rss %s | tail -1", Sys.getpid()) - ans = tryCatch(as.numeric(system(cmd, intern=TRUE, ignore.stderr=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) - stopifnot(length(ans)==1L) # extra check if other OSes would not handle 'tail -1' properly for some reason - # returns RSS memory occupied by current R process in MB rounded to 1 decimal places (as in gc), ps already returns KB - c("PS_rss"=round(ans / 1024, 1L)) + cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB + ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) + if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case + round(ans / 1024, 1L) # return MB # nocov end } gc_mem = function() { # nocov start - # gc reported memory in MB + # gc reports memory in MB m = apply(gc()[, c(2L, 4L, 6L)], 2L, sum) names(m) = c("GC_used", "GC_gc_trigger", "GC_max_used") m @@ -278,16 +271,19 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) - inittime = get("inittime", parent.frame()) filename = get("filename", parent.frame()) foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) - time = nTest = NULL # to avoid 'no visible binding' note + time = nTest = RSS = NULL # to avoid 'no visible binding' note if (num>0) on.exit( { - now = proc.time()[3L] - took = now-lasttime # so that prep time between tests is attributed to the following test - assign("lasttime", now, parent.frame(), inherits=TRUE) - timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ] + took = proc.time()[3L]-lasttime # so that prep time between tests is attributed to the following test + timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] + if (memtest) { + if (memtest==1L) gc() # see #5515 for before/after + timings[as.integer(num), RSS:=max(ps_mem(),RSS), verbose=FALSE] + if (memtest==2L) gc() + } + assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest } ) if (showProgress) # \r can't be in gettextf msg @@ -300,7 +296,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # not be flushed to the output upon segfault, depending on OS). } else { # not `test.data.table` but developer running tests manually; i.e. 
`cc(F); test(...)` - memtest = FALSE # nocov + memtest = 0L # nocov filename = NA_character_ # nocov foreign = FALSE # nocov ; assumes users of 'cc(F); test(...)' has LANGUAGE=en showProgress = FALSE # nocov @@ -330,9 +326,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no actual$message <<- c(actual$message, conditionMessage(m)) m } - if (memtest) { - timestamp = as.numeric(Sys.time()) # nocov - } if (is.null(output) && is.null(notOutput)) { x = suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)) # save the overhead of capture.output() since there are a lot of tests, often called in loops @@ -340,10 +333,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } else { out = capture.output(print(x <- suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)))) } - if (memtest) { - mem = as.list(c(inittime=inittime, filename=basename(filename), timestamp=timestamp, test=num, ps_mem(), gc_mem())) # nocov - fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov - } fail = FALSE if (.test.data.table && num>0) { if (num0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") - DTfun = DT # just in dev-mode, DT() gets overwritten in .GlobalEnv by DT objects here in tests.Rraw; we restore DT() in test 2212 } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -163,7 +162,8 @@ base_messages = list( ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") -test(1.2, tables(silent=TRUE), data.table(NAME="timings", NROW=9999L, NCOL=3L, MB=0, COLS=list(c("ID","time","nTest")), KEY=list(NULL))) +test(1.2, tables(silent=TRUE)[,.(NAME,NROW,MB)], # memtest=TRUE adds some columns so exclude NCOL and COLS here + data.table(NAME="timings", NROW=9999L, MB=0)) TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7) setkey(TESTDT,a,b) @@ -15325,10 +15325,10 @@ test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans) # source() printing edge case; #2369 setup = c('DT = data.table(a = 1)') writeLines(c(setup, 'DT[ , a := 1]'), tmp<-tempfile()) -test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE))) +test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo=TRUE, local=TRUE)), fixed=TRUE))) # local= #5514 ## test force-printing still works writeLines(c(setup, 'DT[ , a := 1][]'), tmp) -test(2036.2, source(tmp, echo = TRUE), output = "1:\\s+1") +test(2036.2, source(tmp, echo=TRUE, local=TRUE), output="1:\\s+1") # more helpful guidance when assigning before setDT() after readRDS(); #1729 DT = data.table(a = 1:3) @@ -18317,7 +18317,6 @@ for (col in c("a","b","c")) { # DT() functional form, #4872 #5106 #5107 #5129 if (base::getRversion() >= "4.1.0") { # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 - if (exists("DTfun")) DT=DTfun # just in dev-mode restore DT() in .GlobalEnv as DT object overwrote it in tests above droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below test(2212.011, EVAL("mtcars |> DT(mpg>20, .(mean_hp=round(mean(hp),2)), by=cyl)"), data.frame(cyl=c(6,4), mean_hp=c(110.0, 82.64))) diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index 
ba0fe25f9c..2df2a32842 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -7,7 +7,8 @@ \usage{ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, - showProgress = interactive() && !silent) + showProgress = interactive() && !silent, + memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0)) } \arguments{ \item{script}{ Run arbitrary R test script. } @@ -15,6 +16,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } +\item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } } \details{ Runs a series of tests. These can be used to see features and examples of usage, too. Running test.data.table will tell you the full location of the test file(s) to open. From a2ecd45e99ea16946a3f09e197eb790ee5d43434 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 8 Nov 2022 17:47:33 -0700 Subject: [PATCH 498/588] GLCI memtest env variable now takes '1' not 'TRUE', #5515 --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e4a5fb230b..140ccef6c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -171,7 +171,7 @@ test-rel-vanilla-lin: ## minimal, no suggested deps, no vignettes or manuals, me <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev variables: - TEST_DATA_TABLE_MEMTEST: "TRUE" + TEST_DATA_TABLE_MEMTEST: "1" before_script: - *cp-src - rm -r bus From e9567169de2adb9a1958d5d4fe21aa3b06eb794b Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 8 Nov 2022 20:36:49 -0700 Subject: [PATCH 499/588] CRAN_Release.cmd: check .GlobalEnv not altered by test.data.table() to prevent #5514 happening again --- .dev/CRAN_Release.cmd | 8 ++++++++ R/test.data.table.R | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 7d042ad754..8495c4998b 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -227,9 +227,17 @@ PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.1.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) require(data.table) +f1 = tempfile() +f2 = tempfile() +suppressWarnings(try(rm(list=c(".Last",".Random.seed")))) +save.image(f1) test.data.table(script="other.Rraw") test.data.table(script="*.Rraw") test.data.table(verbose=TRUE) # since main.R no longer tests verbose mode +suppressWarnings(try(rm(list=c(".Last",".Random.seed")))) +save.image(f2) +system(paste("diff",f1,f2)) # to detect any changes to .GlobalEnv, #5514 +# print(load(f1)); print(load(f2)) # run if diff found any difference # check example() works on every exported function, with these sticter options too, and also that all help pages have examples options(warn=2, warnPartialMatchArgs=TRUE, warnPartialMatchAttr=TRUE, warnPartialMatchDollar=TRUE) diff --git a/R/test.data.table.R b/R/test.data.table.R index 1c212f58b2..fd7750ef08 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R 
@@ -33,7 +33,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F return(sapply(scripts, function(fn) { err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) cat("\n"); - identical(err, TRUE) + isTRUE(err) })) # nocov end } From e11a09b523039c4822cd4bbd2d0c7e7fbc6e7f94 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 9 Nov 2022 21:29:45 -0700 Subject: [PATCH 500/588] Move suggests from tests.Rraw to other.Rraw (#5518) --- inst/tests/other.Rraw | 486 +++++++++++++++++++++++++++++++++++++++- inst/tests/tests.Rraw | 503 ++---------------------------------------- 2 files changed, 501 insertions(+), 488 deletions(-) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 11b00cc546..c6520a377c 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,10 +1,16 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. # So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by # users running test.data.table("other.Rraw"). # zoo needs to be before xts for #5101 otherwise xts's dependency zoo gets attached at position 2 if xts is loaded first +# Optional Suggest-ed package tests moved from tests.Rraw to here in #5516. Retaining their comments: +# "xts", # we have xts methods in R/xts.R +# "nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though) +# "yaml" # for fread's yaml argument (csvy capability) +# # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though + if (exists("test.data.table",.GlobalEnv,inherits=FALSE) || !"package:data.table" %in% search()) { stop("Usage: R CMD INSTALL; require(data.table); test.data.table('other.Rraw')") @@ -207,3 +213,481 @@ if (loaded[["sf"]]) { #2273 test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4") } +if (loaded[["yaml"]]) { # csvy; #1701. 
Was 2032-2033 in tests.Rraw, #5516 + f = testDir("csvy/test.csvy") + DT = data.table(var1 = c("A", "B"), + var2 = c(1L, 3L), + var3 = c(2.5, 4.3)) + DT_yaml = copy(DT) + setattr(DT_yaml, 'yaml_metadata', + list(name = "my-dataset", + source = "https://github.com/leeper/csvy/tree/master/inst/examples", + schema = list(fields = list( + list(name = "var1", title = "variable 1", type = "string", + description = "explaining var1", + constraints = list(list(required = TRUE))), + list(name = "var2", title = "variable 2", type = "integer"), + list(name = "var3", title = "variable 3", type = "number") + )))) + ## with skip = '__auto__', fread can figure out + ## how to start after the metadata (just ignoring it) + test(16.01, fread(f), DT) + ## should be the same, but with yaml_metadata attribute + test(16.02, fread(f, yaml = TRUE), DT_yaml) + ## testing verbose messaging + test(16.03, fread(f, yaml = TRUE, verbose = TRUE), + DT_yaml, output = 'Processed.*YAML metadata.*') + ## this file is identical, except the body of the + ## YAML header is commented out with # (should read identically) + test(16.04, + fread(testDir('csvy/test_comment.csvy'), yaml = TRUE), + DT_yaml) + ## user input is taken as most intentional & overrides YAML + DT_yaml[ , var2 := as.numeric(var2)] + test(16.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')), + DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2') + ## extraneous/unused fields shouldn't throw off reading + DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE) + test(16.06, names(DT), c('Date', 'WTI')) + test(16.07, attr(DT, 'yaml_metadata'), + list(names = c("Date", "WTI"), class = "data.frame", + title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv", + fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv", + sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D", + source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples", + item = "PET", sourcekey = "RWTC", freq = "Daily", + rate = "MID", type = "price", units = "Dollars per Barrel", + latestdate = "2015-08-31", releasedate = "2015-09-02", + nextreleasedate = "2015-09-10", source = "Thomson Reuters", + contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800")) + ## yaml can also handle sep, dec, quote, and na.strings + DT_out = data.table(var1 = c("A", "B"), + var2 = c(1L, NA), + var3 = c(2.5, 4.3)) + meta = + list(name = NULL, + schema = list(fields = list( + list(name = "var1", title = "variable 1", type = "string", + description = "a single-quoted character variable"), + list(name = "var2", title = "variable 2", type = "integer"), + list(name = "var3", title = "variable 3", type = "number", + description = "European-style numeric") + )), + header = TRUE, sep = "|", dec = ",", + quote = "'", na.strings = "@") + attr(DT_out, 'yaml_metadata') = meta + test(16.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out) + ## user-specified attributes can override data from YAML + meta$sep = "-" + setattr(DT_out, 'yaml_metadata', meta) + test(16.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out, + message = 'User-supplied.*sep.*override') + + meta$sep = "|" + setattr(DT_out, 'yaml_metadata', meta) + test(16.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE), + DT_out, message = 'User-supplied.*header.*override') + col.names = c('x', 'y', 'z') + setnames(DT_out, col.names) + test(16.11, 
fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out, + message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override')) + + test(16.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names), + DT_out, message = 'User-supplied.*col.names') + + setnames(DT_out, c('var1', 'var2', 'var3')) + meta$quote = "^" + setattr(DT_out, 'yaml_metadata', meta) + test(16.13, fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"), + DT_out, message = 'User-supplied.*quote') + + meta$quote = "'" + meta$dec = "." + setattr(DT_out, 'yaml_metadata', meta) + test(16.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','), + DT_out, message = 'User-supplied.*dec') + + meta$dec = ',' + meta$na.strings = 'NA' + setattr(DT_out, 'yaml_metadata', meta) + test(16.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'), + DT_out, message = 'User-supplied.*na.strings') + + ## error if YAML malformed + test(16.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE), + error = 'Reached the end.*YAML.*valid csvy') + ## use any other CSV in test directory which doesn't have YAML + if (loaded[["R.utils"]]) test(16.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE), + error = 'Encountered.*unskipped.*constitute.*valid YAML') + ## no problem if some fields are missing a type (just + ## resort to standard auto-inferral, i.e., identical to + ## the case of partially-specified colClasses) + DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L), + var3 = c(2.5, 4.3)) + setattr(DT, 'yaml_metadata', + list(name = "my-dataset", source = "https://github.com/leeper/csvy/tree/master/inst/examples", + schema = list(fields = list( + list(name = "var1"), list(name = "var2", type = "integer"), + list(name = "var3", type = "number") + )))) + test(16.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT) + ## skip applies starting after the YAML header + setattr(DT, 'yaml_metadata', + list(schema = list(fields = list( + list(name = "var1", type = "string"), + list(name = "var2", type = "integer"), + list(name = "var3", type = "number") + )))) + test(16.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT) + ## user-supplied col.names override metadata (as for colClasses) + cn = paste0('V', 1:3) + setnames(DT, cn) + test(16.20, fread(testDir('csvy/test_skip.csvy'), + yaml = TRUE, skip = 2L, col.names = cn), + DT, message = 'User-supplied column names.*override.*YAML') + ## invalid value fails + test(16.21, fread(f, yaml = 'gobble'), + error = 'isTRUEorFALSE\\(yaml\\) is not TRUE') + + ## warning that skip-as-search doesn't work with yaml + DT_yaml[ , var2 := as.integer(var2)] + test(16.22, fread(f, skip = 'var1,', yaml = TRUE), + DT_yaml, warning = 'Combining a search.*YAML.*') + + # fwrite csvy: #3534 + tmp = tempfile() + DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) + # force eol for platform independence + fwrite(DT, tmp, yaml = TRUE, eol = '\n') + as_read = readLines(tmp) + test(17.01, as_read[c(1L, 24L)], c('---', '---')) + test(17.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) + test(17.03, grepl('creation_time_utc', as_read[3L])) + test(17.04, as_read[4:23], + c("schema:", " fields:", " - name: a", " type: integer", + " - name: b", " type: numeric", " - name: c", " type: character", + "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", + # NB: apparently \n is encoded like this in YAML + "eol: |2+", 
"", "na.strings: ''", "dec: '.'", "qmethod: double", + "logical01: no")) + tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") + test(17.05, as_read[25:30], tbl_body) + + # windows eol + fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') + test(17.06, readLines(tmp)[18L], 'eol: "\\r\\n"') + + # multi-class columns + DT[ , t := .POSIXct(1:5, tz = 'UTC')] + fwrite(DT, tmp, yaml = TRUE) + as_read = readLines(tmp) + test(17.07, as_read[13L], " type: POSIXct") + + # ~invertibility~ + # fread side needs to be improved for Hugh's colClasses update + DT[ , t := NULL] + fwrite(DT, tmp, yaml = TRUE) + DT2 = fread(tmp, yaml = TRUE) + # remove metadata to compare + attr(DT2, 'yaml_metadata') = NULL + test(17.08, all.equal(DT, DT2)) + + test(17.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), + output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*")) + + # TODO: test gzip'd yaml which is now supported + + # yaml + bom arguments + DT = data.table(l=letters, n=1:26) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 in tests.Rraw + lines = readLines(fcon) + lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows + # remove the blank here so we don't need to change this test if/when that changes in yaml package + test(17.11, length(lines), 48L) + close(fcon) + test(17.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + # re-write should have same output (not appended) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") + lines = readLines(fcon) + lines = lines[lines!=""] + test(17.13, length(lines), 48L) + close(fcon) + test(17.14, fread(f), DT) + unlink(f) +} + +if (loaded[["xts"]]) { # was 1465 in tests.Rraw, #5516 + # data.table-xts conversion #882 + # Date index + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + dt_xt = as.data.table(xt) + xt_dt = as.xts.data.table(dt) + test(18.01, all.equal(dt, dt_xt, check.attributes = FALSE)) + test(18.02, xt, xt_dt) + # POSIXct index + dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + dt_xt = as.data.table(xt) + xt_dt = as.xts.data.table(dt) + test(18.03, all.equal(dt, dt_xt, check.attributes = FALSE)) + test(18.04, xt, xt_dt) + # index types returned from to.period + dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index) + xt_w = xts::to.weekly(xt) + xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w)) + xt_m = xts::to.monthly(xt) + xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m)) + xt_q = xts::to.quarterly(xt) + xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q)) + xt_y = xts::to.yearly(xt) + xt_dt_xt_y = 
as.xts.data.table(as.data.table(xt_y)) + test(18.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE)) + test(18.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE)) + test(18.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE)) + test(18.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE)) + + test(18.09, xts::last(1:5), 5L) # was test 1531 + + # xts issue from Joshua, #1347 + x = as.Date(1:5, origin="2015-01-01") + test(18.10, last(x), tail(x, 1L)) # was test 1559 + + x = xts(1:100, Sys.Date()+1:100) + test(18.11, last(x,10), x[91:100,]) # was test 841 + # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. + # But that isn't tested by R CMD check because xts is loaded above data.table, there. + # So to make this test is relevant, run it in fresh R session directly, after: "require(xts);require(data.table)" + # rather than: "require(data.table);require(xts)" + # Which was the main thrust of bug#2312 fixed in v1.8.3 + + # fix for #1484; was test 1589 + x = xts::as.xts(8, order.by = as.Date("2016-01-03")) + test(18.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE)) + + # IDate support in as.xts.data.table #1499; was test 1663 + dt <- data.table(date = c(as.IDate("2014-12-31"), + as.IDate("2015-12-31"), + as.IDate("2016-12-31")), + nav = c(100,101,99), + key = "date") + dt.xts <- as.xts.data.table(dt) + test(18.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) + + # additional coverage missing uncovered in #3117 + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + test(18.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) + names(xt)[1L] = 'index' + test(18.15, as.data.table(xt), error = 'Input xts object should not') + names(xt)[1L] = 'quantity' + setcolorder(dt, c(3, 1, 2)) + if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286 + test(18.16, as.xts(dt), error = 'data.table must have a time based') + setcolorder(dt, c(2, 3, 1)) + dt[ , char_col := 'a'] + test(18.17, as.xts(dt), xt, warning = 'columns are not numeric') + if (base::getRversion() < "3.6.0") rm(as.xts) + + # 890 -- key argument for as.data.table.xts + x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) + old = options(datatable.verbose=FALSE) + test(18.18, capture.output(as.data.table(x, key="index")), + c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", + " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", + " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", + " 9: 1970-01-10 9", "10: 1970-01-11 10")) + options(old) + + # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 + M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above + test(18.19, inherits(as.data.table(M)$index,"POSIXct")) + + # non-numeric xts coredata, #5268 + x = xts::xts(x=c(TRUE,FALSE), order.by=Sys.Date()+(1:2)) + colnames(x) = "value" # perhaps relates to #4897 + test(18.20, identical(x, as.xts(as.data.table(x), numeric.only=FALSE))) +} + +# was 2108 in tests.Rraw, #5516 +# first and last should no longer 
load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached +# stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces()) +x = as.POSIXct("2019-09-09")+0:1 +old = options(datatable.verbose=TRUE) +test(19.01, last(x), x[length(x)], output="!is.xts(x)") +test(19.02, first(x), x[1L], output="!is.xts(x)") +if (loaded[["xts"]]) { + xt = xts(1:2, x) + test(19.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)") + test(19.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)") + xt = xts(matrix(1:4, 2L, 2L), x) + test(19.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)") + test(19.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)") +} +# first on empty df now match head(df, n=1L), #3858 +df = data.frame(a=integer(), b=integer()) +test(19.11, first(df), df, output="!is.xts(x)") +test(19.12, last(df), df, output="!is.xts(x)") +options(datatable.verbose=FALSE) # so the as.data.table() doesn't pollute output +# xts last-first dispatch fix #4053 +x = 1:3 +y = as.POSIXct(x, origin="1970-01-01") +df = data.frame(a=1:2, b=3:2) +dt = as.data.table(df) +mx = matrix(1:9, 3, 3) +ar = array(1:27, c(3,3,3)) +xt = structure( + c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994, + 140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993, + 94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"), + .Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) +) +options(datatable.verbose=TRUE) +if (loaded[["xts"]]) { + test(19.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + xt_last = structure( + c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(1167955200, tzone = "UTC", tclass = "Date"), + .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + xt_last2 = structure( + c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005, + 141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"), + .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + test(19.25, last(xt), xt_last, output="using xts::last: is.xts(x)") + test(19.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)") + test(19.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.32, first(y, n=2L), y[1:2], output="using xts::first: 
!is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + test(19.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") + xt_first = structure( + c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(1167782400, tzone = "UTC", tclass = "Date"), + .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + xt_first2 = structure( + c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255), + class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", + index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"), + .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) + ) + test(19.35, first(xt), xt_first, output="using xts::first: is.xts(x)") + test(19.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)") +} else { + test(19.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.24, last(y, n=1L), y[3L], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.25, last(xt), error="you should have 'xts' installed already") + test(19.26, last(xt, n=2L), error="you should have 'xts' installed already") + test(19.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") + test(19.35, first(xt), error="you should have 'xts' installed already") + test(19.36, first(xt, n=2L), error="you should have 'xts' installed already") +} +test(19.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") +test(19.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(19.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(19.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +test(19.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") +# matrix/array utils::tail behavior is likely to change in future R, Michael is more in 
the topic +test(19.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127 +test(19.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +test(19.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127 +test(19.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") +options(old) + +if (loaded[["xts"]]) { # was 2133 in tests.Rraw, #5516 + # keep.rownames in as.data.table.xts() supports a string, #4232 + xts = xts::xts(1:10, structure(1:10, class = "Date")) + colnames(xts) = "VALUE" + DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE") + test(20.1, colnames(DT), c("DATE", "VALUE")) + test(20.2, key(DT), "DATE") + test(20.3, as.data.table(xts, keep.rownames = "VALUE"), + error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") + test(20.4, as.data.table(xts, keep.rownames = character()), + error = "keep.rownames must be length 1") + test(20.5, as.data.table(xts, keep.rownames = NA_character_), + error = "keep.rownames must not be NA") +} + +if (loaded[["nanotime"]]) { + + # was 1463.62-65 in tests.Rraw, #5516 + x=nanotime(1:4) + test(21.1, shift(x ), c(nanotime::nanotime(NA), x[1:3])) + test(21.2, shift(x, fill=0L), c(nanotime::nanotime(0L), x[1:3])) + test(21.3, shift(x, 1, type="cyclic"), c(x[4L], x[-4L])) + test(21.4, shift(x, -1, type="cyclic"), c(x[-1L], x[1L])) + + # was 1752 in tests.Rraw, #5516 + DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", + "2016-09-29T23:59:00.000000001Z", + "2016-09-29T23:59:00.000000999Z", + "1970-01-01T00:01:01.000001000Z", + "1970-01-01T00:00:00.000000000Z", + "1969-12-31T23:59:59.999999999Z", + "1969-12-31T23:59:59.000000089Z", + "1969-12-31T12:13:14.000000000Z", + "1969-12-31T12:13:14.999999999Z", + "1969-12-31T12:13:14.000000001Z", + "1967-03-15T00:00:00.300000002Z", + "1967-03-15T23:59:59.300000002Z"))) + test(22, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) + + # was 2060.401-405 in tests.Rraw, #5516 + nt = nanotime(c(1L, 2L, NA_integer_, 4L)) + nt_val = nanotime(1:4) + test(23.1, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46 + test(23.2, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val)) + test(23.3, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val)) + test(23.4, fcoalesce(nt, 1), error='Item 2 has a different class than item 1') + test(23.5, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double') + + # was 2080.01-05 in tests.Rraw, #5516 + n = nanotime(1:4) + n[2L] = NA + opt = options(datatable.verbose=TRUE) + test(24.1, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") + test(24.2, between(n, nanotime(3), nanotime(10), 
incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") + test(24.3, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") + options(opt) + test(24.4, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") + test(24.5, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") + + # was 2085.11 in tests.Rraw, #5516 + n = nanotime(1:4) + test(25, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4])) + + # was 2127.27 in tests.Rraw, #5516 + n = nanotime(1:12) + test(26, fcase(c(-5L:5L<0L,NA), n, c(-5L:5L>0L,NA), n+100), c(n[1L:5L], nanotime(NA), n[7L:11L]+100, as.integer64(NA))) + + # na.omit works for nanotime, #4744. Was 2205 in tests.Rraw, #5516 + DT = data.table(time=nanotime(c(1,NA,3))) + test(27, na.omit(DT), DT[c(1,3)]) + +} + + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 633a562c05..b242290dfc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -100,15 +100,12 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { yearqtr = data.table::yearqtr # zoo } -# Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN -# The reason for inclusion here is stated next to each package +# Optional suggests are now tested in other.Rraw, #5516. No calls to require() or library() should occur +# in this file other than for methods and data.table above, and these here. +# These are included in code coverage, and on CRAN. The reason for inclusion is stated next to each package. sugg = c( "bit64", # if big integers are detected in file, fread reads them as bit64::integer64 if installed (warning if not) - "xts", # we have xts methods in R/xts.R - "nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though) - "R.utils", # for fread to accept .gz and .bz2 files directly - "yaml" # for fread's yaml argument (csvy capability) - # zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though + "R.utils" # many fread test input files are compressed to save space; fundamental to test environment ) for (s in sugg) { assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( @@ -6798,13 +6795,7 @@ ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8))) setattr(ans, 'names', nm) test(1463.61, shift(x, 1:2, give.names=TRUE), ans) -if (test_nanotime) { - x=nanotime(1:4) - test(1463.62, shift(x ), c(nanotime::nanotime(NA), x[1:3])); - test(1463.63, shift(x, fill=0L), c(nanotime::nanotime(0L), x[1:3])); - test(1463.64, shift(x, 1, type="cyclic"), c(x[4L], x[-4L])); - test(1463.65, shift(x, -1, type="cyclic"), c(x[-1L], x[1L])); -} +# 1463.62-65 tested nanotime moved to other.Rraw 21, #5516 # shift circular x = 1:5 @@ -6837,106 +6828,7 @@ test(1464.12, rleidv(DT, 1:2), ans<-INT(1,2,3,4,5,6,6,6,7,8,8,9,10,11,12,13,14,1 test(1464.13, rleidv(DT, 2:1), ans) test(1464.14, rleidv(DT, c(3,1)), INT(1,1,2,2,3,4,5,5,6,7,8,9,10,11,12,13,14,15,16,17)) -if (test_xts) { - - Sys.unsetenv("_R_CHECK_LENGTH_1_LOGIC2_") - # package xts has an issue with an && clause (https://github.com/joshuaulrich/xts/pull/269). When that is fixed in xts and released to CRAN, we can remove this Sys.unsetenv - # Sys.setenv is called again at the end of this xts branch. 
The original env variable value was stored at the top of this file and restored at the end. - - # data.table-xts conversion #882 - # Date index - dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - dt_xt = as.data.table(xt) - xt_dt = as.xts.data.table(dt) - test(1465.01, all.equal(dt, dt_xt, check.attributes = FALSE)) - test(1465.02, xt, xt_dt) - # POSIXct index - dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - dt_xt = as.data.table(xt) - xt_dt = as.xts.data.table(dt) - test(1465.03, all.equal(dt, dt_xt, check.attributes = FALSE)) - test(1465.04, xt, xt_dt) - # index types returned from to.period - dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index) - xt_w = xts::to.weekly(xt) - xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w)) - xt_m = xts::to.monthly(xt) - xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m)) - xt_q = xts::to.quarterly(xt) - xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q)) - xt_y = xts::to.yearly(xt) - xt_dt_xt_y = as.xts.data.table(as.data.table(xt_y)) - test(1465.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE)) - test(1465.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE)) - test(1465.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE)) - test(1465.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE)) - - test(1465.09, xts::last(1:5), 5L) # was test 1531 - - # xts issue from Joshua, #1347 - x = as.Date(1:5, origin="2015-01-01") - test(1465.10, last(x), tail(x, 1L)) # was test 1559 - - x = xts(1:100, Sys.Date()+1:100) - test(1465.11, last(x,10), x[91:100,]) # was test 841 - # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. - # But that isn't tested by R CMD check because xts is loaded above data.table, there. 
- # So to make this test is relevant, run it in fresh R session directly, after: "require(xts);require(data.table)" - # rather than: "require(data.table);require(xts)" - # Which was the main thrust of bug#2312 fixed in v1.8.3 - - # fix for #1484; was test 1589 - x = xts::as.xts(8, order.by = as.Date("2016-01-03")) - test(1465.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE)) - - # IDate support in as.xts.data.table #1499; was test 1663 - dt <- data.table(date = c(as.IDate("2014-12-31"), - as.IDate("2015-12-31"), - as.IDate("2016-12-31")), - nav = c(100,101,99), - key = "date") - dt.xts <- as.xts.data.table(dt) - test(1465.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) - - # additional coverage missing uncovered in #3117 - dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) - xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) - test(1465.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) - names(xt)[1L] = 'index' - test(1465.15, as.data.table(xt), error = 'Input xts object should not') - names(xt)[1L] = 'quantity' - setcolorder(dt, c(3, 1, 2)) - if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286 - test(1465.16, as.xts(dt), error = 'data.table must have a time based') - setcolorder(dt, c(2, 3, 1)) - dt[ , char_col := 'a'] - test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') - if (base::getRversion() < "3.6.0") rm(as.xts) - - # 890 -- key argument for as.data.table.xts - x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) - old = options(datatable.verbose=FALSE) - test(1465.18, capture.output(as.data.table(x, key="index")), - c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", - " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", - " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", - " 9: 1970-01-10 9", "10: 1970-01-11 10")) - options(old) - - # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 - M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above - test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) - - # non-numeric xts coredata, #5268 - x = xts::xts(x=c(TRUE,FALSE), order.by=Sys.Date()+(1:2)) - colnames(x) = "value" # perhaps relates to #4897 - test(1465.20, identical(x, as.xts(as.data.table(x), numeric.only=FALSE))) - - Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) -} +# 1465 tested xts moved to other.Rraw 18, #5516 # as.data.table.default #969 ar <- array(NA, dim=c(10,4),dimnames = list(NULL,paste("col",1:4,sep=""))) @@ -11503,23 +11395,7 @@ test(1751.3, capture.output(fwrite(DT,na="NA",verbose=FALSE)), c("\"x\"","NA")) test(1751.4, fread({fwrite(DT, f<-tempfile());f}), DT) # the important thing unlink(f) -if (test_nanotime) { - old = options(warnPartialMatchArgs=FALSE) # option off temporarily pending https://github.com/eddelbuettel/nanotime/pull/49 - DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", - "2016-09-29T23:59:00.000000001Z", - "2016-09-29T23:59:00.000000999Z", - "1970-01-01T00:01:01.000001000Z", - "1970-01-01T00:00:00.000000000Z", - "1969-12-31T23:59:59.999999999Z", - "1969-12-31T23:59:59.000000089Z", - 
"1969-12-31T12:13:14.000000000Z", - "1969-12-31T12:13:14.999999999Z", - "1969-12-31T12:13:14.000000001Z", - "1967-03-15T00:00:00.300000002Z", - "1967-03-15T23:59:59.300000002Z"))) - options(old) - test(1752, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) -} +# 1752 tested nanotime moved to other.Rraw 22, #5516 # check too many fields error from ,\n line ending highlighted in #2044 test(1753.1, fread("X,Y\n1,2\n3,4\n5,6"), data.table(X=INT(1,3,5),Y=INT(2,4,6))) @@ -15106,210 +14982,7 @@ test(2030.18, .Last.updated, 0L) # zero match test(2031.01, rbind(data.table(A=1:3, B=7:9), data.table(A=4:6, B=as.list(10:12))), ans<-data.table(A=1:6, B=as.list(7:12))) test(2031.02, rbind(data.table(A=1:3, B=as.list(7:9)), data.table(A=4:6, B=10:12)), ans) -if (test_yaml) { # csvy; #1701 - f = testDir("csvy/test.csvy") - DT = data.table(var1 = c("A", "B"), - var2 = c(1L, 3L), - var3 = c(2.5, 4.3)) - DT_yaml = copy(DT) - setattr(DT_yaml, 'yaml_metadata', - list(name = "my-dataset", - source = "https://github.com/leeper/csvy/tree/master/inst/examples", - schema = list(fields = list( - list(name = "var1", title = "variable 1", type = "string", - description = "explaining var1", - constraints = list(list(required = TRUE))), - list(name = "var2", title = "variable 2", type = "integer"), - list(name = "var3", title = "variable 3", type = "number") - )))) - ## with skip = '__auto__', fread can figure out - ## how to start after the metadata (just ignoring it) - test(2032.01, fread(f), DT) - ## should be the same, but with yaml_metadata attribute - test(2032.02, fread(f, yaml = TRUE), DT_yaml) - ## testing verbose messaging - test(2032.03, fread(f, yaml = TRUE, verbose = TRUE), - DT_yaml, output = 'Processed.*YAML metadata.*') - ## this file is identical, except the body of the - ## YAML header is commented out with # (should read identically) - test(2032.04, - fread(testDir('csvy/test_comment.csvy'), yaml = TRUE), - DT_yaml) - ## user input is taken as most intentional & overrides YAML - DT_yaml[ , var2 := as.numeric(var2)] - test(2032.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')), - DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2') - ## extraneous/unused fields shouldn't throw off reading - DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE) - test(2032.06, names(DT), c('Date', 'WTI')) - test(2032.07, attr(DT, 'yaml_metadata'), - list(names = c("Date", "WTI"), class = "data.frame", - title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv", - fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv", - sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D", - source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples", - item = "PET", sourcekey = "RWTC", freq = "Daily", - rate = "MID", type = "price", units = "Dollars per Barrel", - latestdate = "2015-08-31", releasedate = "2015-09-02", - nextreleasedate = "2015-09-10", source = "Thomson Reuters", - contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800")) - ## yaml can also handle sep, dec, quote, and na.strings - DT_out = data.table(var1 = c("A", "B"), - var2 = c(1L, NA), - var3 = c(2.5, 4.3)) - meta = - list(name = NULL, - schema = list(fields = list( - list(name = "var1", title = "variable 1", type = "string", - description = "a single-quoted character variable"), - list(name = "var2", title = "variable 2", type = "integer"), - list(name = "var3", title = "variable 3", type = "number", - description = "European-style numeric") - )), 
- header = TRUE, sep = "|", dec = ",", - quote = "'", na.strings = "@") - attr(DT_out, 'yaml_metadata') = meta - test(2032.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out) - ## user-specified attributes can override data from YAML - meta$sep = "-" - setattr(DT_out, 'yaml_metadata', meta) - test(2032.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out, - message = 'User-supplied.*sep.*override') - - meta$sep = "|" - setattr(DT_out, 'yaml_metadata', meta) - test(2032.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE), - DT_out, message = 'User-supplied.*header.*override') - col.names = c('x', 'y', 'z') - setnames(DT_out, col.names) - test(2032.11, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out, - message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override')) - - test(2032.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names), - DT_out, message = 'User-supplied.*col.names') - - setnames(DT_out, c('var1', 'var2', 'var3')) - meta$quote = "^" - setattr(DT_out, 'yaml_metadata', meta) - test(2032.13, fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"), - DT_out, message = 'User-supplied.*quote') - - meta$quote = "'" - meta$dec = "." - setattr(DT_out, 'yaml_metadata', meta) - test(2032.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','), - DT_out, message = 'User-supplied.*dec') - - meta$dec = ',' - meta$na.strings = 'NA' - setattr(DT_out, 'yaml_metadata', meta) - test(2032.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'), - DT_out, message = 'User-supplied.*na.strings') - - ## error if YAML malformed - test(2032.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE), - error = 'Reached the end.*YAML.*valid csvy') - ## use any other CSV in test directory which doesn't have YAML - if (test_R.utils) test(2032.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE), - error = 'Encountered.*unskipped.*constitute.*valid YAML') - ## no problem if some fields are missing a type (just - ## resort to standard auto-inferral, i.e., identical to - ## the case of partially-specified colClasses) - DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L), - var3 = c(2.5, 4.3)) - setattr(DT, 'yaml_metadata', - list(name = "my-dataset", source = "https://github.com/leeper/csvy/tree/master/inst/examples", - schema = list(fields = list( - list(name = "var1"), list(name = "var2", type = "integer"), - list(name = "var3", type = "number") - )))) - test(2032.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT) - ## skip applies starting after the YAML header - setattr(DT, 'yaml_metadata', - list(schema = list(fields = list( - list(name = "var1", type = "string"), - list(name = "var2", type = "integer"), - list(name = "var3", type = "number") - )))) - test(2032.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT) - ## user-supplied col.names override metadata (as for colClasses) - cn = paste0('V', 1:3) - setnames(DT, cn) - test(2032.20, fread(testDir('csvy/test_skip.csvy'), - yaml = TRUE, skip = 2L, col.names = cn), - DT, message = 'User-supplied column names.*override.*YAML') - ## invalid value fails - test(2032.21, fread(f, yaml = 'gobble'), - error = 'isTRUEorFALSE\\(yaml\\) is not TRUE') - - ## warning that skip-as-search doesn't work with yaml - DT_yaml[ , var2 := as.integer(var2)] - test(2032.22, fread(f, skip 
= 'var1,', yaml = TRUE), - DT_yaml, warning = 'Combining a search.*YAML.*') - - # fwrite csvy: #3534 - tmp = tempfile() - DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) - # force eol for platform independence - fwrite(DT, tmp, yaml = TRUE, eol = '\n') - as_read = readLines(tmp) - test(2033.01, as_read[c(1L, 24L)], c('---', '---')) - test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) - test(2033.03, grepl('creation_time_utc', as_read[3L])) - test(2033.04, as_read[4:23], - c("schema:", " fields:", " - name: a", " type: integer", - " - name: b", " type: numeric", " - name: c", " type: character", - "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", - # NB: apparently \n is encoded like this in YAML - "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", - "logical01: no")) - tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") - test(2033.05, as_read[25:30], tbl_body) - - # windows eol - fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') - test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') - - # multi-class columns - DT[ , t := .POSIXct(1:5, tz = 'UTC')] - fwrite(DT, tmp, yaml = TRUE) - as_read = readLines(tmp) - test(2033.07, as_read[13L], " type: POSIXct") - - # ~invertibility~ - # fread side needs to be improved for Hugh's colClasses update - DT[ , t := NULL] - fwrite(DT, tmp, yaml = TRUE) - DT2 = fread(tmp, yaml = TRUE) - # remove metadata to compare - attr(DT2, 'yaml_metadata') = NULL - test(2033.08, all.equal(DT, DT2)) - - test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), - output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*")) - - # TODO: test gzip'd yaml which is now supported - - # yaml + bom arguments - DT = data.table(l=letters, n=1:26) - fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) - fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 - lines = readLines(fcon) - lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows - # remove the blank here so we don't need to change this test if/when that changes in yaml package - test(2033.11, length(lines), 48L) - close(fcon) - test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) - # re-write should have same output (not appended) - fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) - fcon = file(f, encoding="UTF-8") - lines = readLines(fcon) - lines = lines[lines!=""] - test(2033.13, length(lines), 48L) - close(fcon) - test(2033.14, fread(f), DT) - unlink(f) -} +# 2032-2033 tested yaml moved to other.Rraw 16-17, #5516 # fcast coverage DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8) @@ -15966,16 +15639,7 @@ if (test_bit64) { test(2060.304, fcoalesce(int64, 1), error='Item 2 has a different class than item 1') test(2060.305, fcoalesce(int64, 1L), error = 'Item 2 is type integer but the first item is type double') } -# nanotime tests -if (test_nanotime) { - nt = nanotime(int) - nt_val = nanotime(1:4) - test(2060.401, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46 - test(2060.402, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val)) - test(2060.403, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val)) - test(2060.404, fcoalesce(nt, 1), error='Item 2 has a different class than item 1') - test(2060.405, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the 
first item is type double') -} +# 2060.401-405 tested nanotime moved to other.Rraw 23, #5516 # setcoalesce x = c(11L, NA, 13L, NA, 15L, NA) y = c(NA, 12L, 5L, NA, NA, NA) @@ -16466,18 +16130,7 @@ test(2078.32, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE, NAboun test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA)) test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE)) test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA") -# nanotime support -if (test_nanotime) { - n=nanotime(1:4) - n[2L]=NA - op = options(datatable.verbose=TRUE) - test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") - test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") - test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") - options(op) - test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") - test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") -} +# 2080.01-05 tested nanotime moved to other.Rraw 24, #5516 # use raw type to cover fallback to R in between.R old = options(datatable.verbose=TRUE) test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FALSE, TRUE, TRUE, TRUE, FALSE), output="fallback to slow R") @@ -16521,10 +16174,7 @@ if (test_bit64) { i = as.integer64(1:4)+3e9 test(2085.01, fifelse(c(TRUE,FALSE,NA,TRUE), i, i+100), c(i[1L], i[2L]+100, as.integer64(NA), i[4])) } -if (test_nanotime) { - n = nanotime(1:4) - test(2085.11, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4])) -} +# 2085.11 tested nanotime moved to other.Rraw 25, #5516 test(2085.21, fifelse(c(TRUE,FALSE,NA), 1:3, c(1,2,3)), c(1,2,NA)) test(2085.22, fifelse(c(TRUE,FALSE,NA), c(1,2,3), 1:3), c(1,2,NA)) test(2085.31, fifelse(c(a=TRUE,b=FALSE), list(m=1,n=2), list(x=11,y=12)), list(a=1, b=12)) @@ -16756,109 +16406,7 @@ test(2107.3, names(DT), c('A','b','c')) setnames(DT, -(1:2), toupper) test(2107.4, names(DT), c('A','b','C')) -# first and last should no longer load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached -#stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces()) -x = as.POSIXct("2019-09-09")+0:1 -old = options(datatable.verbose=TRUE) -test(2108.01, last(x), x[length(x)], output="!is.xts(x)") -test(2108.02, first(x), x[1L], output="!is.xts(x)") -if (test_xts) { - xt = xts(1:2, x) - test(2108.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)") - test(2108.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)") - xt = xts(matrix(1:4, 2L, 2L), x) - test(2108.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)") - test(2108.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)") -} -# first on empty df now match head(df, n=1L), #3858 -df = data.frame(a=integer(), b=integer()) -test(2108.11, first(df), df, output="!is.xts(x)") -test(2108.12, last(df), df, output="!is.xts(x)") -options(old) -# xts last-first dispatch fix #4053 -x = 1:3 -y = as.POSIXct(x, origin="1970-01-01") -df = 
data.frame(a=1:2, b=3:2) -dt = as.data.table(df) -mx = matrix(1:9, 3, 3) -ar = array(1:27, c(3,3,3)) -xt = structure( - c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994, - 140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993, - 94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"), - .Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) -) -old = options(datatable.verbose=TRUE) -if (test_xts) { - test(2108.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - xt_last = structure( - c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(1167955200, tzone = "UTC", tclass = "Date"), - .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - xt_last2 = structure( - c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005, - 141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"), - .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - test(2108.25, last(xt), xt_last, output="using xts::last: is.xts(x)") - test(2108.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)") - test(2108.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - test(2108.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()") - xt_first = structure( - c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(1167782400, tzone = "UTC", tclass = "Date"), - .Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - xt_first2 = structure( - c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255), - class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", - index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"), - .Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted")) - ) - test(2108.35, first(xt), xt_first, output="using 
xts::first: is.xts(x)") - test(2108.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)") -} else { - test(2108.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.24, last(y, n=1L), y[3L], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.25, last(xt), error="you should have 'xts' installed already") - test(2108.26, last(xt, n=2L), error="you should have 'xts' installed already") - test(2108.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()") - test(2108.35, first(xt), error="you should have 'xts' installed already") - test(2108.36, first(xt, n=2L), error="you should have 'xts' installed already") -} -test(2108.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))") -test(2108.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(2108.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(2108.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -test(2108.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)") -# matrix/array utils::tail behavior is likely to change in future R, Michael is more in the topic -test(2108.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127 -test(2108.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -test(2108.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127 -test(2108.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") -options(old) +# 2108 tested xts moved to other.Rraw 19, #5516 # error in autonaming by={...}, #3156 DT = data.table(State=c("ERROR", "COMPLETED", "ERROR"), ExitCode=c(1, 0, 2)) @@ -17208,10 +16756,7 @@ if(test_bit64) 
{ i=as.integer64(1:12)+3e9 test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:11L]+100, as.integer64(NA))) } -if(test_nanotime) { - n=nanotime(1:12) - test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:11L]+100, as.integer64(NA))) -} +# 2127.27 tested nanotime moved to other.Rraw 26, #5516 test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec)) test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec) test(2127.30, fcase(test_vec1, rep("1",11L), test_vec2, rep("0",11L)), as.character(out_vec)) @@ -17376,20 +16921,8 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") rm(s1, s2, class2132) -if (test_xts) { - # keep.rownames in as.data.table.xts() supports a string, #4232 - xts = xts::xts(1:10, structure(1:10, class = "Date")) - colnames(xts) = "VALUE" - DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE") - test(2133.1, colnames(DT), c("DATE", "VALUE")) - test(2133.2, key(DT), "DATE") - test(2133.3, as.data.table(xts, keep.rownames = "VALUE"), - error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.") - test(2133.4, as.data.table(xts, keep.rownames = character()), - error = "keep.rownames must be length 1") - test(2133.5, as.data.table(xts, keep.rownames = NA_character_), - error = "keep.rownames must not be NA") -} + +# 2133 tested xts moved to other.Rraw 20, #5516 # friendlier error for common mistake of using := in i instead of j, #4227 DT = data.table(a = 1) @@ -18213,11 +17746,7 @@ test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty l test(2204, as.data.table(mtcars, keep.rownames='model', key='model'), setnames(setkey(as.data.table(mtcars, keep.rownames = TRUE), rn), 'rn', 'model')) -# na.omit works for nanotime, #4744 -if (test_nanotime) { - DT = data.table(time=nanotime(c(1,NA,3))) - test(2205, na.omit(DT), DT[c(1,3)]) -} +# 2205 tested nanotime moved to other.Rraw 27, #5516 # isRealReallyInt, #3966 test(2206.01, isRealReallyInt(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE) From 0eb2d9c488dc4560a36593b71194a93bcdb27dc3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 10 Nov 2022 14:01:04 -0800 Subject: [PATCH 501/588] report elapsed time even if tests fail (#5519) --- R/test.data.table.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index fd7750ef08..0f6525e652 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -165,7 +165,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (inherits(err,"try-error")) { # nocov start if (silent) return(FALSE) - stopf("Failed after test %s before the next test() call in %s", env$prevtest, fn) + stopf("Failed in %s after test %s before the next test() call in %s", timetaken(env$started.at), env$prevtest, fn) # the try() above with silent=FALSE will have already printed the error itself # nocov end } @@ -175,8 +175,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (nfail > 0L) { # nocov start stopf( - "%d error(s) out of %d. 
Search %s for test number(s) %s", - nfail, ntest, names(fn), toString(env$whichfail) + "%d error(s) out of %d. Search %s for test number(s) %s. Elapsed time: %s.", + nfail, ntest, names(fn), toString(env$whichfail), timetaken(env$started.at) ) # important to stopf() here, so that 'R CMD check' fails # nocov end From c344cee0e7459a43696c49d63bf79d39acf31c55 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 10 Nov 2022 19:38:18 -0800 Subject: [PATCH 502/588] Use 'duration', not repetitive 'elapsed' (#5521) --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 0f6525e652..3ea6683d5f 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -175,7 +175,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (nfail > 0L) { # nocov start stopf( - "%d error(s) out of %d. Search %s for test number(s) %s. Elapsed time: %s.", + "%d error(s) out of %d. Search %s for test number(s) %s. Duration: %s.", nfail, ntest, names(fn), toString(env$whichfail), timetaken(env$started.at) ) # important to stopf() here, so that 'R CMD check' fails From 1148ab9065a2b8360778d0f1fe395bb6c4faad18 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 11 Nov 2022 20:08:21 -0700 Subject: [PATCH 503/588] tables(mb=type_size) faster lower bound MB by default (#5524) --- NEWS.md | 2 ++ R/tables.R | 26 ++++++++++++++++++++++---- man/tables.Rd | 8 ++++---- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 15bf7e8eab..a79bcf32bc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -294,6 +294,8 @@ 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. +42. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
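For context on the `tables(mb=)` change described in item 42 above, a minimal sketch of the intended usage; the example table and its columns are illustrative only and not part of the patch:

```R
library(data.table)
DT = data.table(id = 1:1e5, grp = sample(letters, 1e5, TRUE))
tables()                          # default mb=type_size: fast lower-bound MB estimate
tables(mb = utils::object.size)   # slower, but a higher and more accurate estimate
tables(mb = FALSE)                # omit the MB column altogether
```
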
diff --git a/R/tables.R b/R/tables.R index 5196935eda..b62516b8fd 100644 --- a/R/tables.R +++ b/R/tables.R @@ -1,7 +1,24 @@ # globals to pass NOTE from R CMD check, see http://stackoverflow.com/questions/9439256 MB = NCOL = NROW = NULL -tables = function(mb=TRUE, order.col="NAME", width=80, +type_size = function(DT) { + # for speed and ram efficiency, a lower bound by not descending into character string lengths or list items + # if a more accurate and higher estimate is needed then user can pass object.size or alternative to mb= + # in case number of columns is very large (e.g. 1e6 columns) then we use a for() to avoid allocation of sapply() + ans = 0L + lookup = c("raw"=1L, "integer"=4L, "double"=8L, "complex"=16L) + for (i in seq_along(DT)) { + col = DT[[i]] + tt = lookup[storage.mode(col)] + if (is.na(tt)) tt = .Machine$sizeof.pointer + tt = tt*nrow(DT) + if (is.factor(col)) tt = tt + length(levels(col))*.Machine$sizeof.pointer + ans = ans + tt + } + ans + ncol(DT)*.Machine$sizeof.pointer # column name pointers +} + +tables = function(mb=type_size, order.col="NAME", width=80, env=parent.frame(), silent=FALSE, index=FALSE) { # Prints name, size and colnames of all data.tables in the calling environment by default @@ -13,6 +30,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } + if (isTRUE(mb)) mb=type_size # can still use TRUE, although TRUE will now be the lower faster type_size method DT_names = all_obj[is_DT] info = rbindlist(lapply(DT_names, function(dt_n){ DT = get(dt_n, envir=env) # doesn't copy @@ -20,7 +38,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, NAME = dt_n, NROW = nrow(DT), NCOL = ncol(DT), - MB = if (mb) round(as.numeric(object.size(DT))/1024^2), # object.size() is slow hence optional; TODO revisit + MB = if (is.function(mb)) round(as.numeric(mb(DT))/1024^2), COLS = list(names(DT)), KEY = list(key(DT)), INDICES = if (index) list(indices(DT))) @@ -38,9 +56,9 @@ tables = function(mb=TRUE, order.col="NAME", width=80, tt = copy(info) tt[ , NROW := pretty_format(NROW, width=4L)] tt[ , NCOL := pretty_format(NCOL, width=4L)] - if (mb) tt[ , MB := pretty_format(MB, width=2L)] + if (is.function(mb)) tt[ , MB := pretty_format(MB, width=2L)] print(tt, class=FALSE, nrows=Inf) - if (mb) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=",")) + if (is.function(mb)) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=",")) } invisible(info) } diff --git a/man/tables.Rd b/man/tables.Rd index 5b95edffa2..a8a74b0a7d 100644 --- a/man/tables.Rd +++ b/man/tables.Rd @@ -5,11 +5,11 @@ Convenience function for concisely summarizing some metadata of all \code{data.table}s in memory (or an optionally specified environment). } \usage{ -tables(mb=TRUE, order.col="NAME", width=80, +tables(mb=type_size, order.col="NAME", width=80, env=parent.frame(), silent=FALSE, index=FALSE) } \arguments{ - \item{mb}{ \code{logical}; \code{TRUE} adds the rough size of each \code{data.table} in megabytes to the output under column \code{MB}. } + \item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). 
A column \code{"MB"} is included in the output unless \code{FALSE} or \code{NULL}. } \item{order.col}{ Column name (\code{character}) by which to sort the output. } \item{width}{ \code{integer}; number of characters beyond which the output for each of the columns \code{COLS}, \code{KEY}, and \code{INDICES} are truncated. } \item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. } @@ -19,9 +19,9 @@ tables(mb=TRUE, order.col="NAME", width=80, \details{ Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt. -Note that on older versions of \R, \code{object.size} may be slow, so setting \code{mb=FALSE} may speed up execution of \code{tables} significantly. +`mb = utils::object.size` provides a higher and more accurate estimate of size, but may take longer. Its default `units="b"` is appropriate. -Setting \code{silent=TRUE} prints nothing; the metadata are returned as a \code{data.table}, invisibly, whether silent is \code{TRUE} or \code{FALSE}. +Setting \code{silent=TRUE} prints nothing; the metadata is returned as a \code{data.table} invisibly whether \code{silent} is \code{TRUE} or \code{FALSE}. } \value{ A \code{data.table} containing the information printed. From bc52ad00384c3abf8cf6e97f39243dd664c2c204 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 11 Nov 2022 22:34:46 -0700 Subject: [PATCH 504/588] tables() #5524 follow up; it turned out to be this nested data.table() call that 1538 in #5520 bumped up against; perhaps combined with object.size. Anyway, both now improved. 
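A minimal, self-contained sketch of the pattern this follow-up switches to inside tables(); the two small tables and column names below are illustrative only, and the real change is in the diff that follows:

```R
library(data.table)
A = data.table(x = 1:5); B = data.table(y = letters[1:3])
env = environment()
DT_names = c("A", "B")
# before: a data.table(NAME=..., NROW=..., NCOL=...) call per object inside lapply()
# after:  a plain list() per object, with column names set once on the combined result
info = rbindlist(lapply(DT_names, function(dt_n) {
  DT = get(dt_n, envir = env)   # doesn't copy
  list(dt_n, nrow(DT), ncol(DT))
}))
setnames(info, c("NAME", "NROW", "NCOL"))
```
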
--- R/tables.R | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/R/tables.R b/R/tables.R index b62516b8fd..d499d3ad71 100644 --- a/R/tables.R +++ b/R/tables.R @@ -34,15 +34,18 @@ tables = function(mb=type_size, order.col="NAME", width=80, DT_names = all_obj[is_DT] info = rbindlist(lapply(DT_names, function(dt_n){ DT = get(dt_n, envir=env) # doesn't copy - data.table( # data.table excludes any NULL items (MB and INDICES optional) unlike list() - NAME = dt_n, - NROW = nrow(DT), - NCOL = ncol(DT), - MB = if (is.function(mb)) round(as.numeric(mb(DT))/1024^2), - COLS = list(names(DT)), - KEY = list(key(DT)), - INDICES = if (index) list(indices(DT))) + list( # list() here was 9MB better than data.table() for tests.Rraw 1538, #5517 + dt_n, + nrow(DT), + ncol(DT), + if (is.function(mb)) round(as.numeric(mb(DT))/1024^2) else NA, + list(names(DT)), + list(key(DT)), + if (index) list(indices(DT)) else NA) })) + setnames(info, c("NAME","NROW","NCOL","MB","COLS","KEY","INDICES")) + if (!is.function(mb)) info[,MB:=NULL] + if (!index) info[,INDICES:=NULL] if (order.col != "NAME") { if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names From 200dfe5609a2277d5035550b49c8a35f26ac98eb Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 11 Nov 2022 23:49:32 -0700 Subject: [PATCH 505/588] no visible binding INDICES, #5524 --- R/tables.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tables.R b/R/tables.R index d499d3ad71..dd902d27ca 100644 --- a/R/tables.R +++ b/R/tables.R @@ -1,5 +1,5 @@ # globals to pass NOTE from R CMD check, see http://stackoverflow.com/questions/9439256 -MB = NCOL = NROW = NULL +MB = NCOL = NROW = INDICES = NULL type_size = function(DT) { # for speed and ram efficiency, a lower bound by not descending into character string lengths or list items From fe8623cfda0adcda857ead5971ab2a98fc871dbd Mon Sep 17 00:00:00 2001 From: Ofek Date: Sun, 13 Nov 2022 07:34:28 +0200 Subject: [PATCH 506/588] Limit deparse in name_dots to 1 line (#5501) --- NEWS.md | 9 +++++++++ R/utils.R | 2 +- inst/tests/benchmark.Rraw | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index a79bcf32bc..3b176b592e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -552,6 +552,15 @@ 53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance. +54. `data.table(...)` unnamed arguments are deparsed in an attempt to name the columns but when called from `do.call()` the input data itself was deparsed taking a very long time, [#5501](https://github.com/Rdatatable/data.table/pull/5501). Many thanks to @OfekShilon for the report and fix, and @michaelchirico for guidance. Unnamed arguments to `data.table(...)` may now be faster in other cases not involving `do.call()` too; e.g. expressions spanning a lot of lines or other function call constructions that led to the data itself being deparsed. + + ```R + DF = data.frame(a=runif(1e6), b=runif(1e6)) + DT1 = data.table(DF) # 0.02s before and after + DT2 = do.call(data.table, list(DF)) # 3.07s before, 0.02s after + identical(DT1, DT2) # TRUE + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. 
Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/utils.R b/R/utils.R index 575913d345..c81f9e58c5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -97,7 +97,7 @@ name_dots = function(...) { if (any(notnamed)) { syms = vapply_1b(dot_sub, is.symbol) # save the deparse() in most cases of plain symbol for (i in which(notnamed)) { - tmp = if (syms[i]) as.character(dot_sub[[i]]) else deparse(dot_sub[[i]])[1L] + tmp = if (syms[i]) as.character(dot_sub[[i]]) else deparse(dot_sub[[i]], nlines=1L)[1L] if (tmp == make.names(tmp)) vnames[i]=tmp } } diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index bf0bf77e9f..eb0f6fadc4 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -168,3 +168,10 @@ test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40")) # Add scaled-up non-ASCII forder test 1896 +# Before #5501 do.call(data.table,) fully deparsed large unnamed args, #5492. +DF = data.frame(a=runif(1e6), b=runif(1e6)) +t1 = system.time(DT1 <- data.table(DF)) # 0.02s before and after +t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s after +test(, identical(DT1, DT2)) +test(, t2["elapsed"]/t1["elapsed"]<2) + From dd2134e9aad6fd9d432b1cc0a35ca9c33fcd5dca Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 13 Nov 2022 19:32:57 -0700 Subject: [PATCH 507/588] more tables() #5524 follow up; removed lapply(list()) and stopped adding commas to NCOL, NROW and MB columns, and more --- R/tables.R | 63 +++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/R/tables.R b/R/tables.R index dd902d27ca..e47a1a42e8 100644 --- a/R/tables.R +++ b/R/tables.R @@ -22,46 +22,41 @@ tables = function(mb=type_size, order.col="NAME", width=80, env=parent.frame(), silent=FALSE, index=FALSE) { # Prints name, size and colnames of all data.tables in the calling environment by default - # include "hidden" objects (starting with .) via all.names=TRUE, but exclude ... specifically, #5197 - all_obj = grep("...", ls(envir=env, all.names=TRUE), invert=TRUE, fixed=TRUE, value=TRUE) - if (order.col=="NAME") all_obj=sort(all_obj) # neither ls() nor objects() had sorted arg in R 3.1.0 - is_DT = vapply_1b(mget(all_obj, envir=env), is.data.table) - if (!any(is_DT)) { + mb_name = as.character(substitute(mb)) + if (isTRUE(mb)) { mb=type_size; mb_name="type_size" } + names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .) + obj = mget(names, envir=env) # doesn't copy; mget is ok with ... 
unlike get, #5197 + w = which(vapply_1b(obj, is.data.table)) + if (!length(w)) { if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } - if (isTRUE(mb)) mb=type_size # can still use TRUE, although TRUE will now be the lower faster type_size method - DT_names = all_obj[is_DT] - info = rbindlist(lapply(DT_names, function(dt_n){ - DT = get(dt_n, envir=env) # doesn't copy - list( # list() here was 9MB better than data.table() for tests.Rraw 1538, #5517 - dt_n, - nrow(DT), - ncol(DT), - if (is.function(mb)) round(as.numeric(mb(DT))/1024^2) else NA, - list(names(DT)), - list(key(DT)), - if (index) list(indices(DT)) else NA) - })) - setnames(info, c("NAME","NROW","NCOL","MB","COLS","KEY","INDICES")) + info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0, COLS=list(), KEY=list(), INDICES=list()) + for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables + DT = obj[[w[i]]] + set(info, i, "NROW", nrow(DT)) + set(info, i, "NCOL", ncol(DT)) + if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1024^2)) + if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done + if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt) + if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt) + } if (!is.function(mb)) info[,MB:=NULL] if (!index) info[,INDICES:=NULL] - if (order.col != "NAME") { - if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) - info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names - } + if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) + info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names if (!silent) { - # prettier printing on console - pretty_format = function(x, width) { - format(prettyNum(x, big.mark=","), - width=width, justify="right") - } - tt = copy(info) - tt[ , NROW := pretty_format(NROW, width=4L)] - tt[ , NCOL := pretty_format(NCOL, width=4L)] - if (is.function(mb)) tt[ , MB := pretty_format(MB, width=2L)] - print(tt, class=FALSE, nrows=Inf) - if (is.function(mb)) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=",")) + # add commas into NROW, NCOL and MB when displayed on console + # but this added all these numbers as strings to the character cache which causes the character cache to + # grow especially with a lot of tables, or changing tables over time. 
Stopped for now to avoid a tipping + # point in RSS in #5520 + # pretty_format = function(x, width) format(prettyNum(x, big.mark=","), width=width, justify="right") + # tt = shallow(info) + # tt[ , NROW := pretty_format(NROW, width=4L)] + # tt[ , NCOL := pretty_format(NCOL, width=4L)] + # if (is.function(mb)) tt[ , MB := pretty_format(MB, width=2L)] + print(info, class=FALSE, nrows=Inf) + if (is.function(mb)) catf("Total: %sMB using %s\n", prettyNum(sum(info$MB), big.mark=","), mb_name) } invisible(info) } From 5affced991ab024b1be548a608b2c504dd64bc08 Mon Sep 17 00:00:00 2001 From: Jim Hester Date: Tue, 15 Nov 2022 04:15:23 -0500 Subject: [PATCH 508/588] Fix memory leak in fread (#4710) --- NEWS.md | 4 +++- R/onAttach.R | 4 ++++ inst/tests/benchmark.Rraw | 8 ++++++++ src/data.table.h | 1 + src/freadR.c | 10 +++++++--- src/init.c | 11 +++++++++++ 6 files changed, 34 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3b176b592e..dfca323781 100644 --- a/NEWS.md +++ b/NEWS.md @@ -560,7 +560,9 @@ DT2 = do.call(data.table, list(DF)) # 3.07s before, 0.02s after identical(DT1, DT2) # TRUE ``` - + +55. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now warns that known problems exist, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8-year-old R 3.1.0 (April 2014) to 5-year-old R 3.4.0 (April 2017). + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/onAttach.R b/R/onAttach.R index 9b71a6615c..6ff17972b3 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -35,6 +35,10 @@ else packageStartupMessagef("This is %s. This warning should not normally occur on Windows or Linux where OpenMP is turned on by data.table's configure script by passing -fopenmp to the compiler. If you see this warning on Windows or Linux, please file a GitHub issue.\n**********", Sys.info()["sysname"]) } + if (.Call(CbeforeR340)) { + # not base::getRversion()<"3.4.0" in case the user upgrades R but does not reinstall data.table; a reasonable mistake since data.table would seem to be the latest version + packageStartupMessagef("**********\nThis data.table installation was compiled for R < 3.4.0 (Apr 2017) and is known to leak memory. Please upgrade R and reinstall data.table to fix the leak. 
Maintaining and testing code branches to support very old versions increases development time so please do upgrade R. We intend to bump data.table's dependency from 8 year old R 3.1.0 (Apr 2014) to 5 year old R 3.4.0 (Apr 2017).\n**********") + } } } diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index eb0f6fadc4..401331a13e 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -175,3 +175,11 @@ t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s af test(, identical(DT1, DT2)) test(, t2["elapsed"]/t1["elapsed"]<2) +# fread leak, #3292 +dummy = rep("1\t2\t3\t4\t5", 10000000) +writeLines(dummy, "out.tsv") +start = gc()["Vcells",2] +for (i in 1:10) data.table::fread("out.tsv") +end = gc()["Vcells",2] +test(, end/start < 1.05) + diff --git a/src/data.table.h b/src/data.table.h index 552f2bf176..e57a428eac 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -323,6 +323,7 @@ SEXP nqRecreateIndices(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fsort(SEXP, SEXP); SEXP inrange(SEXP, SEXP, SEXP, SEXP); SEXP hasOpenMP(void); +SEXP beforeR340(void); SEXP uniqueNlogical(SEXP, SEXP); SEXP dllVersion(void); SEXP initLastUpdated(SEXP); diff --git a/src/freadR.c b/src/freadR.c index 82992aba33..bef1fa6f67 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -523,9 +523,13 @@ void setFinalNrow(size_t nrow) { if (length(DT)) { if (nrow == dtnrows) return; - for (int i=0; i=R_Version(3,4,0) + SET_GROWABLE_BIT(VECTOR_ELT(DT,i)); // #3292 + #endif } } R_FlushConsole(); // # 2481. Just a convenient place; nothing per se to do with setFinalNrow() diff --git a/src/init.c b/src/init.c index 53e0851592..dae13f8a72 100644 --- a/src/init.c +++ b/src/init.c @@ -116,6 +116,7 @@ R_CallMethodDef callMethods[] = { {"Cinrange", (DL_FUNC) &inrange, -1}, {"Cbetween", (DL_FUNC) &between, -1}, {"ChasOpenMP", (DL_FUNC) &hasOpenMP, -1}, +{"CbeforeR340", (DL_FUNC) &beforeR340, -1}, {"CuniqueNlogical", (DL_FUNC) &uniqueNlogical, -1}, {"CfrollfunR", (DL_FUNC) &frollfunR, -1}, {"CdllVersion", (DL_FUNC) &dllVersion, -1}, @@ -330,6 +331,16 @@ SEXP hasOpenMP(void) { } // # nocov end +SEXP beforeR340(void) { + // used in onAttach.R for message about fread memory leak fix needing R 3.4.0 + // at C level to catch if user upgrades R but does not reinstall data.table + #if defined(R_VERSION) && R_VERSION Date: Tue, 15 Nov 2022 04:06:51 -0700 Subject: [PATCH 509/588] Move ram tests (#5520) --- R/test.data.table.R | 22 +- R/utils.R | 10 + inst/tests/benchmark.Rraw | 317 +++++++++++++++++++++ inst/tests/other.Rraw | 28 ++ inst/tests/tests.Rraw | 564 ++++++++++---------------------------- 5 files changed, 514 insertions(+), 427 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 3ea6683d5f..bc512bdfa2 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -126,8 +126,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F on.exit(setwd(owd)) if (memtest) { - catf("\n***\n*** memtest=%d. This should be the first task in a fresh R session for best results. Ctrl-C now if not.\n***\n\n", memtest) - if (is.na(ps_mem())) stopf("memtest intended for Linux. Step through ps_mem() to see what went wrong.") + catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest) + if (is.na(rss())) stopf("memtest intended for Linux. 
Step through data.table:::rss() to see what went wrong.") } err = try(sys.source(fn, envir=env), silent=silent) @@ -197,7 +197,12 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") print(ans, class=FALSE) - plot(timings$RSS, main=basename(fn), ylab="RSS (MB)") + dev.new(width=14, height=7) + par(mfrow=c(1,2)) + plot(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS))) + mtext(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2) + plot(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)") + mtext(lastRSS, side=4, at=lastRSS, las=1, font=2) } catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) @@ -227,15 +232,6 @@ compactprint = function(DT, topn=2L) { INT = function(...) { as.integer(c(...)) } # utility used in tests.Rraw -ps_mem = function() { - # nocov start - cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB - ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) - if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case - round(ans / 1024, 1L) # return MB - # nocov end -} - gc_mem = function() { # nocov start # gc reports memory in MB @@ -280,7 +276,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] if (memtest) { if (memtest==1L) gc() # see #5515 for before/after - timings[as.integer(num), RSS:=max(ps_mem(),RSS), verbose=FALSE] + timings[as.integer(num), RSS:=max(rss(),RSS), verbose=FALSE] if (memtest==2L) gc() } assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest diff --git a/R/utils.R b/R/utils.R index c81f9e58c5..a78e5450f7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -156,3 +156,13 @@ edit.data.table = function(name, ...) { setDT(NextMethod('edit', name))[] } # nocov end + +rss = function() { #5515 #5517 + # nocov start + cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB + ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) + if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case + round(ans / 1024, 1L) # return MB + # nocov end +} + diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 401331a13e..04c5c490b4 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -175,6 +175,322 @@ t2 = system.time(DT2 <- do.call(data.table, list(DF))) # 3.07s before, 0.02s af test(, identical(DT1, DT2)) test(, t2["elapsed"]/t1["elapsed"]<2) +########################################################### +# largest tests by ram usage moved out of tests.Rraw, #5517 +########################################################### + +# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) +# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which +# this tests. But it's well under 10 seconds. 
+DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) +test(301, nrow(DT[,sum(B),by=C])==100010) +DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) +test(301.1, nrow(DT[,sum(B),by=C])==100010) + +# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. +options(datatable.optimize=0L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(637.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(637.3, key(DT[,a:=99L,by=a]), NULL) +options(datatable.optimize=2L) +set.seed(1) +DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") +test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) +test(638.2, key(DT[J(43L),a:=99L]), NULL) +setkey(DT,a) +test(638.3, key(DT[,a:=99L,by=a]), NULL) + +# Test X[Y] slowdown, #2216 +# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes +# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw. +X = CJ(a=seq_len(1e3),b=seq_len(1e3)) +Y = copy(X) +X[4,b:=3L] # create a dup group, to force allLen1=FALSE +setkey(X) +test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case +test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case + +# test uniqlengths +set.seed(45) +x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) +ox <- forderv(x) +o1 <- uniqlist(list(x), ox) +test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +o1 <- uniqlist(list(x)) +test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) +rm(list=c("x","ox","o1")) +gc() + +# Fix for (usually small) memory leak when grouping, #2648. +# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). +DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) +before = gc()["Vcells",2] +for (i in 1:50) DT[, sum(B), by=A] +after = gc()["Vcells",2] +test(1157, after < before+3) # +3 = 3MB +# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. + +# Similar for when dogroups writes less rows than allocated, #2648. +DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) +before = gc()["Vcells",2] +for (i in 1:50) DT[ , unlist(.SD), by = 'k'] +after = gc()["Vcells",2] +test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 + +# fix DT[TRUE, :=] using too much working memory for i, #1249 +if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled + f = tempfile() + N = 1000000 # or any large number of rows + DT = data.table(A=1:N, B=rnorm(N)) + DT[TRUE, B := B * 2] # stabilize with initial dummy update + Rprofmem(f) + DT[TRUE, B := B * 2] # or some in-place update + Rprofmem(NULL) + test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only + unlink(f) +} + +if (FALSE) { + # Full range takes too long for CRAN. 
+ dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") + dtsCh = as.character(dts) # 36s + dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 + test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) +} else { + # test on CRAN a reduced but important range + dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") + dtsCh = as.character(dts) + test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) +} +DT = data.table(A=dts, B=as.IDate(dts)) +test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) +test(1739.4, typeof(dts), "double") +f = tempfile() +g = tempfile() # Full range +fwrite(DT,f) # 0.092s +write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s +test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) +test(1739.6, readLines(f), readLines(g)) +unlink(f) +unlink(g) +rm(list=c("dtsCh","dts")) +gc() + +# catch malformed factor in rbindlist, #3315 +set.seed(32940) +NN=7e5; KK=4e4; TT=25 +DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) +test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites + +# print.data.table row id in non-scientific notation, #1167 +DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) +test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) +rm(DT) + +# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because +# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. +# Would need a very complicated construction of embedded new lines in quoted fields, to test that. +# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. +DT = as.data.table(CO2) +f = tempfile() +for (i in 0:1000) { + start = nrow(CO2)*i + fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) + if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) +} +test(1835, fread(f, verbose=TRUE), + output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", + warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") +unlink(f) + +# test no memory leak, #2191 and #2284 +# These take a few seconds each, and it's important to run these on CRAN to check no leak +gc(); before = gc()["Vcells","(Mb)"] +for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB +gc(); after = gc()["Vcells","(Mb)"] +test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin +gc(); before = gc()["Vcells","(Mb)"] +DF = data.frame(x=1:20, y=runif(20)) +for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } +gc(); after = gc()["Vcells","(Mb)"] +test(862, after < before+0.5) +gc(); before = gc()["Vcells","(Mb)"] +DT = data.table(x=1:20, y=runif(20)) +for (i in 1:2000) { x <- DT[1:5,]; rm(x) } +gc(); after = gc()["Vcells","(Mb)"] +test(863, after < before+0.5) + +# fread should use multiple threads on single column input. 
+# tests 2 threads; the very reasonable limit on CRAN +# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) +if (getDTthreads() == 1L) { + cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n") +} else { + N = if (TRUE) 2e6 else 1e9 # offline speed check + fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) + test(1760.1, file.info(f)$size > 4*1024*1024) + test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") + unlink(f) +} + +# segfault of unprotected var caught with the help of address sanitizer; was test 1509 +# in #5517 I figured this test shouldn't be reduced in size due to its nature +set.seed(1) +val = sample(c(1:5, NA), 1e4L, TRUE) +dt <- setDT(replicate(100L, val, simplify=FALSE)) +## to ensure there's no segfault... +ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) +test(1035.21, ans, ans) + +# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 +# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. +# 2 threads are sufficient to fail before the fix. +N = 20 +DF = data.frame(a=rnorm(N), + b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), + c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) +DT = setDT(DF) # setDT required since data.table() already expanded altrep's +before = sum(gc()[, 2]) +fff = function(aref) { + ff = lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) + return(rbindlist(ff)) +} +for(i in 1:100) { + f = fff("a") + rm("f") +} +gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` + # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. +after = sum(gc()[, 2]) +test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). +# +before = sum(gc()[, 2]) +fff = function(aref) { + DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. + lapply(1:5, function(i) { + DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] + }) +} +for(i in 1:100) { + fff("a") +} +gc() +after = sum(gc()[, 2]) +test(1912.2, after < before + 10) + +DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) +fwrite(DT, f<-tempfile()) +test(1815, fread(f, nrows=5), DT[1:5]) #2243: nrows small vs large nrow(DT) + +# Better jump sync and run-on in PR#2627 +# +# Reproduces error 'did not finish exactly where jump 1 found ...' 
in #2561 in master before PR #2627 +# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly +x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) +x[51094]="" +cat(x, file=f<-tempfile(), sep="\n") +test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], + data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), + output="jumps=[0..2)") # ensure jump 1 happened +# +# out-of-sample short lines in the first jump, not near the jump point +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), + warning="Stopped early on line 5021.*<>") +test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,5020L,NA,NA,5042L,102184L)), + output="jumps=[0..2)") +# +# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 +# confirmed fails in master with that error before PR#2627 +x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 +cat(x, file=f, sep="\n") +test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093), + warning="Stopped early on line 51094.*<>", + output="jumps=[0..2)") +test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], + data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), + V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), + V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), + V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), + V5=c(1L,51093L,NA,NA,51151L,102184L)), + output="jumps=[0..2)") +# +# jump inside a quoted field containing many new lines, to simulate a dirty jump +# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. +# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. 
+x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) +x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" +cat(x, file=f, sep="\n") +test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n + data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), + V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), + output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") +# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's +# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) +unlink(f) + +# chmatchdup test from benchmark at the bottom of chmatch.c +set.seed(45L) +x = sample(letters, 1e5, TRUE) +y = sample(letters, 1e6, TRUE) +test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) +rm(list=c("x","y")) + +# Add nq tests 1641-1652 here with larger sizes and calls that have been turned off in the past as took too long, and +# restore the exact parameters w.r.t. Jan's comment: https://github.com/Rdatatable/data.table/pull/5520#discussion_r1020180583 + +# issue 2351 +set.seed(1) +DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) +fwrite(DT, file=f<-tempfile(), eol="\r") +test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) +cat("id888,42", file=f, append=TRUE) # without final \r after last line +test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) +unlink(f) + +# segfault when rbindlist is asked to create a DT with more than 2bn rows +DT = data.table(1:1e6) +L = vector("list", 2148) +for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test +test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") +rm(L, DT) +gc() + +# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 +# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too +set.seed(1) +DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary +setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow +test(2201.1, nrow(DT[, .N, by=grp]), 255L) +test(2201.2, nrow(setkey(DT, grp)), 65536L) +set.seed(1) +DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun +test(2201.3, nrow(DT[, .N, by=grp]), 65536L) +test(2201.4, nrow(setkey(DT, grp)), 65536L) +setDTthreads() # restore default throttle + +# print of DT with many columns reordered them, #3306. +DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... 
bottom 5' to print +out = capture.output(print(DT)) +tt = out[grep("V",out)] +tt = unlist(strsplit(gsub(" ","",tt), "V")) +test(1982.1, tt[1L], "") +tt = as.integer(tt[tt!=""]) +test(1982.2, tt, seq_along(tt)) + # fread leak, #3292 dummy = rep("1\t2\t3\t4\t5", 10000000) writeLines(dummy, "out.tsv") @@ -183,3 +499,4 @@ for (i in 1:10) data.table::fread("out.tsv") end = gc()["Vcells",2] test(, end/start < 1.05) + diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index c6520a377c..807a67c19e 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -690,4 +690,32 @@ if (loaded[["nanotime"]]) { } +# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 +DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 ) +test(28.1, DT[,plot(b,f)], NULL) +test(28.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 +test(28.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) +try(graphics.off(),silent=TRUE) + +# test DT$.<- in a data.table-unaware package +# moved from tests.Rraw 1890 to here to save ram of loading stats package and plot, #5517 +DT = data.table(A=1:5) +test(29.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") +# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col() +test(29.2, DT, data.table(A=1:5)) + +if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure we need to still test reshape2 + # test dispatch for non-data.table objects, #4864. + if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { + test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + error="The melt generic in data.table has been passed a data.frame") + } else { + # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 + # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) + # 3) in dev locally I have reshape2 installed to run caret in other.Rraw + test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), + as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), + warning="The melt generic in data.table has been passed a data.frame") + } +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b242290dfc..7947ac0097 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7,6 +7,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") + rm_all = function() {} } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -52,6 +53,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { print.data.table = data.table:::print.data.table replace_dot_alias = data.table:::replace_dot_alias rollup.data.table = data.table:::rollup.data.table + rss = data.table:::rss selfrefok = data.table:::selfrefok setcoalesce = data.table:::setcoalesce setdiff_ = data.table:::setdiff_ @@ -98,6 +100,13 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { year = data.table::year # lubridate yearmon = data.table::yearmon # zoo yearqtr = data.table::yearqtr # zoo + + rm_all = function(env=parent.frame()) { + tt = setdiff(ls(envir=env), .do_not_rm) + 
rm(list=tt, envir=env) + gc() + invisible() + } } # Optional suggests are now tested in other.Rraw, #5516. No calls to require() or library() should occur @@ -156,6 +165,8 @@ base_messages = list( mixed_subscripts = get_msg(letters[-1:1]) ) +########################## +.do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") @@ -520,11 +531,7 @@ test(164, foo(f), DT[,mean(b),by=d]) test(165, subset(DT,a>2), DT[a>2]) test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2]) -# and that plotting works -test(167.1, DT[,plot(b,f)], NULL) -test(167.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0 -test(167.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer())) -try(graphics.off(),silent=TRUE) +# 167 tested graphics::plot, moved to other.Rraw 28 to save ram, #5517 # IDateTime conversion methods that ggplot2 uses (it calls as.data.frame method) # Since %b is e.g. "nov." in LC_TIME=fr_FR.UTF-8 locale, we need to @@ -961,13 +968,7 @@ DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) -# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) -# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which -# this tests. But it's well under 10 seconds. -DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) -test(301, nrow(DT[,sum(B),by=C])==100010) -DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) -test(301.1, nrow(DT[,sum(B),by=C])==100010) +# 301 moved to benchmark.Rraw, #5517 # Test fast assign DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a") @@ -1930,21 +1931,7 @@ DT = data.table(x=1:3,y=1:3) test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) -# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. -options(datatable.optimize=0L) -set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(637.2, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(637.3, key(DT[,a:=99L,by=a]), NULL) -options(datatable.optimize=2L) -set.seed(1) -DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") -test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) -test(638.2, key(DT[J(43L),a:=99L]), NULL) -setkey(DT,a) -test(638.3, key(DT[,a:=99L,by=a]), NULL) +# 637-638 moved to benchmark.Rraw, #5517 # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) @@ -1964,9 +1951,9 @@ test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of # Test faster mean with a lot of very small groups. Example from (now not needed as much) data.table wiki point 3. # benchmarks.Rraw contains the same, to be scaled up. 
set.seed(9) -n=1e4 # very small n so as not to overload daily CRAN checks. -DT=data.table(grp1=sample(1:150, n, replace=TRUE), - grp2=sample(1:150, n, replace=TRUE), +n=1e3 # very small n (1e4) so as not to overload daily CRAN checks. Then reduced even further to just 1e3, #5517 +DT=data.table(grp1=sample.int(150L, n, replace=TRUE), + grp2=sample.int(150L, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect. @@ -2423,16 +2410,7 @@ mycols = 2 test(814.12, DT[,!..mycols], ans) test(814.13, DT[,-..mycols], ans) - -# Test X[Y] slowdown, #2216 -# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes -# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw. -X = CJ(a=seq_len(1e3),b=seq_len(1e3)) -Y = copy(X) -X[4,b:=3L] # create a dup group, to force allLen1=FALSE -setkey(X) -test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case -test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case +# 819-820 moved to benchmark.Rraw, #5517 # Optimization of lapply(,"+"), #2212 DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12) @@ -2534,24 +2512,7 @@ i = data.frame(foo=1) test(859, DT[i], DT[J(i)]) test(860, DT[i], DT[data.table(i)]) -# test no memory leak, #2191 and #2284 -# These take a few seconds each, and it's important to run these on CRAN to check no leak -gc(); before = gc()["Vcells","(Mb)"] -for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB -gc(); after = gc()["Vcells","(Mb)"] -test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin - -gc(); before = gc()["Vcells","(Mb)"] -DF = data.frame(x=1:20, y=runif(20)) -for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } -gc(); after = gc()["Vcells","(Mb)"] -test(862, after < before+0.5) - -gc(); before = gc()["Vcells","(Mb)"] -DT = data.table(x=1:20, y=runif(20)) -for (i in 1:2000) { x <- DT[1:5,]; rm(x) } -gc(); after = gc()["Vcells","(Mb)"] -test(863, after < before+0.5) +# 861-863 moved to benchmark.Rraw, #5517 # rbindlist should look for the first non-empty data.table - New changes (from Arun). Explanation below: # Even if data.table is empty, as long as there are column names, they should be considered. @@ -3251,13 +3212,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.20, melt(DT, id.vars=1:2), data.table(A=1:2, B=3:4, variable=factor(rep(1L, 4L), labels="D"), value=5:8)) - # segfault of unprotected var caught with the help of address sanitizer; was test 1509 - set.seed(1) - val = sample(c(1:5, NA), 1e4L, TRUE) - dt <- setDT(replicate(100L, val, simplify=FALSE)) - ## to ensure there's no segfault... - ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) - test(1035.21, ans, ans) + # 1035.21 moved to benchmark.Rraw, #5517 # improper levels fix, #1359; was test 1563 dt = data.table(id=1:3, x=NA_character_, y=c('a', NA_character_, 'c')) @@ -3360,18 +3315,8 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") test(1037.414, melt(x, id.vars='x1', measure.vars='r'), error="Unknown column type 'raw' for column 'r'") - # test dispatch for non-data.table objects, #4864. 
- if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) { - test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - error="The melt generic in data.table has been passed a data.frame") - } else { - # 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2 - # 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded) - # 3) in dev locally I have reshape2 installed to run caret in other.Rraw - test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6), - as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)), - warning="The melt generic in data.table has been passed a data.frame") - } + # 1038 moved to other.Rraw, #5517 + } # sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105 @@ -4070,7 +4015,8 @@ if (test_longdouble) { old = getNumericRounding() set.seed(6) - x = rnorm(1e6)*1e4 + x = rnorm(1e4)*1e4 # first 1e4 reduced from 1e6 to save ram, #5517 + x = c(x, 11969.235757385, 11969.235757322) # add back 2 numbers from the 1e6 sample whose order is changed in test 1147.3 ans = base::sort.list(x, method="shell") setNumericRounding(0) test(1147.1, ans, forderv(x)) @@ -4104,16 +4050,7 @@ if (test_longdouble) { test(1149.1, forderv(integer(0)), integer(0)) test(1149.2, forderv(numeric(0)), integer(0)) -# test uniqlengths -set.seed(45) -x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE) -ox <- forderv(x) -o1 <- uniqlist(list(x), ox) -test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -o1 <- uniqlist(list(x)) -test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) -rm(list=c("x","ox","o1")) -gc() +# 1151 moved to benchmark.Rraw, #5517 # #67 fix - grouping with .SDcols gave "symbol not subsettable error" - consequence of FR #355 implementation dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20)) @@ -4153,21 +4090,7 @@ setkey(dt, x) test(1155.4, dt[J(NaN)], dt[is.nan(x)]) test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)]) -# Fix for (usually small) memory leak when grouping, #2648. -# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). -DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) -before = gc()["Vcells",2] -for (i in 1:50) DT[, sum(B), by=A] -after = gc()["Vcells",2] -test(1157, after < before+3) # +3 = 3MB -# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. - -# Similar for when dogroups writes less rows than allocated, #2648. -DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) -before = gc()["Vcells",2] -for (i in 1:50) DT[ , unlist(.SD), by = 'k'] -after = gc()["Vcells",2] -test(1158, after < before+3) # 177.6MB => 179.2MB. 
Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024 +# 1157-1158 moved to benchmark.Rraw, #5517 # tests for 'setDT' - convert list, DF to DT without copy x <- data.frame(a=1:4, b=5:8) @@ -4479,48 +4402,46 @@ seed = as.integer(Sys.time()) # sample(9999L, 1L) temporary fix, because all the seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") # no NaN (because it's hard to match with base::order); tested below in 1988.4-8 set.seed(seed) -foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, sep="") +foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, collapse="") i1 = as.integer(sample(c(-100:100), 1e3, TRUE)) i2 = as.integer(sample(c(-100:100, -1e6, 1e6), 1e3, TRUE)) d1 = as.numeric(sample(c(-100:100,Inf,-Inf), 1e3, TRUE)) d2 = as.numeric(rnorm(1e3)) -c1 = sample(c(letters), 1e3, TRUE) -c2 = sample(foo(200), 1e3, TRUE) +c1 = sample(letters, 1e3, TRUE) +c2 = sample(foo(50), 1e3, TRUE) DT = data.table(i1, i2, d1, d2, c1, c2) # randomise col order as well colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -ans = vector("list", length(names(DT))) test_no = 1223.0 oldnfail = nfail -for (i in seq_along(names(DT))) { - cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) - ans[[i]] = combn(names(DT), i, function(x) { - tmp = apply(cj, 1, function(y) { +for (nvars in seq_along(names(DT))) { + signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) + combn(names(DT), nvars, function(x) { + for (i in seq_len(nrow(signs))) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { - if (y[j] == 1L) + if (signs[i,j] == 1L) as.name(x[j]) else { - if (class(DT[[x[j]]]) =="character") + if (is.character(DT[[x[j]]])) as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j]))))) else as.call(list(as.name("-"), as.name(x[j]))) } }) )) - test(test_no, forderv(DT, by=x, order=y), with(DT, eval(ll))) - }) - dim(tmp)=NULL - list(tmp) + test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + } + integer() }) } -ans = NULL if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce +rm_all() # fix for bug #44 - unique on null data.table should return null data.table test(1224, unique(data.table(NULL)), data.table(NULL)) @@ -4616,7 +4537,7 @@ if (base::getRversion() < "3.3.0") { # Test for optimisation of 'order' to 'forder'. Copied to benchmarks.Rraw too. 
set.seed(45L) -DT = data.table(x=sample(1e2, 1e5, TRUE), y=sample(1e2, 1e5, TRUE)) +DT = data.table(x=sample.int(1e2, 1e3, TRUE), y=sample.int(1e2, 1e3, TRUE)) # 1e5 reduced again to 1e3, #5517 test(1241, DT[order(x,-y)], # optimized to forder() DT[base_order(x,-y)]) # not optimized @@ -4890,7 +4811,7 @@ test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], # Wide range numeric and integer64, to test all bits old_rounding = getNumericRounding() -x = sample( c(seq(-1e100, 1e100, length.out=1e5), c(seq(-1e-100,1e-100,length.out=1e5))) ) +x = sample( c(seq(-1e100, 1e100, length.out=1e3), c(seq(-1e-100,1e-100,length.out=1e3))) ) # 1e5 reduced to 1e3, #5517 setNumericRounding(0) test(1269, forderv(x), base::order(x)) setNumericRounding(2) # not affected by rounding @@ -5212,8 +5133,8 @@ DT = DT[1L] set(DT,1L,"b",FALSE) # passing 1L as i here is needed to avoid column plonk, so changes the logical singleton in place test(1297, as.integer(TRUE[1]), 1L) # In R 3.1, TRUE[1] returns the global TRUE but TRUE doesn't yet (parses as new vector) test(1298, as.integer(TRUE), 1L) -# orignal example, verbatim from James Sams : -upc_table = data.table(upc=1:100000, upc_ver_uc=rep(c(1,2), times=50000), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=25000), product_module_code=rep(1:4, times=25000), ignore.column=2:100001) +# orignal example, verbatim from James Sams; sizes reduced to save ram in #5517 +upc_table = data.table(upc=1:1000, upc_ver_uc=rep(c(1,2), times=500), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=250), product_module_code=rep(1:4, times=250), ignore.column=2:1001) test(1299, upc_table[, .N, by=list(upc, upc_ver_uc)][,max(N)], 1L) # all size 1 groups test(1300, upc_table[, list(is_PL, product_module_code), keyby=list(upc, upc_ver_uc)][,upc[1:3]], 1:3L) # was warning "internal TRUE value has been modified" rm(list="upc_table") @@ -7629,18 +7550,8 @@ dtab <- data.table(pid = factor(c("i", "nouana")), c("pid", "year")) test(1541, key(dtp[dtab]), c("pid", "year")) -# fix DT[TRUE, :=] using too much working memory for i, #1249 -if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled - f = tempfile() - N = 1000000 # or any large number of rows - DT = data.table(A=1:N, B=rnorm(N)) - DT[TRUE, B := B * 2] # stabilize with initial dummy update - Rprofmem(f) - DT[TRUE, B := B * 2] # or some in-place update - Rprofmem(NULL) - test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only - unlink(f) -} +# 1542.0 moved to benchmark.Rraw, #5517 + # DT[TRUE] should shallow copy as v1.11.8 and earlier did (#3214); in future more will shallow copy too DT = data.table(id = 1:5, key="id") DT1 = DT[TRUE] @@ -7789,10 +7700,7 @@ ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") -# #1167 print.data.table row id in non-scientific notation -DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5)) -test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c")) -rm(DT) +# 1549 moved to benchmark.Rraw, #5517 # PR by @dselivanov # fixes #504 - handle nastring while reading (without coercion to character) @@ -9193,6 +9101,8 @@ dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") 
options(datatable.optimize=Inf) +rm_all() + #1389 - split.data.table - big chunk of unit tests set.seed(123) dt = data.table(x1 = rep(letters[1:2], 6), x2 = rep(letters[3:5], 4), x3 = rep(letters[5:8], 3), y = rnorm(12)) @@ -9284,14 +9194,14 @@ test(1639.056, TRUE, all( sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset are expanded -test(1639.057, TRUE, all( - is.list(l), identical(names(l), c("b","a")), - sapply(l, function(x) !is.data.table(x) && is.list(x)), - sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), - identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))), - sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6), - sapply(l, sapply, sapply, ncol) == rep(4L, 24) -)) +# memtest tracing in #5520 showed this split() and the one before 1639.188 (both by 3 columns) account for the RAM usage in 1639. But they should be gc()'d eventually after rm_all(). +test(1639.0571, is.list(l)) +test(1639.0572, names(l), c("b","a")) +test(1639.0573, all(sapply(l, function(x) !is.data.table(x) && is.list(x)))) +test(1639.0574, all(sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)))) +test(1639.0575, lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))) +test(1639.0576, all(sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6))) +test(1639.0577, all(sapply(l, sapply, sapply, ncol) == rep(4L, 24))) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # multi col rev test(1639.058, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), @@ -9656,6 +9566,7 @@ test(1639.141, all(sapply(dtL, truelength) > 1000)) dt <- data.table(x = factor("a"), y = 1) test(1639.142, x = split(dt, by = "x"), y = list(a = dt)) test(1639.143, x = split(dt, by = "y"), y = list(`1` = dt)) +rm_all() # allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax # patch for #1615. 
Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] @@ -9668,10 +9579,10 @@ test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c( # tests for non-equi joins # function to create a random data.table with all necessary columns nq_fun = function(n=100L) { - i1 = sample(sample(n, 10L), n, TRUE) - i2 = sample(-n/2:n/2, n, TRUE) - i3 = sample(-1e6:1e6, n, TRUE) - i4 = sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE) + i1 = sample(sample.int(n, 10L), n, TRUE) + i2 = sample.int(n, n, TRUE) - as.integer(n/2) # this used to be type numeric before #5517 which didn't seem intentional + i3 = sample.int(2e6, n, TRUE) - as.integer(1e6) # used to sample from -1e6:1e6 which if allocated would be 8MB, #5517 + i4 = sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE) d1 = sample(rnorm(10L), n, TRUE) d2 = sample(rnorm(50), n, TRUE) @@ -9683,15 +9594,55 @@ nq_fun = function(n=100L) { dt = data.table(i1,i2,i3,i4, d1,d2,d3,d4, c1,c2) if (test_bit64) { - I1 = as.integer64(sample(sample(n, 10L), n, TRUE)) - I2 = as.integer64(sample(-n/2:n/2, n, TRUE)) - I3 = as.integer64(sample(-1e6:1e6, n, TRUE)) - I4 = as.integer64(sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE)) + I1 = as.integer64(sample(sample.int(n, 10L), n, TRUE)) + I2 = as.integer64(sample.int(n, n, TRUE) - as.integer(n/2)) + I3 = as.integer64(sample.int(2e6, n, TRUE) - as.integer(1e6)) # there used to be another -1e6:1e6 here whose altrep likely allocated when sample accessed it, #5517 + I4 = as.integer64(sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE)) dt = cbind(dt, data.table(I1,I2,I3,I4)) } dt } +construct <- function(cols, vals, ops, x, y) { + expr = lapply(seq_along(cols), function(i) { + GT_or_LT = ops[i]==">" || ops[i]=="<" + if (inherits(vals[[i]], "integer64")) { + if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) + # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN + } else { + if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) + else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) + else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) + } + }) + Reduce(function(x,y)call("&",x,y), expr) +} + +check <- function(x, y, cols, ops, mult="all") { + # gather just row numbers here and then select all rows once afterwards, rather than rbindlist + rowNums = unlist(lapply(1:nrow(y), function(i) { + e = construct(cols, y[i, ..cols], ops, x, y) + rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization + if (!length(rowNums) || mult=="all") + rowNums + else if (mult=="first") + rowNums[1L] + else # mult=="last" + rowNums[length(rowNums)] + })) + x[rowNums] +} + +nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { + sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) + ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] + setnames(ans, gsub("^x[.]", "", names(ans))) + setcolorder(ans, names(x))[] +} + +is_only_na <- function(x) is.na(x) & !is.nan(x) + nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { ops = c("==", ">=", "<=", ">", "<") xclass = sapply(x, class) @@ -9702,42 +9653,6 @@ 
nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { thisops[startsWith(cols, "c")] = "==" thisops }) - is_only_na <- function(x) is.na(x) & !is.nan(x) - construct <- function(cols, vals, ops) { - expr = lapply(seq_along(cols), function(i) { - GT_or_LT = ops[i]==">" || ops[i]=="<" - if (inherits(vals[[i]], "integer64")) { - if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) - # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN - } else { - if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) - else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) - else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) - } - }) - Reduce(function(x,y)call("&",x,y), expr) - } - check <- function(x, y, cols, ops, mult="all") { - # gather just row numbers here and then select all rows once afterwards, rather than rbindlist - rowNums = unlist(lapply(1:nrow(y), function(i) { - e = construct(cols, y[i, ..cols], ops) - rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization - if (!length(rowNums) || mult=="all") - rowNums - else if (mult=="first") - rowNums[1L] - else # mult=="last" - rowNums[length(rowNums)] - })) - x[rowNums] - } - nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { - sd_cols = c(paste0("x.", cols), setdiff(names(x), cols)) - ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult] - setnames(ans, gsub("^x[.]", "", names(ans))) - setcolorder(ans, names(x))[] - } for (i in seq_along(runcmb)) { thiscols = runcmb[[i]] thisops = runops[[i]] @@ -9750,7 +9665,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { gc() # no longer needed but left in place just in case, no harm } -dt1 = nq_fun(400L) +dt1 = nq_fun(100L) # 400 reduced to 100, #5517 dt2 = nq_fun(50L) x = na.omit(dt1) y = na.omit(dt2) @@ -10881,31 +10796,7 @@ test(1738.3, sapply(DT,typeof), c(A="double",B="integer")) test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L)) -if (FALSE) { - # Full range takes too long for CRAN. 
- dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day") - dtsCh = as.character(dts) # 36s - dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 - test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) -} else { - # test on CRAN a reduced but important range - dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day") - dtsCh = as.character(dts) - test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) -} -DT = data.table(A=dts, B=as.IDate(dts)) -test(1739.3, sapply(DT,typeof), c(A="double",B="integer")) -test(1739.4, typeof(dts), "double") -f = tempfile() -g = tempfile() # Full range -fwrite(DT,f) # 0.092s -write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s -test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) -test(1739.6, readLines(f), readLines(g)) -unlink(f) -unlink(g) -rm(list=c("dtsCh","dts")) -gc() +# 1739 moved to benchmark.Rraw, #5517 # dateTimeAs DT = data.table( @@ -11223,12 +11114,13 @@ test(1750.07, # 0 length `by`, must also use `sets=list()`, so 0L rows result nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = character(), .SDcols=c("amount","value"), sets=list(), id=TRUE)), 0L ) -test(1750.08, all( # for any single value from dataset there should be always same aggregate result on any level of grouping - sapply(seq_len(nrow(dt)), function(i) uniqueN( +# for any single value from dataset there should be always be the same aggregate result on any level of grouping +# changed from all(sapply()) to for() to save ram, #5517 +for (i in seq_len(nrow(dt))) { + test(1750.08+i/10000, uniqueN( groupingsets(dt[i], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character())), - by=c("amount","value") - )) == 1L -), TRUE) + by=c("amount","value")) == 1L) +} # all grouping id matches in all totals r = groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character()), id=TRUE) test(1750.09, uniqueN( @@ -11457,18 +11349,7 @@ if (test_R.utils) test(1759, fread(testDir("alluniquechar.csv.gz"))[c(1,2,499,50 H=c("tokakysooopwtmlkeimzbgpein","hguwmynjhecsxpxldyzlemavmw", "lyclruzkazfqhyxnppaafwcveo","myfqhltlwzwwxyvshwrzrdmfyq"))) -# fread should use multiple threads on single column input. -# tests 2 threads; the very reasonable limit on CRAN -# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently) -if (getDTthreads() == 1L) { - cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. 
under UBSAN and ASAN)\n") -} else { - N = if (TRUE) 2e6 else 1e9 # offline speed check - fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile()) - test(1760.1, file.info(f)$size > 4*1024*1024) - test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads") - unlink(f) -} +# 1760 moved to benchmark.Rraw, #5517 # fread single column with superfluous fill=TRUE, #2118 test(1761.1, fread("1\n2\n3", fill=TRUE), data.table(V1=1:3)) @@ -11813,10 +11694,10 @@ ld = sapply(same, as.IDate) test(1779.01, uniqueN(ld)==1L) lt = sapply(same[1:2], as.ITime) # exclude date test(1779.02, uniqueN(lt)==1L) -# some random 1e6 timestamps old defaults vs new methods UTC +# some random timestamps old defaults vs new methods UTC intpx = function(x) as.integer(as.POSIXct(x, origin = "1970-01-01", tz = "UTC")) set.seed(1) -i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e5, TRUE) + intpx("2014-10-12") +i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e3, TRUE) + intpx("2014-10-12") # 1e5 reduced to 1e3, #5517 p = as.POSIXct(i, origin = "1970-01-01", tz = "UTC") test(1779.03, identical(as.ITime.default(p), as.ITime(p))) test(1779.04, identical(as.IDate.default(p), as.IDate(p))) @@ -11888,9 +11769,7 @@ test(1812, fread("A,B\n1,2\n3,4\n", skip="4", verbose=TRUE), data.table(V1=3L, V test(1813, fread("A,B\n1,2\n3,4", skip=10L), error="skip=10 but the input only has 3 lines") test(1814, fread("A,B\n1,2\n3,4\n \n\t", skip=3L), error="skip has been set after the last non-whitespace") -DT = data.table(A=seq(1, 1000000), B="x", C=TRUE) -fwrite(DT, f<-tempfile()) -test(1815, fread(f, nrows=5), DT[1:5]) #2243 +# 1815 moved to benchmark.Rraw, #5517 test(1816.1, fread("A,E\n1,2\n5,7\n4,6\n\x1A\x1A", verbose=TRUE), data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L)), @@ -12007,14 +11886,7 @@ fwrite(DT, f) test(1825.22, fread(f, colClasses = c(a = "numeric", b = "integer")), DT, warning="Attempt to override column 2.*ignored") unlink(f) -# issue 2351 -set.seed(1) -DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE)) -fwrite(DT, file=f<-tempfile(), eol="\r") -test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L))) -cat("id888,42", file=f, append=TRUE) # without final \r after last line -test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L))) -unlink(f) +# 1826 moved to benchmark.Rraw, #5517 # Issue 2222 test(1827.1, fread("A,B\n1987,1\n1987,3\n", na.strings=c("1987", "NA")), data.table(A=c(NA,NA),B=c(1L,3L))) @@ -12102,21 +11974,7 @@ if (test_R.utils) { V12=c("AAAAAAAAAAAAA","","AAAAAAA","AAA"))) } -# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because -# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not. -# Would need a very complicated construction of embedded new lines in quoted fields, to test that. -# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN. -DT = as.data.table(CO2) -f = tempfile() -for (i in 0:1000) { - start = nrow(CO2)*i - fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE) - if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) -} -test(1835, fread(f, verbose=TRUE), - output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped", - warning = "Stopped.*line 42253. 
Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") -unlink(f) +# 1835 moved to benchmark.Rraw, #5517 test(1836, fread('1,2,"3,a"\n4,5,"6,b"'), data.table(V1=c(1L,4L), V2=c(2L,5L), V3=c("3,a","6,b"))) # 2196 @@ -12221,7 +12079,7 @@ rand_strings = function(n) { apply(M, 1, function(x) paste0(letters[x], collapse="")) } set.seed(123) # the random data here doesn't match the data in issue 2275 because they used stringi::stri_rand_strings which has a different RNG -n = 100000 +n = 1000 # reduced from 100000 to 1000 for #5517 DT1 = data.table(RANDOM_STRING = rand_strings(n), DATE = sample(seq(as.Date('2016-01-01'), as.Date('2016-12-31'), by="day"), n, replace=TRUE)) DT2 = data.table(RANDOM_STRING = rand_strings(n), @@ -12266,13 +12124,7 @@ test(1849.9, fread(f, select=c("Date", "Description", "Balance")), data.table(Date=20150725L,Description="abcd",Balance="$5,006")) unlink(f) -# segfault when rbindlist is asked to create a DT with more than 2bn rows -DT = data.table(1:1e6) -L = vector("list", 2148) -for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test -test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647") -rm(list=c("L","DT")) -gc() +# 1850 moved to benchmark.Rraw, #5517 # by=.EACHI missings to list columns, #2300 dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5])) @@ -12577,60 +12429,7 @@ fwrite(DT,f<-tempfile()) test(1873, fread(f), DT) unlink(f) -# Better jump sync and run-on in PR#2627 -# -# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627 -# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly -x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184) -x[51094]="" -cat(x, file=f<-tempfile(), sep="\n") -test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),], - data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")), - output="jumps=[0..2)") # ensure jump 1 happened -# -# out-of-sample short lines in the first jump, not near the jump point -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[5021:5041] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020), - warning="Stopped early on line 5021.*<>") -test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,5020L,NA,NA,5042L,102184L)), - output="jumps=[0..2)") -# -# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267 -# confirmed fails in master with that error before PR#2627 -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51094:51150] = "small,batch,short,lines" # 4 fields not 5 -cat(x, file=f, sep="\n") -test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093), - warning="Stopped early on line 51094.*<>", - output="jumps=[0..2)") -test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),], - data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"), - V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"), - V3=c("KLMN","KLMN","short","short","KLMN","KLMN"), - 
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"), - V5=c(1L,51093L,NA,NA,51151L,102184L)), - output="jumps=[0..2)") -# -# jump inside a quoted field containing many new lines, to simulate a dirty jump -# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too. -# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps. -x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184) -x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093" -cat(x, file=f, sep="\n") -test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n - data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"), - V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)), - output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)") -# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's -# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth) -unlink(f) +# 1874-1875 moved to benchmark.Rraw, #5517 test(1876, fread("http://hkhfsk\nhttp://fhdkf\nhttp://kjfhskd\nhttp://hfkjf", header=FALSE), # data not a download, #2531 data.table(V1=c("http://hkhfsk","http://fhdkf","http://kjfhskd","http://hfkjf"))) @@ -12724,7 +12523,7 @@ DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784 test(1887.3, na.omit(DT), DT[0L]) test(1887.4, na.omit(DT, invert=TRUE), DT) -x = runif(1e4) +x = runif(1e3) # 1e4 reduced to 1e3 in #5517 but really it was the 1e6 just after 1888.5 below which is now 1e3 too test(1888, fsort(x), base::sort(x)) test(1888.1, fsort(x, decreasing = TRUE), base::sort(x, decreasing = TRUE), warning = "New parallel sort has not been implemented for decreasing=TRUE.*one thread") @@ -12738,7 +12537,7 @@ test(1888.4, fsort(x, decreasing = TRUE, na.last = TRUE), base::sort(x, decreasi x <- as.integer(x) test(1888.5, fsort(x), base::sort(x, na.last = FALSE), warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread") -x = runif(1e6) +x = runif(1e3) test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") @@ -12751,11 +12550,7 @@ test(1889, chmatch(x,x), 1:1000) rm(list=x) gc() -# test DT$.<- in a data.table-unaware package -DT = data.table(A=1:5) -test(1890.1, stats::ts.plot(gpars=DT), error="object must have one or more observations") -# Inside ts.plot is a gpars$ylab<- which happens before its error. 
That dispatches to our $<- which does the alloc.col() -test(1890.2, DT, data.table(A=1:5)) +# 1890 used stats::ts.plot, moved to other.Rraw 29 to save ram, #5517 # na="" default, #2524 test(1891.1, fread('A,B,C\n1,foo,4\n2,,5\n3,bar,6\n', na.strings=""), data.table(A=1:3, B=c("foo",NA,"bar"), C=4:6)) @@ -12971,43 +12766,7 @@ test(1911.2, DT[, COL_INT := integer(0)], error = "RHS of assignment to existing column 'COL_INT' is zero length but not NULL.*") -# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 -# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. -# 2 threads are sufficient to fail before the fix. -N = 20 -DF = data.frame(a=rnorm(N), - b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]), - c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5])) -DT = setDT(DF) # setDT required since data.table() already expanded altrep's -before = sum(gc()[, 2]) -fff = function(aref) { - ff = lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) - return(rbindlist(ff)) -} -for(i in 1:100) { - f = fff("a") - rm("f") -} -gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after` - # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm. -after = sum(gc()[, 2]) -test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up). -# -before = sum(gc()[, 2]) -fff = function(aref) { - DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race. - lapply(1:5, function(i) { - DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]] - }) -} -for(i in 1:100) { - fff("a") -} -gc() -after = sum(gc()[, 2]) -test(1912.2, after < before + 10) +# 1912 moved to benchmark.Rraw, #5517 # BEGIN port of old testthat tests, #2740. Issue numbers may be from R-forge. # @@ -14212,11 +13971,7 @@ test(1977.4, DT["D", -"GRP"], data.table(ID="D", X=NA_real_, key="ID")) test(1977.5, DT["D", c("ID","GRP")], data.table(ID="D", GRP=NA_integer_, key="ID")) test(1977.6, DT[c("A","D"), c("ID","GRP")], data.table(ID=c("A","A","D"), GRP=INT(1,1,NA))) -# catch malformed factor in rbindlist, #3315 -set.seed(32940) -NN=7e5; KK=4e4; TT=25 -DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) ) -test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites +# 1978 moved to benchmark.Rraw, #5517 # Drop Null Values from `j` list elements #1406 DT = data.table(a = 1:3,b = letters[1:3],c = LETTERS[1:3]) @@ -14236,14 +13991,7 @@ DT = data.table( id = 1:5 , val = letters[1:5] ) test(1981.3, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -# print of DT with many columns reordered them, #3306. -DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... 
bottom 5' to print -out = capture.output(print(DT)) -tt = out[grep("V",out)] -tt = unlist(strsplit(gsub(" ","",tt), "V")) -test(1982.1, tt[1L], "") -tt = as.integer(tt[tt!=""]) -test(1982.2, tt, seq_along(tt)) +# 1982 moved to benchmark.Rraw, #5517 # parse(text = 'list(`\\phantom{.}`)') fails, #3319 DT <- data.table(x=1, y=1:5) @@ -14532,12 +14280,7 @@ dx = data.table(id = 1L, key = "id") di = list(z=c(2L, 1L)) test(1999.2, key(dx[di]), NULL) -# chmatchdup test from benchmark at the bottom of chmatch.c -set.seed(45L) -x = sample(letters, 1e5, TRUE) -y = sample(letters, 1e6, TRUE) -test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406)) -rm(list=c("x","y")) +# 2000 moved to benchmark.Rraw, #5517 # rbindlist use.names=TRUE returned random column order when ncol>255; #3373 DT = setDT(replicate(300, rnorm(3L), simplify = FALSE)) @@ -16483,8 +16226,8 @@ g = function(x) { if (x==1L) factor(c("a","b")) else factor(c("a","b","c")) } test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a","b","a","b","c")))) # original test verbatim from the same issue #2199 set.seed(2) -ids = sample(letters, 20) -dates = 1:40 +ids = sample(letters, 10) # reduced from 20 to 10 +dates = 1:10 # and 40 to 10 to save ram, #5517 dt = data.table(CJ(dates, ids, ids)) setnames(dt, c("date", "id1", "id2")) dt[, value := rnorm(length(date))] @@ -16495,8 +16238,8 @@ f1 = function(sdt) { melt.data.table(dt1, id.vars = "id1") } res = dt[, f1(.SD), by=date] -test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)][], dt[c(1,.N)]) -test(2114.4, print(res), output="date.*0.433") +test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)], dt[c(1,.N)]) +test(2114.4, print(res), output="date.*-0.830") # and from #2522 DT = data.table(id=1:9, grp=rep(1:3,each=3), val=c("a","b","c", "a","b","c", "a","b","c")) test(2114.5, as.character(DT[, valfactor1 := factor(val), by = grp]$valfactor1), ans<-rep(c("a","b","c"),3)) @@ -17695,18 +17438,7 @@ d[1:50, "a"] = d[51:100, "a"] setDT(d) test(2200, nrow(d[a==99]), 2L) -# segfault in forder when nrow/throttle=255 && nrow>=65536; #5077 -# Matt ran these on clang's ASAN+OpenMP which correctly faulted v1.14.0; these tests segfault consistently without ASAN too -set.seed(1) -DT = data.table(grp=sample(255L, 65536L ,replace=TRUE)) # >=255 && >=65536 necessary -setDTthreads(throttle=nrow(DT)) # increase throttle to reduce threads to 1 for this nrow -test(2201.1, nrow(DT[, .N, by=grp]), 255L) -test(2201.2, nrow(setkey(DT, grp)), 65536L) -set.seed(1) -DT = data.table(grp=sample(65536L)) # extra case with all size 1 groups too just for fun -test(2201.3, nrow(DT[, .N, by=grp]), 65536L) -test(2201.4, nrow(setkey(DT, grp)), 65536L) -setDTthreads() # restore default throttle +# 2201 moved to benchmark.Rraw, #5517 # fwrite now allows sep="", #4817 test(2202.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), @@ -18021,26 +17753,30 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE), l = sample(c(TRUE, FALSE, NA), 20, TRUE), r = as.raw(sample(1:5, 20, TRUE))) -load(testDir("test2224.Rdata")) # ans array +load(testDir("test2224.Rdata")) # 47KB array 24x8 where each cell contains a length-20 result if (test_bit64) { DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))] } else { ans = ans[, -match("i64",colnames(ans))] } -test(2224.01, sapply(names(DT)[-1], function(col) { - sapply(list(1, 5, -1, -5, c(1,2), c(-1,1)), function(n) 
list( - # fill is tested by group in tests 2218.*; see comments in #5205 - EVAL(sprintf("DT[, shift(%s, %d, type='lag'), by=x]$V1", col, n)), - EVAL(sprintf("DT[, shift(%s, %d, type='lead'), by=x]$V1", col, n)), - EVAL(sprintf("DT[, shift(%s, %d, type='shift'), by=x]$V1", col, n)), - EVAL(sprintf("DT[, shift(%s, %d, type='cyclic'), by=x]$V1", col, n)) - )) -}), ans) +i = 1L +for (col in names(DT)[-1]) { + for (n in list(1, 5, -1, -5, c(1,2), c(-1,1))) { + for (type in c('lag','lead','shift','cyclic')) { + # fill is tested by group in tests 2218.*; see comments in #5205 + # sapply(sapply()) changed to for(for(for())) to save 29MB, #5517 + test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE + EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), + ans[[i]]) + i = i+1L + } + } +} a = 1:2 # fill argument with length > 1 which is not a call -test(2224.02, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") +test(2224.2, DT[, shift(i, fill=a), by=x], error="fill must be a vector of length 1") DT = data.table(x=pairlist(1), g=1) # unsupported type as argument -test(2224.03, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") +test(2224.3, DT[, shift(x), g], error="Type 'list' is not supported by GForce gshift.") # groupingsets by named by argument test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), From e8a594c7727fc2a4c001f3d526b4fb0e68a27ebd Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 15 Nov 2022 05:58:15 -0700 Subject: [PATCH 510/588] #4710 and #5520 follow up; combn() in R 3.1.0, move #if up to data.table.h thanks Jan, plotting in dev memtest need not require imports of standard functions (strange) --- R/test.data.table.R | 12 ++++++------ inst/tests/tests.Rraw | 2 +- src/data.table.h | 3 +++ src/freadR.c | 2 -- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index bc512bdfa2..c81c4b1f39 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -197,12 +197,12 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") print(ans, class=FALSE) - dev.new(width=14, height=7) - par(mfrow=c(1,2)) - plot(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS))) - mtext(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2) - plot(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)") - mtext(lastRSS, side=4, at=lastRSS, las=1, font=2) + get("dev.new")(width=14, height=7) + get("par")(mfrow=c(1,2)) + get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0,max(timings$RSS))) + get("mtext")(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4, at=lastRSS, las=1, font=2) + get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)") + get("mtext")(lastRSS, side=4, at=lastRSS, las=1, font=2) } catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7947ac0097..3cbe67680f 100644 --- a/inst/tests/tests.Rraw +++ 
b/inst/tests/tests.Rraw @@ -4420,7 +4420,7 @@ test_no = 1223.0 oldnfail = nfail for (nvars in seq_along(names(DT))) { signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) - combn(names(DT), nvars, function(x) { + combn(names(DT), nvars, simplify=FALSE, function(x) { # simplify=FALSE needed for R 3.1.0 for (i in seq_len(nrow(signs))) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("order"), diff --git a/src/data.table.h b/src/data.table.h index e57a428eac..c4458e8999 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -7,6 +7,9 @@ # define USE_RINTERNALS // #3301 # define DATAPTR_RO(x) ((const void *)DATAPTR(x)) #endif +#if !defined(R_VERSION) || R_VERSION < R_Version(3, 4, 0) +# define SET_GROWABLE_BIT(x) // #3292 +#endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT #include // for uint64_t rather than unsigned long long diff --git a/src/freadR.c b/src/freadR.c index bef1fa6f67..6b12210f5f 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -527,9 +527,7 @@ void setFinalNrow(size_t nrow) { for (int i=0; i=R_Version(3,4,0) SET_GROWABLE_BIT(VECTOR_ELT(DT,i)); // #3292 - #endif } } R_FlushConsole(); // # 2481. Just a convenient place; nothing per se to do with setFinalNrow() From 1b324637045c97b40960fa64a4dc4dc27b00596a Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 15 Nov 2022 23:24:32 -0700 Subject: [PATCH 511/588] 1.14.6 patch submitted to CRAN. Moved items down to its NEWS section and bumped dev to 1.14.7. Will tag patch-1.14 branch when publish date is confirmed in case there are revisions. --- DESCRIPTION | 2 +- NEWS.md | 19 +++++++++++++++---- src/init.c | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 55754ba976..b1809aad8e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.14.5 +Version: 1.14.7 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods diff --git a/NEWS.md b/NEWS.md index dfca323781..502a452a6a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -# data.table [v1.14.5](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.7](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES @@ -561,8 +561,6 @@ identical(DT1, DT2) # TRUE ``` -55. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now warns that known problems exist, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8-year-old R 3.1.0 (April 2014) to 5-year-old R 3.4.0 (April 2017). - ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. 
However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -612,7 +610,20 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). -16. `test.data.table()` no longer creates `DT` in `.GlobalEnv` and gains `memtest=` for use on Linux to report which tests use the most memory. + +# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) + +## BUG FIXES + +1. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now highlights this issue on startup, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8 year old R 3.1.0 (April 2014) to 5 year old R 3.4.0 (April 2017). + +## NOTES + +1. Test 1962.098 has been modified to pass latest changes to `POSIXt` in R-devel. + +2. `test.data.table()` no longer creates `DT` in `.GlobalEnv`, a CRAN policy violation, [#5514](https://github.com/Rdatatable/data.table/issues/5514). No other writes occurred to `.GlobalEnv` and release procedures have been improved to prevent this happening again. + +3. The memory usage of the test suite has been halved, [#5507](https://github.com/Rdatatable/data.table/issues/5507). # data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) diff --git a/src/init.c b/src/init.c index dae13f8a72..9c5afd905a 100644 --- a/src/init.c +++ b/src/init.c @@ -351,6 +351,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion(void) { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.14.5"))); + return(ScalarString(mkChar("1.14.7"))); } From 490b42dd24dedd584e47e6b96b604b27bb2fcb3a Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 16 Nov 2022 16:30:51 -0700 Subject: [PATCH 512/588] NEWS publish date for 1.14.6 added to title (see patch-1.14 branch for tag); Makefile & CRAN_Release version number bump --- .dev/CRAN_Release.cmd | 34 +++++++++++++++++----------------- Makefile | 6 +++--- NEWS.md | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 8495c4998b..be4b55a037 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -195,15 +195,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. 
-R CMD check data.table_1.14.1.tar.gz --as-cran -R CMD INSTALL data.table_1.14.1.tar.gz --html +R CMD check data.table_1.14.7.tar.gz --as-cran +R CMD INSTALL data.table_1.14.7.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.14.1.tar.gz +R CMD check data.table_1.14.7.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -220,9 +220,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. -PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.1.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.7.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.1.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.7.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -266,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.14.1.tar.gz +R310 CMD INSTALL ./data.table_1.14.7.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.14.1.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.14.7.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.14.1.tar.gz +R CMD check data.table_1.14.7.tar.gz ##################################################### @@ -341,11 +341,11 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.5.tar.gz +Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.7.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be # passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R # So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.5.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.7.tar.gz # Use the (failed) output to get the list of currently needed packages and install them Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested @@ -354,7 +354,7 @@ install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", " Ncpus=4) # Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check q("no") -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.5.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.7.tar.gz # UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. 
find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; @@ -391,7 +391,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.14.1.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.14.7.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -429,7 +429,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.14.1.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.14.7.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -590,7 +590,7 @@ du -k inst/tests # 0.75MB after R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" Rdevel -q -e "packageVersion('xml2')" # ensure installed -Rdevel CMD check data.table_1.14.0.tar.gz --as-cran # use latest Rdevel as it may have extra checks +Rdevel CMD check data.table_1.14.8.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -617,8 +617,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.13.7 to 1.14.1, and 1.13.6 to 1.14.0 (e.g. in step 8 and 9 below) +6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.7 to 1.14.9 inc below, 1.14.8 to 1.14.10 above, 1.14.6 to 1.14.8 below 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.14.0 on CRAN. Bump to 1.14.1" -9. Take sha from step 8 and run `git tag 1.14.0 96c..sha..d77` then `git push origin 1.14.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.14.6 on CRAN. Bump to 1.14.7" +9. 
Take sha from step 8 and run `git tag 1.14.6 96c..sha..d77` then `git push origin 1.14.6` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/Makefile b/Makefile index 34eedef4cb..6cc4410cd5 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.5.tar.gz + $(RM) data.table_1.14.7.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.5.tar.gz + $(R) CMD INSTALL data.table_1.14.7.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.5.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.7.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index 502a452a6a..06c6013fe1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -611,7 +611,7 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). -# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) +# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) ## BUG FIXES From 0a964d3ff9b7fa2dc5d5dc6ca2c4d7fe5bc8dcba Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 20 Nov 2022 00:38:38 -0700 Subject: [PATCH 513/588] added memtest.id to test.data.table, #5515 tidy up --- R/test.data.table.R | 15 +++++++++++++-- man/test.data.table.Rd | 4 +++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index c81c4b1f39..6428bcc72b 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,8 +1,15 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, - memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0)) { + memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) memtest = as.integer(memtest) stopifnot(length(memtest)==1L, memtest %in% 0:2) + memtest.id = as.integer(memtest.id) + if (length(memtest.id)) { + if (length(memtest.id)==1L) memtest.id = rep(memtest.id, 2L) # for convenience of supplying one id rather than always a range + stopifnot(length(memtest.id)<=2L, # conditions quoted to user when false so "<=2L" even though following conditions rely on ==2L + !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L]) + if (memtest==0L) memtest=1L # using memtest.id implies memtest + } if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # package developer # nocov start @@ -119,6 +126,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("lasttime", proc.time()[3L], envir=env) # used by test() to attribute time inbetween tests to the next test assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L, RSS=0.0 ), envir=env) # test timings aggregated to integer id assign("memtest", memtest, envir=env) + assign("memtest.id", memtest.id, envir=env) assign("filename", fn, envir=env) assign("showProgress", showProgress, envir=env) @@ -267,6 +275,7 @@ test = 
function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) + memtest.id = get("memtest.id", parent.frame()) filename = get("filename", parent.frame()) foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) @@ -276,7 +285,9 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] if (memtest) { if (memtest==1L) gc() # see #5515 for before/after - timings[as.integer(num), RSS:=max(rss(),RSS), verbose=FALSE] + inum = as.integer(num) + timings[inum, RSS:=max(rss(),RSS), verbose=FALSE] # TODO prefix inum with .. for clarity when that works + if (length(memtest.id) && memtest.id[1L]<=inum && inum<=memtest.id[2L]) cat(rss(),"\n") # after 'testing id ...' output; not using between() as it has verbose output when getOption(datatable.verbose) if (memtest==2L) gc() } assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index 2df2a32842..c36e5f9d40 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -8,7 +8,8 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, showProgress = interactive() && !silent, - memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0)) + memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), + memtest.id = NULL) } \arguments{ \item{script}{ Run arbitrary R test script. } @@ -17,6 +18,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } \item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } +\item{memtest.id}{ An id for which to print memory usage for every sub id. May be a range of ids. } } \details{ Runs a series of tests. These can be used to see features and examples of usage, too. Running test.data.table will tell you the full location of the test file(s) to open. 
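For context on the `memtest=`/`memtest.id=` arguments added in PATCH 513 above: the signature and behaviour below are taken from the diff itself, but the session and the test ids are purely illustrative, and `memtest` is documented as intended for Linux. A minimal usage sketch:

```r
# Illustrative only; memtest is intended/tested on Linux (see ?test.data.table).
require(data.table)

# memtest=1 gc()s before each RSS measurement; memtest.id prints the process RSS
# after every sub-test whose id falls in the supplied id (or range of ids).
test.data.table(memtest=1, memtest.id=2224)

# A range of ids also works; per the diff, a single id is recycled to a range of one,
# and supplying memtest.id on its own implies memtest=1 when memtest is 0.
test.data.table(memtest=2, memtest.id=c(2200, 2224))  # memtest=2 gc()s after the measurement
```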
From 20333c19cc1a38dd0cbf41611bd9ae03ee13fede Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 20 Nov 2022 13:59:59 -0700 Subject: [PATCH 514/588] .dev-only: revdep updates and tweaks --- .dev/.bash_aliases | 4 ++- .dev/CRAN_Release.cmd | 8 +++-- .dev/revdep.R | 74 ++++++++++++++++++++++++------------------- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases index 3d46c94d6e..928f0e07fc 100644 --- a/.dev/.bash_aliases +++ b/.dev/.bash_aliases @@ -8,6 +8,8 @@ alias gdm='git difftool master &> /dev/null' # If meld has scrolling issues, turn off GTK animation which I don't need: # https://gitlab.gnome.org/GNOME/meld/-/issues/479#note_866040 +alias perfbar=~/build/gtk_perfbar/linux_perfbar # revdep.R; https://github.com/tomkraljevic/gtk_perfbar + alias Rdevel='~/build/R-devel/bin/R --vanilla' alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' @@ -15,7 +17,7 @@ alias Rdevel-valgrind='~/build/R-devel-valgrind/bin/R --vanilla' alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' alias R310='~/build/R-3.1.0/bin/R --vanilla' -alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=NULL && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=true' alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' # use ~/build/R-devel/bin/R at the end of revdepr to use R-devel instead of R-release. # If so, doing a `rm -rf *` in revdeplib first to rebuild everything is easiest way to avoid potential problems later. A full rebuild is a good idea periodically anyway. Packages in diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index be4b55a037..279316bfeb 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -490,14 +490,15 @@ shutdown now # doesn't return you to host prompt properly so just kill the win # Downstream dependencies ############################################### -# IF NOT ALREADY INSTALLED +# IF NOT ALREADY INSTALLED, OR AFTER AN OS UPGRADE +# No harm rerunning these commands; they do not reinstall if already latest version sudo apt-get update sudo apt-get -y install htop sudo apt-get -y install r-base r-base-dev sudo apt-get -y build-dep r-base-dev sudo apt-get -y build-dep qpdf sudo apt-get -y install aptitude -sudo aptitude -y build-dep r-cran-rgl # leads to libglu1-mesa-dev +sudo apt-get -y build-dep r-cran-rgl # leads to libglu1-mesa-dev sudo apt-get -y build-dep r-cran-rmpi sudo apt-get -y build-dep r-cran-cairodevice sudo apt-get -y build-dep r-cran-tkrplot @@ -545,6 +546,8 @@ sudo apt-get -y install libgit2-dev # for gert sudo apt-get -y install cmake # for symengine for RxODE sudo apt-get -y install libxslt1-dev # for xslt sudo apt-get -y install flex # for RcppCWB +sudo apt-get -y install libavfilter-dev libsodium-dev libgmp-dev libssh-dev librdf0-dev +sudo apt-get -y install libmariadb-dev mariadb-client # RMySQL for xQTLbiolinks sudo R CMD javareconf # ENDIF @@ -553,6 +556,7 @@ inst() # *** ensure latest dev version of data.table installed into revdeplib run() # prints menu of options status() # includes timestamp of installed data.table that is being tested. 
log() # cats all fail logs to ~/fail.log +cran() # compare packages with error or warning to their status on CRAN # Once all issues resolved with CRAN packages, tackle long-term unfixed bioconductor packages as follows. # 1. Note down all error and warning bioc packages diff --git a/.dev/revdep.R b/.dev/revdep.R index 10af35b553..c1ab9b0f38 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -13,9 +13,10 @@ options(error=quote(utils::dump.frames())) options(width=200) # for cran() output not to wrap # Check that env variables have been set correctly: -# export R_LIBS_SITE=none +# export R_LIBS_SITE=NULL # R 4.2.0 changed to NULL but it doesn't appear to work # export R_LIBS=~/build/revdeplib/ # export _R_CHECK_FORCE_SUGGESTS_=true +if (length(.libPaths())==3L) .libPaths(.libPaths()[-2L], include.site=FALSE) # workaround as I couldn't get R_LIBS_SITE=NULL to be effective stopifnot(identical(length(.libPaths()), 2L)) # revdeplib writeable by me, and the pre-installed recommended R library (sudo writeable) stopifnot(identical(.libPaths()[1L], getwd())) tt = file.info(.libPaths())[,"uname"] @@ -96,10 +97,33 @@ update.packages(ask=FALSE, checkBuilt=TRUE) avail = available.packages() # includes CRAN and Bioc, from getOption("repos") set above -avail = avail[!rownames(avail) %in% "cplexAPI", ] +avail = avail[!rownames(avail) %in% c("cplexAPI","Rcplex"), ] # cplexAPI is suggested by revdeps ivmte and prioritizr. I haven't succeeded to install IBM ILOG CPLEX which requires a license, # so consider cplexAPI not available when resolving missing suggests at the end of status(). -# Update: cplexAPI was removed from CRAN on 5 Nov 2021 so this is now redundant, but leave it in place for future use +# Update: cplexAPI was removed from CRAN on 5 Nov 2021 so this is now redundant, but leave it in place for future use. +# Update: Rcplex is on CRAN as of 20 Nov 2022 but with install errors, therefore treat it as not available. + +# The presence of packages here in revdeplib which no longer exist on CRAN could explain differences to CRAN. A revdep +# could be running tests using that package when available and failing which may be the very reason that package was removed from CRAN. +# When it is removed from revdeplib to match CRAN, then the revdep might then pass as it will skip its tests using that package. +x = installed.packages() +tt = match(rownames(x), rownames(avail)) +removed = rownames(x)[is.na(tt) & is.na(x[,"Priority"])] +cat("Removing",length(removed),"packages which are no longer available on CRAN/Bioc:", paste(removed, collapse=","), "\n") +stopifnot(all(x[removed,"LibPath"] == .libPaths()[1])) +oldn = nrow(x) +remove.packages(removed, .libPaths()[1]) +x = installed.packages() +stopifnot(nrow(x) == oldn-length(removed)) + +# Ensure all installed packages were built with this x.y release of R; i.e. 
that checkBuilt=TRUE worked above +cat("This is R ",R.version$major,".",R.version$minor,"; ",R.version.string,"\n",sep="") +cat("Previously installed packages were built using:\n") +print(tt <- table(x[,"Built"], dnn=NULL)) +minorR = paste(strsplit(as.character(getRversion()), split="[.]")[[1]][c(1,2)], collapse=".") +if (any(w<-names(tt) Date: Sun, 20 Nov 2022 15:20:42 -0700 Subject: [PATCH 515/588] .dev-only: revdep faster startup as one part was too deep in a loop, and no need to update.packages() a 2nd time --- .dev/revdep.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.dev/revdep.R b/.dev/revdep.R index c1ab9b0f38..0b949da361 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -155,7 +155,6 @@ for (p in deps) { } } cat("New downloaded:",new," Already had latest:", old, " TOTAL:", length(deps), "\n") -update.packages(checkBuilt=TRUE, ask=FALSE) # won't rebuild packages which are no longer available on CRAN # Remove the tar.gz no longer needed : for (p in deps) { @@ -166,12 +165,12 @@ for (p in deps) { cat("Removing",i,"because",f,"is newer\n") system(paste0("rm ",i)) } - all = system("ls *.tar.gz", intern=TRUE) - all = sapply(strsplit(all, split="_"),'[',1) - for (i in all[!all %in% deps]) { - cat("Removing",i,"because it", if (!i %in% rownames(avail)) "has been removed from CRAN\n" else "no longer uses data.table\n") - system(paste0("rm ",i,"_*.tar.gz")) - } +} +all = system("ls *.tar.gz", intern=TRUE) +all = sapply(strsplit(all, split="_"),'[',1) +for (i in all[!all %in% deps]) { + cat("Removing",i,"because it", if (!i %in% rownames(avail)) "has been removed from CRAN\n" else "no longer uses data.table\n") + system(paste0("rm ",i,"_*.tar.gz")) } num_tar.gz = as.integer(system("ls *.tar.gz | wc -l", intern=TRUE)) if (length(deps) != num_tar.gz) stop("num_tar.gz==",num_tar.gz," but length(deps)==",length(deps)) From 6cc799152662011db78a4516e4f9ffd8979d4482 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 01:10:11 -0500 Subject: [PATCH 516/588] 1.14.8 patch bump. Will tag patch-1.14 branch when publish date is confirmed. --- .dev/CRAN_Release.cmd | 36 ++++++++++++++++++------------------ DESCRIPTION | 2 +- Makefile | 6 +++--- NEWS.md | 9 ++++++++- src/init.c | 2 +- 5 files changed, 31 insertions(+), 24 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 279316bfeb..3442dcb38c 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -195,15 +195,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. -R CMD check data.table_1.14.7.tar.gz --as-cran -R CMD INSTALL data.table_1.14.7.tar.gz --html +R CMD check data.table_1.14.9.tar.gz --as-cran +R CMD INSTALL data.table_1.14.9.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.14.7.tar.gz +R CMD check data.table_1.14.9.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -220,9 +220,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. 
-PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.7.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.9.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.7.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.9.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -266,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.14.7.tar.gz +R310 CMD INSTALL ./data.table_1.14.9.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.14.7.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.14.9.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.14.7.tar.gz +R CMD check data.table_1.14.9.tar.gz ##################################################### @@ -320,7 +320,7 @@ make cd ~/build/R-devel-strict-gcc # gcc-10 failed to build R-devel at some point, so using regular gcc-9 (9.3.0 as per focal/Pop!_OS 20.04) -./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-9 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" +./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="gcc-11 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make # See R-exts#4.3.3 @@ -341,11 +341,11 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.7.tar.gz +Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.9.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be # passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R # So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.7.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.9.tar.gz # Use the (failed) output to get the list of currently needed packages and install them Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested @@ -354,7 +354,7 @@ install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", " Ncpus=4) # Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check q("no") -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.7.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.9.tar.gz # UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. 
find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; @@ -391,7 +391,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.14.7.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.14.9.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -429,7 +429,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.14.7.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.14.9.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -594,7 +594,7 @@ du -k inst/tests # 0.75MB after R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" Rdevel -q -e "packageVersion('xml2')" # ensure installed -Rdevel CMD check data.table_1.14.8.tar.gz --as-cran # use latest Rdevel as it may have extra checks +Rdevel CMD check data.table_1.14.10.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -621,8 +621,8 @@ When CRAN's email contains "Pretest results OK pending a manual inspection" (or 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. 4. Bump dllVersion() in init.c 5. Bump 3 version numbers in Makefile -6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.7 to 1.14.9 inc below, 1.14.8 to 1.14.10 above, 1.14.6 to 1.14.8 below +6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.9 to 1.14.11 inc below, 1.14.10 to 1.14.12 above, 1.14.8 to 1.14.10 below 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -8. Push to master with this consistent commit message: "1.14.6 on CRAN. Bump to 1.14.7" -9. Take sha from step 8 and run `git tag 1.14.6 96c..sha..d77` then `git push origin 1.14.6` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +8. Push to master with this consistent commit message: "1.14.8 on CRAN. Bump to 1.14.10" +9. 
Take sha from step 8 and run `git tag 1.14.8 96c..sha..d77` then `git push origin 1.14.8` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### diff --git a/DESCRIPTION b/DESCRIPTION index b1809aad8e..74a4b6e1c4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.14.7 +Version: 1.14.9 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods diff --git a/Makefile b/Makefile index 6cc4410cd5..b4d8517df3 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.7.tar.gz + $(RM) data.table_1.14.9.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.7.tar.gz + $(R) CMD INSTALL data.table_1.14.9.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.7.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.9.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index 06c6013fe1..043ad9ade4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -# data.table [v1.14.7](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.9](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES @@ -611,6 +611,13 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). +# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) + +## BUG FIXES + +## NOTES + + # data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) ## BUG FIXES diff --git a/src/init.c b/src/init.c index 9c5afd905a..2cffabd34b 100644 --- a/src/init.c +++ b/src/init.c @@ -351,6 +351,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion(void) { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.14.7"))); + return(ScalarString(mkChar("1.14.9"))); } From e444d6d82f5b7673c957e09cb5ff1ededbcd057e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 02:00:30 -0500 Subject: [PATCH 517/588] Test 1613.605 relaxed to pass different all.equal output in R-devel (#5597) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 043ad9ade4..e19c7fecf9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -617,6 +617,8 @@ ## NOTES +1. Test 1613.605 now passes changes to `as.data.frame()` in R-devel, [#5597](https://github.com/Rdatatable/data.table/pull/5597). Thanks to Avraham Adler for reporting. 
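On the test 1613.605 relaxation noted just above: `all.equal()` does not return `FALSE` on mismatch; it returns a character description whose wording can change between R versions, so wrapping it in `isTRUE()` (and negating) is the portable way to assert "not equal". A small base-R sketch, outside the test suite, only to show the return types:

```r
isTRUE(all.equal(1, 1 + 1e-9))  # TRUE: difference is within the default tolerance
all.equal("a", 1)               # a character vector describing the differences, not FALSE
isTRUE(all.equal("a", 1))       # FALSE, independent of the exact wording R uses
```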
+ # data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3cbe67680f..9117c0fcb6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8687,7 +8687,7 @@ test(1613.601, all.equal(data.table(a=1), data.frame(a=1)), "target is data.tabl test(1613.602, all.equal(data.table(a=1), data.frame(a=1), check.attributes = FALSE)) test(1613.603, all.equal(data.table(a=1), list(a=1), check.attributes = FALSE)) test(1613.604, all.equal(data.table(a=1), 1, check.attributes = FALSE)) -test(1613.605, all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE), "target is data.table but current is not and failed to be coerced to it") +test(1613.605, !isTRUE(all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE))) L1 = list(a = data.table(1), b = setattr("foo1613", "tbl", data.table(1))) L2 = list(a = 1, b = setattr("foo1613", "tbl", 1)) test(1613.606, all(grepl("target is data.table, current is numeric", all.equal(L1, L2)))) From e4942c65cdd00e0aa9c9d76ac7655cf4cde0dbc0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 02:06:49 -0500 Subject: [PATCH 518/588] GLCI-only: r-rel-win version --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 140ccef6c5..9678e6e8ea 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -95,7 +95,7 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.2.1-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.2.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait From e4565f1cb78c4ecd56bb33bb6269a2f8d60629c1 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 03:20:24 -0500 Subject: [PATCH 519/588] GLCI-only: rtools43 --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9678e6e8ea..dc1ed80870 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -103,7 +103,7 @@ build: ## build data.table sources as tar.gz archive - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5493-5475.exe; 
Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait .test-template: &test stage: test @@ -247,7 +247,7 @@ test-rel-win: ## R-release on Windows, test and build binaries before_script: - *install-r-rel-win - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" - *cp-src-win - rm.exe -r bus @@ -266,7 +266,7 @@ test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related t before_script: - *install-r-devel-win - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win - rm.exe -r bus From 092dc2f3e40ec6370a774c332685be78d745e33e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 04:18:24 -0500 Subject: [PATCH 520/588] GLCI-only: R-release (4.2) won't work with rtools43 it seems so back to rtools42 for that --- .gitlab-ci.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dc1ed80870..f0c4037933 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -102,9 +102,6 @@ build: ## build data.table sources as tar.gz archive .test-install-r-oldrel-win: &install-r-oldrel-win - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait -.test-install-rtools-win: &install-rtools-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5493-5475.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait - .test-template: &test stage: test needs: ["mirror-packages","build"] @@ -246,8 +243,8 @@ test-rel-win: ## R-release on Windows, test and build binaries R_VERSION: "$R_REL_VERSION" before_script: - *install-r-rel-win - - *install-rtools-win - - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" - *cp-src-win - rm.exe -r bus @@ -265,7 +262,7 @@ test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related t R_VERSION: "$R_DEVEL_VERSION" before_script: - *install-r-devel-win - - *install-rtools-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5493-5475.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win @@ -285,7 +282,7 @@ test-old-win: ## R-oldrel on Windows before_script: - *install-r-oldrel-win - curl.exe -s -o ../rtools.exe 
https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait - ## rtools42 doesn't support 32bit so oldrel-win (currently R 4.1) needs rtools40. Can use install-rtools-win again here when oldrel is R 4.2+ + ## rtools42 doesn't support 32bit so oldrel-win (currently R 4.1) needs rtools40 - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 - *cp-src-win From d0519f1e4a5fc1a3ddd7826322d67fe059cc8c63 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 04:32:57 -0500 Subject: [PATCH 521/588] fixed non-equi out of bounds read (#5599) --- NEWS.md | 4 ++-- src/nqrecreateindices.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index e19c7fecf9..9c352b9b50 100644 --- a/NEWS.md +++ b/NEWS.md @@ -613,12 +613,12 @@ # data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) -## BUG FIXES - ## NOTES 1. Test 1613.605 now passes changes to `as.data.frame()` in R-devel, [#5597](https://github.com/Rdatatable/data.table/pull/5597). Thanks to Avraham Adler for reporting. +2. An out of bounds read when combining non-equi join with `by=.EACHI` has been found and fixed thanks to clang ASAN, [#5598](https://github.com/Rdatatable/data.table/issues/5598). There was no bug or consequence because the read was followed (now preceded) by a bounds test. + # data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) diff --git a/src/nqrecreateindices.c b/src/nqrecreateindices.c index a750c093eb..4a8f9547b0 100644 --- a/src/nqrecreateindices.c +++ b/src/nqrecreateindices.c @@ -26,7 +26,7 @@ SEXP nqRecreateIndices(SEXP xo, SEXP len, SEXP indices, SEXP nArg, SEXP nomatch) // TODO: revisit to see if this be simplified further when I've some time. R_len_t j=0, tmp=0; for (int i=0; i= xn) { + if (j>=xn || ixo[j]<=0) { // NA_integer_ = INT_MIN is checked in init.c // j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) inewstarts[i] = inomatch; From 77d34390a1530a8b393f2369e5ba1d9537b3c37f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 16 Feb 2023 06:37:46 -0500 Subject: [PATCH 522/588] Stop exporting .rbind.data.table in R>=4.0.0 (#5600) --- NAMESPACE | 23 +++++++++++------------ NEWS.md | 2 ++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c22782440a..ef0aa2d171 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -90,19 +90,18 @@ if (getRversion() >= "4.0.0") { # if we register these (new in v1.12.6) methods always though, the previous workaround no longer works in R<4.0.0. Hence only register in R>=4.0.0. 
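On the out-of-bounds fix in src/nqrecreateindices.c shown in PATCH 521 above: the correction relies on `||` evaluating left to right and short-circuiting, so the bounds test `j>=xn` runs before the read `ixo[j]`. R's `&&`/`||` short-circuit the same way; the sketch below uses hypothetical helper functions (not part of data.table) purely to make the evaluation order visible:

```r
bounds_ok     = function(j, n)  { cat("bounds checked\n"); j <= n }      # hypothetical helper
read_positive = function(x, j)  { cat("element read\n");   x[j] > 0L }   # hypothetical helper

x = c(5L, -1L)
if (bounds_ok(3L, length(x)) && read_positive(x, 3L)) "hit" else "miss"
# Only "bounds checked" is printed: the left operand is FALSE, so && never evaluates
# the read -- the same ordering the C fix guarantees by testing j>=xn before ixo[j].
```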
S3method(cbind, data.table) S3method(rbind, data.table) +} else { + # and if we export but don't register in R < 4.0.0 we get this note: + # > Found the following apparent S3 methods exported but not registered: + # > cbind.data.table rbind.data.table + # in addition to errors in tests 324, 326, 414.1, 414.2, 442, 445, 451 + # export(cbind.data.table) + # export(rbind.data.table) + # A revdep using rbind.data.frame() directly before (which data.table changed in base) should change to rbind() generic and that should work + # in all combinations of R before/after 4.0.0 and data.table before/after 1.12.6, so long as data.table is installed using the same major + # version of R (and that is checked in .onLoad with error if not). + export(.rbind.data.table) # only export in R<4.0.0 where it is still used; R-devel now detects it is missing doc, #5600 } -# else { -# # and if we export but don't register in R < 4.0.0 we get this note: -# # > Found the following apparent S3 methods exported but not registered: -# # > cbind.data.table rbind.data.table -# # in addition to errors in tests 324, 326, 414.1, 414.2, 442, 445, 451 -# export(cbind.data.table) -# export(rbind.data.table) -# # A revdep using rbind.data.frame() directly before (which data.table changed in base) should change to rbind() generic and that should work -# # in all combinations of R before/after 4.0.0 and data.table before/after 1.12.6, so long as data.table is installed using the same major -# # version of R (and that is checked in .onLoad with error if not). -# } -export(.rbind.data.table) # continue to export for now because it has been exported in the past so it may be depended on S3method(dim, data.table) S3method(dimnames, data.table) S3method("dimnames<-", data.table) diff --git a/NEWS.md b/NEWS.md index 9c352b9b50..444ae63323 100644 --- a/NEWS.md +++ b/NEWS.md @@ -619,6 +619,8 @@ 2. An out of bounds read when combining non-equi join with `by=.EACHI` has been found and fixed thanks to clang ASAN, [#5598](https://github.com/Rdatatable/data.table/issues/5598). There was no bug or consequence because the read was followed (now preceded) by a bounds test. +3. `.rbind.data.table` (note the leading `.`) is no longer exported when `data.table` is installed in R>=4.0.0 (Apr 2020), [#5600](https://github.com/Rdatatable/data.table/pull/5600). It was never documented which R-devel now detects and warns about. It is only needed by `data.table` internals to support R<4.0.0; see note 1 in v1.12.6 (Oct 2019) below in this file for more details. + # data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) From 88039186915028ab3c93ccfd8e22c0d1c3534b1a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 17 Feb 2023 12:41:23 -0500 Subject: [PATCH 523/588] NEWS-only: publish date for 1.14.8 added to title (see patch-1.14 branch for tag) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 444ae63323..025a7651b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -611,7 +611,7 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). 
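The practical takeaway from the NAMESPACE comments in PATCH 522 above is that downstream code should call the `rbind()` generic rather than a specific method, so dispatch keeps working across R/data.table version combinations. A minimal illustration (any small data.table will do):

```r
library(data.table)
DT = data.table(a = 1:2, b = c("x", "y"))

rbind(DT, DT)              # the generic dispatches to the data.table method
# rbind.data.frame(DT, DT) # calling a specific method directly is the pattern to avoid
```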
-# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) +# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) (17 Feb 2023) ## NOTES From ae956ba2e7700761828e2e9a24eed183e1a6b3fc Mon Sep 17 00:00:00 2001 From: Martin Maechler Date: Sat, 23 Sep 2023 21:06:00 +0200 Subject: [PATCH 524/588] run correctly even when is.atomic(NULL) becomes FALSE --- R/data.table.R | 2 +- R/frank.R | 2 +- R/fread.R | 2 +- R/print.data.table.R | 3 ++- R/setkey.R | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 473cf6e766..801482147e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -882,7 +882,7 @@ replace_dot_alias = function(e) { bynames = allbyvars = NULL # the rest now fall through } else bynames = names(byval) - if (is.atomic(byval)) { + if (is.atomic(byval) || is.null(byval)) { if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) { stopf("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval%s should work. This is for efficiency so data.table can detect which columns are needed.", deparse(bysub)) } else { diff --git a/R/frank.R b/R/frank.R index ba90a83b93..419f5ea414 100644 --- a/R/frank.R +++ b/R/frank.R @@ -12,7 +12,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a .Call(Csetlistelt, xx, 1L, x) xx } - if (is.atomic(x)) { + if (is.atomic(x) || is.null(x)) { if (!missing(cols) && !is.null(cols)) stopf("x is a single vector, non-NULL 'cols' doesn't make sense") cols = 1L diff --git a/R/fread.R b/R/fread.R index f8b025d9c3..e0337c5915 100644 --- a/R/fread.R +++ b/R/fread.R @@ -135,7 +135,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!allNA(colClasses)) stopf("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.") colClasses = NULL } - if (!is.null(colClasses) && is.atomic(colClasses)) { + if (!is.null(colClasses) && is.atomic(colClasses)) { ## future R can use if (is.atomic(.)) if (!is.character(colClasses)) stopf("colClasses is not type list or character vector") if (!length(colClasses)) { colClasses=NULL; diff --git a/R/print.data.table.R b/R/print.data.table.R index 16950fd110..7271ac458f 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -141,7 +141,8 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), } format.data.table = function (x, ..., justify="none") { - if (is.atomic(x) && !is.null(x)) { + if (is.atomic(x) && !is.null(x)) { ## future R can use if (is.atomic(x)) + stopf("Internal structure doesn't seem to be a list. 
Possibly corrupt data.table.") } do.call("cbind", lapply(x, format_col, ..., justify=justify)) diff --git a/R/setkey.R b/R/setkey.R index 3bd3f782c4..5f3027a2d7 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -169,7 +169,7 @@ is.sorted = function(x, by=NULL) { ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) { - if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), + if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL } else { From db5ba4135ffb95dca6d43f115c34c5786147c3ba Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 29 Oct 2023 20:22:24 +0100 Subject: [PATCH 525/588] mention survey in README (#5711) * mention survey in README * Michael suggestions Co-authored-by: Michael Chirico --------- Co-authored-by: Michael Chirico --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 46bbfed1e8..37cedad27b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ # data.table +The data.table 2023 community survey is now live! Click on https://tinyurl.com/datatable-survey to fill it out. The survey will remain open until **December 1st, 2023**. + +In addition to filling out the survey, it would be great if you could share it with others who might be interested in participating. + +--- + [![CRAN status](https://cranchecks.info/badges/flavor/release/data.table)](https://cran.r-project.org/web/checks/check_results_data.table.html) [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) From 5068e452c9e92df031210a8d7561af57c73e7c6e Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 3 Nov 2023 01:59:33 +0100 Subject: [PATCH 526/588] add inst/cc to gitignore (#5689) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 00d0d0e8be..74c9043c04 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,9 @@ data.table_*.tar.gz data.table.Rcheck src/Makevars +# Package install +inst/cc + # Emacs IDE files .emacs.desktop .emacs.desktop.lock From 2addb00fbae3b47ff5eaf9c7f65059f5e5925ebd Mon Sep 17 00:00:00 2001 From: sluga Date: Fri, 3 Nov 2023 02:00:42 +0100 Subject: [PATCH 527/588] Mention the 2023 data.table community survey (#5705) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 37cedad27b..f89112486e 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,12 @@ In addition to filling out the survey, it would be great if you could share it w `data.table` provides a high-performance version of [base R](https://www.r-project.org/about.html)'s `data.frame` with syntax and feature enhancements for ease of use, convenience and programming speed. +--- + +**NEW:** Take part in the [data.table 2023 community survey](https://tinyurl.com/datatable-survey/) and help shape the future of the project! The survey closes on **December 1st**. + +--- + ## Why `data.table`? 
* concise syntax: fast to type, fast to read From dd9ee72ced50bc8ef32424f2671f4cca0468b007 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 3 Nov 2023 02:05:04 +0100 Subject: [PATCH 528/588] Fix typo in doc of `data.table()` (#5577) --- man/data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index ecc79e2a54..a5da7ebc4e 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -212,7 +212,7 @@ The way to read this out loud is: "Take \code{DT}, subset rows by \code{i}, \emp X[c>1, sum(a), by=c] # get rows where c>1 is TRUE, and on those rows, get sum(a) grouped by 'c' X[Y, .(a, b), on="c"] # get rows where Y$c == X$c, and select columns 'X$a' and 'X$b' for those rows X[Y, .(a, i.a), on="c"] # get rows where Y$c == X$c, and then select 'X$a' and 'Y$a' (=i.a) - X[Y, sum(a*i.a), on="c" by=.EACHI] # for *each* 'Y$c', get sum(a*i.a) on matching rows in 'X$c' + X[Y, sum(a*i.a), on="c", by=.EACHI] # for *each* 'Y$c', get sum(a*i.a) on matching rows in 'X$c' X[, plot(a, b), by=c] # j accepts any expression, generates plot for each group and returns no data # see ?assign to add/update/delete columns by reference using the same consistent interface From 8afea02b1a68848c321f02c18164a120da1bc355 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 2 Nov 2023 22:40:34 -0700 Subject: [PATCH 529/588] Update repo link for lubridate (#5603) --- R/IDateTime.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/IDateTime.R b/R/IDateTime.R index 4e6adf55e3..185952fe72 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -315,8 +315,8 @@ clip_msec = function(secs, action) { # Adapted from Hadley Wickham's routines cited below to ensure # integer results. # http://gist.github.com/10238 -# See also Hadley's more advanced and complex lubridate package: -# http://github.com/hadley/lubridate +# See also Hadley et al's more advanced and complex lubridate package: +# https://github.com/tidyverse/lubridate # lubridate routines do not return integer values. 
################################################################### From c0528ef24c45f4333fbb95b8b5e5dc7954be6514 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 2 Nov 2023 22:44:09 -0700 Subject: [PATCH 530/588] Use new badges endpoint (#5555) Closes #5553 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f89112486e..7f6c3c1031 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ In addition to filling out the survey, it would be great if you could share it w --- -[![CRAN status](https://cranchecks.info/badges/flavor/release/data.table)](https://cran.r-project.org/web/checks/check_results_data.table.html) +[![CRAN status](https://badges.cranchecks.info/flavor/release/data.table.svg)](https://cran.r-project.org/web/checks/check_results_data.table.html) [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/kayjdh5qtgymhoxr/branch/master?svg=true)](https://ci.appveyor.com/project/Rdatatable/data-table) [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) From e6076b02f746dd05d921ac355291fb42623f6c02 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 2 Nov 2023 23:01:12 -0700 Subject: [PATCH 531/588] Revert "Mention the 2023 data.table community survey (#5705)" (#5716) This reverts commit 2addb00fbae3b47ff5eaf9c7f65059f5e5925ebd. --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index 7f6c3c1031..fbe2de22a2 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,6 @@ In addition to filling out the survey, it would be great if you could share it w `data.table` provides a high-performance version of [base R](https://www.r-project.org/about.html)'s `data.frame` with syntax and feature enhancements for ease of use, convenience and programming speed. ---- - -**NEW:** Take part in the [data.table 2023 community survey](https://tinyurl.com/datatable-survey/) and help shape the future of the project! The survey closes on **December 1st**. - ---- - ## Why `data.table`? 
* concise syntax: fast to type, fast to read From ac2b737876c393e635e75d79c6865bbd5af97ee6 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 5 Nov 2023 01:22:36 +0100 Subject: [PATCH 532/588] setup pkgup GH actions workflow (#5690) * setup pkgup for data.table --- .github/workflows/pkgup.yaml | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 .github/workflows/pkgup.yaml diff --git a/.github/workflows/pkgup.yaml b/.github/workflows/pkgup.yaml new file mode 100644 index 0000000000..d1064cc41e --- /dev/null +++ b/.github/workflows/pkgup.yaml @@ -0,0 +1,68 @@ +# permissions and concurrency settings for GitHub Pages +permissions: + contents: read + pages: write + id-token: write +concurrency: + group: "pages" + cancel-in-progress: true + +on: [push] +jobs: + build: + name: data.table + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: r-lib/actions/setup-pandoc@v2 + - uses: r-lib/actions/setup-r@v2 + - name: cache-r-dependencies + uses: actions/cache@v3 + with: + path: ${{ env.R_LIBS_USER }}/* + key: library-cache-${{ github.run_id }} + restore-keys: library-cache + - name: setup-os-dependencies + run: | + sudo apt-get install -y libcurl4-openssl-dev + - name: setup-r-dependencies + run: | + Rscript -e 'stopifnot(file.copy("DESCRIPTION", file.path(tdir<-tempdir(), "PACKAGES"))); db<-available.packages(paste0("file://", tdir)); deps<-setdiff(tools::package_dependencies(read.dcf("DESCRIPTION", fields="Package")[[1L]], db, which="most")[[1L]], installed.packages(priority="high")[,"Package"]); if (length(deps)) { ap<-available.packages()[,"Version"]; ap<-ap[names(ap) %in% deps]; if (!all(deps%in%names(ap))) stop("dependencies are not avaiable in repository: ",paste(setdiff(deps, names(ap)), collapse=", ")); ip<-installed.packages()[,"Version"]; ip<-ip[names(ip) %in% deps]; pkgs<-ap[deps]>ip[deps]; install.packages(names(pkgs[pkgs|is.na(pkgs)]), INSTALL_opts="--html") }' + - name: build + run: | + echo "Revision:" $GITHUB_SHA >> ./DESCRIPTION + R CMD build . 
+ - name: check + run: | + R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) + - name: manual + if: github.ref == 'refs/heads/master' + run: | + cp -R ${{ env.R_LIBS_USER }} library + R CMD INSTALL --library="library" $(ls -1t data.table_*.tar.gz | head -n 1) --html + mkdir -p doc/html + cp /usr/share/R/doc/html/{left.jpg,up.jpg,Rlogo.svg,R.css,index.html} doc/html + Rscript -e 'utils::make.packages.html("library", docdir="doc")' + sed -i "s|file://|../..|g" doc/html/packages.html + mkdir -p public + mv doc public/doc + cp -r --parents library/*/{html,help,doc,demo,DESCRIPTION,README,NEWS,README.md,NEWS.md} public 2>/dev/null || : + sed -i 's|"/doc/html/|"/data.table/doc/html/|g' public/library/data.table/doc/index.html 2>/dev/null || : + - name: repo + if: github.ref == 'refs/heads/master' + run: | + mkdir -p public/src/contrib + mv $(ls -1t data.table_*.tar.gz | head -n 1) public/src/contrib + Rscript -e 'tools::write_PACKAGES("public/src/contrib", fields="Revision")' + - name: upload + if: github.ref == 'refs/heads/master' + uses: actions/upload-pages-artifact@v1 + with: + path: "public" + - name: deploy + if: github.ref == 'refs/heads/master' + id: deployment + uses: actions/deploy-pages@v1 From 94e8fbe448f9dcd83fe6f2751dbe36b05b404ee0 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 5 Nov 2023 01:57:57 +0100 Subject: [PATCH 533/588] update_dev_pkg uses GH for R repo (#5720) --- R/devel.R | 2 +- man/update_dev_pkg.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/devel.R b/R/devel.R index 8bd7a1466a..df77eb0e0c 100644 --- a/R/devel.R +++ b/R/devel.R @@ -17,7 +17,7 @@ dcf.repo = function(pkg, repo, field, type) { dcf[dcf[,"Package"]==pkg, field][[1L]] } -update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { +update_dev_pkg = function(object="data.table", repo="https://Rdatatable.github.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { # this works for any package, not just data.table pkg = object # perform package upgrade when new Revision present diff --git a/man/update_dev_pkg.Rd b/man/update_dev_pkg.Rd index 3db5b98316..9914138c82 100644 --- a/man/update_dev_pkg.Rd +++ b/man/update_dev_pkg.Rd @@ -5,7 +5,7 @@ Downloads and installs latest development version only when a new commit is available which has also passed all tests. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. 
} \usage{update_dev_pkg(object="data.table", - repo="https://Rdatatable.gitlab.io/data.table", + repo="https://Rdatatable.github.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } \arguments{ From e66f5dcef6a6a0258787fe0ef968760abae137a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=A4chler?= Date: Sun, 5 Nov 2023 02:01:03 +0100 Subject: [PATCH 534/588] run correctly even when is.atomic(NULL) becomes FALSE (#5691) --- R/data.table.R | 2 +- R/frank.R | 2 +- R/fread.R | 2 +- R/print.data.table.R | 3 ++- R/setkey.R | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 473cf6e766..801482147e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -882,7 +882,7 @@ replace_dot_alias = function(e) { bynames = allbyvars = NULL # the rest now fall through } else bynames = names(byval) - if (is.atomic(byval)) { + if (is.atomic(byval) || is.null(byval)) { if (is.character(byval) && length(byval)<=ncol(x) && !(is.name(bysub) && bysub %chin% names_x) ) { stopf("'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval%s should work. This is for efficiency so data.table can detect which columns are needed.", deparse(bysub)) } else { diff --git a/R/frank.R b/R/frank.R index ba90a83b93..419f5ea414 100644 --- a/R/frank.R +++ b/R/frank.R @@ -12,7 +12,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a .Call(Csetlistelt, xx, 1L, x) xx } - if (is.atomic(x)) { + if (is.atomic(x) || is.null(x)) { if (!missing(cols) && !is.null(cols)) stopf("x is a single vector, non-NULL 'cols' doesn't make sense") cols = 1L diff --git a/R/fread.R b/R/fread.R index f8b025d9c3..e0337c5915 100644 --- a/R/fread.R +++ b/R/fread.R @@ -135,7 +135,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!allNA(colClasses)) stopf("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.") colClasses = NULL } - if (!is.null(colClasses) && is.atomic(colClasses)) { + if (!is.null(colClasses) && is.atomic(colClasses)) { ## future R can use if (is.atomic(.)) if (!is.character(colClasses)) stopf("colClasses is not type list or character vector") if (!length(colClasses)) { colClasses=NULL; diff --git a/R/print.data.table.R b/R/print.data.table.R index 16950fd110..7271ac458f 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -141,7 +141,8 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), } format.data.table = function (x, ..., justify="none") { - if (is.atomic(x) && !is.null(x)) { + if (is.atomic(x) && !is.null(x)) { ## future R can use if (is.atomic(x)) + stopf("Internal structure doesn't seem to be a list. 
Possibly corrupt data.table.") } do.call("cbind", lapply(x, format_col, ..., justify=justify)) diff --git a/R/setkey.R b/R/setkey.R index 3bd3f782c4..5f3027a2d7 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -169,7 +169,7 @@ is.sorted = function(x, by=NULL) { ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) { - if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), + if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL } else { From af82d403e375605e8d5da7ef9016bfa539068462 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 6 Nov 2023 11:23:21 +0100 Subject: [PATCH 535/588] fix warnings on CRAN #5696 (#5712) * fix warnings on CRAN #5696 --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9117c0fcb6..d7ad5a99a6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14473,8 +14473,8 @@ options(datatable.rbindlist.check=NULL) # this option is set to NULL at the top if (.Platform$OS.type == 'windows') local({ lc_collate <- Sys.getlocale(c('LC_COLLATE')) lc_ctype <- Sys.getlocale(c('LC_CTYPE')) - Sys.setlocale('LC_COLLATE', "Chinese (Simplified)_China.936") - Sys.setlocale('LC_CTYPE', "Chinese (Simplified)_China.936") + suppressWarnings(Sys.setlocale('LC_COLLATE', "Chinese (Simplified)_China.936")) ## fix CRAN warning #5696 + suppressWarnings(Sys.setlocale('LC_CTYPE', "Chinese (Simplified)_China.936")) on.exit({ Sys.setlocale('LC_COLLATE', lc_collate) Sys.setlocale('LC_CTYPE', lc_ctype) From 4c633907a651f25a4e02c86e6bebb6f5c8c37fa8 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 7 Nov 2023 07:16:31 +0100 Subject: [PATCH 536/588] run workflow only on pushes to master (#5728) --- .github/workflows/pkgup.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pkgup.yaml b/.github/workflows/pkgup.yaml index d1064cc41e..67541f4e69 100644 --- a/.github/workflows/pkgup.yaml +++ b/.github/workflows/pkgup.yaml @@ -7,7 +7,11 @@ concurrency: group: "pages" cancel-in-progress: true -on: [push] +on: + push: + branches: + - 'master' + jobs: build: name: data.table From 6b9d559606767562f7f7dd4c7842a9e4a9fb597c Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 15 Nov 2023 17:54:37 +0100 Subject: [PATCH 537/588] add missing links in forder.c (#5741) --- src/forder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/forder.c b/src/forder.c index f2846828a5..8a62e1de78 100644 --- a/src/forder.c +++ b/src/forder.c @@ -10,9 +10,9 @@ http://stereopsis.com/radix.html Previous version of this file was promoted into base R, see ?base::sort. - Denmark useR! presentation - Stanford DSC presentation - JSM presentation + Denmark useR! 
presentation https://github.com/Rdatatable/data.table/wiki/talks/useR2015_Matt.pdf + Stanford DSC presentation https://github.com/Rdatatable/data.table/wiki/talks/DSC2016_ParallelSort.pdf + JSM presentation https://github.com/Rdatatable/data.table/wiki/talks/JSM2018_Matt.pdf Techniques used : skewed groups are split in parallel finds unique bytes to save 256 sweeping From ec9b1e45060e7ff2deeef5c3f1533abd9abe7176 Mon Sep 17 00:00:00 2001 From: Martin Maechler Date: Mon, 20 Nov 2023 14:21:20 +0100 Subject: [PATCH 538/588] update 2 URLs --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 025a7651b3..52333e9b3c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -691,7 +691,7 @@ 1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. - At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://www.rstudio.com/resources/rstudioglobal-2021/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://posit.co/resources/videos/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. 
The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. @@ -2136,7 +2136,7 @@ When `j` is a symbol (as in the quanteda and xgboost examples above) it will con 2. Just to state explicitly: data.table does not now depend on or require OpenMP. If you don't have it (as on CRAN's Mac it appears but not in general on Mac) then data.table should build, run and pass all tests just fine. -3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. +3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://app.codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. 4. The FAQ vignette has been revised given the changes in v1.9.8. In particular, the very first FAQ. From 514fd3442eee878a604cbfb9030c38cd6c22f184 Mon Sep 17 00:00:00 2001 From: Martin Maechler Date: Mon, 20 Nov 2023 14:40:56 +0100 Subject: [PATCH 539/588] fix "lost braces" NOTE (--as-cran): here the {.} are *extraneous* --- man/data.table.Rd | 43 +++++++++++++++++++++--------------------- man/fread.Rd | 16 ++++++++-------- man/froll.Rd | 36 +++++++++++++++++------------------ man/fsort.Rd | 6 +++--- man/fwrite.Rd | 34 ++++++++++++++++----------------- man/openmp-utils.Rd | 24 +++++++++++------------ man/setops.Rd | 16 ++++++---------- man/special-symbols.Rd | 12 ++++++------ 8 files changed, 92 insertions(+), 95 deletions(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index a5da7ebc4e..502595d7c0 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -62,13 +62,13 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac If \code{i} is a \code{data.table}, the columns in \code{i} to be matched against \code{x} can be specified using one of these ways: \itemize{ - \item{\code{on} argument (see below). It allows for both \code{equi-} and the newly implemented \code{non-equi} joins.} + \item \code{on} argument (see below). It allows for both \code{equi-} and the newly implemented \code{non-equi} joins. - \item{If not, \code{x} \emph{must be keyed}. Key can be set using \code{\link{setkey}}. If \code{i} is also keyed, then first \emph{key} column of \code{i} is matched against first \emph{key} column of \code{x}, second against second, etc.. + \item If not, \code{x} \emph{must be keyed}. 
Key can be set using \code{\link{setkey}}. If \code{i} is also keyed, then first \emph{key} column of \code{i} is matched against first \emph{key} column of \code{x}, second against second, etc.. If \code{i} is not keyed, then first column of \code{i} is matched against first \emph{key} column of \code{x}, second column of \code{i} against second \emph{key} column of \code{x}, etc\ldots - This is summarised in code as \code{min(length(key(x)), if (haskey(i)) length(key(i)) else ncol(i))}.} + This is summarised in code as \code{min(length(key(x)), if (haskey(i)) length(key(i)) else ncol(i))}. } Using \code{on=} is recommended (even during keyed joins) as it helps understand the code better and also allows for \emph{non-equi} joins. @@ -100,15 +100,15 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts: \itemize{ - \item{A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]}} + \item A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]} - \item{a \code{list()} of expressions of column names: e.g., \code{DT[, .(sa=sum(a)), by=.(x=x>0, y)]}} + \item a \code{list()} of expressions of column names: e.g., \code{DT[, .(sa=sum(a)), by=.(x=x>0, y)]} - \item{a single character string containing comma separated column names (where spaces are significant since column names may contain spaces even at the start or end): e.g., \code{DT[, sum(a), by="x,y,z"]}} + \item a single character string containing comma separated column names (where spaces are significant since column names may contain spaces even at the start or end): e.g., \code{DT[, sum(a), by="x,y,z"]} - \item{a character vector of column names: e.g., \code{DT[, sum(a), by=c("x", "y")]}} + \item a character vector of column names: e.g., \code{DT[, sum(a), by=c("x", "y")]} - \item{or of the form \code{startcol:endcol}: e.g., \code{DT[, sum(a), by=x:z]}} + \item or of the form \code{startcol:endcol}: e.g., \code{DT[, sum(a), by=x:z]} } \emph{Advanced:} When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}), \code{DT[i, j, by=.EACHI]} evaluates \code{j} for the groups in `DT` that each row in \code{i} joins to. That is, you can join (in \code{i}) and aggregate (in \code{j}) simultaneously. We call this \emph{grouping by each i}. See \href{https://stackoverflow.com/a/27004566/559784}{this StackOverflow answer} for a more detailed explanation until we \href{https://github.com/Rdatatable/data.table/issues/944}{roll out vignettes}. @@ -128,10 +128,10 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{roll}{ When \code{i} is a \code{data.table} and its row matches to all but the last \code{x} join column, and its value in the last \code{i} join column falls in a gap (including after the last observation in \code{x} for that group), then: \itemize{ - \item{\code{+Inf} (or \code{TRUE}) rolls the \emph{prevailing} value in \code{x} forward. 
It is also known as last observation carried forward (LOCF).} - \item{\code{-Inf} rolls backwards instead; i.e., next observation carried backward (NOCB).} - \item{finite positive or negative number limits how far values are carried forward or backward.} - \item{"nearest" rolls the nearest value instead.} + \item \code{+Inf} (or \code{TRUE}) rolls the \emph{prevailing} value in \code{x} forward. It is also known as last observation carried forward (LOCF). + \item \code{-Inf} rolls backwards instead; i.e., next observation carried backward (NOCB). + \item finite positive or negative number limits how far values are carried forward or backward. + \item "nearest" rolls the nearest value instead. } Rolling joins apply to the last join column, generally a date but can be any variable. It is particularly fast using a modified binary search. @@ -139,8 +139,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{rollends}{ A logical vector length 2 (a single logical is recycled) indicating whether values falling before the first value or after the last value for a group should be rolled as well. \itemize{ - \item{If \code{rollends[2]=TRUE}, it will roll the last value forward. \code{TRUE} by default for LOCF and \code{FALSE} for NOCB rolls.} - \item{If \code{rollends[1]=TRUE}, it will roll the first value backward. \code{TRUE} by default for NOCB and \code{FALSE} for LOCF rolls.} + \item If \code{rollends[2]=TRUE}, it will roll the last value forward. \code{TRUE} by default for LOCF and \code{FALSE} for NOCB rolls. + \item If \code{rollends[1]=TRUE}, it will roll the first value backward. \code{TRUE} by default for NOCB and \code{FALSE} for LOCF rolls. } When \code{roll} is a finite number, that limit is also applied when rolling the ends.} @@ -163,15 +163,16 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{on}{ Indicate which columns in \code{x} should be joined with which columns in \code{i} along with the type of binary operator to join with (see non-equi joins below on this). When specified, this overrides the keys set on \code{x} and \code{i}. When \code{.NATURAL} keyword provided then \emph{natural join} is made (join on common columns). There are multiple ways of specifying the \code{on} argument: \itemize{ - \item{As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}.} - \item{\emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}. + \item As an unnamed character vector, e.g., \code{X[Y, on=c("a", "b")]}, used when columns \code{a} and \code{b} are common to both \code{X} and \code{Y}. + \item \emph{Foreign key joins}: As a \emph{named} character vector when the join columns have different names in \code{X} and \code{Y}. For example, \code{X[Y, on=c(x1="y1", x2="y2")]} joins \code{X} and \code{Y} by matching columns \code{x1} and \code{x2} in \code{X} with columns \code{y1} and \code{y2} in \code{Y}, respectively. From v1.9.8, you can also express foreign key joins using the binary operator \code{==}, e.g. \code{X[Y, on=c("x1==y1", "x2==y2")]}. 
- NB: shorthand like \code{X[Y, on=c("a", V2="b")]} is also possible if, e.g., column \code{"a"} is common between the two tables.} - \item{For convenience during interactive scenarios, it is also possible to use \code{.()} syntax as \code{X[Y, on=.(a, b)]}.} - \item{From v1.9.8, (non-equi) joins using binary operators \code{>=, >, <=, <} are also possible, e.g., \code{X[Y, on=c("x>=a", "y<=b")]}, or for interactive use as \code{X[Y, on=.(x>=a, y<=b)]}.} + NB: shorthand like \code{X[Y, on=c("a", V2="b")]} is also possible if, e.g., column \code{"a"} is common between the two tables. + + \item For convenience during interactive scenarios, it is also possible to use \code{.()} syntax as \code{X[Y, on=.(a, b)]}. + \item From v1.9.8, (non-equi) joins using binary operators \code{>=, >, <=, <} are also possible, e.g., \code{X[Y, on=c("x>=a", "y<=b")]}, or for interactive use as \code{X[Y, on=.(x>=a, y<=b)]}. } See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } @@ -182,8 +183,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr \enumerate{ - \item{programming time (easier to write, read, debug and maintain), and} - \item{compute time (fast and memory efficient).} + \item programming time (easier to write, read, debug and maintain), and + \item compute time (fast and memory efficient). } The general form of data.table syntax is:\cr diff --git a/man/fread.Rd b/man/fread.Rd index cc96062dec..78c8a76289 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -88,15 +88,15 @@ On Windows, "French_France.1252" is tried which should be available as standard When \code{quote} is a single character, \itemize{ - \item{Spaces and other whitespace (other than \code{sep} and \code{\\n}) may appear in unquoted character fields, e.g., \code{\dots,2,Joe Bloggs,3.14,\dots}.} + \item Spaces and other whitespace (other than \code{sep} and \code{\\n}) may appear in unquoted character fields, e.g., \code{\dots,2,Joe Bloggs,3.14,\dots}. - \item{When \code{character} columns are \emph{quoted}, they must start and end with that quoting character immediately followed by \code{sep} or \code{\\n}, e.g., \code{\dots,2,"Joe Bloggs",3.14,\dots}. + \item When \code{character} columns are \emph{quoted}, they must start and end with that quoting character immediately followed by \code{sep} or \code{\\n}, e.g., \code{\dots,2,"Joe Bloggs",3.14,\dots}. In essence quoting character fields are \emph{required} only if \code{sep} or \code{\\n} appears in the string value. Quoting may be used to signify that numeric data should be read as text. Unescaped quotes may be present in a quoted field, e.g., \code{\dots,2,"Joe, "Bloggs"",3.14,\dots}, as well as escaped quotes, e.g., \code{\dots,2,"Joe \",Bloggs\"",3.14,\dots}. If an embedded quote is followed by the separator inside a quoted field, the embedded quotes up to that point in that field must be balanced; e.g. \code{\dots,2,"www.blah?x="one",y="two"",3.14,\dots}. - On those fields that do not satisfy these conditions, e.g., fields with unbalanced quotes, \code{fread} re-attempts that field as if it isn't quoted. This is quite useful in reading files that contains fields with unbalanced quotes as well, automatically.} + On those fields that do not satisfy these conditions, e.g., fields with unbalanced quotes, \code{fread} re-attempts that field as if it isn't quoted. 
This is quite useful in reading files that contains fields with unbalanced quotes as well, automatically. } To read fields \emph{as is} instead, use \code{quote = ""}. @@ -106,11 +106,11 @@ To read fields \emph{as is} instead, use \code{quote = ""}. Currently, the \code{yaml} setting is somewhat inflexible with respect to incorporating metadata to facilitate file reading. Information on column classes should be stored at the top level under the heading \code{schema} and subheading \code{fields}; those with both a \code{type} and a \code{name} sub-heading will be merged into \code{colClasses}. Other supported elements are as follows: \itemize{ - \item{ \code{sep} (or alias \code{delimiter}) } - \item{ \code{header} } - \item{ \code{quote} (or aliases \code{quoteChar}, \code{quote_char}) } - \item{ \code{dec} (or alias \code{decimal}) } - \item{ \code{na.strings} } + \item \code{sep} (or alias \code{delimiter}) + \item \code{header} + \item \code{quote} (or aliases \code{quoteChar}, \code{quote_char}) + \item \code{dec} (or alias \code{decimal}) + \item \code{na.strings} } \bold{File Download:} diff --git a/man/froll.Rd b/man/froll.Rd index 090b397a90..d6cb75067f 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -64,9 +64,9 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) observation has its own corresponding rolling window width. Due to the logic of adaptive rolling functions, the following restrictions apply: \itemize{ - \item{ \code{align} only \code{"right"}. } - \item{ if list of vectors is passed to \code{x}, then all - vectors within it must have equal length. } + \item \code{align} only \code{"right"}. + \item if list of vectors is passed to \code{x}, then all + vectors within it must have equal length. } When multiple columns or multiple windows width are provided, then they @@ -93,21 +93,21 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) \code{zoo} might expect following differences in \code{data.table} implementation. \itemize{ - \item{ rolling function will always return result of the same length - as input. } - \item{ \code{fill} defaults to \code{NA}. } - \item{ \code{fill} accepts only constant values. It does not support - for \emph{na.locf} or other functions. } - \item{ \code{align} defaults to \code{"right"}. } - \item{ \code{na.rm} is respected, and other functions are not needed - when input contains \code{NA}. } - \item{ integers and logical are always coerced to double. } - \item{ when \code{adaptive=FALSE} (default), then \code{n} must be a - numeric vector. List is not accepted. } - \item{ when \code{adaptive=TRUE}, then \code{n} must be vector of - length equal to \code{nrow(x)}, or list of such vectors. } - \item{ \code{partial} window feature is not supported, although it can - be accomplished by using \code{adaptive=TRUE}, see examples. \code{NA} is always returned for incomplete windows. } + \item rolling function will always return result of the same length as input. + \item \code{fill} defaults to \code{NA}. + \item \code{fill} accepts only constant values. It does not support + for \emph{na.locf} or other functions. + \item \code{align} defaults to \code{"right"}. + \item \code{na.rm} is respected, and other functions are not needed + when input contains \code{NA}. + \item integers and logical are always coerced to double. + \item when \code{adaptive=FALSE} (default), then \code{n} must be a + numeric vector. List is not accepted. 
+ \item when \code{adaptive=TRUE}, then \code{n} must be vector of + length equal to \code{nrow(x)}, or list of such vectors. + \item \code{partial} window feature is not supported, although it can + be accomplished by using \code{adaptive=TRUE}, see + examples. \code{NA} is always returned for incomplete windows. } Be aware that rolling functions operates on the physical order of input. diff --git a/man/fsort.Rd b/man/fsort.Rd index 6c11022d2c..0eba047a16 100644 --- a/man/fsort.Rd +++ b/man/fsort.Rd @@ -20,9 +20,9 @@ fsort(x, decreasing = FALSE, na.last = FALSE, internal=FALSE, verbose=FALSE, \do Process will raise error if \code{x} contains negative values. Unless \code{x} is already sorted \code{fsort} will redirect processing to slower single threaded \emph{order} followed by \emph{subset} in following cases: \itemize{ - \item{data type other than \emph{double} (\emph{numeric})} - \item{data having \code{NA}s} - \item{\code{decreasing==FALSE}} + \item data type other than \emph{double} (\emph{numeric}) + \item data having \code{NA}s + \item \code{decreasing==FALSE} } } \value{ diff --git a/man/fwrite.Rd b/man/fwrite.Rd index ba6eb4751c..a4fcf788e3 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -37,18 +37,18 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{col.names}{Should the column names (header row) be written? The default is \code{TRUE} for new files and when overwriting existing files (\code{append=FALSE}). Otherwise, the default is \code{FALSE} to prevent column names appearing again mid-file when stacking a set of \code{data.table}s or appending rows to the end of a file.} \item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. \itemize{ - \item{"escape" - the quote character (as well as the backslash character) is escaped in C style by a backslash, or} - \item{"double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one.} + \item "escape" - the quote character (as well as the backslash character) is escaped in C style by a backslash, or + \item "double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one. }} \item{logical01}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?} \item{logicalAsInt}{Deprecated. Old name for `logical01`. Name change for consistency with `fread` for which `logicalAsInt` would not make sense.} \item{scipen}{ \code{integer} In terms of printing width, how much of a bias should there be towards printing whole numbers rather than scientific notation? See Details. } \item{dateTimeAs}{ How \code{Date}/\code{IDate}, \code{ITime} and \code{POSIXct} items are written. \itemize{ - \item{"ISO" (default) - \code{2016-09-12}, \code{18:12:16} and \code{2016-09-12T18:12:16.999999Z}. 0, 3 or 6 digits of fractional seconds are printed if and when present for convenience, regardless of any R options such as \code{digits.secs}. The idea being that if milli and microseconds are present then you most likely want to retain them. R's internal UTC representation is written faithfully to encourage ISO standards, stymie timezone ambiguity and for speed. 
An option to consider is to start R in the UTC timezone simply with \code{"$ TZ='UTC' R"} at the shell (NB: it must be one or more spaces between \code{TZ='UTC'} and \code{R}, anything else will be silently ignored; this TZ setting applies just to that R process) or \code{Sys.setenv(TZ='UTC')} at the R prompt and then continue as if UTC were local time.} - \item{"squash" - \code{20160912}, \code{181216} and \code{20160912181216999}. This option allows fast and simple extraction of \code{yyyy}, \code{mm}, \code{dd} and (most commonly to group by) \code{yyyymm} parts using integer div and mod operations. In R for example, one line helper functions could use \code{\%/\%10000}, \code{\%/\%100\%\%100}, \code{\%\%100} and \code{\%/\%100} respectively. POSIXct UTC is squashed to 17 digits (including 3 digits of milliseconds always, even if \code{000}) which may be read comfortably as \code{integer64} (automatically by \code{fread()}).} - \item{"epoch" - \code{17056}, \code{65536} and \code{1473703936.999999}. The underlying number of days or seconds since the relevant epoch (1970-01-01, 00:00:00 and 1970-01-01T00:00:00Z respectively), negative before that (see \code{?Date}). 0, 3 or 6 digits of fractional seconds are printed if and when present.} - \item{"write.csv" - this currently affects \code{POSIXct} only. It is written as \code{write.csv} does by using the \code{as.character} method which heeds \code{digits.secs} and converts from R's internal UTC representation back to local time (or the \code{"tzone"} attribute) as of that historical date. Accordingly this can be slow. All other column types (including \code{Date}, \code{IDate} and \code{ITime} which are independent of timezone) are written as the "ISO" option using fast C code which is already consistent with \code{write.csv}.} + \item "ISO" (default) - \code{2016-09-12}, \code{18:12:16} and \code{2016-09-12T18:12:16.999999Z}. 0, 3 or 6 digits of fractional seconds are printed if and when present for convenience, regardless of any R options such as \code{digits.secs}. The idea being that if milli and microseconds are present then you most likely want to retain them. R's internal UTC representation is written faithfully to encourage ISO standards, stymie timezone ambiguity and for speed. An option to consider is to start R in the UTC timezone simply with \code{"$ TZ='UTC' R"} at the shell (NB: it must be one or more spaces between \code{TZ='UTC'} and \code{R}, anything else will be silently ignored; this TZ setting applies just to that R process) or \code{Sys.setenv(TZ='UTC')} at the R prompt and then continue as if UTC were local time. + \item "squash" - \code{20160912}, \code{181216} and \code{20160912181216999}. This option allows fast and simple extraction of \code{yyyy}, \code{mm}, \code{dd} and (most commonly to group by) \code{yyyymm} parts using integer div and mod operations. In R for example, one line helper functions could use \code{\%/\%10000}, \code{\%/\%100\%\%100}, \code{\%\%100} and \code{\%/\%100} respectively. POSIXct UTC is squashed to 17 digits (including 3 digits of milliseconds always, even if \code{000}) which may be read comfortably as \code{integer64} (automatically by \code{fread()}). + \item "epoch" - \code{17056}, \code{65536} and \code{1473703936.999999}. The underlying number of days or seconds since the relevant epoch (1970-01-01, 00:00:00 and 1970-01-01T00:00:00Z respectively), negative before that (see \code{?Date}). 0, 3 or 6 digits of fractional seconds are printed if and when present. 
+ \item "write.csv" - this currently affects \code{POSIXct} only. It is written as \code{write.csv} does by using the \code{as.character} method which heeds \code{digits.secs} and converts from R's internal UTC representation back to local time (or the \code{"tzone"} attribute) as of that historical date. Accordingly this can be slow. All other column types (including \code{Date}, \code{IDate} and \code{ITime} which are independent of timezone) are written as the "ISO" option using fast C code which is already consistent with \code{write.csv}. } The first three options are fast due to new specialized C code. The epoch to date-part conversion uses a fast approach by Howard Hinnant (see references) using a day-of-year starting on 1 March. You should not be able to notice any difference in write speed between those three options. The date range supported for \code{Date} and \code{IDate} is [0000-03-01, 9999-12-31]. Every one of these 3,652,365 dates have been tested and compared to base R including all 2,790 leap days in this range. \cr \cr This option applies to vectors of date/time in list column cells, too. \cr \cr @@ -73,17 +73,17 @@ To save space, \code{fwrite} prefers to write wide numeric values in scientific The following fields will be written to the header of the file and surrounded by \code{---} on top and bottom: \itemize{ - \item{ \code{source} - Contains the R version and \code{data.table} version used to write the file } - \item{ \code{creation_time_utc} - Current timestamp in UTC time just before the header is written } - \item{ \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. } - \item{ \code{header} - same as \code{col.names} (which is \code{header} on input) } - \item{ \code{sep} } - \item{ \code{sep2} } - \item{ \code{eol} } - \item{ \code{na.strings} - same as \code{na} } - \item{ \code{dec} } - \item{ \code{qmethod} } - \item{ \code{logical01} } + \item \code{source} - Contains the R version and \code{data.table} version used to write the file + \item \code{creation_time_utc} - Current timestamp in UTC time just before the header is written + \item \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. 
+ \item \code{header} - same as \code{col.names} (which is \code{header} on input) + \item \code{sep} + \item \code{sep2} + \item \code{eol} + \item \code{na.strings} - same as \code{na} + \item \code{dec} + \item \code{qmethod} + \item \code{logical01} } } diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index 71e469ed72..df942009c6 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -37,18 +37,18 @@ Internally parallelized code is used in the following places: \itemize{ - \item{\file{between.c} - \code{\link{between}()}} - \item{\file{cj.c} - \code{\link{CJ}()}} - \item{\file{coalesce.c} - \code{\link{fcoalesce}()}} - \item{\file{fifelse.c} - \code{\link{fifelse}()}} - \item{\file{fread.c} - \code{\link{fread}()}} - \item{\file{forder.c}, \file{fsort.c}, and \file{reorder.c} - \code{\link{forder}()} and related} - \item{\file{froll.c}, \file{frolladaptive.c}, and \file{frollR.c} - \code{\link{froll}()} and family} - \item{\file{fwrite.c} - \code{\link{fwrite}()}} - \item{\file{gsumm.c} - GForce in various places, see \link{GForce}} - \item{\file{nafill.c} - \code{\link{nafill}()}} - \item{\file{subset.c} - Used in \code{\link[=data.table]{[.data.table}} subsetting} - \item{\file{types.c} - Internal testing usage} + \item\file{between.c} - \code{\link{between}()} + \item\file{cj.c} - \code{\link{CJ}()} + \item\file{coalesce.c} - \code{\link{fcoalesce}()} + \item\file{fifelse.c} - \code{\link{fifelse}()} + \item\file{fread.c} - \code{\link{fread}()} + \item\file{forder.c}, \file{fsort.c}, and \file{reorder.c} - \code{\link{forder}()} and related + \item\file{froll.c}, \file{frolladaptive.c}, and \file{frollR.c} - \code{\link{froll}()} and family + \item\file{fwrite.c} - \code{\link{fwrite}()} + \item\file{gsumm.c} - GForce in various places, see \link{GForce} + \item\file{nafill.c} - \code{\link{nafill}()} + \item\file{subset.c} - Used in \code{\link[=data.table]{[.data.table}} subsetting + \item\file{types.c} - Internal testing usage } } \examples{ diff --git a/man/setops.Rd b/man/setops.Rd index 395cdab339..dfa2572c74 100644 --- a/man/setops.Rd +++ b/man/setops.Rd @@ -23,16 +23,12 @@ fsetequal(x, y, all = TRUE) \arguments{ \item{x, y}{\code{data.table}s.} \item{all}{Logical. Default is \code{FALSE} and removes duplicate rows on the result. When \code{TRUE}, if there are \code{xn} copies of a particular row in \code{x} and \code{yn} copies of the same row in \code{y}, then: - \itemize{ - - \item{\code{fintersect} will return \code{min(xn, yn)} copies of that row.} - - \item{\code{fsetdiff} will return \code{max(0, xn-yn)} copies of that row.} - - \item{\code{funion} will return \code{xn+yn} copies of that row.} - - \item{\code{fsetequal} will return \code{FALSE} unless \code{xn == yn}.} - } + \itemize{ + \item\code{fintersect} will return \code{min(xn, yn)} copies of that row. + \item\code{fsetdiff} will return \code{max(0, xn-yn)} copies of that row. + \item\code{funion} will return \code{xn+yn} copies of that row. + \item\code{fsetequal} will return \code{FALSE} unless \code{xn == yn}. + } } } \details{ diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index c96cbef5c4..9fb3cb45a4 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -19,12 +19,12 @@ These symbols used in \code{j} are defined as follows. 
\itemize{ - \item{\code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}).} - \item{\code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable.} - \item{\code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}.} - \item{\code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. If used in \code{by} it corresponds to applying a function rowwise. } - \item{\code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc.} - \item{\code{.NGRP} is an integer, length 1, containing the number of groups. } + \item \code{.SD} is a \code{data.table} containing the \bold{S}ubset of \code{x}'s \bold{D}ata for each group, excluding any columns used in \code{by} (or \code{keyby}). + \item \code{.BY} is a \code{list} containing a length 1 vector for each item in \code{by}. This can be useful when \code{by} is not known in advance. The \code{by} variables are also available to \code{j} directly by name; useful for example for titles of graphs if \code{j} is a plot command, or to branch with \code{if()} depending on the value of a group variable. + \item \code{.N} is an integer, length 1, containing the number of rows in the group. This may be useful when the column names are not known in advance and for convenience generally. When grouping by \code{i}, \code{.N} is the number of rows in \code{x} matched to, for each row of \code{i}, regardless of whether \code{nomatch} is \code{NA} or \code{NULL}. It is renamed to \code{N} (no dot) in the result (otherwise a column called \code{".N"} could conflict with the \code{.N} variable, see FAQ 4.6 for more details and example), unless it is explicitly named; e.g., \code{DT[,list(total=.N),by=a]}. + \item \code{.I} is an integer vector equal to \code{seq_len(nrow(x))}. While grouping, it holds for each item in the group, its row location in \code{x}. This is useful to subset in \code{j}; e.g. \code{DT[, .I[which.max(somecol)], by=grp]}. If used in \code{by} it corresponds to applying a function rowwise. + \item \code{.GRP} is an integer, length 1, containing a simple group counter. 1 for the 1st group, 2 for the 2nd, etc. + \item \code{.NGRP} is an integer, length 1, containing the number of groups. } \code{.EACHI} is defined as \code{NULL} but its value is not used. 
Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. From bd5641207727c0c68340c5e269dd00f0274f1ab5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 23 Nov 2023 10:09:39 +0100 Subject: [PATCH 540/588] fix r-devel cran check links --- man/assign.Rd | 2 +- man/fwrite.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/assign.Rd b/man/assign.Rd index bb87a5221b..df255d395a 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -66,7 +66,7 @@ All of the following result in a friendly error (by design) : DT[, {col1 := 1L; col2 := 2L}] # Use the functional form, `:=`(), instead (see above). } -For additional resources, please read \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}. Also have a look at StackOverflow's \href{https://stackoverflow.com/search?q=\%5Bdata.table\%5D+reference}{data.table tag}. +For additional resources, please read \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}. Also have a look at StackOverflow's \href{https://stackoverflow.com/questions/tagged/data.table/}{data.table tag}. \code{:=} in \code{j} can be combined with all types of \code{i} (such as binary search), and all types of \code{by}. This a one reason why \code{:=} has been implemented in \code{j}. Please see \href{../doc/datatable-reference-semantics}{\code{vignette("datatable-reference-semantics")}} and also \code{FAQ 2.16} for analogies to SQL. diff --git a/man/fwrite.Rd b/man/fwrite.Rd index a4fcf788e3..42ae44a29a 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -64,7 +64,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writting raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } } \details{ -\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. +\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. To save space, \code{fwrite} prefers to write wide numeric values in scientific notation -- e.g. 
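For reference, a minimal sketch (an illustration only, not part of any patch in this series) of how the special symbols documented above behave in practice, assuming only that data.table is attached; the sample table and values are invented for the example:

  library(data.table)
  DT = data.table(grp = c("a","a","b","b","b"), val = 1:5)
  # .N is the per-group row count; .SD-style aggregation of the group's data
  DT[, .(rows = .N, total = sum(val)), by = grp]
  # .I holds each group's row locations in DT; here the row of each group's maximum
  DT[, .I[which.max(val)], by = grp]
  # by=.EACHI evaluates j once per row of i after the join
  X = data.table(grp = c("a","b"))
  DT[X, .(total = sum(val)), on = "grp", by = .EACHI]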
\code{10000000000} takes up much more space than \code{1e+10}. Most file readers (e.g. \code{\link{fread}}) understand scientific notation, so there's no fidelity loss. Like in base R, users can control this by specifying the \code{scipen} argument, which follows the same rules as \code{\link[base]{options}('scipen')}. \code{fwrite} will see how much space a value will take to write in scientific vs. decimal notation, and will only write in scientific notation if the latter is more than \code{scipen} characters wider. For \code{10000000000}, then, \code{1e+10} will be written whenever \code{scipen<6}. From 2ccfdc1fd4132de35321aa29382098de65b11f86 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Thu, 23 Nov 2023 18:50:58 +0100 Subject: [PATCH 541/588] updated mock file, closes #5754 (#5755) --- tests/knitr.Rout.mock | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/knitr.Rout.mock b/tests/knitr.Rout.mock index 1f17724c81..ea37b2c465 100644 --- a/tests/knitr.Rout.mock +++ b/tests/knitr.Rout.mock @@ -8,10 +8,11 @@ DT # yes ``` ``` -## x y -## 1: 1 4 -## 2: 2 5 -## 3: 3 6 +## x y +## +## 1: 1 4 +## 2: 2 5 +## 3: 3 6 ``` ```r @@ -20,10 +21,11 @@ print(DT[, z := 10:12]) # yes ``` ``` -## x y z -## 1: 1 4 10 -## 2: 2 5 11 -## 3: 3 6 12 +## x y z +## +## 1: 1 4 10 +## 2: 2 5 11 +## 3: 3 6 12 ``` ```r @@ -32,10 +34,11 @@ DT # yes ``` ``` -## x y z a -## 1: 1 4 10 1 -## 2: 2 5 11 1 -## 3: 3 6 12 1 +## x y z a +## +## 1: 1 4 10 1 +## 2: 2 5 11 1 +## 3: 3 6 12 1 ``` Some text. From f8f5976f0c3bebdebd1a94c3584698147128c6d6 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 25 Nov 2023 11:09:13 +0100 Subject: [PATCH 542/588] escape zlib dependent tests (#5759) --- DESCRIPTION | 1 - R/fwrite.R | 2 ++ inst/tests/tests.Rraw | 35 +++++++++++++++++++++-------------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 74a4b6e1c4..00f955c377 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,6 @@ Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown -SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table diff --git a/R/fwrite.R b/R/fwrite.R index c822b05678..54ef04ed06 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -115,3 +115,5 @@ fwrite = function(x, file="", append=FALSE, quote="auto", invisible() } +nozlib = function() identical(.Call(Cdt_zlib_version), "zlib header files were not found when data.table was compiled") + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d7ad5a99a6..bd3319f802 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -69,6 +69,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { which.first = data.table:::which.first which.last = data.table:::which.last `-.IDate` = data.table:::`-.IDate` + nozlib = data.table:::nozlib # Also, for functions that are masked by other packages, we need to map the data.table one. Or else, # the other package's function would be picked up. 
As above, we only need to do this because we desire @@ -9880,16 +9881,20 @@ test(1658.39, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\ test(1658.40, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") # fwrite compress -test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console -DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25)) -test(1658.421, fwrite(DT, file=f1<-tempfile(fileext=".gz"), verbose=TRUE), NULL, - output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1") # [12] for Windows where eolLen==2 -test(1658.422, fwrite(DT, file=f2<-tempfile()), NULL) -test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0) -if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform) -fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz -test(1658.44, file.info(f3)$size, file.info(f1)$size) -unlink(c(f1,f2,f3)) +if (nozlib()) { + test(1658.409, fwrite(data.table(a=1), file=tempfile(), compress="gzip"), error="header files were not found at the time data.table was compiled") +} else { + test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console + DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25)) + test(1658.421, fwrite(DT, file=f1<-tempfile(fileext=".gz"), verbose=TRUE), NULL, + output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1") # [12] for Windows where eolLen==2 + test(1658.422, fwrite(DT, file=f2<-tempfile()), NULL) + test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0) + if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform) + fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz + test(1658.441, file.info(f3)$size, file.info(f1)$size) + unlink(c(f1,f2,f3)) +} DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c("foo", "bar", "baz"))) test(1658.45, fwrite(DT), output=c("a,b","1,1|2|3|4","2,3.14|1e+12","3,foo|bar|baz")) DT[3,b:=as.raw(0:2)] @@ -9916,10 +9921,12 @@ test(1658.52, file.info(f1)$size, file.info(f2)$size) unlink(c(f1, f2)) # compression error -5 due to only 3 bytes (bom) in first block; #3599 -DT = data.table(l=letters, n=1:26) -test(1658.53, fwrite(DT, file=f<-tempfile(fileext=".gz"), bom=TRUE, col.names=FALSE), NULL) -if (test_R.utils) test(1658.54, fread(f), setnames(DT,c("V1","V2"))) -unlink(f) +if (!nozlib()) { + DT = data.table(l=letters, n=1:26) + test(1658.53, fwrite(DT, file=f<-tempfile(fileext=".gz"), bom=TRUE, col.names=FALSE), NULL) + if (test_R.utils) test(1658.54, fread(f), setnames(DT,c("V1","V2"))) + unlink(f) +} # complex column support for fwrite, part of #3690 DT = data.table(a=1:3, z=0:2 - (2:0)*1i) From b34ac7bfa1845713f2457fbfd6415573c592183c Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 25 Nov 2023 15:11:46 +0100 Subject: [PATCH 543/588] reviving GLCI * removed 3.4.4, 3.5.0 test jobs * updated urls to windows R binaries * cleanup old comments * remove docker builds * using new lighter images --- .ci/Dockerfile.in | 9 -- .ci/ci.R | 4 - .ci/publish.R | 2 +- .gitlab-ci.yml | 309 
+++++++++++++++++++++------------------------- 4 files changed, 143 insertions(+), 181 deletions(-) delete mode 100644 .ci/Dockerfile.in diff --git a/.ci/Dockerfile.in b/.ci/Dockerfile.in deleted file mode 100644 index 559bb9a40a..0000000000 --- a/.ci/Dockerfile.in +++ /dev/null @@ -1,9 +0,0 @@ -FROM registry.gitlab.com/jangorecki/dockerfiles/SRC_IMAGE_NAME - -MAINTAINER Jan Gorecki j.gorecki@wit.edu.pl - -COPY bus/build/cran/ /cran/ - -RUN Rscript -e 'install.packages("data.table", repos=file.path("file:","cran"))' - -CMD ["R"] diff --git a/.ci/ci.R b/.ci/ci.R index a165de8189..f3a4285660 100644 --- a/.ci/ci.R +++ b/.ci/ci.R @@ -185,7 +185,3 @@ function(pkgs, dp } -## set repositories for CI tests -if (as.logical(Sys.getenv("GITLAB_CI","false")) && identical(Sys.getenv("CI_PROJECT_NAME"), "data.table")) { - options("repos" = if (.Platform$OS.type == "windows") file.path("file://",getwd(),"bus/mirror-packages/cran") else file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE))) -} diff --git a/.ci/publish.R b/.ci/publish.R index 526d9bd80d..16521fc0e6 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -148,7 +148,7 @@ lib.copy <- function(lib.from, repodir="bus/integration/cran"){ pkg.copy <- function(pkg.from, lib.to) { pkg<-basename(pkg.from); dir.create(file.path(lib.to, pkg), recursive=TRUE) - lib.dirs<-intersect(c("html","doc"), all.lib.dirs<-list.dirs(pkg.from, full.names=FALSE)) + lib.dirs<-intersect(c("help","html","doc"), all.lib.dirs<-list.dirs(pkg.from, full.names=FALSE)) ans1<-setNames(file.copy(file.path(pkg.from, lib.dirs), file.path(lib.to, pkg), recursive=TRUE), lib.dirs) lib.files<-setdiff(list.files(pkg.from), all.lib.dirs) ans2<-setNames(file.copy(file.path(pkg.from, lib.files), file.path(lib.to, pkg)), lib.files) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f0c4037933..18f821b43d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,9 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. 
- R_REL_VERSION: "4.2" - R_DEVEL_VERSION: "4.3" - R_OLDREL_VERSION: "4.1" + R_REL_VERSION: "4.3" + R_DEVEL_VERSION: "4.4" + R_OLDREL_VERSION: "4.2" stages: - dependencies @@ -24,83 +24,87 @@ stages: paths: - bus -mirror-packages: ## mirror all recursive dependencies, source and win.binary of data.table suggests from DESCRIPTION +## mirror packages +# download all recursive dependencies once to be used across multiple test jobs +# sources and binaries for r-release, r-devel and r-oldrel +# cache between runs +mirror-packages: stage: dependencies tags: - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev + image: registry.gitlab.com/jangorecki/dockerfiles/r-base-minimal cache: paths: - - bus/$CI_BUILD_NAME/cran + - bus/$CI_JOB_NAME/cran script: - echo 'source(".ci/ci.R")' >> .Rprofile - - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib + - mkdir -p bus/$CI_JOB_NAME/cran/src/contrib - Rscript -e 'mirror.packages(dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran")' - - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -# mirror-other-packages: ## mirror integration suggests from pkgs at the top of inst/tests/other.Rraw; off now #5274 -# stage: dependencies -# tags: -# - linux -# image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev -# cache: -# paths: -# - bus/$CI_BUILD_NAME/cran -# script: -# - echo 'source(".ci/ci.R")' >> .Rprofile -# - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib -# - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); mirror.packages(pkgs, repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-other-packages/cran")' -# <<: *artifacts - -build: ## build data.table sources as tar.gz archive +## build +# sources as tar.gz archive +# build vignettes +build: stage: build tags: - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-builder + image: registry.gitlab.com/jangorecki/dockerfiles/r-base ## r-base-gcc after rstudio/markdown#108 needs: ["mirror-packages"] before_script: - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus - sed -i '/^[[:space:]]*$/d' ./DESCRIPTION ## make last line end abruptly; i.e. without a final \n - - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION + - echo "Revision:" $CI_COMMIT_SHA >> ./DESCRIPTION script: - R CMD build . - - mkdir -p bus/$CI_BUILD_NAME/cran/src/contrib - - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib/. + - mkdir -p bus/$CI_JOB_NAME/cran/src/contrib + - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/cran/src/contrib/. 
- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/build/cran"), fields="Revision", addFiles=TRUE)' - - rm bus/$CI_BUILD_NAME/cran/src/contrib/PACKAGES.rds ## fallback to PACKAGES dcf so available.packages:3.4.4 works <<: *artifacts +## install deps aliases .test-install-deps: &install-deps - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="most"), quiet=TRUE)' + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), repos=file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE)), quiet=TRUE)' +.test-install-deps-win: &install-deps-win + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='all'), repos=file.path('file://',getwd(),'bus/mirror-packages/cran'), quiet=TRUE)" +## copy data.table tar.gz from bus R repo to current directory .test-cp-src: &cp-src - cp $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) . .test-cp-src-win: &cp-src-win - cp.exe $(ls.exe -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head.exe -n 1) . +## move data.table tar.gz to bus .test-mv-src: &mv-src - - mkdir -p bus/$CI_BUILD_NAME && mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME + - mkdir -p bus/$CI_JOB_NAME && mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME .test-mv-src-win: &mv-src-win - - mkdir.exe -p bus/$CI_BUILD_NAME; mv.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) bus/$CI_BUILD_NAME + - mkdir.exe -p bus/$CI_JOB_NAME; mv.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) bus/$CI_JOB_NAME + +## move data.table binaries to bus R repo +.test-mv-bin-win: &mv-bin-win + - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION +## remove data.table tar.gz .test-rm-src: &rm-src - rm $(ls -1t data.table_*.tar.gz | head -n 1) .test-rm-src-win: &rm-src-win - rm.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) -.test-mv-bin-win: &mv-bin-win - - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION - +## install R on windows .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/R-4.2.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait - # see #5198 for discussion about the https link used above; it will break each time R is released and the version number will need to be updated + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.3.2/R-4.3.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win - - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.3/R-4.1.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.2.3/R-4.2.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + +## install Rtools on windows +.test-install-rtools42-win: 
&install-rtools42-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait +.test-install-rtools43-win: &install-rtools43-win + - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5863-5818.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait .test-template: &test stage: test @@ -112,21 +116,6 @@ build: ## build data.table sources as tar.gz archive tags: - linux -.test-cran-lin-template: &test-cran-lin - <<: *test-lin - variables: - _R_CHECK_CRAN_INCOMING_: "TRUE" - _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" - before_script: - - *install-deps - - *cp-src - - rm -r bus - script: - - *mv-src - - cd bus/$CI_BUILD_NAME - - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - - *rm-src - .test-win-template: &test-win <<: *test tags: @@ -138,20 +127,26 @@ build: ## build data.table sources as tar.gz archive # tags: # - macosx -test-rel-lin: ## most comprehensive tests, force all suggests, also integration tests, using gcc -O3 -flto -fno-common -Wunused-result +## most comprehensive tests +# force all suggests +# flags: gcc -O3 -flto -fno-common -Wunused-result +# tests for compilation warnings +# measure memory usage during tests +test-rel-lin: <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-builder - needs: ["mirror-packages","build"] # "mirror-other-packages" + image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table + needs: ["mirror-packages","build"] + allow_failure: true ## temp workaround #5760 variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" - TEST_DATA_TABLE_WITH_OTHER_PACKAGES: "FALSE" #5274 + TEST_DATA_TABLE_MEMTEST: "1" before_script: - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), quiet=TRUE)' ## does seem to be needed despite 'needs mirror-packages' - ## - Rscript -e 'eval(parse("inst/tests/other.Rraw", n=1L)); install.packages(pkgs, quiet=TRUE, repos=c(getOption("repos"), file.path("file:", normalizePath("bus/mirror-other-packages/cran", mustWork=FALSE))))' + - apt-get update -q && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## remove after #5749 + - *install-deps - *cp-src - rm -r bus - mkdir -p ~/.R @@ -159,16 +154,20 @@ test-rel-lin: ## most comprehensive tests, force all suggests, also integration - echo 'CXXFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars script: - *mv-src - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - (! 
grep "warning:" data.table.Rcheck/00install.out) -test-rel-vanilla-lin: ## minimal, no suggested deps, no vignettes or manuals, measure memory, using gcc -O0 -fno-openmp +## vanilla minimal +# no suggested deps +# no vignettes or manuals +# no openmp +# flags: gcc -O0 -fno-openmp +test-rel-vanilla-lin: <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-base-dev - variables: - TEST_DATA_TABLE_MEMTEST: "1" + image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc + allow_failure: true ## temp workaround #5484 before_script: - *cp-src - rm -r bus @@ -177,36 +176,46 @@ test-rel-vanilla-lin: ## minimal, no suggested deps, no vignettes or manuals, me - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars script: - *mv-src - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R CMD check --no-manual --ignore-vignettes $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src -test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual thus not from cran-lin template +## R-release on Linux +# strict checks for 0 NOTEs +# extra NOTEs check and build pdf manual thus not from cran-lin template +test-rel-cran-lin: <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-builder + image: registry.gitlab.com/jangorecki/dockerfiles/r-base variables: _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 - _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0 + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes + _R_CHECK_PKG_SIZES_THRESHOLD_: "7" ## MB 'checking installed package size' NOTE before_script: + - apt-get update -q && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## temp workaround #5749 - *install-deps - *cp-src - rm -r bus - mkdir -p ~/.R - - echo 'CFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2'> ~/.R/Makevars ## -g0 because -g increases datatable.so size from 0.5MB to 1.5MB and breaches 'installed package size <= 5MB' note - - echo 'CXXFLAGS=-g0 -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars script: - *mv-src - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " but ", shQuote(l)) else q("no")' -test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure +## R-devel on Linux +# TODO: --enable-strict-barrier --disable-long-double +# tests for compilation warnings +# tests for new notes +# thus allow_failure +test-dev-cran-lin: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel - allow_failure: true + allow_failure: true ## to not be blocked by changes in r-devel variables: _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" @@ 
-218,84 +227,95 @@ test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-d - rm -r bus script: - *mv-src - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - (! grep "warning:" data.table.Rcheck/00install.out) - - >- + - >- ## this likely need an update but check fails now on complex NA so CI is not reaching here anyway Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, installed package size, top-level files) but ", shQuote(l)) else q("no")' -test-310-cran-lin: ## R-3.1.0 on Linux, stated dependency of R - <<: *test-cran-lin +## R 3.1.0 +# stated dependency on R +test-310-cran-lin: image: registry.gitlab.com/jangorecki/dockerfiles/r-3.1.0 + <<: *test-lin + before_script: + - *install-deps + - *cp-src + - rm -r bus + script: + - *mv-src + - cd bus/$CI_JOB_NAME + - R CMD check --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) + - *rm-src -test-344-cran-lin: ## R-3.4.4 on Linux, last R non-altrep version - <<: *test-cran-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-3.4.4 - -test-350-cran-lin: ## R-3.5.0 on Linux, first R altrep version - <<: *test-cran-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-3.5.0 - -test-rel-win: ## R-release on Windows, test and build binaries +## R-release on Windows +# test and build binaries +test-rel-win: <<: *test-win variables: R_VERSION: "$R_REL_VERSION" before_script: - *install-r-rel-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait - - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most'), quiet=TRUE)" + - *install-rtools43-win + - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" + - *install-deps-win - *cp-src-win - rm.exe -r bus script: - *mv-src-win - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R.exe CMD check --no-manual $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - *rm-src-win - *mv-bin-win -test-dev-win: ## R-devel on Windows; see #5294 for changes in Dec 2021 related to UCRT and Rtools42 +## R-devel on Windows +# test and build binaries +test-dev-win: <<: *test-win variables: R_VERSION: "$R_DEVEL_VERSION" + allow_failure: true ## temp workaround #5748 before_script: - *install-r-devel-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5493-5475.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait + - *install-rtools43-win - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 + - *install-deps-win - *cp-src-win - rm.exe -r bus script: - *mv-src-win - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - *rm-src-win - *mv-bin-win -test-old-win: 
## R-oldrel on Windows +## R-oldrel on Windows +# test and build binaries +test-old-win: <<: *test-win variables: R_VERSION: "$R_OLDREL_VERSION" before_script: - *install-r-oldrel-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait - ## rtools42 doesn't support 32bit so oldrel-win (currently R 4.1) needs rtools40 - - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='most', exclude=c('knitr','rmarkdown')), quiet=TRUE)" ## exclude= for #5294 + - *install-rtools42-win + - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" + - *install-deps-win - *cp-src-win - rm.exe -r bus script: - *mv-src-win - - cd bus/$CI_BUILD_NAME + - cd bus/$CI_JOB_NAME - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - *rm-src-win - *mv-bin-win -#test-rel-mac: ## R-release on MacOS, no macosx runner yet +## R-release on MacOS +# no macosx runner set yet +#test-rel-mac: # <<: *test-mac # variables: # R_VERSION: "$R_REL_VERSION" @@ -305,7 +325,7 @@ test-old-win: ## R-oldrel on Windows # - rm -r bus # script: # - *mv-src -# - cd bus/$CI_BUILD_NAME +# - cd bus/$CI_JOB_NAME # - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) # - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) # - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_VERSION @@ -313,23 +333,28 @@ test-old-win: ## R-oldrel on Windows # - *rm-src # - *mv-bin-mac -integration: ## merging all artifacts to produce single R repository, documentation and website +## integrate artifacts +# merging package tarballs and binaries into single R repository +# rendering documentation +# setting up CRAN-like structure +# generating pkgdown website +integration: stage: integration image: registry.gitlab.com/jangorecki/dockerfiles/r-pkgdown tags: - linux only: - master - - tags - needs: ["mirror-packages","build","test-rel-lin","test-rel-cran-lin","test-dev-cran-lin","test-rel-vanilla-lin","test-310-cran-lin","test-344-cran-lin","test-350-cran-lin","test-rel-win","test-dev-win","test-old-win"] + needs: ["mirror-packages","build","test-rel-lin","test-rel-cran-lin","test-dev-cran-lin","test-rel-vanilla-lin","test-310-cran-lin","test-rel-win","test-dev-win","test-old-win"] script: + - R --version - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' ## html manual, vignettes, repos, cran_web, cran_checks - echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile ## list of available test-* jobs dynamically based on bus/test-* directories - Rscript -e 'cat("\ntest.jobs <- c(\n"); cat(paste0(" \"",list.files("bus",pattern="^test-"),"\" = \"data.table\""), sep=",\n"); cat(")\n")' >> .Rprofile - Rscript -e 'sapply(names(test.jobs), check.test, pkg="data.table", simplify=FALSE)' - - mkdir -p bus/$CI_BUILD_NAME + - mkdir -p bus/$CI_JOB_NAME ## delete any existing non-dev version of data.table - rm -f bus/mirror-packages/cran/src/contrib/data.table_*.tar.gz - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_REL_VERSION/data.table_*.zip @@ -339,14 +364,14 @@ integration: ## merging all artifacts to produce single R repository, documentat #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEVEL_VERSION/data.table_*.tgz #- rm -f 
bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_OLDREL_VERSION/data.table_*.tgz ## merge mirror-packages and R devel packages - - mv bus/mirror-packages/cran bus/$CI_BUILD_NAME/ + - mv bus/mirror-packages/cran bus/$CI_JOB_NAME/ ## publish package sources - - mkdir -p bus/$CI_BUILD_NAME/cran/library bus/$CI_BUILD_NAME/cran/doc - - mv $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) bus/$CI_BUILD_NAME/cran/src/contrib + - mkdir -p bus/$CI_JOB_NAME/cran/library bus/$CI_JOB_NAME/cran/doc + - mv $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/cran/src/contrib - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="source"), type="source", fields="Revision", addFiles=TRUE)' ## publish binaries - Rscript -e 'move.bin("test-rel-win", Sys.getenv("R_REL_VERSION"), os.type="windows")' - - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_VERSION"), os.type="windows")' + - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_VERSION"), os.type="windows", silent=TRUE)' - Rscript -e 'move.bin("test-old-win", Sys.getenv("R_OLDREL_VERSION"), os.type="windows")' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_REL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEVEL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' @@ -385,67 +410,17 @@ integration: ## merging all artifacts to produce single R repository, documentat - mv pkgdown/* bus/integration/cran/ ## cleanup artifacts from other jobs - mkdir tmpbus - - mv bus/$CI_BUILD_NAME tmpbus + - mv bus/$CI_JOB_NAME tmpbus - rm -r bus - mv tmpbus bus <<: *artifacts -.docker-template: &docker - stage: deploy - tags: - - linux - image: docker - services: - - docker:dind - needs: - - job: build - - job: integration - artifacts: false - before_script: - - sed "s/SRC_IMAGE_NAME/$SRC_IMAGE_NAME/" < .ci/Dockerfile.in > Dockerfile - - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY - script: - - docker build --pull -t "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" -f Dockerfile . 
- - docker run --rm "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" Rscript -e 'cat(R.version.string, "\ndata.table revision", read.dcf(system.file("DESCRIPTION", package="data.table"), fields="Revision")[[1L]], "\n"); require(data.table); test.data.table()' - - docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG" - -docker-r-release: ## data.table on R-release - only: - - master - variables: - SRC_IMAGE_NAME: "r-base-dev" - IMAGE_NAME: "r-release" - IMAGE_TAG: "latest" - <<: *docker - -docker-r-release-builder: ## data.table on R-release extended for Rmd vignettes build dependencies - only: - - master - variables: - SRC_IMAGE_NAME: "r-builder" - IMAGE_NAME: "r-release-builder" - IMAGE_TAG: "latest" - <<: *docker - -docker-r-devel: ## data.table on R-devel - only: - - master - variables: - SRC_IMAGE_NAME: "r-devel" - IMAGE_NAME: "r-devel" - IMAGE_TAG: "latest" - <<: *docker - -docker-tags: ## data.table on R-release fixed version images - only: - - tags - variables: - SRC_IMAGE_NAME: "r-base-dev" - IMAGE_NAME: "r-release" - IMAGE_TAG: $CI_COMMIT_TAG - <<: *docker - -pages: ## publish R repository, test jobs summaries, html documentation of all packages in repo, pkgdown +## publish +# R repository +# test jobs summaries +# html documentation of all packages in repo +# pkgdown website +pages: stage: deploy environment: production tags: @@ -458,7 +433,7 @@ pages: ## publish R repository, test jobs summaries, html documentation of all p - mkdir -p public - cp -r bus/integration/cran/* public - cat public/src/contrib/PACKAGES - artifacts: ## publish only when no failure + artifacts: expire_in: 2 weeks paths: - public From 46ee05bfa1cba9cb4ac31096352e6d056be07385 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sat, 25 Nov 2023 15:12:32 +0100 Subject: [PATCH 544/588] update r-lib actions setup (#5632) --- .github/workflows/test-coverage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index ba1f94fded..3e59198933 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 - uses: r-lib/actions/setup-pandoc@v1 From a63a89f844f5c3b817d5ac639b2f5b4d6d7c5c3a Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 25 Nov 2023 16:00:48 +0100 Subject: [PATCH 545/588] no nanotime anymore (#5761) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 00f955c377..405b7a0095 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Version: 1.14.9 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
License: MPL-2.0 | file LICENSE URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table From 37e5521f835f856ea3f4fee9e1f2caf463547c01 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 25 Nov 2023 16:13:37 +0100 Subject: [PATCH 546/588] Revert "update_dev_pkg uses GH for R repo (#5720)" (#5762) This reverts commit 94e8fbe448f9dcd83fe6f2751dbe36b05b404ee0. --- R/devel.R | 2 +- man/update_dev_pkg.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/devel.R b/R/devel.R index df77eb0e0c..8bd7a1466a 100644 --- a/R/devel.R +++ b/R/devel.R @@ -17,7 +17,7 @@ dcf.repo = function(pkg, repo, field, type) { dcf[dcf[,"Package"]==pkg, field][[1L]] } -update_dev_pkg = function(object="data.table", repo="https://Rdatatable.github.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { +update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { # this works for any package, not just data.table pkg = object # perform package upgrade when new Revision present diff --git a/man/update_dev_pkg.Rd b/man/update_dev_pkg.Rd index 9914138c82..3db5b98316 100644 --- a/man/update_dev_pkg.Rd +++ b/man/update_dev_pkg.Rd @@ -5,7 +5,7 @@ Downloads and installs latest development version only when a new commit is available which has also passed all tests. Defaults are set to update \code{data.table}, other packages can be used as well. Their repository has to include git commit information in PACKAGES file. } \usage{update_dev_pkg(object="data.table", - repo="https://Rdatatable.github.io/data.table", + repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } \arguments{ From 74a749be94406553e3d8daad8497014b0beabf9e Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 25 Nov 2023 16:17:32 +0100 Subject: [PATCH 547/588] not building docker images anymore (#5763) --- .ci/publish.R | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.ci/publish.R b/.ci/publish.R index 16521fc0e6..ec35fe43f3 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -102,15 +102,6 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release","r-oldrel"), bin_ver=c(r_devel_ver, r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), sprintf(" macOS binaries: %s ", format.bins(ver=c("r-release","r-oldrel"), bin_ver=c(r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) ) - if (pkg=="data.table") { ## docker images - registry = Sys.getenv("CI_REGISTRY", "registry.gitlab.com") - namespace = Sys.getenv("CI_PROJECT_NAMESPACE", "Rdatatable") - project = Sys.getenv("CI_PROJECT_NAME", "data.table") - images = c("r-release","r-devel","r-release-builder") - images.title = c("Base R release", "Base R development", "R release package builder") - tags = rep("latest", 3) - docker.dl = sprintf(" %s:
docker pull %s/%s/%s/%s:%s
", images.title, tolower(registry), tolower(namespace), tolower(project), tolower(images), tags) - } index.file = file.path(repodir, "web/packages", pkg, "index.html") if (!dir.exists(dirname(index.file))) dir.create(dirname(index.file), recursive=TRUE) writeLines(c( @@ -131,11 +122,6 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { sprintf("", pkg), tbl.dl, "
", - if (pkg=="data.table") - c("

Docker images:

", - sprintf("", pkg), - docker.dl, - "
"), "", "" ), index.file) From 50a3dc3744a8039b2215ad6a348bf2aea1e5b3d7 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 25 Nov 2023 17:12:48 +0100 Subject: [PATCH 548/588] disable memtest in CI (#5765) --- .gitlab-ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 18f821b43d..95c02a46ad 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -131,7 +131,6 @@ build: # force all suggests # flags: gcc -O3 -flto -fno-common -Wunused-result # tests for compilation warnings -# measure memory usage during tests test-rel-lin: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table @@ -143,7 +142,6 @@ test-rel-lin: _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" - TEST_DATA_TABLE_MEMTEST: "1" before_script: - apt-get update -q && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## remove after #5749 - *install-deps @@ -397,7 +395,7 @@ integration: - mv /tmp/opencran/doc bus/integration/cran/ ## library html manual, vignettes - Rscript -e 'lib.copy(lib.from="/tmp/opencran/library")' - ## web/checks/$pkg/$job 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png + ## web/checks/$pkg/$job 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png ## memtest not available for now #5764 - Rscript -e 'sapply(names(test.jobs), check.copy, simplify=FALSE)' ## web/packages/$pkg/$pkg.pdf - Rscript -e 'pdf.copy("data.table", "test-rel-lin")' From a6fe882cb2088209c201da09cb3c7a59e30c8745 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 27 Nov 2023 09:33:10 +0100 Subject: [PATCH 549/588] lto warning fix (#5766) * attempt to resolve lto compilation warning #5760 * do not allow failure anymore * another attempt for lto warning * try to fix another lto warning * move comment to related line, thx Michael --- .gitlab-ci.yml | 7 +++---- src/chmatch.c | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 95c02a46ad..80fa5d00a7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -129,13 +129,12 @@ build: ## most comprehensive tests # force all suggests -# flags: gcc -O3 -flto -fno-common -Wunused-result +# flags: gcc -O3 -flto=auto -fno-common -Wunused-result # tests for compilation warnings test-rel-lin: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table needs: ["mirror-packages","build"] - allow_failure: true ## temp workaround #5760 variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" @@ -148,8 +147,8 @@ test-rel-lin: - *cp-src - rm -r bus - mkdir -p ~/.R - - echo 'CFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O3 -flto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars script: - *mv-src - cd bus/$CI_JOB_NAME diff --git a/src/chmatch.c b/src/chmatch.c index a091e646f0..b3ac5d818c 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -95,7 +95,8 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch // For example: A,B,C,B,D,E,A,A => A(TL=1),B(2),C(3),D(4),E(5) => 
dupMap 1 2 3 5 6 | 8 7 4 // dupLink 7 8 | 6 (blank=0) int *counts = (int *)calloc(nuniq, sizeof(int)); - int *map = (int *)calloc(tablelen+nuniq, sizeof(int)); // +nuniq to store a 0 at the end of each group + unsigned int mapsize = tablelen+nuniq; // lto compilation warning #5760 // +nuniq to store a 0 at the end of each group + int *map = (int *)calloc(mapsize, sizeof(int)); if (!counts || !map) { // # nocov start for (int i=0; i Date: Mon, 27 Nov 2023 10:15:53 +0100 Subject: [PATCH 550/588] proper fix for #5753 to make zlib fully optional (#5770) --- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 6 +++--- src/data.table.h | 1 + src/init.c | 1 + src/utils.c | 7 +++++++ 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 54ef04ed06..e1484b9e3c 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -115,5 +115,5 @@ fwrite = function(x, file="", append=FALSE, quote="auto", invisible() } -nozlib = function() identical(.Call(Cdt_zlib_version), "zlib header files were not found when data.table was compiled") +haszlib = function() .Call(Cdt_has_zlib) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bd3319f802..59ca6aabd6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -69,7 +69,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { which.first = data.table:::which.first which.last = data.table:::which.last `-.IDate` = data.table:::`-.IDate` - nozlib = data.table:::nozlib + haszlib = data.table:::haszlib # Also, for functions that are masked by other packages, we need to map the data.table one. Or else, # the other package's function would be picked up. As above, we only need to do this because we desire @@ -9881,7 +9881,7 @@ test(1658.39, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\ test(1658.40, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") # fwrite compress -if (nozlib()) { +if (!haszlib()) { test(1658.409, fwrite(data.table(a=1), file=tempfile(), compress="gzip"), error="header files were not found at the time data.table was compiled") } else { test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console @@ -9921,7 +9921,7 @@ test(1658.52, file.info(f1)$size, file.info(f2)$size) unlink(c(f1, f2)) # compression error -5 due to only 3 bytes (bom) in first block; #3599 -if (!nozlib()) { +if (haszlib()) { DT = data.table(l=letters, n=1:26) test(1658.53, fwrite(DT, file=f<-tempfile(fileext=".gz"), bom=TRUE, col.names=FALSE), NULL) if (test_R.utils) test(1658.54, fread(f), setnames(DT,c("V1","V2"))) diff --git a/src/data.table.h b/src/data.table.h index c4458e8999..4c9df894c0 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -333,6 +333,7 @@ SEXP initLastUpdated(SEXP); SEXP allNAR(SEXP); SEXP test_dt_win_snprintf(void); SEXP dt_zlib_version(void); +SEXP dt_has_zlib(void); SEXP startsWithAny(SEXP, SEXP, SEXP); SEXP convertDate(SEXP, SEXP); SEXP fastmean(SEXP); diff --git a/src/init.c b/src/init.c index 2cffabd34b..e374eb6e41 100644 --- a/src/init.c +++ b/src/init.c @@ -136,6 +136,7 @@ R_CallMethodDef callMethods[] = { {"CcoerceAs", (DL_FUNC) &coerceAs, -1}, {"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1}, {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, +{"Cdt_has_zlib", (DL_FUNC)&dt_has_zlib, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, 
{"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, diff --git a/src/utils.c b/src/utils.c index fa10fd97ca..3dfd8bcc69 100644 --- a/src/utils.c +++ b/src/utils.c @@ -379,6 +379,13 @@ SEXP dt_zlib_version(void) { #endif return ScalarString(mkChar(out)); } +SEXP dt_has_zlib(void) { +#ifndef NOZLIB + return ScalarLogical(1); +#else + return ScalarLogical(0); +#endif +} SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { // for is_url in fread.R added in #5097 From ba2f26bef5708a281ad4b064a2c8f464c3ff05d5 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 27 Nov 2023 10:56:08 +0100 Subject: [PATCH 551/588] some extra dev-related ignore dir/file (#5771) --- .Rbuildignore | 1 + .gitignore | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 1e99a9004b..5f47bbacdb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -39,3 +39,4 @@ ^pkgdown$ ^lib$ ^library$ +^devwd$ diff --git a/.gitignore b/.gitignore index 74c9043c04..559df7b9de 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,8 @@ vignettes/plots/figures .Renviron lib library +devwd +dev.R *.csv *.csvy *.RDS From f032dd1ff3976fe9e136fb8bbb57a2f4ec6fed87 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 1 Dec 2023 20:25:53 +0100 Subject: [PATCH 552/588] fix broken Sean Lahman link (#5776) Closes #5767 Co-authored-by: Tyson Barrett --- vignettes/datatable-sd-usage.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index f84fd6ea63..60d5c07c1d 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -34,7 +34,7 @@ The simpler usage of `.SD` is for column subsetting (i.e., when `.SDcols` is spe ## Loading and Previewing Lahman Data -To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://www.seanlahman.com/baseball-archive/statistics/). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. +To give this a more real-world feel, rather than making up data, let's load some data sets about baseball from the [Lahman database](https://github.com/cdalzell/Lahman). In typical R usage, we'd simply load these data sets from the `Lahman` R package; in this vignette, we've pre-downloaded them directly from the package's GitHub page instead. ```{r download_lahman} load('Teams.RData') @@ -46,7 +46,7 @@ setDT(Pitching) Pitching ``` -Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://www.seanlahman.com/files/database/readme2017.txt) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. +Readers up on baseball lingo should find the tables' contents familiar; `Teams` records some statistics for a given team in a given year, while `Pitching` records statistics for a given pitcher in a given year. Please do check out the [documentation](https://github.com/cdalzell/Lahman) and explore the data yourself a bit before proceeding to familiarize yourself with their structure. 
# `.SD` on Ungrouped Data From cbb0d075ce2c3cf3fc44359c7a940021cb37d0a0 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 2 Dec 2023 13:38:34 +0100 Subject: [PATCH 553/588] fix print format, closes #5778 (#5779) --- src/ijoin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ijoin.c b/src/ijoin.c index 96a9deae4f..b4f0a4b081 100644 --- a/src/ijoin.c +++ b/src/ijoin.c @@ -322,7 +322,7 @@ SEXP overlaps(SEXP ux, SEXP imatches, SEXP multArg, SEXP typeArg, SEXP nomatchAr ++totlen; } break; - default: error(_("Internal error: unknown type in mult=ALL in overlaps: %d"), mult, type); // #nocov + default: error(_("Internal error: unknown type in mult=ALL in overlaps: %d"), type); // #nocov } } else totlen = rows; end1 = clock() - start; From 67fb763662b59f04bd6037e3b80f2c02f8e4c87c Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 2 Dec 2023 15:26:18 +0100 Subject: [PATCH 554/588] cherry picked #5769 to master (#5780) --- src/assign.c | 7 ++++--- src/fread.c | 2 +- src/fsort.c | 6 +++--- src/fwrite.c | 8 ++++---- src/gsumm.c | 6 +++--- src/init.c | 31 ++++++++++++++++--------------- src/reorder.c | 4 ++-- src/snprintf.c | 2 +- 8 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/assign.c b/src/assign.c index 61f38a5548..fc960132d1 100644 --- a/src/assign.c +++ b/src/assign.c @@ -470,7 +470,8 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as // strong error message for now. else if (TRUELENGTH(names) != oldtncol) - error(_("Internal error: selfrefnames is ok but tl names [%d] != tl [%d]"), TRUELENGTH(names), oldtncol); // # nocov + // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didnt work + error(_("Internal error: selfrefnames is ok but tl names [%ld] != tl [%d]"), TRUELENGTH(names), oldtncol); // # nocov SETLENGTH(dt, oldncol+LENGTH(newcolnames)); SETLENGTH(names, oldncol+LENGTH(newcolnames)); for (int i=0; i=tt[i+1]) - error(_("Internal error: %d column numbers to delete not now in strictly increasing order. No-dups were checked earlier.")); // # nocov + error(_("Internal error: %d column numbers to delete not now in strictly increasing order. No-dups were checked earlier."), i); // # nocov } for (int i=tt[0], j=1, k=tt[0]+1; i0) for (int j=0; jCT_EMPTY) { args.header=true; - if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"), + if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %"PRId64" sample rows\n"), j+1, typeName[type[j]], sampleLines); break; } diff --git a/src/fsort.c b/src/fsort.c index 6dbb85d550..2618ec577b 100644 --- a/src/fsort.c +++ b/src/fsort.c @@ -165,7 +165,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) { int MSBNbits = maxBit > 15 ? 
16 : maxBit+1; // how many bits make up the MSB int shift = maxBit + 1 - MSBNbits; // the right shift to leave the MSB bits remaining size_t MSBsize = 1LL< 65,536) - if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n"), maxBit, MSBNbits, shift, MSBsize); + if (verbose) Rprintf(_("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%zu\n"), maxBit, MSBNbits, shift, MSBsize); uint64_t *counts = (uint64_t *)R_alloc(nBatch*MSBsize, sizeof(uint64_t)); memset(counts, 0, nBatch*MSBsize*sizeof(uint64_t)); @@ -242,11 +242,11 @@ SEXP fsort(SEXP x, SEXP verboseArg) { if (verbose) { Rprintf(_("Top 20 MSB counts: ")); for(int i=0; i0 && msbCounts[order[MSBsize-1]] < 2) MSBsize--; if (verbose) { - Rprintf(_("%d by excluding 0 and 1 counts\n"), MSBsize); + Rprintf(_("%zu by excluding 0 and 1 counts\n"), MSBsize); } bool failed=false, alloc_fail=false, non_monotonic=false; // shared bools only ever assigned true; no need for atomic or critical assign diff --git a/src/fwrite.c b/src/fwrite.c index c5f9772124..ef8b822e9b 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -714,7 +714,7 @@ void fwriteMain(fwriteMainArgs args) } if (headerLen) { char *buff = malloc(headerLen); - if (!buff) STOP(_("Unable to allocate %d MiB for header: %s"), headerLen / 1024 / 1024, strerror(errno)); + if (!buff) STOP(_("Unable to allocate %zu MiB for header: %s"), headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; if (args.bom) {*ch++=(char)0xEF; *ch++=(char)0xBB; *ch++=(char)0xBF; } // 3 appears above (search for "bom") memcpy(ch, args.yaml, yamlLen); @@ -753,7 +753,7 @@ void fwriteMain(fwriteMainArgs args) char *zbuff = malloc(zbuffSize); if (!zbuff) { free(buff); // # nocov - STOP(_("Unable to allocate %d MiB for zbuffer: %s"), zbuffSize / 1024 / 1024, strerror(errno)); // # nocov + STOP(_("Unable to allocate %zu MiB for zbuffer: %s"), zbuffSize / 1024 / 1024, strerror(errno)); // # nocov } size_t zbuffUsed = zbuffSize; ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (size_t)(ch-buff)); @@ -820,7 +820,7 @@ void fwriteMain(fwriteMainArgs args) char *buffPool = malloc(nth*(size_t)buffSize); if (!buffPool) { // # nocov start - STOP(_("Unable to allocate %d MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), + STOP(_("Unable to allocate %zu MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), (size_t)buffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } @@ -831,7 +831,7 @@ void fwriteMain(fwriteMainArgs args) if (!zbuffPool) { // # nocov start free(buffPool); - STOP(_("Unable to allocate %d MB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), + STOP(_("Unable to allocate %zu MB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), (size_t)zbuffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } diff --git a/src/gsumm.c b/src/gsumm.c index ed4169ff58..2047c61cd9 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -86,8 +86,8 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) { // TODO: enable stress-test mode in tests only (#3205) which can be turned off by default in release to decrease overhead on small data // if that is established to be biting (it may be fine). 
if (nBatch<1 || batchSize<1 || lastBatchSize<1) { - error(_("Internal error: nrow=%d ngrp=%d nbit=%d bitshift=%d highSize=%d nBatch=%d batchSize=%d lastBatchSize=%d\n"), // # nocov - nrow, ngrp, nb, bitshift, highSize, nBatch, batchSize, lastBatchSize); // # nocov + error(_("Internal error: nrow=%d ngrp=%d nbit=%d bitshift=%d highSize=%zu nBatch=%zu batchSize=%zu lastBatchSize=%zu\n"), // # nocov + nrow, ngrp, nb, bitshift, highSize, nBatch, batchSize, lastBatchSize); // # nocov } // initial population of g: #pragma omp parallel for num_threads(getDTthreads(ngrp, false)) @@ -1116,7 +1116,7 @@ SEXP gprod(SEXP x, SEXP narmArg) { //clock_t start = clock(); if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); long double *s = malloc(ngrp * sizeof(long double)); - if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); + if (!s) error(_("Unable to allocate %d * %lu bytes for gprod"), ngrp, sizeof(long double)); for (int i=0; i8) error(_("Pointers are %d bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); + if (sizeof(char *)>8) error(_("Pointers are %lu bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); // One place we need the largest sizeof is the working memory malloc in reorder.c } @@ -177,23 +177,24 @@ void attribute_visible R_init_data_table(DllInfo *info) const char *msg = _("... failed. Please forward this message to maintainer('data.table')."); if ((int)NA_INTEGER != (int)INT_MIN) error(_("Checking NA_INTEGER [%d] == INT_MIN [%d] %s"), NA_INTEGER, INT_MIN, msg); if ((int)NA_INTEGER != (int)NA_LOGICAL) error(_("Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s"), NA_INTEGER, NA_LOGICAL, msg); - if (sizeof(int) != 4) error(_("Checking sizeof(%s) [%d] is %d %s"), "int", sizeof(int), 4, msg); - if (sizeof(double) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "double", sizeof(double), 8, msg); // 8 on both 32bit and 64bit - // alignof not available in C99: if (alignof(double) != 8) error(_("Checking alignof(double) [%d] is 8 %s"), alignof(double), msg); // 8 on both 32bit and 64bit - if (sizeof(long long) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "long long", sizeof(long long), 8, msg); - if (sizeof(char *) != 4 && sizeof(char *) != 8) error(_("Checking sizeof(pointer) [%d] is 4 or 8 %s"), sizeof(char *), msg); - if (sizeof(SEXP) != sizeof(char *)) error(_("Checking sizeof(SEXP) [%d] == sizeof(pointer) [%d] %s"), sizeof(SEXP), sizeof(char *), msg); - if (sizeof(uint64_t) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint64_t", sizeof(uint64_t), 8, msg); - if (sizeof(int64_t) != 8) error(_("Checking sizeof(%s) [%d] is %d %s"), "int64_t", sizeof(int64_t), 8, msg); - if (sizeof(signed char) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "signed char", sizeof(signed char), 1, msg); - if (sizeof(int8_t) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "int8_t", sizeof(int8_t), 1, msg); - if (sizeof(uint8_t) != 1) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint8_t", sizeof(uint8_t), 1, msg); - if (sizeof(int16_t) != 2) error(_("Checking sizeof(%s) [%d] is %d %s"), "int16_t", sizeof(int16_t), 2, msg); - if (sizeof(uint16_t) != 2) error(_("Checking sizeof(%s) [%d] is %d %s"), "uint16_t", sizeof(uint16_t), 2 ,msg); + if (sizeof(int) != 4) error(_("Checking sizeof(%s) [%lu] is %d %s"), "int", sizeof(int), 4, msg); + if (sizeof(double) != 8) error(_("Checking sizeof(%s) [%lu] is %d %s"), "double", 
sizeof(double), 8, msg); // 8 on both 32bit and 64bit + // alignof not available in C99: if (alignof(double) != 8) error(_("Checking alignof(double) [%lu] is 8 %s"), alignof(double), msg); // 8 on both 32bit and 64bit + if (sizeof(long long) != 8) error(_("Checking sizeof(%s) [%lu] is %d %s"), "long long", sizeof(long long), 8, msg); + if (sizeof(char *) != 4 && sizeof(char *) != 8) error(_("Checking sizeof(pointer) [%lu] is 4 or 8 %s"), sizeof(char *), msg); + if (sizeof(SEXP) != sizeof(char *)) error(_("Checking sizeof(SEXP) [%lu] == sizeof(pointer) [%lu] %s"), sizeof(SEXP), sizeof(char *), msg); + if (sizeof(uint64_t) != 8) error(_("Checking sizeof(%s) [%lu] is %d %s"), "uint64_t", sizeof(uint64_t), 8, msg); + if (sizeof(int64_t) != 8) error(_("Checking sizeof(%s) [%lu] is %d %s"), "int64_t", sizeof(int64_t), 8, msg); + if (sizeof(signed char) != 1) error(_("Checking sizeof(%s) [%lu] is %d %s"), "signed char", sizeof(signed char), 1, msg); + if (sizeof(int8_t) != 1) error(_("Checking sizeof(%s) [%lu] is %d %s"), "int8_t", sizeof(int8_t), 1, msg); + if (sizeof(uint8_t) != 1) error(_("Checking sizeof(%s) [%lu] is %d %s"), "uint8_t", sizeof(uint8_t), 1, msg); + if (sizeof(int16_t) != 2) error(_("Checking sizeof(%s) [%lu] is %d %s"), "int16_t", sizeof(int16_t), 2, msg); + if (sizeof(uint16_t) != 2) error(_("Checking sizeof(%s) [%lu] is %d %s"), "uint16_t", sizeof(uint16_t), 2 ,msg); SEXP tmp = PROTECT(allocVector(INTSXP,2)); if (LENGTH(tmp)!=2) error(_("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s"), LENGTH(tmp), msg); - if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s"), TRUELENGTH(tmp), msg); + // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768 + if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%lld] is 0 %s"), (long long)TRUELENGTH(tmp), msg); UNPROTECT(1); // According to IEEE (http://en.wikipedia.org/wiki/IEEE_754-1985#Zero) we can rely on 0.0 being all 0 bits. diff --git a/src/reorder.c b/src/reorder.c index debdb02172..a36e27055d 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -14,7 +14,7 @@ SEXP reorder(SEXP x, SEXP order) for (int i=0; i maxSize) @@ -24,7 +24,7 @@ SEXP reorder(SEXP x, SEXP order) copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16 && SIZEOF(x)!=1) - error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)"), type2char(TYPEOF(x)), SIZEOF(x)); + error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%zu)"), type2char(TYPEOF(x)), SIZEOF(x)); if (ALTREP(x)) error(_("Internal error in reorder.c: cannot reorder an ALTREP vector. 
Please see NEWS item 2 in v1.11.4 and report this as a bug.")); // # nocov maxSize = SIZEOF(x); nrow = length(x); diff --git a/src/snprintf.c b/src/snprintf.c index 6b8098c6f2..f322931fc7 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -214,7 +214,7 @@ SEXP test_dt_win_snprintf(void) int res = dt_win_snprintf(buff, 10, "%4$d%2$d%3$d%5$d%1$d", 111, 222, 33, 44, 555); // fmt longer than n if (strlen(buff)!=9 || strcmp(buff, "442223355")) error(_("dt_win_snprintf test %d failed: %s"), 9, buff); - if (res!=13) /* should return what would have been written if not chopped */ error(_("dt_win_snprintf test %d failed: %s"), 10, res); + if (res!=13) /* should return what would have been written if not chopped */ error(_("dt_win_snprintf test %d failed: %d"), 10, res); dt_win_snprintf(buff, 39, "%l", 3); if (strlen(buff)!=38 || strcmp(buff, "0 %l does not end with recognized t")) error(_("dt_win_snprintf test %d failed: %s"), 11, buff); From a413d3c9f9d9b6e25d99e0c348772ccb81188f11 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 2 Dec 2023 15:29:20 +0100 Subject: [PATCH 555/588] Fix format-security compiler warnings (#5774) (#5781) Co-authored-by: Michael Chirico --- src/assign.c | 2 +- src/forder.c | 2 +- src/fwrite.c | 4 ++-- src/rbindlist.c | 8 ++++---- src/subset.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/assign.c b/src/assign.c index fc960132d1..3356e918be 100644 --- a/src/assign.c +++ b/src/assign.c @@ -527,7 +527,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) targetcol = VECTOR_ELT(dt,coln); } const char *ret = memrecycle(targetcol, rows, 0, targetlen, thisvalue, 0, -1, coln+1, CHAR(STRING_ELT(names, coln))); - if (ret) warning(ret); + if (ret) warning("%s", ret); } *_Last_updated = numToDo; // the updates have taken place with no error, so update .Last.updated now diff --git a/src/forder.c b/src/forder.c index 8a62e1de78..c9063782bf 100644 --- a/src/forder.c +++ b/src/forder.c @@ -56,7 +56,7 @@ static int *anso = NULL; static bool notFirst=false; static char msg[1001]; -#define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error(msg);} while(0) // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon +#define STOP(...) do {snprintf(msg, 1000, __VA_ARGS__); cleanup(); error("%s", msg);} while(0) // http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon // use STOP in this file (not error()) to ensure cleanup() is called first // snprintf to msg first in case nrow (just as an example) is provided in the message because cleanup() sets nrow to 0 #undef warning diff --git a/src/fwrite.c b/src/fwrite.c index ef8b822e9b..322909749a 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -736,7 +736,7 @@ void fwriteMain(fwriteMainArgs args) } if (f==-1) { *ch = '\0'; - DTPRINT(buff); + DTPRINT("%s", buff); free(buff); } else { int ret1=0, ret2=0; @@ -926,7 +926,7 @@ void fwriteMain(fwriteMainArgs args) errno=0; if (f==-1) { *ch='\0'; // standard C string end marker so DTPRINT knows where to stop - DTPRINT(myBuff); + DTPRINT("%s", myBuff); } else if ((args.is_gzip ? WRITE(f, myzBuff, (int)myzbuffUsed) : WRITE(f, myBuff, (int)(ch-myBuff))) == -1) { failed=true; // # nocov diff --git a/src/rbindlist.c b/src/rbindlist.c index 3669028835..2ffff3af8c 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -208,7 +208,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) const char *str = isString(s) ? 
CHAR(STRING_ELT(s,w2)) : ""; snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"), w2+1, str, i+1, missi+1, extra ); - if (usenames==TRUE) error(buff); + if (usenames==TRUE) error("%s", buff); i = LENGTH(l); // break from outer i loop break; // break from inner j loop } @@ -229,8 +229,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) } const char *o = isNull(opt) ? "message" : CHAR(STRING_ELT(opt,0)); if (strcmp(o,"message")==0) { eval(PROTECT(lang2(install("message"),PROTECT(ScalarString(mkChar(buff))))), R_GlobalEnv); UNPROTECT(2); } - else if (strcmp(o,"warning")==0) warning(buff); - else if (strcmp(o,"error")==0) error(buff); + else if (strcmp(o,"warning")==0) warning("%s", buff); + else if (strcmp(o,"error")==0) error("%s", buff); else if (strcmp(o,"none")!=0) warning(_("options()$datatable.rbindlist.check=='%s' which is not 'message'|'warning'|'error'|'none'. See news item 5 in v1.12.2."), o); } } @@ -490,7 +490,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) } for (int k=0; k Date: Sat, 2 Dec 2023 19:17:26 +0100 Subject: [PATCH 556/588] follow up of #5780 to resolve -Wformat warning --- src/gsumm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 2047c61cd9..742e718f40 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -616,7 +616,7 @@ SEXP gmean(SEXP x, SEXP narmArg) } else { // narm==true and anyNA==true int *restrict nna_counts = calloc(ngrp, sizeof(int)); - if (!nna_counts) error(_("Unable to allocate %d * %d bytes for non-NA counts in gmean na.rm=TRUE"), ngrp, sizeof(int)); + if (!nna_counts) error(_("Unable to allocate %d * %lu bytes for non-NA counts in gmean na.rm=TRUE"), ngrp, sizeof(int)); #pragma omp parallel for num_threads(getDTthreads(highSize, false)) for (int h=0; h Date: Sat, 2 Dec 2023 21:15:57 +0100 Subject: [PATCH 557/588] vignette render with markdown rather than rmarkdown (#5773) * vignette render with markdown rather than rmarkdown * tune TOC --- DESCRIPTION | 2 +- vignettes/css/toc.css | 6 +++ vignettes/datatable-benchmarking.Rmd | 17 +++++++-- vignettes/datatable-faq.Rmd | 37 ++++++++++--------- vignettes/datatable-importing.Rmd | 4 +- vignettes/datatable-intro.Rmd | 4 +- vignettes/datatable-keys-fast-subset.Rmd | 4 +- vignettes/datatable-programming.Rmd | 4 +- vignettes/datatable-reference-semantics.Rmd | 4 +- vignettes/datatable-reshape.Rmd | 4 +- vignettes/datatable-sd-usage.Rmd | 11 ++++-- ...le-secondary-indices-and-auto-indexing.Rmd | 4 +- 12 files changed, 61 insertions(+), 40 deletions(-) create mode 100644 vignettes/css/toc.css diff --git a/DESCRIPTION b/DESCRIPTION index 405b7a0095..6756db8ae1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Version: 1.14.9 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, zoo (>= 1.8-1), yaml, knitr, markdown Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
License: MPL-2.0 | file LICENSE URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table diff --git a/vignettes/css/toc.css b/vignettes/css/toc.css new file mode 100644 index 0000000000..86adaba5b1 --- /dev/null +++ b/vignettes/css/toc.css @@ -0,0 +1,6 @@ +#TOC { + border: 1px solid #ccc; + border-radius: 5px; + padding-left: 1em; + background: #f6f6f6; +} diff --git a/vignettes/datatable-benchmarking.Rmd b/vignettes/datatable-benchmarking.Rmd index 7614a27d54..da580764b8 100644 --- a/vignettes/datatable-benchmarking.Rmd +++ b/vignettes/datatable-benchmarking.Rmd @@ -2,15 +2,24 @@ title: "Benchmarking data.table" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette: - toc: true - number_sections: true + markdown::html_format: + options: + toc: true + number_sections: true + meta: + css: [default, css/toc.css] vignette: > %\VignetteIndexEntry{Benchmarking data.table} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- + + This document is meant to guide on measuring performance of `data.table`. Single place to document best practices and traps to avoid. # fread: clear caches diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index 4b0645e6b6..f1deaba781 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -2,12 +2,15 @@ title: "Frequently Asked Questions about data.table" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette: - toc: true - number_sections: true + markdown::html_format: + options: + toc: true + number_sections: true + meta: + css: [default, css/toc.css] vignette: > %\VignetteIndexEntry{Frequently Asked Questions about data.table} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- @@ -94,13 +97,13 @@ As [highlighted above](#j-num), `j` in `[.data.table` is fundamentally different Furthermore, data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table can be passed to any package that only accepts `data.frame` and that package can use `[.data.frame` syntax on the data.table. See [this answer](https://stackoverflow.com/a/10529888/403310) for how that is achieved. -We _have_ proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0 : +We _have_ proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0: > `unique()` and `match()` are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c. A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](https://stat.ethz.ch/pipermail/r-devel/2010-April/057249.html). -A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 : +A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0: > The radix sort algorithm and implementation from data.table (forder) replaces the previous radix (counting) sort and adds a new method for order(). Contributed by Matt Dowle and Arun Srinivasan, the new algorithm supports logical, integer (even with large values), real, and character vectors. 
It outperforms all other methods, but there are some caveats (see ?sort). @@ -236,7 +239,7 @@ Then you are using a version prior to 1.5.3. Prior to 1.5.3 `[.data.table` detec ## What are the scoping rules for `j` expressions? -Think of the subset as an environment where all the column names are variables. When a variable `foo` is used in the `j` of a query such as `X[Y, sum(foo)]`, `foo` is looked for in the following order : +Think of the subset as an environment where all the column names are variables. When a variable `foo` is used in the `j` of a query such as `X[Y, sum(foo)]`, `foo` is looked for in the following order: 1. The scope of `X`'s subset; _i.e._, `X`'s column names. 2. The scope of each row of `Y`; _i.e._, `Y`'s column names (_join inherited scope_) @@ -295,18 +298,18 @@ The `Z[Y]` part is not a single name so that is evaluated within the frame of `X ## Can you explain further why data.table is inspired by `A[B]` syntax in `base`? -Consider `A[B]` syntax using an example matrix `A` : +Consider `A[B]` syntax using an example matrix `A`: ```{r} A = matrix(1:12, nrow = 4) A ``` -To obtain cells `(1, 2) = 5` and `(3, 3) = 11` many users (we believe) may try this first : +To obtain cells `(1, 2) = 5` and `(3, 3) = 11` many users (we believe) may try this first: ```{r} A[c(1, 3), c(2, 3)] ``` -However, this returns the union of those rows and columns. To reference the cells, a 2-column matrix is required. `?Extract` says : +However, this returns the union of those rows and columns. To reference the cells, a 2-column matrix is required. `?Extract` says: > When indexing arrays by `[` a single argument `i` can be a matrix with as many columns as there are dimensions of `x`; the result is then a vector with elements corresponding to the sets of indices in each row of `i`. @@ -354,7 +357,7 @@ Furthermore, matrices, especially sparse matrices, are often stored in a 3-colum data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table _can_ be passed to any package that _only_ accepts `data.frame`. When that package uses `[.data.frame` syntax on the data.table, it works. It works because `[.data.table` looks to see where it was called from. If it was called from such a package, `[.data.table` diverts to `[.data.frame`. ## I've heard that data.table syntax is analogous to SQL. -Yes : +Yes: - `i` $\Leftrightarrow$ where - `j` $\Leftrightarrow$ select @@ -367,7 +370,7 @@ Yes : - `mult = "first"|"last"` $\Leftrightarrow$ N/A because SQL is inherently unordered - `roll = TRUE` $\Leftrightarrow$ N/A because SQL is inherently unordered -The general form is : +The general form is: ```{r, eval = FALSE} DT[where, select|update, group by][order by][...] ... [...] @@ -447,7 +450,7 @@ Many thanks to the R core team for fixing the issue in Sep 2019. data.table v1.1 This comes up quite a lot but it's really earth-shatteringly simple. A function such as `merge` is _generic_ if it consists of a call to `UseMethod`. When you see people talking about whether or not functions are _generic_ functions they are merely typing the function without `()` afterwards, looking at the program code inside it and if they see a call to `UseMethod` then it is _generic_. What does `UseMethod` do? It literally slaps the function name together with the class of the first argument, separated by period (`.`) and then calls that function, passing along the same arguments. It's that simple. For example, `merge(X, Y)` contains a `UseMethod` call which means it then _dispatches_ (i.e. 
calls) `paste("merge", class(X), sep = ".")`. Functions with dots in their name may or may not be methods. The dot is irrelevant really, other than dot being the separator that `UseMethod` uses. Knowing this background should now highlight why, for example, it is obvious to R folk that `as.data.table.data.frame` is the `data.frame` method for the `as.data.table` generic function. Further, it may help to elucidate that, yes, you are correct, it is not obvious from its name alone that `ls.fit` is not the fit method of the `ls` generic function. You only know that by typing `ls` (not `ls()`) and observing it isn't a single call to `UseMethod`. -You might now ask: where is this documented in R? Answer: it's quite clear, but, you need to first know to look in `?UseMethod` and _that_ help file contains : +You might now ask: where is this documented in R? Answer: it's quite clear, but, you need to first know to look in `?UseMethod` and _that_ help file contains: > When a function calling `UseMethod('fun')` is applied to an object with class attribute `c('first', 'second')`, the system searches for a function called `fun.first` and, if it finds it, applies it to the object. If no such function is found a function called `fun.second` is tried. If no class name produces a suitable function, the function `fun.default` is used, if it exists, or an error results. @@ -481,7 +484,7 @@ copied in bulk (`memcpy` in C) rather than looping in C. ## What are primary and secondary indexes in data.table? Manual: [`?setkey`](https://www.rdocumentation.org/packages/data.table/functions/setkey) -S.O. : [What is the purpose of setting a key in data.table?](https://stackoverflow.com/questions/20039335/what-is-the-purpose-of-setting-a-key-in-data-table/20057411#20057411) +S.O.: [What is the purpose of setting a key in data.table?](https://stackoverflow.com/questions/20039335/what-is-the-purpose-of-setting-a-key-in-data-table/20057411#20057411) `setkey(DT, col1, col2)` orders the rows by column `col1` then within each group of `col1` it orders by `col2`. This is a _primary index_. The row order is changed _by reference_ in RAM. Subsequent joins and groups on those key columns then take advantage of the sort order for efficiency. (Imagine how difficult looking for a phone number in a printed telephone directory would be if it wasn't sorted by surname then forename. That's literally all `setkey` does. It sorts the rows by the columns you specify.) The index doesn't use any RAM. It simply changes the row order in RAM and marks the key columns. Analogous to a _clustered index_ in SQL. @@ -521,7 +524,7 @@ DT[ , { mySD = copy(.SD) Please upgrade to v1.8.1 or later. From this version, if `.N` is returned by `j` it is renamed to `N` to avoid any ambiguity in any subsequent grouping between the `.N` special variable and a column called `".N"`. -The old behaviour can be reproduced by forcing `.N` to be called `.N`, like this : +The old behaviour can be reproduced by forcing `.N` to be called `.N`, like this: ```{r} DT = data.table(a = c(1,1,2,2,2), b = c(1,2,2,2,1)) DT @@ -533,7 +536,7 @@ cat(try( If you are already running v1.8.1 or later then the error message is now more helpful than the "cannot change value of locked binding" error, as you can see above, since this vignette was produced using v1.8.1 or later. 
-The more natural syntax now works : +The more natural syntax now works: ```{r} if (packageVersion("data.table") >= "1.8.1") { DT[ , .N, by = list(a, b)][ , unique(N), by = a] @@ -555,7 +558,7 @@ Hopefully, this is self explanatory. The full message is: Coerced numeric RHS to integer to match the column's type; may have truncated precision. Either change the column to numeric first by creating a new numeric vector length 5 (nrows of entire table) yourself and assigning that (i.e. 'replace' column), or coerce RHS to integer yourself (e.g. 1L or as.integer) to make your intent clear (and for speed). Or, set the column type correctly up front when you create the table and stick to it, please. -To generate it, try : +To generate it, try: ```{r} DT = data.table(a = 1:5, b = 1:5) diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index 41a3d629ae..c37cd6f755 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -2,10 +2,10 @@ title: "Importing data.table" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Importing data.table} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 3a5eda34cd..5bd36437a2 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -2,10 +2,10 @@ title: "Introduction to data.table" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Introduction to data.table} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index 465052d941..3e9a4f23c7 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -2,10 +2,10 @@ title: "Keys and fast binary search based subset" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Keys and fast binary search based subset} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index bf481f06f3..d63b1bccca 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -2,10 +2,10 @@ title: "Programming on data.table" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Programming on data.table} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 33da89bb92..220a2a19a2 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -2,10 +2,10 @@ title: "Reference semantics" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Reference semantics} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index 3f94392fc6..c26d5510db 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -2,10 +2,10 @@ title: 
"Efficient reshaping using data.tables" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Efficient reshaping using data.tables} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 60d5c07c1d..8e7919f34d 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -2,12 +2,15 @@ title: "Using .SD for Data Analysis" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette: - toc: true - number_sections: true + markdown::html_format: + options: + toc: true + number_sections: true + meta: + css: [default, css/toc.css] vignette: > %\VignetteIndexEntry{Using .SD for Data Analysis} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- diff --git a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd index ef506605c3..374ccd66bb 100644 --- a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd +++ b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd @@ -2,10 +2,10 @@ title: "Secondary indices and auto indexing" date: "`r Sys.Date()`" output: - rmarkdown::html_vignette + markdown::html_format vignette: > %\VignetteIndexEntry{Secondary indices and auto indexing} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} \usepackage[utf8]{inputenc} --- From c3ad47db96186acdad307c2336b7bfbc515b1ed6 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 3 Dec 2023 10:15:07 +0100 Subject: [PATCH 558/588] followup vignettes updated after change to markdown vignette engine (#5784) * do not use options max.print in vignettes, closes #5783 * follow up of markdown vign enginge #5773 * amend feedback from Michael --- vignettes/css/bootstrap.css | 118 ------------------ vignettes/datatable-intro.Rmd | 63 ++-------- vignettes/datatable-programming.Rmd | 67 +++++----- vignettes/datatable-reference-semantics.Rmd | 48 +++---- vignettes/datatable-reshape.Rmd | 22 +--- vignettes/datatable-sd-usage.Rmd | 3 +- ...le-secondary-indices-and-auto-indexing.Rmd | 4 +- 7 files changed, 74 insertions(+), 251 deletions(-) delete mode 100644 vignettes/css/bootstrap.css diff --git a/vignettes/css/bootstrap.css b/vignettes/css/bootstrap.css deleted file mode 100644 index 1453f27bf9..0000000000 --- a/vignettes/css/bootstrap.css +++ /dev/null @@ -1,118 +0,0 @@ -code, -kbd, -pre, -samp { - font-family: Source Code Pro, Inconsolata, Monaco, Consolas, Menlo, Courier New, monospace; -} - -code { - padding: 0px 2px; - font-size: 90%; - color: #c7254e; - white-space: nowrap; - background-color: #f9f2f4; - border-radius: 3px; - border: 0px; -} - -pre { - display: block; - padding: 9.5px; - margin: 0 0 10px; - font-size: 14px; - line-height: 1.428571429; - color: #c7254e; - background-color: #f9f2f4 - word-break: break-all; - word-wrap: break-word; - border: 0px ; - border-radius: 3px; - /*background-color: #FDF6E3;*/ - /*background-color: #f5f5f5; */ - /*border: 1px solid #FDF6E3;*/ -} - -pre code { - padding: 0; - font-size: inherit; - color: inherit; - white-space: pre-wrap; - background-color: transparent; - border-radius: 0; -} - -.bs-callout { - margin:20px 0; - padding:20px; - border-left:3px solid #eee -} - -.bs-callout h4 { - margin-top:0; - margin-bottom:5px -} - -.bs-callout p:last-child { - margin-bottom:0 -} - -.bs-callout code { - background-color:#fff; - 
border-radius:3px -} - -.bs-callout pre code { - background-color:transparent; - border-radius:3px -} - -.bs-callout-danger { - background-color:#fdf7f7; - border-color:#d9534f -} - -.bs-callout-danger h4 { - color:#d9534f -} - -.bs-callout-warning { - background-color:#fcf8f2; - border-color:#f0ad4e -} - -.bs-callout-warning h4 { - color:#f0ad4e -} - -.bs-callout-info { - background-color:#f4f8fa; - border-color:#5bc0de -} - -.bs-callout-info h4 { - color:#5bc0de -} - -// KeyWordTok -.sourceCode .kw { color: #268BD2; } -// DataTypeTok -.sourceCode .dt { color: #268BD2; } - -// DecValTok (decimal value), BaseNTok, FloatTok -.sourceCode .dv, .sourceCode .bn, .sourceCode .fl { color: #D33682; } -// CharTok -.sourceCode .ch { color: #DC322F; } -// StringTok -.sourceCode .st { color: #2AA198; } -// CommentTok -.sourceCode .co { color: #93A1A1; } -// OtherTok -.sourceCode .ot { color: #A57800; } -// AlertTok -.sourceCode .al { color: #CB4B16; font-weight: bold; } -// FunctionTok -.sourceCode .fu { color: #268BD2; } -// RegionMarkerTok -.sourceCode .re { } -// ErrorTok -.sourceCode .er { color: #D30102; font-weight: bold; } diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 5bd36437a2..04fd79e50d 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -86,7 +86,7 @@ class(DT$ID) You can also convert existing objects to a `data.table` using `setDT()` (for `data.frame`s and `list`s) and `as.data.table()` (for other structures); the difference is beyond the scope of this vignette, see `?setDT` and `?as.data.table` for more details. -#### Note that: {.bs-callout .bs-callout-info} +#### Note that: * Row numbers are printed with a `:` in order to visually separate the row number from the first column. @@ -111,7 +111,7 @@ DT[i, j, by] Users who have an SQL background might perhaps immediately relate to this syntax. -#### The way to read it (out loud) is: {.bs-callout .bs-callout-info} +#### The way to read it (out loud) is: Take `DT`, subset/reorder rows using `i`, then calculate `j`, grouped by `by`. @@ -126,8 +126,6 @@ ans <- flights[origin == "JFK" & month == 6L] head(ans) ``` -#### {.bs-callout .bs-callout-info} - * Within the frame of a `data.table`, columns can be referred to *as if they are variables*, much like in SQL or Stata. Therefore, we simply refer to `origin` and `month` as if they are variables. We do not need to add the prefix `flights$` each time. Nevertheless, using `flights$origin` and `flights$month` would work just fine. * The *row indices* that satisfy the condition `origin == "JFK" & month == 6L` are computed, and since there is nothing else left to do, all columns from `flights` at rows corresponding to those *row indices* are simply returned as a `data.table`. @@ -140,7 +138,6 @@ head(ans) ans <- flights[1:2] ans ``` -#### {.bs-callout .bs-callout-info} * In this case, there is no condition. The row indices are already provided in `i`. We therefore return a `data.table` with all columns from `flights` at rows for those *row indices*. @@ -153,7 +150,7 @@ ans <- flights[order(origin, -dest)] head(ans) ``` -#### `order()` is internally optimised {.bs-callout .bs-callout-info} +#### `order()` is internally optimised * We can use "-" on a `character` columns within the frame of a `data.table` to sort in decreasing order. 
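A minimal sketch of the `order()` optimisation described above, using a small invented table in place of the vignette's `flights` data:

```r
library(data.table)
DT = data.table(origin = c("LGA", "JFK", "EWR", "JFK"),
                dest   = c("SEA", "LAX", "ORD", "AMS"))
# within DT[...], order() is optimised to data.table's internal forder(),
# and '-' works on the character column 'dest' to sort it in decreasing order
DT[order(origin, -dest)]
```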
@@ -170,8 +167,6 @@ ans <- flights[, arr_delay] head(ans) ``` -#### {.bs-callout .bs-callout-info} - * Since columns can be referred to as if they are variables within the frame of `data.table`s, we directly refer to the *variable* we want to subset. Since we want *all the rows*, we simply skip `i`. * It returns *all* the rows for the column `arr_delay`. @@ -183,15 +178,13 @@ ans <- flights[, list(arr_delay)] head(ans) ``` -#### {.bs-callout .bs-callout-info} - * We wrap the *variables* (column names) within `list()`, which ensures that a `data.table` is returned. In case of a single column name, not wrapping with `list()` returns a vector instead, as seen in the [previous example](#select-j-1d). * `data.table` also allows wrapping columns with `.()` instead of `list()`. It is an *alias* to `list()`; they both mean the same. Feel free to use whichever you prefer; we have noticed most users seem to prefer `.()` for conciseness, so we will continue to use `.()` hereafter. `data.table`s (and `data.frame`s) are internally `list`s as well, with the stipulation that each element has the same length and the `list` has a `class` attribute. Allowing `j` to return a `list` enables converting and returning `data.table` very efficiently. -#### Tip: {.bs-callout .bs-callout-warning #tip-1} +#### Tip: {#tip-1} As long as `j-expression` returns a `list`, each element of the list will be converted to a column in the resulting `data.table`. This makes `j` quite powerful, as we will see shortly. It is also very important to understand this for when you'd like to make more complicated queries!! @@ -205,8 +198,6 @@ head(ans) # ans <- flights[, list(arr_delay, dep_delay)] ``` -#### {.bs-callout .bs-callout-info} - * Wrap both columns within `.()`, or `list()`. That's it. #### -- Select both `arr_delay` and `dep_delay` columns *and* rename them to `delay_arr` and `delay_dep`. @@ -229,7 +220,7 @@ ans <- flights[, sum( (arr_delay + dep_delay) < 0 )] ans ``` -#### What's happening here? {.bs-callout .bs-callout-info} +#### What's happening here? * `data.table`'s `j` can handle more than just *selecting columns* - it can handle *expressions*, i.e., *computing on columns*. This shouldn't be surprising, as columns can be referred to as if they are variables. Then we should be able to *compute* by calling functions on those variables. And that's what precisely happens here. @@ -243,8 +234,6 @@ ans <- flights[origin == "JFK" & month == 6L, ans ``` -#### {.bs-callout .bs-callout-info} - * We first subset in `i` to find matching *row indices* where `origin` airport equals `"JFK"`, and `month` equals `6L`. We *do not* subset the _entire_ `data.table` corresponding to those rows _yet_. * Now, we look at `j` and find that it uses only *two columns*. And what we have to do is to compute their `mean()`. Therefore we subset just those columns corresponding to the matching rows, and compute their `mean()`. @@ -262,7 +251,7 @@ The function `length()` requires an input argument. We just needed to compute th This type of operation occurs quite frequently, especially while grouping (as we will see in the next section), to the point where `data.table` provides a *special symbol* `.N` for it. -#### Special symbol `.N`: {.bs-callout .bs-callout-info #special-N} +#### Special symbol `.N`: {#special-N} `.N` is a special built-in variable that holds the number of observations _in the current group_. It is particularly useful when combined with `by` as we'll see in the next section. 
In the absence of group by operations, it simply returns the number of rows in the subset. @@ -273,8 +262,6 @@ ans <- flights[origin == "JFK" & month == 6L, .N] ans ``` -#### {.bs-callout .bs-callout-info} - * Once again, we subset in `i` to get the *row indices* where `origin` airport equals *"JFK"*, and `month` equals *6*. * We see that `j` uses only `.N` and no other columns. Therefore the entire subset is not materialised. We simply return the number of rows in the subset (which is just the length of row indices). @@ -372,8 +359,6 @@ ans # ans <- flights[, .(.N), by = "origin"] ``` -#### {.bs-callout .bs-callout-info} - * We know `.N` [is a special variable](#special-N) that holds the number of rows in the current group. Grouping by `origin` obtains the number of rows, `.N`, for each group. * By doing `head(flights)` you can see that the origin airports occur in the order *"JFK"*, *"LGA"* and *"EWR"*. The original order of grouping variables is preserved in the result. _This is important to keep in mind!_ @@ -400,8 +385,6 @@ ans <- flights[carrier == "AA", .N, by = origin] ans ``` -#### {.bs-callout .bs-callout-info} - * We first obtain the row indices for the expression `carrier == "AA"` from `i`. * Using those *row indices*, we obtain the number of rows while grouped by `origin`. Once again no columns are actually materialised here, because the `j-expression` does not require any columns to be actually subsetted and is therefore fast and memory efficient. @@ -416,8 +399,6 @@ head(ans) # ans <- flights[carrier == "AA", .N, by = c("origin", "dest")] ``` -#### {.bs-callout .bs-callout-info} - * `by` accepts multiple columns. We just provide all the columns by which to group by. Note the use of `.()` again in `by` -- again, this is just shorthand for `list()`, and `list()` can be used here as well. Again, we'll stick with `.()` in this vignette. #### -- How can we get the average arrival and departure delay for each `orig,dest` pair for each month for carrier code `"AA"`? {#origin-dest-month} @@ -429,8 +410,6 @@ ans <- flights[carrier == "AA", ans ``` -#### {.bs-callout .bs-callout-info} - * Since we did not provide column names for the expressions in `j`, they were automatically generated as `V1` and `V2`. * Once again, note that the input order of grouping columns is preserved in the result. @@ -450,8 +429,6 @@ ans <- flights[carrier == "AA", ans ``` -#### {.bs-callout .bs-callout-info} - * All we did was to change `by` to `keyby`. This automatically orders the result by the grouping variables in increasing order. In fact, due to the internal implementation of `by` first requiring a sort before recovering the original table's order, `keyby` is typically faster than `by` because it doesn't require this second step. **Keys:** Actually `keyby` does a little more than *just ordering*. It also *sets a key* after ordering by setting an `attribute` called `sorted`. @@ -475,8 +452,6 @@ ans <- ans[order(origin, -dest)] head(ans) ``` -#### {.bs-callout .bs-callout-info} - * Recall that we can use `-` on a `character` column in `order()` within the frame of a `data.table`. This is possible to due `data.table`'s internal query optimisation. * Also recall that `order(...)` with the frame of a `data.table` is *automatically optimised* to use `data.table`'s internal fast radix order `forder()` for speed. 
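A short self-contained illustration of `.N` used together with `i` and `by`, assuming a toy table in place of `flights`:

```r
library(data.table)
DT = data.table(carrier = c("AA", "AA", "DL", "AA"),
                origin  = c("JFK", "LGA", "JFK", "JFK"))
DT[carrier == "AA", .N]               # total count of matching rows
DT[carrier == "AA", .N, by = origin]  # matching rows counted per origin
```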
@@ -488,8 +463,6 @@ ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)] head(ans, 10) ``` -#### {.bs-callout .bs-callout-info} - * We can tack expressions one after another, *forming a chain* of operations, i.e., `DT[ ... ][ ... ][ ... ]`. * Or you can also chain them vertically: @@ -512,8 +485,6 @@ ans <- flights[, .N, .(dep_delay>0, arr_delay>0)] ans ``` -#### {.bs-callout .bs-callout-info} - * The last row corresponds to `dep_delay > 0 = TRUE` and `arr_delay > 0 = FALSE`. We can see that `r flights[!is.na(arr_delay) & !is.na(dep_delay), .N, .(dep_delay>0, arr_delay>0)][, N[4L]]` flights started late but arrived early (or on time). * Note that we did not provide any names to `by-expression`. Therefore, names have been automatically assigned in the result. As with `j`, you can name these expressions as you would elements of any `list`, e.g. `DT[, .N, .(dep_delayed = dep_delay>0, arr_delayed = arr_delay>0)]`. @@ -528,7 +499,7 @@ It is of course not practical to have to type `mean(myCol)` for every column one How can we do this efficiently, concisely? To get there, refresh on [this tip](#tip-1) - *"As long as the `j`-expression returns a `list`, each element of the `list` will be converted to a column in the resulting `data.table`"*. Suppose we can refer to the *data subset for each group* as a variable *while grouping*, then we can loop through all the columns of that variable using the already- or soon-to-be-familiar base function `lapply()`. No new names to learn specific to `data.table`. -#### Special symbol `.SD`: {.bs-callout .bs-callout-info #special-SD} +#### Special symbol `.SD`: {#special-SD} `data.table` provides a *special* symbol, called `.SD`. It stands for **S**ubset of **D**ata. It by itself is a `data.table` that holds the data for *the current group* defined using `by`. @@ -542,8 +513,6 @@ DT DT[, print(.SD), by = ID] ``` -#### {.bs-callout .bs-callout-info} - * `.SD` contains all the columns *except the grouping columns* by default. * It is also generated by preserving the original order - data corresponding to `ID = "b"`, then `ID = "a"`, and then `ID = "c"`. @@ -554,8 +523,6 @@ To compute on (multiple) columns, we can then simply use the base R function `la DT[, lapply(.SD, mean), by = ID] ``` -#### {.bs-callout .bs-callout-info} - * `.SD` holds the rows corresponding to columns `a`, `b` and `c` for that group. We compute the `mean()` on each of these columns using the already-familiar base function `lapply()`. * Each group returns a list of three elements containing the mean value which will become the columns of the resulting `data.table`. @@ -566,7 +533,7 @@ We are almost there. There is one little thing left to address. In our `flights` #### -- How can we specify just the columns we would like to compute the `mean()` on? -#### .SDcols {.bs-callout .bs-callout-info} +#### .SDcols Using the argument `.SDcols`. It accepts either column names or column indices. For example, `.SDcols = c("arr_delay", "dep_delay")` ensures that `.SD` contains only these two columns for each group. @@ -590,8 +557,6 @@ ans <- flights[, head(.SD, 2), by = month] head(ans) ``` -#### {.bs-callout .bs-callout-info} - * `.SD` is a `data.table` that holds all the rows for *that group*. We simply subset the first two rows as we have seen [here](#subset-rows-integer) already. * For each group, `head(.SD, 2)` returns the first two rows as a `data.table`, which is also a `list`, so we do not have to wrap it with `.()`. 
@@ -606,8 +571,6 @@ So that we have a consistent syntax and keep using already existing (and familia DT[, .(val = c(a,b)), by = ID] ``` -#### {.bs-callout .bs-callout-info} - * That's it. There is no special syntax required. All we need to know is the base function `c()` which concatenates vectors and [the tip from before](#tip-1). #### -- What if we would like to have all the values of column `a` and `b` concatenated, but returned as a list column? @@ -616,8 +579,6 @@ DT[, .(val = c(a,b)), by = ID] DT[, .(val = list(c(a,b))), by = ID] ``` -#### {.bs-callout .bs-callout-info} - * Here, we first concatenate the values with `c(a,b)` for each group, and wrap that with `list()`. So for each group, we return a list of all concatenated values. * Note those commas are for display only. A list column can contain any object in each cell, and in this example, each cell is itself a vector and some cells contain longer vectors than others. @@ -646,7 +607,7 @@ DT[i, j, by] We have seen so far that, -#### Using `i`: {.bs-callout .bs-callout-info} +#### Using `i`: * We can subset rows similar to a `data.frame`- except you don't have to use `DT$` repetitively since columns within the frame of a `data.table` are seen as if they are *variables*. @@ -654,7 +615,7 @@ We have seen so far that, We can do much more in `i` by keying a `data.table`, which allows blazing fast subsets and joins. We will see this in the *"Keys and fast binary search based subsets"* and *"Joins and rolling joins"* vignette. -#### Using `j`: {.bs-callout .bs-callout-info} +#### Using `j`: 1. Select columns the `data.table` way: `DT[, .(colA, colB)]`. @@ -666,7 +627,7 @@ We can do much more in `i` by keying a `data.table`, which allows blazing fast s 5. Combine with `i`: `DT[colA > value, sum(colB)]`. -#### Using `by`: {.bs-callout .bs-callout-info} +#### Using `by`: * Using `by`, we can group by columns by specifying a *list of columns* or a *character vector of column names* or even *expressions*. The flexibility of `j`, combined with `by` and `i` makes for a very powerful syntax. @@ -682,7 +643,7 @@ We can do much more in `i` by keying a `data.table`, which allows blazing fast s 3. `DT[col > val, head(.SD, 1), by = ...]` - combine `i` along with `j` and `by`. -#### And remember the tip: {.bs-callout .bs-callout-warning} +#### And remember the tip: As long as `j` returns a `list`, each element of the list will become a column in the resulting `data.table`. diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index d63b1bccca..fc3ad726d7 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -25,8 +25,16 @@ knitr::opts_chunk$set( `data.table`, from its very first releases, enabled the usage of `subset` and `with` (or `within`) functions by defining the`[.data.table` method. `subset` and `with` are base R functions that are useful for reducing repetition in code, enhancing readability, and reducing number the total characters the user has to type. This functionality is possible in R because of a quite unique feature called *lazy evaluation*. This feature allows a function to catch its arguments, before they are evaluated, and to evaluate them in a different scope than the one in which they were called. Let's recap usage of the `subset` function. -```{r opt_max_print_10, include = FALSE} -options(max.print = 10L) # 2 rows +```{r df_print, echo=FALSE} +registerS3method("print", "data.frame", function(x, ...) { + base::print.data.frame(head(x, 2L), ...) 
+ cat("...\n") + invisible(x) +}) +.opts = options( + datatable.print.topn=2L, + datatable.print.nrows=20L +) ``` ```{r subset} @@ -149,20 +157,18 @@ Now, to use substitution inside `[.data.table`, we don't need to call the `subst Let's use the `iris` data set as a demonstration. Just as an example, let's pretend we want to compute the `Sepal.Hypotenuse`, treating the sepal width and length as if they were legs of a right triangle. -```{r opt_max_print_8, include = FALSE} -options(max.print = 8L) # 2 rows -``` - ```{r hypotenuse_datatable} DT = as.data.table(iris) -DT[, outer(inner(var1) + inner(var2)), - env = list( - outer = "sqrt", - inner = "square", - var1 = "Sepal.Length", - var2 = "Sepal.Width" - )] +str( + DT[, outer(inner(var1) + inner(var2)), + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width" + )] +) # return as a data.table DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), @@ -235,10 +241,6 @@ The example presented above illustrates a neat and powerful way to make your cod An obvious use case could be to mimic `.SD` functionality by injecting a `list` call into the `j` argument. -```{r opt_max_print_4, include = FALSE} -options(max.print = 4L) # 2 rows -``` - ```{r splice_sd} cols = c("Sepal.Length", "Sepal.Width") DT[, .SD, .SDcols = cols] @@ -316,10 +318,6 @@ It takes arbitrary number of variables on input, but now we cannot just *splice* First, we have to construct calls to the `square` function for each of the variables (see `inner_calls`). Then, we have to reduce the list of calls into a single call, having a nested sequence of `+` calls (see `add_calls`). Lastly, we have to substitute the constructed call into the surrounding expression (see `rms`). -```{r opt_max_print_12, include = FALSE} -options(max.print = 12L) # 2 rows -``` - ```{r complex} outer = "sqrt" inner = "square" @@ -344,15 +342,19 @@ rms = substitute2( ) print(rms) -DT[, j, env = list(j = rms)] +str( + DT[, j, env = list(j = rms)] +) # same, but skipping last substitute2 call and using add_calls directly -DT[, outer((add_calls) / len), - env = list( - outer = outer, - add_calls = add_calls, - len = length(vars) - )] +str( + DT[, outer((add_calls) / len), + env = list( + outer = outer, + add_calls = add_calls, + len = length(vars) + )] +) # return as data.table j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) @@ -377,10 +379,6 @@ In `[.data.table`, it is also possible to use other mechanisms for variable subs ### `get` -```{r opt_max_print_4b, include = FALSE} -options(max.print = 4L) # 2 rows -``` - ```{r old_get} v1 = "Petal.Width" v2 = "Sepal.Width" @@ -418,3 +416,8 @@ DT[, eval(cl)] DT[, cl, env = list(cl = cl)] ``` + +```{r cleanup, echo=FALSE} +options(.opts) +registerS3method("print", "data.frame", base::print.data.frame) +``` \ No newline at end of file diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 220a2a19a2..c96ed090f7 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -71,7 +71,7 @@ both (1) and (2) resulted in deep copy of the entire data.frame in versions of ` Great performance improvements were made in `R v3.1` as a result of which only a *shallow* copy is made for (1) and not *deep* copy. However, for (2) still, the entire column is *deep* copied even in `R v3.1+`. This means the more columns one subassigns to in the *same query*, the more *deep* copies R does. 
-#### *shallow* vs *deep* copy {.bs-callout .bs-callout-info} +#### *shallow* vs *deep* copy A *shallow* copy is just a copy of the vector of column pointers (corresponding to the columns in a *data.frame* or *data.table*). The actual data is not physically copied in memory. @@ -86,31 +86,27 @@ It can be used in `j` in two ways: (a) The `LHS := RHS` form - ```{r eval = FALSE} - DT[, c("colA", "colB", ...) := list(valA, valB, ...)] +```{r eval = FALSE} +DT[, c("colA", "colB", ...) := list(valA, valB, ...)] - # when you have only one column to assign to you - # can drop the quotes and list(), for convenience - DT[, colA := valA] - ``` +# when you have only one column to assign to you +# can drop the quotes and list(), for convenience +DT[, colA := valA] +``` (b) The functional form - ```{r eval = FALSE} - DT[, `:=`(colA = valA, # valA is assigned to colA - colB = valB, # valB is assigned to colB - ... - )] - ``` - -#### {.bs-callout .bs-callout-warning} +```{r eval = FALSE} +DT[, `:=`(colA = valA, # valA is assigned to colA + colB = valB, # valB is assigned to colB + ... +)] +``` Note that the code above explains how `:=` can be used. They are not working examples. We will start using them on `flights` *data.table* from the next section. # -#### {.bs-callout .bs-callout-info} - * In (a), `LHS` takes a character vector of column names and `RHS` a *list of values*. `RHS` just needs to be a `list`, irrespective of how its generated (e.g., using `lapply()`, `list()`, `mget()`, `mapply()` etc.). This form is usually easy to program with and is particularly useful when you don't know the columns to assign values to in advance. * On the other hand, (b) is handy if you would like to jot some comments down for later. @@ -140,7 +136,7 @@ head(flights) # flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)] ``` -#### Note that {.bs-callout .bs-callout-info} +#### Note that * We did not have to assign the result back to `flights`. @@ -166,8 +162,6 @@ We see that there are totally `25` unique values in the data. Both *0* and *24* flights[hour == 24L, hour := 0L] ``` -#### {.bs-callout .bs-callout-info} - * We can use `i` along with `:=` in `j` the very same way as we have already seen in the *"Introduction to data.table"* vignette. * Column `hour` is replaced with `0` only on those *row indices* where the condition `hour == 24L` specified in `i` evaluates to `TRUE`. @@ -186,7 +180,7 @@ Let's look at all the `hours` to verify. flights[, sort(unique(hour))] ``` -#### Exercise: {.bs-callout .bs-callout-warning #update-by-reference-question} +#### Exercise: {#update-by-reference-question} What is the difference between `flights[hour == 24L, hour := 0L]` and `flights[hour == 24L][, hour := 0L]`? Hint: The latter needs an assignment (`<-`) if you would want to use the result later. @@ -204,7 +198,7 @@ head(flights) # flights[, `:=`(delay = NULL)] ``` -#### {.bs-callout .bs-callout-info #delete-convenience} +#### {#delete-convenience} * Assigning `NULL` to a column *deletes* that column. And it happens *instantly*. @@ -229,8 +223,6 @@ flights[, max_speed := max(speed), by = .(origin, dest)] head(flights) ``` -#### {.bs-callout .bs-callout-info} - * We add a new column `max_speed` using the `:=` operator by reference. * We provide the columns to group by the same way as shown in the *Introduction to data.table* vignette. For each group, `max(speed)` is computed, which returns a single value. That value is recycled to fit the length of the group. 
Once again, no copies are being made at all. `flights` *data.table* is modified *in-place*. @@ -249,7 +241,6 @@ out_cols = c("max_dep_delay", "max_arr_delay") flights[, c(out_cols) := lapply(.SD, max), by = month, .SDcols = in_cols] head(flights) ``` -#### {.bs-callout .bs-callout-info} * We use the `LHS := RHS` form. We store the input column names and the new columns to add in separate variables and provide them to `.SDcols` and for `LHS` (for better readability). @@ -283,7 +274,6 @@ ans = foo(flights) head(flights) head(ans) ``` -#### {.bs-callout .bs-callout-info} * Note that the new column `speed` has been added to `flights` *data.table*. This is because `:=` performs operations by reference. Since `DT` (the function argument) and `flights` refer to the same object in memory, modifying `DT` also reflects on `flights`. @@ -293,8 +283,6 @@ head(ans) In the previous section, we used `:=` for its side effect. But of course this may not be always desirable. Sometimes, we would like to pass a *data.table* object to a function, and might want to use the `:=` operator, but *wouldn't* want to update the original object. We can accomplish this using the function `copy()`. -#### {.bs-callout .bs-callout-info} - The `copy()` function *deep* copies the input object and therefore any subsequent update by reference operations performed on the copied object will not affect the original object. # @@ -321,8 +309,6 @@ There are two particular places where `copy()` function is essential: head(ans) ``` -#### {.bs-callout .bs-callout-info} - * Using `copy()` function did not update `flights` *data.table* by reference. It doesn't contain the column `speed`. * And `ans` contains the maximum speed corresponding to each month. @@ -354,7 +340,7 @@ However we could improve this functionality further by *shallow* copying instead ## Summary -#### The `:=` operator {.bs-callout .bs-callout-info} +#### The `:=` operator * It is used to *add/update/delete* columns by reference. diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index c26d5510db..0b5d7a57d3 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -77,8 +77,6 @@ DT.m1 str(DT.m1) ``` -#### {.bs-callout .bs-callout-info} - * `measure.vars` specify the set of columns we would like to collapse (or combine) together. * We can also specify column *indices* instead of *names*. @@ -98,8 +96,6 @@ DT.m1 = melt(DT, measure.vars = c("dob_child1", "dob_child2", "dob_child3"), DT.m1 ``` -#### {.bs-callout .bs-callout-info} - * By default, when one of `id.vars` or `measure.vars` is missing, the rest of the columns are *automatically assigned* to the missing argument. * When neither `id.vars` nor `measure.vars` are specified, as mentioned under `?melt`, all *non*-`numeric`, `integer`, `logical` columns will be assigned to `id.vars`. @@ -118,8 +114,6 @@ That is, we'd like to collect all *child* observations corresponding to each `fa dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob") ``` -#### {.bs-callout .bs-callout-info} - * `dcast` uses *formula* interface. The variables on the *LHS* of formula represents the *id* vars and *RHS* the *measure* vars. * `value.var` denotes the column to be filled in with while casting to wide format. @@ -165,7 +159,7 @@ DT.c1 str(DT.c1) ## gender column is character type now! ``` -#### Issues {.bs-callout .bs-callout-info} +#### Issues 1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. 
Instead we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient). @@ -198,8 +192,6 @@ DT.m2 str(DT.m2) ## col type is preserved ``` -#### {.bs-callout .bs-callout-info} - * We can remove the `variable` column if necessary. * The functionality is implemented entirely in C, and is therefore both *fast* and *memory efficient* in addition to being *straightforward*. @@ -210,7 +202,7 @@ Usually in these problems, the columns we'd like to melt can be distinguished by ```{r} DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "gender")) -print(DT.m2, class=TRUE) +DT.m2 ``` #### - Using `measure()` to specify `measure.vars` via separator or pattern @@ -260,7 +252,7 @@ is used to convert the `child` string values to integers: ```{r} DT.m3 = melt(DT, measure = measure(value.name, child=as.integer, sep="_child")) -print(DT.m3, class=TRUE) +DT.m3 ``` In the code above we used `sep="_child"` which results in melting only @@ -288,12 +280,12 @@ groups, two numeric output columns, and an anonymous type conversion function, ```{r} -print(melt(who, measure.vars = measure( +melt(who, measure.vars = measure( diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))" -)), class=TRUE) +)) ``` ### b) Enhanced `dcast` @@ -312,15 +304,13 @@ DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "ge DT.c2 ``` -#### {.bs-callout .bs-callout-info} - * Attributes are preserved in result wherever possible. * Everything is taken care of internally, and efficiently. In addition to being fast, it is also very memory efficient. # -#### Multiple functions to `fun.aggregate`: {.bs-callout .bs-callout-info} +#### Multiple functions to `fun.aggregate`: You can also provide *multiple functions* to `fun.aggregate` to `dcast` for *data.tables*. Check the examples in `?dcast` which illustrates this functionality. diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 8e7919f34d..e7b08650e4 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -202,7 +202,8 @@ Note that the `x[y]` syntax returns `nrow(y)` values (i.e., it's a right join), Often, we'd like to perform some operation on our data _at the group level_. When we specify `by =` (or `keyby = `), the mental model for what happens when `data.table` processes `j` is to think of your `data.table` as being split into many component sub-`data.table`s, each of which corresponds to a single value of your `by` variable(s): -![Grouping, Illustrated](plots/grouping_illustration.png 'A visual depiction of how grouping works. On the left is a grid. The first column is titled "ID COLUMN" with values the capital letters A through G, and the rest of the data is unlabelled, but is in a darker color and simply has "Data" written to indicate that's arbitrary. A right arrow shows how this data is split into groups. Each capital letter A through G has a grid on the right-hand side; the grid on the left has been subdivided to create that on the right.') +![Grouping, Illustrated](plots/grouping_illustration.png) + In the case of grouping, `.SD` is multiple in nature -- it refers to _each_ of these sub-`data.table`s, _one-at-a-time_ (slightly more accurately, the scope of `.SD` is a single sub-`data.table`). 
This allows us to concisely express an operation that we'd like to perform on _each sub-`data.table`_ before the re-assembled result is returned to us. diff --git a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd index 374ccd66bb..6f2474c115 100644 --- a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd +++ b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd @@ -105,7 +105,7 @@ setkey(flights, origin) flights["JFK"] # or flights[.("JFK")] ``` -#### `setkey()` requires: {.bs-callout .bs-callout-info} +#### `setkey()` requires: a) computing the order vector for the column(s) provided, here, `origin`, and @@ -139,7 +139,7 @@ Since there can be multiple secondary indices, and creating an index is as simpl As we will see in the next section, the `on` argument provides several advantages: -#### `on` argument {.bs-callout .bs-callout-info} +#### `on` argument * enables subsetting by computing secondary indices on the fly. This eliminates having to do `setindex()` every time. From 5061828a9d1827409bdd24806622a5a5d6ca899f Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 3 Dec 2023 10:31:40 +0100 Subject: [PATCH 559/588] ignore base R warning on 32bit platforms, closes #5785 (#5786) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 59ca6aabd6..0063d9d8c4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10404,7 +10404,7 @@ test(1702.2, isoweek(as.Date(test_cases)), test_values) test(1702.3, isoweek(as.POSIXct(test_cases)), test_values) # 1% sample of a 400-year cycle of dates for extra robustness -if (test_R.utils) test(1702.4, isoweek((DT<-fread(testDir('isoweek_test.csv.bz2')))$input_date), DT$expected_output) +if (test_R.utils) test(1702.4, isoweek((DT<-fread(testDir('isoweek_test.csv.bz2')))$input_date), DT$expected_output, ignore.warning="datetimes before") ## ignore.warning due to #5785 # fread, ensure no shell commands #1702 if (.Platform$OS.type=="unix") { From a01ca604842aeec7469e3b80e2411f13bb9211f2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 4 Dec 2023 09:57:22 +0800 Subject: [PATCH 560/588] cast pointers to standard type for printf (#5787) --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 3356e918be..d433c2b54e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1197,7 +1197,7 @@ static R_len_t *savedtl=NULL, nalloc=0, nsaved=0; void savetl_init(void) { if (nsaved || nalloc || saveds || savedtl) { - error(_("Internal error: savetl_init checks failed (%d %d %p %p). please report to data.table issue tracker."), nsaved, nalloc, saveds, savedtl); // # nocov + error(_("Internal error: savetl_init checks failed (%d %d %p %p). please report to data.table issue tracker."), nsaved, nalloc, (void *)saveds, (void *)savedtl); // # nocov } nsaved = 0; nalloc = 100; From ae809e882c68727e99b2a84fbab5af5ef0f3b093 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Mon, 4 Dec 2023 18:07:36 +0100 Subject: [PATCH 561/588] remove survey advertise (#5789) --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index fbe2de22a2..8455602f12 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,6 @@ # data.table -The data.table 2023 community survey is now live! Click on https://tinyurl.com/datatable-survey to fill it out. The survey will remain open until **December 1st, 2023**. 
- -In addition to filling out the survey, it would be great if you could share it with others who might be interested in participating. - ---- - [![CRAN status](https://badges.cranchecks.info/flavor/release/data.table.svg)](https://cran.r-project.org/web/checks/check_results_data.table.html) [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) From f6146cea66d772175bfa26a318edbc349639acd8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 5 Dec 2023 05:54:55 -0800 Subject: [PATCH 562/588] Create devcontainer.json --- .devcontainer/devcontainer.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000000..bbda2085f8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,3 @@ +{ + "image": "registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc" +} From b147969e9d5c30aefc4f2e90b4950dde48ec357c Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 5 Dec 2023 16:36:45 +0100 Subject: [PATCH 563/588] GLCI rework (#5793) --- .ci/README.md | 64 +++----- .ci/deploy.sh | 30 ---- .ci/publish.R | 82 +++++++--- .gitlab-ci.yml | 363 ++++++++++++++++++------------------------ R/devel.R | 6 +- _pkgdown.yml | 35 ++-- man/update_dev_pkg.Rd | 15 +- 7 files changed, 271 insertions(+), 324 deletions(-) delete mode 100644 .ci/deploy.sh diff --git a/.ci/README.md b/.ci/README.md index 3f303e34ac..d684a598e3 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,72 +1,50 @@ # data.table continuous integration and deployment -On each Pull Request opened in GitHub we run Travis CI and Appveyor to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch. It tests more environments and different configurations. It publish variety of artifacts. +On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI nightly. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch every night. It tests more environments and different configurations. It publish variety of artifacts. ## Environments ### [GitLab CI](./../.gitlab-ci.yml) Test jobs: -- `test-rel-lin` - `r-release` on Linux, most comprehensive test environment, `-O3 -flto -fno-common -Wunused-result`, extra check for no compilation warnings, includes testing [_with other packages_](./../inst/tests/other.Rraw) -- `test-rel-cran-lin` - `--as-cran` on Linux, `-g0`, extra check for final status of `R CMD check` where we allow one NOTE (_size of tarball_). 
-- `test-dev-cran-lin` - `r-devel` and `--as-cran` on Linux, `--with-recommended-packages --enable-strict-barrier --disable-long-double`, tests for compilation warnings in pkg install and new NOTEs/Warnings in pkg check, and because it is R-devel it is marked as allow_failure -- `test-rel-vanilla-lin` - `r-release` on Linux, no suggested deps, no OpenMP, `-O0`, tracks memory usage during tests -- `test-310-cran-lin` - R 3.1.0 on Linux -- `test-344-cran-lin` - R 3.4.4 on Linux -- `test-350-cran-lin` - R 3.5.0 on Linux, no `r-recommended` -- `test-rel-win` - `r-release` on Windows -- `test-dev-win` - `r-devel` on Windows -- `test-old-win` - `r-oldrel` on Windows -- `test-rel-osx` - MacOSX build not yet deployed, see [#3326](https://github.com/Rdatatable/data.table/issues/3326) for status +- `test-lin-rel` - `r-release` on Linux, most comprehensive test environment, force all suggests, `-O3 -flto=auto -fno-common -Wunused-result`, test for no compilation warnings. +- `test-lin-rel-vanilla` - `r-release` on Linux, no suggested deps, no zlib, no OpenMP, flags `-g -O0 -fno-openmp`, skip manual and vignettes. +- `test-lin-rel-cran` - `--as-cran` on Linux, strict test for final status of `R CMD check`. +- `test-lin-dev-gcc-strict-cran` - `--as-cran` on Linux, `r-devel` built with `-enable-strict-barrier --disable-long-double`, test for compilation warnings, test for new NOTEs/WARNINGs from `R CMD check`. +- `test-lin-dev-clang-cran` - same as `gcc-strict` job but R built with `clang` and no `--enable-strict-barrier --disable-long-double` flags. +- `test-lin-310-cran` - R 3.1.0 on Linux, stated R dependency version. +- `test-win-rel` - `r-release` on Windows. +- `test-win-dev` - `r-devel` on Windows. +- `test-win-old` - `r-oldrel` on Windows. +- `test-mac-rel` - macOS build not yet available, see [#3326](https://github.com/Rdatatable/data.table/issues/3326) for status + +Tests jobs are allowed to fail, summary and logs of test jobs are later published at _CRAN-like checks_ page, see artifacts below. Artifacts: - [homepage](https://rdatatable.gitlab.io/data.table) - made with [pkgdown](https://github.com/r-lib/pkgdown) - [html manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/00Index.html) - [pdf manual](https://rdatatable.gitlab.io/data.table/web/packages/data.table/data.table.pdf) - [html vignettes](https://rdatatable.gitlab.io/data.table/library/data.table/doc/index.html) -- R packages repository for `data.table` and all _Suggests_ dependencies, url: `https://Rdatatable.gitlab.io/data.table` +- R packages repository for `data.table` and all _Suggests_ dependencies, url: `https://rdatatable.gitlab.io/data.table` - sources - Windows binaries for `r-release`, `r-devel` and `r-oldrel` - [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) -- [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) - note that all artifacts, including check results page, are being published only when all test jobs successfully pass, thus one will not see an _ERROR_ status there (unless error happened on a job marked as `allow_failure`). 
-- [docker images](https://gitlab.com/Rdatatable/data.table/container_registry) - copy/paste-able `docker pull` commands can be found at the bottom of our [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) +- [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) -### [Travis CI](./../.travis.yml) +### [GitHub Actions](./../.github/workflows) -Test jobs: -- `r-release` on Linux, includes code coverage check -- _(might be disabled)_ `r-release` on OSX - -Artifacts: -- R packages repository having `data.table` sources only, url: `https://Rdatatable.github.io/data.table` -- code coverage stats pushed to [codecov.io/gh/Rdatatable/data.table](https://codecov.io/gh/Rdatatable/data.table) +TODO document ### [Appveyor](./../.appveyor.yml) -Test jobs: -- Windows `r-release` -- _(might be disabled)_ Windows `r-devel` - -Artifacts: -- Windows `r-release` binaries accessed only via web UI +TODO document -## Tools +## CI tools ### [`ci.R`](./ci.R) -Base R implemented helper script, [originally proposed to R](https://svn.r-project.org/R/branches/tools4pkgs/src/library/tools/R/packages.R), that ease the process of extracting dependency information from description files, also to mirror packages and their recursive dependencies from CRAN to local CRAN-like directory. It is widely used in our [GitLab CI pipeline](./../.gitlab-ci.yml). +Base R implemented helper script, [originally proposed to base R](https://svn.r-project.org/R/branches/tools4pkgs/src/library/tools/R/packages.R), that ease the process of extracting dependency information from description files, and to mirror packages and their recursive dependencies from CRAN to local CRAN-like directory. It is used in [GitLab CI pipeline](./../.gitlab-ci.yml). ### [`publish.R`](./publish.R) -Base R implemented helper script to orchestrate generation of most artifacts. It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). - -### [`Dockerfile.in`](./Dockerfile.in) - -Template file to produce `Dockerfile` for, as of now, three docker images. Docker images are being built and published in [_deploy_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). -- `r-base-dev` using `r-release`: publish docker image of `data.table` on R-release -- `r-builder` using `r-release`: publish on R-release and OS dependencies for building Rmarkdown vignettes -- `r-devel`: publish docker image of `data.table` on R-devel built with `--with-recommended-packages --enable-strict-barrier --disable-long-double` - -### [`deploy.sh`](./deploy.sh) - -Script used on Travis CI to publish CRAN-like repository of `data.table` sources. It publishes to `gh-pages` branch in GitHub repository. It depends on a token, which is provided based on `secure` environment variable in [.travis.yml](./../.travis.yml). It has been generated by @jangorecki. +Base R implemented helper script to orchestrate generation of most artifacts and to arrange them nicely. It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml). diff --git a/.ci/deploy.sh b/.ci/deploy.sh deleted file mode 100644 index 6f01ef136f..0000000000 --- a/.ci/deploy.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -o errexit -o nounset -PKG_REPO=$PWD -PKG_TARBALL=$(ls -1t *.tar.gz | head -n 1) -cd .. 
- -addToDrat(){ - mkdir drat; cd drat - - ## Set up Repo parameters - git init - git config user.name "addToDrat" - git config user.email "addToDrat@travis.ci" - - ## Get drat repo - git remote add upstream "https://$GH_TOKEN@github.com/Rdatatable/data.table.git" 2>err.txt - git fetch upstream gh-pages 2>err.txt - git checkout gh-pages 2>err.txt - git reset --hard "88000defd316538c37af4c8dc842e73e7953f4e2" 2>err.txt - - Rscript -e "drat::insertPackage('$PKG_REPO/$PKG_TARBALL', \ - repodir = '.', \ - commit='Travis publish data.table: build $TRAVIS_COMMIT', \ - addFiles=TRUE, fields='Revision')" - git push --force upstream gh-pages 2>err.txt - -} - -addToDrat diff --git a/.ci/publish.R b/.ci/publish.R index ec35fe43f3..923b89b5d1 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -91,16 +91,16 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { ) vign = tools::getVignetteInfo(pkg, lib.loc=lib.loc) r_rel_ver = Sys.getenv("R_REL_VERSION") - r_devel_ver = Sys.getenv("R_DEVEL_VERSION") - r_oldrel_ver = Sys.getenv("R_OLDREL_VERSION") - stopifnot(nzchar(r_rel_ver), nzchar(r_devel_ver), nzchar(r_oldrel_ver)) + r_dev_ver = Sys.getenv("R_DEV_VERSION") + r_old_ver = Sys.getenv("R_OLD_VERSION") + stopifnot(nzchar(r_rel_ver), nzchar(r_dev_ver), nzchar(r_old_ver)) cran.home = "../../.." tbl.dl = c( sprintf(" Reference manual: %s.pdf, 00Index.html ", pkg, pkg, cran.home, pkg), if (nrow(vign)) sprintf("Vignettes:%s", paste(sprintf("%s
", cran.home, vign[,"PDF"], vign[,"Title"]), collapse="\n")), # location unline cran web/pkg/vignettes to not duplicate content, documentation is in ../../../library sprintf(" Package source: %s_%s.tar.gz ", cran.home,pkg, version, pkg, version), - sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release","r-oldrel"), bin_ver=c(r_devel_ver, r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), - sprintf(" macOS binaries: %s ", format.bins(ver=c("r-release","r-oldrel"), bin_ver=c(r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) + sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release","r-oldrel"), bin_ver=c(r_dev_ver, r_rel_ver, r_old_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), + sprintf(" macOS binaries: %s ", format.bins(ver=c("r-release","r-oldrel"), bin_ver=c(r_rel_ver, r_old_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) ) index.file = file.path(repodir, "web/packages", pkg, "index.html") if (!dir.exists(dirname(index.file))) dir.create(dirname(index.file), recursive=TRUE) @@ -155,24 +155,30 @@ plat <- function(x) if (grepl("^.*win", x)) "Windows" else if (grepl("^.*mac", x r.ver <- function(x) { tmp = strsplit(x, "-", fixed=TRUE)[[1L]] - if (length(tmp) < 2L) stop("test job names must be test-[r.version]-...") - v = tmp[2L] + if (length(tmp) < 3L) stop("test job names must be test-[lin|win|mac]-[r.version]-...") + v = tmp[3L] if (identical(v, "rel")) "r-release" else if (identical(v, "dev")) "r-devel" else if (identical(v, "old")) "r-oldrel" else { - if (grepl("\\D", v)) stop("second word in test job name must be rel/dev/old or numbers of R version") + if (grepl("\\D", v)) stop("third word in test job name must be rel/dev/old or numbers of R version") paste0("r-", paste(strsplit(v, "")[[1L]], collapse=".")) } } # this for now is constant but when we move to independent pipelines (commit, daily, weekly) those values can be different pkg.version <- function(job, pkg) { - dcf = read.dcf(file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "DESCRIPTION")) + Rcheck = file.path("bus", job, paste(pkg, "Rcheck", sep=".")) + if (!dir.exists(Rcheck)) + return(NA_character_) + dcf = read.dcf(file.path(Rcheck, "00_pkg_src", pkg, "DESCRIPTION")) dcf[,"Version"] } pkg.revision <- function(job, pkg) { - dcf = read.dcf(file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "DESCRIPTION")) + Rcheck = file.path("bus", job, paste(pkg, "Rcheck", sep=".")) + if (!dir.exists(Rcheck)) + return(NA_character_) + dcf = read.dcf(file.path(Rcheck, "00_pkg_src", pkg, "DESCRIPTION")) if ("Revision" %in% colnames(dcf)) { proj.url = Sys.getenv("CI_PROJECT_URL", "") if (!nzchar(proj.url)) { @@ -184,7 +190,10 @@ pkg.revision <- function(job, pkg) { } else "" } pkg.flags <- function(job, pkg) { - cc = file.path("bus", job, paste(pkg, "Rcheck", sep="."), pkg, "cc") ## data.table style cc file + Rcheck = file.path("bus", job, paste(pkg, "Rcheck", sep=".")) + if (!dir.exists(Rcheck)) + return(NA_character_) + cc = file.path(Rcheck, pkg, "cc") ## data.table style cc file if (file.exists(cc)) { d = readLines(cc) w.cflags = substr(d, 1, 7)=="CFLAGS=" @@ -254,6 +263,34 @@ check.flavors <- function(jobs, repodir="bus/integration/cran") { setNames(file.exists(file), file) } +log.copy <- function(job, repodir="bus/integration/cran") { + dir.create(job.checks<-file.path(repodir, "web", 
"checks", pkg<-"data.table", job), recursive=TRUE, showWarnings=FALSE) + to = file.path(job.checks, "log") + if (!file.exists(job_id_file <- file.path("bus", job, "id"))) + return(setNames(file.exists(to), "log")) + job_id = readLines(job_id_file, warn=FALSE)[1L] + from = sprintf("https://gitlab.com/Rdatatable/data.table/-/jobs/%s/raw", job_id) + download.file(from, to, method="wget", quiet=TRUE) + Sys.sleep(0.1) ## to not get ban from gitlab.com + setNames(file.exists(to), "log") +} + +ci.status <- function(job) { + if (!file.exists(status_file <- file.path("bus", job, "status"))) + return(NA_character_) + readLines(status_file, warn=FALSE)[1L] +} + +ci.log <- function(jobs, repodir="bus/integration/cran") { + pkg = "data.table" + ans = vector("character", length(jobs)) + logs = sapply(jobs, log.copy, repodir=repodir) + statuses = sapply(jobs, ci.status) + ans[!logs] = statuses[!logs] + ans[logs] = sprintf('%s', pkg[any(logs)], jobs[logs], statuses[logs]) + ans +} + check.index <- function(pkg, jobs, repodir="bus/integration/cran") { status = function(x) if (grepl("^.*ERROR", x)) "ERROR" else if (grepl("^.*WARNING", x)) "WARNING" else if (grepl("^.*NOTE", x)) "NOTE" else if (grepl("^.*OK", x)) "OK" else NA_character_ test.files = function(job, files, trim.name=FALSE, trim.exts=0L, pkg="data.table") { @@ -294,17 +331,18 @@ check.index <- function(pkg, jobs, repodir="bus/integration/cran") { } memouts }) - th = "FlavorVersionRevisionInstallStatusFlagsRout.failMemtest" + th = "FlavorVersionRevisionInstallStatusFlagsRout.failLogMemtest" tbl = sprintf( - "%s%s%sout%s%s%s%s", - sub("test-", "", jobs, fixed=TRUE), - sapply(jobs, pkg.version, pkg), - sapply(jobs, pkg.revision, pkg), - pkg, jobs, ## install - pkg, jobs, sapply(sapply(jobs, check.test, pkg="data.table"), status), ## check - sapply(jobs, pkg.flags, pkg), - mapply(test.files, jobs, routs, trim.exts=2L), # 1st fail, 2nd Rout, keep just: tests_x64/main - mapply(test.files, jobs, memouts, trim.name=TRUE) + "%s%s%sout%s%s%s%s%s", + sub("test-", "", jobs, fixed=TRUE), ## Flavor + sapply(jobs, pkg.version, pkg), ## Version + sapply(jobs, pkg.revision, pkg), ## Revision + pkg, jobs, ## Install + pkg, jobs, sapply(sapply(jobs, check.test, pkg="data.table"), status), ## Status + sapply(jobs, pkg.flags, pkg), ## Flags + mapply(test.files, jobs, routs, trim.exts=2L), ## Rout.fail: 1st fail, 2nd Rout, keep just: tests_x64/main + ci.log(jobs), ## CI job logs + mapply(test.files, jobs, memouts, trim.name=TRUE) ## Memtest // currently not used ) file = file.path(repodir, "web/checks", sprintf("check_results_%s.html", pkg)) writeLines(c( @@ -340,7 +378,7 @@ check.test <- function(job, pkg) { check[length(check)] } -move.bin <- function(job, bin.version, os.type, file="DESCRIPTION", silent=FALSE) { +move.bin <- function(job, bin.version, os.type, file="DESCRIPTION", silent=TRUE) { if (os.type=="unix") { stop("publish of linux binaries not supported") } else if (os.type=="windows") { diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 80fa5d00a7..60cf09bb55 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,9 @@ +workflow: + rules: + - if: '$CI_PIPELINE_SOURCE=="schedule" && $CI_COMMIT_REF_NAME=="master"' ## nightly scheduled pipeline at 4:15 UTC + - if: '$CI_PIPELINE_SOURCE=="web"' ## manually started from web UI + - if: '$CI_PIPELINE_SOURCE=="push" && $CI_COMMIT_REF_NAME!="master"' ## branches pushed to GL directly, mirror is set for master branch only + variables: CRAN_MIRROR: "https://cloud.r-project.org" _R_CHECK_FORCE_SUGGESTS_: "false" 
@@ -6,9 +12,18 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. - R_REL_VERSION: "4.3" - R_DEVEL_VERSION: "4.4" - R_OLDREL_VERSION: "4.2" + R_REL_VERSION: "4.3" + R_REL_WIN_BIN: "https://cloud.r-project.org/bin/windows/base/old/4.3.2/R-4.3.2-win.exe" + RTOOLS_REL_BIN: "https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5863-5818.exe" + RTOOLS43_HOME: "/c/rtools" + R_DEV_VERSION: "4.4" + R_DEV_WIN_BIN: "https://cloud.r-project.org/bin/windows/base/R-devel-win.exe" + RTOOLS_DEV_BIN: "https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5863-5818.exe" + RTOOLS44_HOME: "" ## in case R-devel will use new Rtools toolchain, now it uses 4.3 env var + R_OLD_VERSION: "4.2" + R_OLD_WIN_BIN: "https://cloud.r-project.org/bin/windows/base/old/4.2.3/R-4.2.3-win.exe" + RTOOLS_OLD_BIN: "https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe" + RTOOLS42_HOME: "/c/rtools" stages: - dependencies @@ -22,7 +37,7 @@ stages: expire_in: 2 weeks when: always paths: - - bus + - bus/$CI_JOB_NAME ## mirror packages # download all recursive dependencies once to be used across multiple test jobs @@ -40,9 +55,13 @@ mirror-packages: - echo 'source(".ci/ci.R")' >> .Rprofile - mkdir -p bus/$CI_JOB_NAME/cran/src/contrib - Rscript -e 'mirror.packages(dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran")' - - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEVEL_VERSION","R_OLDREL_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' + - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEV_VERSION","R_OLD_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts +## install deps aliases +.test-install-deps: &install-deps + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), repos=file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE)), quiet=FALSE)' + ## build # sources as tar.gz archive # build vignettes @@ -50,137 +69,78 @@ build: stage: build tags: - linux - image: registry.gitlab.com/jangorecki/dockerfiles/r-base ## r-base-gcc after rstudio/markdown#108 + image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc needs: ["mirror-packages"] before_script: - - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' + - *install-deps - rm -r bus + script: - sed -i '/^[[:space:]]*$/d' ./DESCRIPTION ## make last line end abruptly; i.e. without a final \n - echo "Revision:" $CI_COMMIT_SHA >> ./DESCRIPTION - script: - R CMD build . - - mkdir -p bus/$CI_JOB_NAME/cran/src/contrib - - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/cran/src/contrib/. 
- - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/build/cran"), fields="Revision", addFiles=TRUE)' + - mkdir -p bus/$CI_JOB_NAME/ + - mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/ <<: *artifacts -## install deps aliases -.test-install-deps: &install-deps - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), repos=file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE)), quiet=TRUE)' -.test-install-deps-win: &install-deps-win - - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='all'), repos=file.path('file://',getwd(),'bus/mirror-packages/cran'), quiet=TRUE)" - -## copy data.table tar.gz from bus R repo to current directory -.test-cp-src: &cp-src - - cp $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) . -.test-cp-src-win: &cp-src-win - - cp.exe $(ls.exe -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head.exe -n 1) . - -## move data.table tar.gz to bus -.test-mv-src: &mv-src - - mkdir -p bus/$CI_JOB_NAME && mv $(ls -1t data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME -.test-mv-src-win: &mv-src-win - - mkdir.exe -p bus/$CI_JOB_NAME; mv.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) bus/$CI_JOB_NAME - -## move data.table binaries to bus R repo -.test-mv-bin-win: &mv-bin-win - - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION - -## remove data.table tar.gz -.test-rm-src: &rm-src - - rm $(ls -1t data.table_*.tar.gz | head -n 1) -.test-rm-src-win: &rm-src-win - - rm.exe $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - -## install R on windows -.test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.3.2/R-4.3.2-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait -.test-install-r-devel-win: &install-r-devel-win - - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait -.test-install-r-oldrel-win: &install-r-oldrel-win - - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.2.3/R-4.2.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait - -## install Rtools on windows -.test-install-rtools42-win: &install-rtools42-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools42/files/rtools42-5355-5357.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools42" -NoNewWindow -Wait -.test-install-rtools43-win: &install-rtools43-win - - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools43/files/rtools43-5863-5818.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools43" -NoNewWindow -Wait - .test-template: &test stage: test needs: ["mirror-packages","build"] + allow_failure: true <<: *artifacts .test-lin-template: &test-lin <<: *test tags: - linux - -.test-win-template: &test-win - <<: *test - tags: - - windows - - shared-windows - -#.test-mac-template: &test-mac -# <<: *test -# tags: -# - macosx + before_script: + - cp $(ls -1t bus/build/data.table_*.tar.gz | head -n 1) . 
+ - mkdir -p ~/.R + after_script: + - mkdir -p bus/$CI_JOB_NAME + - echo $CI_JOB_ID > bus/$CI_JOB_NAME/id + - echo $CI_JOB_STATUS > bus/$CI_JOB_NAME/status + - echo $CI_JOB_IMAGE > bus/$CI_JOB_NAME/image + - '[ -d data.table.Rcheck ] && mv data.table.Rcheck bus/$CI_JOB_NAME/' ## most comprehensive tests # force all suggests # flags: gcc -O3 -flto=auto -fno-common -Wunused-result # tests for compilation warnings -test-rel-lin: +test-lin-rel: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table - needs: ["mirror-packages","build"] variables: _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" - before_script: + script: - apt-get update -q && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## remove after #5749 - *install-deps - - *cp-src - - rm -r bus - - mkdir -p ~/.R - - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -Wunused-result -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - script: - - *mv-src - - cd bus/$CI_JOB_NAME + - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - - *rm-src - (! grep "warning:" data.table.Rcheck/00install.out) ## vanilla minimal +# no zlib # no suggested deps # no vignettes or manuals # no openmp # flags: gcc -O0 -fno-openmp -test-rel-vanilla-lin: +test-lin-rel-vanilla: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc - allow_failure: true ## temp workaround #5484 - before_script: - - *cp-src - - rm -r bus - - mkdir -p ~/.R + script: - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - script: - - *mv-src - - cd bus/$CI_JOB_NAME - R CMD check --no-manual --ignore-vignettes $(ls -1t data.table_*.tar.gz | head -n 1) - - *rm-src ## R-release on Linux # strict checks for 0 NOTEs # extra NOTEs check and build pdf manual thus not from cran-lin template -test-rel-cran-lin: +test-lin-rel-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base variables: @@ -188,147 +148,139 @@ test-rel-cran-lin: _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes _R_CHECK_PKG_SIZES_THRESHOLD_: "7" ## MB 'checking installed package size' NOTE - before_script: - - apt-get update -q && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## temp workaround #5749 + script: + - apt-get -qq update && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## temp workaround for curl dep #5749 - *install-deps - - *cp-src - - rm -r bus - - mkdir -p ~/.R - echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - script: - - *mv-src - - cd 
bus/$CI_JOB_NAME - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - - *rm-src - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " but ", shQuote(l)) else q("no")' -## R-devel on Linux -# TODO: --enable-strict-barrier --disable-long-double +## R-devel on Linux gcc strict +# R built with --enable-strict-barrier --disable-long-double # tests for compilation warnings # tests for new notes -# thus allow_failure -test-dev-cran-lin: +test-lin-dev-gcc-strict-cran: <<: *test-lin - image: registry.gitlab.com/jangorecki/dockerfiles/r-devel - allow_failure: true ## to not be blocked by changes in r-devel + image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc-strict variables: _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" ## detects S3 method lookup found on search path #4777 _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" - before_script: + script: + - echo 'CFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - *install-deps - - *cp-src - - rm -r bus + - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) + - (! grep "warning:" data.table.Rcheck/00install.out) + - >- + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, suggested but not available for checking: 'curl', installed package size) but ", shQuote(l)) else q("no")' + +## R-devel on Linux clang +# R compiled with clang +# tests for compilation warnings +# tests for new notes +test-lin-dev-clang-cran: + <<: *test-lin + image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-clang + variables: + _R_CHECK_CRAN_INCOMING_: "TRUE" + _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" + _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" + _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" script: - - *mv-src - - cd bus/$CI_JOB_NAME - - R CMD check --as-cran --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - - *rm-src + - echo 'CFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - *install-deps + - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - (! 
grep "warning:" data.table.Rcheck/00install.out) - - >- ## this likely need an update but check fails now on complex NA so CI is not reaching here anyway - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, installed package size, top-level files) but ", shQuote(l)) else q("no")' + - >- + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, suggested but not available for checking: 'curl', installed package size) but ", shQuote(l)) else q("no")' ## R 3.1.0 # stated dependency on R -test-310-cran-lin: +test-lin-310-cran: image: registry.gitlab.com/jangorecki/dockerfiles/r-3.1.0 <<: *test-lin - before_script: - - *install-deps - - *cp-src - - rm -r bus script: - - *mv-src - - cd bus/$CI_JOB_NAME + - *install-deps - R CMD check --no-manual $(ls -1t data.table_*.tar.gz | head -n 1) - - *rm-src + +.test-win-template: &test-win + <<: *test + tags: + - shared-windows + before_script: + - curl.exe -s -o ../R-win.exe $R_BIN; Start-Process -FilePath ..\R-win.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../rtools.exe $RTOOLS_BIN; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools" -NoNewWindow -Wait + - $env:PATH = "C:\R\bin;C:\rtools\usr\bin;$env:PATH" + - Rscript.exe -e "source('.ci/ci.R'); install.packages(dcf.dependencies('DESCRIPTION', which='all'), repos=file.path('file://',getwd(),'bus/mirror-packages/cran'), quiet=TRUE)" + - cp.exe $(ls.exe -1t bus/build/data.table_*.tar.gz | head.exe -n 1) . 
+ script: + - R.exe CMD check --no-manual $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + after_script: + - $env:PATH = "C:\R\bin;C:\rtools\usr\bin;$env:PATH" + - mkdir.exe -p bus/$CI_JOB_NAME + - Rscript.exe -e "cat(Sys.getenv('CI_JOB_ID'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'id'))" + - Rscript.exe -e "cat(Sys.getenv('CI_JOB_STATUS'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'status'))" + - Rscript.exe -e "cat(Sys.getenv('CI_JOB_IMAGE'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'image'))" + - Rscript.exe -e "if (dir.exists(from<-'data.table.Rcheck')) file.rename(from, file.path('bus', Sys.getenv('CI_JOB_NAME'), 'data.table.Rcheck'))" + - Rscript.exe -e "if (length(from<-tail(list.files('^data\\.table_.*\\.zip$'), 1L))) file.rename(from, file.path('bus', Sys.getenv('CI_JOB_NAME'), from))" ## R-release on Windows # test and build binaries -test-rel-win: +test-win-rel: <<: *test-win variables: R_VERSION: "$R_REL_VERSION" - before_script: - - *install-r-rel-win - - *install-rtools43-win - - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" - - *install-deps-win - - *cp-src-win - - rm.exe -r bus - script: - - *mv-src-win - - cd bus/$CI_JOB_NAME - - R.exe CMD check --no-manual $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - *rm-src-win - - *mv-bin-win + R_BIN: "$R_REL_WIN_BIN" + RTOOLS_BIN: "$RTOOLS_REL_BIN" ## R-devel on Windows # test and build binaries -test-dev-win: +test-win-dev: <<: *test-win variables: - R_VERSION: "$R_DEVEL_VERSION" - allow_failure: true ## temp workaround #5748 - before_script: - - *install-r-devel-win - - *install-rtools43-win - - $ENV:PATH = "C:\R\bin;C:\rtools43\usr\bin;$ENV:PATH" - - *install-deps-win - - *cp-src-win - - rm.exe -r bus - script: - - *mv-src-win - - cd bus/$CI_JOB_NAME - - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - *rm-src-win - - *mv-bin-win + R_VERSION: "$R_DEV_VERSION" + R_BIN: "$R_DEV_WIN_BIN" + RTOOLS_BIN: "$RTOOLS_DEV_BIN" ## R-oldrel on Windows # test and build binaries -test-old-win: +test-win-old: <<: *test-win variables: - R_VERSION: "$R_OLDREL_VERSION" + R_VERSION: "$R_OLD_VERSION" + R_BIN: "$R_OLD_WIN_BIN" + RTOOLS_BIN: "$RTOOLS_OLD_BIN" + +.test-mac-template: &test-mac + <<: *test + tags: + - saas-macos-medium-m1 before_script: - - *install-r-oldrel-win - - *install-rtools42-win - - $ENV:PATH = "C:\R\bin;C:\rtools42\usr\bin;$ENV:PATH" - - *install-deps-win - - *cp-src-win - - rm.exe -r bus - script: - - *mv-src-win - - cd bus/$CI_JOB_NAME - - R.exe CMD check --no-manual --ignore-vignettes $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - - *rm-src-win - - *mv-bin-win + - cp $(ls -1t bus/build/data.table_*.tar.gz | head -n 1) . 
+ after_script: + - mkdir -p bus/$CI_JOB_NAME + - '[ -d data.table.Rcheck ] && mv data.table.Rcheck bus/$CI_JOB_NAME/' + #- '[ -f data.table_*.tgz ] && mv $(ls -1t data.table_*.tgz | head -n 1) bus/$CI_JOB_NAME/' + - echo $CI_JOB_ID > bus/$CI_JOB_NAME/id + - echo $CI_JOB_STATUS > bus/$CI_JOB_NAME/status + - echo $CI_JOB_IMAGE > bus/$CI_JOB_NAME/image ## R-release on MacOS # no macosx runner set yet -#test-rel-mac: -# <<: *test-mac -# variables: -# R_VERSION: "$R_REL_VERSION" -# before_script: -# - *install-deps -# - *cp-src -# - rm -r bus -# script: -# - *mv-src -# - cd bus/$CI_JOB_NAME -# - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) -# - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) -# - mkdir -p cran/bin/macosx/el-capitan/contrib/$R_VERSION -# - mv $(ls -1t data.table_*.tgz | head -n 1) cran/bin/macosx/el-capitan/contrib/$R_VERSION -# - *rm-src -# - *mv-bin-mac +.test-mac-rel: + <<: *test-mac + variables: + R_VERSION: "$R_REL_VERSION" + script: + - *install-deps + - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) + - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) ## integrate artifacts # merging package tarballs and binaries into single R repository @@ -342,10 +294,13 @@ integration: - linux only: - master - needs: ["mirror-packages","build","test-rel-lin","test-rel-cran-lin","test-dev-cran-lin","test-rel-vanilla-lin","test-310-cran-lin","test-rel-win","test-dev-win","test-old-win"] + needs: ["mirror-packages","build","test-lin-rel","test-lin-rel-cran","test-lin-dev-gcc-strict-cran","test-lin-dev-clang-cran","test-lin-rel-vanilla","test-lin-310-cran","test-win-rel","test-win-dev" ,"test-win-old"] script: - R --version + - *install-deps ## markdown pkg not present in r-pkgdown image + - rm -rf ./vignettes ## r-lib/pkgdown#2383 - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' + - sed -i 's!!!g' pkgdown/index.html ## html manual, vignettes, repos, cran_web, cran_checks - echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile ## list of available test-* jobs dynamically based on bus/test-* directories @@ -355,30 +310,33 @@ integration: ## delete any existing non-dev version of data.table - rm -f bus/mirror-packages/cran/src/contrib/data.table_*.tar.gz - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_REL_VERSION/data.table_*.zip - - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEVEL_VERSION/data.table_*.zip - - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_OLDREL_VERSION/data.table_*.zip + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_DEV_VERSION/data.table_*.zip + - rm -f bus/mirror-packages/cran/bin/windows/contrib/$R_OLD_VERSION/data.table_*.zip #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_REL_VERSION/data.table_*.tgz - #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEVEL_VERSION/data.table_*.tgz - #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_OLDREL_VERSION/data.table_*.tgz + #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_DEV_VERSION/data.table_*.tgz + #- rm -f bus/mirror-packages/cran/bin/macosx/el-capitan/contrib/$R_OLD_VERSION/data.table_*.tgz ## merge mirror-packages and R devel packages - mv bus/mirror-packages/cran bus/$CI_JOB_NAME/ ## publish package sources - mkdir -p bus/$CI_JOB_NAME/cran/library bus/$CI_JOB_NAME/cran/doc - - mv $(ls -1t bus/build/cran/src/contrib/data.table_*.tar.gz | head -n 1) bus/$CI_JOB_NAME/cran/src/contrib + - mv $(ls -1t bus/build/data.table_*.tar.gz | 
head -n 1) bus/$CI_JOB_NAME/cran/src/contrib - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="source"), type="source", fields="Revision", addFiles=TRUE)' ## publish binaries - - Rscript -e 'move.bin("test-rel-win", Sys.getenv("R_REL_VERSION"), os.type="windows")' - - Rscript -e 'move.bin("test-dev-win", Sys.getenv("R_DEVEL_VERSION"), os.type="windows", silent=TRUE)' - - Rscript -e 'move.bin("test-old-win", Sys.getenv("R_OLDREL_VERSION"), os.type="windows")' + - mkdir -p bus/integration/cran/bin/windows/contrib/$R_REL_VERSION/ + - mkdir -p bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION/ + - mkdir -p bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION/ + - '[ -f bus/test-win-rel/data.table_*.zip ] && cp bus/test-win-rel/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_REL_VERSION/' + - '[ -f bus/test-win-dev/data.table_*.zip ] && cp bus/test-win-dev/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION/' + - '[ -f bus/test-win-old/data.table_*.zip ] && cp bus/test-win-old/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION/' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_REL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEVEL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_OLDREL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'move.bin("test-rel-mac", Sys.getenv("R_REL_VERSION"), os.type="macosx")' - #- Rscript -e 'move.bin("test-dev-mac", Sys.getenv("R_DEVEL_VERSION"), os.type="macosx")' - #- Rscript -e 'move.bin("test-old-mac", Sys.getenv("R_OLDREL_VERSION"), os.type="macosx")' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEV_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_OLD_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'move.bin("test-mac-rel", Sys.getenv("R_REL_VERSION"), os.type="macosx")' + #- Rscript -e 'move.bin("test-mac-dev", Sys.getenv("R_DEV_VERSION"), os.type="macosx")' + #- Rscript -e 'move.bin("test-mac-old", Sys.getenv("R_OLD_VERSION"), os.type="macosx")' #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_REL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_DEVEL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' - #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_OLDREL_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_DEV_VERSION")), type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' + #- Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="mac.binary.el-capitan", ver=Sys.getenv("R_OLD_VERSION")), 
type="mac.binary.el-capitan", fields="Revision", addFiles=TRUE)' ## install all pkgs to render html and double check successful installation of all devel packages - mkdir -p /tmp/opencran/library /tmp/opencran/doc/html ## reset R_LIBS_USER to re-install all with html because pkgdown image has pre installed curl knitr - R_LIBS_USER="" Rscript -e 'install.packages("data.table", dependencies=TRUE, lib="/tmp/opencran/library", repos=file.path("file:",normalizePath("bus/integration/cran")), INSTALL_opts="--html", quiet=TRUE)' @@ -397,7 +355,7 @@ integration: ## web/checks/$pkg/$job 00install.out, 00check.log, *.Rout, memtest.csv, memtest.png ## memtest not available for now #5764 - Rscript -e 'sapply(names(test.jobs), check.copy, simplify=FALSE)' ## web/packages/$pkg/$pkg.pdf - - Rscript -e 'pdf.copy("data.table", "test-rel-lin")' + - Rscript -e 'pdf.copy("data.table", "test-lin-rel")' ## web/checks/check_results_$pkg.html - Rscript -e 'check.index("data.table", names(test.jobs))' ## web/checks/check_flavors.html @@ -405,11 +363,6 @@ integration: ## pkgdown merge - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ - ## cleanup artifacts from other jobs - - mkdir tmpbus - - mv bus/$CI_JOB_NAME tmpbus - - rm -r bus - - mv tmpbus bus <<: *artifacts ## publish diff --git a/R/devel.R b/R/devel.R index 8bd7a1466a..3aed1017f8 100644 --- a/R/devel.R +++ b/R/devel.R @@ -17,9 +17,8 @@ dcf.repo = function(pkg, repo, field, type) { dcf[dcf[,"Package"]==pkg, field][[1L]] } -update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { +update_dev_pkg = function(pkg="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, ...) { # this works for any package, not just data.table - pkg = object # perform package upgrade when new Revision present stopifnot(is.character(pkg), length(pkg)==1L, !is.na(pkg), is.character(repo), length(repo)==1L, !is.na(repo), @@ -28,7 +27,7 @@ update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i # get Revision field from remote repository PACKAGES file una = is.na(ups<-dcf.repo(pkg, repo, field, type)) if (una) - catf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. Otherwise package will be re-installed every time, proceeding to installation.\n", + catf("No revision information found in DESCRIPTION file for %s package. Make sure that '%s' is correct field in PACKAGES file in your package repository '%s'. 
Otherwise package will be re-installed every time, proceeding to installation.\n", pkg, field, contrib.url(repo, type=type)) # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) @@ -44,6 +43,7 @@ update_dev_pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i unname(read.dcf(system.file("DESCRIPTION", package=pkg, lib.loc=lib, mustWork=TRUE), fields=field)[, field]), utils::packageVersion(pkg, lib.loc=lib))) }) + invisible(upg) } # non-exported utility when using devel version #3272: data.table:::.git() diff --git a/_pkgdown.yml b/_pkgdown.yml index 4b02b39491..117ec29574 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,8 +1,4 @@ -url: https://Rdatatable.gitlab.io/data.table - -template: - params: - ganalytics: UA-129166154-2 +url: https://rdatatable.gitlab.io/data.table development: version_tooltip: "Development version" @@ -11,6 +7,8 @@ home: links: - text: CRAN-like website href: web/packages/data.table/index.html + - text: CRAN-like checks + href: web/checks/check_results_data.table.html navbar: structure: @@ -22,30 +20,30 @@ navbar: href: index.html introduction: text: Introduction - href: articles/datatable-intro.html + href: library/data.table/doc/datatable-intro.html articles: text: Vignettes menu: - text: "Introduction to data.table" - href: articles/datatable-intro.html + href: library/data.table/doc/datatable-intro.html - text: "Reference semantics" - href: articles/datatable-reference-semantics.html + href: library/data.table/doc/datatable-reference-semantics.html - text: "Using .SD for Data Analysis" - href: articles/datatable-sd-usage.html + href: library/data.table/doc/datatable-sd-usage.html - text: "Keys and fast binary search based subset" - href: articles/datatable-keys-fast-subset.html + href: library/data.table/doc/datatable-keys-fast-subset.html - text: "Secondary indices and auto indexing" - href: articles/datatable-secondary-indices-and-auto-indexing.html + href: library/data.table/doc/datatable-secondary-indices-and-auto-indexing.html - text: "Efficient reshaping using data.table" - href: articles/datatable-reshape.html + href: library/data.table/doc/datatable-reshape.html - text: "Programming on data.table" - href: articles/datatable-programming.html + href: library/data.table/doc/datatable-programming.html - text: "Frequently asked questions" - href: articles/datatable-faq.html + href: library/data.table/doc/datatable-faq.html - text: "Importing data.table" - href: articles/datatable-importing.html + href: library/data.table/doc/datatable-importing.html - text: "Benchmarking data.table" - href: articles/datatable-benchmarking.html + href: library/data.table/doc/datatable-benchmarking.html news: text: News href: news/index.html @@ -64,3 +62,8 @@ navbar: github: icon: fab fa-github fa-lg href: https://github.com/Rdatatable/data.table + +templates: + includes: + in_header: | + diff --git a/man/update_dev_pkg.Rd b/man/update_dev_pkg.Rd index 3db5b98316..66fff0422d 100644 --- a/man/update_dev_pkg.Rd +++ b/man/update_dev_pkg.Rd @@ -2,14 +2,14 @@ \alias{update_dev_pkg} \title{Perform update of development version of a package} \description{ - Downloads and installs latest development version only when a new commit is available which has also passed all tests. Defaults are set to update \code{data.table}, other packages can be used as well. 
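As a rough aside on the Revision mechanism discussed in this help page (a sketch, not part of the manual text): the field can be inspected in the installed DESCRIPTION, where it is normally missing for a CRAN install and a git SHA for an install from the devel repository.

```r
# read the Revision field of the installed copy of data.table;
# NA means the installed copy does not record a git SHA (e.g. a CRAN install)
rev <- read.dcf(system.file("DESCRIPTION", package = "data.table"),
                fields = "Revision")[, "Revision"]
rev
```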
Their repository has to include git commit information in PACKAGES file. + Downloads and installs latest development version, only when a new commit is available. Defaults are set to update \code{data.table}, other packages can be used as well. Repository of a package has to include git commit SHA information in PACKAGES file. } -\usage{update_dev_pkg(object="data.table", +\usage{update_dev_pkg(pkg="data.table", repo="https://Rdatatable.gitlab.io/data.table", field="Revision", type=getOption("pkgType"), lib=NULL, \dots) } \arguments{ - \item{object}{ character scalar, package name. } + \item{pkg}{ character scalar, package name. } \item{repo}{ character scalar, url of package devel repository. } \item{field}{ character scalar, metadata field to use in PACKAGES file and DESCRIPTION file, default \code{"Revision"}. } @@ -20,13 +20,18 @@ \item{\dots}{ passed to \code{\link[utils]{install.packages}}. } } \details{ - In case if a devel repository does not provide binaries user will need development tools installed for package compilation, like \emph{Rtools} on Windows, and eventually set \code{type="source"}. + In case if a devel repository does not provide binaries user will need development tools installed for package compilation, like \emph{Rtools} on Windows, or alternatively eventually set \code{type="source"}. +} +\section{data.table repositories}{ + By default the function uses our GitLab-hosted R repository at \code{https://Rdatatable.gitlab.io/data.table}. This repository is updated nightly. It runs multiple test jobs (on top of GitHub tests jobs run upstream) and publish the package (sources and binaries), even if GitLab test jobs are failing. Status of GitLab test jobs can be checked at \href{https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html}{Package Check Results}.\cr + We also publish bleeding edge version of the package on GitHub-hosted R repository at \code{https://Rdatatable.gitlab.io/data.table} (just minor change in url from \emph{lab} to \emph{hub}). GitHub version should be considered less stable than GitLab one. It publishes only package sources.\cr + There are also other repositories maintained by R community, for example \code{https://rdatatable.r-universe.dev}. Those can be used as well, but as they are unlikely to provide git commit SHA, the function will install the package even if latest version is already installed. } \note{ Package namespace is unloaded before attempting to install newer version. } \value{ - NULL. + Invisibly \code{TRUE} if package was updated, otherwise \code{FALSE}. } \examples{ \dontshow{ # using if(FALSE) because \dontrun could still be run by --run-dontrun; #5421 } From 155bb87826e38ea054000ae24a67d945d297ea93 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 6 Dec 2023 09:55:22 +0800 Subject: [PATCH 564/588] Fix two "unusual" format warnings (#5792) * Fix two "unusual" format warnings * Add comment, find more %d usages * Update src/assign.c --------- Co-authored-by: jangorecki --- src/assign.c | 2 +- src/fread.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/assign.c b/src/assign.c index d433c2b54e..8ae56e3c69 100644 --- a/src/assign.c +++ b/src/assign.c @@ -471,7 +471,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // strong error message for now. 
else if (TRUELENGTH(names) != oldtncol) // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didnt work - error(_("Internal error: selfrefnames is ok but tl names [%ld] != tl [%d]"), TRUELENGTH(names), oldtncol); // # nocov + error(_("Internal error: selfrefnames is ok but tl names [%lld] != tl [%d]"), (long long)TRUELENGTH(names), oldtncol); // # nocov SETLENGTH(dt, oldncol+LENGTH(newcolnames)); SETLENGTH(names, oldncol+LENGTH(newcolnames)); for (int i=0; i Date: Tue, 5 Dec 2023 17:55:57 -0800 Subject: [PATCH 565/588] ignore new .devcontainer dir --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 5f47bbacdb..08508569d3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,6 +12,7 @@ ^\.ci$ ^\.dev$ +^\.devcontainer$ ^\.graphics$ ^\.github$ From 3535631b06707e8922e54f7a3cf3896beaf19238 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 6 Dec 2023 10:48:36 +0800 Subject: [PATCH 566/588] Use %zu for sizeof() formats (#5791) --- src/assign.c | 2 +- src/gsumm.c | 6 +++--- src/init.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/assign.c b/src/assign.c index 8ae56e3c69..ce2c707dfd 100644 --- a/src/assign.c +++ b/src/assign.c @@ -809,7 +809,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // # nocov start for (int k=0; k Date: Wed, 6 Dec 2023 11:20:40 +0800 Subject: [PATCH 567/588] Add a simple dockerfile for extending basic CI image to do dev stuff (first: git) (#5795) * Dockerfile for dev * add git --- .devcontainer/Dockerfile | 4 ++++ .devcontainer/devcontainer.json | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/Dockerfile diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000000..0fb2392aed --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,4 @@ +FROM registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc + +RUN apt-get -qq update \ + && apt-get install -y --no-install-recommends git diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index bbda2085f8..a1447f19e1 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,3 +1,3 @@ { - "image": "registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc" + "build": { "dockerfile": "Dockerfile" } } From d8f7a3008723be0746ae41911ff29a3211bb8997 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 6 Dec 2023 15:11:00 +0800 Subject: [PATCH 568/588] Fix tests for complex (#5796) * fix tests for complex * Use NA_complex_ --- inst/tests/tests.Rraw | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0063d9d8c4..a5c0ce3b06 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9935,7 +9935,7 @@ test(1658.56, fwrite(data.table(exp(1) - pi*1i)), output='2.718[0-9]*-3.141[0-9] ## formerly 1658.46 DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c(3i,4i,5i))) test(1658.57, fwrite(DT), output='0+3i|0+4i|0+5i') -DT[ , b := c(1i, -1-1i, NA)] +DT[ , b := c(1i, -1-1i, NA_complex_)] test(1658.58, fwrite(DT), output='a,b\n1,0\\+1i\n2,-1-1i\n3,$') # more coverage @@ -10964,7 +10964,7 @@ test(1743.217, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor test(1743.218, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 4, 2, 3)), class), y = c(e = 
"character", d = "factor", b = "factor", c = "factor")) test(1743.22, fread("a,b,c\n1999/01/01,2,f", colClasses=list(Date=1L), drop="a"), data.table(b=2L, c="f")) -test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), +test(1743.231, fread("a,b,c\n2,1,4j", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4j"), warning=paste0(base_messages$coerce_na, ".*left as type 'character'")) test(1743.232, fread("a,b,c\n2,1,3+4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c=3+4i)) test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b"), drop="a"), data.table(b=2L, c="f")) @@ -15406,9 +15406,9 @@ test(2060.503, xx_addr, address(xx)) test(2060.504, xx, x) test(2060.505, address(setcoalesce(xx)), xx_addr) # complex support for fcoalesce -z1 = c(1i, NA, 1-1i, NA, 0+3i, NA) -z2 = c(NA, 4-2i, 0+0i, NA, NA, NA) -z3 = c(2, NA, 3+6i, 5-1i, NA, NA) +z1 = c(1i, NA_complex_, 1-1i, NA_complex_, 0+3i, NA_complex_) +z2 = c(NA_complex_, 4-2i, 0+0i, NA_complex_, NA_complex_, NA_complex_) +z3 = c(2, NA_complex_, 3+6i, 5-1i, NA_complex_, NA_complex_) na_idx = c(2L, 4L, 6L) test(2060.600, fcoalesce(z1, 0+0i), `[<-`(z1, na_idx, 0+0i)) test(2060.601, fcoalesce(z1, z2), `[<-`(z1, na_idx, c(4-2i, NA, NA))) @@ -15509,7 +15509,7 @@ z = c(1:3) + c(3:1)*1i test(2067.1, shift(z), c(NA, z[1:2])) test(2067.2, shift(z, type = 'lead'), c(z[2:3], NA)) test(2067.3, shift(z, fill = 1i), c(1i, z[1:2])) -test(2067.4, shift(list(z, 1:3)), list(c(NA, z[1:2]), c(NA, 1:2))) +test(2067.4, shift(list(z, 1:3)), list(c(NA_complex_, z[1:2]), c(NA, 1:2))) test(2067.5, shift(z, n=1, type = 'cyclic'), c(z[3], z[1:2])) test(2067.6, shift(z, n=-1, type = 'cyclic'), c(z[2:3], z[1])) test(2067.7, shift(list(z, 1L:3L), n=1, type = 'cyclic'), list(c(z[3], z[1:2]), c(3L, 1:2))) From 25064ee2846605c06fd1eedb8e05e77fcc1c58b1 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 6 Dec 2023 13:15:48 +0100 Subject: [PATCH 569/588] plausible.io website traffic statistics (#5799) --- .gitlab-ci.yml | 4 +++- _pkgdown.yml | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 60cf09bb55..6844c88539 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -266,6 +266,7 @@ test-win-old: after_script: - mkdir -p bus/$CI_JOB_NAME - '[ -d data.table.Rcheck ] && mv data.table.Rcheck bus/$CI_JOB_NAME/' + ## no pattern matching in [, TODO when macos available #- '[ -f data.table_*.tgz ] && mv $(ls -1t data.table_*.tgz | head -n 1) bus/$CI_JOB_NAME/' - echo $CI_JOB_ID > bus/$CI_JOB_NAME/id - echo $CI_JOB_STATUS > bus/$CI_JOB_NAME/status @@ -300,7 +301,6 @@ integration: - *install-deps ## markdown pkg not present in r-pkgdown image - rm -rf ./vignettes ## r-lib/pkgdown#2383 - Rscript -e 'pkgdown::build_site(override=list(destination="./pkgdown"))' - - sed -i 's!!!g' pkgdown/index.html ## html manual, vignettes, repos, cran_web, cran_checks - echo 'source(".ci/ci.R"); source(".ci/publish.R")' >> .Rprofile ## list of available test-* jobs dynamically based on bus/test-* directories @@ -363,6 +363,8 @@ integration: ## pkgdown merge - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from 
pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ + ## add plausible.io stats + - find bus/integration/cran -type f -iname "*.html" | xargs sed -i 's!!!g' <<: *artifacts ## publish diff --git a/_pkgdown.yml b/_pkgdown.yml index 117ec29574..1b9478e386 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -62,8 +62,3 @@ navbar: github: icon: fab fa-github fa-lg href: https://github.com/Rdatatable/data.table - -templates: - includes: - in_header: | - From 5e8ca96f7a9505704289f008f8a940ed2eefe83e Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 6 Dec 2023 13:52:48 +0100 Subject: [PATCH 570/588] follow up of compilation warnings (#5800) --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index af439bc0f8..7369141c9f 100644 --- a/src/init.c +++ b/src/init.c @@ -160,7 +160,7 @@ static void setSizes(void) { __sizes[CPLXSXP] = sizeof(Rcomplex); __typeorder[CPLXSXP] = 4; __sizes[STRSXP] = sizeof(SEXP *); __typeorder[STRSXP] = 5; __sizes[VECSXP] = sizeof(SEXP *); __typeorder[VECSXP] = 6; // list column - if (sizeof(char *)>8) error(_("Pointers are %lu bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); + if (sizeof(char *)>8) error(_("Pointers are %zu bytes, greater than 8. We have not tested on any architecture greater than 64bit yet."), sizeof(char *)); // One place we need the largest sizeof is the working memory malloc in reorder.c } From 63300ffb6373da6a1440a9da97420280e82ad32b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:09:28 +0100 Subject: [PATCH 571/588] Remove `curl` from suggests (#5749) --- DESCRIPTION | 2 +- NEWS.md | 2 ++ R/fread.R | 16 ++++++---------- man/fread.Rd | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6756db8ae1..6ba6d92268 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Version: 1.14.9 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, zoo (>= 1.8-1), yaml, knitr, markdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), R.utils, xts, zoo (>= 1.8-1), yaml, knitr, markdown Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table diff --git a/NEWS.md b/NEWS.md index 52333e9b3c..0aaa2e4365 100644 --- a/NEWS.md +++ b/NEWS.md @@ -561,6 +561,8 @@ identical(DT1, DT2) # TRUE ``` +55. `fread(URL)` with `https:` and `ftps:` could timeout if proxy settings were not guessed right by `curl::curl_download`, [#1686](https://github.com/Rdatatable/data.table/issues/1686). `fread(URL)` now uses `download.file()` as default for downloading files from urls. Thanks to @cderv for the report and Benjamin Schwendinger for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. 
Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/fread.R b/R/fread.R index e0337c5915..8e9a11b123 100644 --- a/R/fread.R +++ b/R/fread.R @@ -76,17 +76,13 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 # nocov start tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - if (w<=2L) { # https: or ftps: - if (!requireNamespace("curl", quietly = TRUE)) - stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - - curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) - } else { - method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 - else getOption("download.file.method", default="auto") # http: or ftp: - download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + if (w<=2L && base::getRversion()<"3.2.2") { # https: or ftps: can be read by default by download.file() since 3.2.2 + stopf("URL requires download.file functionalities from R >=3.2.2. You can still manually download the file and fread the downloaded file.") } + method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 + else getOption("download.file.method", default="auto") # http: or ftp: + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) file = tmpFile on.exit(unlink(tmpFile), add=TRUE) # nocov end diff --git a/man/fread.Rd b/man/fread.Rd index 78c8a76289..4456e11d10 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -115,7 +115,7 @@ Currently, the \code{yaml} setting is somewhat inflexible with respect to incorp \bold{File Download:} -When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \code{fread} detects this and \emph{downloads} the target to a temporary file (at \code{tempfile()}) before proceeding to read the file as usual. Secure URLS (ftps:// and https://) are downloaded with \code{curl::curl_download}; ftp:// and http:// paths are downloaded with \code{download.file} and \code{method} set to \code{getOption("download.file.method")}, defaulting to \code{"auto"}; and file:// is downloaded with \code{download.file} with \code{method="internal"}. 
NB: this implies that for file://, even files found on the current machine will be "downloaded" (i.e., hard-copied) to a temporary file. See \code{\link{download.file}} for more details. +When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \code{fread} detects this and \emph{downloads} the target to a temporary file (at \code{tempfile()}) before proceeding to read the file as usual. URLS (ftps:// and https:// as well as ftp:// and http://) paths are downloaded with \code{download.file} and \code{method} set to \code{getOption("download.file.method")}, defaulting to \code{"auto"}; and file:// is downloaded with \code{download.file} with \code{method="internal"}. NB: this implies that for file://, even files found on the current machine will be "downloaded" (i.e., hard-copied) to a temporary file. See \code{\link{download.file}} for more details. \bold{Shell commands:} From 4bf4ef328e12705192e068f299ddb611447f6e67 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 6 Dec 2023 17:45:06 +0100 Subject: [PATCH 572/588] improvements to CI (#5802) --- .ci/publish.R | 1 + .gitlab-ci.yml | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.ci/publish.R b/.ci/publish.R index 923b89b5d1..0657790d25 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -379,6 +379,7 @@ check.test <- function(job, pkg) { } move.bin <- function(job, bin.version, os.type, file="DESCRIPTION", silent=TRUE) { + ## currently not used, if not used for macos in future then can be removed if (os.type=="unix") { stop("publish of linux binaries not supported") } else if (os.type=="windows") { diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6844c88539..a394f68e7b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -58,9 +58,9 @@ mirror-packages: - Rscript -e 'sapply(simplify=FALSE, setNames(nm=Sys.getenv(c("R_REL_VERSION","R_DEV_VERSION","R_OLD_VERSION"))), function(binary.ver) mirror.packages(type="win.binary", dcf.dependencies("DESCRIPTION", "all"), repos=Sys.getenv("CRAN_MIRROR"), repodir="bus/mirror-packages/cran", binary.ver=binary.ver))' <<: *artifacts -## install deps aliases +## install deps alias .test-install-deps: &install-deps - - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), repos=file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE)), quiet=FALSE)' + - Rscript -e 'source(".ci/ci.R"); install.packages(dcf.dependencies("DESCRIPTION", which="all"), repos=file.path("file:", normalizePath("bus/mirror-packages/cran", mustWork=FALSE)), quiet=TRUE)' ## build # sources as tar.gz archive @@ -116,7 +116,6 @@ test-lin-rel: _R_CHECK_TESTS_NLINES_: "0" OPENBLAS_MAIN_FREE: "1" script: - - apt-get update -q && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## remove after #5749 - *install-deps - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars @@ -149,7 +148,6 @@ test-lin-rel-cran: _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes _R_CHECK_PKG_SIZES_THRESHOLD_: "7" ## MB 'checking installed package size' NOTE script: - - apt-get -qq update && apt-get install -y --no-install-recommends libcurl4-openssl-dev ## temp workaround for curl dep #5749 - *install-deps - echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > 
~/.R/Makevars - echo 'CXXFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars @@ -176,7 +174,7 @@ test-lin-dev-gcc-strict-cran: - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - (! grep "warning:" data.table.Rcheck/00install.out) - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, suggested but not available for checking: 'curl', installed package size) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (size of tarball, installed package size) but ", shQuote(l)) else q("no")' ## R-devel on Linux clang # R compiled with clang @@ -197,7 +195,7 @@ test-lin-dev-clang-cran: - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - (! grep "warning:" data.table.Rcheck/00install.out) - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 3 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 3 NOTEs"), " (size of tarball, suggested but not available for checking: 'curl', installed package size) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (size of tarball, installed package size) but ", shQuote(l)) else q("no")' ## R 3.1.0 # stated dependency on R @@ -221,14 +219,15 @@ test-lin-310-cran: script: - R.exe CMD check --no-manual $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) - R.exe CMD INSTALL --build $(ls.exe -1t data.table_*.tar.gz | head.exe -n 1) + - -not (grep.exe "warning:" data.table.Rcheck\00install.out) after_script: - $env:PATH = "C:\R\bin;C:\rtools\usr\bin;$env:PATH" - mkdir.exe -p bus/$CI_JOB_NAME - Rscript.exe -e "cat(Sys.getenv('CI_JOB_ID'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'id'))" - Rscript.exe -e "cat(Sys.getenv('CI_JOB_STATUS'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'status'))" - Rscript.exe -e "cat(Sys.getenv('CI_JOB_IMAGE'), file=file.path('bus', Sys.getenv('CI_JOB_NAME'), 'image'))" - - Rscript.exe -e "if (dir.exists(from<-'data.table.Rcheck')) file.rename(from, file.path('bus', Sys.getenv('CI_JOB_NAME'), 'data.table.Rcheck'))" - - Rscript.exe -e "if (length(from<-tail(list.files('^data\\.table_.*\\.zip$'), 1L))) file.rename(from, file.path('bus', Sys.getenv('CI_JOB_NAME'), from))" + - Rscript.exe -e "to<-file.path('bus', Sys.getenv('CI_JOB_NAME'), 'data.table.Rcheck'); if (dir.exists(from<-'data.table.Rcheck')) invisible(file.rename(from, to)); dir.exists(to)" + - Rscript.exe -e "from<-tail(list.files(pattern='^data\\.table_.*\\.zip$'), 1L); to<-file.path('bus', Sys.getenv('CI_JOB_NAME'), from); if (length(from)) invisible(file.rename(from, to)); length(to)&&file.exists(to)" ## R-release on Windows # test and build binaries @@ -262,12 +261,12 @@ test-win-old: tags: - saas-macos-medium-m1 before_script: + - *install-deps - cp $(ls -1t bus/build/data.table_*.tar.gz | head -n 1) . 
after_script: - mkdir -p bus/$CI_JOB_NAME - '[ -d data.table.Rcheck ] && mv data.table.Rcheck bus/$CI_JOB_NAME/' - ## no pattern matching in [, TODO when macos available - #- '[ -f data.table_*.tgz ] && mv $(ls -1t data.table_*.tgz | head -n 1) bus/$CI_JOB_NAME/' + - '[ -f data.table_*.tgz ] && mv $(ls -1t data.table_*.tgz | head -n 1) bus/$CI_JOB_NAME/' - echo $CI_JOB_ID > bus/$CI_JOB_NAME/id - echo $CI_JOB_STATUS > bus/$CI_JOB_NAME/status - echo $CI_JOB_IMAGE > bus/$CI_JOB_NAME/image @@ -279,7 +278,6 @@ test-win-old: variables: R_VERSION: "$R_REL_VERSION" script: - - *install-deps - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - R CMD INSTALL --build $(ls -1t data.table_*.tar.gz | head -n 1) @@ -326,8 +324,11 @@ integration: - mkdir -p bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION/ - mkdir -p bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION/ - '[ -f bus/test-win-rel/data.table_*.zip ] && cp bus/test-win-rel/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_REL_VERSION/' + - ls -1 "bus/integration/cran/bin/windows/contrib/$R_REL_VERSION"/data.table_*.zip || true - '[ -f bus/test-win-dev/data.table_*.zip ] && cp bus/test-win-dev/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION/' + - ls -1 "bus/integration/cran/bin/windows/contrib/$R_DEV_VERSION"/data.table_*.zip || true - '[ -f bus/test-win-old/data.table_*.zip ] && cp bus/test-win-old/data.table_*.zip bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION/' + - ls -1 "bus/integration/cran/bin/windows/contrib/$R_OLD_VERSION"/data.table_*.zip || true - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_REL_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_DEV_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' - Rscript -e 'tools::write_PACKAGES(contrib.url("bus/integration/cran", type="win.binary", ver=Sys.getenv("R_OLD_VERSION")), type="win.binary", fields="Revision", addFiles=TRUE)' From 3e590f84194b8ce975ad9ee84ef3330a74c0c63d Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 6 Dec 2023 18:56:42 +0100 Subject: [PATCH 573/588] workaround pkgdown bug by copy rather link to avoid dead links (#5804) --- .gitlab-ci.yml | 4 ++++ _pkgdown.yml | 22 +++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a394f68e7b..099f399772 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -361,6 +361,10 @@ integration: - Rscript -e 'check.index("data.table", names(test.jobs))' ## web/checks/check_flavors.html - Rscript -e 'check.flavors(names(test.jobs))' + ## pkgdown vignettes workaround r-lib/pkgdown#2383 + - mkdir -p pkgdown/articles + - cp bus/integration/cran/library/data.table/doc/*.html pkgdown/articles/. 
+ - rm pkgdown/articles/index.html ## pkgdown merge - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ diff --git a/_pkgdown.yml b/_pkgdown.yml index 1b9478e386..66488b9281 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -20,30 +20,30 @@ navbar: href: index.html introduction: text: Introduction - href: library/data.table/doc/datatable-intro.html + href: articles/datatable-intro.html articles: text: Vignettes menu: - text: "Introduction to data.table" - href: library/data.table/doc/datatable-intro.html + href: articles/datatable-intro.html - text: "Reference semantics" - href: library/data.table/doc/datatable-reference-semantics.html + href: articles/datatable-reference-semantics.html - text: "Using .SD for Data Analysis" - href: library/data.table/doc/datatable-sd-usage.html + href: articles/datatable-sd-usage.html - text: "Keys and fast binary search based subset" - href: library/data.table/doc/datatable-keys-fast-subset.html + href: articles/datatable-keys-fast-subset.html - text: "Secondary indices and auto indexing" - href: library/data.table/doc/datatable-secondary-indices-and-auto-indexing.html + href: articles/datatable-secondary-indices-and-auto-indexing.html - text: "Efficient reshaping using data.table" - href: library/data.table/doc/datatable-reshape.html + href: articles/datatable-reshape.html - text: "Programming on data.table" - href: library/data.table/doc/datatable-programming.html + href: articles/datatable-programming.html - text: "Frequently asked questions" - href: library/data.table/doc/datatable-faq.html + href: articles/datatable-faq.html - text: "Importing data.table" - href: library/data.table/doc/datatable-importing.html + href: articles/datatable-importing.html - text: "Benchmarking data.table" - href: library/data.table/doc/datatable-benchmarking.html + href: articles/datatable-benchmarking.html news: text: News href: news/index.html From 9225c169042a6032c7bb21ffa4ee74b6219934db Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 15:42:52 +0100 Subject: [PATCH 574/588] follow version number conventions #5715 (#5803) --- .dev/CRAN_Release.cmd | 55 +++++++++++++++++++++++++++---------------- DESCRIPTION | 2 +- Makefile | 6 ++--- NEWS.md | 2 +- src/init.c | 2 +- 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 3442dcb38c..6134d923e2 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -195,15 +195,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. 
-R CMD check data.table_1.14.9.tar.gz --as-cran -R CMD INSTALL data.table_1.14.9.tar.gz --html +R CMD check data.table_1.14.99.tar.gz --as-cran +R CMD INSTALL data.table_1.14.99.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.14.9.tar.gz +R CMD check data.table_1.14.99.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -220,9 +220,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. -PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.9.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.99.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.9.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.99.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -266,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.14.9.tar.gz +R310 CMD INSTALL ./data.table_1.14.99.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.14.9.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.14.99.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.14.9.tar.gz +R CMD check data.table_1.14.99.tar.gz ##################################################### @@ -341,11 +341,11 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.9.tar.gz +Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.99.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be # passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R # So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.9.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz # Use the (failed) output to get the list of currently needed packages and install them Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested @@ -354,7 +354,7 @@ install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", " Ncpus=4) # Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check q("no") -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.9.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz # UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. 
find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; @@ -391,7 +391,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.14.9.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.14.99.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -429,7 +429,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.14.9.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.14.99.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -594,7 +594,7 @@ du -k inst/tests # 0.75MB after R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" Rdevel -q -e "packageVersion('xml2')" # ensure installed -Rdevel CMD check data.table_1.14.10.tar.gz --as-cran # use latest Rdevel as it may have extra checks +Rdevel CMD check data.table_1.16.0.tar.gz --as-cran # use latest Rdevel as it may have extra checks # bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit compressed *.Rraw to git # @@ -614,15 +614,30 @@ If it's evening, SLEEP. It can take a few days for CRAN's checks to run. If any issues arise, backport locally. Resubmit the same even version to CRAN. CRAN's first check is automatic and usually received within an hour. WAIT FOR THAT EMAIL. When CRAN's email contains "Pretest results OK pending a manual inspection" (or similar), or if not and it is known why not and ok, then bump dev. -###### Bump dev -0. Close milestone to prevent new issues being tagged with it. Update its name to the even release. The final 'release checks' issue can be left open in a closed milestone. + +###### Bump dev for NON-PATCH RELEASE +0. Close milestone to prevent new issues being tagged with it. The final 'release checks' issue can be left open in a closed milestone. 1. Check that 'git status' shows 4 files in modified and uncommitted state: DESCRIPTION, NEWS.md, init.c and this .dev/CRAN_Release.cmd -2. Bump version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. +2. Bump minor version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. -4. Bump dllVersion() in init.c -5. Bump 3 version numbers in Makefile +4. Bump minor version in dllVersion() in init.c +5. Bump 3 minor version numbers in Makefile +6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.99 to 1.15.99 inc below, 1.15.0 to 1.16.0 above, 1.14.0 to 1.15.0 below +7. Another final gd to view all diffs using meld. 
(I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) +8. Push to master with this consistent commit message: "1.15.0 on CRAN. Bump to 1.14.10" +9. Take sha from step 8 and run `git tag 1.15.0 96c..sha..d77` then `git push origin 1.15.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +###### + +###### Bump dev for PATCH RELEASE +## WARNING: review this process during the next first patch release (x.y.2) from a regular release (x,y,0), possibly during 1.15.2 release. +0. Close milestone to prevent new issues being tagged with it. The final 'release checks' issue can be left open in a closed milestone. +1. Check that 'git status' shows 4 files in modified and uncommitted state: DESCRIPTION, NEWS.md, init.c and this .dev/CRAN_Release.cmd +2. Bump patch version in DESCRIPTION to next odd number. Note that DESCRIPTION was in edited and uncommitted state so even number never appears in git. +3. Add new heading in NEWS for the next dev PATCH version. Add "(submitted to CRAN on )" on the released heading. +4. Bump patch version in dllVersion() in init.c +5. Bump 3 patch version numbers in Makefile 6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.9 to 1.14.11 inc below, 1.14.10 to 1.14.12 above, 1.14.8 to 1.14.10 below 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) 8. Push to master with this consistent commit message: "1.14.8 on CRAN. Bump to 1.14.10" 9. Take sha from step 8 and run `git tag 1.14.8 96c..sha..d77` then `git push origin 1.14.8` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) -###### +###### \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 6ba6d92268..8942c48d14 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.14.9 +Version: 1.14.99 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods diff --git a/Makefile b/Makefile index b4d8517df3..45fb6203b9 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.9.tar.gz + $(RM) data.table_1.14.99.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.9.tar.gz + $(R) CMD INSTALL data.table_1.14.99.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.9.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.99.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/NEWS.md b/NEWS.md index 0aaa2e4365..a9ac6e8a9f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -# data.table [v1.14.9](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/20) (in development) ## NEW FEATURES diff --git a/src/init.c b/src/init.c index 7369141c9f..c8e8452ec6 100644 --- a/src/init.c +++ b/src/init.c @@ -353,6 +353,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion(void) { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - 
return(ScalarString(mkChar("1.14.9"))); + return(ScalarString(mkChar("1.14.99"))); } From ae215c70ef804f98dc66a01bb373d2669ffa923a Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 16:01:53 +0100 Subject: [PATCH 575/588] minor programming vignette fixes (#5432) * minor vignette fixes * update version and add link --- man/data.table.Rd | 2 +- vignettes/datatable-programming.Rmd | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index 502595d7c0..4f8d402fc2 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -177,7 +177,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } - \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } + \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index fc3ad726d7..89d1292012 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -122,7 +122,7 @@ Though these can be helpful, we will be discussing a `data.table`-unique approac Now that we've established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main subject of this vignette, *programming on data.table*. -Starting from version 1.14.2, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). +Starting from version 1.15.0, data.table provides a robust mechanism for parameterizing expressions passed to the `i`, `j`, and `by` (or `keyby`) arguments of `[.data.table`. It is built upon the base R `substitute` function, and mimics its interface. Here, we introduce `substitute2` as a more robust and more user-friendly version of base R's `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read the [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). ### Substituting variables and names @@ -203,7 +203,7 @@ DT[filter_col %in% filter_val, ### Substitute variables and character values -In the above example, we have seen a convenient feature of `substitute2`: automatic conversion from strings into names/symbols. An obvious question arises: what if we actually want to substitute a parameter with a *character* value, so as to have base R `substitute` behaviour. 
We provide a mechanism to escape automatic conversion by wrapping the elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing its arguments from substitution. (Read the `?AsIs` documentation for more details.) If base R behaviour is desired for the whole `env` argument, then it's best to wrap the whole argument in `I()`. Alternatively, each list element can be wrapped in `I()` individually. Let's explore both cases below. +In the above example, we have seen a convenient feature of `substitute2`: automatic conversion from strings into names/symbols. An obvious question arises: what if we actually want to substitute a parameter with a *character* value, so as to have base R `substitute` behaviour. We provide a mechanism to escape automatic conversion by wrapping the elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing its arguments from character-to-symbol automatic conversion. (Read the `?AsIs` documentation for more details.) If base R behaviour is desired for the whole `env` argument, then it's best to wrap the whole argument in `I()`. Alternatively, each list element can be wrapped in `I()` individually. Let's explore both cases below. ```{r rank} substitute( # base R behaviour @@ -253,7 +253,7 @@ DT[, list(Sepal.Length, Sepal.Width)] ``` *Splicing* is an operation where a list of objects have to be inlined into an expression as a sequence of arguments to call. -In base R, splicing `cols` into a `list` can be achieved using `as.call(c(quote(list), cols))`. Additionally, starting from R 4.0.0, there is new interface for such an operation in the `bquote` function. +In base R, splicing `cols` into a `list` can be achieved using `as.call(c(quote(list), lapply(cols, as.name)))`. Additionally, starting from R 4.0.0, there is new interface for such an operation in the `bquote` function. In data.table, we make it easier by automatically _enlist_-ing a list of objects into a list call with those objects. This means that any `list` object inside the `env` list argument will be turned into list `call`, making the API for that use case as simple as presented below. From 4e7d46bcb515679e1af18f0205b3bc3eb3588582 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 16:05:15 +0100 Subject: [PATCH 576/588] notin docs #5481 (#5729) --- man/notin.Rd | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/man/notin.Rd b/man/notin.Rd index d84bb2024d..e041ff5cbd 100644 --- a/man/notin.Rd +++ b/man/notin.Rd @@ -1,33 +1,30 @@ \name{notin} \alias{\%notin\%} - \title{ Convenience operator for checking if an example is not in a set of elements } - \description{ -Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[=base]{in}}. +Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[=base]{in}}. See examples on how missing values are being handled. } - \usage{ x \%notin\% table } - \arguments{ \item{x}{ Vector or \code{NULL}: the values to be matched. } \item{table}{ Vector or \code{NULL}: the values to be matched against. } } - - \value{ Logical vector, \code{TRUE} for each element of \code{x} \emph{absent} from \code{table}, and \code{FALSE} for each element of \code{x} \emph{present} in \code{table}. 
} - \seealso{ \code{\link[base]{match}}, \code{\link[data.table]{chmatch}} } - - \examples{ 11 \%notin\% 1:10 # TRUE "a" \%notin\% c("a", "b") # FALSE -} + ## NAs on the LHS + NA \%in\% 1:2 + NA \%notin\% 1:2 + ## NAs on the RHS + NA \%in\% c(1:2,NA) + NA \%notin\% c(1:2,NA) +} From 05a1be88a2b03cef336490a8963c6c9ed85ed154 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 8 Dec 2023 16:09:13 +0100 Subject: [PATCH 577/588] add case for missing values NA (all types) (#5423) --- NEWS.md | 2 +- inst/tests/tests.Rraw | 20 ++++++++++---------- src/idatetime.c | 6 ++++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index a9ac6e8a9f..a1fee2ac62 100644 --- a/NEWS.md +++ b/NEWS.md @@ -290,7 +290,7 @@ # 2: 2 10 ``` -40. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. +40. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. Thanks to @berg-michael for testing dev and filing a bug report for special case of missing values which was fixed before release. 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. 
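A quick illustration of the `%notin%` behaviour covered by the `man/notin.Rd` diff above: the calls mirror the examples added in that patch, while the results shown as comments are expected values based on base R `%in%`/`match()` semantics (they are not output captured from the patch itself).

```R
library(data.table)
11 %notin% 1:10          # TRUE
"a" %notin% c("a", "b")  # FALSE
## NAs on the LHS: %in% never returns NA, so neither does %notin%
NA %in% 1:2              # FALSE
NA %notin% 1:2           # TRUE
## NAs on the RHS: an NA present in the table is matched
NA %in% c(1:2, NA)       # TRUE
NA %notin% c(1:2, NA)    # FALSE
```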
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a5c0ce3b06..825a7e73f5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18073,16 +18073,16 @@ test(2235.1, copy(DT)[, c("z", "x") := {x = NULL; list(2, NULL)}], data.table(z test(2235.2, copy(DT)[, c("z", "x") := {list(2, NULL)}], data.table(z = 2)) # move IDate from POSIXlt to C, add yearquarter; #649 -x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01") -test(2236.1, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L)) -test(2236.2, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L)) -test(2236.3, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L)) -test(2236.4, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L)) -test(2236.5, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L)) -test(2236.6, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L)) -test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L)) -test(2236.8, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12)) -test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100)) +x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01", NA) +test(2236.1, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L, NA)) +test(2236.2, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L, NA)) +test(2236.3, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L, NA)) +test(2236.4, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L, NA)) +test(2236.5, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L, NA)) +test(2236.6, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L, NA)) +test(2236.7, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L, NA)) +test(2236.8, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12, NA)) +test(2236.9, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100, NA)) # as.data.table() no longer ignores row.names=, #5319 dt = data.table(a=1:2, b=3:4) diff --git a/src/idatetime.c b/src/idatetime.c index c70df3b053..c25e9ec9c6 100644 --- a/src/idatetime.c +++ b/src/idatetime.c @@ -16,6 +16,12 @@ void convertSingleDate(int x, datetype type, void *out) static const char months[] = {31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 31, 29}; static const int quarter[] = {31, 91, 92, 92, 60}; + if (x == NA_INTEGER) { + if (type == YEARMON || type == YEARQTR) *(double *)out = NA_REAL; + else *(int *)out = NA_INTEGER; + return; + } + if (type == WDAY) { int wday = (x + 4) % 7; if (wday < 0) wday += 7; From 1b130efafebd362b14b9bbd5520e8723b333c27f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 8 Dec 2023 16:11:35 +0100 Subject: [PATCH 578/588] rbindlist segfault for fill=TRUE and usenames=FALSE (#5468) * add fix for fill=TRUE and usenames=FALSE --- NEWS.md | 8 ++++---- inst/tests/tests.Rraw | 5 ++++- src/rbindlist.c | 16 +++++++++------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index a1fee2ac62..28aba0f969 100644 --- a/NEWS.md +++ b/NEWS.md @@ -205,7 +205,7 
@@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` -31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.` +31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`, [#5444](https://github.com/Rdatatable/data.table/issues/5444). Thanks to @sindribaldur for testing dev and filing a bug report which was fixed before release. ```R DT1 @@ -249,7 +249,7 @@ # 3: 3 NA # 4: 4 NA ``` - + 32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. 33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. @@ -265,7 +265,7 @@ # 1: 1 3 a # 2: 2 4 b ``` - + 35. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. 36. `as.xts.data.table()` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. 
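For item 35 above (`weighted.mean()` optimised by group), a minimal sketch of the kind of grouped call that benefits; the data and column names are made up for illustration and the commented results are approximate expected values, not output taken from the patches.

```R
library(data.table)
DT = data.table(grp = rep(c("a", "b"), each = 3L), x = 1:6, w = 6:1)
DT[, .(wm = weighted.mean(x, w)), by = grp]
# grp "a": 28/15 ~ 1.867 ; grp "b": 28/6 ~ 4.667
```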
@@ -282,7 +282,7 @@ # # 1: 3 5 # 2: 4 6 - + DT[, sum(.SD), by=.I] # I V1 # diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 825a7e73f5..8015439f5b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14336,7 +14336,10 @@ test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, us test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4), c=INT(5,6,NA,NA))) test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE), - data.table(a=c(1:4), V1=INT(NA,NA,5,6))) + data.table(a=c(1:4), c=INT(NA,NA,5,6))) +# rbindlist segfault with fill=TRUE and usenames=FALSE #5444 +test(2003.6, rbindlist(list(list(1), list(2,3)), fill=TRUE, use.names=FALSE), data.table(c(1,2), c(NA, 3))) +test(2003.7, rbindlist(list(list(1), list(2,factor(3))), fill=TRUE, use.names=FALSE), data.table(c(1,2), factor(c(NA, 3)))) # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" diff --git a/src/rbindlist.c b/src/rbindlist.c index 2ffff3af8c..ba19d2c389 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -282,7 +282,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) for (int i=0; i Date: Fri, 8 Dec 2023 08:12:33 -0700 Subject: [PATCH 579/588] add CODEOWNERS file (#5629) * add CODEOWNERS * Add jangorecki to codeowners * Add new env arg files * add ben-schwen to codeowners * add commented sections, vignette * comments * Set @michaelchirico ownership for some file * co-owner for shift & IDateTime * Also printing --------- Co-authored-by: Jan Gorecki Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Co-authored-by: Michael Chirico --- CODEOWNERS | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..5d98e02422 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,45 @@ +# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners +* @mattdowle + +# melt +/R/fmelt.R @tdhock +/src/fmelt.c @tdhock +/man/melt.data.table.Rd @tdhock +/vignettes/datatable-reshape.Rmd @tdhock + +# rolling statistics +/R/froll.R @jangorecki +/man/froll.Rd @jangorecki +/src/froll.c @jangorecki +/src/frollR.c @jangorecki +/src/frolladaptive.c @jangorecki + +# meta-programming +/R/programming.R @jangorecki +/man/substitute2.Rd @jangorecki +/src/programming.c @jangorecki +/vignettes/datatable-programming.Rmd @jangorecki + +# GForce groupby +/src/gsumm.c @ben-schwen +# datetime classes +/R/IDateTime.R @ben-schwen @michaelchirico +/src/idatetime.c @ben-schwen @michaelchirico +/man/IDateTime.Rd @ben-schwen @michaelchirico + +# shift +/R/shift.R @ben-schwen @michaelchirico +/src/shift.c @ben-schwen @michaelchirico +/man/shift.Rd @ben-schwen @michaelchirico + +# translations +/inst/po/ @michaelchirico +/po/ @michaelchirico +/R/translation.R @michaelchirico +/src/po.h @michaelchirico + +# printing +/R/print.data.table.R @michaelchirico + +# .SD vignette +/vignettes/datatable-sd-usage.Rmd @michaelchirico From 2a554646ea13a66ed98afd2f83bda42e3058653c Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 16:36:34 +0100 Subject: [PATCH 580/588] update maintainer (#5724) --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/DESCRIPTION b/DESCRIPTION index 8942c48d14..a59298fcbe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,8 @@ BugReports: https://github.com/Rdatatable/data.table/issues VignetteBuilder: knitr ByteCompile: TRUE Authors@R: c( - person("Matt","Dowle", role=c("aut","cre"), email="mattjdowle@gmail.com"), + person("Tyson","Barrett", role=c("aut","cre"), email="t.barrett88@gmail.com"), + person("Matt","Dowle", role="aut", email="mattjdowle@gmail.com"), person("Arun","Srinivasan", role="aut", email="asrini@pm.me"), person("Jan","Gorecki", role="ctb"), person("Michael","Chirico", role="ctb"), @@ -59,7 +60,6 @@ Authors@R: c( person("Davis","Vaughan", role="ctb"), person("Toby","Hocking", role="ctb"), person("Leonardo","Silvestri", role="ctb"), - person("Tyson","Barrett", role="ctb"), person("Jim","Hester", role="ctb"), person("Anthony","Damico", role="ctb"), person("Sebastian","Freundt", role="ctb"), From 3ad0e8e5c67dfb4a9ff72990d35fce2adcccda88 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 16:49:02 +0100 Subject: [PATCH 581/588] update CRAN release procedure for less deps (#5810) --- .dev/CRAN_Release.cmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 6134d923e2..94a4a17ec3 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -350,7 +350,7 @@ Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested options(repos = "http://cloud.r-project.org") -install.packages(c("bit64", "bit", "curl", "R.utils", "xts","nanotime", "zoo", "yaml", "knitr", "rmarkdown", "markdown"), +install.packages(c("bit64", "bit", "R.utils", "xts", "zoo", "yaml", "knitr", "markdown"), Ncpus=4) # Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check q("no") @@ -372,7 +372,7 @@ print(Sys.time()); started.at<-proc.time(); try(test.data.table()); print(Sys.ti ## apt-get update ## apt-get install libc6:i386 libstdc++6:i386 gcc-multilib g++-multilib gfortran-multilib libbz2-dev:i386 liblzma-dev:i386 libpcre3-dev:i386 libcurl3-dev:i386 libstdc++-7-dev:i386 ## sudo apt-get purge libcurl4-openssl-dev # cannot coexist, it seems -## sudo apt-get install libcurl4-openssl-dev:i386 +## sudo apt-get install libcurl4-openssl-dev:i386 ## may not be needed anymore as we dropped dependency on curl, try and update when reproducing ## cd ~/build/32bit/R-devel ## ./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --without-readline --without-x CC="gcc -m32" CXX="g++ -m32" F77="gfortran -m32" FC=${F77} OBJC=${CC} LDFLAGS="-L/usr/local/lib" LIBnn=lib LIBS="-lpthread" CFLAGS="-O0 -g -Wall -pedantic" ## From 6bde0083f4274c55b296abdb0305c3f634b47e8f Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 17:05:33 +0100 Subject: [PATCH 582/588] unexport and undocument DT(), closes #5472 (#5730) * unexport and undocument DT(), closes #5472 * handle DT() in tests * Michael feedback --- NAMESPACE | 2 +- NEWS.md | 47 ++++++++++++++++++------------------------- inst/tests/tests.Rraw | 6 +++++- man/data.table.Rd | 8 -------- 4 files changed, 26 insertions(+), 37 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ef0aa2d171..ac54150824 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,7 +57,7 @@ export(setnafill) export(.Last.updated) export(fcoalesce) export(substitute2) -export(DT) # mtcars |> DT(i,j,by) #4872 +#export(DT) # mtcars |> 
DT(i,j,by) #4872 #5472 S3method("[", data.table) export("[.data.table") # so that functional DT() finds it; PR#5176 diff --git a/NEWS.md b/NEWS.md index 28aba0f969..bf4250b168 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,15 +107,8 @@ 21. `melt()` was pseudo generic in that `melt(DT)` would dispatch to the `melt.data.table` method but `melt(not-DT)` would explicitly redirect to `reshape2`. Now `melt()` is standard generic so that methods can be developed in other packages, [#4864](https://github.com/Rdatatable/data.table/pull/4864). Thanks to @odelmarcelle for suggesting and implementing. -22. `DT(i, j, by, ...)` has been added, i.e. functional form of a `data.table` query, [#641](https://github.com/Rdatatable/data.table/issues/641) [#4872](https://github.com/Rdatatable/data.table/issues/4872). Thanks to Yike Lu and Elio Campitelli for filing requests, many others for comments and suggestions, and Matt Dowle for the PR. This enables the `data.table` general form query to be invoked on a `data.frame` without converting it to a `data.table` first. The class of the input object is retained. Thanks to Mark Fairbanks and Boniface Kamgang for testing and reporting problems that have been fixed before release, [#5106](https://github.com/Rdatatable/data.table/issues/5106) [#5107](https://github.com/Rdatatable/data.table/issues/5107). - ```R - mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) - ``` - - When `data.table` queries (either `[...]` or `|> DT(...)`) receive a `data.table`, the operations maintain `data.table`'s attributes such as its key and any indices. For example, if a `data.table` is reordered by `data.table`, or a key column has a value changed by `:=` in `data.table`, its key and indices will either be dropped or reordered appropriately. Some `data.table` operations automatically add and store an index on a `data.table` for reuse in future queries, if `options(datatable.auto.index=TRUE)`, which is `TRUE` by default. `data.table`'s are also over-allocated, which means there are spare column pointer slots allocated in advance so that a `data.table` in the `.GlobalEnv` can have a column added to it truly by reference, like an in-memory database with multiple client sessions connecting to one server R process, as a `data.table` video has shown in the past. But because R and other packages don't maintain `data.table`'s attributes or over-allocation (e.g. a subset or reorder by R or another package will create invalid `data.table` attributes) `data.table` cannot use these attributes when it detects that base R or another package has touched the `data.table` in the meantime, even if the attributes may sometimes still be valid. So, please realize that, `DT()` on a `data.table` should realize better speed and memory usage than `DT()` on a `data.frame`. `DT()` on a `data.frame` may still be useful to use `data.table`'s syntax (e.g. sub-queries within group: `|> DT(i, .SD[sub-query], by=grp)`) without needing to convert to a `data.table` first. - -23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. 
Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. +22. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. ```R DT = data.table(A=1:3) @@ -133,13 +126,13 @@ # 2: 3 ``` -24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. +23. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. -25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. +24. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. -26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. +25. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. -27. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. +26. `shift()` now also supports `type="cyclic"`, [#4451](https://github.com/Rdatatable/data.table/issues/4451). Arguments that are normally pushed out by `type="lag"` or `type="lead"` are re-introduced at this type at the first/last positions. Thanks to @RicoDiel for requesting, and Benjamin Schwendinger for the PR. ```R # Usage @@ -167,11 +160,11 @@ # c(tail(x, 1), head(x, -1)) 6.96 7.16 7.49 7.32 7.64 8.60 10 ``` -28. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. 
Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. +27. `fread()` now supports "0" and "1" in `na.strings`, [#2927](https://github.com/Rdatatable/data.table/issues/2927). Previously this was not permitted since "0" and "1" can be recognized as boolean values. Note that it is still not permitted to use "0" and "1" in `na.strings` in combination with `logical01 = TRUE`. Thanks to @msgoussi for the request, and Benjamin Schwendinger for the PR. -29. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. +28. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. -30. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. +29. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. ```R N = 1e7 @@ -205,7 +198,7 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` -31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`, [#5444](https://github.com/Rdatatable/data.table/issues/5444). Thanks to @sindribaldur for testing dev and filing a bug report which was fixed before release. +30. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`, [#5444](https://github.com/Rdatatable/data.table/issues/5444). Thanks to @sindribaldur for testing dev and filing a bug report which was fixed before release. ```R DT1 @@ -250,11 +243,11 @@ # 4: 4 NA ``` -32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. +31. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. -33. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. +32. `fread()` can now read `.zip` and `.tar` directly, [#3834](https://github.com/Rdatatable/data.table/issues/3834). 
Moreover, if a compressed file name is missing its extension, `fread()` now attempts to infer the correct filetype from its magic bytes. Thanks to Michael Chirico for the idea, and Benjamin Schwendinger for the PR. -34. `DT[, let(...)]` is a new alias for the functional form of `:=`; i.e. `DT[, ':='(...)]`, [#3795](https://github.com/Rdatatable/data.table/issues/3795). Thanks to Elio Campitelli for requesting, and Benjamin Schwendinger for the PR. +33. `DT[, let(...)]` is a new alias for the functional form of `:=`; i.e. `DT[, ':='(...)]`, [#3795](https://github.com/Rdatatable/data.table/issues/3795). Thanks to Elio Campitelli for requesting, and Benjamin Schwendinger for the PR. ```R DT = data.table(A=1:2) @@ -266,15 +259,15 @@ # 2: 2 4 b ``` -35. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. +34. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. -36. `as.xts.data.table()` now supports non-numeric xts coredata matrices, [#5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric-only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatibility and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now use `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. +35. `as.xts.data.table()` now supports non-numeric xts coredata matrices, [#5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric-only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatibility and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now use `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. -37. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. +36. `unique.data.table()` gains `cols` to specify a subset of columns to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously needing more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. -38. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR. Thanks to @clerousset, @dcaseykc, @OfekShilon, and @SeanShao98 for testing dev and filing detailed bug reports which were fixed before release and their tests added to the test suite. +37. `:=` is now optimized by group, [#1414](https://github.com/Rdatatable/data.table/issues/1414). Thanks to Arun Srinivasan for suggesting, and Benjamin Schwendinger for the PR.
Thanks to @clerousset, @dcaseykc, @OfekShilon, and @SeanShao98 for testing dev and filing detailed bug reports which were fixed before release and their tests added to the test suite. -39. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. +38. `.I` is now available in `by` for rowwise operations, [#1732](https://github.com/Rdatatable/data.table/issues/1732). Thanks to Rafael H. M. Pereira for requesting, and Benjamin Schwendinger for the PR. ```R DT @@ -290,11 +283,11 @@ # 2: 2 10 ``` -40. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. Thanks to @berg-michael for testing dev and filing a bug report for special case of missing values which was fixed before release. +39. New functions `yearmon()` and `yearqtr` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. Thanks to @berg-michael for testing dev and filing a bug report for special case of missing values which was fixed before release. -41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. +40. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. -42. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. +41. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. 
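To make two of the NEW FEATURES items above concrete, `%notin%` and the more flexible `mb=` argument of `tables()`, here is a minimal sketch. This is an editor's illustration and not part of the patch; the toy `DT` is invented and the commented output is indicative only.

```R
library(data.table)
DT = data.table(id = c(1L, 1L, 2L, 3L), grp = c("a", "a", "b", "b"))

# %notin%: negated membership computed directly, rather than allocating !(x %in% y)
c("a", "c") %notin% DT$grp
# [1] FALSE  TRUE

# tables(): pass a custom size function for a slower but more complete byte estimate
tables(mb = utils::object.size)
```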
## BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8015439f5b..98d81fe2bb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8,6 +8,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") rm_all = function() {} + DTfun = DT ## otherwise DT would be re-defined by many tests } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -32,6 +33,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = data.table:::dcast.data.table + DTfun = data.table:::DT endsWith = data.table:::endsWith endsWithAny = data.table:::endsWithAny forder = data.table:::forder @@ -349,7 +351,7 @@ test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) -# test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated +# test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated ## this is very old DT() functionality, completely different than DT() discussed in 2023 test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default test(87, TESTDT[,list(MySum=sum(v)),by="b"], data.table(b=c("e","f","i","b"),MySum=INT(3,7,11,7))) @@ -17587,6 +17589,7 @@ for (col in c("a","b","c")) { # DT() functional form, #4872 #5106 #5107 #5129 if (base::getRversion() >= "4.1.0") { + DT = DTfun # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below test(2212.011, EVAL("mtcars |> DT(mpg>20, .(mean_hp=round(mean(hp),2)), by=cyl)"), @@ -17638,6 +17641,7 @@ if (base::getRversion() >= "4.1.0") { test(2212.52, EVAL("D |> DT(D[, .I[which.max(mpg)], by=cyl]$V1)"), ans) test(2212.53, EVAL("filter |> DT(filter[, .I[which.max(mpg)], by=cyl]$V1)"), error="unused.*argument.*by.*cyl") # R's [.data.frame error on filter[...] test(2212.54, EVAL("filter |> DT((filter |> DT(, .I[which.max(mpg)], by=cyl))$V1)"), as.data.frame(ans)) + rm(DT) } # precision powers of 10^(-n), #4461 diff --git a/man/data.table.Rd b/man/data.table.Rd index 4f8d402fc2..b8011b422a 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -5,7 +5,6 @@ \alias{Ops.data.table} \alias{is.na.data.table} \alias{[.data.table} -\alias{DT} \alias{.} \alias{.(} \alias{.()} @@ -435,13 +434,6 @@ dev.off() # using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v' DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b] -# functional query DT(...) 
-\dontshow{ #dontrun to pass R CMD check prior to R 4.1.0 when |> was added - # an if getRVersion()>"4.1.0" still has its code parsed } -\dontrun{ -mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) -} - # Support guide and links: # https://github.com/Rdatatable/data.table/wiki/Support From a40ec8ed8aa9bdab8cfc46598b91c2062978d9f2 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 17:23:07 +0100 Subject: [PATCH 583/588] shift on matrix: news and improve error (#5462) * news and improve error * Michael feedback, actionable error --- NEWS.md | 4 ++++ inst/tests/tests.Rraw | 3 +++ src/shift.c | 2 ++ src/utils.c | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index bf4250b168..1cfd582f75 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ # data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/20) (in development) +## BREAKING CHANGE + +1. `shift` and `nafill` will now raise error `input must not be matrix or array` when `matrix` or `array` is provided on input, rather than giving useless result, [#5287](https://github.com/Rdatatable/data.table/issues/5287). Thanks to @ethanbsmith for reporting. + ## NEW FEATURES 1. `nafill()` now applies `fill=` to the front/back of the vector when `type="locf|nocb"`, [#3594](https://github.com/Rdatatable/data.table/issues/3594). Thanks to @ben519 for the feature request. It also now returns a named object based on the input names. Note that if you are considering joining and then using `nafill(...,type='locf|nocb')` afterwards, please review `roll=`/`rollends=` which should achieve the same result in one step more efficiently. `nafill()` is for when filling-while-joining (i.e. `roll=`/`rollends=`/`nomatch=`) cannot be applied. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 98d81fe2bb..8eeb8f7ee7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18108,3 +18108,6 @@ test(2238.6, "a" %notin% integer(), TRUE) test(2238.7, "a" %notin% NULL, TRUE) test(2238.8, NA %notin% 1:5, TRUE) test(2238.9, NA %notin% c(1:5, NA), FALSE) + +# shift actionable error on matrix input #5287 +test(2239.1, shift(matrix(1:10, ncol = 1)), error="consider wrapping") diff --git a/src/shift.c b/src/shift.c index dba598fe50..30c13a547a 100644 --- a/src/shift.c +++ b/src/shift.c @@ -8,6 +8,8 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) if (!xlength(obj)) return(obj); // NULL, list() SEXP x; if (isVectorAtomic(obj)) { + if (!isNull(getAttrib(obj, R_DimSymbol))) + error(_("shift input must not be matrix or array, consider wrapping it into data.table() or c()")); x = PROTECT(allocVector(VECSXP, 1)); nprotect++; SET_VECTOR_ELT(x, 0, obj); } else { diff --git a/src/utils.c b/src/utils.c index 3dfd8bcc69..e5e343ac9f 100644 --- a/src/utils.c +++ b/src/utils.c @@ -348,7 +348,7 @@ SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg) { if (!isNull(getAttrib(x, R_DimSymbol))) error(_("'x' must not be matrix or array")); if (!isNull(getAttrib(as, R_DimSymbol))) - error(_("'as' must not be matrix or array")); + error(_("input must not be matrix or array")); bool verbose = GetVerbose()>=2; // verbose level 2 required if (!LOGICAL(copyArg)[0] && TYPEOF(x)==TYPEOF(as) && class1(x)==class1(as)) { if (verbose) From 2800a616ea4dfd1b9c5ac2a0911f0cd140a6f239 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 17:29:04 +0100 Subject: [PATCH 584/588] add CODEOWNERS to Rbuildignore (#5811) --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 
08508569d3..22a3a807fa 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -23,6 +23,7 @@ ^NEWS\.0\.md$ ^_pkgdown\.yml$ ^src/Makevars$ +^CODEOWNERS$ ^\.RData$ ^\.Rhistory$ From 9b3b251d973a84c3304e0011ea1727faa7eb9f40 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 20:44:39 +0100 Subject: [PATCH 585/588] Pull 1.14.10 into master (#5814) * updated NEWS and urls fixed url issues and added final details for patch release in news * add method for IDate Added `S3method(as.IDate, IDate)`. This is related to #4777 as discussed in NEWS.md. * Add `setDTthreads(1)` to vignettes To reduce runtime on building vignettes. * reset setDTthreads at end of vignettes * reset threads at end of vignettes --------- Co-authored-by: Tyson Barrett --- NAMESPACE | 1 + NEWS.md | 19 ++++++++++++++++--- README.md | 2 +- vignettes/datatable-faq.Rmd | 4 ++++ vignettes/datatable-intro.Rmd | 4 ++++ vignettes/datatable-keys-fast-subset.Rmd | 6 ++++++ vignettes/datatable-reference-semantics.Rmd | 5 +++++ vignettes/datatable-reshape.Rmd | 5 +++++ vignettes/datatable-sd-usage.Rmd | 5 +++++ ...le-secondary-indices-and-auto-indexing.Rmd | 6 ++++++ 10 files changed, 53 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ac54150824..75b490068f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -158,6 +158,7 @@ S3method(as.IDate, Date) S3method(as.IDate, POSIXct) S3method(as.IDate, default) S3method(as.IDate, numeric) +S3method(as.IDate, IDate) S3method(as.ITime, character) S3method(as.ITime, default) S3method(as.ITime, POSIXct) diff --git a/NEWS.md b/NEWS.md index 1cfd582f75..513ac9bc56 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** -# data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/20) (in development) +# data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) ## BREAKING CHANGE @@ -610,6 +610,19 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). +# data.table [v1.14.10](https://github.com/Rdatatable/data.table/milestone/20?closed=1) (8 Dec 2023) + +## NOTES + +1. Maintainer of the package for CRAN releases is from now on Tyson Barrett (@tysonstanley), [#5710](https://github.com/Rdatatable/data.table/issues/5710). + +2. Updated internal code for breaking change of `is.atomic(NULL)` in R-devel, [#5691](https://github.com/Rdatatable/data.table/pull/5691). Thanks to Martin Maechler for the patch. + +3. Fix multiple test concerning coercion to missing complex numbers, [#5695](https://github.com/Rdatatable/data.table/issues/5695) and [#5748](https://github.com/Rdatatable/data.table/issues/5748). Thanks to @MichaelChirico and @ben-schwen for the patches. + +4. Fix multiple format warnings (e.g., -Wformat) [#5712](https://github.com/Rdatatable/data.table/pull/5712), [#5781](https://github.com/Rdatatable/data.table/pull/5781), [#5880](https://github.com/Rdatatable/data.table/pull/5800), [#5786](https://github.com/Rdatatable/data.table/pull/5786). Thanks to @MichaelChirico and @jangorecki for the patches. 
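As an aside on the `shift`/`nafill` breaking change introduced in PATCH 583 above: the new error is intentionally actionable, and the wrapping it suggests works as sketched below. This is an editor's illustration, not part of the patch; the matrix `m` is invented and the commented error text is taken from the new message added in `src/shift.c` above.

```R
library(data.table)
m = matrix(1:10, ncol = 1)

# previously returned a useless result; now errors with guidance (#5287)
# shift(m)
# Error: shift input must not be matrix or array, consider wrapping it into data.table() or c()

shift(c(m))                  # flatten to a plain vector first, or ...
shift(as.data.table(m)$V1)   # ... operate on it as a data.table column
```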
+ + # data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) (17 Feb 2023) ## NOTES @@ -736,7 +749,7 @@ ## NOTES -1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/behind-the-scenes-of-cran/). +1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/2016/behind-the-scenes-of-cran/). 2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. @@ -1008,7 +1021,7 @@ has a better chance of working on Mac. * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR. * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR. * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. 
This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR. - * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/leeper/rio) for the inspiration and @MichaelChirico for implementing. + * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/gesistsa/rio) for the inspiration and @MichaelChirico for implementing. * `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses` it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. For example: ```R diff --git a/README.md b/README.md index 8455602f12..562799db42 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CRAN status](https://badges.cranchecks.info/flavor/release/data.table.svg)](https://cran.r-project.org/web/checks/check_results_data.table.html) [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/kayjdh5qtgymhoxr/branch/master?svg=true)](https://ci.appveyor.com/project/Rdatatable/data-table) -[![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://codecov.io/github/Rdatatable/data.table?branch=master) +[![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://app.codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/-/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) [![CRAN usage](https://jangorecki.gitlab.io/rdeps/data.table/CRAN_usage.svg?sanitize=true)](https://gitlab.com/jangorecki/rdeps) diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index f1deaba781..a2de14a2f6 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -29,6 +29,7 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) +.old.th = setDTthreads(1) ``` The first section, Beginner FAQs, is intended to be read in order, from start to finish. It's just written in a FAQ style to be digested more easily. It isn't really the most frequently asked questions. A better measure for that is looking on Stack Overflow. @@ -615,3 +616,6 @@ Sure. 
You're more likely to get a faster answer from the Issues page or Stack Ov Please see [this answer](https://stackoverflow.com/a/10529888/403310). +```{r, echo=FALSE} +setDTthreads(.old.th) +``` \ No newline at end of file diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 04fd79e50d..3624a7c5be 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -18,6 +18,7 @@ knitr::opts_chunk$set( cache = FALSE, collapse = TRUE ) +.old.th = setDTthreads(1) ``` This vignette introduces the `data.table` syntax, its general form, how to *subset* rows, *select and compute* on columns, and perform aggregations *by group*. Familiarity with `data.frame` data structure from base R is useful, but not essential to follow this vignette. @@ -651,3 +652,6 @@ We will see how to *add/update/delete* columns *by reference* and how to combine *** +```{r, echo=FALSE} +setDTthreads(.old.th) +``` \ No newline at end of file diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index 3e9a4f23c7..e73b71b929 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -17,6 +17,7 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) +.old.th = setDTthreads(1) ``` This vignette is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, add/modify/delete columns *by reference* in `j` and group by using `by`. If you're not familiar with these concepts, please read the *"Introduction to data.table"* and *"Reference semantics"* vignettes first. @@ -494,3 +495,8 @@ In this vignette, we have learnt another method to subset rows in `i` by keying * combine key based subsets with `j` and `by`. Note that the `j` and `by` operations are exactly the same as before. Key based subsets are **incredibly fast** and are particularly useful when the task involves *repeated subsetting*. But it may not be always desirable to set key and physically reorder the *data.table*. In the next vignette, we will address this using a *new* feature -- *secondary indexes*. + + +```{r, echo=FALSE} +setDTthreads(.old.th) +``` \ No newline at end of file diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index c96ed090f7..7a9990ba40 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -17,6 +17,7 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) +.old.th = setDTthreads(1) ``` This vignette discusses *data.table*'s reference semantics which allows to *add/update/delete* columns of a *data.table by reference*, and also combine them with `i` and `by`. It is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, and perform aggregations by group. If you're not familiar with these concepts, please read the *"Introduction to data.table"* vignette first. @@ -348,6 +349,10 @@ However we could improve this functionality further by *shallow* copying instead * We can use `:=` for its side effect or use `copy()` to not modify the original object while updating by reference. +```{r, echo=FALSE} +setDTthreads(.old.th) +``` + # So far we have seen a whole lot in `j`, and how to combine it with `by` and little of `i`. 
Let's turn our attention back to `i` in the next vignette *"Keys and fast binary search based subset"* to perform *blazing fast subsets* by *keying data.tables*. diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index 0b5d7a57d3..d282bc7de3 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -17,6 +17,7 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) +.old.th = setDTthreads(1) ``` This vignette discusses the default usage of reshaping functions `melt` (wide to long) and `dcast` (long to wide) for *data.tables* as well as the **new extended functionalities** of melting and casting on *multiple columns* available from `v1.9.6`. @@ -314,6 +315,10 @@ DT.c2 You can also provide *multiple functions* to `fun.aggregate` to `dcast` for *data.tables*. Check the examples in `?dcast` which illustrates this functionality. +```{r, echo=FALSE} +setDTthreads(.old.th) +``` + # *** diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index e7b08650e4..ae0b5a84ac 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -25,6 +25,7 @@ knitr::opts_chunk$set( out.width = '100%', dpi = 144 ) +.old.th = setDTthreads(1) ``` This vignette will explain the most common ways to use the `.SD` variable in your `data.table` analyses. It is an adaptation of [this answer](https://stackoverflow.com/a/47406952/3576984) given on StackOverflow. @@ -254,3 +255,7 @@ abline(v = overall_coef, lty = 2L, col = 'red') While there is indeed a fair amount of heterogeneity, there's a distinct concentration around the observed overall value. The above is just a short introduction of the power of `.SD` in facilitating beautiful, efficient code in `data.table`! + +```{r, echo=FALSE} +setDTthreads(.old.th) +``` \ No newline at end of file diff --git a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd index 6f2474c115..ff50ba97e5 100644 --- a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd +++ b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd @@ -17,6 +17,7 @@ knitr::opts_chunk$set( tidy = FALSE, cache = FALSE, collapse = TRUE) +.old.th = setDTthreads(1) ``` This vignette assumes that the reader is familiar with data.table's `[i, j, by]` syntax, and how to perform fast key based subsets. If you're not familiar with these concepts, please read the *"Introduction to data.table"*, *"Reference semantics"* and *"Keys and fast binary search based subset"* vignettes first. @@ -325,3 +326,8 @@ In recent version we extended auto indexing to expressions involving more than o We will discuss fast *subsets* using keys and secondary indices to *joins* in the next vignette, *"Joins and rolling joins"*. *** + +```{r, echo=FALSE} +setDTthreads(.old.th) +``` + From 537688106718b72e04ddf2859c3ec61a5aed2dc0 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 8 Dec 2023 20:52:40 +0100 Subject: [PATCH 586/588] cutoff news (#5815) --- NEWS.1.md | 1549 +++++++++++++++++++++++++++++++++++++++++++++++++++++ NEWS.md | 1547 +--------------------------------------------------- 2 files changed, 1550 insertions(+), 1546 deletions(-) create mode 100644 NEWS.1.md diff --git a/NEWS.1.md b/NEWS.1.md new file mode 100644 index 0000000000..249f349926 --- /dev/null +++ b/NEWS.1.md @@ -0,0 +1,1549 @@ + +**This is OLD NEWS. 
Latest news is on GitHub [here](https://github.com/Rdatatable/data.table/blob/master/NEWS.md).** + +# data.table [v1.14.10](https://github.com/Rdatatable/data.table/milestone/20?closed=1) (8 Dec 2023) + +## NOTES + +1. Maintainer of the package for CRAN releases is from now on Tyson Barrett (@tysonstanley), [#5710](https://github.com/Rdatatable/data.table/issues/5710). + +2. Updated internal code for breaking change of `is.atomic(NULL)` in R-devel, [#5691](https://github.com/Rdatatable/data.table/pull/5691). Thanks to Martin Maechler for the patch. + +3. Fix multiple test concerning coercion to missing complex numbers, [#5695](https://github.com/Rdatatable/data.table/issues/5695) and [#5748](https://github.com/Rdatatable/data.table/issues/5748). Thanks to @MichaelChirico and @ben-schwen for the patches. + +4. Fix multiple format warnings (e.g., -Wformat) [#5712](https://github.com/Rdatatable/data.table/pull/5712), [#5781](https://github.com/Rdatatable/data.table/pull/5781), [#5880](https://github.com/Rdatatable/data.table/pull/5800), [#5786](https://github.com/Rdatatable/data.table/pull/5786). Thanks to @MichaelChirico and @jangorecki for the patches. + + +# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) (17 Feb 2023) + +## NOTES + +1. Test 1613.605 now passes changes to `as.data.frame()` in R-devel, [#5597](https://github.com/Rdatatable/data.table/pull/5597). Thanks to Avraham Adler for reporting. + +2. An out of bounds read when combining non-equi join with `by=.EACHI` has been found and fixed thanks to clang ASAN, [#5598](https://github.com/Rdatatable/data.table/issues/5598). There was no bug or consequence because the read was followed (now preceded) by a bounds test. + +3. `.rbind.data.table` (note the leading `.`) is no longer exported when `data.table` is installed in R>=4.0.0 (Apr 2020), [#5600](https://github.com/Rdatatable/data.table/pull/5600). It was never documented which R-devel now detects and warns about. It is only needed by `data.table` internals to support R<4.0.0; see note 1 in v1.12.6 (Oct 2019) below in this file for more details. + + +# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) + +## BUG FIXES + +1. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now highlights this issue on startup, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8 year old R 3.1.0 (April 2014) to 5 year old R 3.4.0 (April 2017). + +## NOTES + +1. Test 1962.098 has been modified to pass latest changes to `POSIXt` in R-devel. + +2. `test.data.table()` no longer creates `DT` in `.GlobalEnv`, a CRAN policy violation, [#5514](https://github.com/Rdatatable/data.table/issues/5514). No other writes occurred to `.GlobalEnv` and release procedures have been improved to prevent this happening again. + +3. The memory usage of the test suite has been halved, [#5507](https://github.com/Rdatatable/data.table/issues/5507). + + +# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) + +## NOTES + +1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked, [#5476](https://github.com/Rdatatable/data.table/pull/5476). 
Thanks to CRAN for testing latest versions of compilers. + +2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. + +3. Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN, [#5477](https://github.com/Rdatatable/data.table/pull/5477). + +4. `write.csv` in R-devel no longer responds to `getOption("digits.secs")` for `POSIXct`, [#5478](https://github.com/Rdatatable/data.table/issues/5478). This caused our tests of `fwrite(, dateTimeAs="write.csv")` to fail on CRAN's daily checks using latest daily R-devel. While R-devel discussion continues, and currently it seems like the change is intended with further changes possible, this `data.table` release massages our tests to pass on latest R-devel. The idea is to try to get out of the way of R-devel changes in this regard until the new behavior of `write.csv` is released and confirmed. Package updates are not accepted on CRAN if they do not pass the latest daily version of R-devel, even if R-devel changes after the package update is submitted. If the change to `write.csv()` stands, then a future release of `data.table` will be needed to make `fwrite(, dateTimeAs="write.csv")` match `write.csv()` output again in that future version of R onwards. If you use an older version of `data.table` than said future one in the said future version of R, then `fwrite(, dateTimeAs="write.csv")` may not match `write.csv()` if you are using `getOption("digits.secs")` too. However, you can always check that your installation of `data.table` works in your version of R on your platform by simply running `test.data.table()` yourself. Doing so would detect such a situation for you: test 1741 would fail in this case. `test.data.table()` runs the entire suite of tests and is always available to you locally. This way you do not need to rely on our statements about which combinations of versions of R and `data.table` on which platforms we have tested and support; just run `test.data.table()` yourself. Having said that, because test 1741 has been relaxed in this release in order to be accepted on CRAN to pass latest R-devel, this won't be true for this particular release in regard to this particular test. + + ```R + $ R --vanilla + R version 4.2.1 (2022-06-23) -- "Funny-Looking Kid" + > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) + > options(digits.secs=0) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45 + > options(digits.secs=3) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45.012 + + $ Rdevel --vanilla + R Under development (unstable) (2022-10-06 r83040) -- "Unsuffered Consequences" + > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) + > options(digits.secs=0) + > write.csv(DF) + "","A" + "1",2022-10-01 01:23:45.012 + ``` + +5. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). + +6. 
`datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. + + > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. + + +# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) + +## NOTES + +1. clang 13.0.0 (Sep 2021) requires the system header `omp.h` to be included before R's headers, [#5122](https://github.com/Rdatatable/data.table/issues/5122). Many thanks to Prof Ripley for testing and providing a patch file. + + +# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) + +## POTENTIALLY BREAKING CHANGES + +1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. + + At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://posit.co/resources/videos/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). + + `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. 
As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. + + The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. + +## BUG FIXES + +1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. + +2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. + +## NOTES + +1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. + +2. `r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage being forwarded to being `https:` for many years. Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. + + +# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) + +## BUG FIXES + +1. Grouping could throw an error `Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. + +2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). 
Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. + + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. + +## NOTES + +1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmission reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. + + +# data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) + +## BUG FIXES + +1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. + +2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the dynamic schedule. 
Although never guaranteed by the OpenMP standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, so `fsort` now checks that threads are receiving iterations monotonically and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. OpenMP 4.5 may be enabled in some compilers using `-fopenmp-version=45`. Otherwise, if you need to upgrade compiler, https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. + +3. Columns containing functions that don't inherit the class `'function'` would fail to group, [#4814](https://github.com/Rdatatable/data.table/issues/4814). Thanks @mb706 for reporting, @ecoRoland2 for helping investigate, and @Coorsaa for a follow-up example involving environments. + +## NOTES + +1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/2016/behind-the-scenes-of-cran/). + +2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. + +3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example which has also been added to the test suite. 
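A quick way to check which OpenMP level an installation was built with, as mentioned in the `fsort` fix above (v1.13.4 bug fix 2), is shown below. This is an editor's sketch; the exact fields printed by the verbose output vary by data.table version and platform.

```R
library(data.table)
# verbose output includes the _OPENMP date value, e.g. 201511 == OpenMP 4.5, 201811 == OpenMP 5.0
getDTthreads(verbose = TRUE)
```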
+ + +# data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) + +## BUG FIXES + +1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. + +2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. + +3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks, and thanks to Venelin Mitov for assistance in tracing the memory fault. Thanks also to Hongyuan Jia and @ben-schwen for assistance in debugging the fix in dev to pass reverse dependency testing which highlighted, before release, that package `eplusr` would fail. Its good usage has been added to `data.table`'s test suite. + +4. `fread("1.2\n", colClasses='integer')` (note no columns names in the data) would segfault when creating a warning message, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present however, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. + +5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. + +## NOTES + +1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. + + The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. 
`bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se.

    We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release.

2. `?.NGRP` now displays the help page as intended, [#4649](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file.

3. `test.data.table()` failed in non-English locales such as `LC_TIME=fr_FR.UTF-8` due to `Jan` vs `janv.` in tests 168 and 2042, [#3450](https://github.com/Rdatatable/data.table/issues/3450). Thanks to @shrektan for reporting, and @tdhock for making the tests locale-aware.

4. User-supplied `PKG_LIBS` and `PKG_CFLAGS` are now retained, so the suggestion in https://mac.r-project.org/openmp/; i.e.,
    `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_<ver>.tar.gz`
    has a better chance of working on Mac.


# data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020)

## POTENTIALLY BREAKING CHANGES

1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then be read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed.

    Since this is a potentially breaking change, i.e.
existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. + + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. + +## NEW FEATURES + +1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). + +2. `CsubsetDT` C function is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organization of our C interface will be changed in future. + +3. `print` method for `data.table` gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. + +4. `setnames(DT, new=new_names)` (i.e. explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion. + +5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. + +6. New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details. 
+ + ```R + # Lazy evaluation + x = 1:10 + data.table::fcase( + x < 5L, 1L, + x >= 5L, 3L, + x == 5L, stop("provided value is an unexpected one!") + ) + # [1] 1 1 1 1 3 3 3 3 3 3 + + dplyr::case_when( + x < 5L ~ 1L, + x >= 5L ~ 3L, + x == 5L ~ stop("provided value is an unexpected one!") + ) + # Error in eval_tidy(pair$rhs, env = default_env) : + # provided value is an unexpected one! + + # Benchmark + x = sample(1:100, 3e7, replace = TRUE) # 114 MB + microbenchmark::microbenchmark( + dplyr::case_when( + x < 10L ~ 0L, + x < 20L ~ 10L, + x < 30L ~ 20L, + x < 40L ~ 30L, + x < 50L ~ 40L, + x < 60L ~ 50L, + x > 60L ~ 60L + ), + data.table::fcase( + x < 10L, 0L, + x < 20L, 10L, + x < 30L, 20L, + x < 40L, 30L, + x < 50L, 40L, + x < 60L, 50L, + x > 60L, 60L + ), + times = 5L, + unit = "s") + # Unit: seconds + # expr min lq mean median uq max neval + # dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 + # data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 + ``` + +7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `.SDcols=patterns(...)` can still be used for filtering based on the column names. + +8. Compiler support for OpenMP is now detected during installation, which allows `data.table` to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. + +9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. + +10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. + +11. `frank` gains `ties.method='last'`, paralleling the same in `base::order` which has been available since R 3.3.0 (April 2016), [#1689](https://github.com/Rdatatable/data.table/issues/1689). Thanks @abudis for the encouragement to accommodate this. + +12. The `keep.rownames` argument in `as.data.table.xts` now accepts a string, which can be used for specifying the column name of the index of the xts input, [#4232](https://github.com/Rdatatable/data.table/issues/4232). Thanks to @shrektan for the request and the PR. + +13. New symbol `.NGRP` available in `j`, [#1206](https://github.com/Rdatatable/data.table/issues/1206). `.GRP` (the group number) was already available taking values from `1` to `.NGRP`. The number of groups, `.NGRP`, might be useful in `j` to calculate a percentage of groups processed so far, or to do something different for the last or penultimate group, for example. + +14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. + +15. 
A new throttle feature has been introduced to speed up small data tasks that are repeated in a loop, [#3175](https://github.com/Rdatatable/data.table/issues/3175) [#3438](https://github.com/Rdatatable/data.table/issues/3438) [#3205](https://github.com/Rdatatable/data.table/issues/3205) [#3735](https://github.com/Rdatatable/data.table/issues/3735) [#3739](https://github.com/Rdatatable/data.table/issues/3739) [#4284](https://github.com/Rdatatable/data.table/issues/4284) [#4527](https://github.com/Rdatatable/data.table/issues/4527) [#4294](https://github.com/Rdatatable/data.table/issues/4294) [#1120](https://github.com/Rdatatable/data.table/issues/1120). The default throttle of 1024 means that a single thread will be used when nrow<=1024, two threads when nrow<=2048, etc. To change the default, use `setDTthreads(throttle=)`. Or use the new environment variable `R_DATATABLE_THROTTLE`. If you use `Sys.setenv()` in a running R session to change this environment variable, be sure to run an empty `setDTthreads()` call afterwards for the change to take effect; see `?setDTthreads`. The word *throttle* is used to convey that the number of threads is restricted (throttled) for small data tasks. Reducing throttle to 1 will turn off throttling and should revert behaviour to past versions (i.e. using many threads even for small data). Increasing throttle to, say, 65536 will utilize multi-threading only for larger datasets. The value 1024 is a guess. We welcome feedback and test results indicating what the best default should be. + +## BUG FIXES + +1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). + +2. `DT[i]` could segfault when `i` is a zero-column `data.table`, [#4060](https://github.com/Rdatatable/data.table/issues/4060). Thanks @shrektan for reporting and fixing. + +3. Dispatch of `first` and `last` functions now properly works again for `xts` objects, [#4053](https://github.com/Rdatatable/data.table/issues/4053). Thanks to @ethanbsmith for reporting. + +4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks also to @mllg for detecting a problem with the initial fix here during the dev release [#4173](https://github.com/Rdatatable/data.table/issues/4173). + +5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks @hongyuanjia and @ColeMiller1 for helping debug an issue in dev with the original fix before release, [#4612](https://github.com/Rdatatable/data.table/issues/4612). + +6. `all.equal(DT, y)` no longer errors when `y` is not a data.table, [#4042](https://github.com/Rdatatable/data.table/issues/4042). Thanks to @d-sci for reporting and the PR. + +7. A length 1 `colClasses=NA_character_` would cause `fread` to incorrectly coerce all columns to character, [#4237](https://github.com/Rdatatable/data.table/issues/4237). + +8. An `fwrite` error message could include a garbled number and cause test 1737.5 to fail, [#3492](https://github.com/Rdatatable/data.table/issues/3492). Thanks to @QuLogic for debugging the issue on ARMv7hl, and the PR fixing it. + +9. `fread` improves handling of very small (<1e-300) or very large (>1e+300) floating point numbers on non-x86 architectures (specifically ppc64le and armv7hl). 
Thanks to @QuLogic for reporting and fixing, [PR#4165](https://github.com/Rdatatable/data.table/pull/4165). + +10. When updating by reference, the use of `get` could result in columns being re-ordered silently, [#4089](https://github.com/Rdatatable/data.table/issues/4089). Thanks to @dmongin for reporting and Cole Miller for the fix. + +11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. + +12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. + +13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). + +14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. + +15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422). + +16. Non-equi joins now automatically set `allow.cartesian=TRUE`, [4489](https://github.com/Rdatatable/data.table/issues/4489). Thanks to @Henrik-P for reporting. + +17. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. + +18. `by=col1:col4` gave an incorrect result if `key(DT)==c("col1","col4")`, [#4285](https://github.com/Rdatatable/data.table/issues/4285). Thanks to @cbilot for reporting, and Cole Miller for the PR. + +19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. + +20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). + +## NOTES + +0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. + +1. 
`as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). + +2. `set2key`, `set2keyv`, and `key2` have been removed, as they have been warning since v1.9.8 (Nov 2016) and halting with helpful message since v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' and quickly superseded by `setindex` and `indices`. + +3. `data.table` now supports messaging in simplified Chinese (locale `zh_CN`). This was the result of a monumental collaboration to translate `data.table`'s roughly 1400 warnings, errors, and verbose messages (about 16,000 words/100,000 characters) over the course of two months from volunteer translators in at least 4 time zones, most of whom are first-time `data.table` contributors and many of whom are first-time OSS contributors! + + A big thanks goes out to @fengqifang, @hongyuanjia, @biobai, @zhiiiyang, @Leo-Lee15, @soappp9527, @amy17519, @Zachary-Wu, @caiquanyou, @dracodoc, @JulianYlli12, @renkun-ken, @Xueliang24, @koohoko, @KingdaShi, @gaospecial, @shrektan, @sunshine1126, @shawnchen1996, @yc0802, @HesperusArcher, and @Emberwhirl, all of whom took time from their busy schedules to translate and review others' translations. Especial thanks goes to @zhiiiyang and @hongyuanjia who went above and beyond in helping to push the project over the finish line, and to @GuangchuangYu who helped to organize the volunteer pool. + + `data.table` joins `lubridate` and `nlme` as the only of the top 200 most-downloaded community packages on CRAN to offer non-English messaging, and is the only of the top 50 packages to offer complete support of all messaging. We hope this is a first step in broadening the reach and accessibility of the R ecosystem to more users globally and look forward to working with other maintainers looking to bolster the portability of their packages by offering advice on learnings from this undertaking. + + We would be remiss not to mention the laudable lengths to which the R core team goes to maintain the _much_ larger repository (about 6,000 messages in more than 10 languages) of translations for R itself. + + We will evaluate the feasibility (in terms of maintenance difficulty and CRAN package size limits) of offering support for other languages in later releases. + +4. `fifelse` and `fcase` now notify users that S4 objects (except `nanotime`) are not supported [#4135](https://github.com/Rdatatable/data.table/issues/4135). Thanks to @torema-ed for bringing it to our attention and Morgan Jacob for the PR. + +5. `frank(..., ties.method="random", na.last=NA)` now returns the same random ordering that `base::rank` does, [#4243](https://github.com/Rdatatable/data.table/pull/4243). + +6. The error message when mistakenly using `:=` in `i` instead of `j` has been much improved, [#4227](https://github.com/Rdatatable/data.table/issues/4227). Thanks to Hugh Parsonage for the detailed suggestion. + + ```R + > DT = data.table(A=1:2) + > DT[B:=3] + Error: Operator := detected in i, the first argument inside DT[...], but is only valid in + the second argument, j. Most often, this happens when forgetting the first comma + (e.g. DT[newvar:=5] instead of DT[, new_var:=5]). Please double-check the + syntax. Run traceback(), and debugger() to get a line number. + > DT[, B:=3] + > DT + A B + + 1: 1 3 + 2: 2 3 + ``` + +7. 
Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363).

8. Changes upstream in R have been accommodated; e.g. `c.POSIXct` now raises `'origin' must be supplied` which impacted `foverlaps`, [#4428](https://github.com/Rdatatable/data.table/pull/4428).

9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting.

10. `data.table` package binaries built by R version 3 (R3) should only be installed in R3, and similarly `data.table` package binaries built by R4 should only be installed in R4. Otherwise, a `package ‘data.table’ was built under R version...` warning will occur and should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds, see news item in v1.12.6 below in this file. To continue to support both R3 and R4, `data.table`'s NAMESPACE file contains a condition on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528).

11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR.

12. The `datatable.old.unique.by.key` option has been removed as per the 4 year schedule detailed in note 10 of v1.12.4 (Oct 2019), note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). It has been generating a helpful warning for 2 years, and a helpful error for 1 year.


# data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019)

## NEW FEATURES

1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061).

## BUG FIXES

1. `frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing.

2. `DT[, sum(grp), by=grp]` (i.e. aggregating the same column being grouped) could error with `object 'grp' not found`, [#3103](https://github.com/Rdatatable/data.table/issues/3103). Thanks to @cbailiss for reporting.

## NOTES

1. Links in the manual were creating warnings when installing HTML, [#4000](https://github.com/Rdatatable/data.table/issues/4000). Thanks to Morgan Jacob.

2.
Adjustments for R-devel (R 4.0.0) which now has reference counting turned on, [#4058](https://github.com/Rdatatable/data.table/issues/4058) [#4093](https://github.com/Rdatatable/data.table/issues/4093). This motivated early release to CRAN because every day CRAN tests every package using the previous day's changes in R-devel; a much valued feature of the R ecosystem. It helps R-core if packages can pass changes in R-devel as soon as possible. Thanks to Luke Tierney for the notice, and for implementing reference counting which we look forward to very much. + +3. C internals have been standardized to use `PRI[u|d]64` to print `[u]int64_t`. This solves new warnings from `gcc-8` on Windows with `%lld`, [#4062](https://github.com/Rdatatable/data.table/issues/4062), in many cases already working around `snprintf` on Windows not supporting `%zu`. Release procedures have been augmented to prevent any internal use of `llu`, `lld`, `zu` or `zd`. + +4. `test.data.table()` gains `showProgress=interactive()` to suppress the thousands of `Running test id ...` lines displayed by CRAN checks when there are warnings or errors. + + +# data.table [v1.12.6](https://github.com/Rdatatable/data.table/milestone/18?closed=1) (18 Oct 2019) + +## BUG FIXES + +1. `shift()` on a `nanotime` with the default `fill=NA` now fills a `nanotime` missing value correctly, [#3945](https://github.com/Rdatatable/data.table/issues/3945). Thanks to @mschubmehl for reporting and fixing in PR [#3942](https://github.com/Rdatatable/data.table/pull/3942). + +2. Compilation failed on CRAN's MacOS due to an older version of `zlib.h/zconf.h` which did not have `z_const` defined, [#3939](https://github.com/Rdatatable/data.table/issues/3939). Other open-source projects unrelated to R have experienced this problem on MacOS too. We have followed the common practice of removing `z_const` to support the older `zlib` versions, and data.table's release procedures have gained a `grep` to ensure `z_const` isn't used again by accident in future. The library `zlib` is used for `fwrite`'s new feature of multithreaded compression on-the-fly; see item 3 of 1.12.4 below. + +3. A runtime error in `fwrite`'s compression, but only observed so far on Solaris 10 32bit with zlib 1.2.8 (Apr 2013), [#3931](https://github.com/Rdatatable/data.table/issues/3931): `Error -2: one or more threads failed to allocate buffers or there was a compression error.` In case it happens again, this area has been made more robust and the error more detailed. As is often the case, investigating the Solaris problem revealed secondary issues in the same area of the code. In this case, some `%d` in verbose output should have been `%lld`. This obliquity that CRAN's Solaris provides is greatly appreciated. + +4. A leak could occur in the event of an unsupported column type error, or if working memory could only partially be allocated; [#3940](https://github.com/Rdatatable/data.table/issues/3940). Found thanks to `clang`'s Leak Sanitizer (prompted by CRAN's diligent use of latest tools), and two tests in the test suite which tested the unsupported-type error. + +## NOTES + +1. Many thanks to Kurt Hornik for fixing R's S3 dispatch of `rbind` and `cbind` methods, [#3948](https://github.com/Rdatatable/data.table/issues/3948). With `R>=4.0.0` (current R-devel), `data.table` now registers the S3 methods `cbind.data.table` and `rbind.data.table`, and no longer applies the workaround documented in FAQ 2.24. 
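    As a minimal sketch of what the registered methods mean in practice (assuming R >= 4.0.0 with `data.table` attached; the example data are illustrative):

    ```R
    library(data.table)
    DT = data.table(id = 1:2, x = c("a", "b"))
    DF = data.frame(id = 3L, x = "c")
    # base::rbind() and base::cbind() dispatch to the registered
    # rbind.data.table / cbind.data.table methods for these calls,
    # so the old FAQ 2.24 workaround is no longer required.
    rbind(DT, DF)           # a data.table with 3 rows
    cbind(DT, flag = TRUE)  # a data.table with an extra column
    ```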
+ + +# data.table [v1.12.4](https://github.com/Rdatatable/data.table/milestone/16?closed=1) (03 Oct 2019) + +## NEW FEATURES + +1. `rleid()` functions now support long vectors (length > 2 billion). + +2. `fread()`: + * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, Roy Storey for the initial PR, and Bingjie Qian for testing this feature on a very complicated real-world file. + * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR. + * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR. + * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR. + * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/gesistsa/rio) for the inspiration and @MichaelChirico for implementing. + * `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses` it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. For example: + + ```R + fread(file, select=c(colD="character", # returns 2 columns: colD,colA + colA="integer64")) + fread(file, select=list(character="colD", # returns 5 columns: colD,8,9,10,colA + integer= 8:10, + character="colA")) + ``` + * gains `tmpdir=` argument which is passed to `tempfile()` whenever a temporary file is needed. Thanks to @mschubmehl for the PR. As before, setting `TMPDIR` (to `/dev/shm` for example) before starting the R session still works too; see `?base::tempdir`. + +3. `fwrite()`: + * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example: + + ```R + DT = data.table(A=rep(1:2, 100e6), B=rep(1:4, 50e6)) + fwrite(DT, "data.csv") # 763MB; 1.3s + fwrite(DT, "data.csv.gz") # 2MB; 1.6s + identical(fread("data.csv.gz"), DT) + ``` + + Note that compression is handled using `zlib` library. 
In the unlikely event of missing `zlib.h`, on a machine that is compiling `data.table` from sources, one may get `fwrite.c` compilation error `zlib.h: No such file or directory`. As of now, the easiest solution is to install missing library using `sudo apt install zlib1g-dev` (Debian/Ubuntu). Installing R (`r-base-dev`) depends on `zlib1g-dev` so this should be rather uncommon. If it happens to you please upvote related issue [#3872](https://github.com/Rdatatable/data.table/issues/3872). + + * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. + + * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing. + + * Now supports type `complex`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). + + * Gains `scipen` [#2020](https://github.com/Rdatatable/data.table/issues/2020), the number 1 most-requested feature [#3189](https://github.com/Rdatatable/data.table/issues/3189). The default is `getOption("scipen")` so that `fwrite` will now respect R's option in the same way as `base::write.csv` and `base::format`, as expected. The parameter and option name have been kept the same as base R's `scipen` for consistency and to aid online search. It stands for 'scientific penalty'; i.e., the number of characters to add to the width within which non-scientific number format is used if it will fit. A high penalty essentially turns off scientific format. We believe that common practice is to use a value of 999, however, if you do use 999, because your data _might_ include very long numbers such as `10^300`, `fwrite` needs to account for the worst case field width in its buffer allocation per thread. This may impact space or time. If you experience slowdowns or unacceptable memory usage, please pass `verbose=TRUE` to `fwrite`, inspect the output, and report the issue. A workaround, until we can determine the best strategy, may be to pass a smaller value to `scipen`, such as 50. We have observed that `fwrite(DT, scipen=50)` appears to write `10^50` accurately, unlike base R. However, this may be a happy accident and not apply generally. Further work may be needed in this area. + + ```R + DT = data.table(a=0.0001, b=1000000) + fwrite(DT) + # a,b + # 1e-04,1e+06 + fwrite(DT,scipen=1) + # a,b + # 0.0001,1e+06 + fwrite(DT,scipen=2) + # a,b + # 0.0001,1000000 + + 10^50 + # [1] 1e+50 + options(scipen=50) + 10^50 + # [1] 100000000000000007629769841091887003294964970946560 + fwrite(data.table(A=10^50)) + # A + # 100000000000000000000000000000000000000000000000000 + ``` + +4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). + + ```R + > DT = data.table(A=1:3, B=list(1:2,"foo",3:5)) + > DT + A B + + 1: 1 1,2 + 2: 2 foo + 3: 3 3,4,5 + > + # The following all accomplish the same assignment: + > DT[2, B:=letters[9:13]] # was error, now works + > DT[2, B:=.(letters[9:13])] # was error, now works + > DT[2, B:=.(list(letters[9:13]))] # .(list()) was needed, still works + > DT + A B + + 1: 1 1,2 + 2: 2 i,j,k,l,m + 3: 3 3,4,5 + ``` + +5. 
`print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. + +6. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. + + ```R + DT = setDT(lapply(1:100, function(i) sample(c(rnorm(9e6), rep(NA_real_, 1e6))))) + format(object.size(DT), units="GB") ## 7.5 Gb + zoo::na.locf(DT, na.rm=FALSE) ## zoo 53.518s + setDTthreads(1L) + nafill(DT, "locf") ## DT 1 thread 7.562s + setDTthreads(0L) + nafill(DT, "locf") ## DT 40 threads 0.605s + setnafill(DT, "locf") ## DT in-place 0.367s + ``` + +7. New variable `.Last.updated` (similar to R's `.Last.value`) contains the number of rows affected by the most recent `:=` or `set()`, [#1885](https://github.com/Rdatatable/data.table/issues/1885). For details see `?.Last.updated`. + +8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). `between()` gains `check=` which checks `any(lower>upper)`; off by default for speed in particular for type character. + +9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. + +10. `on=.NATURAL` (or alternatively `X[on=Y]` [#3621](https://github.com/Rdatatable/data.table/issues/3621)) joins two tables on their common column names, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. As before, when `on=` is not provided, `X` must have a key and the key columns are used to join (like rownames, but multi-column and multi-type). + +11. `as.data.table` gains `key` argument mirroring its use in `setDT` and `data.table`, [#890](https://github.com/Rdatatable/data.table/issues/890). As a byproduct, the arguments of `as.data.table.array` have changed order, which could affect code relying on positional arguments to this method. Thanks @cooldome for the suggestion and @MichaelChirico for implementation. + +12. `merge.data.table` is now exported, [#2618](https://github.com/Rdatatable/data.table/pull/2618). We realize that S3 methods should not ordinarily be exported. Rather, the method should be invoked via S3 dispatch. But users continue to request its export, perhaps because of intricacies relating to the fact that data.table inherits from data.frame, there are two arguments to `merge()` but S3 dispatch applies just to the first, and a desire to explicitly call `data.table::merge.data.table` from package code. Thanks to @AndreMikulec for the most recent request. + +13. New rolling function to calculate rolling sum has been implemented and exported, see `?frollsum`, [#2778](https://github.com/Rdatatable/data.table/issues/2778). + +14. 
`setkey` to an existing index now uses the index, [#2889](https://github.com/Rdatatable/data.table/issues/2889). Thanks to @MichaelChirico for suggesting and @saraswatmks for the PR.

15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`.

16. `as.data.table` now unpacks columns in a `data.frame` which are themselves a `data.frame` or `matrix`. This need arises when parsing JSON, a corollary in [#3369](https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752). Bug fix 19 in v1.12.2 (see below) added a helpful error (rather than segfault) to detect such invalid `data.table`, and promised that `as.data.table()` would unpack these columns in the next release (i.e. this release) so that the invalid `data.table` is not created in the first place. Further, `setDT` now warns if it observes such columns and suggests using `as.data.table` instead, [#3760](https://github.com/Rdatatable/data.table/issues/3760).

17. `CJ` has been ported to C and parallelized, thanks to a PR by Michael Chirico, [#3596](https://github.com/Rdatatable/data.table/pull/3596). All types benefit, but, as in many `data.table` operations, factors benefit more than character.

    ```R
    # default 4 threads on a laptop with 16GB RAM and 8 logical CPU

    ids = as.vector(outer(LETTERS, LETTERS, paste0))
    system.time( CJ(ids, 1:500000) )   # 3.9GB; 340m rows
    #   user  system elapsed (seconds)
    #  3.000   0.817   3.798   # was
    #  1.800   0.832   2.190   # now

    # ids = as.factor(ids)
    system.time( CJ(ids, 1:500000) )   # 2.6GB; 340m rows
    #   user  system elapsed (seconds)
    #  1.779   0.534   2.293   # was
    #  0.357   0.763   0.292   # now
    ```

18. New function `fcoalesce(...)` has been written in C, and is multithreaded for `numeric` and `factor`. It replaces missing values according to a prioritized list of candidates (as per SQL COALESCE, `dplyr::coalesce`, and `hutils::coalesce`), [#3424](https://github.com/Rdatatable/data.table/issues/3424). It accepts any number of vectors in several forms. For example, given three vectors `x`, `y`, and `z`, where each `NA` in `x` is to be replaced by the corresponding value in `y` if that is non-NA, else the corresponding value in `z`, the following equivalent forms are all accepted: `fcoalesce(x,y,z)`, `fcoalesce(x,list(y,z))`, and `fcoalesce(list(x,y,z))`. Being a new function, its behaviour is subject to change particularly for type `list`, [#3712](https://github.com/Rdatatable/data.table/issues/3712).

    ```R
    # default 4 threads on a laptop with 16GB RAM and 8 logical CPU
    N = 100e6
    x = replicate(5, {x=sample(N); x[sample(N, N/2)]=NA; x}, simplify=FALSE)  # 2GB
    system.time(y1 <- do.call(dplyr::coalesce, x))
    system.time(y2 <- do.call(hutils::coalesce, x))
    system.time(y3 <- do.call(data.table::fcoalesce, x))
    #  user  system elapsed (seconds)
    # 4.935   1.876   6.810   # dplyr::coalesce
    # 3.122   0.831   3.956   # hutils::coalesce
    # 0.915   0.099   0.379   # data.table::fcoalesce
    identical(y1,y2) && identical(y1,y3)
    # TRUE
    ```

19. Type `complex` is now supported by `setkey`, `setorder`, `:=`, `by=`, `keyby=`, `shift`, `dcast`, `frank`, `rowid`, `rleid`, `CJ`, `fcoalesce`, `unique`, and `uniqueN`, [#3690](https://github.com/Rdatatable/data.table/issues/3690).
Thanks to Gareth Ward and Elio Campitelli for their reports and input. Sorting `complex` is achieved the same way as base R; i.e., first by the real part then by the imaginary part (as if the `complex` column were two separate columns of `double`). There is no plan to support joining/merging on `complex` columns until a user demonstrates a need for that. + +20. `setkey`, `[key]by=` and `on=` in verbose mode (`options(datatable.verbose=TRUE)`) now detect any columns inheriting from `Date` which are stored as 8 byte double, test if any fractions are present, and if not suggest using a 4 byte integer instead (such as `data.table::IDate`) to save space and time, [#1738](https://github.com/Rdatatable/data.table/issues/1738). In future this could be upgraded to `message` or `warning` depending on feedback. + +21. New function `fifelse(test, yes, no, na)` has been implemented in C by Morgan Jacob, [#3657](https://github.com/Rdatatable/data.table/issues/3657) and [#3753](https://github.com/Rdatatable/data.table/issues/3753). It is comparable to `base::ifelse`, `dplyr::if_else`, `hutils::if_else`, and (forthcoming) [`vctrs::if_else()`](https://vctrs.r-lib.org/articles/stability.html#ifelse). It returns a vector of the same length as `test` but unlike `base::ifelse` the output type is consistent with those of `yes` and `no`. Please see `?data.table::fifelse` for more details. + + ```R + # default 4 threads on a laptop with 16GB RAM and 8 logical CPU + x = sample(c(TRUE,FALSE), 3e8, replace=TRUE) # 1GB + microbenchmark::microbenchmark( + base::ifelse(x, 7L, 11L), + dplyr::if_else(x, 7L, 11L), + hutils::if_else(x, 7L, 11L), + data.table::fifelse(x, 7L, 11L), + times = 5L, unit="s" + ) + # Unit: seconds + # expr min med max neval + # base::ifelse(x, 7L, 11L) 8.5 8.6 8.8 5 + # dplyr::if_else(x, 7L, 11L) 9.4 9.5 9.7 5 + # hutils::if_else(x, 7L, 11L) 2.6 2.6 2.7 5 + # data.table::fifelse(x, 7L, 11L) 1.5 1.5 1.6 5 # setDTthreads(1) + # data.table::fifelse(x, 7L, 11L) 0.8 0.8 0.9 5 # setDTthreads(2) + # data.table::fifelse(x, 7L, 11L) 0.4 0.4 0.5 5 # setDTthreads(4) + ``` + +22. `transpose` gains `keep.names=` and `make.names=` arguments, [#1886](https://github.com/Rdatatable/data.table/issues/1886). Previously, column names were dropped and there was no way to keep them. `keep.names="rn"` keeps the column names and puts them in the `"rn"` column of the result. Similarly, `make.names="rn"` uses column `"rn"` as the column names of the result. Both arguments are `NULL` by default for backwards compatibility. As these new arguments are new, they are subject to change in future according to community feedback. Thanks to @ghost for the request. + +23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). + +24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. + +25. `setnames(DT, old, new)` now omits any `old==new` to save redundant key and index name updates, [#3783](https://github.com/Rdatatable/data.table/issues/3783). `setnames(DT, new)` (i.e. not providing `old`) already omitted any column name updates where `names(DT)==new`; e.g. `setnames(DT, gsub('^_', '', names(DT)))` exits early if no columns start with `_`. + +26. `[[` by group is now optimized for regular vectors (not type list), [#3209](https://github.com/Rdatatable/data.table/issues/3209). 
Thanks @renkun-ken for the suggestion. `[` by group was already optimized. Please file a feature request if you would like this optimization for list columns. + +27. New function `frollapply` for rolling computation of arbitrary R functions (caveat: input `x` is coerced to numeric beforehand, and the function must return a scalar numeric value). The API is consistent to extant rolling functions `frollmean` and `frollsum`; note that it will generally be slower than those functions because (1) the known functions use our optimized internal C implementation and (2) there is no thread-safe API to R's C `eval`. Nevertheless `frollapply` is faster than corresponding `base`-only and `zoo` versions: + + ```R + set.seed(108) + x = rnorm(1e6); n = 1e3 + base_rollapply = function(x, n, FUN) { + nx = length(x) + ans = rep(NA_real_, nx) + for (i in n:nx) ans[i] = FUN(x[(i-n+1):i]) + ans + } + system.time(base_rollapply(x, n, mean)) + system.time(zoo::rollapplyr(x, n, function(x) mean(x), fill=NA)) + system.time(zoo::rollmeanr(x, n, fill=NA)) + system.time(frollapply(x, n, mean)) + system.time(frollmean(x, n)) + + ### fun mean sum median + # base_rollapply 8.815 5.151 60.175 + # zoo::rollapply 34.373 27.837 88.552 + # zoo::roll[fun] 0.215 0.185 NA ## median not fully supported + # frollapply 5.404 1.419 56.475 + # froll[fun] 0.003 0.002 NA ## median not yet supported + ``` + +28. `setnames()` now accepts functions in `old=` and `new=`, [#3703](https://github.com/Rdatatable/data.table/issues/3703). Thanks @smingerson for the feature request and @shrektan for the PR. + + ```R + DT = data.table(a=1:3, b=4:6, c=7:9) + setnames(DT, toupper) + names(DT) + # [1] "A" "B" "C" + setnames(DT, c(1,3), tolower) + names(DT) + # [1] "a" "B" "c" + ``` + +29. `:=` and `set()` now use zero-copy type coercion. Accordingly, `DT[..., integerColumn:=0]` and `set(DT,i,j,0)` no longer warn about the `0` ('numeric') needing to be `0L` ('integer') because there is no longer any time or space used for this coercion. The old long warning was off-putting to new users ("what and why L?"), whereas advanced users appreciated the old warning so they could avoid the coercion. Although the time and space for one coercion in a single call is unmeasurably small, when placed in a loop the small overhead of any allocation on R's heap could start to become noticeable (more so for `set()` whose purpose is low-overhead looping). Further, when assigning a value across columns of varying types, it could be inconvenient to supply the correct type for every column. Hence, zero-copy coercion was introduced to satisfy all these requirements. A warning is still issued, as before, when fractional data is discarded; e.g. when 3.14 is assigned to an integer column. Zero-copy coercion applies to length>1 vectors as well as length-1 vectors. + +## BUG FIXES + +1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. + +2. `keyby=colName` could use the wrong index and return incorrect results if both `colName` and `colNameExtra` (where `colName` is a leading subset of characters of `colNameExtra`) are column names and an index exists on `colNameExtra`, [#3498](https://github.com/Rdatatable/data.table/issues/3498). Thanks to Xianying Tan for the detailed report and pinpointing the source line at fault. + +3. 
A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report.

4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288).

5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example.

6. Printing could occur unexpectedly when code is run with `source`, [#2369](https://github.com/Rdatatable/data.table/issues/2369). Thanks to @jan-glx for the report and reproducible example.

7. Grouping by `NULL` on a zero-row `data.table` now behaves consistently with a non-zero-row `data.table`, [#3530](https://github.com/Rdatatable/data.table/issues/3530). Thanks to @SymbolixAU for the report and reproducible example.

8. GForce optimization of `median` did not retain the class; e.g. `median` of `Date` or `POSIXct` would return a raw number rather than retain the date class, [#3079](https://github.com/Rdatatable/data.table/issues/3079). Thanks to @Henrik-P for reporting.

9. `DT[, format(mean(date), "%b-%Y"), by=group]` could fail with `invalid 'trim' argument`, [#1876](https://github.com/Rdatatable/data.table/issues/1876). Thanks to Ross Holmberg for reporting.

10. `externalVar=1:5; DT[, mean(externalVar), by=group]` could return incorrect results rather than a constant (`3` in this example) for each group, [#875](https://github.com/Rdatatable/data.table/issues/875). GForce optimization was being applied incorrectly to the `mean` without realizing `externalVar` was not a column.

11. `test.data.table()` now passes in non-English R sessions, [#630](https://github.com/Rdatatable/data.table/issues/630) [#3039](https://github.com/Rdatatable/data.table/issues/3039). Each test still checks that the number of warnings and/or errors produced is correct. However, a message is displayed suggesting to restart R with `LANGUAGE=en` in order to test that the text of the warning and/or error messages is as expected, too.

12. Joining a double column in `i` containing say 1.3, with an integer column in `x` containing say 1, would result in the 1.3 matching to 1, [#2592](https://github.com/Rdatatable/data.table/issues/2592), and joining a factor column to an integer column would match the factor's integers rather than error. The type coercion logic has been revised and strengthened. Many thanks to @MarkusBonsch for reporting and fixing. Joining a character column in `i` to a factor column in `x` is now faster and retains the character column in the result rather than coercing it to factor. Joining an integer column in `i` to a double column in `x` now retains the integer type in the result rather than coercing the integers into the double type. Logical columns may now only be joined to logical columns, other than all-NA columns which are coerced to the matching column's type.
All coercions are reported in verbose mode: `options(datatable.verbose=TRUE)`.

13. Attempting to recycle 2 or more items into an existing `list` column now gives the intended helpful error rather than `Internal error: recycle length error not caught earlier.`, [#3543](https://github.com/Rdatatable/data.table/issues/3543). Thanks to @MichaelChirico for finding and reporting.

14. Subassigning using `$<-` to a `data.table` embedded in a list column of a single-row `data.table` could fail, [#3474](https://github.com/Rdatatable/data.table/issues/3474). Note that `$<-` is not recommended; please use `:=` instead which already worked in this case. Thanks to Jakob Richter for reporting.

15. `rbind` and `rbindlist` of zero-row items now retain (again) the unused levels of any (zero-length) factor columns, [#3508](https://github.com/Rdatatable/data.table/issues/3508). This was a regression in v1.12.2 just for zero-row items. Unused factor levels were already retained for items having `nrow>=1`. Thanks to Gregory Demin for reporting.

16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a regression in v1.12.2. Thanks to Damian Betebenner for reporting. Also fixed is a related segfault when recycling a length-1 factor column, [#3662](https://github.com/Rdatatable/data.table/issues/3662).

17. `example(":=", local=TRUE)` now works rather than erroring, [#2972](https://github.com/Rdatatable/data.table/issues/2972). Thanks @vlulla for the report.

18. `rbind.data.frame` on `IDate` columns changed the column from `integer` to `double`, [#2008](https://github.com/Rdatatable/data.table/issues/2008). Thanks to @rmcgehee for reporting.

19. `merge.data.table` now retains any custom classes of the first argument, [#1378](https://github.com/Rdatatable/data.table/issues/1378). Thanks to @michaelquinn32 for reopening.

20. `c`, `seq` and `mean` of `ITime` objects now retain the `ITime` class via new `ITime` methods, [#3628](https://github.com/Rdatatable/data.table/issues/3628). Thanks @UweBlock for reporting. The `cut` and `split` methods for `ITime` have been removed since the default methods work, [#3630](https://github.com/Rdatatable/data.table/pull/3630).

21. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636).

22. Adding a `list` column using `cbind`, `as.data.table`, or `data.table` now works rather than treating the `list` as if it were a set of columns and introducing an invalid NA column name, [#3471](https://github.com/Rdatatable/data.table/pull/3471). However, please note that using `:=` to add columns is preferred.

    ```R
    cbind( data.table(1:2), list(c("a","b"),"a") )
    #    V1  V2  NA    # v1.12.2 and before
    #
    # 1:  1   a   a
    # 2:  2   b   a
    #
    #    V1   V2       # v1.12.4+
    #
    # 1:  1  a,b
    # 2:  2    a
    ```

23. Incorrect sorting/grouping results due to a bug in Intel's `icc` compiler 2019 (Version 19.0.4.243 Build 20190416) have been worked around thanks to a report and fix by Sebastian Freundt, [#3647](https://github.com/Rdatatable/data.table/issues/3647). Please run `data.table::test.data.table()`. If that passes, your installation does not have the problem.

24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635).
Thanks to @UweBlock for the report.

25. Slight fix to the logic for auto-naming the `by` clause: when a custom function like `evaluate` is used, the column is now named `evaluate` instead of after the first symbolic argument, [#3758](https://github.com/Rdatatable/data.table/issues/3758).

26. Column binding of a zero-column `data.table` now works as expected, [#3334](https://github.com/Rdatatable/data.table/issues/3334). Thanks to @kzenstratus for the report.

27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report.

28. From v1.12.0 `between()` and `%between%` interpret missing values in `lower=` or `upper=` as unlimited bounds. A new parameter `NAbounds` has been added to achieve the old behaviour of returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks @cguill95 for reporting. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported too.

29. `integer64` defined on a subset of a new column would leave "gibberish" on the remaining rows, [#3723](https://github.com/Rdatatable/data.table/issues/3723). A bug in `rbindlist` with the same root cause was also fixed, [#1459](https://github.com/Rdatatable/data.table/issues/1459). Thanks @shrektan and @jangorecki for the reports.

30. `groupingsets` functions now properly handle lone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report.

31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue.

32. `x[, round(.SD, 1)]` and similar operations on the whole of `.SD` could return a locked result, incorrectly preventing `:=` on the result, [#2245](https://github.com/Rdatatable/data.table/issues/2245). Thanks @grayskripko for raising.

33. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), and [#2036](https://github.com/Rdatatable/data.table/issues/2036). Thanks @franknarf1, @MichaelChirico, and @TonyBonen, for the reports.

34. `DT[, i-1L, with=FALSE]` would misinterpret the minus sign and return an incorrect result, [#2109](https://github.com/Rdatatable/data.table/issues/2109). Thanks @cguill95 for the report.

35. `DT[id==1, DT2[.SD, on="id"]]` (i.e. joining from `.SD` in `j`) could incorrectly fail in some cases due to `.SD` being locked, [#1926](https://github.com/Rdatatable/data.table/issues/1926), and when updating-on-join with factors [#3559](https://github.com/Rdatatable/data.table/issues/3559) [#2099](https://github.com/Rdatatable/data.table/issues/2099). Thanks @franknarf1 and @Henrik-P for the reports and for diligently tracking use cases for almost 3 years!

36. `as.IDate.POSIXct` returned `NA` for UTC times before Dec 1901 and after Jan 2038, [#3780](https://github.com/Rdatatable/data.table/issues/3780).
Thanks @gschett for the report. + +37. `rbindlist` now returns correct idcols for lists with different length vectors, [#3785](https://github.com/Rdatatable/data.table/issues/3785), [#3786](https://github.com/Rdatatable/data.table/pull/3786). Thanks to @shrektan for the report and fix. + +38. `DT[ , !rep(FALSE, ncol(DT)), with=FALSE]` correctly returns the full table, [#3013](https://github.com/Rdatatable/data.table/issues/3013) and [#2917](https://github.com/Rdatatable/data.table/issues/2917). Thanks @alexnss and @DavidArenburg for the reports. + +39. `shift(x, 0:1, type='lead', give.names=TRUE)` uses `lead` in all returned column names, [#3832](https://github.com/Rdatatable/data.table/issues/3832). Thanks @daynefiler for the report. + +40. Subtracting two `POSIXt` objects by group could lead to incorrect results because the `base` method internally calls `difftime` with `units='auto'`; `data.table` does not notice if the chosen units differ by group and only the last group's `units` attribute was retained, [#3694](https://github.com/Rdatatable/data.table/issues/3694) and [#761](https://github.com/Rdatatable/data.table/issues/761). To surmount this, we now internally force `units='secs'` on all `POSIXt-POSIXt` calls (reported when `verbose=TRUE`); generally we recommend calling `difftime` directly instead. Thanks @oliver-oliver and @boethian for the reports. + +41. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), [#2036](https://github.com/Rdatatable/data.table/issues/2036), and [#2946](https://github.com/Rdatatable/data.table/issues/2946). Thanks @franknarf1, @MichaelChirico, @TonyBonen, and Steffen J. (StackOverflow) for the reports. + +42. `DT[...,by={...}]` now handles expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. + +43. `:=` could change a `data.table` creation statement in the body of the function calling it, or a variable in calling scope, [#3890](https://github.com/Rdatatable/data.table/issues/3890). Many thanks to @kirillmayantsev for the detailed reports. + +44. Grouping could create a `malformed factor` and/or segfault when the factors returned by each group did not have identical levels, [#2199](https://github.com/Rdatatable/data.table/issues/2199) and [#2522](https://github.com/Rdatatable/data.table/issues/2522). Thanks to Václav Hausenblas, @franknarf1, @ben519, and @Henrik-P for reporting. + +45. `rbindlist` (and printing a `data.table` with over 100 rows because that uses `rbindlist(head, tail)`) could error with `malformed factor` for unordered factor columns containing a used `NA_character_` level, [#3915](https://github.com/Rdatatable/data.table/issues/3915). This is an unusual input for unordered factors because NA_integer_ is recommended by default in R. Thanks to @sindribaldur for reporting. + +46. Adding a `list` column containing an item of type `list` to a one row `data.table` could fail, [#3626](https://github.com/Rdatatable/data.table/issues/3626). Thanks to Jakob Richter for reporting. + +## NOTES + +1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. + +2. 
Adding a new column by reference using `set()` on a `data.table` loaded from binary file now give a more helpful error message, [#2996](https://github.com/Rdatatable/data.table/issues/2996). Thanks to Joseph Burling for reporting. + + ``` + This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed + manually (e.g. using structure()). Please run setDT() or alloc.col() on it first (to pre-allocate + space for new columns) before adding new columns by reference to it. + ``` + +3. `setorder` on a superset of a keyed `data.table`'s key now retains its key, [#3456](https://github.com/Rdatatable/data.table/issues/3456). For example, if `a` is the key of `DT`, `setorder(DT, a, -v)` will leave `DT` keyed by `a`. + +4. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. + +5. When loading a `data.table` from disk (e.g. with `readRDS`), best practice is to run `setDT()` on the new object to assure it is correctly allocated memory for new column pointers. Barring this, unexpected behavior can follow; for example, if you assign a new column to `DT` from a function `f`, the new columns will only be assigned within `f` and `DT` will be unchanged. The `verbose` messaging in this situation is now more helpful, [#1729](https://github.com/Rdatatable/data.table/issues/1729). Thanks @vspinu for sharing his experience to spur this. + +6. New vignette _Using `.SD` for Data Analysis_, a deep dive into use cases for the `.SD` variable to help illuminate this topic which we've found to be a sticking point for beginning and intermediate `data.table` users, [#3412](https://github.com/Rdatatable/data.table/issues/3412). + +7. Added a note to `?frank` clarifying that ranking is being done according to C sorting (i.e., like `forder`), [#2328](https://github.com/Rdatatable/data.table/issues/2328). Thanks to @cguill95 for the request. + +8. Historically, `dcast` and `melt` were built as enhancements to `reshape2`'s own `dcast`/`melt`. We removed dependency on `reshape2` in v1.9.6 but maintained some backward compatibility. As that package has been superseded since December 2017, we will begin to formally complete the split from `reshape2` by removing some last vestiges. In particular we now warn when redirecting to `reshape2` methods and will later error before ultimately completing the split; see [#3549](https://github.com/Rdatatable/data.table/issues/3549) and [#3633](https://github.com/Rdatatable/data.table/issues/3633). We thank the `reshape2` authors for their original inspiration for these functions, and @ProfFancyPants for testing and reporting regressions in dev which have been fixed before release. + +9. `DT[col]` where `col` is a column containing row numbers of itself to select, now suggests the correct syntax (`DT[(col)]` or `DT[DT$col]`), [#697](https://github.com/Rdatatable/data.table/issues/697). This expands the message introduced in [#1884](https://github.com/Rdatatable/data.table/issues/1884) for the case where `col` is type `logical` and `DT[col==TRUE]` is suggested. + +10. 
The `datatable.old.unique.by.key` option has been warning for 1 year that it is deprecated: `... Please stop using it and pass by=key(DT) instead for clarity ...`. This warning is now upgraded to error as per the schedule in note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). In June 2020 the option will be removed. + +11. We intend to deprecate the `datatable.nomatch` option, [more info](https://github.com/Rdatatable/data.table/pull/3578/files). A message is now printed upon use of the option (once per session) as a first step. It asks you to please stop using the option and to pass `nomatch=NULL` explicitly if you require inner join. Outer join (`nomatch=NA`) has always been the default because it is safer; it does not drop missing data silently. The problem is that the option is global; i.e., if a user changes the default using this option for their own use, that can change the behavior of joins inside packages that use `data.table` too. This is the only `data.table` option with this concern. + +12. The test suite of 9k tests now runs with three R options on: `warnPartialMatchArgs`, `warnPartialMatchAttr`, and `warnPartialMatchDollar`. This ensures that we don't rely on partial argument matching in internal code, for robustness and efficiency, and so that users can turn these options on for their code in production, [#3664](https://github.com/Rdatatable/data.table/issues/3664). Thanks to Vijay Lulla for the suggestion, and Michael Chirico for fixing 48 internal calls to `attr()` which were missing `exact=TRUE`, for example. Thanks to R-core for adding these options to R 2.6.0 (Oct 2007). + +13. `test.data.table()` could fail if the `datatable.integer64` user option was set, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for reporting. + +14. The warning message when using `keyby=` together with `:=` is clearer, [#2763](https://github.com/Rdatatable/data.table/issues/2763). Thanks to @eliocamp. + +15. `first` and `last` gain an explicit `n=1L` argument so that it's clear the default is 1, and their almost identical manual pages have been merged into one. + +16. Rolling functions (`?froll`) coerce `logical` input to `numeric` (instead of failing) to mimic the behavior of `integer` input. + +17. The warning message when using `strptime` in `j` has been improved, [#2068](https://github.com/Rdatatable/data.table/issues/2068). Thanks to @tdhock for the report. + +18. Added a note to `?setkey` clarifying that `setkey` always uses C-locale sorting (as has been noted in `?setorder`). Thanks @JBreidaks for the report in [#2114](https://github.com/Rdatatable/data.table/issues/2114). + +19. `hour()`/`minute()`/`second()` are much faster for `ITime` input, [#3518](https://github.com/Rdatatable/data.table/issues/3158). + +20. New alias `setalloccol` for `alloc.col`, [#3475](https://github.com/Rdatatable/data.table/issues/3475). For consistency with `set*` prefixes for functions that operate in-place (like `setkey`, `setorder`, etc.). `alloc.col` is not going to be deprecated but we recommend using `setalloccol`. + +21. `dcast` no longer emits a message when `value.var` is missing but `fun.aggregate` is explicitly set to `length` (since `value.var` is arbitrary in this case), [#2980](https://github.com/Rdatatable/data.table/issues/2980). + +22. Optimized `mean` of `integer` columns no longer warns about a coercion to numeric, [#986](https://github.com/Rdatatable/data.table/issues/986). 
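    For example, a minimal sketch with made-up data (the grouped `mean` below goes through the optimized grouping code):

    ```R
    DT = data.table(g = c(1L, 1L, 2L), x = 1:3)
    DT[, mean(x), by = g]   # optimized grouped mean of an integer column; no coercion warning now
    ```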
Thanks @dgrtwo for his [YouTube tutorial at 3:01](https://youtu.be/AmE4LXPQErM?t=175) where the warning occurs. + +23. Using `first` and `last` function on `POSIXct` object no longer loads `xts` namespace, [#3857](https://github.com/Rdatatable/data.table/issues/3857). `first` on empty `data.table` returns empty `data.table` now [#3858](https://github.com/Rdatatable/data.table/issues/3858). + +24. Added some clarifying details about what happens when a shell command is used in `fread`, [#3877](https://github.com/Rdatatable/data.table/issues/3877). Thanks Brian for the StackOverflow question which highlighted the lack of explanation here. + +25. We continue to encourage packages to `Import` rather than `Depend` on `data.table`, [#3076](https://github.com/Rdatatable/data.table/issues/3076). To prevent the growth rate in new packages using `Depend`, we have requested that CRAN apply a small patch we provided to prevent new submissions using `Depend`. If this is accepted, the error under `--as-cran` will be as follows. The existing 73 packages using `Depend` will continue to pass OK until they next update, at which point they will be required to change from `Depend` to `Import`. + + ``` + R CMD check --as-cran + ... + * checking package dependencies ... ERROR + + data.table should be in Imports not Depends. Please contact its + maintainer for more information. + ``` + + +# data.table [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) + +## NEW FEATURES + +1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. In rare cases where you need to recycle a length>1 vector, please use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 774 CRAN and Bioconductor packages using `data.table` were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead, [#3310](https://github.com/Rdatatable/data.table/pull/3310). Upon agreement we went ahead. Many thanks to all those maintainers for already updating on CRAN, [#3347](https://github.com/Rdatatable/data.table/pull/3347). + +2. `foverlaps` now supports `type="equal"`, [#3416](https://github.com/Rdatatable/data.table/issues/3416) and part of [#3002](https://github.com/Rdatatable/data.table/issues/3002). + +3. The number of logical CPUs used by default has been reduced from 100% to 50%. The previous 100% default was reported to cause significant slow downs when other non-trivial processes were also running, [#3395](https://github.com/Rdatatable/data.table/issues/3395) [#3298](https://github.com/Rdatatable/data.table/issues/3298). Two new optional environment variables (`R_DATATABLE_NUM_PROCS_PERCENT` & `R_DATATABLE_NUM_THREADS`) control this default. `setDTthreads()` gains `percent=` and `?setDTthreads` has been significantly revised. The output of `getDTthreads(verbose=TRUE)` has been expanded. The environment variable `OMP_THREAD_LIMIT` is now respected ([#3300](https://github.com/Rdatatable/data.table/issues/3300)) in addition to `OMP_NUM_THREADS` as before. + +4. 
`rbind` and `rbindlist` now retain the position of duplicate column names rather than grouping them together [#3373](https://github.com/Rdatatable/data.table/issues/3373), fill length 0 columns (including NULL) with NA with warning [#1871](https://github.com/Rdatatable/data.table/issues/1871), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524). Thanks to Kun Ren for the requests which arose when parsing JSON. + +5. `rbindlist`'s `use.names=` default has changed from `FALSE` to `"check"`. This emits a message if the column names of each item are not identical and then proceeds as if `use.names=FALSE` for backwards compatibility; i.e., bind by column position not by column name. The `rbind` method for `data.table` already sets `use.names=TRUE` so this change affects `rbindlist` only and not `rbind.data.table`. To stack differently named columns together silently (the previous default behavior of `rbindlist`), it is now necessary to specify `use.names=FALSE` for clarity to readers of your code. Thanks to Clayton Stanley who first raised the issue [here](https://lists.r-forge.r-project.org/pipermail/datatable-help/2014-April/002480.html). To aid pinpointing the calls to `rbindlist` that need attention, the message can be turned to error using `options(datatable.rbindlist.check="error")`. This option also accepts `"warning"`, `"message"` and `"none"`. In this release the message is suppressed for default column names (`"V[0-9]+"`); the next release will emit the message for those too. In 6 months the default will be upgraded from message to warning. There are two slightly different messages. They are helpful, include context and point to this news item : + + ``` + Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with + NA (NULL for list columns), or use.names=FALSE to ignore column names. + See news item 5 in v1.12.2 for options to control this message. + + Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE + to match by column name, or use.names=FALSE to ignore column names. + See news item 5 in v1.12.2 for options to control this message. + ``` + +6. `fread` gains `keepLeadingZeros`, [#2999](https://github.com/Rdatatable/data.table/issues/2999). By default `FALSE` so that, as before, a field containing `001` is interpreted as the integer 1, otherwise the character string `"001"`. The default may be changed using `options(datatable.keepLeadingZeros=TRUE)`. Many thanks to @marc-outins for the PR. + +## BUG FIXES + +1. `rbindlist()` of a malformed factor which is missing a levels attribute is now a helpful error rather than a cryptic error about `STRING_ELT`, [#3315](https://github.com/Rdatatable/data.table/issues/3315). Thanks to Michael Chirico for reporting. + +2. Forgetting `type=` in `shift(val, "lead")` would segfault, [#3354](https://github.com/Rdatatable/data.table/issues/3354). A helpful error is now produced to indicate `"lead"` is being passed to `n=` rather than the intended `type=` argument. Thanks to @SymbolixAU for reporting. + +3. The default print output (top 5 and bottom 5 rows) when ncol>255 could display the columns in the wrong order, [#3306](https://github.com/Rdatatable/data.table/issues/3306). Thanks to Kun Ren for reporting. + +4. Grouping by unusual column names such as `by='string_with_\\'` and `keyby="x y"` could fail, [#3319](https://github.com/Rdatatable/data.table/issues/3319) [#3378](https://github.com/Rdatatable/data.table/issues/3378). 
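    A minimal sketch of the second case (a column name containing a space), using made-up data:

    ```R
    DT = data.table("x y" = c(1, 1, 2), v = 1:3)
    DT[, sum(v), keyby = "x y"]   # grouping by the quoted name now works
    ```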
Thanks to @HughParsonage for reporting and @MichaelChirico for the fixes. + +5. `foverlaps()` could return incorrect results for `POSIXct <= 1970-01-01`, [#3349](https://github.com/Rdatatable/data.table/issues/3349). Thanks to @lux5 for reporting. + +6. `dcast.data.table` now handles functions passed to `fun.aggregate=` via a variable; e.g., `funs <- list(sum, mean); dcast(..., fun.aggregate=funs`, [#1974](https://github.com/Rdatatable/data.table/issues/1974) [#1369](https://github.com/Rdatatable/data.table/issues/1369) [#2064](https://github.com/Rdatatable/data.table/issues/2064) [#2949](https://github.com/Rdatatable/data.table/issues/2949). Thanks to @sunbee, @Ping2016, @smidelius and @d0rg0ld for reporting. + +7. Some non-equijoin cases could segfault, [#3401](https://github.com/Rdatatable/data.table/issues/3401). Thanks to @Gayyam for reporting. + +8. `dcast.data.table` could sort rows containing `NA` incorrectly, [#2202](https://github.com/Rdatatable/data.table/issues/2202). Thanks to @Galileo-Galilei for the report. + +9. Sorting, grouping and finding unique values of a numeric column containing at most one finite value (such as `c(Inf,0,-Inf)`) could return incorrect results, [#3372](https://github.com/Rdatatable/data.table/issues/3372) [#3381](https://github.com/Rdatatable/data.table/issues/3381); e.g., `data.table(A=c(Inf,0,-Inf), V=1:3)[,sum(V),by=A]` would treat the 3 rows as one group. This was a regression in 1.12.0. Thanks to Nicolas Ampuero for reporting. + +10. `:=` with quoted expression and dot alias now works as expected, [#3425](https://github.com/Rdatatable/data.table/pull/3425). Thanks to @franknarf1 for raising and @jangorecki for the PR. + +11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause. + +12. `rbind` and `rbindlist(..., use.names=TRUE)` with over 255 columns could return the columns in a random order, [#3373](https://github.com/Rdatatable/data.table/issues/3373). The contents and name of each column was correct but the order that the columns appeared in the result might not have matched the original input. + +13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), and support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819). + +14. `NULL` columns are caught and error appropriately rather than segfault in some cases, [#2303](https://github.com/Rdatatable/data.table/issues/2303) [#2305](https://github.com/Rdatatable/data.table/issues/2305). Thanks to Hugh Parsonage and @franknarf1 for reporting. + +15. `melt` would error with 'factor malformed' or segfault in the presence of duplicate column names, [#1754](https://github.com/Rdatatable/data.table/issues/1754). Many thanks to @franknarf1, William Marble, wligtenberg and Toby Dylan Hocking for reproducible examples. All examples have been added to the test suite. + +16. Removing a column from a null (0-column) data.table is now a (standard and simpler) warning rather than error, [#2335](https://github.com/Rdatatable/data.table/issues/2335). It is no longer an error to add a column to a null (0-column) data.table. + +17. 
Non-UTF8 strings were not always sorted correctly on Windows (a regression in v1.12.0), [#3397](https://github.com/Rdatatable/data.table/issues/3397) [#3451](https://github.com/Rdatatable/data.table/issues/3451). Many thanks to @shrektan for reporting and fixing. + +18. `cbind` with a null (0-column) `data.table` now works as expected, [#3445](https://github.com/Rdatatable/data.table/issues/3445). Thanks to @mb706 for reporting. + +19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. + +## NOTES + +1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. + +2. v1.12.0 checked itself on loading using `tools::checkMD5sums("data.table")` but this check failed under the `packrat` package manager on Windows because `packrat` appears to modify the DESCRIPTION file of packages it has snapshot, [#3329](https://github.com/Rdatatable/data.table/issues/3329). This check is now removed. The `CdllVersion` check was introduced after the `checkMD5sums()` attempt and is better; e.g., reliable on all platforms. + +3. As promised in new feature 6 of v1.11.6 Sep 2018 (see below in this news file), the `datatable.CJ.names` option's default is now `TRUE`. In v1.13.0 it will be removed. + +4. Travis CI gains OSX using homebrew llvm for OpenMP support, [#3326](https://github.com/Rdatatable/data.table/issues/3326). Thanks @marcusklik for the PR. + +5. Calling `data.table:::print.data.table()` directly (i.e. bypassing method dispatch by using 3 colons) and passing it a 0-column `data.frame` (not `data.table`) now works, [#3363](https://github.com/Rdatatable/data.table/pull/3363). Thanks @heavywatal for the PR. + +6. v1.12.0 did not compile on Solaris 10 using Oracle Developer Studio 12.6, [#3285](https://github.com/Rdatatable/data.table/issues/3285). Many thanks to Prof Ripley for providing and testing a patch. For future reference and other package developers, a `const` variable should not be passed to OpenMP's `num_threads()` directive otherwise `left operand must be modifiable lvalue` occurs. This appears to be a compiler bug which is why the specific versions are mentioned in this note. + +7. `foverlaps` provides clearer error messages w.r.t. factor and POSIXct interval columns, [#2645](https://github.com/Rdatatable/data.table/issues/2645) [#3007](https://github.com/Rdatatable/data.table/issues/3007) [#1143](https://github.com/Rdatatable/data.table/issues/1143). Thanks to @sritchie73, @msummersgill and @DavidArenburg for the reports. + +8. `unique(DT)` checks up-front the types of all the columns and will fail if any column is type `list` even though those `list` columns may not be needed to establish uniqueness. Use `unique(DT, by=...)` to specify columns that are not type `list`. 
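    For example, with made-up data:

    ```R
    DT = data.table(id = c(1L, 1L, 2L), payload = list(1, 2, 3))
    unique(DT, by = "id")   # fine: uniqueness is established on 'id' only
    # unique(DT)            # errors up front because 'payload' is type 'list'
    ```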
v1.11.8 and before would also correctly fail with the same error, but not when uniqueness had been established in prior columns: it would stop early, not look at the `list` column and return the correct result. Checking up-front was necessary for some internal optimizations and it's probably best to be explicit anyway. Thanks to James Lamb for reporting, [#3332](https://github.com/Rdatatable/data.table/issues/3332). The error message has been embellished : + + ``` + Column 2 of by= (2) is type 'list', not yet supported. Please use the by= argument to specify + columns with types that are supported. + ``` + +9. Reminder that note 11 in v1.11.0 (May 2018) warned that `set2key()` and `key2()` will be removed in May 2019. They have been warning since v1.9.8 (Nov 2016) and their warnings were upgraded to errors in v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental'. + +10. The `key(DT)<-` form of `setkey()` has been warning since at least 2012 to use `setkey()`. The warning is now stronger: `key(x)<-value is deprecated and not supported. Please change to use setkey().`. This warning will be upgraded to error in one year. + + +# data.table v1.12.0 (13 Jan 2019) + +## NEW FEATURES + +1. `setDTthreads()` gains `restore_after_fork=`, [#2885](https://github.com/Rdatatable/data.table/issues/2885). The default `NULL` leaves the internal option unchanged which by default is `TRUE`. `data.table` has always switched to single-threaded mode on fork. It used to restore multithreading after a fork too but problems were reported on Mac and Intel OpenMP library (see 1.10.4 notes below). We are now trying again thanks to suggestions and success reported by Kun Ren and Mark Klik in package `fst`. If you experience problems with multithreading after a fork, please restart R and call `setDTthreads(restore_after_fork=FALSE)`. + +2. Subsetting, ordering and grouping now use more parallelism. See benchmarks [here](https://h2oai.github.io/db-benchmark/) and Matt Dowle's presentation in October 2018 on YouTube [here](https://youtu.be/Ddr8N9STSuI). These internal changes gave rise to 4 regressions which were found before release thanks to Kun Ren, [#3211](https://github.com/Rdatatable/data.table/issues/3211). He kindly volunteers to 'go-first' and runs data.table through his production systems before release. We are looking for a 'go-second' volunteer please. A request to test before release was tweeted on 17 Dec [here](https://twitter.com/MattDowle/status/1074746218645938176). As usual, all CRAN and Bioconductor packages using data.table (currently 750) have been tested against this release, [#3233](https://github.com/Rdatatable/data.table/issues/3233). There are now 8,000 tests in 13,000 lines of test code; more lines of test code than there is code. Overall coverage has increased to 94% thanks to Michael Chirico. + +3. New `frollmean` has been added by Jan Gorecki to calculate _rolling mean_, see `?froll` for documentation. Function name and arguments are experimental. Related to [#2778](https://github.com/Rdatatable/data.table/issues/2778) (and [#624](https://github.com/Rdatatable/data.table/issues/624), [#626](https://github.com/Rdatatable/data.table/issues/626), [#1855](https://github.com/Rdatatable/data.table/issues/1855)). Other rolling statistics will follow. + +4. `fread()` can now read a remote compressed file in one step; `fread("https://domain.org/file.csv.bz2")`. The `file=` argument now supports `.gz` and `.bz2` too; i.e. 
`fread(file="file.csv.gz")` works now where only `fread("file.csv.gz")` worked in 1.11.8. + +5. `nomatch=NULL` now does the same as `nomatch=0L` in both `DT[...]` and `foverlaps()`; i.e. discards missing values silently (inner join). The default is still `nomatch=NA` (outer join) for statistical safety so that missing values are retained by default. After several years have elapsed, we will start to deprecate `0L`; please start using `NULL`. In future `nomatch=.(0)` (note that `.()` creates a `list` type and is different to `nomatch=0`) will fill with `0` to save replacing `NA` with `0` afterwards, [#857](https://github.com/Rdatatable/data.table/issues/857). + +6. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR. + +7. `NA` in `between()` and `%between%`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than `NA`. This is now documented. + +8. `shift()` now interprets negative values of `n` to mean the opposite `type=`, [#1708](https://github.com/Rdatatable/data.table/issues/1708). When `give.names=TRUE` the result is named using a positive `n` with the appropriate `type=`. Alternatively, a new `type="shift"` names the result using a signed `n` and constant type. + + ```R + shift(x, n=-5:5, give.names=TRUE) => "_lead_5" ... "_lag_5" + shift(x, n=-5:5, type="shift", give.names=TRUE) => "_shift_-5" ... "_shift_5" + ``` + +9. `fwrite()` now accepts `matrix`, [#2613](https://github.com/Rdatatable/data.table/issues/2613). Thanks to Michael Chirico for the suggestion and Felipe Parages for implementing. For now matrix input is converted to data.table (which can be costly) before writing. + +10. `fread()` and `fwrite()` can now handle file names in native and UTF-8 encoding, [#3078](https://github.com/Rdatatable/data.table/issues/3078). Thanks to Daniel Possenriede (@dpprdan) for reporting and fixing. + +11. `DT[i]` and `DT[i,cols]` now call internal parallel subsetting code, [#2951](https://github.com/Rdatatable/data.table/issues/2951). Subsetting is significantly faster (as are many other operations) with factor columns rather than character. + + ```R + N = 2e8 # 4GB data on 4-core CPU with 16GB RAM + DT = data.table(ID = sample(LETTERS,N,TRUE), + V1 = sample(5,N,TRUE), + V2 = runif(N)) + w = which(DT$V1 > 3) # select 40% of rows + # v1.12.0 v1.11.8 + system.time(DT[w]) # 0.8s 2.6s + DT[, ID := as.factor(ID)] + system.time(DT[w]) # 0.4s 2.3s + system.time(DT[w, c("ID","V2")]) # 0.3s 1.9s + ``` + +12. `DT[..., .SDcols=]` now accepts `patterns()`; e.g. `DT[..., .SDcols=patterns("^V")]`, for filtering columns according to a pattern (as in `melt.data.table`), [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples. + +13. `split` data.table method will now preserve attributes, closes [#2047](https://github.com/Rdatatable/data.table/issues/2047). Thanks to @caneff for reporting. + +14. `DT[i,j]` now retains user-defined and inherited attributes, [#995](https://github.com/Rdatatable/data.table/issues/995); e.g. + + ```R + attr(datasets::BOD,"reference") # "A1.4, p. 270" + attr(as.data.table(datasets::BOD)[2],"reference") # was NULL now "A1.4, p. 
270" + ``` + + If a superclass defines attributes that may not be valid after a `[` subset then the superclass should implement its own `[` method to manage those after calling `NextMethod()`. + +## BUG FIXES + +1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. + +2. Column names that look like expressions (e.g. `"a<=colB"`) caused an error when used in `on=` even when wrapped with backticks, [#3092](https://github.com/Rdatatable/data.table/issues/3092). Additionally, `on=` now supports white spaces around operators; e.g. `on = "colA == colB"`. Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. + +3. Unmatched `patterns` in `measure.vars` fail early and with feedback, [#3106](https://github.com/Rdatatable/data.table/issues/3106). + +4. `fread(..., skip=)` now skips non-standard `\r` and `\n\r` line endings properly again, [#3006](https://github.com/Rdatatable/data.table/issues/3006). Standard line endings (`\n` Linux/Mac and `\r\n` Windows) were skipped ok. Thanks to @brattono and @tbrycekelly for providing reproducible examples, and @st-pasha for fixing. + +5. `fread(..., colClasses=)` could return a corrupted result when a lower type was requested for one or more columns (e.g. reading "3.14" as integer), [#2922](https://github.com/Rdatatable/data.table/issues/2922) [#2863](https://github.com/Rdatatable/data.table/issues/2863) [#3143](https://github.com/Rdatatable/data.table/issues/3143). It now ignores the request as documented and the helpful message in verbose mode is upgraded to warning. In future, coercing to a lower type might be supported (with warning if any accuracy is lost). `"NULL"` is recognized again in both vector and list mode; e.g. `colClasses=c("integer","NULL","integer")` and `colClasses=list(NULL=2, integer=10:40)`. Thanks to Arun Srinivasan, Kun Ren, Henri Ståhl and @kszela24 for reporting. + +6. `cube()` will now produce expected order of results, [#3179](https://github.com/Rdatatable/data.table/issues/3179). Thanks to @Henrik-P for reporting. + +7. `groupingsets()` groups by empty column set and constant value in `j`, [#3173](https://github.com/Rdatatable/data.table/issues/3173). + +8. `split.data.table()` failed if `DT` had a factor column named `"x"`, [#3151](https://github.com/Rdatatable/data.table/issues/3151). Thanks to @tdeenes for reporting and fixing. + +9. `fsetequal` now handles properly datasets having last column a character, closes [#2318](https://github.com/Rdatatable/data.table/issues/2318). Thanks to @pschil and @franknarf1 for reporting. + +10. `DT[..., .SDcols=integer(0L)]` could fail, [#3185](https://github.com/Rdatatable/data.table/issues/3185). An empty `data.table` is now returned correctly. + +11. `as.data.table.default` method will now always copy its input, closes [#3230](https://github.com/Rdatatable/data.table/issues/3230). Thanks to @NikdAK for reporting. + +12. `DT[..., .SDcols=integer()]` failed with `.SDcols is numeric but has both +ve and -ve indices`, [#1789](https://github.com/Rdatatable/data.table/issues/1789) and [#3185](https://github.com/Rdatatable/data.table/issues/3185). It now functions as `.SDcols=character()` has done and creates an empty `.SD`. Thanks to Gabor Grothendieck and Hugh Parsonage for reporting. 
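    For example:

    ```R
    DT = data.table(a = 1:3, b = 4:6)
    DT[, .SD, .SDcols = integer()]   # now an empty data.table, same as .SDcols = character()
    ```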
A related issue with empty `.SDcols` was fixed in development before release thanks to Kun Ren's testing, [#3211](https://github.com/Rdatatable/data.table/issues/3211). + +13. Multithreaded stability should be much improved with R 3.5+. Many thanks to Luke Tierney for pinpointing a memory issue with package `constellation` caused by `data.table` and his advice, [#3165](https://github.com/Rdatatable/data.table/issues/3165). Luke also added an extra check to R-devel when compiled with `--enable-strict-barrier`. The test suite is run through latest daily R-devel after every commit as usual, but now with `--enable-strict-barrier` on too via GitLab CI ("Extra" badge on the `data.table` homepage) thanks to Jan Gorecki. + +14. Fixed an edge-case bug of platform-dependent output of `strtoi("", base = 2L)` on which `groupingsets` had relied, [#3267](https://github.com/Rdatatable/data.table/issues/3267). + +## NOTES + +1. When data.table loads it now checks its DLL version against the version of its R level code. This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. + +2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. + +3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR. + +4. `update.dev.pkg()` gains `type=` to specify if update should be made from binaries, sources or both. [#3148](https://github.com/Rdatatable/data.table/issues/3148). Thanks to Reino Bruner for the detailed suggestions. + +5. `setDT()` improves feedback when passed a ragged list (i.e. where all columns in the list are not the same length), [#3121](https://github.com/Rdatatable/data.table/issues/3121). Thanks @chuk-yong for highlighting. + +6. The one and only usage of `UNPROTECT_PTR()` has been removed, [#3232](https://github.com/Rdatatable/data.table/issues/3232). Thanks to Tomas Kalibera's investigation and advice here: https://developer.r-project.org/Blog/public/2018/12/10/unprotecting-by-value/index.html + + +# data.table v1.11.8 (30 Sep 2018) + +## NEW FEATURES + +1. `fread()` can now read `.gz` and `.bz2` files directly: `fread("file.csv.gz")`, [#717](https://github.com/Rdatatable/data.table/issues/717) [#3058](https://github.com/Rdatatable/data.table/issues/3058). It uses `R.utils::decompressFile` to decompress to a `tempfile()` which is then read by `fread()` in the usual way. 
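    A minimal round-trip sketch, assuming the `R.utils` package is installed (it performs the decompression):

    ```R
    csv = tempfile(fileext = ".csv")
    fwrite(data.table(a = 1:3, b = letters[1:3]), csv)
    gz = R.utils::gzip(csv)    # compresses to "<csv>.gz" and removes the original
    fread(as.character(gz))    # decompressed to a temporary file, then read as usual
    ```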
For greater speed on large-RAM servers, it is recommended to use ramdisk for temporary files by setting `TMPDIR` to `/dev/shm` before starting R; see `?tempdir`. The decompressed temporary file is removed as soon as `fread` completes even if there is an error reading the file. Reading a remote compressed file in one step will be supported in the next version; e.g. `fread("https://domain.org/file.csv.bz2")`. + +## BUG FIXES + +1. Joining two keyed tables using `on=` to columns not forming a leading subset of `key(i)` could result in an invalidly keyed result, [#3061](https://github.com/Rdatatable/data.table/issues/3061). Subsequent queries on the result could then return incorrect results. A warning `longer object length is not a multiple of shorter object length` could also occur. Thanks to @renkun-ken for reporting and the PR. + +2. `keyby=` on columns for which an index exists now uses the index (new feature 7 in v1.11.6 below) but if an `i` subset is present in the same query then it could segfault, [#3062](https://github.com/Rdatatable/data.table/issues/3062). Again thanks to @renkun-ken for reporting. + +3. Assigning an out-of-range integer to an item in a factor column (a rare operation) correctly created an `NA` in that spot with warning, but now no longer also corrupts the variable being assigned, [#2984](https://github.com/Rdatatable/data.table/issues/2984). Thanks to @radfordneal for reporting and @MarkusBonsch for fixing. Assigning a string which is missing from the factor levels continues to automatically append the string to the factor levels. + +4. Assigning a sequence to a column using base R methods (e.g. `DT[["foo"]] = 1:10`) could cause subsetting to fail with `Internal error in subset.c: column is an ALTREP vector`, [#3051](https://github.com/Rdatatable/data.table/issues/3051). Thanks to Michel Lang for reporting. + +5. `as.data.table` `matrix` method now properly handles rownames for 0 column data.table output. Thanks @mllg for reporting. Closes [#3149](https://github.com/Rdatatable/data.table/issues/3149). + +## NOTES + +1. The test suite now turns on R's new _R_CHECK_LENGTH_1_LOGIC2_ to catch when internal use of `&&` or `||` encounter arguments of length more than one. Thanks to Hugh Parsonage for implementing and fixing the problems caught by this. + +2. Some namespace changes have been made with respect to melt, dcast and xts. No change is expected but if you do have any trouble, please file an issue. + +3. `split.data.table` was exported in v1.11.6 in addition to being registered using `S3method(split, data.table)`. The export has been removed again. It had been added because a user said they found it difficult to find, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But S3 methods are not normally exported explicitly by packages. The proper way to access the `split.data.table` method is to call `split(DT)` where `DT` is a `data.table`. The generic (`base::split` in this case) then dispatches to the `split.data.table` method. v1.11.6 was not on CRAN very long (1 week) so we think it's better to revert this change quickly. To know what methods exist, R provides the `methods()` function. + + ```R + methods(split) # all the methods for the split generic + methods(class="data.table") # all the generics that data.table has a method for (47 currently) + ``` + + +# data.table v1.11.6 (19 Sep 2018) + +## NEW FEATURES + +1. 
For convenience when some of the files in `fnams` are empty in `rbindlist(lapply(fnams,fread))`, `fread` now reads empty input as a null-data.table with warning rather than error, [#2898](https://github.com/Rdatatable/data.table/issues/2898). For consistency, `fwrite(data.table(NULL))` now creates an empty file and warns instead of error, too. + +2. `setcolorder(DT)` without further arguments now defaults to moving the key columns to be first, [#2895](https://github.com/Rdatatable/data.table/issues/2895). Thanks to @jsams for the PR. + +3. Attempting to subset on `col` when the column is actually called `Col` will still error, but the error message will helpfully suggest similarly-spelled columns, [#2887](https://github.com/Rdatatable/data.table/issues/2887). This is experimental, applies just to `i` currently, and we look forward to feedback. Thanks to Michael Chirico for the suggestion and PR. + +4. `fread()` has always accepted literal data; e.g. `fread("A,B\n1,2\n3,4")`. It now gains explicit `text=`; e.g. `fread(text="A,B\n1,2\n3,4")`. Unlike the first general purpose `input=` argument, the `text=` argument accepts multi-line input; e.g. `fread(text=c("A,B","1,2","3,4"))`, [#1423](https://github.com/Rdatatable/data.table/issues/1423). Thanks to Douglas Clark for the request and Hugh Parsonage for the PR. + +5. `fread()` has always accepted system commands; e.g. `fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app in not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run, and that would be a problem too. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvald's advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. 
Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. + +6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. Current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x` and is more akin to a bug fix due to the inconsistency with `data.table()`. + +7. If an appropriate index exists, `keyby=` will now use it. For example, given `setindex(DT,colA,colB)`, both `DT[,j,keyby=colA]` (a leading subset of the index columns) and `DT[,j,keyby=.(colA,colB)]` will use the index, but not `DT[,j,keyby=.(colB,colA)]`. The option `options(datatable.use.index=FALSE)` will turn this feature off. Please always use `keyby=` unless you wish to retain the order of groups by first-appearance order (in which case use `by=`). Also, both `keyby=` and `by=` already used the key where possible but are now faster when using just the first column of the key. As usual, setting `verbose=TRUE` either per-query or globally using `options(datatable.verbose=TRUE)` will report what's being done internally. + +## BUG FIXES + +1. `fread` now respects the order of columns passed to `select=` when column numbers are used, [#2986](https://github.com/Rdatatable/data.table/issues/2986). It already respected the order when column names are used. Thanks @privefl for raising the issue. + +2. `gmin` and `gmax` no longer fail on _ordered_ factors, [#1947](https://github.com/Rdatatable/data.table/issues/1947). Thanks to @mcieslik-mctp for identifying and @mbacou for the nudge. + +3. `as.ITime.character` now properly handles NA when attempting to detect the format of non-NA values in vector. Thanks @polyjian for reporting, closes [#2940](https://github.com/Rdatatable/data.table/issues/2940). + +4. `as.matrix(DT, rownames="id")` now works when `DT` has a single row, [#2930](https://github.com/Rdatatable/data.table/issues/2930). Thanks to @malcook for reporting and @sritchie73 for fixing. The root cause was the dual meaning of the `rownames=` argument: i) a single column name/number (most common), or ii) rowname values length 1 for the single row. For clarity and safety, `rownames.value=` has been added. Old usage (i.e. `length(rownames)>1`) continues to work for now but will issue a warning in a future release, and then error in a release after that. + +5. Fixed regression in v1.11.0 (May 2018) caused by PR [#2389](https://github.com/Rdatatable/data.table/pull/2389) which introduced partial key retainment on `:=` assigns. This broke the joining logic that assumed implicitly that assigning always drops keys completely. Consequently, join and subset results could be wrong when matching character to factor columns with existing keys, [#2881](https://github.com/Rdatatable/data.table/issues/2881). Thanks to @ddong63 for reporting and to @MarkusBonsch for fixing. 
Missing test added to ensure this doesn't arise again. + +6. `as.IDate.numeric` no longer ignores "origin", [#2880](https://github.com/Rdatatable/data.table/issues/2880). Thanks to David Arenburg for reporting and fixing. + +7. `as.ITime.times` was rounding fractional seconds while other methods were truncating, [#2870](https://github.com/Rdatatable/data.table/issues/2870). The `as.ITime` method gains `ms=` taking `"truncate"` (default), `"nearest"` and `"ceil"`. Thanks to @rossholmberg for reporting and Michael Chirico for fixing. + +8. `fwrite()` now writes POSIXct dates after 2038 correctly, [#2995](https://github.com/Rdatatable/data.table/issues/2995). Thanks to Manfred Zorn for reporting and Philippe Chataignon for the PR fixing it. + +9. `fsetequal` gains the `all` argument to make it consistent with the other set operator functions `funion`, `fsetdiff` and `fintersect` [#2968](https://github.com/Rdatatable/data.table/issues/2968). When `all = FALSE` `fsetequal` will treat rows as elements in a set when checking whether two `data.tables` are equal (i.e. duplicate rows will be ignored). For now the default value is `all = TRUE` for backwards compatibility, but this will be changed to `all = FALSE` in a future release to make it consistent with the other set operation functions. Thanks to @franknarf1 for reporting and @sritchie73 for fixing. + +10. `fintersect` failed on tables with a column called `y`, [#3034](https://github.com/Rdatatable/data.table/issues/3034). Thanks to Maxim Nazarov for reporting. + +11. Compilation fails in AIX because NAN and INFINITY macros definition in AIX make them not constant literals, [#3043](https://github.com/Rdatatable/data.table/pull/3043). Thanks to Ayappan for reporting and fixing. + +12. The introduction of altrep in R 3.5.0 caused some performance regressions of about 20% in some cases, [#2962](https://github.com/Rdatatable/data.table/issues/2962). Investigating this led to some improvements to grouping which are faster than before R 3.5.0 in some cases. Thanks to Nikolay S. for reporting. The work to accomodate altrep is not complete but it is better and it is highly recommended to upgrade to this update. + +13. Fixed 7 memory faults thanks to CRAN's [`rchk`](https://github.com/kalibera/rchk) tool by Tomas Kalibera, [#3033](https://github.com/Rdatatable/data.table/pull/3033). + +## NOTES + +1. The type coercion warning message has been improved, [#2989](https://github.com/Rdatatable/data.table/pull/2989). Thanks to @sarahbeeysian on Twitter for highlighting. For example, given the follow statements: + + ```R + DT = data.table(id=1:3) + DT[2, id:="foo"] + ``` + + the warning message has changed from : + + ``` + Coerced character RHS to integer to match the column's type. Either change the target column + ['id'] to character first (by creating a new character vector length 3 (nrows of entire table) and + assign that; i.e. 'replace' column), or coerce RHS to integer (e.g. 1L, NA_[real|integer]_, as.*, + etc) to make your intent clear and for speed. Or, set the column type correctly up front when you + create the table and stick to it, please. + ``` + + to : + + ``` + Coerced character RHS to integer to match the type of the target column (column 1 named 'id'). If + the target column's type integer is correct, it's best for efficiency to avoid the coercion and + create the RHS as type integer. To achieve that consider the L postfix: typeof(0L) vs typeof(0), + and typeof(NA) vs typeof(NA_integer_) vs typeof(NA_real_). 
Wrapping the RHS with as.integer() will + avoid this warning but still perform the coercion. If the target column's type is not correct, it + is best to revisit where the DT was created and fix the column type there; e.g., by using + colClasses= in fread(). Otherwise, you can change the column type now by plonking a new column (of + the desired type) over the top of it; e.g. DT[, `id`:=as.character(`id`)]. If the RHS of := has + nrow(DT) elements then the assignment is called a column plonk and is the way to change a column's + type. Column types can be observed with sapply(DT,typeof). + ``` + + Further, if a coercion from double to integer is performed, fractional data such as 3.14 is now detected and the truncation to 3 is warned about if and only if truncation has occurred. + + ```R + DT = data.table(v=1:3) + DT[2, v:=3.14] + Warning message: + Coerced double RHS to integer to match the type of the target column (column 1 named 'v'). One + or more RHS values contain fractions which have been lost; e.g. item 1 with value 3.140000 has + been truncated to 3. + ``` + +2. `split.data.table` method is now properly exported, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But we don't recommend it because `split` copies all the pieces into new memory. + +3. Setting indices on columns which are part of the key will now create those indices. + +4. `hour`, `minute`, and `second` utility functions use integer arithmetic when the input is already (explicitly) UTC-based `POSIXct` for 4-10x speedup vs. using `as.POSIXlt`. + +5. Error added for incorrect usage of `%between%`, with some helpful diagnostic hints, [#3014](https://github.com/Rdatatable/data.table/issues/3014). Thanks @peterlittlejohn for offering his user experience and providing the impetus. + + +# data.table v1.11.4 (27 May 2018) + +1. Empty RHS of `:=` is no longer an error when the `i` clause returns no rows to assign to anyway, [#2829](https://github.com/Rdatatable/data.table/issues/2829). Thanks to @cguill95 for reporting and to @MarkusBonsch for fixing. + +2. Fixed runaway memory usage with R-devel (R > 3.5.0), [#2882](https://github.com/Rdatatable/data.table/pull/2882). Thanks to many people but in particular to Trang Nguyen for making the breakthrough reproducible example, Paul Bailey for liaising, and Luke Tierney for then pinpointing the issue. It was caused by an interaction of two or more data.table threads operating on new compact vectors in the ALTREP framework, such as the sequence `1:n`. This interaction could result in R's garbage collector turning off, and hence the memory explosion. Problems may occur in R 3.5.0 too but we were only able to reproduce in R > 3.5.0. The R code in data.table's implementation benefits from ALTREP (`for` loops in R no longer allocate their range vector input, for example) but are not so appropriate as data.table columns. Sequences such as `1:n` are common in test data but not very common in real-world datasets. Therefore, there is no need for data.table to support columns which are ALTREP compact sequences. The `data.table()` function already expanded compact vectors (by happy accident) but `setDT()` did not (it now does). If, somehow, a compact vector still reaches the internal parallel regions, a helpful error will now be generated. If this happens, please report it as a bug. + +3. Tests 1590.3 & 1590.4 now pass when users run `test.data.table()` on Windows, [#2856](https://github.com/Rdatatable/data.table/pull/2856). Thanks to Avraham Adler for reporting. 
Those tests were passing on AppVeyor, win-builder and CRAN's Windows because `R CMD check` sets `LC_COLLATE=C` as documented in R-exts$1.3.1, whereas by default on Windows `LC_COLLATE` is usually a regional Windows-1252 dialect such as `English_United States.1252`. + +4. Around 1 billion very small groups (of size 1 or 2 rows) could result in `"Failed to realloc working memory"` even when plenty of memory is available, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks once again to @jsams for the detailed report as a follow up to bug fix 40 in v1.11.0. + + +# data.table v1.11.2 (08 May 2018) + +1. `test.data.table()` created/overwrote variable `x` in `.GlobalEnv`, [#2828](https://github.com/Rdatatable/data.table/issues/2828); i.e. a modification of user's workspace which is not allowed. Thanks to @etienne-s for reporting. + +2. `as.chron` methods for `IDate` and `ITime` have been removed, [#2825](https://github.com/Rdatatable/data.table/issues/2825). `as.chron` still works since `IDate` inherits from `Date`. We are not sure why we had specific methods in the first place. It may have been from a time when `IDate` did not inherit from `Date`, perhaps. Note that we don't use `chron` ourselves in our own work. + +3. Fixed `SETLENGTH() cannot be applied to an ALTVEC object` starting in R-devel (R 3.6.0) on 1 May 2018, a few hours after 1.11.0 was accepted on CRAN, [#2820](https://github.com/Rdatatable/data.table/issues/2820). Many thanks to Luke Tierney for pinpointing the problem. + +4. Fixed some rare memory faults in `fread()` and `rbindlist()` found with `gctorture2()` and [`rchk`](https://github.com/kalibera/rchk), [#2841](https://github.com/Rdatatable/data.table/issues/2841). + + +# data.table v1.11.0 (01 May 2018) + +## NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES + +1. `fread()`'s `na.strings=` argument : + + ```R + "NA" # old default + getOption("datatable.na.strings", "NA") # this release; i.e. the same; no change yet + getOption("datatable.na.strings", "") # future release + ``` + + This option controls how `,,` is read in character columns. It does not affect numeric columns which read `,,` as `NA` regardless. We would like `,,`=>`NA` for consistency with numeric types, and `,"",`=>empty string to be the standard default for `fwrite/fread` character columns so that `fread(fwrite(DT))==DT` without needing any change to any parameters. `fwrite` has never written `NA` as `"NA"` in case `"NA"` is a valid string in the data; e.g., 2 character id columns sometimes do. Instead, `fwrite` has always written `,,` by default for an `` in a character columns. The use of R's `getOption()` allows users to move forward now, using `options(datatable.fread.na.strings="")`, or restore old behaviour when the default's default is changed in future, using `options(datatable.fread.na.strings="NA")`. + +2. `fread()` and `fwrite()`'s `logical01=` argument : + + ```R + logical01 = FALSE # old default + getOption("datatable.logical01", FALSE) # this release; i.e. the same; no change yet + getOption("datatable.logical01", TRUE) # future release + ``` + + This option controls whether a column of all 0's and 1's is read as `integer`, or `logical` directly to avoid needing to change the type afterwards to `logical` or use `colClasses`. `0/1` is smaller and faster than `"TRUE"/"FALSE"`, which can make a significant difference to space and time the more `logical` columns there are. 
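    For illustration, using literal input:

    ```R
    fread("a,b\n0,x\n1,y\n", logical01 = FALSE)   # column 'a' read as integer (current default)
    fread("a,b\n0,x\n1,y\n", logical01 = TRUE)    # column 'a' read as logical
    ```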
    When the default's default changes to `TRUE` for `fread` we do not expect much impact, since arithmetic operators that currently receive 0's and 1's as type `integer` (think `sum()`) and would instead receive `logical` return exactly the same result on the 0's and 1's. However, code that manipulates column types using `is.integer` or `is.logical` on `fread`'s result could require change. It could be painful if `DT[(logical_column)]` (i.e. `DT[logical_column==TRUE]`) changed behaviour due to `logical_column` no longer being type `logical` but `integer`. But that is not the change proposed. The change is the other way around; i.e., a previously `integer` column holding only 0's and 1's would now be type `logical`. Since it's that way around, we believe the scope for breakage is limited. We think a lot of code is converting 0/1 integer columns to logical anyway, either using `colClasses=` or afterwards with an assign. For `fwrite`, the level of breakage depends on the consumer of the output file. We believe `0/1` is a better, more standard default choice to move to. See the notes below about improvements to `fread`'s sampling for type guessing, and automatic rereading in the rare cases of out-of-sample type surprises.

These options are meant for temporary use to aid your migration, [#2652](https://github.com/Rdatatable/data.table/pull/2652). You are not meant to set them to the old default and then not migrate your code that is dependent on the default. Either set the argument explicitly so your code is not dependent on the default, or change the code to cope with the new default. Over the next few years we will slowly start to remove these options, warning you if you are using them, and return to a simple default. See the history of NEWS and NEWS.0 for past migrations that have, generally speaking, been successfully managed in this way. For example, at the end of NOTES for this version (below in this file) is a note about the usage of `datatable.old.unique.by.key` now warning, as you were warned it would do over a year ago. When that change was introduced, the default was changed and that option was provided to restore the old behaviour. These `fread`/`fwrite` changes are even more cautious and are not even changing the default's default yet. This notice gives you extra warning to move forward, and a chance to object.

## NEW FEATURES

1. `fread()`:
    * Efficiency savings at C level including **parallelization** announced [here](https://github.com/Rdatatable/data.table/wiki/talks/BARUG_201704_ParallelFread.pdf); e.g. a 9GB 2-column integer csv input is **50s down to 12s** to cold load on a 4-core laptop with 16GB RAM and SSD. Run `echo 3 >/proc/sys/vm/drop_caches` first to measure cold load time. Subsequent load time (after the file has been cached by the OS on the first run) is **40s down to 6s**.
    * The [fread for small data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread) page has been revised.
    * Memory maps lazily; e.g. reading just the first 10 rows with `nrow=10` is **12s down to 0.01s** from cold for the 9GB file. Large files close to your RAM limit may work more reliably too. The progress meter will commence sooner and more consistently.
    * `fread` has always jumped to the middle and to the end of the file for a much improved column type guess. The sample size is increased from 100 rows at 10 jump points (1,000 rows) to 100 rows at 100 jump points (10,000 row sample).
      In the rare case of there still being out-of-sample type exceptions, those columns are now *automatically reread* so you don't have to use `colClasses` yourself.
    * Large number of columns support; e.g. **12,000 columns** tested.
    * **Quoting rules** are more robust and flexible. See point 10 on the wiki page [here](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread#10-automatic-quote-escape-method-detection-including-no-escape).
    * Numeric data that has been quoted is now detected and read as numeric.
    * The ability to position `autostart` anywhere inside one of multiple tables in a single file is removed with warning. It used to search upwards from that line to find the start of the table based on a consistent number of columns. People appear to be using `skip="string"` or `skip=nrow` to find the header row exactly, which is retained and simpler. It was too difficult to retain search-upwards-autostart together with skipping/filling blank lines, filling incomplete rows and parallelization too. If there is any header info above the column names, it is still auto-detected and auto-skipped (particularly useful when loading a set of files where the column names start on different lines due to a varying-height messy header).
    * `dec=','` is now implemented directly so there is no dependency on locale. The options `datatable.fread.dec.experiment` and `datatable.fread.dec.locale` have been removed.
    * `\r\r\n` line endings are now handled, such as those produced by `base::download.file()` when it doubles up `\r`. Other rare line endings (`\r` and `\n\r`) are now more robust.
    * Mixed line endings are now handled; e.g. a file formed by concatenating a Unix file and a Windows file so that some lines end with `\n` while others end with `\r\n`.
    * Improved automatic detection of whether the first row is column names by comparing the types of the fields on the first row against the column types ascertained by the 10,000 row sample (or `colClasses` if provided). If a numeric column has a string value at the top, then column names are deemed present.
    * Detects GB-18030 and UTF-16 encodings and in verbose mode prints a message about BOM detection.
    * Detects and ignores a trailing ^Z end-of-file control character sometimes created on MS DOS/Windows, [#1612](https://github.com/Rdatatable/data.table/issues/1612). Thanks to Gergely Daróczi for reporting and providing a file.
    * Added the ability to recognize and parse hexadecimal floating point numbers, as used for example in Java. Thanks to @scottstanfield [#2316](https://github.com/Rdatatable/data.table/issues/2316) for the report.
    * Now handles floating-point NaN values in a wide variety of formats, including `NaN`, `sNaN`, `1.#QNAN`, `NaN1234`, `#NUM!` and others, [#1800](https://github.com/Rdatatable/data.table/issues/1800). Thanks to Jori Liesenborgs for highlighting and the PR.
    * If negative numbers are passed to `select=` the out-of-range error now suggests `drop=` instead, [#2423](https://github.com/Rdatatable/data.table/issues/2423). Thanks to Michael Chirico for the suggestion.
    * `sep=NULL` or `sep=""` (i.e., no column separator) can now be used to specify single-column input reliably like `base::readLines`, [#1616](https://github.com/Rdatatable/data.table/issues/1616). `sep='\n'` still works (even on Windows where the line ending is actually `\r\n`) but `NULL` or `""` are now documented and recommended. Thanks to Dmitriy Selivanov for the pull request and many others for comments.
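      For example, a minimal sketch of single-column reading (the inline input is made up):

      ```R
      library(data.table)
      txt = "alpha\nbeta\ngamma\n"
      fread(txt, sep="", header=FALSE)   # one character column, same as sep=NULL; header=FALSE so the
                                         # first value is kept as data rather than used as a column name
      ```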
      As before, `sep=NA` is not valid; use the default `"auto"` for automatic separator detection. `sep='\n'` is now deprecated and in future will start to warn when used.
    * Single-column input with blank lines is now valid and the blank lines are significant (representing `NA`). The blank lines are significant even at the very end, which may be surprising at first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing `NA` which are written as blank. There is no change when `ncol>1`; i.e., input stops with a detailed warning at the first blank line, because a blank line when `ncol>1` is invalid input due to no separators being present. Thanks to @skanskan, Michael Chirico, @franknarf1 and Pasha for the testing and discussions, [#2106](https://github.com/Rdatatable/data.table/issues/2106).
    * Too few column names are now auto-filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning.
    * `skip=` and `nrow=` are more reliable and are no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267).
    * RAM disk (`/dev/shm`) is no longer used for the output of system command input. Although faster when it worked, it was causing too many device-full errors; e.g., [#1139](https://github.com/Rdatatable/data.table/issues/1139) and [zUMIs/19](https://github.com/sdparekh/zUMIs/issues/19). Thanks to Kyle Chung for reporting. Standard `tempdir()` is now used. If you wish to use a RAM disk, set TEMPDIR to `/dev/shm`; see `?tempdir`.
    * Detecting whether a very long input string is a file name or data is now much faster, [#2531](https://github.com/Rdatatable/data.table/issues/2531). Many thanks to @javrucebo for the detailed report, benchmarks and suggestions.
    * A column of `TRUE/FALSE`s is ok, as well as `True/False`s and `true/false`s, but mixing styles (e.g. `TRUE/false`) is not and will be read as type `character`.
    * New argument `index` to complement the existing `key` argument, for applying secondary orderings out of the box for convenience, [#2633](https://github.com/Rdatatable/data.table/issues/2633).
    * A warning is now issued whenever incorrectly quoted fields have been detected and fixed using a non-standard quote rule. `fread` has always used these advanced rules but now it warns that it is using them. Most file writers correctly quote fields if the field contains the field separator, but a common error is not to also quote fields that contain a quote and then escape those quotes, particularly if that quote occurs at the start of the field. The ability to detect and fix such files is referred to as self-healing. Ambiguities are resolved using the knowledge that the number of columns is constant, and therefore this ability is not available when `fill=TRUE`. This feature can be improved in future by using column type consistency as well as the number of fields.
      For example:

      ```R
      txt = 'A,B\n1,hello\n2,"howdy" said Joe\n3,bonjour\n'
      cat(txt)
      # A,B
      # 1,hello
      # 2,"howdy" said Joe
      # 3,bonjour
      fread(txt)
         A                B
      1: 1            hello
      2: 2 "howdy" said Joe
      3: 3          bonjour
      Warning message:
      In fread(txt) : Found and resolved improper quoting
      ```

    * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney, Ananda Mahto, @memoryfull, @brandenkmurray for testing dev and reporting these regressions before release to CRAN: #1464, #1671, #1888, #1895, #2070, #2073, #2087, #2091, #2092, #2107, #2118, #2123, #2167, #2194, #2196, #2201, #2222, #2228, #2238, #2246, #2251, #2265, #2267, #2285, #2287, #2299, #2322, #2347, #2352, #2370, #2371, #2395, #2404, #2446, #2453, #2457, #2464, #2481, #2499, #2512, #2515, #2516, #2518, #2520, #2523, #2526, #2535, #2542, #2548, #2561, #2600, #2625, #2666, #2697, #2735, #2744.

2. `fwrite()`:
    * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215).
    * `logical01` has been added and the old name `logicalAsInt` retained. Please move to the new name when convenient for you. The old argument name (`logicalAsInt`) will slowly be deprecated over the next few years. The default is unchanged: `FALSE`, so `logical` is still written as `"TRUE"`/`"FALSE"` in full by default. We intend to change the default's default in future to `TRUE`; see the notice at the top of these release notes.

3. Added a helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR.

4. `tables` gains an `index` argument for supplementary metadata about `data.table`s in memory (or any optionally specified environment), part of [#1648](https://github.com/Rdatatable/data.table/issues/1648). Thanks due variously to @jangorecki, @rsaporta, @MichaelChirico for ideas and work towards the PR.

5. Improved auto-detection of `character` inputs' formats to `as.ITime` to mirror the logic in `as.POSIXlt.character`, [#1383](https://github.com/Rdatatable/data.table/issues/1383). Thanks @franknarf1 for identifying a discrepancy and @MichaelChirico for investigating.

6. `setcolorder()` now accepts fewer than `ncol(DT)` columns to be moved to the front, [#592](https://github.com/Rdatatable/data.table/issues/592). Thanks @MichaelChirico for the PR. This also incidentally fixed [#2007](https://github.com/Rdatatable/data.table/issues/2007) whereby explicitly setting `select = NULL` in `fread` errored; thanks to @rcapell for reporting that and @dselivanov and @MichaelChirico for investigating and providing a new test.

7. Three new *Grouping Sets* functions: `rollup`, `cube` and `groupingsets`, [#1377](https://github.com/Rdatatable/data.table/issues/1377). These allow aggregation at several grouping levels at once, producing sub-totals and a grand total; see the sketch below.
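    A minimal sketch (the small table and column names are made up for illustration):

    ```R
    library(data.table)
    DT = data.table(region  = c("N","N","S","S"),
                    product = c("a","b","a","b"),
                    amount  = c(10, 20, 30, 40))
    rollup(DT, j = sum(amount), by = c("region","product"))   # sub-totals by region plus a grand total
    cube(DT,   j = sum(amount), by = c("region","product"))   # every combination of the grouping columns
    groupingsets(DT, j = sum(amount), by = c("region","product"),
                 sets = list("region", c("region","product"), character()))  # explicit sets; character() is the grand total
    ```

8. 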
`as.data.table()` gains a new method for `array`s to return a useful data.table, [#1418](https://github.com/Rdatatable/data.table/issues/1418).

9. `print.data.table()` (all via master issue [#1523](https://github.com/Rdatatable/data.table/issues/1523)):

    * gains a `print.keys` argument, `FALSE` by default, which displays the keys and/or indices (secondary keys) of a `data.table`. Thanks @MichaelChirico for the PR, Yike Lu for the suggestion and Arun for honing that idea to its present form.

    * gains a `col.names` argument, `"auto"` by default, which toggles which registers of column names to include in printed output. `"top"` forces `data.frame`-like behavior where column names are only ever included at the top of the output, as opposed to the default behavior which appends the column names below the output as well for longer (>20 rows) tables. `"none"` shuts down column name printing altogether. Thanks @MichaelChirico for the PR, Oleg Bondar for the suggestion, and Arun for guiding commentary.

    * list columns would print the first 6 items in each cell followed by a comma if there are more than 6 in that cell. Now it ends ",..." to make it clearer, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). Thanks to @franknarf1 for drawing attention to an issue raised on Stack Overflow by @TMOTTM [here](https://stackoverflow.com/q/47679701).

10. `setkeyv` accelerated if the key already exists, [#2331](https://github.com/Rdatatable/data.table/issues/2331). Thanks to @MarkusBonsch for the PR.

11. Keys and indexes are now partially retained up to the key column assigned to with `:=`, [#2372](https://github.com/Rdatatable/data.table/issues/2372). They used to be dropped completely if any one of the columns was affected by `:=`. Thanks to @MarkusBonsch for the PR.

12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR.

13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR, and thanks to @mgahan for pointing out a reversion in `na.omit.data.table` before release, [#2660](https://github.com/Rdatatable/data.table/issues/2660#issuecomment-371027948).

14. `uniqueN()` is now faster on logical vectors. Thanks to Hugh Parsonage for [PR#2648](https://github.com/Rdatatable/data.table/pull/2648).

    ```R
    N = 1e9
                                       #  was    now
    x = c(TRUE,FALSE,NA,rep(TRUE,N))
    uniqueN(x) == 3                    # 5.4s  0.00s
    x = c(TRUE,rep(FALSE,N), NA)
    uniqueN(x,na.rm=TRUE) == 2         # 5.4s  0.00s
    x = c(rep(TRUE,N),FALSE,NA)
    uniqueN(x) == 3                    # 6.7s  0.38s
    ```

15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation.

16. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for the ultimate implementation.

17. `update.dev.pkg` is a new function to update a package from its development repository; it downloads the package sources only when a newer commit is available in the repository.
`data.table::update.dev.pkg()` defaults updates `data.table`, but any package can be used. + +18. Item 1 in NEWS for [v1.10.2](https://github.com/Rdatatable/data.table/blob/master/NEWS.md#changes-in-v1102--on-cran-31-jan-2017) on CRAN in Jan 2017 included : + + > When j is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. + > When you see the `..` prefix think one-level-up, like the directory `..` in all operating systems means the parent directory. + > In future the `..` prefix could be made to work on all symbols apearing anywhere inside `DT[...]`. + + The response has been positive ([this tweet](https://twitter.com/MattDowle/status/967290562725359617) and [FR#2655](https://github.com/Rdatatable/data.table/issues/2655)) and so this prefix is now expanded to all symbols appearing in `j=` as a first step; e.g. + + ```R + cols = "colB" + DT[, c(..cols, "colC")] # same as DT[, .(colB,colC)] + DT[, -..cols] # all columns other than colB + ``` + + Thus, `with=` should no longer be needed in any cases. Please change to using the `..` prefix and over the next few years we will start to formally deprecate and remove the `with=` parameter. If this is well received, the `..` prefix could be expanded to symbols appearing in `i=` and `by=`, too. Note that column names should not now start with `..`. If a symbol `..var` is used in `j=` but `..var` exists as a column name, the column still takes precedence, for backwards compatibility. Over the next few years, data.table will start issuing warnings/errors when it sees column names starting with `..`. This affects one CRAN package out of 475 using data.table, so we do not believe this restriction to be unreasonable. Our main focus here which we believe `..` achieves is to resolve the more common ambiguity when `var` is in calling scope and `var` is a column name too. Further, we have not forgotten that in the past we recommended prefixing the variable in calling scope with `..` yourself. If you did that and `..var` exists in calling scope, that still works, provided neither `var` exists in calling scope nor `..var` exists as a column name. Please now remove the `..` prefix on `..var` in calling scope to tidy this up. In future data.table will start to warn/error on such usage. + +19. `setindexv` can now assign multiple (separate) indices by accepting a `list` in the `cols` argument. + +20. `as.matrix.data.table` method now has an additional `rownames` argument allowing for a single column to be used as the `rownames` after conversion to a `matrix`. Thanks to @sritchie73 for the suggestion, use cases, [#2692](https://github.com/Rdatatable/data.table/issues/2692) and implementation [PR#2702](https://github.com/Rdatatable/data.table/pull/2702) and @MichaelChirico for additional use cases. + +## BUG FIXES + +1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding + Great Stocks\"", I discussed the value of stock screeners as a powerful tool"`, [#2051](https://github.com/Rdatatable/data.table/issues/2051). Thanks to @scarrascoso for reporting. Example file added to test suite. + +2. `fwrite()` creates a file with permissions that now play correctly with `Sys.umask()`, [#2049](https://github.com/Rdatatable/data.table/issues/2049). Thanks to @gnguy for reporting. + +3. 
`fread()` no longer holds an open lock on the file when a line outside the large sample has too many fields and generates an error, [#2044](https://github.com/Rdatatable/data.table/issues/2044). Thanks to Hugh Parsonage for reporting.

4. Setting `j = {}` no longer results in an error, [#2142](https://github.com/Rdatatable/data.table/issues/2142). Thanks Michael Chirico for the pull request.

5. Segfault in `rbindlist()` when one or more items are empty, [#2019](https://github.com/Rdatatable/data.table/issues/2019). Thanks Michael Lang for the pull request. Another segfault if the result would be more than 2bn rows, thanks to @jsams's comment in [#2340](https://github.com/Rdatatable/data.table/issues/2340#issuecomment-331505494).

6. Error printing 0-length `ITime` and `NA` objects, [#2032](https://github.com/Rdatatable/data.table/issues/2032) and [#2171](https://github.com/Rdatatable/data.table/issues/2171). Thanks Michael Chirico for the pull requests and @franknarf1 for pointing out a shortcoming of the initial fix.

7. `as.IDate.POSIXct` error with `NULL` timezone, [#1973](https://github.com/Rdatatable/data.table/issues/1973). Thanks @lbilli for reporting and Michael Chirico for the pull request.

8. Printing a null `data.table` with `print` no longer visibly outputs `NULL`, [#1852](https://github.com/Rdatatable/data.table/issues/1852). Thanks @aaronmcdaid for spotting and @MichaelChirico for the PR.

9. `data.table` now works with Shiny Reactivity / Flexdashboard. The error was typically something like `col not found` in `DT[col==val]`. Thanks to Dirk Eddelbuettel for leading Matt through reproducible steps and @sergeganakou and Richard White for reporting. Closes [#2001](https://github.com/Rdatatable/data.table/issues/2001) and [shiny/#1696](https://github.com/rstudio/shiny/issues/1696).

10. The `as.IDate.POSIXct` method passed `tzone` along but was not exported. So `tzone` is now taken into account by `as.IDate` as well as `IDateTime`, [#977](https://github.com/Rdatatable/data.table/issues/977) and [#1498](https://github.com/Rdatatable/data.table/issues/1498). Tests added.

11. A named logical vector now selects rows as expected from a single-row data.table. Thanks to @skranz for reporting. Closes [#2152](https://github.com/Rdatatable/data.table/issues/2152).

12. `fread()`'s rare `Internal error: Sampling jump point 10 is before the last jump ended` has been fixed, [#2157](https://github.com/Rdatatable/data.table/issues/2157). Thanks to Frank Erickson and Artem Klevtsov for reporting with example files which are now added to the test suite.

13. `CJ()` no longer loses attribute information, [#2029](https://github.com/Rdatatable/data.table/issues/2029). Thanks to @MarkusBonsch and @royalts for the pull request.

14. `split.data.table` respects `factor` ordering in the `by` argument, [#2082](https://github.com/Rdatatable/data.table/issues/2082). Thanks to @MichaelChirico for identifying and fixing the issue.

15. `.SD` would incorrectly include the symbol on the lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR, and @ProfFancyPants for reporting a regression introduced in the PR. Closes [#2326](https://github.com/Rdatatable/data.table/issues/2326) and [#2338](https://github.com/Rdatatable/data.table/issues/2338).

16. Integer values that are too large to fit in `int64` will now be read as strings, [#2250](https://github.com/Rdatatable/data.table/issues/2250).
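    A minimal sketch (illustrative only; exact type detection for in-range large integers also depends on whether the `bit64` package is installed):

    ```R
    library(data.table)
    DT = fread("id,big\n1,123456789012345678901234567890\n")
    class(DT$big)   # "character": the 30-digit value cannot be represented as int64, so it is kept as a string
    ```

17. 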
Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). + +18. `isoweek` calculation is correct regardless of local timezone setting (`Sys.timezone()`), [#2407](https://github.com/Rdatatable/data.table/issues/2407). Thanks to @MoebiusAV and @SimonCoulombe for reporting and @MichaelChirico for fixing. + +19. Fixed `as.xts.data.table` to support all xts supported time based index clasess [#2408](https://github.com/Rdatatable/data.table/issues/2408). Thanks to @ebs238 for reporting and for the PR. + +20. A memory leak when a very small number such as `0.58E-2141` is bumped to type `character` is resolved, [#918](https://github.com/Rdatatable/data.table/issues/918). + +21. The edge case `setnames(data.table(), character(0))` now works rather than error, [#2452](https://github.com/Rdatatable/data.table/issues/2452). + +22. Order of rows returned in non-equi joins were incorrect in certain scenarios as reported under [#1991](https://github.com/Rdatatable/data.table/issues/1991). This is now fixed. Thanks to @Henrik-P for reporting. + +23. Non-equi joins work as expected when `x` in `x[i, on=...]` is a 0-row data.table. Closes [#1986](https://github.com/Rdatatable/data.table/issues/1986). + +24. Non-equi joins along with `by=.EACHI` returned incorrect result in some rare cases as reported under [#2360](https://github.com/Rdatatable/data.table/issues/2360). This is fixed now. This fix also takes care of [#2275](https://github.com/Rdatatable/data.table/issues/2275). Thanks to @ebs238 for the nice minimal reproducible report, @Mihael for asking on SO and to @Frank for following up on SO and filing an issue. + +25. `by=.EACHI` works now when `list` columns are being returned and some join values are missing, [#2300](https://github.com/Rdatatable/data.table/issues/2300). Thanks to @jangorecki and @franknarf1 for the reproducible examples which have been added to the test suite. + +26. Indices are now retrieved by exact name, [#2465](https://github.com/Rdatatable/data.table/issues/2465). This prevents usage of wrong indices as well as unexpected row reordering in join results. Thanks to @pannnda for reporting and providing a reproducible example and to @MarkusBonsch for fixing. + +27. `setnames` of whole table when original table had `NA` names skipped replacing those, [#2475](https://github.com/Rdatatable/data.table/issues/2475). Thanks to @franknarf1 and [BenoitLondon on StackOverflow](https://stackoverflow.com/questions/47228836/) for the report and @MichaelChirico for fixing. + +28. `CJ()` works with multiple empty vectors now [#2511](https://github.com/Rdatatable/data.table/issues/2511). Thanks to @MarkusBonsch for fixing. + +29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again. 
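    For example, a small sketch of the now-correct behaviour (column names made up):

    ```R
    library(data.table)
    DT = data.table(a = 1:5)
    DT[, c("x", "y") := 1:5]   # one vector assigned to two columns; it is now copied for each
    DT[1, x := 100L]           # updating x by reference...
    DT$y[1]                    # ...no longer also changes y; still 1
    ```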
30. "Negative length vectors not allowed" error when grouping `median` and `var` fixed, [#2046](https://github.com/Rdatatable/data.table/issues/2046) and [#2111](https://github.com/Rdatatable/data.table/issues/2111). Thanks to @caneff and @osofr for reporting and to @kmillar for debugging and explaining the cause.

31. Fixed a bug on Windows where `data.table`s containing non-UTF8 strings in `key`s were not properly sorted, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing.

32. `x.` prefixes during joins sometimes resulted in a "column not found" error. This is now fixed. Closes [#2313](https://github.com/Rdatatable/data.table/issues/2313). Thanks to @franknarf1 for the MRE.

33. `setattr()` no longer segfaults when setting 'class' to an empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing.

34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`. `merge.data.table()` gains the `no.dups` argument (default TRUE) to match the corresponding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)`, the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons. In addition, where duplicate column names arise anyway (i.e. `suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing, [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653).

35. `CJ()` now fails with a proper error message when the result would exceed the maximum integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636).

36. `NA` in character columns now displays as `<NA>` just like base R, to distinguish it from `""` and `"NA"`.

37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708).

38. Fixed a bug on Windows where `data.table` could break if garbage collection was triggered while sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing, [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674).

39. Internal aliasing of `.` to `list` was over-aggressive in applying `list` even when `.` was intended within `bquote`, [#1912](https://github.com/Rdatatable/data.table/issues/1912). Thanks @MichaelChirico for reporting/filing and @ecoRoland for suggesting and testing a fix.

40. Attempt to allocate a wildly large amount of RAM (16EB) when grouping by key and there are close to 2 billion 1-row groups, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks to @jsams for the detailed report.

41. Fixed a bug where `print(dt, class=TRUE)` showed only `topn - 1` rows. Thanks to @heavywatal for reporting [#2803](https://github.com/Rdatatable/data.table/issues/2803) and filing [PR#2804](https://github.com/Rdatatable/data.table/pull/2804).

## NOTES

0. 
The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. + +1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing. + +2. Test 1675.1 updated to cope with a change in R-devel in June 2017 related to `factor()` and `NA` levels. + +3. Package `ezknitr` has been added to the whitelist of packages that run user code and should be consider data.table-aware, [#2266](https://github.com/Rdatatable/data.table/issues/2266). Thanks to Matt Mills for testing and reporting. + +4. Printing with `quote = TRUE` now quotes column names as well, [#1319](https://github.com/Rdatatable/data.table/issues/1319). Thanks @jan-glx for the suggestion and @MichaelChirico for the PR. + +5. Added a blurb to `?melt.data.table` explicating the subtle difference in behavior of the `id.vars` argument vis-a-vis its analog in `reshape2::melt`, [#1699](https://github.com/Rdatatable/data.table/issues/1699). Thanks @MichaelChirico for uncovering and filing. + +6. Added some clarification about the usage of `on` to `?data.table`, [#2383](https://github.com/Rdatatable/data.table/issues/2383). Thanks to @peterlittlejohn for volunteering his confusion and @MichaelChirico for brushing things up. + +7. Clarified that "data.table always sorts in `C-locale`" means that upper-case letters are sorted before lower-case letters by ordering in data.table (e.g. `setorder`, `setkey`, `DT[order(...)]`). Thanks to @hughparsonage for the pull request editing the documentation. Note this makes no difference in most cases of data; e.g. ids where only uppercase or lowercase letters are used (`"AB123"<"AC234"` is always true, regardless), or country names and words which are consistently capitalized. For example, `"America" < "Brazil"` is not affected (it's always true), and neither is `"america" < "brazil"` (always true too); since the first letter is consistently capitalized. But, whether `"america" < "Brazil"` (the words are not consistently capitalized) is true or false in base R depends on the locale of your R session. In America it is true by default and false if you i) type `Sys.setlocale(locale="C")`, ii) the R session has been started in a C locale for you which can happen on servers/services (the locale comes from the environment the R session is started in). However, `"america" < "Brazil"` is always, consistently false in data.table which can be a surprise because it differs to base R by default in most regions. It is false because `"B"<"a"` is true because all upper-case letters come first, followed by all lower case letters (the ascii number of each letter determines the order, which is what is meant by `C-locale`). + +8. `data.table`'s dependency has been moved forward from R 3.0.0 (Apr 2013) to R 3.1.0 (Apr 2014; i.e. 3.5 years old). We keep this dependency as old as possible for as long as possible as requested by users in managed environments. Thanks to Jan Gorecki, the test suite from latest dev now runs on R 3.1.0 continously, as well as R-release (currently 3.4.2) and latest R-devel snapshot. The primary motivation for the bump to R 3.1.0 was allowing one new test which relies on better non-copying behaviour in that version, [#2484](https://github.com/Rdatatable/data.table/issues/2484). 
It also allows further internal simplifications. Thanks to @MichaelChirico for fixing another test that failed on R 3.1.0 due to slightly different behaviour of `base::read.csv` in R 3.1.0-only which the test was comparing to, [#2489](https://github.com/Rdatatable/data.table/pull/2489). + +9. New vignette added: _Importing data.table_ - focused on using data.table as a dependency in R packages. Answers most commonly asked questions and promote good practices. + +10. As warned in v1.9.8 release notes below in this file (25 Nov 2016) it has been 1 year since then and so use of `options(datatable.old.unique.by.key=TRUE)` to restore the old default is now deprecated with warning. The new warning states that this option still works and repeats the request to pass `by=key(DT)` explicitly to `unique()`, `duplicated()`, `uniqueN()` and `anyDuplicated()` and to stop using this option. In another year, this warning will become error. Another year after that the option will be removed. + +11. As `set2key()` and `key2()` have been warning since v1.9.8 (Nov 2016), their warnings have now been upgraded to errors. Note that when they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' in NEWS item 4. They will be removed in one year. + + ``` + Was warning: set2key() will be deprecated in the next relase. Please use setindex() instead. + Now error: set2key() is now deprecated. Please use setindex() instead. + ``` + +12. The option `datatable.showProgress` is no longer set to a default value when the package is loaded. Instead, the `default=` argument of `getOption` is used by both `fwrite` and `fread`. The default is the result of `interactive()` at the time of the call. Using `getOption` in this way is intended to be more helpful to users looking at `args(fread)` and `?fread`. + +13. `print.data.table()` invisibly returns its first argument instead of `NULL`. This behavior is compatible with the standard `print.data.frame()` and tibble's `print.tbl_df()`. Thanks to @heavywatal for [PR#2807](https://github.com/Rdatatable/data.table/pull/2807) + + +# data.table v1.10.4-3 (20 Oct 2017) + +1. Fixed crash/hang on MacOS when `parallel::mclapply` is used and data.table is merely loaded, [#2418](https://github.com/Rdatatable/data.table/issues/2418). Oddly, all tests including test 1705 (which tests `mclapply` with data.table) passed fine on CRAN. It appears to be some versions of MacOS or some versions of libraries on MacOS, perhaps. Many thanks to Martin Morgan for reporting and confirming this fix works. Thanks also to @asenabouth, Joe Thorley and Danton Noriega for testing, debugging and confirming that automatic parallelism inside data.table (such as `fwrite`) works well even on these MacOS installations. See also news items below for 1.10.4-1 and 1.10.4-2. + + +# data.table v1.10.4-2 (12 Oct 2017) + +1. OpenMP on MacOS is now supported by CRAN and included in CRAN's package binaries for Mac. But installing v1.10.4-1 from source on MacOS failed when OpenMP was not enabled at compile time, [#2409](https://github.com/Rdatatable/data.table/issues/2409). Thanks to Liz Macfie and @fupangpangpang for reporting. The startup message when OpenMP is not enabled has been updated. + +2. Two rare potential memory faults fixed, thanks to CRAN's automated use of latest compiler tools; e.g. clang-5 and gcc-7 + + +# data.table v1.10.4-1 (09 Oct 2017) + +1. The `nanotime` v0.2.0 update (June 2017) changed from `integer64` to `S4` and broke `fwrite` of `nanotime` columns. 
Fixed to work with `nanotime` both before and after v0.2.0.

2. Pass R-devel changes related to `deparse(,backtick=)` and `factor()`.

3. Internal `NAMED()==2` is now `MAYBE_SHARED()`, [#2330](https://github.com/Rdatatable/data.table/issues/2330). Back-ported to pass under the stated dependency, R 3.0.0.

4. Attempted improvement on Mac-only when the `parallel` package is used too (which forks), [#2137](https://github.com/Rdatatable/data.table/issues/2137). Intel's OpenMP implementation appears to leave threads running after the OpenMP parallel region (inside data.table) has finished, unlike GNU libgomp. So, if and when `parallel`'s `fork` is invoked by the user after data.table has run in parallel already, instability occurs. The problem only occurs with Mac package binaries from CRAN because they are built by CRAN with Intel's OpenMP library. No known problems on Windows or Linux, and no known problems on any platform when `parallel` is not used. If this Mac-only fix still doesn't work, call `setDTthreads(1)` immediately after `library(data.table)`, which has been reported to fix the problem by putting `data.table` into single-threaded mode earlier.

5. When `fread()` and `print()` see that `integer64` columns are present but package `bit64` is not installed, the warning is now displayed as intended. Thanks to a question by Santosh on r-help, forwarded by Bill Dunlap.


# data.table v1.10.4 (01 Feb 2017)

## BUG FIXES

1. The new specialized `nanotime` writer in `fwrite()` type-punned using `*(long long *)&REAL(column)[i]` which, strictly, is undefined behaviour under C standards. It passed a plethora of tests on Linux (gcc 5.4 and clang 3.8), win-builder and 6 out of 10 CRAN flavours using gcc. But it failed (wrong data written) with the newest version of clang (3.9.1) as used by CRAN on the failing flavours, and on solaris-sparc. Replaced with the union method and added a grep to CRAN_Release.cmd.


# data.table v1.10.2 (31 Jan 2017)

## NEW FEATURES

1. When `j` is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers.

    ```R
    myCols = c("colA","colB")
    DT[, myCols, with=FALSE]
    DT[, ..myCols]             # same
    ```

    When you see the `..` prefix think _one-level-up_, like the directory `..` in all operating systems meaning the parent directory. In future the `..` prefix could be made to work on all symbols appearing anywhere inside `DT[...]`. It is intended to be a convenient way to protect your code from accidentally picking up a column name. It is similar to how the `x.` and `i.` prefixes (analogous to SQL table aliases) can already be used to disambiguate the same column name present in both `x` and `i`. A symbol prefix rather than a `..()` _function_ will be easier for us to optimize internally and more convenient if you have many variables in calling scope that you wish to use in your expressions safely. This feature was first raised in 2012 and long wished for, [#633](https://github.com/Rdatatable/data.table/issues/633). It is experimental.

2. When `fread()` or `print()` see that `integer64` columns are present, `bit64`'s namespace is now automatically loaded for convenience.

3. `fwrite()` now supports the new [`nanotime`](https://cran.r-project.org/package=nanotime) type by Dirk Eddelbuettel, [#1982](https://github.com/Rdatatable/data.table/issues/1982). Aside: `data.table` already automatically supported `nanotime` in grouping and joining operations via its longstanding support of the underlying `integer64` type. A usage sketch follows.
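    A minimal sketch, assuming the `nanotime` package is installed (the values and file name are made up):

    ```R
    library(data.table)
    library(nanotime)
    DT = data.table(id = 1:3,
                    t  = nanotime(Sys.time()) + 0:2)   # adding 0:2 shifts by whole nanoseconds
    fwrite(DT, "nanotimes.csv")                        # written with full nanosecond precision
    ```

4. 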
`indices()` gains a new argument `vectors`, default `FALSE`. This strsplits the index names by `__` for you, [#1589](https://github.com/Rdatatable/data.table/issues/1589). + + ```R + DT = data.table(A=1:3, B=6:4) + setindex(DT, B) + setindex(DT, B, A) + indices(DT) + [1] "B" "B__A" + indices(DT, vectors=TRUE) + [[1]] + [1] "B" + [[2]] + [1] "B" "A" + ``` + +## BUG FIXES + +1. Some long-standing potential instability has been discovered and resolved many thanks to a detailed report from Bill Dunlap and Michael Sannella. At C level any call of the form `setAttrib(x, install(), allocVector())` can be unstable in any R package. Despite `setAttrib()` PROTECTing its inputs, the 3rd argument (`allocVector`) can be executed first only for its result to to be released by `install()`'s potential GC before reaching `setAttrib`'s PROTECTion of its inputs. Fixed by either PROTECTing or pre-`install()`ing. Added to CRAN_Release.cmd procedures: i) `grep`s to prevent usage of this idiom in future and ii) running data.table's test suite with `gctorture(TRUE)`. + +2. A new potential instability introduced in the last release (v1.10.0) in GForce optimized grouping has been fixed by reverting one change from malloc to R_alloc. Thanks again to Michael Sannella for the detailed report. + +3. `fwrite()` could write floating point values incorrectly, [#1968](https://github.com/Rdatatable/data.table/issues/1968). A thread-local variable was incorrectly thread-global. This variable's usage lifetime is only a few clock cycles so it needed large data and many threads for several threads to overlap their usage of it and cause the problem. Many thanks to @mgahan and @jmosser for finding and reporting. + +## NOTES + +1. `fwrite()`'s `..turbo` option has been removed as the warning message warned. If you've found a problem, please [report it](https://github.com/Rdatatable/data.table/issues). + +2. No known issues have arisen due to `DT[,1]` and `DT[,c("colA","colB")]` now returning columns as introduced in v1.9.8. However, as we've moved forward by setting `options('datatable.WhenJisSymbolThenCallingScope'=TRUE)` introduced then too, it has become clear a better solution is needed. All 340 CRAN and Bioconductor packages that use data.table have been checked with this option on. 331 lines would need to be changed in 59 packages. Their usage is elegant, correct and recommended, though. Examples are `DT[1, encoding]` in quanteda and `DT[winner=="first", freq]` in xgboost. These are looking up the columns `encoding` and `freq` respectively and returning them as vectors. But if, for some reason, those columns are removed from `DT` and `encoding` or `freq` are still variables in calling scope, their values in calling scope would be returned. Which cannot be what was intended and could lead to silent bugs. That was the risk we were trying to avoid.
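    A sketch of that risk (hypothetical column and variable names):

    ```R
    library(data.table)
    DT = data.table(encoding = c("utf8","latin1"))
    encoding = "a variable of the same name in calling scope"
    DT[, encoding]           # returns the column, as intended
    DT[, encoding := NULL]   # the column is later dropped...
    # DT[, encoding]         # ...under the old option this silently returned the calling-scope value;
                             # it is now a helpful error instead (see below)
    ```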
+`options('datatable.WhenJisSymbolThenCallingScope')` is now removed. A migration timeline is no longer needed. The new strategy needs no code changes and has no breakage. It was proposed and discussed in point 2 [here](https://github.com/Rdatatable/data.table/issues/1188#issuecomment-127824969), as follows.
+When `j` is a symbol (as in the quanteda and xgboost examples above) it will continue to be looked up as a column name and returned as a vector, as has always been the case. If it's not a column name however, it is now a helpful error explaining that data.table is different to data.frame and what to do instead (use `..` prefix or `with=FALSE`). The old behaviour of returning the symbol's value in calling scope can never have been useful to anybody and therefore not depended on. Just as the `DT[,1]` change could be made in v1.9.8, this change can be made now. This change increases robustness with no downside. Rerunning all 340 CRAN and Bioconductor package checks reveal 2 packages throwing the new error: partools and simcausal. Their maintainers have been informed that there is a likely bug on those lines due to data.table's (now remedied) weakness. This is exactly what we wanted to reveal and improve. + +3. As before, and as we can see is in common use in CRAN and Bioconductor packages using data.table, `DT[,myCols,with=FALSE]` continues to lookup `myCols` in calling scope and take its value as column names or numbers. You can move to the new experimental convenience feature `DT[, ..myCols]` if you wish at leisure. + + +# data.table v1.10.0 (03 Dec 2016) + +## BUG FIXES + +1. `fwrite(..., quote='auto')` already quoted a field if it contained a `sep` or `\n`, or `sep2[2]` when `list` columns are present. Now it also quotes a field if it contains a double quote (`"`) as documented, [#1925](https://github.com/Rdatatable/data.table/issues/1925). Thanks to Aki Matsuo for reporting. Tests added. The `qmethod` tests did test escaping embedded double quotes, but only when `sep` or `\n` was present in the field as well to trigger the quoting of the field. + +2. Fixed 3 test failures on Solaris only, [#1934](https://github.com/Rdatatable/data.table/issues/1934). Two were on both sparc and x86 and related to a `tzone` attribute difference between `as.POSIXct` and `as.POSIXlt` even when passed the default `tz=""`. The third was on sparc only: a minor rounding issue in `fwrite()` of 1e-305. + +3. Regression crash fixed when 0's occur at the end of a non-empty subset of an empty table, [#1937](https://github.com/Rdatatable/data.table/issues/1937). Thanks Arun for tracking down. Tests added. For example, subsetting the empty `DT=data.table(a=character())` with `DT[c(1,0)]` should return a 1 row result with one `NA` since 1 is past the end of `nrow(DT)==0`, the same result as `DT[1]`. + +4. Fixed newly reported crash that also occurred in old v1.9.6 when `by=.EACHI`, `nomatch=0`, the first item in `i` has no match AND `j` has a function call that is passed a key column, [#1933](https://github.com/Rdatatable/data.table/issues/1933). Many thanks to Reino Bruner for finding and reporting with a reproducible example. Tests added. + +5. Fixed `fread()` error occurring for a subset of Windows users: `showProgress is not type integer but type 'logical'.`, [#1944](https://github.com/Rdatatable/data.table/issues/1944) and [#1111](https://github.com/Rdatatable/data.table/issues/1111). Our tests cover this usage (it is just default usage), pass on AppVeyor (Windows), win-builder (Windows) and CRAN's Windows so perhaps it only occurs on a specific and different version of Windows to all those. Thanks to @demydd for reporting. Fixed by using strictly `logical` type at R level and `Rboolean` at C level, consistently throughout. + +6. 
Combining `on=` (new in v1.9.6) with `by=` or `keyby=` gave incorrect results, [#1943](https://github.com/Rdatatable/data.table/issues/1943). Many thanks to Henrik-P for the detailed and reproducible report. Tests added. + +7. New function `rleidv` was ignoring its `cols` argument, [#1942](https://github.com/Rdatatable/data.table/issues/1942). Thanks Josh O'Brien for reporting. Tests added. + +## NOTES + +1. It seems OpenMP is not available on CRAN's Mac platform; NOTEs appeared in [CRAN checks](https://cran.r-project.org/web/checks/check_results_data.table.html) for v1.9.8. Moved `Rprintf` from `init.c` to `packageStartupMessage` to avoid the NOTE as requested urgently by Professor Ripley. Also fixed the bad grammar of the message: 'single threaded' now 'single-threaded'. If you have a Mac and run macOS or OS X on it (I run Ubuntu on mine) please contact CRAN maintainers and/or Apple if you'd like CRAN's Mac binary to support OpenMP. Otherwise, please follow [these instructions for OpenMP on Mac](https://github.com/Rdatatable/data.table/wiki/Installation) which people have reported success with. + +2. Just to state explicitly: data.table does not now depend on or require OpenMP. If you don't have it (as on CRAN's Mac it appears but not in general on Mac) then data.table should build, run and pass all tests just fine. + +3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://app.codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. + +4. The FAQ vignette has been revised given the changes in v1.9.8. In particular, the very first FAQ. + +5. With hindsight, the last release v1.9.8 should have been named v1.10.0 to convey it wasn't just a patch release from .6 to .8 owing to the 'potentially breaking changes' items. Thanks to @neomantic for correctly pointing out. The best we can do now is now bump to 1.10.0. + + +# data.table v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) diff --git a/NEWS.md b/NEWS.md index 513ac9bc56..48f7c529e8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -610,1549 +610,4 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). -# data.table [v1.14.10](https://github.com/Rdatatable/data.table/milestone/20?closed=1) (8 Dec 2023) - -## NOTES - -1. Maintainer of the package for CRAN releases is from now on Tyson Barrett (@tysonstanley), [#5710](https://github.com/Rdatatable/data.table/issues/5710). - -2. Updated internal code for breaking change of `is.atomic(NULL)` in R-devel, [#5691](https://github.com/Rdatatable/data.table/pull/5691). Thanks to Martin Maechler for the patch. - -3. Fix multiple test concerning coercion to missing complex numbers, [#5695](https://github.com/Rdatatable/data.table/issues/5695) and [#5748](https://github.com/Rdatatable/data.table/issues/5748). Thanks to @MichaelChirico and @ben-schwen for the patches. - -4. 
Fix multiple format warnings (e.g., -Wformat) [#5712](https://github.com/Rdatatable/data.table/pull/5712), [#5781](https://github.com/Rdatatable/data.table/pull/5781), [#5880](https://github.com/Rdatatable/data.table/pull/5800), [#5786](https://github.com/Rdatatable/data.table/pull/5786). Thanks to @MichaelChirico and @jangorecki for the patches. - - -# data.table [v1.14.8](https://github.com/Rdatatable/data.table/milestone/28?closed=1) (17 Feb 2023) - -## NOTES - -1. Test 1613.605 now passes changes to `as.data.frame()` in R-devel, [#5597](https://github.com/Rdatatable/data.table/pull/5597). Thanks to Avraham Adler for reporting. - -2. An out of bounds read when combining non-equi join with `by=.EACHI` has been found and fixed thanks to clang ASAN, [#5598](https://github.com/Rdatatable/data.table/issues/5598). There was no bug or consequence because the read was followed (now preceded) by a bounds test. - -3. `.rbind.data.table` (note the leading `.`) is no longer exported when `data.table` is installed in R>=4.0.0 (Apr 2020), [#5600](https://github.com/Rdatatable/data.table/pull/5600). It was never documented which R-devel now detects and warns about. It is only needed by `data.table` internals to support R<4.0.0; see note 1 in v1.12.6 (Oct 2019) below in this file for more details. - - -# data.table [v1.14.6](https://github.com/Rdatatable/data.table/milestone/27?closed=1) (16 Nov 2022) - -## BUG FIXES - -1. `fread()` could leak memory, [#3292](https://github.com/Rdatatable/data.table/issues/3292). Thanks to @patrickhowerter for reporting, and Jim Hester for the fix. The fix requires R 3.4.0 or later. Loading `data.table` in earlier versions now highlights this issue on startup, asks users to upgrade R, and warns that we intend to upgrade `data.table`'s dependency from 8 year old R 3.1.0 (April 2014) to 5 year old R 3.4.0 (April 2017). - -## NOTES - -1. Test 1962.098 has been modified to pass latest changes to `POSIXt` in R-devel. - -2. `test.data.table()` no longer creates `DT` in `.GlobalEnv`, a CRAN policy violation, [#5514](https://github.com/Rdatatable/data.table/issues/5514). No other writes occurred to `.GlobalEnv` and release procedures have been improved to prevent this happening again. - -3. The memory usage of the test suite has been halved, [#5507](https://github.com/Rdatatable/data.table/issues/5507). - - -# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) - -## NOTES - -1. gcc 12.1 (May 2022) now detects and warns about an always-false condition (`-Waddress`) in `fread` which caused a small efficiency saving never to be invoked, [#5476](https://github.com/Rdatatable/data.table/pull/5476). Thanks to CRAN for testing latest versions of compilers. - -2. `update.dev.pkg()` has been renamed `update_dev_pkg()` to get out of the way of the `stats::update` generic function, [#5421](https://github.com/Rdatatable/data.table/pull/5421). This is a utility function which upgrades the version of `data.table` to the latest commit in development which has passed all tests. As such we don't expect any backwards compatibility concerns. Its manual page was causing an intermittent hang/crash from `R CMD check` on Windows-only on CRAN which we hope will be worked around by changing its name. - -3. Internal C code now passes `-Wstrict-prototypes` to satisfy the warnings now displayed on CRAN, [#5477](https://github.com/Rdatatable/data.table/pull/5477). - -4. 
`write.csv` in R-devel no longer responds to `getOption("digits.secs")` for `POSIXct`, [#5478](https://github.com/Rdatatable/data.table/issues/5478). This caused our tests of `fwrite(, dateTimeAs="write.csv")` to fail on CRAN's daily checks using latest daily R-devel. While R-devel discussion continues, and currently it seems like the change is intended with further changes possible, this `data.table` release massages our tests to pass on latest R-devel. The idea is to try to get out of the way of R-devel changes in this regard until the new behavior of `write.csv` is released and confirmed. Package updates are not accepted on CRAN if they do not pass the latest daily version of R-devel, even if R-devel changes after the package update is submitted. If the change to `write.csv()` stands, then a future release of `data.table` will be needed to make `fwrite(, dateTimeAs="write.csv")` match `write.csv()` output again in that future version of R onwards. If you use an older version of `data.table` than said future one in the said future version of R, then `fwrite(, dateTimeAs="write.csv")` may not match `write.csv()` if you are using `getOption("digits.secs")` too. However, you can always check that your installation of `data.table` works in your version of R on your platform by simply running `test.data.table()` yourself. Doing so would detect such a situation for you: test 1741 would fail in this case. `test.data.table()` runs the entire suite of tests and is always available to you locally. This way you do not need to rely on our statements about which combinations of versions of R and `data.table` on which platforms we have tested and support; just run `test.data.table()` yourself. Having said that, because test 1741 has been relaxed in this release in order to be accepted on CRAN to pass latest R-devel, this won't be true for this particular release in regard to this particular test. - - ```R - $ R --vanilla - R version 4.2.1 (2022-06-23) -- "Funny-Looking Kid" - > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) - > options(digits.secs=0) - > write.csv(DF) - "","A" - "1",2022-10-01 01:23:45 - > options(digits.secs=3) - > write.csv(DF) - "","A" - "1",2022-10-01 01:23:45.012 - - $ Rdevel --vanilla - R Under development (unstable) (2022-10-06 r83040) -- "Unsuffered Consequences" - > DF = data.frame(A=as.POSIXct("2022-10-01 01:23:45.012")) - > options(digits.secs=0) - > write.csv(DF) - "","A" - "1",2022-10-01 01:23:45.012 - ``` - -5. Many thanks to Kurt Hornik for investigating potential impact of a possible future change to `base::intersect()` on empty input, providing a patch so that `data.table` won't break if the change is made to R, and giving us plenty of notice, [#5183](https://github.com/Rdatatable/data.table/pull/5183). - -6. `datatable.[dll|so]` has changed name to `data_table.[dll|so]`, [#4442](https://github.com/Rdatatable/data.table/pull/4442). Thanks to Jan Gorecki for the PR. We had previously removed the `.` since `.` is not allowed by the following paragraph in the Writing-R-Extensions manual. Replacing `.` with `_` instead now seems more consistent with the last sentence. - - > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. 
- - -# data.table [v1.14.2](https://github.com/Rdatatable/data.table/milestone/24?closed=1) (27 Sep 2021) - -## NOTES - -1. clang 13.0.0 (Sep 2021) requires the system header `omp.h` to be included before R's headers, [#5122](https://github.com/Rdatatable/data.table/issues/5122). Many thanks to Prof Ripley for testing and providing a patch file. - - -# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) - -## POTENTIALLY BREAKING CHANGES - -1. In v1.13.0 (July 2020) native parsing of datetime was added to `fread` by Michael Chirico which dramatically improved performance. Before then datetime was read as type character by default which was slow. Since v1.13.0, UTC-marked datetime (e.g. `2020-07-24T10:11:12.134Z` where the final `Z` is present) has been read automatically as POSIXct and quickly. We provided the migration option `datatable.old.fread.datetime.character` to revert to the previous slow character behavior. We also added the `tz=` argument to control unmarked datetime; i.e. where the `Z` (or equivalent UTC postfix) is missing in the data. The default `tz=""` reads unmarked datetime as character as before, slowly. We gave you the ability to set `tz="UTC"` to turn on the new behavior and read unmarked datetime as UTC, quickly. R sessions that are running in UTC by setting the TZ environment variable, as is good practice and common in production, have also been reading unmarked datetime as UTC since v1.13.0, much faster. Note 1 of v1.13.0 (below in this file) ended `In addition to convenience, fread is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided.`. - - At `rstudio::global(2021)`, Neal Richardson, Director of Engineering at Ursa Labs, compared Arrow CSV performance to `data.table` CSV performance, [Bigger Data With Ease Using Apache Arrow](https://posit.co/resources/videos/bigger-data-with-ease-using-apache-arrow/). He opened by comparing to `data.table` as his main point. Arrow was presented as 3 times faster than `data.table`. He talked at length about this result. However, no reproducible code was provided and we were not contacted in advance in case we had any comments. He mentioned New York Taxi data in his talk which is a dataset known to us as containing unmarked datetime. [Rebuttal](https://twitter.com/MattDowle/status/1360073970498875394). - - `tz=`'s default is now changed from `""` to `"UTC"`. If you have been using `tz=` explicitly then there should be no change. The change to read UTC-marked datetime as POSIXct rather than character already happened in v1.13.0. The change now is that unmarked datetimes are now read as UTC too by default without needing to set `tz="UTC"`. None of the 1,017 CRAN packages directly using `data.table` are affected. As before, the migration option `datatable.old.fread.datetime.character` can still be set to TRUE to revert to the old character behavior. This migration option is temporary and will be removed in the near future. - - The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. - -## BUG FIXES - -1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. - -2. 
`fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. - -## NOTES - -1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests reinstalling `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. - -2. `r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage it forwards to having been `https:` for many years. Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. - - -# data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) - -## BUG FIXES - -1. Grouping could throw an error `Failed to allocate counts or TMP` with more than 1e9 rows even with sufficient RAM due to an integer overflow, [#4295](https://github.com/Rdatatable/data.table/issues/4295) [#4818](https://github.com/Rdatatable/data.table/issues/4818). Thanks to @renkun-ken and @jangorecki for reporting, and @shrektan for fixing. - -2. `fwrite()`'s multithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output.
If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. - - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. - -## NOTES - -1. The last release took place at the same time as several breaking changes were made to R-devel. The CRAN submissions process runs against latest daily R-devel so we had to keep up with those latest changes by making several resubmissions. Then each resubmission reruns against the new latest R-devel again. Overall it took 7 days. For example, we added the new `environments=FALSE` to our `all.equal` call. Then about 4 hours after 1.13.4 was accepted, the `s` was dropped and we now need to resubmit with `environment=FALSE`. In any case, we have suggested that the default should be FALSE first to give packages some notice, as opposed to generating errors in the CRAN submissions process within hours. Then the default for `environment=` could be TRUE in 6 months time after packages have had some time to update in advance of the default change. Readers of this NEWS file will be familiar with `data.table`'s approach to change control and know that we do this ourselves. - - -# data.table [v1.13.4](https://github.com/Rdatatable/data.table/milestone/21?closed=1) (08 Dec 2020) - -## BUG FIXES - -1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. - -2. `fsort()` could crash when compiled using `clang-11` (Oct 2020), [#4786](https://github.com/Rdatatable/data.table/issues/4786). Multithreaded debugging revealed that threads are no longer assigned iterations monotonically by the dynamic schedule. Although never guaranteed by the OpenMP standard, in practice monotonicity could be relied on as far as we knew, until now. We rely on monotonicity in the `fsort` implementation. Happily, a schedule modifier `monotonic:dynamic` was added in OpenMP 4.5 (Nov 2015) which we now use if available (e.g. gcc 6+, clang 3.9+). 
If you have an old compiler which does not support OpenMP 4.5, it's probably the case that the unmodified dynamic schedule is monotonic anyway, so `fsort` now checks that threads are receiving iterations monotonically and emits a graceful error if not. It may be that `clang` prior to version 11, and `gcc` too, exhibit the same crash. It was just that `clang-11` was the first report. To know which version of OpenMP `data.table` is using, `getDTthreads(verbose=TRUE)` now reports the `YYYYMM` value `_OPENMP`; e.g. 201511 corresponds to v4.5, and 201811 corresponds to v5.0. Oddly, the `x.y` version number is not provided by the OpenMP API. OpenMP 4.5 may be enabled in some compilers using `-fopenmp-version=45`. Otherwise, if you need to upgrade compiler, https://www.openmp.org/resources/openmp-compilers-tools/ may be helpful. - -3. Columns containing functions that don't inherit the class `'function'` would fail to group, [#4814](https://github.com/Rdatatable/data.table/issues/4814). Thanks @mb706 for reporting, @ecoRoland2 for helping investigate, and @Coorsaa for a follow-up example involving environments. - -## NOTES - -1. Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://h2o.ai/blog/2016/behind-the-scenes-of-cran/). - -2. `as.Date.IDate` is no longer exported as a function to solve a new error in R-devel `S3 method lookup found 'as.Date.IDate' on search path`, [#4777](https://github.com/Rdatatable/data.table/issues/4777). The S3 method is still exported; i.e. `as.Date(x)` will still invoke the `as.Date.IDate` method when `x` is class `IDate`. The function had been exported, in addition to exporting the method, to solve a compatibility issue with `zoo` (and `xts` which uses `zoo`) because `zoo` exports `as.Date` which masks `base::as.Date`. Happily, since zoo 1.8-1 (Jan 2018) made a change to its `as.IDate`, the workaround is no longer needed. - -3. Thanks to @fredguinog for testing `fcase` in development before 1.13.0 was released and finding a segfault, [#4378](https://github.com/Rdatatable/data.table/issues/4378). It was found separately by the `rchk` tool (which uses static code analysis) in release procedures and fixed before `fcase` was released, but the reproducible example has now been added to the test suite for completeness. Thanks also to @shrektan for investigating, proposing a very similar fix at C level, and a different reproducible example which has also been added to the test suite. - - -# data.table [v1.13.2](https://github.com/Rdatatable/data.table/milestone/19?closed=1) (19 Oct 2020) - -## BUG FIXES - -1. 
`test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. - -2. A regression in v1.13.0 resulted in installation on Mac often failing with `shared object 'datatable.so' not found`, and FreeBSD always failing with `expr: illegal option -- l`, [#4652](https://github.com/Rdatatable/data.table/issues/4652) [#4640](https://github.com/Rdatatable/data.table/issues/4640) [#4650](https://github.com/Rdatatable/data.table/issues/4650). Thanks to many for assistance including Simon Urbanek, Brian Ripley, Wes Morgan, and @ale07alvarez. There were no installation problems on Windows or Linux. - -3. Operating on columns of type `list`, e.g. `dt[, listCol[[1]], by=id]`, suffered a performance regression in v1.13.0, [#4646](https://github.com/Rdatatable/data.table/issues/4646) [#4658](https://github.com/Rdatatable/data.table/issues/4658). Thanks to @fabiocs8 and @sandoronodi for the detailed reports, and to Cole Miller for substantial debugging, investigation and proposals at C level which enabled the root cause to be fixed. Related, and also fixed, was a segfault revealed by package POUMM, [#4746](https://github.com/Rdatatable/data.table/issues/4746), when grouping a list column where each item has an attribute; e.g., `coda::mcmc.list`. Detected thanks to CRAN's ASAN checks, and thanks to Venelin Mitov for assistance in tracing the memory fault. Thanks also to Hongyuan Jia and @ben-schwen for assistance in debugging the fix in dev to pass reverse dependency testing which highlighted, before release, that package `eplusr` would fail. Its good usage has been added to `data.table`'s test suite. - -4. `fread("1.2\n", colClasses='integer')` (note no columns names in the data) would segfault when creating a warning message, [#4644](https://github.com/Rdatatable/data.table/issues/4644). It now warns with `Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.` When column names are present however, the warning message includes the name as before; i.e., `fread("A\n1.2\n", colClasses='integer')` produces `Attempt to override column 1 <
> of inherent type 'float64' down to 'int32' ignored.`. Thanks to Kun Ren for reporting. - -5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. - -## NOTES - -1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. - - The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. - - We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. - -2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. - -3. `test.data.table()` failed in non-English locales such as `LC_TIME=fr_FR.UTF-8` due to `Jan` vs `janv.` in tests 168 and 2042, [#3450](https://github.com/Rdatatable/data.table/issues/3450). Thanks to @shrektan for reporting, and @tdhock for making the tests locale-aware. - -4. User-supplied `PKG_LIBS` and `PKG_CFLAGS` are now retained and the suggestion in https://mac.r-project.org/openmp/; i.e., - `PKG_CPPFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_.tar.gz` -has a better chance of working on Mac. - - -# data.table [v1.13.0](https://github.com/Rdatatable/data.table/milestone/17?closed=1) (24 Jul 2020) - -## POTENTIALLY BREAKING CHANGES - -1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. 
Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then by read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. - - Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. - - The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. - -## NEW FEATURES - -1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). - -2. `CsubsetDT` C function is now exported for use by other packages, [#3751](https://github.com/Rdatatable/data.table/issues/3751). Thanks to Leonardo Silvestri for the request and the PR. This uses R's `R_RegisterCCallable` and `R_GetCCallable` mechanism, [R-exts§5.4.3](https://cran.r-project.org/doc/manuals/r-devel/R-exts.html#Linking-to-native-routines-in-other-packages) and [`?cdt`](https://rdatatable.gitlab.io/data.table/reference/cdt.html). Note that organization of our C interface will be changed in future. - -3. `print` method for `data.table` gains `trunc.cols` argument (and corresponding option `datatable.print.trunc.cols`, default `FALSE`), [#1497](https://github.com/Rdatatable/data.table/issues/1497), part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). This prints only as many columns as fit in the console without wrapping to new lines (e.g., the first 5 of 80 columns) and a message that states the count and names of the variables not shown. When `class=TRUE` the message also contains the classes of the variables. `data.table` has always automatically truncated _rows_ of a table for efficiency (e.g. printing 10 rows instead of 10 million); in the future, we may do the same for _columns_ (e.g., 10 columns instead of 20,000) by changing the default for this argument. Thanks to @nverno for the initial suggestion and to @TysonStanley for the PR. - -4. `setnames(DT, new=new_names)` (i.e. 
explicitly named `new=` argument) now works as expected rather than an error message requesting that `old=` be supplied too, [#4041](https://github.com/Rdatatable/data.table/issues/4041). Thanks @Kodiologist for the suggestion. - -5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. - -6. New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details. - - ```R - # Lazy evaluation - x = 1:10 - data.table::fcase( - x < 5L, 1L, - x >= 5L, 3L, - x == 5L, stop("provided value is an unexpected one!") - ) - # [1] 1 1 1 1 3 3 3 3 3 3 - - dplyr::case_when( - x < 5L ~ 1L, - x >= 5L ~ 3L, - x == 5L ~ stop("provided value is an unexpected one!") - ) - # Error in eval_tidy(pair$rhs, env = default_env) : - # provided value is an unexpected one! - - # Benchmark - x = sample(1:100, 3e7, replace = TRUE) # 114 MB - microbenchmark::microbenchmark( - dplyr::case_when( - x < 10L ~ 0L, - x < 20L ~ 10L, - x < 30L ~ 20L, - x < 40L ~ 30L, - x < 50L ~ 40L, - x < 60L ~ 50L, - x > 60L ~ 60L - ), - data.table::fcase( - x < 10L, 0L, - x < 20L, 10L, - x < 30L, 20L, - x < 40L, 30L, - x < 50L, 40L, - x < 60L, 50L, - x > 60L, 60L - ), - times = 5L, - unit = "s") - # Unit: seconds - # expr min lq mean median uq max neval - # dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 - # data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 - ``` - -7. `.SDcols=is.numeric` now works; i.e., `SDcols=` accepts a function which is used to select the columns of `.SD`, [#3950](https://github.com/Rdatatable/data.table/issues/3950). Any function (even _ad hoc_) that returns scalar `TRUE`/`FALSE` for each column will do; e.g., `.SDcols=!is.character` will return _non_-character columns (_a la_ `Negate()`). Note that `.SDcols=patterns(...)` can still be used for filtering based on the column names. - -8. Compiler support for OpenMP is now detected during installation, which allows `data.table` to compile from source (in single threaded mode) on macOS which, frustratingly, does not include OpenMP support by default, [#2161](https://github.com/Rdatatable/data.table/issues/2161), unlike Windows and Linux. A helpful message is emitted during installation from source, and on package startup as before. Many thanks to @jimhester for the PR. - -9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. - -10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. - -11. 
`frank` gains `ties.method='last'`, paralleling the same in `base::order` which has been available since R 3.3.0 (April 2016), [#1689](https://github.com/Rdatatable/data.table/issues/1689). Thanks @abudis for the encouragement to accommodate this. - -12. The `keep.rownames` argument in `as.data.table.xts` now accepts a string, which can be used for specifying the column name of the index of the xts input, [#4232](https://github.com/Rdatatable/data.table/issues/4232). Thanks to @shrektan for the request and the PR. - -13. New symbol `.NGRP` available in `j`, [#1206](https://github.com/Rdatatable/data.table/issues/1206). `.GRP` (the group number) was already available taking values from `1` to `.NGRP`. The number of groups, `.NGRP`, might be useful in `j` to calculate a percentage of groups processed so far, or to do something different for the last or penultimate group, for example. - -14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. - -15. A new throttle feature has been introduced to speed up small data tasks that are repeated in a loop, [#3175](https://github.com/Rdatatable/data.table/issues/3175) [#3438](https://github.com/Rdatatable/data.table/issues/3438) [#3205](https://github.com/Rdatatable/data.table/issues/3205) [#3735](https://github.com/Rdatatable/data.table/issues/3735) [#3739](https://github.com/Rdatatable/data.table/issues/3739) [#4284](https://github.com/Rdatatable/data.table/issues/4284) [#4527](https://github.com/Rdatatable/data.table/issues/4527) [#4294](https://github.com/Rdatatable/data.table/issues/4294) [#1120](https://github.com/Rdatatable/data.table/issues/1120). The default throttle of 1024 means that a single thread will be used when nrow<=1024, two threads when nrow<=2048, etc. To change the default, use `setDTthreads(throttle=)`. Or use the new environment variable `R_DATATABLE_THROTTLE`. If you use `Sys.setenv()` in a running R session to change this environment variable, be sure to run an empty `setDTthreads()` call afterwards for the change to take effect; see `?setDTthreads`. The word *throttle* is used to convey that the number of threads is restricted (throttled) for small data tasks. Reducing throttle to 1 will turn off throttling and should revert behaviour to past versions (i.e. using many threads even for small data). Increasing throttle to, say, 65536 will utilize multi-threading only for larger datasets. The value 1024 is a guess. We welcome feedback and test results indicating what the best default should be. - -## BUG FIXES - -1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). - -2. `DT[i]` could segfault when `i` is a zero-column `data.table`, [#4060](https://github.com/Rdatatable/data.table/issues/4060). Thanks @shrektan for reporting and fixing. - -3. Dispatch of `first` and `last` functions now properly works again for `xts` objects, [#4053](https://github.com/Rdatatable/data.table/issues/4053). Thanks to @ethanbsmith for reporting. - -4. If `.SD` is returned as-is during grouping, it is now unlocked for downstream usage, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). 
Thanks also to @mllg for detecting a problem with the initial fix here during the dev release [#4173](https://github.com/Rdatatable/data.table/issues/4173). - -5. `GForce` is deactivated for `[[` on non-atomic input, part of [#4159](https://github.com/Rdatatable/data.table/issues/4159). Thanks @hongyuanjia and @ColeMiller1 for helping debug an issue in dev with the original fix before release, [#4612](https://github.com/Rdatatable/data.table/issues/4612). - -6. `all.equal(DT, y)` no longer errors when `y` is not a data.table, [#4042](https://github.com/Rdatatable/data.table/issues/4042). Thanks to @d-sci for reporting and the PR. - -7. A length 1 `colClasses=NA_character_` would cause `fread` to incorrectly coerce all columns to character, [#4237](https://github.com/Rdatatable/data.table/issues/4237). - -8. An `fwrite` error message could include a garbled number and cause test 1737.5 to fail, [#3492](https://github.com/Rdatatable/data.table/issues/3492). Thanks to @QuLogic for debugging the issue on ARMv7hl, and the PR fixing it. - -9. `fread` improves handling of very small (<1e-300) or very large (>1e+300) floating point numbers on non-x86 architectures (specifically ppc64le and armv7hl). Thanks to @QuLogic for reporting and fixing, [PR#4165](https://github.com/Rdatatable/data.table/pull/4165). - -10. When updating by reference, the use of `get` could result in columns being re-ordered silently, [#4089](https://github.com/Rdatatable/data.table/issues/4089). Thanks to @dmongin for reporting and Cole Miller for the fix. - -11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. - -12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. - -13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). - -14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. - -15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422). - -16. Non-equi joins now automatically set `allow.cartesian=TRUE`, [4489](https://github.com/Rdatatable/data.table/issues/4489). Thanks to @Henrik-P for reporting. - -17. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. - -18. `by=col1:col4` gave an incorrect result if `key(DT)==c("col1","col4")`, [#4285](https://github.com/Rdatatable/data.table/issues/4285). Thanks to @cbilot for reporting, and Cole Miller for the PR. - -19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. - -20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). 
It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). - -## NOTES - -0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. - -1. `as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). - -2. `set2key`, `set2keyv`, and `key2` have been removed, as they have been warning since v1.9.8 (Nov 2016) and halting with helpful message since v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' and quickly superseded by `setindex` and `indices`. - -3. `data.table` now supports messaging in simplified Chinese (locale `zh_CN`). This was the result of a monumental collaboration to translate `data.table`'s roughly 1400 warnings, errors, and verbose messages (about 16,000 words/100,000 characters) over the course of two months from volunteer translators in at least 4 time zones, most of whom are first-time `data.table` contributors and many of whom are first-time OSS contributors! - - A big thanks goes out to @fengqifang, @hongyuanjia, @biobai, @zhiiiyang, @Leo-Lee15, @soappp9527, @amy17519, @Zachary-Wu, @caiquanyou, @dracodoc, @JulianYlli12, @renkun-ken, @Xueliang24, @koohoko, @KingdaShi, @gaospecial, @shrektan, @sunshine1126, @shawnchen1996, @yc0802, @HesperusArcher, and @Emberwhirl, all of whom took time from their busy schedules to translate and review others' translations. Especial thanks goes to @zhiiiyang and @hongyuanjia who went above and beyond in helping to push the project over the finish line, and to @GuangchuangYu who helped to organize the volunteer pool. - - `data.table` joins `lubridate` and `nlme` as the only of the top 200 most-downloaded community packages on CRAN to offer non-English messaging, and is the only of the top 50 packages to offer complete support of all messaging. We hope this is a first step in broadening the reach and accessibility of the R ecosystem to more users globally and look forward to working with other maintainers looking to bolster the portability of their packages by offering advice on learnings from this undertaking. - - We would be remiss not to mention the laudable lengths to which the R core team goes to maintain the _much_ larger repository (about 6,000 messages in more than 10 languages) of translations for R itself. 
- - We will evaluate the feasibility (in terms of maintenance difficulty and CRAN package size limits) of offering support for other languages in later releases. - -4. `fifelse` and `fcase` now notify users that S4 objects (except `nanotime`) are not supported [#4135](https://github.com/Rdatatable/data.table/issues/4135). Thanks to @torema-ed for bringing it to our attention and Morgan Jacob for the PR. - -5. `frank(..., ties.method="random", na.last=NA)` now returns the same random ordering that `base::rank` does, [#4243](https://github.com/Rdatatable/data.table/pull/4243). - -6. The error message when mistakenly using `:=` in `i` instead of `j` has been much improved, [#4227](https://github.com/Rdatatable/data.table/issues/4227). Thanks to Hugh Parsonage for the detailed suggestion. - - ```R - > DT = data.table(A=1:2) - > DT[B:=3] - Error: Operator := detected in i, the first argument inside DT[...], but is only valid in - the second argument, j. Most often, this happens when forgetting the first comma - (e.g. DT[newvar:=5] instead of DT[, new_var:=5]). Please double-check the - syntax. Run traceback(), and debugger() to get a line number. - > DT[, B:=3] - > DT - A B - - 1: 1 3 - 2: 2 3 - ``` - -7. Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363). - -8. Changes upstream in R have been accommodated; e.g. `c.POSIXct` now raises `'origin' must be supplied` which impacted `foverlaps`, [#4428](https://github.com/Rdatatable/data.table/pull/4428). - -9. `data.table::update.dev.pkg()` now unloads the `data.table` namespace to alleviate a DLL lock issue on Windows, [#4403](https://github.com/Rdatatable/data.table/issues/4403). Thanks to @drag5 for reporting. - -10. `data.table` package binaries built by R version 3 (R3) should only be installed in R3, and similarly `data.table` package binaries built by R4 should only be installed in R4. Otherwise, a `package ‘data.table’ was built under R version...` warning will occur, which should not be ignored. This is due to a very welcome change to `rbind` and `cbind` in R 4.0.0 which enabled us to remove workarounds; see the news item in v1.12.6 below in this file. To continue to support both R3 and R4, `data.table`'s NAMESPACE file contains a condition on the R major version (3 or 4) and this is what gives rise to the requirement that the major version used to build `data.table` must match the major version used to install it. Thanks to @vinhdizzo for reporting, [#4528](https://github.com/Rdatatable/data.table/issues/4528). - -11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. - -12. The `datatable.old.unique.by.key` option has been removed as per the 4 year schedule detailed in note 10 of v1.12.4 (Oct 2019), note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). It has been generating a helpful warning for 2 years, and a helpful error for 1 year. - - -# data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) - -## NEW FEATURES - -1. `DT[, {...; .(A,B)}]` (i.e. 
when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061). - -## BUG FIXES - -1. `frollapply` could segfault and exceed R's C protect limits, [#3993](https://github.com/Rdatatable/data.table/issues/3993). Thanks to @DavisVaughan for reporting and fixing. - -2. `DT[, sum(grp), by=grp]` (i.e. aggregating the same column being grouped) could error with `object 'grp' not found`, [#3103](https://github.com/Rdatatable/data.table/issues/3103). Thanks to @cbailiss for reporting. - -## NOTES - -1. Links in the manual were creating warnings when installing HTML, [#4000](https://github.com/Rdatatable/data.table/issues/4000). Thanks to Morgan Jacob. - -2. Adjustments for R-devel (R 4.0.0) which now has reference counting turned on, [#4058](https://github.com/Rdatatable/data.table/issues/4058) [#4093](https://github.com/Rdatatable/data.table/issues/4093). This motivated early release to CRAN because every day CRAN tests every package using the previous day's changes in R-devel; a much valued feature of the R ecosystem. It helps R-core if packages can pass changes in R-devel as soon as possible. Thanks to Luke Tierney for the notice, and for implementing reference counting which we look forward to very much. - -3. C internals have been standardized to use `PRI[u|d]64` to print `[u]int64_t`. This solves new warnings from `gcc-8` on Windows with `%lld`, [#4062](https://github.com/Rdatatable/data.table/issues/4062), in many cases already working around `snprintf` on Windows not supporting `%zu`. Release procedures have been augmented to prevent any internal use of `llu`, `lld`, `zu` or `zd`. - -4. `test.data.table()` gains `showProgress=interactive()` to suppress the thousands of `Running test id ...` lines displayed by CRAN checks when there are warnings or errors. - - -# data.table [v1.12.6](https://github.com/Rdatatable/data.table/milestone/18?closed=1) (18 Oct 2019) - -## BUG FIXES - -1. `shift()` on a `nanotime` with the default `fill=NA` now fills a `nanotime` missing value correctly, [#3945](https://github.com/Rdatatable/data.table/issues/3945). Thanks to @mschubmehl for reporting and fixing in PR [#3942](https://github.com/Rdatatable/data.table/pull/3942). - -2. Compilation failed on CRAN's MacOS due to an older version of `zlib.h/zconf.h` which did not have `z_const` defined, [#3939](https://github.com/Rdatatable/data.table/issues/3939). Other open-source projects unrelated to R have experienced this problem on MacOS too. We have followed the common practice of removing `z_const` to support the older `zlib` versions, and data.table's release procedures have gained a `grep` to ensure `z_const` isn't used again by accident in future. The library `zlib` is used for `fwrite`'s new feature of multithreaded compression on-the-fly; see item 3 of 1.12.4 below. - -3. 
A runtime error in `fwrite`'s compression, but only observed so far on Solaris 10 32bit with zlib 1.2.8 (Apr 2013), [#3931](https://github.com/Rdatatable/data.table/issues/3931): `Error -2: one or more threads failed to allocate buffers or there was a compression error.` In case it happens again, this area has been made more robust and the error more detailed. As is often the case, investigating the Solaris problem revealed secondary issues in the same area of the code. In this case, some `%d` in verbose output should have been `%lld`. This obliquity that CRAN's Solaris provides is greatly appreciated. - -4. A leak could occur in the event of an unsupported column type error, or if working memory could only partially be allocated; [#3940](https://github.com/Rdatatable/data.table/issues/3940). Found thanks to `clang`'s Leak Sanitizer (prompted by CRAN's diligent use of latest tools), and two tests in the test suite which tested the unsupported-type error. - -## NOTES - -1. Many thanks to Kurt Hornik for fixing R's S3 dispatch of `rbind` and `cbind` methods, [#3948](https://github.com/Rdatatable/data.table/issues/3948). With `R>=4.0.0` (current R-devel), `data.table` now registers the S3 methods `cbind.data.table` and `rbind.data.table`, and no longer applies the workaround documented in FAQ 2.24. - - -# data.table [v1.12.4](https://github.com/Rdatatable/data.table/milestone/16?closed=1) (03 Oct 2019) - -## NEW FEATURES - -1. `rleid()` functions now support long vectors (length > 2 billion). - -2. `fread()`: - * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, Roy Storey for the initial PR, and Bingjie Qian for testing this feature on a very complicated real-world file. - * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR. - * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR. - * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR. - * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (https://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/gesistsa/rio) for the inspiration and @MichaelChirico for implementing. 
- * `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses` it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. For example: - - ```R - fread(file, select=c(colD="character", # returns 2 columns: colD,colA - colA="integer64")) - fread(file, select=list(character="colD", # returns 5 columns: colD,8,9,10,colA - integer= 8:10, - character="colA")) - ``` - * gains `tmpdir=` argument which is passed to `tempfile()` whenever a temporary file is needed. Thanks to @mschubmehl for the PR. As before, setting `TMPDIR` (to `/dev/shm` for example) before starting the R session still works too; see `?base::tempdir`. - -3. `fwrite()`: - * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example: - - ```R - DT = data.table(A=rep(1:2, 100e6), B=rep(1:4, 50e6)) - fwrite(DT, "data.csv") # 763MB; 1.3s - fwrite(DT, "data.csv.gz") # 2MB; 1.6s - identical(fread("data.csv.gz"), DT) - ``` - - Note that compression is handled using `zlib` library. In the unlikely event of missing `zlib.h`, on a machine that is compiling `data.table` from sources, one may get `fwrite.c` compilation error `zlib.h: No such file or directory`. As of now, the easiest solution is to install missing library using `sudo apt install zlib1g-dev` (Debian/Ubuntu). Installing R (`r-base-dev`) depends on `zlib1g-dev` so this should be rather uncommon. If it happens to you please upvote related issue [#3872](https://github.com/Rdatatable/data.table/issues/3872). - - * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. - - * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing. - - * Now supports type `complex`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). - - * Gains `scipen` [#2020](https://github.com/Rdatatable/data.table/issues/2020), the number 1 most-requested feature [#3189](https://github.com/Rdatatable/data.table/issues/3189). The default is `getOption("scipen")` so that `fwrite` will now respect R's option in the same way as `base::write.csv` and `base::format`, as expected. The parameter and option name have been kept the same as base R's `scipen` for consistency and to aid online search. It stands for 'scientific penalty'; i.e., the number of characters to add to the width within which non-scientific number format is used if it will fit. A high penalty essentially turns off scientific format. We believe that common practice is to use a value of 999, however, if you do use 999, because your data _might_ include very long numbers such as `10^300`, `fwrite` needs to account for the worst case field width in its buffer allocation per thread. This may impact space or time. 
If you experience slowdowns or unacceptable memory usage, please pass `verbose=TRUE` to `fwrite`, inspect the output, and report the issue. A workaround, until we can determine the best strategy, may be to pass a smaller value to `scipen`, such as 50. We have observed that `fwrite(DT, scipen=50)` appears to write `10^50` accurately, unlike base R. However, this may be a happy accident and not apply generally. Further work may be needed in this area. - - ```R - DT = data.table(a=0.0001, b=1000000) - fwrite(DT) - # a,b - # 1e-04,1e+06 - fwrite(DT,scipen=1) - # a,b - # 0.0001,1e+06 - fwrite(DT,scipen=2) - # a,b - # 0.0001,1000000 - - 10^50 - # [1] 1e+50 - options(scipen=50) - 10^50 - # [1] 100000000000000007629769841091887003294964970946560 - fwrite(data.table(A=10^50)) - # A - # 100000000000000000000000000000000000000000000000000 - ``` - -4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). - - ```R - > DT = data.table(A=1:3, B=list(1:2,"foo",3:5)) - > DT - A B - - 1: 1 1,2 - 2: 2 foo - 3: 3 3,4,5 - > - # The following all accomplish the same assignment: - > DT[2, B:=letters[9:13]] # was error, now works - > DT[2, B:=.(letters[9:13])] # was error, now works - > DT[2, B:=.(list(letters[9:13]))] # .(list()) was needed, still works - > DT - A B - - 1: 1 1,2 - 2: 2 i,j,k,l,m - 3: 3 3,4,5 - ``` - -5. `print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. - -6. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. - - ```R - DT = setDT(lapply(1:100, function(i) sample(c(rnorm(9e6), rep(NA_real_, 1e6))))) - format(object.size(DT), units="GB") ## 7.5 Gb - zoo::na.locf(DT, na.rm=FALSE) ## zoo 53.518s - setDTthreads(1L) - nafill(DT, "locf") ## DT 1 thread 7.562s - setDTthreads(0L) - nafill(DT, "locf") ## DT 40 threads 0.605s - setnafill(DT, "locf") ## DT in-place 0.367s - ``` - -7. New variable `.Last.updated` (similar to R's `.Last.value`) contains the number of rows affected by the most recent `:=` or `set()`, [#1885](https://github.com/Rdatatable/data.table/issues/1885). For details see `?.Last.updated`. - -8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). `between()` gains `check=` which checks `any(lower>upper)`; off by default for speed in particular for type character. - -9. New convenience functions `%ilike%` and `%flike%` which map to new `like()` arguments `ignore.case` and `fixed` respectively, [#3333](https://github.com/Rdatatable/data.table/issues/3333). `%ilike%` is for case-insensitive pattern matching. `%flike%` is for more efficient matching of fixed strings. Thanks to @andreasLD for providing most of the core code. - -10. 
`on=.NATURAL` (or alternatively `X[on=Y]` [#3621](https://github.com/Rdatatable/data.table/issues/3621)) joins two tables on their common column names, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Thanks to David Kulp for request. As before, when `on=` is not provided, `X` must have a key and the key columns are used to join (like rownames, but multi-column and multi-type). - -11. `as.data.table` gains `key` argument mirroring its use in `setDT` and `data.table`, [#890](https://github.com/Rdatatable/data.table/issues/890). As a byproduct, the arguments of `as.data.table.array` have changed order, which could affect code relying on positional arguments to this method. Thanks @cooldome for the suggestion and @MichaelChirico for implementation. - -12. `merge.data.table` is now exported, [#2618](https://github.com/Rdatatable/data.table/pull/2618). We realize that S3 methods should not ordinarily be exported. Rather, the method should be invoked via S3 dispatch. But users continue to request its export, perhaps because of intricacies relating to the fact that data.table inherits from data.frame, there are two arguments to `merge()` but S3 dispatch applies just to the first, and a desire to explicitly call `data.table::merge.data.table` from package code. Thanks to @AndreMikulec for the most recent request. - -13. New rolling function to calculate rolling sum has been implemented and exported, see `?frollsum`, [#2778](https://github.com/Rdatatable/data.table/issues/2778). - -14. `setkey` to an existing index now uses the index, [#2889](https://github.com/Rdatatable/data.table/issues/2889). Thanks to @MichaelChirico for suggesting and @saraswatmks for the PR. - -15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`. - -16. `as.data.table` now unpacks columns in a `data.frame` which are themselves a `data.frame` or `matrix`. This need arises when parsing JSON, a corollary in [#3369](https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752). Bug fix 19 in v1.12.2 (see below) added a helpful error (rather than segfault) to detect such invalid `data.table`, and promised that `as.data.table()` would unpack these columns in the next release (i.e. this release) so that the invalid `data.table` is not created in the first place. Further, `setDT` now warns if it observes such columns and suggests using `as.data.table` instead, [#3760](https://github.com/Rdatatable/data.table/issues/3760). - -17. `CJ` has been ported to C and parallelized, thanks to a PR by Michael Chirico, [#3596](https://github.com/Rdatatable/data.table/pull/3596). All types benefit, but, as in many `data.table` operations, factors benefit more than character. - - ```R - # default 4 threads on a laptop with 16GB RAM and 8 logical CPU - - ids = as.vector(outer(LETTERS, LETTERS, paste0)) - system.time( CJ(ids, 1:500000) ) # 3.9GB; 340m rows - # user system elapsed (seconds) - # 3.000 0.817 3.798 # was - # 1.800 0.832 2.190 # now - - # ids = as.factor(ids) - system.time( CJ(ids, 1:500000) ) # 2.6GB; 340m rows - # user system elapsed (seconds) - # 1.779 0.534 2.293 # was - # 0.357 0.763 0.292 # now - ``` - -18. 
New function `fcoalesce(...)` has been written in C, and is multithreaded for `numeric` and `factor`. It replaces missing values according to a prioritized list of candidates (as per SQL COALESCE, `dplyr::coalesce`, and `hutils::coalesce`), [#3424](https://github.com/Rdatatable/data.table/issues/3424). It accepts any number of vectors in several forms. For example, given three vectors `x`, `y`, and `z`, where each `NA` in `x` is to be replaced by the corresponding value in `y` if that is non-NA, else the corresponding value in `z`, the following equivalent forms are all accepted: `fcoalesce(x,y,z)`, `fcoalesce(x,list(y,z))`, and `fcoalesce(list(x,y,z))`. Being a new function, its behaviour is subject to change, particularly for type `list`, [#3712](https://github.com/Rdatatable/data.table/issues/3712).

    ```R
    # default 4 threads on a laptop with 16GB RAM and 8 logical CPU
    N = 100e6
    x = replicate(5, {x=sample(N); x[sample(N, N/2)]=NA; x}, simplify=FALSE) # 2GB
    y1 = do.call(dplyr::coalesce, x)
    y2 = do.call(hutils::coalesce, x)
    y3 = do.call(data.table::fcoalesce, x)
    #    user  system elapsed (seconds)
    #   4.935   1.876   6.810   # dplyr::coalesce
    #   3.122   0.831   3.956   # hutils::coalesce
    #   0.915   0.099   0.379   # data.table::fcoalesce
    identical(y1,y2) && identical(y1,y3)
    # TRUE
    ```

19. Type `complex` is now supported by `setkey`, `setorder`, `:=`, `by=`, `keyby=`, `shift`, `dcast`, `frank`, `rowid`, `rleid`, `CJ`, `fcoalesce`, `unique`, and `uniqueN`, [#3690](https://github.com/Rdatatable/data.table/issues/3690). Thanks to Gareth Ward and Elio Campitelli for their reports and input. Sorting `complex` is achieved the same way as in base R; i.e., first by the real part, then by the imaginary part (as if the `complex` column were two separate columns of `double`). There is no plan to support joining/merging on `complex` columns until a user demonstrates a need for that.

20. `setkey`, `[key]by=` and `on=` in verbose mode (`options(datatable.verbose=TRUE)`) now detect any columns inheriting from `Date` which are stored as 8-byte double, test whether any fractions are present, and if not suggest using a 4-byte integer instead (such as `data.table::IDate`) to save space and time, [#1738](https://github.com/Rdatatable/data.table/issues/1738). In future this could be upgraded to `message` or `warning` depending on feedback.

21. New function `fifelse(test, yes, no, na)` has been implemented in C by Morgan Jacob, [#3657](https://github.com/Rdatatable/data.table/issues/3657) and [#3753](https://github.com/Rdatatable/data.table/issues/3753). It is comparable to `base::ifelse`, `dplyr::if_else`, `hutils::if_else`, and (forthcoming) [`vctrs::if_else()`](https://vctrs.r-lib.org/articles/stability.html#ifelse). It returns a vector of the same length as `test` but unlike `base::ifelse` the output type is consistent with those of `yes` and `no`. Please see `?data.table::fifelse` for more details.
- - ```R - # default 4 threads on a laptop with 16GB RAM and 8 logical CPU - x = sample(c(TRUE,FALSE), 3e8, replace=TRUE) # 1GB - microbenchmark::microbenchmark( - base::ifelse(x, 7L, 11L), - dplyr::if_else(x, 7L, 11L), - hutils::if_else(x, 7L, 11L), - data.table::fifelse(x, 7L, 11L), - times = 5L, unit="s" - ) - # Unit: seconds - # expr min med max neval - # base::ifelse(x, 7L, 11L) 8.5 8.6 8.8 5 - # dplyr::if_else(x, 7L, 11L) 9.4 9.5 9.7 5 - # hutils::if_else(x, 7L, 11L) 2.6 2.6 2.7 5 - # data.table::fifelse(x, 7L, 11L) 1.5 1.5 1.6 5 # setDTthreads(1) - # data.table::fifelse(x, 7L, 11L) 0.8 0.8 0.9 5 # setDTthreads(2) - # data.table::fifelse(x, 7L, 11L) 0.4 0.4 0.5 5 # setDTthreads(4) - ``` - -22. `transpose` gains `keep.names=` and `make.names=` arguments, [#1886](https://github.com/Rdatatable/data.table/issues/1886). Previously, column names were dropped and there was no way to keep them. `keep.names="rn"` keeps the column names and puts them in the `"rn"` column of the result. Similarly, `make.names="rn"` uses column `"rn"` as the column names of the result. Both arguments are `NULL` by default for backwards compatibility. As these new arguments are new, they are subject to change in future according to community feedback. Thanks to @ghost for the request. - -23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). - -24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. - -25. `setnames(DT, old, new)` now omits any `old==new` to save redundant key and index name updates, [#3783](https://github.com/Rdatatable/data.table/issues/3783). `setnames(DT, new)` (i.e. not providing `old`) already omitted any column name updates where `names(DT)==new`; e.g. `setnames(DT, gsub('^_', '', names(DT)))` exits early if no columns start with `_`. - -26. `[[` by group is now optimized for regular vectors (not type list), [#3209](https://github.com/Rdatatable/data.table/issues/3209). Thanks @renkun-ken for the suggestion. `[` by group was already optimized. Please file a feature request if you would like this optimization for list columns. - -27. New function `frollapply` for rolling computation of arbitrary R functions (caveat: input `x` is coerced to numeric beforehand, and the function must return a scalar numeric value). The API is consistent to extant rolling functions `frollmean` and `frollsum`; note that it will generally be slower than those functions because (1) the known functions use our optimized internal C implementation and (2) there is no thread-safe API to R's C `eval`. Nevertheless `frollapply` is faster than corresponding `base`-only and `zoo` versions: - - ```R - set.seed(108) - x = rnorm(1e6); n = 1e3 - base_rollapply = function(x, n, FUN) { - nx = length(x) - ans = rep(NA_real_, nx) - for (i in n:nx) ans[i] = FUN(x[(i-n+1):i]) - ans - } - system.time(base_rollapply(x, n, mean)) - system.time(zoo::rollapplyr(x, n, function(x) mean(x), fill=NA)) - system.time(zoo::rollmeanr(x, n, fill=NA)) - system.time(frollapply(x, n, mean)) - system.time(frollmean(x, n)) - - ### fun mean sum median - # base_rollapply 8.815 5.151 60.175 - # zoo::rollapply 34.373 27.837 88.552 - # zoo::roll[fun] 0.215 0.185 NA ## median not fully supported - # frollapply 5.404 1.419 56.475 - # froll[fun] 0.003 0.002 NA ## median not yet supported - ``` - -28. 
`setnames()` now accepts functions in `old=` and `new=`, [#3703](https://github.com/Rdatatable/data.table/issues/3703). Thanks @smingerson for the feature request and @shrektan for the PR. - - ```R - DT = data.table(a=1:3, b=4:6, c=7:9) - setnames(DT, toupper) - names(DT) - # [1] "A" "B" "C" - setnames(DT, c(1,3), tolower) - names(DT) - # [1] "a" "B" "c" - ``` - -29. `:=` and `set()` now use zero-copy type coercion. Accordingly, `DT[..., integerColumn:=0]` and `set(DT,i,j,0)` no longer warn about the `0` ('numeric') needing to be `0L` ('integer') because there is no longer any time or space used for this coercion. The old long warning was off-putting to new users ("what and why L?"), whereas advanced users appreciated the old warning so they could avoid the coercion. Although the time and space for one coercion in a single call is unmeasurably small, when placed in a loop the small overhead of any allocation on R's heap could start to become noticeable (more so for `set()` whose purpose is low-overhead looping). Further, when assigning a value across columns of varying types, it could be inconvenient to supply the correct type for every column. Hence, zero-copy coercion was introduced to satisfy all these requirements. A warning is still issued, as before, when fractional data is discarded; e.g. when 3.14 is assigned to an integer column. Zero-copy coercion applies to length>1 vectors as well as length-1 vectors. - -## BUG FIXES - -1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. - -2. `keyby=colName` could use the wrong index and return incorrect results if both `colName` and `colNameExtra` (where `colName` is a leading subset of characters of `colNameExtra`) are column names and an index exists on `colNameExtra`, [#3498](https://github.com/Rdatatable/data.table/issues/3498). Thanks to Xianying Tan for the detailed report and pinpointing the source line at fault. - -3. A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report. - -4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288). - -5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example. - -6. Printing could occur unexpectedly when code is run with `source`, [#2369](https://github.com/Rdatatable/data.table/issues/2369). Thanks to @jan-glx for the report and reproducible example. - -7. Grouping by `NULL` on zero rows `data.table` now behaves consistently to non-zero rows `data.table`, [#3530](https://github.com/Rdatatable/data.table/issues/3530). Thanks to @SymbolixAU for the report and reproducible example. 
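    A minimal sketch of the consistency described in the previous item (the table and column names here are illustrative only and are not taken from the original report; the exact printed output is not reproduced):

    ```R
    library(data.table)
    DT  = data.table(g = c("a","a","b"), v = 1:3)   # non-zero rows
    DT0 = DT[0L]                                    # same columns, zero rows

    # grouping by NULL (i.e. no grouping columns) now takes the same code path
    # for both tables, so the zero-row table no longer behaves differently from
    # the non-zero-row table
    DT[,  .(n = .N, total = sum(v)), by = NULL]
    DT0[, .(n = .N, total = sum(v)), by = NULL]
    ```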
8. GForce optimization of `median` did not retain the class; e.g. `median` of `Date` or `POSIXct` would return a raw number rather than retain the date class, [#3079](https://github.com/Rdatatable/data.table/issues/3079). Thanks to @Henrik-P for reporting.

9. `DT[, format(mean(date), "%b-%Y"), by=group]` could fail with `invalid 'trim' argument`, [#1876](https://github.com/Rdatatable/data.table/issues/1876). Thanks to Ross Holmberg for reporting.

10. `externalVar=1:5; DT[, mean(externalVar), by=group]` could return incorrect results rather than a constant (`3` in this example) for each group, [#875](https://github.com/Rdatatable/data.table/issues/875). GForce optimization was being applied incorrectly to the `mean` without realizing that `externalVar` was not a column.

11. `test.data.table()` now passes in non-English R sessions, [#630](https://github.com/Rdatatable/data.table/issues/630) [#3039](https://github.com/Rdatatable/data.table/issues/3039). Each test still checks that the number of warnings and/or errors produced is correct. However, a message is displayed suggesting to restart R with `LANGUAGE=en` in order to test that the text of the warning and/or error messages is as expected, too.

12. Joining a double column in `i` containing, say, 1.3 with an integer column in `x` containing, say, 1 would result in the 1.3 matching to 1, [#2592](https://github.com/Rdatatable/data.table/issues/2592), and joining a factor column to an integer column would match the factor's integers rather than error. The type coercion logic has been revised and strengthened. Many thanks to @MarkusBonsch for reporting and fixing. Joining a character column in `i` to a factor column in `x` is now faster and retains the character column in the result rather than coercing it to factor. Joining an integer column in `i` to a double column in `x` now retains the integer type in the result rather than coercing the integers into the double type. Logical columns may now only be joined to logical columns, other than all-NA columns which are coerced to the matching column's type. All coercions are reported in verbose mode: `options(datatable.verbose=TRUE)`.

13. Attempting to recycle 2 or more items into an existing `list` column now gives the intended helpful error rather than `Internal error: recycle length error not caught earlier.`, [#3543](https://github.com/Rdatatable/data.table/issues/3543). Thanks to @MichaelChirico for finding and reporting.

14. Subassigning using `$<-` to a `data.table` embedded in a list column of a single-row `data.table` could fail, [#3474](https://github.com/Rdatatable/data.table/issues/3474). Note that `$<-` is not recommended; please use `:=` instead, which already worked in this case. Thanks to Jakob Richter for reporting.

15. `rbind` and `rbindlist` of zero-row items now retain (again) the unused levels of any (zero-length) factor columns, [#3508](https://github.com/Rdatatable/data.table/issues/3508). This was a regression in v1.12.2 just for zero-row items. Unused factor levels were already retained for items having `nrow>=1`. Thanks to Gregory Demin for reporting.

16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a regression in v1.12.2. Thanks to Damian Betebenner for reporting.
Also a related segfault when recycling a length-1 factor column, [#3662](https://github.com/Rdatatable/data.table/issues/3662). - -17. `example(":=", local=TRUE)` now works rather than error, [#2972](https://github.com/Rdatatable/data.table/issues/2972). Thanks @vlulla for the report. - -18. `rbind.data.frame` on `IDate` columns changed the column from `integer` to `double`, [#2008](https://github.com/Rdatatable/data.table/issues/2008). Thanks to @rmcgehee for reporting. - -19. `merge.data.table` now retains any custom classes of the first argument, [#1378](https://github.com/Rdatatable/data.table/issues/1378). Thanks to @michaelquinn32 for reopening. - -20. `c`, `seq` and `mean` of `ITime` objects now retain the `ITime` class via new `ITime` methods, [#3628](https://github.com/Rdatatable/data.table/issues/3628). Thanks @UweBlock for reporting. The `cut` and `split` methods for `ITime` have been removed since the default methods work, [#3630](https://github.com/Rdatatable/data.table/pull/3630). - -21. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). - -22. Adding a `list` column using `cbind`, `as.data.table`, or `data.table` now works rather than treating the `list` as if it were a set of columns and introducing an invalid NA column name, [#3471](https://github.com/Rdatatable/data.table/pull/3471). However, please note that using `:=` to add columns is preferred. - - ```R - cbind( data.table(1:2), list(c("a","b"),"a") ) - # V1 V2 NA # v1.12.2 and before - # - # 1: 1 a a - # 2: 2 b a - # - # V1 V2 # v1.12.4+ - # - # 1: 1 a,b - # 2: 2 a - ``` - -23. Incorrect sorting/grouping results due to a bug in Intel's `icc` compiler 2019 (Version 19.0.4.243 Build 20190416) has been worked around thanks to a report and fix by Sebastian Freundt, [#3647](https://github.com/Rdatatable/data.table/issues/3647). Please run `data.table::test.data.table()`. If that passes, your installation does not have the problem. - -24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. - -25. Slight fix to the logic for auto-naming the `by` clause for using a custom function like `evaluate` to now be named `evaluate` instead of the name of the first symbolic argument, [#3758](https://github.com/Rdatatable/data.table/issues/3758). - -26. Column binding of zero column `data.table` will now work as expected, [#3334](https://github.com/Rdatatable/data.table/issues/3334). Thanks to @kzenstratus for the report. - -27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. - -28. From v1.12.0 `between()` and `%between%` interpret missing values in `lower=` or `upper=` as unlimited bounds. A new parameter `NAbounds` has been added to achieve the old behaviour of returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks @cguill95 for reporting. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported too. - -29. `integer64` defined on a subset of a new column would leave "gibberish" on the remaining rows, [#3723](https://github.com/Rdatatable/data.table/issues/3723). 
A bug in `rbindlist` with the same root cause was also fixed, [#1459](https://github.com/Rdatatable/data.table/issues/1459). Thanks @shrektan and @jangorecki for the reports. - -30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. - -31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue. - -32. `x[, round(.SD, 1)]` and similar operations on the whole of `.SD` could return a locked result, incorrectly preventing `:=` on the result, [#2245](https://github.com/Rdatatable/data.table/issues/2245). Thanks @grayskripko for raising. - -33. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), and [#2036](https://github.com/Rdatatable/data.table/issues/2036). Thanks @franknarf1, @MichaelChirico, and @TonyBonen, for the reports. - -34. `DT[, i-1L, with=FALSE]` would misinterpret the minus sign and return an incorrect result, [#2019](https://github.com/Rdatatable/data.table/issues/2109). Thanks @cguill95 for the report. - -35. `DT[id==1, DT2[.SD, on="id"]]` (i.e. joining from `.SD` in `j`) could incorrectly fail in some cases due to `.SD` being locked, [#1926](https://github.com/Rdatatable/data.table/issues/1926), and when updating-on-join with factors [#3559](https://github.com/Rdatatable/data.table/issues/3559) [#2099](https://github.com/Rdatatable/data.table/issues/2099). Thanks @franknarf1 and @Henrik-P for the reports and for diligently tracking use cases for almost 3 years! - -36. `as.IDate.POSIXct` returned `NA` for UTC times before Dec 1901 and after Jan 2038, [#3780](https://github.com/Rdatatable/data.table/issues/3780). Thanks @gschett for the report. - -37. `rbindlist` now returns correct idcols for lists with different length vectors, [#3785](https://github.com/Rdatatable/data.table/issues/3785), [#3786](https://github.com/Rdatatable/data.table/pull/3786). Thanks to @shrektan for the report and fix. - -38. `DT[ , !rep(FALSE, ncol(DT)), with=FALSE]` correctly returns the full table, [#3013](https://github.com/Rdatatable/data.table/issues/3013) and [#2917](https://github.com/Rdatatable/data.table/issues/2917). Thanks @alexnss and @DavidArenburg for the reports. - -39. `shift(x, 0:1, type='lead', give.names=TRUE)` uses `lead` in all returned column names, [#3832](https://github.com/Rdatatable/data.table/issues/3832). Thanks @daynefiler for the report. - -40. Subtracting two `POSIXt` objects by group could lead to incorrect results because the `base` method internally calls `difftime` with `units='auto'`; `data.table` does not notice if the chosen units differ by group and only the last group's `units` attribute was retained, [#3694](https://github.com/Rdatatable/data.table/issues/3694) and [#761](https://github.com/Rdatatable/data.table/issues/761). To surmount this, we now internally force `units='secs'` on all `POSIXt-POSIXt` calls (reported when `verbose=TRUE`); generally we recommend calling `difftime` directly instead. 
Thanks @oliver-oliver and @boethian for the reports. - -41. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), [#2036](https://github.com/Rdatatable/data.table/issues/2036), and [#2946](https://github.com/Rdatatable/data.table/issues/2946). Thanks @franknarf1, @MichaelChirico, @TonyBonen, and Steffen J. (StackOverflow) for the reports. - -42. `DT[...,by={...}]` now handles expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. - -43. `:=` could change a `data.table` creation statement in the body of the function calling it, or a variable in calling scope, [#3890](https://github.com/Rdatatable/data.table/issues/3890). Many thanks to @kirillmayantsev for the detailed reports. - -44. Grouping could create a `malformed factor` and/or segfault when the factors returned by each group did not have identical levels, [#2199](https://github.com/Rdatatable/data.table/issues/2199) and [#2522](https://github.com/Rdatatable/data.table/issues/2522). Thanks to Václav Hausenblas, @franknarf1, @ben519, and @Henrik-P for reporting. - -45. `rbindlist` (and printing a `data.table` with over 100 rows because that uses `rbindlist(head, tail)`) could error with `malformed factor` for unordered factor columns containing a used `NA_character_` level, [#3915](https://github.com/Rdatatable/data.table/issues/3915). This is an unusual input for unordered factors because NA_integer_ is recommended by default in R. Thanks to @sindribaldur for reporting. - -46. Adding a `list` column containing an item of type `list` to a one row `data.table` could fail, [#3626](https://github.com/Rdatatable/data.table/issues/3626). Thanks to Jakob Richter for reporting. - -## NOTES - -1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. - -2. Adding a new column by reference using `set()` on a `data.table` loaded from binary file now give a more helpful error message, [#2996](https://github.com/Rdatatable/data.table/issues/2996). Thanks to Joseph Burling for reporting. - - ``` - This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed - manually (e.g. using structure()). Please run setDT() or alloc.col() on it first (to pre-allocate - space for new columns) before adding new columns by reference to it. - ``` - -3. `setorder` on a superset of a keyed `data.table`'s key now retains its key, [#3456](https://github.com/Rdatatable/data.table/issues/3456). For example, if `a` is the key of `DT`, `setorder(DT, a, -v)` will leave `DT` keyed by `a`. - -4. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. - -5. When loading a `data.table` from disk (e.g. 
with `readRDS`), best practice is to run `setDT()` on the new object to assure it is correctly allocated memory for new column pointers. Barring this, unexpected behavior can follow; for example, if you assign a new column to `DT` from a function `f`, the new columns will only be assigned within `f` and `DT` will be unchanged. The `verbose` messaging in this situation is now more helpful, [#1729](https://github.com/Rdatatable/data.table/issues/1729). Thanks @vspinu for sharing his experience to spur this. - -6. New vignette _Using `.SD` for Data Analysis_, a deep dive into use cases for the `.SD` variable to help illuminate this topic which we've found to be a sticking point for beginning and intermediate `data.table` users, [#3412](https://github.com/Rdatatable/data.table/issues/3412). - -7. Added a note to `?frank` clarifying that ranking is being done according to C sorting (i.e., like `forder`), [#2328](https://github.com/Rdatatable/data.table/issues/2328). Thanks to @cguill95 for the request. - -8. Historically, `dcast` and `melt` were built as enhancements to `reshape2`'s own `dcast`/`melt`. We removed dependency on `reshape2` in v1.9.6 but maintained some backward compatibility. As that package has been superseded since December 2017, we will begin to formally complete the split from `reshape2` by removing some last vestiges. In particular we now warn when redirecting to `reshape2` methods and will later error before ultimately completing the split; see [#3549](https://github.com/Rdatatable/data.table/issues/3549) and [#3633](https://github.com/Rdatatable/data.table/issues/3633). We thank the `reshape2` authors for their original inspiration for these functions, and @ProfFancyPants for testing and reporting regressions in dev which have been fixed before release. - -9. `DT[col]` where `col` is a column containing row numbers of itself to select, now suggests the correct syntax (`DT[(col)]` or `DT[DT$col]`), [#697](https://github.com/Rdatatable/data.table/issues/697). This expands the message introduced in [#1884](https://github.com/Rdatatable/data.table/issues/1884) for the case where `col` is type `logical` and `DT[col==TRUE]` is suggested. - -10. The `datatable.old.unique.by.key` option has been warning for 1 year that it is deprecated: `... Please stop using it and pass by=key(DT) instead for clarity ...`. This warning is now upgraded to error as per the schedule in note 10 of v1.11.0 (May 2018), and note 1 of v1.9.8 (Nov 2016). In June 2020 the option will be removed. - -11. We intend to deprecate the `datatable.nomatch` option, [more info](https://github.com/Rdatatable/data.table/pull/3578/files). A message is now printed upon use of the option (once per session) as a first step. It asks you to please stop using the option and to pass `nomatch=NULL` explicitly if you require inner join. Outer join (`nomatch=NA`) has always been the default because it is safer; it does not drop missing data silently. The problem is that the option is global; i.e., if a user changes the default using this option for their own use, that can change the behavior of joins inside packages that use `data.table` too. This is the only `data.table` option with this concern. - -12. The test suite of 9k tests now runs with three R options on: `warnPartialMatchArgs`, `warnPartialMatchAttr`, and `warnPartialMatchDollar`. 
This ensures that we don't rely on partial argument matching in internal code, for robustness and efficiency, and so that users can turn these options on for their code in production, [#3664](https://github.com/Rdatatable/data.table/issues/3664). Thanks to Vijay Lulla for the suggestion, and Michael Chirico for fixing 48 internal calls to `attr()` which were missing `exact=TRUE`, for example. Thanks to R-core for adding these options to R 2.6.0 (Oct 2007). - -13. `test.data.table()` could fail if the `datatable.integer64` user option was set, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for reporting. - -14. The warning message when using `keyby=` together with `:=` is clearer, [#2763](https://github.com/Rdatatable/data.table/issues/2763). Thanks to @eliocamp. - -15. `first` and `last` gain an explicit `n=1L` argument so that it's clear the default is 1, and their almost identical manual pages have been merged into one. - -16. Rolling functions (`?froll`) coerce `logical` input to `numeric` (instead of failing) to mimic the behavior of `integer` input. - -17. The warning message when using `strptime` in `j` has been improved, [#2068](https://github.com/Rdatatable/data.table/issues/2068). Thanks to @tdhock for the report. - -18. Added a note to `?setkey` clarifying that `setkey` always uses C-locale sorting (as has been noted in `?setorder`). Thanks @JBreidaks for the report in [#2114](https://github.com/Rdatatable/data.table/issues/2114). - -19. `hour()`/`minute()`/`second()` are much faster for `ITime` input, [#3518](https://github.com/Rdatatable/data.table/issues/3158). - -20. New alias `setalloccol` for `alloc.col`, [#3475](https://github.com/Rdatatable/data.table/issues/3475). For consistency with `set*` prefixes for functions that operate in-place (like `setkey`, `setorder`, etc.). `alloc.col` is not going to be deprecated but we recommend using `setalloccol`. - -21. `dcast` no longer emits a message when `value.var` is missing but `fun.aggregate` is explicitly set to `length` (since `value.var` is arbitrary in this case), [#2980](https://github.com/Rdatatable/data.table/issues/2980). - -22. Optimized `mean` of `integer` columns no longer warns about a coercion to numeric, [#986](https://github.com/Rdatatable/data.table/issues/986). Thanks @dgrtwo for his [YouTube tutorial at 3:01](https://youtu.be/AmE4LXPQErM?t=175) where the warning occurs. - -23. Using `first` and `last` function on `POSIXct` object no longer loads `xts` namespace, [#3857](https://github.com/Rdatatable/data.table/issues/3857). `first` on empty `data.table` returns empty `data.table` now [#3858](https://github.com/Rdatatable/data.table/issues/3858). - -24. Added some clarifying details about what happens when a shell command is used in `fread`, [#3877](https://github.com/Rdatatable/data.table/issues/3877). Thanks Brian for the StackOverflow question which highlighted the lack of explanation here. - -25. We continue to encourage packages to `Import` rather than `Depend` on `data.table`, [#3076](https://github.com/Rdatatable/data.table/issues/3076). To prevent the growth rate in new packages using `Depend`, we have requested that CRAN apply a small patch we provided to prevent new submissions using `Depend`. If this is accepted, the error under `--as-cran` will be as follows. The existing 73 packages using `Depend` will continue to pass OK until they next update, at which point they will be required to change from `Depend` to `Import`. - - ``` - R CMD check --as-cran - ... 
- * checking package dependencies ... ERROR - - data.table should be in Imports not Depends. Please contact its - maintainer for more information. - ``` - - -# data.table [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) - -## NEW FEATURES - -1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. In rare cases where you need to recycle a length>1 vector, please use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 774 CRAN and Bioconductor packages using `data.table` were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead, [#3310](https://github.com/Rdatatable/data.table/pull/3310). Upon agreement we went ahead. Many thanks to all those maintainers for already updating on CRAN, [#3347](https://github.com/Rdatatable/data.table/pull/3347). - -2. `foverlaps` now supports `type="equal"`, [#3416](https://github.com/Rdatatable/data.table/issues/3416) and part of [#3002](https://github.com/Rdatatable/data.table/issues/3002). - -3. The number of logical CPUs used by default has been reduced from 100% to 50%. The previous 100% default was reported to cause significant slow downs when other non-trivial processes were also running, [#3395](https://github.com/Rdatatable/data.table/issues/3395) [#3298](https://github.com/Rdatatable/data.table/issues/3298). Two new optional environment variables (`R_DATATABLE_NUM_PROCS_PERCENT` & `R_DATATABLE_NUM_THREADS`) control this default. `setDTthreads()` gains `percent=` and `?setDTthreads` has been significantly revised. The output of `getDTthreads(verbose=TRUE)` has been expanded. The environment variable `OMP_THREAD_LIMIT` is now respected ([#3300](https://github.com/Rdatatable/data.table/issues/3300)) in addition to `OMP_NUM_THREADS` as before. - -4. `rbind` and `rbindlist` now retain the position of duplicate column names rather than grouping them together [#3373](https://github.com/Rdatatable/data.table/issues/3373), fill length 0 columns (including NULL) with NA with warning [#1871](https://github.com/Rdatatable/data.table/issues/1871), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524). Thanks to Kun Ren for the requests which arose when parsing JSON. - -5. `rbindlist`'s `use.names=` default has changed from `FALSE` to `"check"`. This emits a message if the column names of each item are not identical and then proceeds as if `use.names=FALSE` for backwards compatibility; i.e., bind by column position not by column name. The `rbind` method for `data.table` already sets `use.names=TRUE` so this change affects `rbindlist` only and not `rbind.data.table`. To stack differently named columns together silently (the previous default behavior of `rbindlist`), it is now necessary to specify `use.names=FALSE` for clarity to readers of your code. Thanks to Clayton Stanley who first raised the issue [here](https://lists.r-forge.r-project.org/pipermail/datatable-help/2014-April/002480.html). To aid pinpointing the calls to `rbindlist` that need attention, the message can be turned to error using `options(datatable.rbindlist.check="error")`. 
This option also accepts `"warning"`, `"message"` and `"none"`. In this release the message is suppressed for default column names (`"V[0-9]+"`); the next release will emit the message for those too. In 6 months the default will be upgraded from message to warning. There are two slightly different messages. They are helpful, include context and point to this news item : - - ``` - Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with - NA (NULL for list columns), or use.names=FALSE to ignore column names. - See news item 5 in v1.12.2 for options to control this message. - - Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE - to match by column name, or use.names=FALSE to ignore column names. - See news item 5 in v1.12.2 for options to control this message. - ``` - -6. `fread` gains `keepLeadingZeros`, [#2999](https://github.com/Rdatatable/data.table/issues/2999). By default `FALSE` so that, as before, a field containing `001` is interpreted as the integer 1, otherwise the character string `"001"`. The default may be changed using `options(datatable.keepLeadingZeros=TRUE)`. Many thanks to @marc-outins for the PR. - -## BUG FIXES - -1. `rbindlist()` of a malformed factor which is missing a levels attribute is now a helpful error rather than a cryptic error about `STRING_ELT`, [#3315](https://github.com/Rdatatable/data.table/issues/3315). Thanks to Michael Chirico for reporting. - -2. Forgetting `type=` in `shift(val, "lead")` would segfault, [#3354](https://github.com/Rdatatable/data.table/issues/3354). A helpful error is now produced to indicate `"lead"` is being passed to `n=` rather than the intended `type=` argument. Thanks to @SymbolixAU for reporting. - -3. The default print output (top 5 and bottom 5 rows) when ncol>255 could display the columns in the wrong order, [#3306](https://github.com/Rdatatable/data.table/issues/3306). Thanks to Kun Ren for reporting. - -4. Grouping by unusual column names such as `by='string_with_\\'` and `keyby="x y"` could fail, [#3319](https://github.com/Rdatatable/data.table/issues/3319) [#3378](https://github.com/Rdatatable/data.table/issues/3378). Thanks to @HughParsonage for reporting and @MichaelChirico for the fixes. - -5. `foverlaps()` could return incorrect results for `POSIXct <= 1970-01-01`, [#3349](https://github.com/Rdatatable/data.table/issues/3349). Thanks to @lux5 for reporting. - -6. `dcast.data.table` now handles functions passed to `fun.aggregate=` via a variable; e.g., `funs <- list(sum, mean); dcast(..., fun.aggregate=funs`, [#1974](https://github.com/Rdatatable/data.table/issues/1974) [#1369](https://github.com/Rdatatable/data.table/issues/1369) [#2064](https://github.com/Rdatatable/data.table/issues/2064) [#2949](https://github.com/Rdatatable/data.table/issues/2949). Thanks to @sunbee, @Ping2016, @smidelius and @d0rg0ld for reporting. - -7. Some non-equijoin cases could segfault, [#3401](https://github.com/Rdatatable/data.table/issues/3401). Thanks to @Gayyam for reporting. - -8. `dcast.data.table` could sort rows containing `NA` incorrectly, [#2202](https://github.com/Rdatatable/data.table/issues/2202). Thanks to @Galileo-Galilei for the report. - -9. 
Sorting, grouping and finding unique values of a numeric column containing at most one finite value (such as `c(Inf,0,-Inf)`) could return incorrect results, [#3372](https://github.com/Rdatatable/data.table/issues/3372) [#3381](https://github.com/Rdatatable/data.table/issues/3381); e.g., `data.table(A=c(Inf,0,-Inf), V=1:3)[,sum(V),by=A]` would treat the 3 rows as one group. This was a regression in 1.12.0. Thanks to Nicolas Ampuero for reporting. - -10. `:=` with quoted expression and dot alias now works as expected, [#3425](https://github.com/Rdatatable/data.table/pull/3425). Thanks to @franknarf1 for raising and @jangorecki for the PR. - -11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause. - -12. `rbind` and `rbindlist(..., use.names=TRUE)` with over 255 columns could return the columns in a random order, [#3373](https://github.com/Rdatatable/data.table/issues/3373). The contents and name of each column was correct but the order that the columns appeared in the result might not have matched the original input. - -13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), and support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819). - -14. `NULL` columns are caught and error appropriately rather than segfault in some cases, [#2303](https://github.com/Rdatatable/data.table/issues/2303) [#2305](https://github.com/Rdatatable/data.table/issues/2305). Thanks to Hugh Parsonage and @franknarf1 for reporting. - -15. `melt` would error with 'factor malformed' or segfault in the presence of duplicate column names, [#1754](https://github.com/Rdatatable/data.table/issues/1754). Many thanks to @franknarf1, William Marble, wligtenberg and Toby Dylan Hocking for reproducible examples. All examples have been added to the test suite. - -16. Removing a column from a null (0-column) data.table is now a (standard and simpler) warning rather than error, [#2335](https://github.com/Rdatatable/data.table/issues/2335). It is no longer an error to add a column to a null (0-column) data.table. - -17. Non-UTF8 strings were not always sorted correctly on Windows (a regression in v1.12.0), [#3397](https://github.com/Rdatatable/data.table/issues/3397) [#3451](https://github.com/Rdatatable/data.table/issues/3451). Many thanks to @shrektan for reporting and fixing. - -18. `cbind` with a null (0-column) `data.table` now works as expected, [#3445](https://github.com/Rdatatable/data.table/issues/3445). Thanks to @mb706 for reporting. - -19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. - -## NOTES - -1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. 
We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. - -2. v1.12.0 checked itself on loading using `tools::checkMD5sums("data.table")` but this check failed under the `packrat` package manager on Windows because `packrat` appears to modify the DESCRIPTION file of packages it has snapshot, [#3329](https://github.com/Rdatatable/data.table/issues/3329). This check is now removed. The `CdllVersion` check was introduced after the `checkMD5sums()` attempt and is better; e.g., reliable on all platforms. - -3. As promised in new feature 6 of v1.11.6 Sep 2018 (see below in this news file), the `datatable.CJ.names` option's default is now `TRUE`. In v1.13.0 it will be removed. - -4. Travis CI gains OSX using homebrew llvm for OpenMP support, [#3326](https://github.com/Rdatatable/data.table/issues/3326). Thanks @marcusklik for the PR. - -5. Calling `data.table:::print.data.table()` directly (i.e. bypassing method dispatch by using 3 colons) and passing it a 0-column `data.frame` (not `data.table`) now works, [#3363](https://github.com/Rdatatable/data.table/pull/3363). Thanks @heavywatal for the PR. - -6. v1.12.0 did not compile on Solaris 10 using Oracle Developer Studio 12.6, [#3285](https://github.com/Rdatatable/data.table/issues/3285). Many thanks to Prof Ripley for providing and testing a patch. For future reference and other package developers, a `const` variable should not be passed to OpenMP's `num_threads()` directive otherwise `left operand must be modifiable lvalue` occurs. This appears to be a compiler bug which is why the specific versions are mentioned in this note. - -7. `foverlaps` provides clearer error messages w.r.t. factor and POSIXct interval columns, [#2645](https://github.com/Rdatatable/data.table/issues/2645) [#3007](https://github.com/Rdatatable/data.table/issues/3007) [#1143](https://github.com/Rdatatable/data.table/issues/1143). Thanks to @sritchie73, @msummersgill and @DavidArenburg for the reports. - -8. `unique(DT)` checks up-front the types of all the columns and will fail if any column is type `list` even though those `list` columns may not be needed to establish uniqueness. Use `unique(DT, by=...)` to specify columns that are not type `list`. v1.11.8 and before would also correctly fail with the same error, but not when uniqueness had been established in prior columns: it would stop early, not look at the `list` column and return the correct result. Checking up-front was necessary for some internal optimizations and it's probably best to be explicit anyway. Thanks to James Lamb for reporting, [#3332](https://github.com/Rdatatable/data.table/issues/3332). The error message has been embellished : - - ``` - Column 2 of by= (2) is type 'list', not yet supported. Please use the by= argument to specify - columns with types that are supported. - ``` - -9. Reminder that note 11 in v1.11.0 (May 2018) warned that `set2key()` and `key2()` will be removed in May 2019. They have been warning since v1.9.8 (Nov 2016) and their warnings were upgraded to errors in v1.11.0 (May 2018). When they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental'. - -10. The `key(DT)<-` form of `setkey()` has been warning since at least 2012 to use `setkey()`. 
The warning is now stronger: `key(x)<-value is deprecated and not supported. Please change to use setkey().`. This warning will be upgraded to error in one year. - - -# data.table v1.12.0 (13 Jan 2019) - -## NEW FEATURES - -1. `setDTthreads()` gains `restore_after_fork=`, [#2885](https://github.com/Rdatatable/data.table/issues/2885). The default `NULL` leaves the internal option unchanged which by default is `TRUE`. `data.table` has always switched to single-threaded mode on fork. It used to restore multithreading after a fork too but problems were reported on Mac and Intel OpenMP library (see 1.10.4 notes below). We are now trying again thanks to suggestions and success reported by Kun Ren and Mark Klik in package `fst`. If you experience problems with multithreading after a fork, please restart R and call `setDTthreads(restore_after_fork=FALSE)`. - -2. Subsetting, ordering and grouping now use more parallelism. See benchmarks [here](https://h2oai.github.io/db-benchmark/) and Matt Dowle's presentation in October 2018 on YouTube [here](https://youtu.be/Ddr8N9STSuI). These internal changes gave rise to 4 regressions which were found before release thanks to Kun Ren, [#3211](https://github.com/Rdatatable/data.table/issues/3211). He kindly volunteers to 'go-first' and runs data.table through his production systems before release. We are looking for a 'go-second' volunteer please. A request to test before release was tweeted on 17 Dec [here](https://twitter.com/MattDowle/status/1074746218645938176). As usual, all CRAN and Bioconductor packages using data.table (currently 750) have been tested against this release, [#3233](https://github.com/Rdatatable/data.table/issues/3233). There are now 8,000 tests in 13,000 lines of test code; more lines of test code than there is code. Overall coverage has increased to 94% thanks to Michael Chirico. - -3. New `frollmean` has been added by Jan Gorecki to calculate _rolling mean_, see `?froll` for documentation. Function name and arguments are experimental. Related to [#2778](https://github.com/Rdatatable/data.table/issues/2778) (and [#624](https://github.com/Rdatatable/data.table/issues/624), [#626](https://github.com/Rdatatable/data.table/issues/626), [#1855](https://github.com/Rdatatable/data.table/issues/1855)). Other rolling statistics will follow. - -4. `fread()` can now read a remote compressed file in one step; `fread("https://domain.org/file.csv.bz2")`. The `file=` argument now supports `.gz` and `.bz2` too; i.e. `fread(file="file.csv.gz")` works now where only `fread("file.csv.gz")` worked in 1.11.8. - -5. `nomatch=NULL` now does the same as `nomatch=0L` in both `DT[...]` and `foverlaps()`; i.e. discards missing values silently (inner join). The default is still `nomatch=NA` (outer join) for statistical safety so that missing values are retained by default. After several years have elapsed, we will start to deprecate `0L`; please start using `NULL`. In future `nomatch=.(0)` (note that `.()` creates a `list` type and is different to `nomatch=0`) will fill with `0` to save replacing `NA` with `0` afterwards, [#857](https://github.com/Rdatatable/data.table/issues/857). - -6. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR. - -7. 
`NA` in `between()` and `%between%`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than `NA`. This is now documented. - -8. `shift()` now interprets negative values of `n` to mean the opposite `type=`, [#1708](https://github.com/Rdatatable/data.table/issues/1708). When `give.names=TRUE` the result is named using a positive `n` with the appropriate `type=`. Alternatively, a new `type="shift"` names the result using a signed `n` and constant type. - - ```R - shift(x, n=-5:5, give.names=TRUE) => "_lead_5" ... "_lag_5" - shift(x, n=-5:5, type="shift", give.names=TRUE) => "_shift_-5" ... "_shift_5" - ``` - -9. `fwrite()` now accepts `matrix`, [#2613](https://github.com/Rdatatable/data.table/issues/2613). Thanks to Michael Chirico for the suggestion and Felipe Parages for implementing. For now matrix input is converted to data.table (which can be costly) before writing. - -10. `fread()` and `fwrite()` can now handle file names in native and UTF-8 encoding, [#3078](https://github.com/Rdatatable/data.table/issues/3078). Thanks to Daniel Possenriede (@dpprdan) for reporting and fixing. - -11. `DT[i]` and `DT[i,cols]` now call internal parallel subsetting code, [#2951](https://github.com/Rdatatable/data.table/issues/2951). Subsetting is significantly faster (as are many other operations) with factor columns rather than character. - - ```R - N = 2e8 # 4GB data on 4-core CPU with 16GB RAM - DT = data.table(ID = sample(LETTERS,N,TRUE), - V1 = sample(5,N,TRUE), - V2 = runif(N)) - w = which(DT$V1 > 3) # select 40% of rows - # v1.12.0 v1.11.8 - system.time(DT[w]) # 0.8s 2.6s - DT[, ID := as.factor(ID)] - system.time(DT[w]) # 0.4s 2.3s - system.time(DT[w, c("ID","V2")]) # 0.3s 1.9s - ``` - -12. `DT[..., .SDcols=]` now accepts `patterns()`; e.g. `DT[..., .SDcols=patterns("^V")]`, for filtering columns according to a pattern (as in `melt.data.table`), [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples. - -13. `split` data.table method will now preserve attributes, closes [#2047](https://github.com/Rdatatable/data.table/issues/2047). Thanks to @caneff for reporting. - -14. `DT[i,j]` now retains user-defined and inherited attributes, [#995](https://github.com/Rdatatable/data.table/issues/995); e.g. - - ```R - attr(datasets::BOD,"reference") # "A1.4, p. 270" - attr(as.data.table(datasets::BOD)[2],"reference") # was NULL now "A1.4, p. 270" - ``` - - If a superclass defines attributes that may not be valid after a `[` subset then the superclass should implement its own `[` method to manage those after calling `NextMethod()`. - -## BUG FIXES - -1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. - -2. Column names that look like expressions (e.g. `"a<=colB"`) caused an error when used in `on=` even when wrapped with backticks, [#3092](https://github.com/Rdatatable/data.table/issues/3092). Additionally, `on=` now supports white spaces around operators; e.g. `on = "colA == colB"`. Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. - -3. Unmatched `patterns` in `measure.vars` fail early and with feedback, [#3106](https://github.com/Rdatatable/data.table/issues/3106). - -4. 
`fread(..., skip=)` now skips non-standard `\r` and `\n\r` line endings properly again, [#3006](https://github.com/Rdatatable/data.table/issues/3006). Standard line endings (`\n` Linux/Mac and `\r\n` Windows) were skipped ok. Thanks to @brattono and @tbrycekelly for providing reproducible examples, and @st-pasha for fixing. - -5. `fread(..., colClasses=)` could return a corrupted result when a lower type was requested for one or more columns (e.g. reading "3.14" as integer), [#2922](https://github.com/Rdatatable/data.table/issues/2922) [#2863](https://github.com/Rdatatable/data.table/issues/2863) [#3143](https://github.com/Rdatatable/data.table/issues/3143). It now ignores the request as documented and the helpful message in verbose mode is upgraded to warning. In future, coercing to a lower type might be supported (with warning if any accuracy is lost). `"NULL"` is recognized again in both vector and list mode; e.g. `colClasses=c("integer","NULL","integer")` and `colClasses=list(NULL=2, integer=10:40)`. Thanks to Arun Srinivasan, Kun Ren, Henri Ståhl and @kszela24 for reporting. - -6. `cube()` will now produce expected order of results, [#3179](https://github.com/Rdatatable/data.table/issues/3179). Thanks to @Henrik-P for reporting. - -7. `groupingsets()` groups by empty column set and constant value in `j`, [#3173](https://github.com/Rdatatable/data.table/issues/3173). - -8. `split.data.table()` failed if `DT` had a factor column named `"x"`, [#3151](https://github.com/Rdatatable/data.table/issues/3151). Thanks to @tdeenes for reporting and fixing. - -9. `fsetequal` now handles properly datasets having last column a character, closes [#2318](https://github.com/Rdatatable/data.table/issues/2318). Thanks to @pschil and @franknarf1 for reporting. - -10. `DT[..., .SDcols=integer(0L)]` could fail, [#3185](https://github.com/Rdatatable/data.table/issues/3185). An empty `data.table` is now returned correctly. - -11. `as.data.table.default` method will now always copy its input, closes [#3230](https://github.com/Rdatatable/data.table/issues/3230). Thanks to @NikdAK for reporting. - -12. `DT[..., .SDcols=integer()]` failed with `.SDcols is numeric but has both +ve and -ve indices`, [#1789](https://github.com/Rdatatable/data.table/issues/1789) and [#3185](https://github.com/Rdatatable/data.table/issues/3185). It now functions as `.SDcols=character()` has done and creates an empty `.SD`. Thanks to Gabor Grothendieck and Hugh Parsonage for reporting. A related issue with empty `.SDcols` was fixed in development before release thanks to Kun Ren's testing, [#3211](https://github.com/Rdatatable/data.table/issues/3211). - -13. Multithreaded stability should be much improved with R 3.5+. Many thanks to Luke Tierney for pinpointing a memory issue with package `constellation` caused by `data.table` and his advice, [#3165](https://github.com/Rdatatable/data.table/issues/3165). Luke also added an extra check to R-devel when compiled with `--enable-strict-barrier`. The test suite is run through latest daily R-devel after every commit as usual, but now with `--enable-strict-barrier` on too via GitLab CI ("Extra" badge on the `data.table` homepage) thanks to Jan Gorecki. - -14. Fixed an edge-case bug of platform-dependent output of `strtoi("", base = 2L)` on which `groupingsets` had relied, [#3267](https://github.com/Rdatatable/data.table/issues/3267). - -## NOTES - -1. When data.table loads it now checks its DLL version against the version of its R level code. 
This is to detect installation issues on Windows when i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. - -2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. - -3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR. - -4. `update.dev.pkg()` gains `type=` to specify if update should be made from binaries, sources or both. [#3148](https://github.com/Rdatatable/data.table/issues/3148). Thanks to Reino Bruner for the detailed suggestions. - -5. `setDT()` improves feedback when passed a ragged list (i.e. where all columns in the list are not the same length), [#3121](https://github.com/Rdatatable/data.table/issues/3121). Thanks @chuk-yong for highlighting. - -6. The one and only usage of `UNPROTECT_PTR()` has been removed, [#3232](https://github.com/Rdatatable/data.table/issues/3232). Thanks to Tomas Kalibera's investigation and advice here: https://developer.r-project.org/Blog/public/2018/12/10/unprotecting-by-value/index.html - - -# data.table v1.11.8 (30 Sep 2018) - -## NEW FEATURES - -1. `fread()` can now read `.gz` and `.bz2` files directly: `fread("file.csv.gz")`, [#717](https://github.com/Rdatatable/data.table/issues/717) [#3058](https://github.com/Rdatatable/data.table/issues/3058). It uses `R.utils::decompressFile` to decompress to a `tempfile()` which is then read by `fread()` in the usual way. For greater speed on large-RAM servers, it is recommended to use ramdisk for temporary files by setting `TMPDIR` to `/dev/shm` before starting R; see `?tempdir`. The decompressed temporary file is removed as soon as `fread` completes even if there is an error reading the file. Reading a remote compressed file in one step will be supported in the next version; e.g. `fread("https://domain.org/file.csv.bz2")`. - -## BUG FIXES - -1. Joining two keyed tables using `on=` to columns not forming a leading subset of `key(i)` could result in an invalidly keyed result, [#3061](https://github.com/Rdatatable/data.table/issues/3061). Subsequent queries on the result could then return incorrect results. A warning `longer object length is not a multiple of shorter object length` could also occur. Thanks to @renkun-ken for reporting and the PR. - -2. `keyby=` on columns for which an index exists now uses the index (new feature 7 in v1.11.6 below) but if an `i` subset is present in the same query then it could segfault, [#3062](https://github.com/Rdatatable/data.table/issues/3062). Again thanks to @renkun-ken for reporting. - -3. 
Assigning an out-of-range integer to an item in a factor column (a rare operation) correctly created an `NA` in that spot with warning, but now no longer also corrupts the variable being assigned, [#2984](https://github.com/Rdatatable/data.table/issues/2984). Thanks to @radfordneal for reporting and @MarkusBonsch for fixing. Assigning a string which is missing from the factor levels continues to automatically append the string to the factor levels. - -4. Assigning a sequence to a column using base R methods (e.g. `DT[["foo"]] = 1:10`) could cause subsetting to fail with `Internal error in subset.c: column is an ALTREP vector`, [#3051](https://github.com/Rdatatable/data.table/issues/3051). Thanks to Michel Lang for reporting. - -5. `as.data.table` `matrix` method now properly handles rownames for 0 column data.table output. Thanks @mllg for reporting. Closes [#3149](https://github.com/Rdatatable/data.table/issues/3149). - -## NOTES - -1. The test suite now turns on R's new _R_CHECK_LENGTH_1_LOGIC2_ to catch when internal use of `&&` or `||` encounter arguments of length more than one. Thanks to Hugh Parsonage for implementing and fixing the problems caught by this. - -2. Some namespace changes have been made with respect to melt, dcast and xts. No change is expected but if you do have any trouble, please file an issue. - -3. `split.data.table` was exported in v1.11.6 in addition to being registered using `S3method(split, data.table)`. The export has been removed again. It had been added because a user said they found it difficult to find, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But S3 methods are not normally exported explicitly by packages. The proper way to access the `split.data.table` method is to call `split(DT)` where `DT` is a `data.table`. The generic (`base::split` in this case) then dispatches to the `split.data.table` method. v1.11.6 was not on CRAN very long (1 week) so we think it's better to revert this change quickly. To know what methods exist, R provides the `methods()` function. - - ```R - methods(split) # all the methods for the split generic - methods(class="data.table") # all the generics that data.table has a method for (47 currently) - ``` - - -# data.table v1.11.6 (19 Sep 2018) - -## NEW FEATURES - -1. For convenience when some of the files in `fnams` are empty in `rbindlist(lapply(fnams,fread))`, `fread` now reads empty input as a null-data.table with warning rather than error, [#2898](https://github.com/Rdatatable/data.table/issues/2898). For consistency, `fwrite(data.table(NULL))` now creates an empty file and warns instead of error, too. - -2. `setcolorder(DT)` without further arguments now defaults to moving the key columns to be first, [#2895](https://github.com/Rdatatable/data.table/issues/2895). Thanks to @jsams for the PR. - -3. Attempting to subset on `col` when the column is actually called `Col` will still error, but the error message will helpfully suggest similarly-spelled columns, [#2887](https://github.com/Rdatatable/data.table/issues/2887). This is experimental, applies just to `i` currently, and we look forward to feedback. Thanks to Michael Chirico for the suggestion and PR. - -4. `fread()` has always accepted literal data; e.g. `fread("A,B\n1,2\n3,4")`. It now gains explicit `text=`; e.g. `fread(text="A,B\n1,2\n3,4")`. Unlike the first general purpose `input=` argument, the `text=` argument accepts multi-line input; e.g. `fread(text=c("A,B","1,2","3,4"))`, [#1423](https://github.com/Rdatatable/data.table/issues/1423). 
Thanks to Douglas Clark for the request and Hugh Parsonage for the PR. - -5. `fread()` has always accepted system commands; e.g. `fread("grep blah file.txt")`. It now gains explicit `cmd=`; e.g. `fread(cmd="grep blah file.txt")`. Further, if and only if `input=` is a system command and a variable was used to hold that command (`fread(someCommand)` not `fread("grep blah file.txt")`) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app is not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename, which `fread()` would run, and that would be a problem. If the app is not running in a protected environment (e.g. the app is running as root) then this could do damage or obtain data you did not intend. Public-facing apps should be running with limited operating system permissions so that any breach from any source is contained. We agree with [Linus Torvalds' advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or APIs that could have a malicious user then there is no risk, but we can't distinguish you so we have to inform everyone. Please change to `fread(cmd=...)` at your leisure. The new message can be suppressed with `options(datatable.fread.input.cmd.message=FALSE)`. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://datascienceatthecommandline.com/). A `warning()` is too strong because best practice for production systems is to set `options(warn=2)` to tolerate no warnings. Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. - -6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. The current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x`; this is more akin to a bug fix due to the inconsistency with `data.table()`. - -7. If an appropriate index exists, `keyby=` will now use it. For example, given `setindex(DT,colA,colB)`, both `DT[,j,keyby=colA]` (a leading subset of the index columns) and `DT[,j,keyby=.(colA,colB)]` will use the index, but not `DT[,j,keyby=.(colB,colA)]`. The option `options(datatable.use.index=FALSE)` will turn this feature off.
Please always use `keyby=` unless you wish to retain the order of groups by first-appearance order (in which case use `by=`). Also, both `keyby=` and `by=` already used the key where possible but are now faster when using just the first column of the key. As usual, setting `verbose=TRUE` either per-query or globally using `options(datatable.verbose=TRUE)` will report what's being done internally. - -## BUG FIXES - -1. `fread` now respects the order of columns passed to `select=` when column numbers are used, [#2986](https://github.com/Rdatatable/data.table/issues/2986). It already respected the order when column names are used. Thanks @privefl for raising the issue. - -2. `gmin` and `gmax` no longer fail on _ordered_ factors, [#1947](https://github.com/Rdatatable/data.table/issues/1947). Thanks to @mcieslik-mctp for identifying and @mbacou for the nudge. - -3. `as.ITime.character` now properly handles NA when attempting to detect the format of non-NA values in vector. Thanks @polyjian for reporting, closes [#2940](https://github.com/Rdatatable/data.table/issues/2940). - -4. `as.matrix(DT, rownames="id")` now works when `DT` has a single row, [#2930](https://github.com/Rdatatable/data.table/issues/2930). Thanks to @malcook for reporting and @sritchie73 for fixing. The root cause was the dual meaning of the `rownames=` argument: i) a single column name/number (most common), or ii) rowname values length 1 for the single row. For clarity and safety, `rownames.value=` has been added. Old usage (i.e. `length(rownames)>1`) continues to work for now but will issue a warning in a future release, and then error in a release after that. - -5. Fixed regression in v1.11.0 (May 2018) caused by PR [#2389](https://github.com/Rdatatable/data.table/pull/2389) which introduced partial key retainment on `:=` assigns. This broke the joining logic that assumed implicitly that assigning always drops keys completely. Consequently, join and subset results could be wrong when matching character to factor columns with existing keys, [#2881](https://github.com/Rdatatable/data.table/issues/2881). Thanks to @ddong63 for reporting and to @MarkusBonsch for fixing. Missing test added to ensure this doesn't arise again. - -6. `as.IDate.numeric` no longer ignores "origin", [#2880](https://github.com/Rdatatable/data.table/issues/2880). Thanks to David Arenburg for reporting and fixing. - -7. `as.ITime.times` was rounding fractional seconds while other methods were truncating, [#2870](https://github.com/Rdatatable/data.table/issues/2870). The `as.ITime` method gains `ms=` taking `"truncate"` (default), `"nearest"` and `"ceil"`. Thanks to @rossholmberg for reporting and Michael Chirico for fixing. - -8. `fwrite()` now writes POSIXct dates after 2038 correctly, [#2995](https://github.com/Rdatatable/data.table/issues/2995). Thanks to Manfred Zorn for reporting and Philippe Chataignon for the PR fixing it. - -9. `fsetequal` gains the `all` argument to make it consistent with the other set operator functions `funion`, `fsetdiff` and `fintersect` [#2968](https://github.com/Rdatatable/data.table/issues/2968). When `all = FALSE` `fsetequal` will treat rows as elements in a set when checking whether two `data.tables` are equal (i.e. duplicate rows will be ignored). For now the default value is `all = TRUE` for backwards compatibility, but this will be changed to `all = FALSE` in a future release to make it consistent with the other set operation functions. Thanks to @franknarf1 for reporting and @sritchie73 for fixing. - -10. 
`fintersect` failed on tables with a column called `y`, [#3034](https://github.com/Rdatatable/data.table/issues/3034). Thanks to Maxim Nazarov for reporting. - -11. Compilation failed on AIX because the NAN and INFINITY macro definitions on AIX are not constant literals, [#3043](https://github.com/Rdatatable/data.table/pull/3043). Thanks to Ayappan for reporting and fixing. - -12. The introduction of altrep in R 3.5.0 caused some performance regressions of about 20% in some cases, [#2962](https://github.com/Rdatatable/data.table/issues/2962). Investigating this led to some improvements to grouping which are faster than before R 3.5.0 in some cases. Thanks to Nikolay S. for reporting. The work to accommodate altrep is not complete but it is improved, and upgrading to this release is highly recommended. - -13. Fixed 7 memory faults thanks to CRAN's [`rchk`](https://github.com/kalibera/rchk) tool by Tomas Kalibera, [#3033](https://github.com/Rdatatable/data.table/pull/3033). - -## NOTES - -1. The type coercion warning message has been improved, [#2989](https://github.com/Rdatatable/data.table/pull/2989). Thanks to @sarahbeeysian on Twitter for highlighting. For example, given the following statements: - - ```R - DT = data.table(id=1:3) - DT[2, id:="foo"] - ``` - - the warning message has changed from : - - ``` - Coerced character RHS to integer to match the column's type. Either change the target column - ['id'] to character first (by creating a new character vector length 3 (nrows of entire table) and - assign that; i.e. 'replace' column), or coerce RHS to integer (e.g. 1L, NA_[real|integer]_, as.*, - etc) to make your intent clear and for speed. Or, set the column type correctly up front when you - create the table and stick to it, please. - ``` - - to : - - ``` - Coerced character RHS to integer to match the type of the target column (column 1 named 'id'). If - the target column's type integer is correct, it's best for efficiency to avoid the coercion and - create the RHS as type integer. To achieve that consider the L postfix: typeof(0L) vs typeof(0), - and typeof(NA) vs typeof(NA_integer_) vs typeof(NA_real_). Wrapping the RHS with as.integer() will - avoid this warning but still perform the coercion. If the target column's type is not correct, it - is best to revisit where the DT was created and fix the column type there; e.g., by using - colClasses= in fread(). Otherwise, you can change the column type now by plonking a new column (of - the desired type) over the top of it; e.g. DT[, `id`:=as.character(`id`)]. If the RHS of := has - nrow(DT) elements then the assignment is called a column plonk and is the way to change a column's - type. Column types can be observed with sapply(DT,typeof). - ``` - - Further, if a coercion from double to integer is performed, fractional data such as 3.14 is now detected and the truncation to 3 is warned about if and only if truncation has occurred. - - ```R - DT = data.table(v=1:3) - DT[2, v:=3.14] - Warning message: - Coerced double RHS to integer to match the type of the target column (column 1 named 'v'). One - or more RHS values contain fractions which have been lost; e.g. item 1 with value 3.140000 has - been truncated to 3. - ``` - -2. `split.data.table` method is now properly exported, [#2920](https://github.com/Rdatatable/data.table/issues/2920). But we don't recommend it because `split` copies all the pieces into new memory. - -3. Setting indices on columns which are part of the key will now create those indices. - -4.
`hour`, `minute`, and `second` utility functions use integer arithmetic when the input is already (explicitly) UTC-based `POSIXct` for 4-10x speedup vs. using `as.POSIXlt`. - -5. Error added for incorrect usage of `%between%`, with some helpful diagnostic hints, [#3014](https://github.com/Rdatatable/data.table/issues/3014). Thanks @peterlittlejohn for offering his user experience and providing the impetus. - - -# data.table v1.11.4 (27 May 2018) - -1. Empty RHS of `:=` is no longer an error when the `i` clause returns no rows to assign to anyway, [#2829](https://github.com/Rdatatable/data.table/issues/2829). Thanks to @cguill95 for reporting and to @MarkusBonsch for fixing. - -2. Fixed runaway memory usage with R-devel (R > 3.5.0), [#2882](https://github.com/Rdatatable/data.table/pull/2882). Thanks to many people but in particular to Trang Nguyen for making the breakthrough reproducible example, Paul Bailey for liaising, and Luke Tierney for then pinpointing the issue. It was caused by an interaction of two or more data.table threads operating on new compact vectors in the ALTREP framework, such as the sequence `1:n`. This interaction could result in R's garbage collector turning off, and hence the memory explosion. Problems may occur in R 3.5.0 too but we were only able to reproduce in R > 3.5.0. The R code in data.table's implementation benefits from ALTREP (`for` loops in R no longer allocate their range vector input, for example) but are not so appropriate as data.table columns. Sequences such as `1:n` are common in test data but not very common in real-world datasets. Therefore, there is no need for data.table to support columns which are ALTREP compact sequences. The `data.table()` function already expanded compact vectors (by happy accident) but `setDT()` did not (it now does). If, somehow, a compact vector still reaches the internal parallel regions, a helpful error will now be generated. If this happens, please report it as a bug. - -3. Tests 1590.3 & 1590.4 now pass when users run `test.data.table()` on Windows, [#2856](https://github.com/Rdatatable/data.table/pull/2856). Thanks to Avraham Adler for reporting. Those tests were passing on AppVeyor, win-builder and CRAN's Windows because `R CMD check` sets `LC_COLLATE=C` as documented in R-exts$1.3.1, whereas by default on Windows `LC_COLLATE` is usually a regional Windows-1252 dialect such as `English_United States.1252`. - -4. Around 1 billion very small groups (of size 1 or 2 rows) could result in `"Failed to realloc working memory"` even when plenty of memory is available, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks once again to @jsams for the detailed report as a follow up to bug fix 40 in v1.11.0. - - -# data.table v1.11.2 (08 May 2018) - -1. `test.data.table()` created/overwrote variable `x` in `.GlobalEnv`, [#2828](https://github.com/Rdatatable/data.table/issues/2828); i.e. a modification of user's workspace which is not allowed. Thanks to @etienne-s for reporting. - -2. `as.chron` methods for `IDate` and `ITime` have been removed, [#2825](https://github.com/Rdatatable/data.table/issues/2825). `as.chron` still works since `IDate` inherits from `Date`. We are not sure why we had specific methods in the first place. It may have been from a time when `IDate` did not inherit from `Date`, perhaps. Note that we don't use `chron` ourselves in our own work. - -3. 
Fixed `SETLENGTH() cannot be applied to an ALTVEC object` starting in R-devel (R 3.6.0) on 1 May 2018, a few hours after 1.11.0 was accepted on CRAN, [#2820](https://github.com/Rdatatable/data.table/issues/2820). Many thanks to Luke Tierney for pinpointing the problem. - -4. Fixed some rare memory faults in `fread()` and `rbindlist()` found with `gctorture2()` and [`rchk`](https://github.com/kalibera/rchk), [#2841](https://github.com/Rdatatable/data.table/issues/2841). - - -# data.table v1.11.0 (01 May 2018) - -## NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES - -1. `fread()`'s `na.strings=` argument : - - ```R - "NA" # old default - getOption("datatable.na.strings", "NA") # this release; i.e. the same; no change yet - getOption("datatable.na.strings", "") # future release - ``` - - This option controls how `,,` is read in character columns. It does not affect numeric columns which read `,,` as `NA` regardless. We would like `,,`=>`NA` for consistency with numeric types, and `,"",`=>empty string to be the standard default for `fwrite/fread` character columns so that `fread(fwrite(DT))==DT` without needing any change to any parameters. `fwrite` has never written `NA` as `"NA"` in case `"NA"` is a valid string in the data; e.g., 2 character id columns sometimes do. Instead, `fwrite` has always written `,,` by default for an `` in a character columns. The use of R's `getOption()` allows users to move forward now, using `options(datatable.fread.na.strings="")`, or restore old behaviour when the default's default is changed in future, using `options(datatable.fread.na.strings="NA")`. - -2. `fread()` and `fwrite()`'s `logical01=` argument : - - ```R - logical01 = FALSE # old default - getOption("datatable.logical01", FALSE) # this release; i.e. the same; no change yet - getOption("datatable.logical01", TRUE) # future release - ``` - - This option controls whether a column of all 0's and 1's is read as `integer`, or `logical` directly to avoid needing to change the type afterwards to `logical` or use `colClasses`. `0/1` is smaller and faster than `"TRUE"/"FALSE"`, which can make a significant difference to space and time the more `logical` columns there are. When the default's default changes to `TRUE` for `fread` we do not expect much impact since all arithmetic operators that are currently receiving 0's and 1's as type `integer` (think `sum()`) but instead could receive `logical`, would return exactly the same result on the 0's and 1's as `logical` type. However, code that is manipulating column types using `is.integer` or `is.logical` on `fread`'s result, could require change. It could be painful if `DT[(logical_column)]` (i.e. `DT[logical_column==TRUE]`) changed behaviour due to `logical_column` no longer being type `logical` but `integer`. But that is not the change proposed. The change is the other way around; i.e., a previously `integer` column holding only 0's and 1's would now be type `logical`. Since it's that way around, we believe the scope for breakage is limited. We think a lot of code is converting 0/1 integer columns to logical anyway, either using `colClasses=` or afterwards with an assign. For `fwrite`, the level of breakage depends on the consumer of the output file. We believe `0/1` is a better more standard default choice to move to. See notes below about improvements to `fread`'s sampling for type guessing, and automatic rereading in the rare cases of out-of-sample type surprises. 
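As a minimal sketch of the migration path these two items describe (using the explicit `na.strings=` and `logical01=` arguments and the `datatable.na.strings`/`datatable.logical01` options named in the code blocks above; file names here are just placeholders):

```R
library(data.table)
# explicit arguments: behaviour no longer depends on the defaults' defaults
DT = fread("input.csv", na.strings="", logical01=TRUE)
fwrite(DT, "output.csv", logical01=TRUE)
# or, temporarily while migrating, set the options globally
options(datatable.na.strings="", datatable.logical01=TRUE)
```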
- - -These options are meant for temporary use to aid your migration, [#2652](https://github.com/Rdatatable/data.table/pull/2652). You are not meant to set them to the old default and then not migrate your code that is dependent on the default. Either set the argument explicitly so your code is not dependent on the default, or change the code to cope with the new default. Over the next few years we will slowly start to remove these options, warning you if you are using them, and return to a simple default. See the history of NEWS and NEWS.0 for past migrations that have, generally speaking, been successfully managed in this way. For example, at the end of NOTES for this version (below in this file) is a note about the usage of `datatable.old.unique.by.key` now warning, as you were warned it would do over a year ago. When that change was introduced, the default was changed and that option provided an option to restore the old behaviour. These `fread`/`fwrite` changes are even more cautious and not even changing the default's default yet. Giving you extra warning by way of this notice to move forward. And giving you a chance to object. - -## NEW FEATURES - -1. `fread()`: - * Efficiency savings at C level including **parallelization** announced [here](https://github.com/Rdatatable/data.table/wiki/talks/BARUG_201704_ParallelFread.pdf); e.g. a 9GB 2 column integer csv input is **50s down to 12s** to cold load on a 4 core laptop with 16GB RAM and SSD. Run `echo 3 >/proc/sys/vm/drop_caches` first to measure cold load time. Subsequent load time (after file has been cached by OS on the first run) **40s down to 6s**. - * The [fread for small data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread) page has been revised. - * Memory maps lazily; e.g. reading just the first 10 rows with `nrow=10` is **12s down to 0.01s** from cold for the 9GB file. Large files close to your RAM limit may work more reliably too. The progress meter will commence sooner and more consistently. - * `fread` has always jumped to the middle and to the end of the file for a much improved column type guess. The sample size is increased from 100 rows at 10 jump jump points (1,000 rows) to 100 rows at 100 jumps points (10,000 row sample). In the rare case of there still being out-of-sample type exceptions, those columns are now *automatically reread* so you don't have to use `colClasses` yourself. - * Large number of columns support; e.g. **12,000 columns** tested. - * **Quoting rules** are more robust and flexible. See point 10 on the wiki page [here](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread#10-automatic-quote-escape-method-detection-including-no-escape). - * Numeric data that has been quoted is now detected and read as numeric. - * The ability to position `autostart` anywhere inside one of multiple tables in a single file is removed with warning. It used to search upwards from that line to find the start of the table based on a consistent number of columns. People appear to be using `skip="string"` or `skip=nrow` to find the header row exactly, which is retained and simpler. It was too difficult to retain search-upwards-autostart together with skipping/filling blank lines, filling incomplete rows and parallelization too. If there is any header info above the column names, it is still auto detected and auto skipped (particularly useful when loading a set of files where the column names start on different lines due to a varying height messy header). 
- * `dec=','` is now implemented directly so there is no dependency on locale. The options `datatable.fread.dec.experiment` and `datatable.fread.dec.locale` have been removed. - * `\\r\\r\\n` line endings are now handled such as produced by `base::download.file()` when it doubles up `\\r`. Other rare line endings (`\\r` and `\\n\\r`) are now more robust. - * Mixed line endings are now handled; e.g. a file formed by concatenating a Unix file and a Windows file so that some lines end with `\\n` while others end with `\\r\\n`. - * Improved automatic detection of whether the first row is column names by comparing the types of the fields on the first row against the column types ascertained by the 10,000 rows sample (or `colClasses` if provided). If a numeric column has a string value at the top, then column names are deemed present. - * Detects GB-18030 and UTF-16 encodings and in verbose mode prints a message about BOM detection. - * Detects and ignores trailing ^Z end-of-file control character sometimes created on MS DOS/Windows, [#1612](https://github.com/Rdatatable/data.table/issues/1612). Thanks to Gergely Daróczi for reporting and providing a file. - * Added ability to recognize and parse hexadecimal floating point numbers, as used for example in Java. Thanks for @scottstanfield [#2316](https://github.com/Rdatatable/data.table/issues/2316) for the report. - * Now handles floating-point NaN values in a wide variety of formats, including `NaN`, `sNaN`, `1.#QNAN`, `NaN1234`, `#NUM!` and others, [#1800](https://github.com/Rdatatable/data.table/issues/1800). Thanks to Jori Liesenborgs for highlighting and the PR. - * If negative numbers are passed to `select=` the out-of-range error now suggests `drop=` instead, [#2423](https://github.com/Rdatatable/data.table/issues/2423). Thanks to Michael Chirico for the suggestion. - * `sep=NULL` or `sep=""` (i.e., no column separator) can now be used to specify single column input reliably like `base::readLines`, [#1616](https://github.com/Rdatatable/data.table/issues/1616). `sep='\\n'` still works (even on Windows where line ending is actually `\\r\\n`) but `NULL` or `""` are now documented and recommended. Thanks to Dmitriy Selivanov for the pull request and many others for comments. As before, `sep=NA` is not valid; use the default `"auto"` for automatic separator detection. `sep='\\n'` is now deprecated and in future will start to warn when used. - * Single-column input with blank lines is now valid and the blank lines are significant (representing `NA`). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing `NA` which are written as blank. There is no change when `ncol>1`; i.e., input stops with detailed warning at the first blank line, because a blank line when `ncol>1` is invalid input due to no separators being present. Thanks to @skanskan, Michael Chirico, @franknarf1 and Pasha for the testing and discussions, [#2106](https://github.com/Rdatatable/data.table/issues/2106). - * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. 
- * `skip=` and `nrow=` are more reliable and are no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). - * Ram disk (`/dev/shm`) is no longer used for the output of system command input. Although faster when it worked, it was causing too many device full errors; e.g., [#1139](https://github.com/Rdatatable/data.table/issues/1139) and [zUMIs/19](https://github.com/sdparekh/zUMIs/issues/19). Thanks to Kyle Chung for reporting. Standard `tempdir()` is now used. If you wish to use ram disk, set TEMPDIR to `/dev/shm`; see `?tempdir`. - * Detecting whether a very long input string is a file name or data is now much faster, [#2531](https://github.com/Rdatatable/data.table/issues/2531). Many thanks to @javrucebo for the detailed report, benchmarks and suggestions. - * A column of `TRUE/FALSE`s is ok, as well as `True/False`s and `true/false`s, but mixing styles (e.g. `TRUE/false`) is not and will be read as type `character`. - * New argument `index` to compliment the existing `key` argument for applying secondary orderings out of the box for convenience, [#2633](https://github.com/Rdatatable/data.table/issues/2633). - * A warning is now issued whenever incorrectly quoted fields have been detected and fixed using a non-standard quote rule. `fread` has always used these advanced rules but now it warns that it is using them. Most file writers correctly quote fields if the field contains the field separator, but a common error is not to also quote fields that contain a quote and then escape those quotes, particularly if that quote occurs at the start of the field. The ability to detect and fix such files is referred to as self-healing. Ambiguities are resolved using the knowledge that the number of columns is constant, and therefore this ability is not available when `fill=TRUE`. This feature can be improved in future by using column type consistency as well as the number of fields. For example: - - ```R - txt = 'A,B\n1,hello\n2,"howdy" said Joe\n3,bonjour\n' - cat(txt) - # A,B - # 1,hello - # 2,"howdy" said Joe - # 3,bonjour - fread(txt) - A B - - 1: 1 hello - 2: 2 "howdy" said Joe - 3: 3 bonjour - Warning message: - In fread(txt) : Found and resolved improper quoting - ``` - - * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney, Ananda Mahto, @memoryfull, @brandenkmurray for testing dev and reporting these regressions before release to CRAN: #1464, #1671, #1888, #1895, #2070, #2073, #2087, #2091, #2092, #2107, #2118, #2123, #2167, #2194, #2196, #2201, #2222, #2228, #2238, #2246, #2251, #2265, #2267, #2285, #2287, #2299, #2322, #2347, #2352, #2370, #2371, #2395, #2404, #2446, #2453, #2457, #2464, #2481, #2499, #2512, #2515, #2516, #2518, #2520, #2523, #2526, #2535, #2542, #2548, #2561, #2600, #2625, #2666, #2697, #2735, #2744. - -2. `fwrite()`: - * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. 
Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). - * `logical01` has been added and the old name `logicalAsInt` retained. Please move to the new name when convenient for you. The old argument name (`logicalAsInt`) will slowly be deprecated over the next few years. The default is unchanged: `FALSE`, so `logical` is still written as `"TRUE"`/`"FALSE"` in full by default. We intend to change the default's default in future to `TRUE`; see the notice at the top of these release notes. - -3. Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR. - -4. `tables` gains `index` argument for supplementary metadata about `data.table`s in memory (or any optionally specified environment), part of [#1648](https://github.com/Rdatatable/data.table/issues/1648). Thanks due variously to @jangorecki, @rsaporta, @MichaelChirico for ideas and work towards PR. - -5. Improved auto-detection of `character` inputs' formats to `as.ITime` to mirror the logic in `as.POSIXlt.character`, [#1383](https://github.com/Rdatatable/data.table/issues/1383). Thanks @franknarf1 for identifying a discrepancy and @MichaelChirico for investigating. - -6. `setcolorder()` now accepts fewer than `ncol(DT)` columns to be moved to the front, [#592](https://github.com/Rdatatable/data.table/issues/592). Thanks @MichaelChirico for the PR. This also incidentally fixed [#2007](https://github.com/Rdatatable/data.table/issues/2007) whereby explicitly setting `select = NULL` in `fread` errored; thanks to @rcapell for reporting that and @dselivanov and @MichaelChirico for investigating and providing a new test. - -7. Three new *Grouping Sets* functions: `rollup`, `cube` and `groupingsets`, [#1377](https://github.com/Rdatatable/data.table/issues/1377). These allow aggregation at multiple grouping levels at once, producing sub-totals and a grand total. - -8. `as.data.table()` gains a new method for `array`s to return a useful data.table, [#1418](https://github.com/Rdatatable/data.table/issues/1418). - -9. `print.data.table()` (all via master issue [#1523](https://github.com/Rdatatable/data.table/issues/1523)): - - * gains `print.keys` argument, `FALSE` by default, which displays the keys and/or indices (secondary keys) of a `data.table`. Thanks @MichaelChirico for the PR, Yike Lu for the suggestion and Arun for honing that idea to its present form. - - * gains `col.names` argument, `"auto"` by default, which toggles which registers of column names to include in printed output. `"top"` forces `data.frame`-like behavior where column names are only ever included at the top of the output, as opposed to the default behavior which appends the column names below the output as well for longer (>20 rows) tables. `"none"` shuts down column name printing altogether. Thanks @MichaelChirico for the PR, Oleg Bondar for the suggestion, and Arun for guiding commentary. - - * list columns would print the first 6 items in each cell followed by a comma if there are more than 6 in that cell. Now it ends with ",..." to make it clearer, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). Thanks to @franknarf1 for drawing attention to an issue raised on Stack Overflow by @TMOTTM [here](https://stackoverflow.com/q/47679701). - -10.
`setkeyv` is accelerated if the key already exists, [#2331](https://github.com/Rdatatable/data.table/issues/2331). Thanks to @MarkusBonsch for the PR. - -11. Keys and indexes are now partially retained up to the key column assigned to with ':=' [#2372](https://github.com/Rdatatable/data.table/issues/2372). They used to be dropped completely if any one of the columns was affected by `:=`. Thanks to @MarkusBonsch for the PR. - -12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR. - -13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR, and thanks to @mgahan for pointing out a reversion in `na.omit.data.table` before release, [#2660](https://github.com/Rdatatable/data.table/issues/2660#issuecomment-371027948). - -14. `uniqueN()` is now faster on logical vectors. Thanks to Hugh Parsonage for [PR#2648](https://github.com/Rdatatable/data.table/pull/2648). - - ```R - N = 1e9 - # was now - x = c(TRUE,FALSE,NA,rep(TRUE,N)) # - uniqueN(x) == 3 # 5.4s 0.00s - x = c(TRUE,rep(FALSE,N), NA) # - uniqueN(x,na.rm=TRUE) == 2 # 5.4s 0.00s - x = c(rep(TRUE,N),FALSE,NA) # - uniqueN(x) == 3 # 6.7s 0.38s - ``` - -15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). -Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation. - -16. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. - -17. `update.dev.pkg` is a new function to update a package from its development repository; it downloads package sources only when a newer commit is available in the repository. `data.table::update.dev.pkg()` updates `data.table` by default, but any package can be used. - -18. Item 1 in NEWS for [v1.10.2](https://github.com/Rdatatable/data.table/blob/master/NEWS.md#changes-in-v1102--on-cran-31-jan-2017) on CRAN in Jan 2017 included: - - > When j is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. - > When you see the `..` prefix think one-level-up, like the directory `..` in all operating systems means the parent directory. - > In future the `..` prefix could be made to work on all symbols appearing anywhere inside `DT[...]`. - - The response has been positive ([this tweet](https://twitter.com/MattDowle/status/967290562725359617) and [FR#2655](https://github.com/Rdatatable/data.table/issues/2655)) and so this prefix is now expanded to all symbols appearing in `j=` as a first step; e.g. - - ```R - cols = "colB" - DT[, c(..cols, "colC")] # same as DT[, .(colB,colC)] - DT[, -..cols] # all columns other than colB - ``` - - Thus, `with=` should no longer be needed in any cases. Please change to using the `..` prefix and over the next few years we will start to formally deprecate and remove the `with=` parameter. If this is well received, the `..` prefix could be expanded to symbols appearing in `i=` and `by=`, too. Note that column names should not now start with `..`.
If a symbol `..var` is used in `j=` but `..var` exists as a column name, the column still takes precedence, for backwards compatibility. Over the next few years, data.table will start issuing warnings/errors when it sees column names starting with `..`. This affects one CRAN package out of 475 using data.table, so we do not believe this restriction to be unreasonable. Our main focus here which we believe `..` achieves is to resolve the more common ambiguity when `var` is in calling scope and `var` is a column name too. Further, we have not forgotten that in the past we recommended prefixing the variable in calling scope with `..` yourself. If you did that and `..var` exists in calling scope, that still works, provided neither `var` exists in calling scope nor `..var` exists as a column name. Please now remove the `..` prefix on `..var` in calling scope to tidy this up. In future data.table will start to warn/error on such usage. - -19. `setindexv` can now assign multiple (separate) indices by accepting a `list` in the `cols` argument. - -20. `as.matrix.data.table` method now has an additional `rownames` argument allowing for a single column to be used as the `rownames` after conversion to a `matrix`. Thanks to @sritchie73 for the suggestion, use cases, [#2692](https://github.com/Rdatatable/data.table/issues/2692) and implementation [PR#2702](https://github.com/Rdatatable/data.table/pull/2702) and @MichaelChirico for additional use cases. - -## BUG FIXES - -1. The new quote rules handles this single field `"Our Stock Screen Delivers an Israeli Software Company (MNDO, CTCH)<\/a> SmallCapInvestor.com - Thu, May 19, 2011 10:02 AM EDT<\/cite><\/div>Yesterday in \""Google, But for Finding - Great Stocks\"", I discussed the value of stock screeners as a powerful tool"`, [#2051](https://github.com/Rdatatable/data.table/issues/2051). Thanks to @scarrascoso for reporting. Example file added to test suite. - -2. `fwrite()` creates a file with permissions that now play correctly with `Sys.umask()`, [#2049](https://github.com/Rdatatable/data.table/issues/2049). Thanks to @gnguy for reporting. - -3. `fread()` no longer holds an open lock on the file when a line outside the large sample has too many fields and generates an error, [#2044](https://github.com/Rdatatable/data.table/issues/2044). Thanks to Hugh Parsonage for reporting. - -4. Setting `j = {}` no longer results in an error, [#2142](https://github.com/Rdatatable/data.table/issues/2142). Thanks Michael Chirico for the pull request. - -5. Segfault in `rbindlist()` when one or more items are empty, [#2019](https://github.com/Rdatatable/data.table/issues/2019). Thanks Michael Lang for the pull request. Another segfault if the result would be more than 2bn rows, thanks to @jsams's comment in [#2340](https://github.com/Rdatatable/data.table/issues/2340#issuecomment-331505494). - -6. Error printing 0-length `ITime` and `NA` objects, [#2032](https://github.com/Rdatatable/data.table/issues/2032) and [#2171](https://github.com/Rdatatable/data.table/issues/2171). Thanks Michael Chirico for the pull requests and @franknarf1 for pointing out a shortcoming of the initial fix. - -7. `as.IDate.POSIXct` error with `NULL` timezone, [#1973](https://github.com/Rdatatable/data.table/issues/1973). Thanks @lbilli for reporting and Michael Chirico for the pull request. - -8. Printing a null `data.table` with `print` no longer visibly outputs `NULL`, [#1852](https://github.com/Rdatatable/data.table/issues/1852). 
Thanks @aaronmcdaid for spotting and @MichaelChirico for the PR. - -9. `data.table` now works with Shiny Reactivity / Flexdashboard. The error was typically something like `col not found` in `DT[col==val]`. Thanks to Dirk Eddelbuettel leading Matt through reproducible steps and @sergeganakou and Richard White for reporting. Closes [#2001](https://github.com/Rdatatable/data.table/issues/2001) and [shiny/#1696](https://github.com/rstudio/shiny/issues/1696). - -10. The `as.IDate.POSIXct` method passed `tzone` along but was not exported. So `tzone` is now taken into account by `as.IDate` too as well as `IDateTime`, [#977](https://github.com/Rdatatable/data.table/issues/977) and [#1498](https://github.com/Rdatatable/data.table/issues/1498). Tests added. - -11. A named logical vector now selects rows as expected from a single-row data.table. Thanks to @skranz for reporting. Closes [#2152](https://github.com/Rdatatable/data.table/issues/2152). - -12. `fread()`'s rare `Internal error: Sampling jump point 10 is before the last jump ended` has been fixed, [#2157](https://github.com/Rdatatable/data.table/issues/2157). Thanks to Frank Erickson and Artem Klevtsov for reporting with example files which are now added to the test suite. - -13. `CJ()` no longer loses attribute information, [#2029](https://github.com/Rdatatable/data.table/issues/2029). Thanks to @MarkusBonsch and @royalts for the pull request. - -14. `split.data.table` respects `factor` ordering in `by` argument, [#2082](https://github.com/Rdatatable/data.table/issues/2082). Thanks to @MichaelChirico for identifying and fixing the issue. - -15. `.SD` would incorrectly include the symbol on the lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR, and @ProfFancyPants for reporting a regression introduced in the PR. Closes [#2326](https://github.com/Rdatatable/data.table/issues/2326) and [#2338](https://github.com/Rdatatable/data.table/issues/2338). - -16. Integer values that are too large to fit in `int64` will now be read as strings, [#2250](https://github.com/Rdatatable/data.table/issues/2250). - -17. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). - -18. `isoweek` calculation is correct regardless of local timezone setting (`Sys.timezone()`), [#2407](https://github.com/Rdatatable/data.table/issues/2407). Thanks to @MoebiusAV and @SimonCoulombe for reporting and @MichaelChirico for fixing. - -19. Fixed `as.xts.data.table` to support all xts-supported time-based index classes, [#2408](https://github.com/Rdatatable/data.table/issues/2408). Thanks to @ebs238 for reporting and for the PR. - -20. A memory leak when a very small number such as `0.58E-2141` is bumped to type `character` is resolved, [#918](https://github.com/Rdatatable/data.table/issues/918). - -21. The edge case `setnames(data.table(), character(0))` now works rather than erroring, [#2452](https://github.com/Rdatatable/data.table/issues/2452). - -22. The order of rows returned in non-equi joins was incorrect in certain scenarios as reported under [#1991](https://github.com/Rdatatable/data.table/issues/1991). This is now fixed. Thanks to @Henrik-P for reporting. - -23.
Non-equi joins work as expected when `x` in `x[i, on=...]` is a 0-row data.table. Closes [#1986](https://github.com/Rdatatable/data.table/issues/1986). - -24. Non-equi joins along with `by=.EACHI` returned incorrect result in some rare cases as reported under [#2360](https://github.com/Rdatatable/data.table/issues/2360). This is fixed now. This fix also takes care of [#2275](https://github.com/Rdatatable/data.table/issues/2275). Thanks to @ebs238 for the nice minimal reproducible report, @Mihael for asking on SO and to @Frank for following up on SO and filing an issue. - -25. `by=.EACHI` works now when `list` columns are being returned and some join values are missing, [#2300](https://github.com/Rdatatable/data.table/issues/2300). Thanks to @jangorecki and @franknarf1 for the reproducible examples which have been added to the test suite. - -26. Indices are now retrieved by exact name, [#2465](https://github.com/Rdatatable/data.table/issues/2465). This prevents usage of wrong indices as well as unexpected row reordering in join results. Thanks to @pannnda for reporting and providing a reproducible example and to @MarkusBonsch for fixing. - -27. `setnames` of whole table when original table had `NA` names skipped replacing those, [#2475](https://github.com/Rdatatable/data.table/issues/2475). Thanks to @franknarf1 and [BenoitLondon on StackOverflow](https://stackoverflow.com/questions/47228836/) for the report and @MichaelChirico for fixing. - -28. `CJ()` works with multiple empty vectors now [#2511](https://github.com/Rdatatable/data.table/issues/2511). Thanks to @MarkusBonsch for fixing. - -29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again. - -30. "Negative length vectors not allowed" error when grouping `median` and `var` fixed, [#2046](https://github.com/Rdatatable/data.table/issues/2046) and [#2111](https://github.com/Rdatatable/data.table/issues/2111). Thanks to @caneff and @osofr for reporting and to @kmillar for debugging and explaining the cause. - -31. Fixed a bug on Windows where `data.table`s containing non-UTF8 strings in `key`s were not properly sorted, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing. - -32. `x.` prefixes during joins sometimes resulted in a "column not found" error. This is now fixed. Closes [#2313](https://github.com/Rdatatable/data.table/issues/2313). Thanks to @franknarf1 for the MRE. - -33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing. - -34. Fixed cases where the result of `merge.data.table()` would contain duplicate column names if `by.x` was also in `names(y)`. 
-`merge.data.table()` gains the `no.dups` argument (default TRUE) to match the correpsonding patched behaviour in `base:::merge.data.frame()`. Now, when `by.x` is also in `names(y)` the column name from `y` has the corresponding `suffixes` added to it. `by.x` remains unchanged for backwards compatibility reasons. -In addition, where duplicate column names arise anyway (i.e. `suffixes = c("", "")`) `merge.data.table()` will now throw a warning to match the behaviour of `base:::merge.data.frame()`. -Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631) and [PR#2653](https://github.com/Rdatatable/data.table/pull/2653) - -35. `CJ()` now fails with proper error message when results would exceed max integer, [#2636](https://github.com/Rdatatable/data.table/issues/2636). - -36. `NA` in character columns now display as `` just like base R to distinguish from `""` and `"NA"`. - -37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708). - -38. Fixed a bug on Windows that `data.table` may break if the garbage collecting was triggered when sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674). - -39. Internal aliasing of `.` to `list` was over-aggressive in applying `list` even when `.` was intended within `bquote`, [#1912](https://github.com/Rdatatable/data.table/issues/1912). Thanks @MichaelChirico for reporting/filing and @ecoRoland for suggesting and testing a fix. - -40. Attempt to allocate a wildly large amount of RAM (16EB) when grouping by key and there are close to 2 billion 1-row groups, [#2777](https://github.com/Rdatatable/data.table/issues/2777). Thanks to @jsams for the detailed report. - -41. Fix a bug that `print(dt, class=TRUE)` shows only `topn - 1` rows. Thanks to @heavywatal for reporting [#2803](https://github.com/Rdatatable/data.table/issues/2803) and filing [PR#2804](https://github.com/Rdatatable/data.table/pull/2804). - -## NOTES - -0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. - -1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing. - -2. Test 1675.1 updated to cope with a change in R-devel in June 2017 related to `factor()` and `NA` levels. - -3. Package `ezknitr` has been added to the whitelist of packages that run user code and should be consider data.table-aware, [#2266](https://github.com/Rdatatable/data.table/issues/2266). Thanks to Matt Mills for testing and reporting. - -4. Printing with `quote = TRUE` now quotes column names as well, [#1319](https://github.com/Rdatatable/data.table/issues/1319). Thanks @jan-glx for the suggestion and @MichaelChirico for the PR. - -5. Added a blurb to `?melt.data.table` explicating the subtle difference in behavior of the `id.vars` argument vis-a-vis its analog in `reshape2::melt`, [#1699](https://github.com/Rdatatable/data.table/issues/1699). Thanks @MichaelChirico for uncovering and filing. - -6. 
Added some clarification about the usage of `on` to `?data.table`, [#2383](https://github.com/Rdatatable/data.table/issues/2383). Thanks to @peterlittlejohn for volunteering his confusion and @MichaelChirico for brushing things up. - -7. Clarified that "data.table always sorts in `C-locale`" means that upper-case letters are sorted before lower-case letters by ordering in data.table (e.g. `setorder`, `setkey`, `DT[order(...)]`). Thanks to @hughparsonage for the pull request editing the documentation. Note this makes no difference in most cases of data; e.g. ids where only uppercase or lowercase letters are used (`"AB123"<"AC234"` is always true, regardless), or country names and words which are consistently capitalized. For example, `"America" < "Brazil"` is not affected (it's always true), and neither is `"america" < "brazil"` (always true too), since the first letter is consistently capitalized. But whether `"america" < "Brazil"` (the words are not consistently capitalized) is true or false in base R depends on the locale of your R session. In America it is true by default, and false if i) you type `Sys.setlocale(locale="C")`, or ii) the R session has been started in a C locale for you, which can happen on servers/services (the locale comes from the environment the R session is started in). However, `"america" < "Brazil"` is always, consistently false in data.table, which can be a surprise because it differs from base R by default in most regions. It is false because `"B"<"a"` is true because all upper-case letters come first, followed by all lower-case letters (the ASCII number of each letter determines the order, which is what is meant by `C-locale`). - -8. `data.table`'s dependency has been moved forward from R 3.0.0 (Apr 2013) to R 3.1.0 (Apr 2014; i.e. 3.5 years old). We keep this dependency as old as possible for as long as possible as requested by users in managed environments. Thanks to Jan Gorecki, the test suite from latest dev now runs on R 3.1.0 continuously, as well as R-release (currently 3.4.2) and latest R-devel snapshot. The primary motivation for the bump to R 3.1.0 was allowing one new test which relies on better non-copying behaviour in that version, [#2484](https://github.com/Rdatatable/data.table/issues/2484). It also allows further internal simplifications. Thanks to @MichaelChirico for fixing another test that failed on R 3.1.0 due to slightly different behaviour of `base::read.csv` in R 3.1.0-only which the test was comparing to, [#2489](https://github.com/Rdatatable/data.table/pull/2489). - -9. New vignette added: _Importing data.table_ - focused on using data.table as a dependency in R packages. It answers the most commonly asked questions and promotes good practices. - -10. As warned in the v1.9.8 release notes below in this file (25 Nov 2016), it has been 1 year since then and so use of `options(datatable.old.unique.by.key=TRUE)` to restore the old default is now deprecated with warning. The new warning states that this option still works and repeats the request to pass `by=key(DT)` explicitly to `unique()`, `duplicated()`, `uniqueN()` and `anyDuplicated()` and to stop using this option. In another year, this warning will become an error. Another year after that, the option will be removed. - -11. As `set2key()` and `key2()` have been warning since v1.9.8 (Nov 2016), their warnings have now been upgraded to errors. Note that when they were introduced in version 1.9.4 (Oct 2014) they were marked as 'experimental' in NEWS item 4. They will be removed in one year.
- - ``` - Was warning: set2key() will be deprecated in the next relase. Please use setindex() instead. - Now error: set2key() is now deprecated. Please use setindex() instead. - ``` - -12. The option `datatable.showProgress` is no longer set to a default value when the package is loaded. Instead, the `default=` argument of `getOption` is used by both `fwrite` and `fread`. The default is the result of `interactive()` at the time of the call. Using `getOption` in this way is intended to be more helpful to users looking at `args(fread)` and `?fread`. - -13. `print.data.table()` invisibly returns its first argument instead of `NULL`. This behavior is compatible with the standard `print.data.frame()` and tibble's `print.tbl_df()`. Thanks to @heavywatal for [PR#2807](https://github.com/Rdatatable/data.table/pull/2807) - - -# data.table v1.10.4-3 (20 Oct 2017) - -1. Fixed crash/hang on MacOS when `parallel::mclapply` is used and data.table is merely loaded, [#2418](https://github.com/Rdatatable/data.table/issues/2418). Oddly, all tests including test 1705 (which tests `mclapply` with data.table) passed fine on CRAN. It appears to be some versions of MacOS or some versions of libraries on MacOS, perhaps. Many thanks to Martin Morgan for reporting and confirming this fix works. Thanks also to @asenabouth, Joe Thorley and Danton Noriega for testing, debugging and confirming that automatic parallelism inside data.table (such as `fwrite`) works well even on these MacOS installations. See also news items below for 1.10.4-1 and 1.10.4-2. - - -# data.table v1.10.4-2 (12 Oct 2017) - -1. OpenMP on MacOS is now supported by CRAN and included in CRAN's package binaries for Mac. But installing v1.10.4-1 from source on MacOS failed when OpenMP was not enabled at compile time, [#2409](https://github.com/Rdatatable/data.table/issues/2409). Thanks to Liz Macfie and @fupangpangpang for reporting. The startup message when OpenMP is not enabled has been updated. - -2. Two rare potential memory faults fixed, thanks to CRAN's automated use of latest compiler tools; e.g. clang-5 and gcc-7 - - -# data.table v1.10.4-1 (09 Oct 2017) - -1. The `nanotime` v0.2.0 update (June 2017) changed from `integer64` to `S4` and broke `fwrite` of `nanotime` columns. Fixed to work with `nanotime` both before and after v0.2.0. - -2. Pass R-devel changes related to `deparse(,backtick=)` and `factor()`. - -3. Internal `NAMED()==2` now `MAYBE_SHARED()`, [#2330](https://github.com/Rdatatable/data.table/issues/2330). Back-ported to pass under the stated dependency, R 3.0.0. - -4. Attempted improvement on Mac-only when the `parallel` package is used too (which forks), [#2137](https://github.com/Rdatatable/data.table/issues/2137). Intel's OpenMP implementation appears to leave threads running after the OpenMP parallel region (inside data.table) has finished unlike GNU libgomp. So, if and when `parallel`'s `fork` is invoked by the user after data.table has run in parallel already, instability occurs. The problem only occurs with Mac package binaries from CRAN because they are built by CRAN with Intel's OpenMP library. No known problems on Windows or Linux and no known problems on any platform when `parallel` is not used. If this Mac-only fix still doesn't work, call `setDTthreads(1)` immediately after `library(data.table)` which has been reported to fix the problem by putting `data.table` into single threaded mode earlier. - -5. 
When `fread()` and `print()` see `integer64` columns are present but package `bit64` is not installed, the warning is now displayed as intended. Thanks to a question by Santosh on r-help and forwarded by Bill Dunlap. - - -# data.table v1.10.4 (01 Feb 2017) - -## BUG FIXES - -1. The new specialized `nanotime` writer in `fwrite()` type punned using `*(long long *)&REAL(column)[i]` which, strictly, is undefined behaviour under C standards. It passed a plethora of tests on linux (gcc 5.4 and clang 3.8), win-builder and 6 out of 10 CRAN flavours using gcc. But failed (wrong data written) with the newest version of clang (3.9.1) as used by CRAN on the failing flavors, and solaris-sparc. Replaced with the union method and added a grep to CRAN_Release.cmd. - - -# data.table v1.10.2 (31 Jan 2017) - -## NEW FEATURES - -1. When `j` is a symbol prefixed with `..` it will be looked up in calling scope and its value taken to be column names or numbers. - - ```R - myCols = c("colA","colB") - DT[, myCols, with=FALSE] - DT[, ..myCols] # same - ``` - - When you see the `..` prefix think _one-level-up_ like the directory `..` in all operating systems meaning the parent directory. In future the `..` prefix could be made to work on all symbols appearing anywhere inside `DT[...]`. It is intended to be a convenient way to protect your code from accidentally picking up a column name. Similar to how `x.` and `i.` prefixes (analogous to SQL table aliases) can already be used to disambiguate the same column name present in both `x` and `i`. A symbol prefix rather than a `..()` _function_ will be easier for us to optimize internally and more convenient if you have many variables in calling scope that you wish to use in your expressions safely. This feature was first raised in 2012 and long wished for, [#633](https://github.com/Rdatatable/data.table/issues/633). It is experimental. - -2. When `fread()` or `print()` see `integer64` columns are present, `bit64`'s namespace is now automatically loaded for convenience. - -3. `fwrite()` now supports the new [`nanotime`](https://cran.r-project.org/package=nanotime) type by Dirk Eddelbuettel, [#1982](https://github.com/Rdatatable/data.table/issues/1982). Aside: `data.table` already automatically supported `nanotime` in grouping and joining operations via longstanding support of its underlying `integer64` type. - -4. `indices()` gains a new argument `vectors`, default `FALSE`. This strsplits the index names by `__` for you, [#1589](https://github.com/Rdatatable/data.table/issues/1589). - - ```R - DT = data.table(A=1:3, B=6:4) - setindex(DT, B) - setindex(DT, B, A) - indices(DT) - [1] "B" "B__A" - indices(DT, vectors=TRUE) - [[1]] - [1] "B" - [[2]] - [1] "B" "A" - ``` - -## BUG FIXES - -1. Some long-standing potential instability has been discovered and resolved many thanks to a detailed report from Bill Dunlap and Michael Sannella. At C level any call of the form `setAttrib(x, install(), allocVector())` can be unstable in any R package. Despite `setAttrib()` PROTECTing its inputs, the 3rd argument (`allocVector`) can be executed first only for its result to be released by `install()`'s potential GC before reaching `setAttrib`'s PROTECTion of its inputs. Fixed by either PROTECTing or pre-`install()`ing. Added to CRAN_Release.cmd procedures: i) `grep`s to prevent usage of this idiom in future and ii) running data.table's test suite with `gctorture(TRUE)`. - -2.
A new potential instability introduced in the last release (v1.10.0) in GForce optimized grouping has been fixed by reverting one change from malloc to R_alloc. Thanks again to Michael Sannella for the detailed report. - -3. `fwrite()` could write floating point values incorrectly, [#1968](https://github.com/Rdatatable/data.table/issues/1968). A thread-local variable was incorrectly thread-global. This variable's usage lifetime is only a few clock cycles so it needed large data and many threads for several threads to overlap their usage of it and cause the problem. Many thanks to @mgahan and @jmosser for finding and reporting. - -## NOTES - -1. `fwrite()`'s `..turbo` option has been removed as the warning message warned. If you've found a problem, please [report it](https://github.com/Rdatatable/data.table/issues). - -2. No known issues have arisen due to `DT[,1]` and `DT[,c("colA","colB")]` now returning columns as introduced in v1.9.8. However, as we've moved forward by setting `options('datatable.WhenJisSymbolThenCallingScope'=TRUE)` introduced then too, it has become clear a better solution is needed. All 340 CRAN and Bioconductor packages that use data.table have been checked with this option on. 331 lines would need to be changed in 59 packages. Their usage is elegant, correct and recommended, though. Examples are `DT[1, encoding]` in quanteda and `DT[winner=="first", freq]` in xgboost. These are looking up the columns `encoding` and `freq` respectively and returning them as vectors. But if, for some reason, those columns are removed from `DT` and `encoding` or `freq` are still variables in calling scope, their values in calling scope would be returned. Which cannot be what was intended and could lead to silent bugs. That was the risk we were trying to avoid.
-`options('datatable.WhenJisSymbolThenCallingScope')` is now removed. A migration timeline is no longer needed. The new strategy needs no code changes and has no breakage. It was proposed and discussed in point 2 [here](https://github.com/Rdatatable/data.table/issues/1188#issuecomment-127824969), as follows.
-When `j` is a symbol (as in the quanteda and xgboost examples above) it will continue to be looked up as a column name and returned as a vector, as has always been the case. If it's not a column name however, it is now a helpful error explaining that data.table is different to data.frame and what to do instead (use `..` prefix or `with=FALSE`). The old behaviour of returning the symbol's value in calling scope can never have been useful to anybody and therefore not depended on. Just as the `DT[,1]` change could be made in v1.9.8, this change can be made now. This change increases robustness with no downside. Rerunning all 340 CRAN and Bioconductor package checks reveal 2 packages throwing the new error: partools and simcausal. Their maintainers have been informed that there is a likely bug on those lines due to data.table's (now remedied) weakness. This is exactly what we wanted to reveal and improve. - -3. As before, and as we can see is in common use in CRAN and Bioconductor packages using data.table, `DT[,myCols,with=FALSE]` continues to lookup `myCols` in calling scope and take its value as column names or numbers. You can move to the new experimental convenience feature `DT[, ..myCols]` if you wish at leisure. - - -# data.table v1.10.0 (03 Dec 2016) - -## BUG FIXES - -1. `fwrite(..., quote='auto')` already quoted a field if it contained a `sep` or `\n`, or `sep2[2]` when `list` columns are present. Now it also quotes a field if it contains a double quote (`"`) as documented, [#1925](https://github.com/Rdatatable/data.table/issues/1925). Thanks to Aki Matsuo for reporting. Tests added. The `qmethod` tests did test escaping embedded double quotes, but only when `sep` or `\n` was present in the field as well to trigger the quoting of the field. - -2. Fixed 3 test failures on Solaris only, [#1934](https://github.com/Rdatatable/data.table/issues/1934). Two were on both sparc and x86 and related to a `tzone` attribute difference between `as.POSIXct` and `as.POSIXlt` even when passed the default `tz=""`. The third was on sparc only: a minor rounding issue in `fwrite()` of 1e-305. - -3. Regression crash fixed when 0's occur at the end of a non-empty subset of an empty table, [#1937](https://github.com/Rdatatable/data.table/issues/1937). Thanks Arun for tracking down. Tests added. For example, subsetting the empty `DT=data.table(a=character())` with `DT[c(1,0)]` should return a 1 row result with one `NA` since 1 is past the end of `nrow(DT)==0`, the same result as `DT[1]`. - -4. Fixed newly reported crash that also occurred in old v1.9.6 when `by=.EACHI`, `nomatch=0`, the first item in `i` has no match AND `j` has a function call that is passed a key column, [#1933](https://github.com/Rdatatable/data.table/issues/1933). Many thanks to Reino Bruner for finding and reporting with a reproducible example. Tests added. - -5. Fixed `fread()` error occurring for a subset of Windows users: `showProgress is not type integer but type 'logical'.`, [#1944](https://github.com/Rdatatable/data.table/issues/1944) and [#1111](https://github.com/Rdatatable/data.table/issues/1111). Our tests cover this usage (it is just default usage), pass on AppVeyor (Windows), win-builder (Windows) and CRAN's Windows so perhaps it only occurs on a specific and different version of Windows to all those. Thanks to @demydd for reporting. Fixed by using strictly `logical` type at R level and `Rboolean` at C level, consistently throughout. - -6. 
Combining `on=` (new in v1.9.6) with `by=` or `keyby=` gave incorrect results, [#1943](https://github.com/Rdatatable/data.table/issues/1943). Many thanks to Henrik-P for the detailed and reproducible report. Tests added. - -7. New function `rleidv` was ignoring its `cols` argument, [#1942](https://github.com/Rdatatable/data.table/issues/1942). Thanks Josh O'Brien for reporting. Tests added. - -## NOTES - -1. It seems OpenMP is not available on CRAN's Mac platform; NOTEs appeared in [CRAN checks](https://cran.r-project.org/web/checks/check_results_data.table.html) for v1.9.8. Moved `Rprintf` from `init.c` to `packageStartupMessage` to avoid the NOTE as requested urgently by Professor Ripley. Also fixed the bad grammar of the message: 'single threaded' now 'single-threaded'. If you have a Mac and run macOS or OS X on it (I run Ubuntu on mine) please contact CRAN maintainers and/or Apple if you'd like CRAN's Mac binary to support OpenMP. Otherwise, please follow [these instructions for OpenMP on Mac](https://github.com/Rdatatable/data.table/wiki/Installation) which people have reported success with. - -2. Just to state explicitly: data.table does not now depend on or require OpenMP. If you don't have it (as on CRAN's Mac it appears but not in general on Mac) then data.table should build, run and pass all tests just fine. - -3. There are now 5,910 raw tests as reported by `test.data.table()`. Tests cover 91% of the 4k lines of R and 89% of the 7k lines of C. These stats are now known thanks to Jim Hester's [Covr](https://CRAN.R-project.org/package=covr) package and [Codecov.io](https://about.codecov.io/). If anyone is looking for something to help with, creating tests to hit the missed lines shown by clicking the `R` and `src` folders at the bottom [here](https://app.codecov.io/github/Rdatatable/data.table?branch=master) would be very much appreciated. - -4. The FAQ vignette has been revised given the changes in v1.9.8. In particular, the very first FAQ. - -5. With hindsight, the last release v1.9.8 should have been named v1.10.0 to convey it wasn't just a patch release from .6 to .8 owing to the 'potentially breaking changes' items. Thanks to @neomantic for correctly pointing out. The best we can do now is now bump to 1.10.0. 
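
A hedged sketch added by the editor for context on the coverage note above (not part of the original release notes): the test and coverage figures quoted there can be reproduced locally along these lines, assuming a source checkout of data.table and that the `covr` package is installed.

```R
library(data.table)
test.data.table()                    # runs the bundled test suite (the raw tests counted above)
# coverage is measured with covr, run from the root of a data.table source checkout:
cov <- covr::package_coverage(".")   # builds an instrumented copy and runs its tests
covr::percent_coverage(cov)          # overall share of R and C lines exercised
```
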
- - -# data.table v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) +# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md) From ac576061b8766efbe8e5995adfeaa62aef91bf77 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sat, 9 Dec 2023 14:41:07 +0100 Subject: [PATCH 587/588] ignore newly added file (#5818) --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 22a3a807fa..343b168b08 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -21,6 +21,7 @@ ^Makefile$ ^NEWS\.0\.md$ +^NEWS\.1\.md$ ^_pkgdown\.yml$ ^src/Makevars$ ^CODEOWNERS$ From f37f8e96b4d098f985df65c6394a53a9a7874c5a Mon Sep 17 00:00:00 2001 From: eitsupi <50911393+eitsupi@users.noreply.github.com> Date: Sun, 10 Dec 2023 00:28:03 +0900 Subject: [PATCH 588/588] Switch to pkgdown Bootstrap 5 template (#5505) * Switch to pkgdown bs5 templage * ignore docs dir used by pkgdown * Update _pkgdown.yml --- .Rbuildignore | 1 + .gitignore | 3 +++ _pkgdown.yml | 3 +++ 3 files changed, 7 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 343b168b08..9b64f62670 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -40,6 +40,7 @@ ^bus$ ^pkgdown$ +^docs$ ^lib$ ^library$ ^devwd$ diff --git a/.gitignore b/.gitignore index 559df7b9de..e05f2b803d 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ dev.R *.RDS *.diff *.patch + +# pkgdown +docs diff --git a/_pkgdown.yml b/_pkgdown.yml index 66488b9281..c69f920c09 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,5 +1,8 @@ url: https://rdatatable.gitlab.io/data.table +template: + bootstrap: 5 + development: version_tooltip: "Development version"
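
Editor's note on the `_pkgdown.yml` change in the patch above (a minimal sketch, not part of the patch itself): the Bootstrap 5 template only takes effect when the site is rebuilt, which with the `pkgdown` package is typically done from the package root roughly as follows.

```R
# install.packages("pkgdown")   # if not already installed
library(pkgdown)
build_site()   # re-renders the site using `template: bootstrap: 5` from _pkgdown.yml;
               # output goes to docs/, which the .gitignore change above now excludes
```
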

[base85-encoded GIT binary patch data omitted: these deltas update compiled translation catalogs (.mo files) and are not human-readable. The only diff header recoverable from this span is kept below.]
diff --git a/inst/po/zh_CN/LC_MESSAGES/data.table.mo b/inst/po/zh_CN/LC_MESSAGES/data.table.mo
index d8636f5bbc844d58d252de5f86f2b367fce443ad..74c2e7db0d54eab3712a70a1dfaffa19a5dab618 100644
GIT binary patch
delta 22958
[binary delta omitted]
zYTNcb$}h;bC6%Xs0;VPQ6n`e2BQ~D&J^43}oD|9u8;hUw{mwX2GaBj`gBeI`X;9FF z{6DMPde@$;u`R3ea~r#AW4S2*Yd>2Q^Vs@lIEnIX(i7S~M$ddQWo(0|Y6!KSpIO#Y&6N62YT{sD0viR3Hcl{e})+k8#aM)%KC!+zj0{z-!{ zVw*^f$X~*4_CxKd&q}$EEmy}el;0yBL+VR8wXLf{ygm8PiMymaHh!LbF`J*D@&Am% z$D|IVOH`I6&Edm8QeHs5E9#g?+CzRJ24Ew6PO3qh$D}XGKOkO}RF8a1Vk<~GJ|t}+ zu45YMH{yFp2ldOqS5(Z#&!||8IyMlSMJj4P)Q@;6;ulG4ZJTtqZU!zR?WS%S=@U}Q zQIpJfv~NQFF@28ZTnFab2LAr9MMVM95*q2)hj%Hzk4q?@BwvI4eca-2VSao{c{gb( zDdng~JI#%ZK>>cVVy@ae&Pa_&y-5>Di%6UF|C2pL<~->>DUeIcPO3&~M(R!)L7Gq6 zN;*xtLHdJ~HjWiSs!U2C^(Kub%^|HJ9U`44-6y>uW#XYyf>eW)KOhB6TJulID2J)XCa4FrF=l zRG!p~)SWbfG=sE)w4ZdAber@SDPsd(YDm>c%}CuzBS{ddtLR>sgA8TgyksTO>7VMof8qP{XKL7h*b@ zj+EGNVmTHUF7d4I4|jisI?K1L{Tuv`XilRyQK`fpq-&&YQVmw%A|EMcTG0SCS*W z`J&n$dT*(E5iJ_CBL~^48mKdQiDk1f zy~`*T!MiyV$A5|mHg=ns7Ot)$#Kt0ny-Vsvct!>X_+L9!6=|z{#EOxBGv@MW)a$8Y z+8Om#<%y-VONk93R?fD|Y)xg`@1;(k*I_%;^E#QJdRvFaRwvHay}$Q%;vBi#H@bBD__W0RJ0`nrLKns=ZmfvGEb9UsDz0tr?%K zXYH)ysML|0q(%dX{qGX+5K|xONco<5#57^`1#Bq(&)y`a_bJ8H9Xj8Otlr8y(lEkvi>j0x zENYwA;1%(IW6Oy}Qm<2l*>>lMrF2+IEF|auz2W~|0`F3%&+D+A?s?sg;QuaxEyUD= zZ+_WXV!A*bDPteYJ8#NnvWU3)wTctNE%U zr4RNIQ&Sx^Ncu=no*e;=yXN0fa8QO+=_0FDD^#l3kl5HAl@hy`42_EHGN5~8za9g6 zCH5{dtVg$^RU@NzY+2beXvf42ivxGWZ%!Y$W7f74%)VnmNjX!w>AjIBgDXdN9hw-~ zCoytJj~@MpM0Oka{>a7yha}caOdQ;&YmuUngL@=)=`&zRWX(niE#jI*cJarDCiWQO zJ#;i|$2-UFhwk{~hky(_X5C*LwjA2?G9}AYj*N?ycE-|KLaWHcXF3Gf6Wr@TzMa$E`GLM6bKHxmd{gGTzl0_=nD1t& zQ>5ta-J9>s9(Q-aRxYz}p&^CuZQFNe`SgeTXWUMn^#Aqb?SsqiEm(Ye?_N_kemdfZmchk)t-#NL yz^k{C8kBbP_-cm+%yxaR!vgYT%seEqOJci1QRTXKh>0mwHj2(_nLpt7^#2D?xJ\n" "Language-Team: LANGUAGE \n" diff --git a/po/data.table.pot b/po/data.table.pot index 78a6a2beeb..cea9c55a58 100644 --- a/po/data.table.pot +++ b/po/data.table.pot @@ -6,9 +6,9 @@ #, fuzzy msgid "" msgstr "" -"Project-Id-Version: data.table 1.12.9\n" +"Project-Id-Version: data.table 1.13.1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2020-07-17 14:38+0800\n" +"POT-Creation-Date: 2020-10-17 13:11-0400\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -410,162 +410,169 @@ msgstr "" msgid "Shortening index '%s' to '%s' due to an update on a key column\n" msgstr "" -#: assign.c:695 +#: assign.c:650 +#, c-format +msgid "" +"Internal error: %d column numbers to delete not now in strictly increasing " +"order. No-dups were checked earlier." +msgstr "" + +#: assign.c:688 #, c-format msgid "" "Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d" msgstr "" -#: assign.c:697 +#: assign.c:690 #, c-format msgid "Internal error memrecycle: start=%d len=%d length(target)=%d" msgstr "" -#: assign.c:700 +#: assign.c:693 #, c-format msgid "Internal error: recycle length error not caught earlier. slen=%d len=%d" msgstr "" -#: assign.c:704 +#: assign.c:697 msgid "Internal error: memrecycle has received NULL colname" msgstr "" -#: assign.c:730 +#: assign.c:706 #, c-format msgid "" "Cannot assign 'factor' to '%s'. Factors can only be assigned to factor, " "character or list columns." msgstr "" -#: assign.c:744 +#: assign.c:720 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %d is outside the " "level range [1,%d]" msgstr "" -#: assign.c:752 +#: assign.c:728 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %f is outside the " "level range [1,%d], or is not a whole number." msgstr "" -#: assign.c:758 +#: assign.c:734 #, c-format msgid "" "Cannot assign '%s' to 'factor'. Factor columns can be assigned factor, " "character, NA in any type, or level numbers." 
msgstr "" -#: assign.c:779 +#: assign.c:755 msgid "" "Internal error: levels of target are either not unique or have truelength<0" msgstr "" -#: assign.c:818 +#: assign.c:794 #, c-format msgid "Unable to allocate working memory of %d bytes to combine factor levels" msgstr "" -#: assign.c:825 +#: assign.c:801 msgid "Internal error: extra level check sum failed" msgstr "" -#: assign.c:844 +#: assign.c:820 #, c-format msgid "" "Coercing 'character' RHS to '%s' to match the type of the target column " "(column %d named '%s')." msgstr "" -#: assign.c:850 +#: assign.c:826 #, c-format msgid "" "Cannot coerce 'list' RHS to 'integer64' to match the type of the target " "column (column %d named '%s')." msgstr "" -#: assign.c:855 +#: assign.c:831 #, c-format msgid "" "Coercing 'list' RHS to '%s' to match the type of the target column (column " "%d named '%s')." msgstr "" -#: assign.c:861 +#: assign.c:837 #, c-format msgid "Zero-copy coerce when assigning '%s' to '%s' column %d named '%s'.\n" msgstr "" -#: assign.c:956 +#: assign.c:932 #, c-format msgid "type '%s' cannot be coerced to '%s'" msgstr "" -#: assign.c:1076 +#: assign.c:1052 msgid "" "To assign integer64 to a character column, please use as.character() for " "clarity." msgstr "" -#: assign.c:1088 +#: assign.c:1064 #, c-format msgid "Unsupported column type in assign.c:memrecycle '%s'" msgstr "" -#: assign.c:1135 +#: assign.c:1111 #, c-format msgid "Internal error: writeNA passed a vector of type '%s'" msgstr "" -#: assign.c:1166 +#: assign.c:1142 #, c-format msgid "" "Internal error: savetl_init checks failed (%d %d %p %p). please report to " "data.table issue tracker." msgstr "" -#: assign.c:1174 +#: assign.c:1150 #, c-format msgid "Failed to allocate initial %d items in savetl_init" msgstr "" -#: assign.c:1183 +#: assign.c:1159 #, c-format msgid "" "Internal error: reached maximum %d items for savetl. Please report to data." "table issue tracker." msgstr "" -#: assign.c:1190 +#: assign.c:1166 #, c-format msgid "Failed to realloc saveds to %d items in savetl" msgstr "" -#: assign.c:1196 +#: assign.c:1172 #, c-format msgid "Failed to realloc savedtl to %d items in savetl" msgstr "" -#: assign.c:1219 +#: assign.c:1195 msgid "x must be a character vector" msgstr "" -#: assign.c:1220 +#: assign.c:1196 msgid "'which' must be an integer vector" msgstr "" -#: assign.c:1221 +#: assign.c:1197 msgid "'new' must be a character vector" msgstr "" -#: assign.c:1222 +#: assign.c:1198 #, c-format msgid "'new' is length %d. Should be the same as length of 'which' (%d)" msgstr "" -#: assign.c:1225 +#: assign.c:1201 #, c-format msgid "" "Item %d of 'which' is %d which is outside range of the length %d character " @@ -777,14 +784,14 @@ msgstr "" msgid "x is type '%s' (must be 'character' or NULL)" msgstr "" -#: chmatch.c:66 +#: chmatch.c:71 #, c-format msgid "" "Internal error: CHARSXP '%s' has a negative truelength (%d). Please file an " "issue on the data.table tracker." 
msgstr "" -#: chmatch.c:95 +#: chmatch.c:100 #, c-format msgid "" "Failed to allocate % bytes working memory in chmatchdup: " @@ -860,103 +867,108 @@ msgstr "" msgid "Unsupported type: %s" msgstr "" -#: dogroups.c:15 +#: dogroups.c:69 msgid "Internal error: order not integer vector" msgstr "" -#: dogroups.c:16 +#: dogroups.c:70 msgid "Internal error: starts not integer" msgstr "" -#: dogroups.c:17 +#: dogroups.c:71 msgid "Internal error: lens not integer" msgstr "" -#: dogroups.c:19 +#: dogroups.c:73 msgid "Internal error: jiscols not NULL but o__ has length" msgstr "" -#: dogroups.c:20 +#: dogroups.c:74 msgid "Internal error: xjiscols not NULL but o__ has length" msgstr "" -#: dogroups.c:21 +#: dogroups.c:75 msgid "'env' should be an environment" msgstr "" -#: dogroups.c:40 +#: dogroups.c:94 #, c-format msgid "" "Internal error: unsupported size-0 type '%s' in column %d of 'by' should " "have been caught earlier" msgstr "" -#: dogroups.c:44 +#: dogroups.c:99 #, c-format msgid "!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]" msgstr "" -#: dogroups.c:63 +#: dogroups.c:121 msgid "row.names attribute of .SD not found" msgstr "" -#: dogroups.c:65 +#: dogroups.c:123 #, c-format msgid "" "row.names of .SD isn't integer length 2 with NA as first item; i.e., ." "set_row_names(). [%s %d %d]" msgstr "" -#: dogroups.c:70 +#: dogroups.c:128 msgid "length(names)!=length(SD)" msgstr "" -#: dogroups.c:74 +#: dogroups.c:134 #, c-format msgid "" "Internal error: size-0 type %d in .SD column %d should have been caught " "earlier" msgstr "" -#: dogroups.c:84 +#: dogroups.c:136 +#, c-format +msgid "Internal error: SDall %d length = %d != %d" +msgstr "" + +#: dogroups.c:144 msgid "length(xknames)!=length(xSD)" msgstr "" -#: dogroups.c:88 +#: dogroups.c:148 #, c-format msgid "" "Internal error: type %d in .xSD column %d should have been caught by now" msgstr "" -#: dogroups.c:92 +#: dogroups.c:152 #, c-format msgid "length(iSD)[%d] != length(jiscols)[%d]" msgstr "" -#: dogroups.c:93 +#: dogroups.c:153 #, c-format msgid "length(xSD)[%d] != length(xjiscols)[%d]" msgstr "" -#: dogroups.c:198 +#: dogroups.c:259 #, c-format msgid "j evaluates to type '%s'. Must evaluate to atomic vector or list." msgstr "" -#: dogroups.c:206 +#: dogroups.c:267 msgid "" "All items in j=list(...) should be atomic vectors or lists. If you are " "trying something like j=list(.SD,newcol=mean(colA)) then use := by group " "instead (much quicker), or cbind or merge afterwards." msgstr "" -#: dogroups.c:215 +#: dogroups.c:276 msgid "" "RHS of := is NULL during grouped assignment, but it's not possible to delete " "parts of a column." msgstr "" -#: dogroups.c:219 +#: dogroups.c:280 #, c-format msgid "" "Supplied %d items to be assigned to group %d of size %d in column '%s'. The " @@ -965,23 +977,23 @@ msgid "" "make this intent clear to readers of your code." msgstr "" -#: dogroups.c:230 +#: dogroups.c:291 msgid "" "Internal error: Trying to add new column by reference but tl is full; " "setalloccol should have run first at R level before getting to this point in " "dogroups" msgstr "" -#: dogroups.c:245 +#: dogroups.c:312 #, c-format msgid "Group %d column '%s': %s" msgstr "" -#: dogroups.c:252 +#: dogroups.c:319 msgid "j doesn't evaluate to the same number of columns for each group" msgstr "" -#: dogroups.c:286 +#: dogroups.c:353 #, c-format msgid "" "Column %d of j's result for the first group is NULL. We rely on the column " @@ -992,14 +1004,14 @@ msgid "" "integer() or numeric()." 
msgstr "" -#: dogroups.c:289 +#: dogroups.c:356 msgid "" "j appears to be a named vector. The same names will likely be created over " "and over again for each group and slow things down. Try and pass a named " "list (which data.table optimizes) or an unnamed list() instead.\n" msgstr "" -#: dogroups.c:291 +#: dogroups.c:358 #, c-format msgid "" "Column %d of j is a named vector (each item down the rows is named, " @@ -1007,7 +1019,7 @@ msgid "" "over and over for each group). They are ignored anyway.\n" msgstr "" -#: dogroups.c:299 +#: dogroups.c:366 msgid "" "The result of j is a named list. It's very inefficient to create the same " "names over and over again for each group. When j=list(...), any names are " @@ -1016,17 +1028,17 @@ msgid "" "to :=). This message may be upgraded to warning in future.\n" msgstr "" -#: dogroups.c:311 +#: dogroups.c:378 #, c-format msgid "dogroups: growing from %d to %d rows\n" msgstr "" -#: dogroups.c:312 +#: dogroups.c:379 #, c-format msgid "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" msgstr "" -#: dogroups.c:330 +#: dogroups.c:397 #, c-format msgid "" "Item %d of j's result for group %d is zero length. This will be filled with " @@ -1035,14 +1047,14 @@ msgid "" "buffer." msgstr "" -#: dogroups.c:337 +#: dogroups.c:404 #, c-format msgid "" "Column %d of result for group %d is type '%s' but expecting type '%s'. " "Column types must be consistent for each group." msgstr "" -#: dogroups.c:339 +#: dogroups.c:406 #, c-format msgid "" "Supplied %d items for column %d of group %d which has %d rows. The RHS " @@ -1051,33 +1063,33 @@ msgid "" "make this intent clear to readers of your code." msgstr "" -#: dogroups.c:354 +#: dogroups.c:427 #, c-format msgid "Wrote less rows (%d) than allocated (%d).\n" msgstr "" -#: dogroups.c:364 +#: dogroups.c:449 #, c-format msgid "Internal error: block 0 [%d] and block 1 [%d] have both run" msgstr "" -#: dogroups.c:366 +#: dogroups.c:451 #, c-format msgid "" "\n" " %s took %.3fs for %d groups\n" msgstr "" -#: dogroups.c:368 +#: dogroups.c:453 #, c-format msgid " eval(j) took %.3fs for %d calls\n" msgstr "" -#: dogroups.c:392 +#: dogroups.c:477 msgid "growVector passed NULL" msgstr "" -#: dogroups.c:412 +#: dogroups.c:497 #, c-format msgid "Internal error: growVector doesn't support type '%s'" msgstr "" @@ -1105,6 +1117,10 @@ msgstr "" msgid "Argument 'test' must be logical." msgstr "" +#: fifelse.c:9 +msgid "S4 class objects (except nanotime) are not supported." +msgstr "" + #: fifelse.c:28 #, c-format msgid "" @@ -1156,7 +1172,7 @@ msgstr "" msgid "'yes' and 'na' are both type factor but their levels are different." msgstr "" -#: fifelse.c:138 +#: fifelse.c:138 fifelse.c:336 #, c-format msgid "Type %s is not supported." msgstr "" @@ -1169,6 +1185,73 @@ msgid "" "that the default argument must be named explicitly, e.g., default=0" msgstr "" +#: fifelse.c:163 fifelse.c:203 +msgid "" +"S4 class objects (except nanotime) are not supported. Please see https://" +"github.com/Rdatatable/data.table/issues/4131." +msgstr "" + +#: fifelse.c:174 +msgid "Length of 'default' must be 1." +msgstr "" + +#: fifelse.c:181 +#, c-format +msgid "" +"Resulting value is of type %s but 'default' is of type %s. Please make sure " +"that both arguments have the same type." +msgstr "" + +#: fifelse.c:185 +msgid "" +"Resulting value has different class than 'default'. Please make sure that " +"both arguments have the same class." 
+msgstr "" + +#: fifelse.c:191 +msgid "" +"Resulting value and 'default' are both type factor but their levels are " +"different." +msgstr "" + +#: fifelse.c:206 +#, c-format +msgid "Argument #%d must be logical." +msgstr "" + +#: fifelse.c:210 +#, c-format +msgid "" +"Argument #%d has a different length than argument #1. Please make sure all " +"logical conditions have the same length." +msgstr "" + +#: fifelse.c:215 +#, c-format +msgid "" +"Argument #%d is of type %s, however argument #2 is of type %s. Please make " +"sure all output values have the same type." +msgstr "" + +#: fifelse.c:220 +#, c-format +msgid "" +"Argument #%d has different class than argument #2, Please make sure all " +"output values have the same class." +msgstr "" + +#: fifelse.c:226 +#, c-format +msgid "" +"Argument #2 and argument #%d are both factor but their levels are different." +msgstr "" + +#: fifelse.c:233 +#, c-format +msgid "" +"Length of output value #%d must either be 1 or length of logical condition." +msgstr "" + #: fmelt.c:18 msgid "'x' must be an integer" msgstr "" @@ -1181,27 +1264,27 @@ msgstr "" msgid "Argument to 'which' must be logical" msgstr "" -#: fmelt.c:70 -msgid "concat: 'vec must be a character vector" +#: fmelt.c:65 +msgid "concat: 'vec' must be a character vector" msgstr "" -#: fmelt.c:71 +#: fmelt.c:66 msgid "concat: 'idx' must be an integer vector of length >= 0" msgstr "" #: fmelt.c:75 #, c-format msgid "" -"Internal error in concat: 'idx' must take values between 0 and length(vec); " -"0 <= idx <= %d" +"Internal error in concat: 'idx' must take values between 1 and length(vec); " +"1 <= idx <= %d" msgstr "" -#: fmelt.c:102 +#: fmelt.c:117 #, c-format msgid "Unknown 'measure.vars' type %s at index %d of list" msgstr "" -#: fmelt.c:148 +#: fmelt.c:162 #, c-format msgid "" "id.vars and measure.vars are internally guessed when both are 'NULL'. All " @@ -1210,80 +1293,80 @@ msgid "" "'measure' vars in future." msgstr "" -#: fmelt.c:154 fmelt.c:219 +#: fmelt.c:168 fmelt.c:233 #, c-format msgid "Unknown 'id.vars' type %s, must be character or integer vector" msgstr "" -#: fmelt.c:159 fmelt.c:223 +#: fmelt.c:173 fmelt.c:237 msgid "One or more values in 'id.vars' is invalid." msgstr "" -#: fmelt.c:175 +#: fmelt.c:189 msgid "" "'measure.vars' is missing. Assigning all columns other than 'id.vars' " "columns as 'measure.vars'.\n" msgstr "" -#: fmelt.c:176 +#: fmelt.c:190 #, c-format msgid "Assigned 'measure.vars' are [%s].\n" msgstr "" -#: fmelt.c:184 +#: fmelt.c:198 #, c-format msgid "" "Unknown 'measure.vars' type %s, must be character or integer vector/list" msgstr "" -#: fmelt.c:193 fmelt.c:239 +#: fmelt.c:207 fmelt.c:253 msgid "One or more values in 'measure.vars' is invalid." msgstr "" -#: fmelt.c:211 +#: fmelt.c:225 msgid "" "'id.vars' is missing. Assigning all columns other than 'measure.vars' " "columns as 'id.vars'.\n" msgstr "" -#: fmelt.c:212 +#: fmelt.c:226 #, c-format msgid "Assigned 'id.vars' are [%s].\n" msgstr "" -#: fmelt.c:231 +#: fmelt.c:245 #, c-format msgid "Unknown 'measure.vars' type %s, must be character or integer vector" msgstr "" -#: fmelt.c:276 +#: fmelt.c:290 msgid "" "When 'measure.vars' is a list, 'value.name' must be a character vector of " "length =1 or =length(measure.vars)." msgstr "" -#: fmelt.c:277 +#: fmelt.c:291 msgid "" "When 'measure.vars' is either not specified or a character/integer vector, " "'value.name' must be a character vector of length =1." 
msgstr "" -#: fmelt.c:280 +#: fmelt.c:294 msgid "'variable.name' must be a character/integer vector of length=1." msgstr "" -#: fmelt.c:329 +#: fmelt.c:343 msgid "" "Internal error: combineFactorLevels in fmelt.c expects all-character input" msgstr "" -#: fmelt.c:332 +#: fmelt.c:346 msgid "" "Internal error: combineFactorLevels in fmelt.c expects a character target to " "factorize" msgstr "" -#: fmelt.c:385 +#: fmelt.c:399 #, c-format msgid "" "'measure.vars' [%s] are not all of the same type. By order of hierarchy, the " @@ -1292,60 +1375,60 @@ msgid "" "coercion.\n" msgstr "" -#: fmelt.c:387 +#: fmelt.c:401 #, c-format msgid "" "The molten data value type is a list at item %d. 'na.rm=TRUE' is ignored.\n" msgstr "" -#: fmelt.c:490 +#: fmelt.c:504 #, c-format msgid "Unknown column type '%s' for column '%s'." msgstr "" -#: fmelt.c:514 +#: fmelt.c:528 #, c-format msgid "Internal error: fmelt.c:getvarcols %d %d" msgstr "" -#: fmelt.c:662 +#: fmelt.c:676 #, c-format msgid "Unknown column type '%s' for column '%s' in 'data'" msgstr "" -#: fmelt.c:673 +#: fmelt.c:687 msgid "Input is not of type VECSXP, expected a data.table, data.frame or list" msgstr "" -#: fmelt.c:674 +#: fmelt.c:688 msgid "Argument 'value.factor' should be logical TRUE/FALSE" msgstr "" -#: fmelt.c:675 +#: fmelt.c:689 msgid "Argument 'variable.factor' should be logical TRUE/FALSE" msgstr "" -#: fmelt.c:676 +#: fmelt.c:690 msgid "Argument 'na.rm' should be logical TRUE/FALSE." msgstr "" -#: fmelt.c:677 +#: fmelt.c:691 msgid "Argument 'variable.name' must be a character vector" msgstr "" -#: fmelt.c:678 +#: fmelt.c:692 msgid "Argument 'value.name' must be a character vector" msgstr "" -#: fmelt.c:679 +#: fmelt.c:693 msgid "Argument 'verbose' should be logical TRUE/FALSE" msgstr "" -#: fmelt.c:683 +#: fmelt.c:697 msgid "ncol(data) is 0. Nothing to melt. Returning original data.table." msgstr "" -#: fmelt.c:688 +#: fmelt.c:702 msgid "names(data) is NULL. Please report to data.table-help" msgstr "" @@ -1481,6 +1564,11 @@ msgstr "" msgid "na.last must be logical TRUE, FALSE or NA of length 1" msgstr "" +#: forder.c:504 forder.c:608 +#, c-format +msgid "Unable to allocate % bytes of working memory" +msgstr "" + #: forder.c:520 #, c-format msgid "Item %d of order (ascending/descending) is %d. Must be +1 or -1." @@ -1658,332 +1746,332 @@ msgstr "" msgid " File copy in RAM took %.3f seconds.\n" msgstr "" -#: fread.c:1248 +#: fread.c:1249 msgid "" "Previous fread() session was not cleaned up properly. Cleaned up ok at the " "beginning of this fread() call.\n" msgstr "" -#: fread.c:1251 +#: fread.c:1252 msgid "[01] Check arguments\n" msgstr "" -#: fread.c:1258 +#: fread.c:1259 #, c-format msgid " Using %d threads (omp_get_max_threads()=%d, nth=%d)\n" msgstr "" -#: fread.c:1266 +#: fread.c:1267 msgid "" "Internal error: NAstrings is itself NULL. When empty it should be pointer to " "NULL." msgstr "" -#: fread.c:1284 +#: fread.c:1285 #, c-format msgid "freadMain: NAstring <<%s>> has whitespace at the beginning or end" msgstr "" -#: fread.c:1289 +#: fread.c:1290 #, c-format msgid "" "freadMain: NAstring <<%s>> is recognized as type boolean, this is not " "permitted." 
msgstr "" -#: fread.c:1300 +#: fread.c:1301 msgid " No NAstrings provided.\n" msgstr "" -#: fread.c:1302 +#: fread.c:1303 msgid " NAstrings = [" msgstr "" -#: fread.c:1305 +#: fread.c:1306 msgid "]\n" msgstr "" -#: fread.c:1307 +#: fread.c:1308 msgid " One or more of the NAstrings looks like a number.\n" msgstr "" -#: fread.c:1309 +#: fread.c:1310 msgid " None of the NAstrings look like numbers.\n" msgstr "" -#: fread.c:1311 +#: fread.c:1312 #, c-format msgid " skip num lines = %\n" msgstr "" -#: fread.c:1312 +#: fread.c:1313 #, c-format msgid " skip to string = <<%s>>\n" msgstr "" -#: fread.c:1313 +#: fread.c:1314 #, c-format msgid " show progress = %d\n" msgstr "" -#: fread.c:1314 +#: fread.c:1315 #, c-format msgid " 0/1 column will be read as %s\n" msgstr "" -#: fread.c:1322 +#: fread.c:1323 #, c-format msgid "sep == quote ('%c') is not allowed" msgstr "" -#: fread.c:1323 +#: fread.c:1324 msgid "dec='' not allowed. Should be '.' or ','" msgstr "" -#: fread.c:1324 +#: fread.c:1325 #, c-format msgid "sep == dec ('%c') is not allowed" msgstr "" -#: fread.c:1325 +#: fread.c:1326 #, c-format msgid "quote == dec ('%c') is not allowed" msgstr "" -#: fread.c:1342 +#: fread.c:1343 msgid "[02] Opening the file\n" msgstr "" -#: fread.c:1345 +#: fread.c:1346 msgid "" " `input` argument is provided rather than a file name, interpreting as raw " "text to read\n" msgstr "" -#: fread.c:1349 +#: fread.c:1350 msgid "Internal error: last byte of character input isn't \\0" msgstr "" -#: fread.c:1352 +#: fread.c:1353 #, c-format msgid " Opening file %s\n" msgstr "" -#: fread.c:1356 +#: fread.c:1357 #, c-format msgid "file not found: %s" msgstr "" -#: fread.c:1360 +#: fread.c:1361 #, c-format msgid "Opened file ok but couldn't obtain its size: %s" msgstr "" -#: fread.c:1363 fread.c:1391 +#: fread.c:1364 fread.c:1392 #, c-format msgid "File is empty: %s" msgstr "" -#: fread.c:1364 fread.c:1392 +#: fread.c:1365 fread.c:1393 #, c-format msgid " File opened, size = %s.\n" msgstr "" -#: fread.c:1381 +#: fread.c:1382 #, c-format msgid "File not found: %s" msgstr "" -#: fread.c:1387 +#: fread.c:1388 #, c-format msgid "Unable to open file after %d attempts (error %d): %s" msgstr "" -#: fread.c:1389 +#: fread.c:1390 #, c-format msgid "GetFileSizeEx failed (returned 0) on file: %s" msgstr "" -#: fread.c:1394 +#: fread.c:1395 #, c-format msgid "This is Windows, CreateFileMapping returned error %d for file %s" msgstr "" -#: fread.c:1401 +#: fread.c:1402 #, c-format msgid "" "Opened %s file ok but could not memory map it. This is a %dbit process. %s." msgstr "" -#: fread.c:1402 +#: fread.c:1403 msgid "Please upgrade to 64bit" msgstr "" -#: fread.c:1402 +#: fread.c:1403 msgid "There is probably not enough contiguous virtual memory available" msgstr "" -#: fread.c:1405 +#: fread.c:1406 msgid " Memory mapped ok\n" msgstr "" -#: fread.c:1407 +#: fread.c:1408 msgid "" "Internal error: Neither `input` nor `filename` are given, nothing to read." msgstr "" -#: fread.c:1424 +#: fread.c:1425 msgid "[03] Detect and skip BOM\n" msgstr "" -#: fread.c:1428 +#: fread.c:1429 msgid "" " UTF-8 byte order mark EF BB BF found at the start of the file and " "skipped.\n" msgstr "" -#: fread.c:1433 +#: fread.c:1434 msgid "" "GB-18030 encoding detected, however fread() is unable to decode it. Some " "character fields may be garbled.\n" msgstr "" -#: fread.c:1436 +#: fread.c:1437 msgid "" "File is encoded in UTF-16, this encoding is not supported by fread(). Please " "recode the file to UTF-8." 
msgstr "" -#: fread.c:1441 +#: fread.c:1442 #, c-format msgid " Last byte(s) of input found to be %s and removed.\n" msgstr "" -#: fread.c:1444 +#: fread.c:1445 msgid "Input is empty or only contains BOM or terminal control characters" msgstr "" -#: fread.c:1451 +#: fread.c:1452 msgid "[04] Arrange mmap to be \\0 terminated\n" msgstr "" -#: fread.c:1458 +#: fread.c:1459 msgid "" " No \\n exists in the file at all, so single \\r (if any) will be taken as " "one line ending. This is unusual but will happen normally when there is no " "\\r either; e.g. a single line missing its end of line.\n" msgstr "" -#: fread.c:1459 +#: fread.c:1460 msgid "" " \\n has been found in the input and different lines can end with different " "line endings (e.g. mixed \\n and \\r\\n in one file). This is common and " "ideal.\n" msgstr "" -#: fread.c:1483 +#: fread.c:1484 #, c-format msgid "" " File ends abruptly with '%c'. Final end-of-line is missing. Using cow page " "to write 0 to the last byte.\n" msgstr "" -#: fread.c:1489 +#: fread.c:1490 msgid "" "This file is very unusual: it ends abruptly without a final newline, and " "also its size is a multiple of 4096 bytes. Please properly end the last row " "with a newline using for example 'echo >> file' to avoid this " msgstr "" -#: fread.c:1490 +#: fread.c:1491 #, c-format msgid " File ends abruptly with '%c'. Copying file in RAM. %s copy.\n" msgstr "" -#: fread.c:1524 +#: fread.c:1525 msgid "[05] Skipping initial rows if needed\n" msgstr "" -#: fread.c:1530 +#: fread.c:1531 #, c-format msgid "" "skip='%s' not found in input (it is case sensitive and literal; i.e., no " "patterns, wildcards or regex)" msgstr "" -#: fread.c:1536 +#: fread.c:1537 #, c-format msgid "" "Found skip='%s' on line %. Taking this to be header row or first row " "of data.\n" msgstr "" -#: fread.c:1549 +#: fread.c:1550 #, c-format msgid " Skipped to line % in the file" msgstr "" -#: fread.c:1550 +#: fread.c:1551 #, c-format msgid "skip=% but the input only has % line%s" msgstr "" -#: fread.c:1559 +#: fread.c:1560 msgid "" "Input is either empty, fully whitespace, or skip has been set after the last " "non-whitespace." msgstr "" -#: fread.c:1561 +#: fread.c:1562 #, c-format msgid " Moved forward to first non-blank line (%d)\n" msgstr "" -#: fread.c:1562 +#: fread.c:1563 #, c-format msgid " Positioned on line %d starting: <<%s>>\n" msgstr "" -#: fread.c:1580 +#: fread.c:1581 msgid "[06] Detect separator, quoting rule, and ncolumns\n" msgstr "" -#: fread.c:1584 +#: fread.c:1585 msgid " sep='\\n' passed in meaning read lines as single character column\n" msgstr "" -#: fread.c:1603 +#: fread.c:1604 msgid " Detecting sep automatically ...\n" msgstr "" -#: fread.c:1610 +#: fread.c:1611 #, c-format msgid " Using supplied sep '%s'\n" msgstr "" -#: fread.c:1644 +#: fread.c:1645 #, c-format msgid " with %d fields using quote rule %d\n" msgstr "" -#: fread.c:1694 +#: fread.c:1695 #, c-format msgid " with %d lines of %d fields using quote rule %d\n" msgstr "" -#: fread.c:1701 +#: fread.c:1702 msgid "" " No sep and quote rule found a block of 2x2 or greater. Single column " "input.\n" msgstr "" -#: fread.c:1717 +#: fread.c:1718 msgid "" "Single column input contains invalid quotes. Self healing only effective " "when ncol>1" msgstr "" -#: fread.c:1722 +#: fread.c:1723 #, c-format msgid "" "Found and resolved improper quoting in first %d rows. If the fields are not " @@ -1991,386 +2079,386 @@ msgid "" "\"\" to avoid this warning." 
msgstr "" -#: fread.c:1738 +#: fread.c:1739 #, c-format msgid "" "Internal error: ncol==%d line==%d after detecting sep, ncol and first line" msgstr "" -#: fread.c:1741 +#: fread.c:1742 #, c-format msgid "Internal error: first line has field count %d but expecting %d" msgstr "" -#: fread.c:1743 +#: fread.c:1744 #, c-format msgid "" " Detected %d columns on line %d. This line is either column names or first " "data row. Line starts as: <<%s>>\n" msgstr "" -#: fread.c:1745 +#: fread.c:1746 #, c-format msgid " Quote rule picked = %d\n" msgstr "" -#: fread.c:1746 +#: fread.c:1747 #, c-format msgid " fill=%s and the most number of columns found is %d\n" msgstr "" -#: fread.c:1752 +#: fread.c:1753 msgid "" "This file is very unusual: it's one single column, ends with 2 or more end-" "of-line (representing several NA at the end), and is a multiple of 4096, too." msgstr "" -#: fread.c:1753 +#: fread.c:1754 #, c-format msgid " Copying file in RAM. %s\n" msgstr "" -#: fread.c:1759 +#: fread.c:1760 msgid "" " 1-column file ends with 2 or more end-of-line. Restoring last eol using " "extra byte in cow page.\n" msgstr "" -#: fread.c:1778 +#: fread.c:1779 msgid "" "[07] Detect column types, good nrow estimate and whether first row is column " "names\n" msgstr "" -#: fread.c:1779 +#: fread.c:1780 #, c-format msgid " 'header' changed by user from 'auto' to %s\n" msgstr "" -#: fread.c:1783 +#: fread.c:1784 #, c-format msgid "Failed to allocate 2 x %d bytes for type and tmpType: %s" msgstr "" -#: fread.c:1804 +#: fread.c:1805 #, c-format msgid " Number of sampling jump points = %d because " msgstr "" -#: fread.c:1805 +#: fread.c:1806 #, c-format msgid "nrow limit (%) supplied\n" msgstr "" -#: fread.c:1806 +#: fread.c:1807 msgid "jump0size==0\n" msgstr "" -#: fread.c:1807 +#: fread.c:1808 #, c-format msgid "" "(% bytes from row 1 to eof) / (2 * % jump0size) == " "%\n" msgstr "" -#: fread.c:1845 +#: fread.c:1846 #, c-format msgid "" " A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. " "%s\n" msgstr "" -#: fread.c:1846 +#: fread.c:1847 msgid "few" msgstr "" -#: fread.c:1846 +#: fread.c:1847 msgid "many" msgstr "" -#: fread.c:1846 +#: fread.c:1847 msgid "" "Most likely this jump landed awkwardly so type bumps here will be skipped." msgstr "" -#: fread.c:1872 +#: fread.c:1873 #, c-format msgid " Type codes (jump %03d) : %s Quote rule %d\n" msgstr "" -#: fread.c:1885 +#: fread.c:1886 #, c-format msgid "" " 'header' determined to be true due to column %d containing a string on row " "1 and a lower type (%s) in the rest of the %d sample rows\n" msgstr "" -#: fread.c:1897 +#: fread.c:1898 msgid "" "Internal error: row before first data row has the same number of fields but " "we're not using it." msgstr "" -#: fread.c:1898 +#: fread.c:1899 msgid "" "Internal error: ch!=pos after counting fields in the line before the first " "data row." msgstr "" -#: fread.c:1899 +#: fread.c:1900 #, c-format msgid "" "Types in 1st data row match types in 2nd data row but previous row has %d " "fields. Taking previous row as column names." msgstr "" -#: fread.c:1902 +#: fread.c:1903 #, c-format msgid "" "Detected %d column names but the data has %d columns (i.e. invalid file). " "Added %d extra default column name%s\n" msgstr "" -#: fread.c:1903 +#: fread.c:1904 msgid "" " for the first column which is guessed to be row names or an index. Use " "setnames() afterwards if this guess is not correct, or fix the file write " "command that created the file to create a valid file." 
msgstr "" -#: fread.c:1903 +#: fread.c:1904 msgid "s at the end." msgstr "" -#: fread.c:1905 +#: fread.c:1906 msgid "" "Internal error: fill=true but there is a previous row which should already " "have been filled." msgstr "" -#: fread.c:1906 +#: fread.c:1907 #, c-format msgid "" "Detected %d column names but the data has %d columns. Filling rows " "automatically. Set fill=TRUE explicitly to avoid this warning.\n" msgstr "" -#: fread.c:1910 +#: fread.c:1911 #, c-format msgid "Failed to realloc 2 x %d bytes for type and tmpType: %s" msgstr "" -#: fread.c:1930 +#: fread.c:1931 #, c-format msgid "" " 'header' determined to be %s because there are%s number fields in the " "first and only row\n" msgstr "" -#: fread.c:1930 +#: fread.c:1931 msgid " no" msgstr "" -#: fread.c:1933 +#: fread.c:1934 msgid "" " 'header' determined to be true because all columns are type string and a " "better guess is not possible\n" msgstr "" -#: fread.c:1935 +#: fread.c:1936 msgid "" " 'header' determined to be false because there are some number columns and " "those columns do not have a string field at the top of them\n" msgstr "" -#: fread.c:1951 +#: fread.c:1952 #, c-format msgid " Type codes (first row) : %s Quote rule %d\n" msgstr "" -#: fread.c:1960 +#: fread.c:1961 #, c-format msgid "" " All rows were sampled since file is small so we know nrow=% " "exactly\n" msgstr "" -#: fread.c:1972 fread.c:1979 +#: fread.c:1973 fread.c:1980 msgid " =====\n" msgstr "" -#: fread.c:1973 +#: fread.c:1974 #, c-format msgid "" " Sampled % rows (handled \\n inside quoted fields) at %d jump " "points\n" msgstr "" -#: fread.c:1974 +#: fread.c:1975 #, c-format msgid "" " Bytes from first data row on line %d to the end of last row: %\n" msgstr "" -#: fread.c:1975 +#: fread.c:1976 #, c-format msgid " Line length: mean=%.2f sd=%.2f min=%d max=%d\n" msgstr "" -#: fread.c:1976 +#: fread.c:1977 #, c-format msgid " Estimated number of rows: % / %.2f = %\n" msgstr "" -#: fread.c:1977 +#: fread.c:1978 #, c-format msgid "" " Initial alloc = % rows (% + %d%%) using bytes/" "max(mean-2*sd,min) clamped between [1.1*estn, 2.0*estn]\n" msgstr "" -#: fread.c:1981 +#: fread.c:1982 #, c-format msgid "Internal error: sampleLines(%) > allocnrow(%)" msgstr "" -#: fread.c:1985 +#: fread.c:1986 #, c-format msgid " Alloc limited to lower nrows=% passed in.\n" msgstr "" -#: fread.c:1997 +#: fread.c:1998 msgid "[08] Assign column names\n" msgstr "" -#: fread.c:2005 +#: fread.c:2006 #, c-format msgid "Unable to allocate %d*%d bytes for column name pointers: %s" msgstr "" -#: fread.c:2027 +#: fread.c:2028 #, c-format msgid "Internal error: reading colnames ending on '%c'" msgstr "" -#: fread.c:2045 +#: fread.c:2046 msgid "[09] Apply user overrides on column types\n" msgstr "" -#: fread.c:2049 +#: fread.c:2050 msgid " Cancelled by user: userOverride() returned false." msgstr "" -#: fread.c:2059 +#: fread.c:2060 #, c-format msgid "Failed to allocate %d bytes for size array: %s" msgstr "" -#: fread.c:2066 +#: fread.c:2067 #, c-format msgid "" -"Attempt to override column %d <<%.*s>> of inherent type '%s' down to '%s' " +"Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' " "ignored. Only overrides to a higher type are currently supported. If this " "was intended, please coerce to the lower type afterwards." 
msgstr "" -#: fread.c:2080 +#: fread.c:2082 #, c-format msgid " After %d type and %d drop user overrides : %s\n" msgstr "" -#: fread.c:2088 +#: fread.c:2090 msgid "[10] Allocate memory for the datatable\n" msgstr "" -#: fread.c:2089 +#: fread.c:2091 #, c-format msgid " Allocating %d column slots (%d - %d dropped) with % rows\n" msgstr "" -#: fread.c:2143 +#: fread.c:2145 #, c-format msgid "Buffer size % is too large\n" msgstr "" -#: fread.c:2146 +#: fread.c:2148 msgid "[11] Read the data\n" msgstr "" -#: fread.c:2149 +#: fread.c:2151 #, c-format msgid " jumps=[%d..%d), chunk_size=%, total_size=%\n" msgstr "" -#: fread.c:2161 +#: fread.c:2163 #, c-format msgid "Internal error: Master thread is not thread 0 but thread %d.\n" msgstr "" -#: fread.c:2369 +#: fread.c:2371 #, c-format msgid "" "Column %d (\"%.*s\") bumped from '%s' to '%s' due to <<%.*s>> on row " "%\n" msgstr "" -#: fread.c:2418 +#: fread.c:2421 #, c-format msgid "" "Internal error: invalid head position. jump=%d, headPos=%p, thisJumpStart=" "%p, sof=%p" msgstr "" -#: fread.c:2491 +#: fread.c:2494 #, c-format msgid "" " Too few rows allocated. Allocating additional % rows (now nrows=" "%) and continue reading from jump %d\n" msgstr "" -#: fread.c:2498 +#: fread.c:2501 #, c-format msgid " Restarting team from jump %d. nSwept==%d quoteRule==%d\n" msgstr "" -#: fread.c:2518 +#: fread.c:2521 #, c-format msgid " %d out-of-sample type bumps: %s\n" msgstr "" -#: fread.c:2554 +#: fread.c:2557 #, c-format msgid "" "Read % rows x %d columns from %s file in %02d:%06.3f wall clock " "time\n" msgstr "" -#: fread.c:2561 +#: fread.c:2564 msgid "[12] Finalizing the datatable\n" msgstr "" -#: fread.c:2562 +#: fread.c:2565 msgid " Type counts:\n" msgstr "" -#: fread.c:2564 +#: fread.c:2567 #, c-format msgid "%10d : %-9s '%c'\n" msgstr "" -#: fread.c:2580 +#: fread.c:2583 #, c-format msgid "Discarded single-line footer: <<%s>>" msgstr "" -#: fread.c:2585 +#: fread.c:2588 #, c-format msgid "" "Stopped early on line %. Expected %d fields but found %d. Consider " "fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>" msgstr "" -#: fread.c:2591 +#: fread.c:2594 #, c-format msgid "" "Found and resolved improper quoting out-of-sample. First healed line " @@ -2378,213 +2466,213 @@ msgid "" "not appear within any field), try quote=\"\" to avoid this warning." 
msgstr "" -#: fread.c:2595 +#: fread.c:2598 msgid "=============================\n" msgstr "" -#: fread.c:2597 +#: fread.c:2600 #, c-format msgid "%8.3fs (%3.0f%%) Memory map %.3fGB file\n" msgstr "" -#: fread.c:2598 +#: fread.c:2601 #, c-format msgid "%8.3fs (%3.0f%%) sep=" msgstr "" -#: fread.c:2600 +#: fread.c:2603 #, c-format msgid " ncol=%d and header detection\n" msgstr "" -#: fread.c:2601 +#: fread.c:2604 #, c-format msgid "%8.3fs (%3.0f%%) Column type detection using % sample rows\n" msgstr "" -#: fread.c:2603 +#: fread.c:2606 #, c-format msgid "" "%8.3fs (%3.0f%%) Allocation of % rows x %d cols (%.3fGB) of which " "% (%3.0f%%) rows used\n" msgstr "" -#: fread.c:2607 +#: fread.c:2610 #, c-format msgid "" "%8.3fs (%3.0f%%) Reading %d chunks (%d swept) of %.3fMB (each chunk %d rows) " "using %d threads\n" msgstr "" -#: fread.c:2609 +#: fread.c:2612 #, c-format msgid "" " + %8.3fs (%3.0f%%) Parse to row-major thread buffers (grown %d times)\n" msgstr "" -#: fread.c:2610 +#: fread.c:2613 #, c-format msgid " + %8.3fs (%3.0f%%) Transpose\n" msgstr "" -#: fread.c:2611 +#: fread.c:2614 #, c-format msgid " + %8.3fs (%3.0f%%) Waiting\n" msgstr "" -#: fread.c:2612 +#: fread.c:2615 #, c-format msgid "" "%8.3fs (%3.0f%%) Rereading %d columns due to out-of-sample type exceptions\n" msgstr "" -#: fread.c:2614 +#: fread.c:2617 #, c-format msgid "%8.3fs Total\n" msgstr "" -#: freadR.c:85 +#: freadR.c:86 msgid "" "Internal error: freadR input not a single character string: a filename or " "the data itself. Should have been caught at R level." msgstr "" -#: freadR.c:93 +#: freadR.c:94 msgid "" "Input contains a \\n or is \")\". Taking this to be text input (not a " "filename)\n" msgstr "" -#: freadR.c:96 +#: freadR.c:97 msgid "Input contains no \\n. Taking this to be a filename to open\n" msgstr "" -#: freadR.c:102 +#: freadR.c:103 msgid "" "Internal error: freadR sep not a single character. R level catches this." msgstr "" -#: freadR.c:106 +#: freadR.c:107 msgid "" "Internal error: freadR dec not a single character. R level catches this." msgstr "" -#: freadR.c:113 +#: freadR.c:114 msgid "quote= must be a single character, blank \"\", or FALSE" msgstr "" -#: freadR.c:143 +#: freadR.c:144 msgid "Internal error: skip not integer or string in freadR.c" msgstr "" -#: freadR.c:146 +#: freadR.c:147 #, c-format msgid "Internal error: NAstringsArg is type '%s'. R level catches this" msgstr "" -#: freadR.c:159 +#: freadR.c:160 #, c-format msgid "nThread(%d)<1" msgstr "" -#: freadR.c:166 +#: freadR.c:168 msgid "'integer64' must be a single character string" msgstr "" -#: freadR.c:174 +#: freadR.c:176 #, c-format msgid "" "Invalid value integer64='%s'. Must be 'integer64', 'character', 'double' or " "'numeric'" msgstr "" -#: freadR.c:182 +#: freadR.c:184 msgid "Use either select= or drop= but not both." msgstr "" -#: freadR.c:185 +#: freadR.c:187 msgid "" "select= is type list for specifying types in select=, but colClasses= has " "been provided as well. Please remove colClasses=." msgstr "" -#: freadR.c:187 +#: freadR.c:189 msgid "" "select= is type list but has no names; expecting list(type1=cols1, " "type2=cols2, ...)" msgstr "" -#: freadR.c:194 +#: freadR.c:196 msgid "" "select= is a named vector specifying the columns to select and their types, " "but colClasses= has been provided as well. Please remove colClasses=." 
msgstr "" -#: freadR.c:202 freadR.c:368 +#: freadR.c:204 freadR.c:370 msgid "colClasses is type list but has no names" msgstr "" -#: freadR.c:212 +#: freadR.c:214 #, c-format msgid "encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'" msgstr "" -#: freadR.c:235 +#: freadR.c:237 #, c-format msgid "Column name '%s' (%s) not found" msgstr "" -#: freadR.c:237 +#: freadR.c:239 #, c-format msgid "%s is NA" msgstr "" -#: freadR.c:239 +#: freadR.c:241 #, c-format msgid "%s is %d which is out of range [1,ncol=%d]" msgstr "" -#: freadR.c:253 +#: freadR.c:255 msgid "Internal error: typeSize[CT_BOOL8_N] != 1" msgstr "" -#: freadR.c:254 +#: freadR.c:256 msgid "Internal error: typeSize[CT_STRING] != 1" msgstr "" -#: freadR.c:288 +#: freadR.c:290 #, c-format msgid "" "Column name '%s' not found in column name header (case sensitive), skipping." msgstr "" -#: freadR.c:298 +#: freadR.c:300 #, c-format msgid "" "Column number %d (select[%d]) is negative but should be in the range [1,ncol=" "%d]. Consider drop= for column exclusion." msgstr "" -#: freadR.c:299 +#: freadR.c:301 #, c-format msgid "" "select = 0 (select[%d]) has no meaning. All values of select should be in " "the range [1,ncol=%d]." msgstr "" -#: freadR.c:300 +#: freadR.c:302 #, c-format msgid "" "Column number %d (select[%d]) is too large for this table, which only has %d " "columns." msgstr "" -#: freadR.c:301 +#: freadR.c:303 #, c-format msgid "Column number %d ('%s') has been selected twice by select=" msgstr "" -#: freadR.c:324 +#: freadR.c:326 #, c-format msgid "" "colClasses= is an unnamed vector of types, length %d, but there are %d " @@ -2593,54 +2681,54 @@ msgid "" "colClasses=. Please see examples in ?fread." msgstr "" -#: freadR.c:344 +#: freadR.c:346 msgid "Internal error: selectInts is NULL but selectColClasses is true" msgstr "" -#: freadR.c:346 +#: freadR.c:348 msgid "" "Internal error: length(selectSxp)!=length(colClassesSxp) but " "selectColClasses is true" msgstr "" -#: freadR.c:366 +#: freadR.c:368 #, c-format msgid "colClasses is type '%s' but should be list or character" msgstr "" -#: freadR.c:390 +#: freadR.c:392 #, c-format msgid "Column name '%s' (colClasses[[%d]][%d]) not found" msgstr "" -#: freadR.c:392 +#: freadR.c:394 #, c-format msgid "colClasses[[%d]][%d] is NA" msgstr "" -#: freadR.c:396 +#: freadR.c:398 #, c-format msgid "" "Column %d ('%s') appears more than once in colClasses. The second time is " "colClasses[[%d]][%d]." msgstr "" -#: freadR.c:408 +#: freadR.c:410 #, c-format msgid "Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]" msgstr "" -#: freadR.c:624 +#: freadR.c:626 #, c-format msgid "Field size is 1 but the field is of type %d\n" msgstr "" -#: freadR.c:633 +#: freadR.c:635 #, c-format msgid "Internal error: unexpected field of size %d\n" msgstr "" -#: freadR.c:701 +#: freadR.c:703 #, c-format msgid "%s" msgstr "" @@ -2809,7 +2897,7 @@ msgid "" "caught before. please report to data.table issue tracker." 
msgstr "" -#: frollR.c:155 frollR.c:279 nafill.c:162 shift.c:21 +#: frollR.c:155 frollR.c:279 nafill.c:162 shift.c:19 msgid "fill must be a vector of length 1" msgstr "" @@ -2927,7 +3015,7 @@ msgstr "" msgid "% " msgstr "" -#: fsort.c:247 fwrite.c:702 fwrite.c:966 +#: fsort.c:247 fwrite.c:702 msgid "\n" msgstr "" @@ -2946,6 +3034,18 @@ msgstr "" msgid "%d: %.3f (%4.1f%%)\n" msgstr "" +#: fwrite.c:572 +#, c-format +msgid "deflate input stream: %p %d %p %d\n" +msgstr "" + +#: fwrite.c:575 +#, c-format +msgid "" +"deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, " +"Z_STREAM_END==%d\n" +msgstr "" + #: fwrite.c:613 #, c-format msgid "buffMB=%d outside [1,1024]" @@ -3018,6 +3118,11 @@ msgstr "" msgid "Can't allocate gzip stream structure" msgstr "" +#: fwrite.c:743 fwrite.c:752 +#, c-format +msgid "z_stream for header (%d): " +msgstr "" + #: fwrite.c:748 #, c-format msgid "Unable to allocate %d MiB for zbuffer: %s" @@ -3028,7 +3133,7 @@ msgstr "" msgid "Compress gzip error: %d" msgstr "" -#: fwrite.c:765 fwrite.c:773 fwrite.c:972 +#: fwrite.c:765 fwrite.c:773 #, c-format msgid "%s: '%s'" msgstr "" @@ -3049,6 +3154,25 @@ msgid "" "showProgress=%d, nth=%d)\n" msgstr "" +#: fwrite.c:812 +#, c-format +msgid "" +"Unable to allocate %d MB * %d thread buffers; '%d: %s'. Please read ?fwrite " +"for nThread, buffMB and verbose options." +msgstr "" + +#: fwrite.c:822 +#, c-format +msgid "" +"Unable to allocate %d MB * %d thread compressed buffers; '%d: %s'. Please " +"read ?fwrite for nThread, buffMB and verbose options." +msgstr "" + +#: fwrite.c:851 fwrite.c:883 fwrite.c:885 +#, c-format +msgid "z_stream for data (%d): " +msgstr "" + #: fwrite.c:980 #, c-format msgid "" @@ -3698,6 +3822,10 @@ msgid "" "caught before. Please report to data.table issue tracker." msgstr "" +#: nafill.c:182 +msgid "nan_is_na must be TRUE or FALSE" +msgstr "" + #: nafill.c:206 #, c-format msgid "%s: parallel processing of %d column(s) took %.3fs\n" @@ -4044,32 +4172,40 @@ msgstr "" msgid "Internal error: dt passed to setcolorder has %d columns but %d names" msgstr "" -#: shift.c:17 +#: shift.c:15 #, c-format msgid "" "type '%s' passed to shift(). Must be a vector, list, data.frame or data.table" msgstr "" -#: shift.c:24 shift.c:28 +#: shift.c:22 shift.c:26 msgid "" "Internal error: invalid type for shift(), should have been caught before. 
" "please report to data.table issue tracker" msgstr "" -#: shift.c:31 +#: shift.c:29 msgid "Internal error: k must be integer" msgstr "" -#: shift.c:33 +#: shift.c:31 #, c-format msgid "Item %d of n is NA" msgstr "" -#: shift.c:157 +#: shift.c:170 #, c-format msgid "Unsupported type '%s'" msgstr "" +#: snprintf.c:192 snprintf.c:195 snprintf.c:198 snprintf.c:201 snprintf.c:204 +#: snprintf.c:207 snprintf.c:210 snprintf.c:213 snprintf.c:216 snprintf.c:217 +#: snprintf.c:220 snprintf.c:223 snprintf.c:226 snprintf.c:229 snprintf.c:232 +#: snprintf.c:235 snprintf.c:238 snprintf.c:241 snprintf.c:244 +#, c-format +msgid "dt_win_snprintf test %d failed: %s" +msgstr "" + #: subset.c:7 #, c-format msgid "Internal error: subsetVectorRaw length(ans)==%d n=%d" @@ -4315,19 +4451,17 @@ msgstr "" msgid "%s: fill argument must be numeric" msgstr "" -#: utils.c:280 +#: utils.c:281 #, c-format msgid "Internal error: unsupported type '%s' passed to copyAsPlain()" msgstr "" -#: utils.c:284 +#: utils.c:286 #, c-format -msgid "" -"Internal error: type '%s' passed to copyAsPlain() but it seems " -"copyMostAttrib() retains ALTREP attributes" +msgid "Internal error: copyAsPlain returning ALTREP for type '%s'" msgstr "" -#: utils.c:319 +#: utils.c:330 #, c-format msgid "Found and copied %d column%s with a shared memory address\n" msgstr "" diff --git a/po/zh_CN.po b/po/zh_CN.po index 3965a017e4..d9b54a4435 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -2,8 +2,8 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.5\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2020-07-17 14:38+0800\n" -"PO-Revision-Date: 2019-11-18 00:26-04\n" +"POT-Creation-Date: 2020-10-17 13:11-0400\n" +"PO-Revision-Date: 2020-10-18 20:39-0400\n" "Last-Translator: Yuhang Chen \n" "Language-Team: Mandarin\n" "Language: Mandarin\n" @@ -467,28 +467,35 @@ msgstr " 因为一个主列的更新,丢掉索引 '%s'\n" msgid "Shortening index '%s' to '%s' due to an update on a key column\n" msgstr "因为一个主列的更新,缩短索引 '%s' 到 '%s'\n" -#: assign.c:695 +#: assign.c:650 +#, c-format +msgid "" +"Internal error: %d column numbers to delete not now in strictly increasing " +"order. No-dups were checked earlier." +msgstr "内部错误:指定 %d 删除列的序号目前并非严格升序排列。" +"重复项已于之前检查过。" + +#: assign.c:688 #, c-format msgid "" "Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d" -msgstr "" -"memrecycle 内部错误:sourceStart=%d sourceLen=%d length(source)=%d" +msgstr "memrecycle 内部错误:sourceStart=%d sourceLen=%d length(source)=%d" -#: assign.c:697 +#: assign.c:690 #, c-format msgid "Internal error memrecycle: start=%d len=%d length(target)=%d" msgstr "memrecycle 内部错误:start=%d len=%d length(target)=%d" -#: assign.c:700 +#: assign.c:693 #, c-format msgid "Internal error: recycle length error not caught earlier. slen=%d len=%d" msgstr "内部错误: 早期未被发现的循环长度错误 slen=%d len=%d" -#: assign.c:704 +#: assign.c:697 msgid "Internal error: memrecycle has received NULL colname" msgstr "内部错误: memrecycle 接受到的列名为 NULL " -#: assign.c:730 +#: assign.c:706 #, c-format msgid "" "Cannot assign 'factor' to '%s'. Factors can only be assigned to factor, " @@ -496,14 +503,14 @@ msgid "" msgstr "" "不能将 'factor' 赋值为 '%s' 。因子类型只能赋值为因子,字符或者列表其中的列" -#: assign.c:744 +#: assign.c:720 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. But %d is outside the " "level range [1,%d]" msgstr "将列 %d 名称为 '%s' 赋值为因子。但是 %d 在层次范围[1,%d]之外" -#: assign.c:752 +#: assign.c:728 #, c-format msgid "" "Assigning factor numbers to column %d named '%s'. 
But %f is outside the " @@ -512,7 +519,7 @@ msgstr "" "将列 %d 名称为 '%s' 赋值为因子。但是 %f 在层次范围[1,%d]之外,或者不是一个完" "整的数字" -#: assign.c:758 +#: assign.c:734 #, c-format msgid "" "Cannot assign '%s' to 'factor'. Factor columns can be assigned factor, " @@ -520,28 +527,28 @@ msgid "" msgstr "" "不能将 'factor' 赋值为 '%s' 。 因子列可被赋值为因子,字符 ,NA 或者 层次数值" -#: assign.c:779 +#: assign.c:755 msgid "" "Internal error: levels of target are either not unique or have truelength<0" msgstr "内部错误: 目标的层次不是唯一或者长度<0" -#: assign.c:818 +#: assign.c:794 #, c-format msgid "Unable to allocate working memory of %d bytes to combine factor levels" msgstr "不能分配 %d 字节的工作内存来组合因子层次" -#: assign.c:825 +#: assign.c:801 msgid "Internal error: extra level check sum failed" msgstr "内部错误: 额外的层次校验和失败" -#: assign.c:844 +#: assign.c:820 #, c-format msgid "" "Coercing 'character' RHS to '%s' to match the type of the target column " "(column %d named '%s')." msgstr "将'character' RHS 强制转换成 '%s' 来匹配目标列的类型(列 %d 名称 '%s')" -#: assign.c:850 +#: assign.c:826 #, c-format msgid "" "Cannot coerce 'list' RHS to 'integer64' to match the type of the target " @@ -549,40 +556,40 @@ msgid "" msgstr "" "不能将'list' RHS 强制转换成 'integer64' 来匹配目标列的类型(列 %d 名称 '%s')" -#: assign.c:855 +#: assign.c:831 #, c-format msgid "" "Coercing 'list' RHS to '%s' to match the type of the target column (column " "%d named '%s')." msgstr "将'list' RHS 强制转换成 '%s' 来匹配目标列的类型(列 %d 名称 '%s')" -#: assign.c:861 +#: assign.c:837 #, c-format msgid "Zero-copy coerce when assigning '%s' to '%s' column %d named '%s'.\n" msgstr "当 '%s' 赋值成 '%s' 列 %d 名称 '%s',进行Zero-copy强制转换。\n" -#: assign.c:956 +#: assign.c:932 #, c-format msgid "type '%s' cannot be coerced to '%s'" msgstr "类型 '%s' 不能强制转换成 '%s'" -#: assign.c:1076 +#: assign.c:1052 msgid "" "To assign integer64 to a character column, please use as.character() for " "clarity." msgstr "请使用 as.character() 把 integer64 类型的数值赋值给字符列" -#: assign.c:1088 +#: assign.c:1064 #, c-format msgid "Unsupported column type in assign.c:memrecycle '%s'" msgstr "assign.c:memrecycle '%s' 里有不支持的列的类型" -#: assign.c:1135 +#: assign.c:1111 #, c-format msgid "Internal error: writeNA passed a vector of type '%s'" msgstr "内部错误:writeNA 函数读取到了一个类型是'%s'的向量" -#: assign.c:1166 +#: assign.c:1142 #, c-format msgid "" "Internal error: savetl_init checks failed (%d %d %p %p). please report to " @@ -591,12 +598,12 @@ msgstr "" "内部错误:savetl_init的校验失败 (%d %d %p %p),请将此问题汇报给data.table 问" "题追踪器。" -#: assign.c:1174 +#: assign.c:1150 #, c-format msgid "Failed to allocate initial %d items in savetl_init" msgstr "不能为 savetl_init 最开始的 %d 个项分配空间" -#: assign.c:1183 +#: assign.c:1159 #, c-format msgid "" "Internal error: reached maximum %d items for savetl. Please report to data." @@ -605,34 +612,34 @@ msgstr "" "内部错误:已经达到了 savetl 能处理的子项上限 %d。请将此问题汇报给data.table问" "题追踪器。" -#: assign.c:1190 +#: assign.c:1166 #, c-format msgid "Failed to realloc saveds to %d items in savetl" msgstr "不能给 savetl 里的 %d 个项重新分配 saveds" -#: assign.c:1196 +#: assign.c:1172 #, c-format msgid "Failed to realloc savedtl to %d items in savetl" msgstr "不能给savetl里的 %d 个项提供 savetl" -#: assign.c:1219 +#: assign.c:1195 msgid "x must be a character vector" msgstr "x 必须是一个字符向量" -#: assign.c:1220 +#: assign.c:1196 msgid "'which' must be an integer vector" msgstr "'which' 必须是一个整数向量" -#: assign.c:1221 +#: assign.c:1197 msgid "'new' must be a character vector" msgstr "'new' 必须是一个字符向量" -#: assign.c:1222 +#: assign.c:1198 #, c-format msgid "'new' is length %d. 
Should be the same as length of 'which' (%d)" msgstr "'new' 的长度是 %d。 它的长度必须和'which' (%d)的长度一致。" -#: assign.c:1225 +#: assign.c:1201 #, c-format msgid "" "Item %d of 'which' is %d which is outside range of the length %d character " @@ -848,7 +855,7 @@ msgstr "内部错误:SYMSXP的长度为 %d 而非 1" msgid "x is type '%s' (must be 'character' or NULL)" msgstr "x 类型为 '%s' (必须为'character'或 NULL)" -#: chmatch.c:66 +#: chmatch.c:71 #, c-format msgid "" "Internal error: CHARSXP '%s' has a negative truelength (%d). Please file an " @@ -857,7 +864,7 @@ msgstr "" "内部错误:CHARSXP '%s' 的 truelength (%d) 为负。请将此问题汇报给 data.table " "问题追踪器。" -#: chmatch.c:95 +#: chmatch.c:100 #, c-format msgid "" "Failed to allocate % bytes working memory in chmatchdup: " @@ -939,31 +946,31 @@ msgstr "coalesce 复制了第一项 (inplace=FALSE)\n" msgid "Unsupported type: %s" msgstr "不支持的类型:%s" -#: dogroups.c:15 +#: dogroups.c:69 msgid "Internal error: order not integer vector" msgstr "内部错误:order 不是整型向量" -#: dogroups.c:16 +#: dogroups.c:70 msgid "Internal error: starts not integer" msgstr "内部错误:starts 不是整型" -#: dogroups.c:17 +#: dogroups.c:71 msgid "Internal error: lens not integer" msgstr "内部错误:lens 不是整型" -#: dogroups.c:19 +#: dogroups.c:73 msgid "Internal error: jiscols not NULL but o__ has length" msgstr "内部错误:jiscols 非 NULL,但 o__ 长度不为0" -#: dogroups.c:20 +#: dogroups.c:74 msgid "Internal error: xjiscols not NULL but o__ has length" msgstr "内部错误:jiscols 非 NULL,但 o__ 长度不为0" -#: dogroups.c:21 +#: dogroups.c:75 msgid "'env' should be an environment" msgstr "'env' 应该是一个环境" -#: dogroups.c:40 +#: dogroups.c:94 #, c-format msgid "" "Internal error: unsupported size-0 type '%s' in column %d of 'by' should " @@ -971,16 +978,16 @@ msgid "" msgstr "" "内部错误:未能被提前捕获到 'by' 中第 %2$d 列不支持类型 '%1$s' 且size-0 的问题" -#: dogroups.c:44 +#: dogroups.c:99 #, c-format msgid "!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]" msgstr "!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]" -#: dogroups.c:63 +#: dogroups.c:121 msgid "row.names attribute of .SD not found" msgstr ".SD 的行名属性不存在" -#: dogroups.c:65 +#: dogroups.c:123 #, c-format msgid "" "row.names of .SD isn't integer length 2 with NA as first item; i.e., ." @@ -989,43 +996,48 @@ msgstr "" ".SD 的行名不是长度为2且首个元素为 NA 的整型;例如:set_row_names(). [%s %d " "%d]" -#: dogroups.c:70 +#: dogroups.c:128 msgid "length(names)!=length(SD)" msgstr "length(names)!=length(SD)" -#: dogroups.c:74 +#: dogroups.c:134 #, c-format msgid "" "Internal error: size-0 type %d in .SD column %d should have been caught " "earlier" msgstr "内部错误:未能提前捕获到 .SD 中第 %2$d 列类型 %1$d size-0 的问题" -#: dogroups.c:84 +#: dogroups.c:136 +#, c-format +msgid "Internal error: SDall %d length = %d != %d" +msgstr "内部错误: SDall %d 长度 = %d != %d" + +#: dogroups.c:144 msgid "length(xknames)!=length(xSD)" msgstr "length(xknames)!=length(xSD)" -#: dogroups.c:88 +#: dogroups.c:148 #, c-format msgid "" "Internal error: type %d in .xSD column %d should have been caught by now" msgstr "内部错误:当前未能捕获到 .xSD 中第 %2$d 列类型 %1$d 的问题" -#: dogroups.c:92 +#: dogroups.c:152 #, c-format msgid "length(iSD)[%d] != length(jiscols)[%d]" msgstr "length(iSD)[%d] != length(jiscols)[%d]" -#: dogroups.c:93 +#: dogroups.c:153 #, c-format msgid "length(xSD)[%d] != length(xjiscols)[%d]" msgstr "length(xSD)[%d] != length(xjiscols)[%d]" -#: dogroups.c:198 +#: dogroups.c:259 #, c-format msgid "j evaluates to type '%s'. Must evaluate to atomic vector or list." msgstr "j的运算结果为'%s'类型。其运算结果必须为原子向量或列表。" -#: dogroups.c:206 +#: dogroups.c:267 msgid "" "All items in j=list(...) 
should be atomic vectors or lists. If you are " "trying something like j=list(.SD,newcol=mean(colA)) then use := by group " @@ -1035,13 +1047,13 @@ msgstr "" "newcol=mean(colA)) 之类的操作请使用 := by group 代替(更快速),或事后使用 " "cbind()、merge()" -#: dogroups.c:215 +#: dogroups.c:276 msgid "" "RHS of := is NULL during grouped assignment, but it's not possible to delete " "parts of a column." msgstr "用 := 分组时 RHS 为 NULL但無法刪除部分列" -#: dogroups.c:219 +#: dogroups.c:280 #, c-format msgid "" "Supplied %d items to be assigned to group %d of size %d in column '%s'. The " @@ -1053,7 +1065,7 @@ msgstr "" "须是 1(可以是单个值) 或完全符合 LHS 的长度如果您想回收(recycle) RHS,请使用 " "rep() 向你的代码读者明确表达你的意图" -#: dogroups.c:230 +#: dogroups.c:291 msgid "" "Internal error: Trying to add new column by reference but tl is full; " "setalloccol should have run first at R level before getting to this point in " @@ -1062,16 +1074,16 @@ msgstr "" "内部错误 : 尝试依照引用增加新列但 tl 已满在进入 dogroups 之前,setalloccol 应" "该先在 R 运行" -#: dogroups.c:245 +#: dogroups.c:312 #, c-format msgid "Group %d column '%s': %s" msgstr "列 '%2$s' 第 %1$d 组 : %3$s" -#: dogroups.c:252 +#: dogroups.c:319 msgid "j doesn't evaluate to the same number of columns for each group" msgstr "j 估算出的每组的列数不同" -#: dogroups.c:286 +#: dogroups.c:353 #, c-format msgid "" "Column %d of j's result for the first group is NULL. We rely on the column " @@ -1085,7 +1097,7 @@ msgstr "" "(需要一致性)空 (NULL) 列可以出现在后面的组(适当的以 NA 取代并回收)但不能是第 " "1 组请输入空向量代替,例如 integer() 或 numeric()" -#: dogroups.c:289 +#: dogroups.c:356 msgid "" "j appears to be a named vector. The same names will likely be created over " "and over again for each group and slow things down. Try and pass a named " @@ -1094,7 +1106,7 @@ msgstr "" "j 是名称向量,这可能使相同的名称不停重复创建导致速度变慢请尝试输入名称列表(较" "适合 data.table)或是非名称列表代替\n" -#: dogroups.c:291 +#: dogroups.c:358 #, c-format msgid "" "Column %d of j is a named vector (each item down the rows is named, " @@ -1104,7 +1116,7 @@ msgstr "" "j 的第 %d 列是名称向量(整行的项都是名称)为了效率请移除这些名称(避免在每组重复" "创建这些名称)总之他们被忽略了\n" -#: dogroups.c:299 +#: dogroups.c:366 msgid "" "The result of j is a named list. It's very inefficient to create the same " "names over and over again for each group. When j=list(...), any names are " @@ -1113,20 +1125,20 @@ msgid "" "to :=). This message may be upgraded to warning in future.\n" msgstr "" "j 的结果是名称列表,在每组不停重复创建相同的名称很没效率为了提高效率,当 " -"j=list(...) 时侦测到的所有名称会被移出,待分组完成后再放回来可以使用 " +"j=list(...) 时侦测到的所有名称会被移出,待分组完成后再放回来可以使用 " "j=transform() 避免这种加速此讯息可能会在未来升级为警告\n" -#: dogroups.c:311 +#: dogroups.c:378 #, c-format msgid "dogroups: growing from %d to %d rows\n" msgstr "dogroups: 从 %d 列增加至 %d 列\n" -#: dogroups.c:312 +#: dogroups.c:379 #, c-format msgid "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" msgstr "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" -#: dogroups.c:330 +#: dogroups.c:397 #, c-format msgid "" "Item %d of j's result for group %d is zero length. This will be filled with " @@ -1137,7 +1149,7 @@ msgstr "" "j 的结果第 %d 项在第 %d 组中为零长度(zero length)将使用 %d 个 NA 填入以符合结" "果中最长列的长度后面的分组也有相同问题,但只回报第一组以避免过多警告" -#: dogroups.c:337 +#: dogroups.c:404 #, c-format msgid "" "Column %d of result for group %d is type '%s' but expecting type '%s'. " @@ -1146,7 +1158,7 @@ msgstr "" "结果的第 %d 列在第 %d 组中是 '%s' 类别而非预期的 '%s' 类别所有组的列类别必须" "一致" -#: dogroups.c:339 +#: dogroups.c:406 #, c-format msgid "" "Supplied %d items for column %d of group %d which has %d rows. 
The RHS " @@ -1158,35 +1170,35 @@ msgstr "" "單個值) 或與 LHS 長度完全匹配如果您想回收(recycle) RHS,请使用 rep() 向你的代" "码读者明确表达你的意图" -#: dogroups.c:354 +#: dogroups.c:427 #, c-format msgid "Wrote less rows (%d) than allocated (%d).\n" msgstr "写入的行 (%d) 少于分配的 (%d)\n" -#: dogroups.c:364 +#: dogroups.c:449 #, c-format msgid "Internal error: block 0 [%d] and block 1 [%d] have both run" msgstr "内部错误 : 区块 0 [%d] 与区块 1 [%d] 都运行了" -#: dogroups.c:366 +#: dogroups.c:451 #, c-format msgid "" "\n" " %s took %.3fs for %d groups\n" msgstr "" "\n" -" %s 花了 %.3fs 在 %d 个组\n" +" %s 花了 %.3fs 在 %d 个组\n" -#: dogroups.c:368 +#: dogroups.c:453 #, c-format msgid " eval(j) took %.3fs for %d calls\n" -msgstr " eval(j)取%.3fs给 %d 调用\n" +msgstr " eval(j)取%.3fs给 %d 调用\n" -#: dogroups.c:392 +#: dogroups.c:477 msgid "growVector passed NULL" msgstr "growVector通过NULL" -#: dogroups.c:412 +#: dogroups.c:497 #, c-format msgid "Internal error: growVector doesn't support type '%s'" msgstr "内部错误:growVector 不支持 '%s' 类型" @@ -1214,6 +1226,10 @@ msgstr "fcast val不支持的列类型:'%s'" msgid "Argument 'test' must be logical." msgstr "参数'test'必须是逻辑类型。" +#: fifelse.c:9 +msgid "S4 class objects (except nanotime) are not supported." +msgstr "不支持的S4 类对象(nanotime 除外)。" + #: fifelse.c:28 #, c-format msgid "" @@ -1265,7 +1281,7 @@ msgstr "'yes'的类型与'na'不同。请确认两个参数是同一类型。" msgid "'yes' and 'na' are both type factor but their levels are different." msgstr "'yes'和'na'都是因子类型但他们的因子水平不同" -#: fifelse.c:138 +#: fifelse.c:138 fifelse.c:336 #, c-format msgid "Type %s is not supported." msgstr "不支持类型 %s" @@ -1277,9 +1293,76 @@ msgid "" "consisting of logical condition, resulting value pairs (in that order). Note " "that the default argument must be named explicitly, e.g., default=0" msgstr "" -"接收到 %d 个输入。请向 ... 中提供偶数个参数。" -"每一参数需包含逻辑条件判断,以及对应顺序的结果值对。请注意" -"默认参数须明确给出名字,如 default=0" +"接收到 %d 个输入。请向 ... 中提供偶数个参数。每一参数需包含逻辑条件判断,以及" +"对应顺序的结果值对。请注意默认参数须明确给出名字,如 default=0" + +#: fifelse.c:163 fifelse.c:203 +msgid "" +"S4 class objects (except nanotime) are not supported. Please see https://" +"github.com/Rdatatable/data.table/issues/4131." +msgstr "不支持的S4 类对象(nanotime 除外)。详见 https://" +"github.com/Rdatatable/data.table/issues/4131。" + +#: fifelse.c:174 +msgid "Length of 'default' must be 1." +msgstr "'default' 长度必须是 1。" + +#: fifelse.c:181 +#, c-format +msgid "" +"Resulting value is of type %s but 'default' is of type %s. Please make sure " +"that both arguments have the same type." +msgstr "结果为 %s 类型,然而 'default' 却为 %s 类型。请确认二者为同一类型。" + +#: fifelse.c:185 +msgid "" +"Resulting value has different class than 'default'. Please make sure that " +"both arguments have the same class." +msgstr "结果的类型与 'default' 的类型不同。请确认二者为同一类型。" + +#: fifelse.c:191 +msgid "" +"Resulting value and 'default' are both type factor but their levels are " +"different." +msgstr "结果和 'default' 均为因子类型,但其因子水平不同。" + +#: fifelse.c:206 +#, c-format +msgid "Argument #%d must be logical." +msgstr "参数 #%d 必须为逻辑类型。" + +#: fifelse.c:210 +#, c-format +msgid "" +"Argument #%d has a different length than argument #1. Please make sure all " +"logical conditions have the same length." +msgstr "参数 #%d 与参数 #1 长度不同。请确认所有逻辑条件的长度相等。" + +#: fifelse.c:215 +#, c-format +msgid "" +"Argument #%d is of type %s, however argument #2 is of type %s. Please make " +"sure all output values have the same type." +msgstr "参数 #%d 为 %s 类型,但参数 #2 为 %s 类型。请确认所有输出均为同一类型。" + +#: fifelse.c:220 +#, c-format +msgid "" +"Argument #%d has different class than argument #2, Please make sure all " +"output values have the same class." 
+msgstr "参数 #2 的类型与参数 #%d 的不同。请确认所有输出均为同一类型。" + +#: fifelse.c:226 +#, c-format +msgid "" +"Argument #2 and argument #%d are both factor but their levels are different." +msgstr "参数 #2 和参数 #%d 均为因子类型,但其因子水平不同。" + +#: fifelse.c:233 +#, c-format +msgid "" +"Length of output value #%d must either be 1 or length of logical condition." +msgstr "#%d 输出的长度必须为 1 或与逻辑判断条件的长度相同。" #: fmelt.c:18 msgid "'x' must be an integer" @@ -1293,27 +1376,27 @@ msgstr "'n'必须是正整数" msgid "Argument to 'which' must be logical" msgstr "'which'的参数必须是逻辑值" -#: fmelt.c:70 -msgid "concat: 'vec must be a character vector" -msgstr "串联:'vec 必须是一个字符向量" +#: fmelt.c:65 +msgid "concat: 'vec' must be a character vector" +msgstr "concat:'vec' 必须是一个字符向量" -#: fmelt.c:71 +#: fmelt.c:66 msgid "concat: 'idx' must be an integer vector of length >= 0" -msgstr "串联:'idx' 必须为一个长度>= 0的整数向量" +msgstr "concat:'idx' 必须为一个长度>= 0的整数向量" #: fmelt.c:75 #, c-format msgid "" -"Internal error in concat: 'idx' must take values between 0 and length(vec); " -"0 <= idx <= %d" -msgstr "串联内部错误:'idx'必须为0到length(vec)之间的值;0 <= idx <= %d" +"Internal error in concat: 'idx' must take values between 1 and length(vec); " +"1 <= idx <= %d" +msgstr "concat内部错误:'idx'必须为1到length(vec)之间的值;1 <= idx <= %d" -#: fmelt.c:102 +#: fmelt.c:117 #, c-format msgid "Unknown 'measure.vars' type %s at index %d of list" msgstr "未知'measure.vars'类型 %s,位于列表中 %d" -#: fmelt.c:148 +#: fmelt.c:162 #, c-format msgid "" "id.vars and measure.vars are internally guessed when both are 'NULL'. All " @@ -1325,54 +1408,54 @@ msgstr "" "值/整数/逻辑类型列会作为'id.vars',即以下列 [%s]。以后请考虑择一指定'id." "vars'或'measure.vars'。" -#: fmelt.c:154 fmelt.c:219 +#: fmelt.c:168 fmelt.c:233 #, c-format msgid "Unknown 'id.vars' type %s, must be character or integer vector" msgstr "未知'id.vars'类型 %s,必须是字符或者整数向量(vector)" -#: fmelt.c:159 fmelt.c:223 +#: fmelt.c:173 fmelt.c:237 msgid "One or more values in 'id.vars' is invalid." msgstr "'id.vars'里,一或多个数值无效" -#: fmelt.c:175 +#: fmelt.c:189 msgid "" "'measure.vars' is missing. Assigning all columns other than 'id.vars' " "columns as 'measure.vars'.\n" msgstr "" "找不到'measure.vars'。将指定所有'id.vars'以外的所有列为'measure.vars'。\n" -#: fmelt.c:176 +#: fmelt.c:190 #, c-format msgid "Assigned 'measure.vars' are [%s].\n" msgstr "指定'measure.vars'为[%s]。\n" -#: fmelt.c:184 +#: fmelt.c:198 #, c-format msgid "" "Unknown 'measure.vars' type %s, must be character or integer vector/list" msgstr "未知'measure.vars'类型 %s,必须是字符或者整数向量(vector)/列表(list)" -#: fmelt.c:193 fmelt.c:239 +#: fmelt.c:207 fmelt.c:253 msgid "One or more values in 'measure.vars' is invalid." msgstr "'measure.vars'里,一或多个数值无效" -#: fmelt.c:211 +#: fmelt.c:225 msgid "" "'id.vars' is missing. Assigning all columns other than 'measure.vars' " "columns as 'id.vars'.\n" msgstr "找不到'id.vars'。将指定所有'measure.vars'以外的所有列为'id.vars'。\n" -#: fmelt.c:212 +#: fmelt.c:226 #, c-format msgid "Assigned 'id.vars' are [%s].\n" msgstr "指定的 'id.vars' 是 [%s].\n" -#: fmelt.c:231 +#: fmelt.c:245 #, c-format msgid "Unknown 'measure.vars' type %s, must be character or integer vector" msgstr "未知'measure.vars'类型 %s,必须是字符或者整数向量" -#: fmelt.c:276 +#: fmelt.c:290 msgid "" "When 'measure.vars' is a list, 'value.name' must be a character vector of " "length =1 or =length(measure.vars)." @@ -1380,7 +1463,7 @@ msgstr "" "当'measure.vars'是一个列表(list), 'value.name' 必须是一个长度为1或者等于" "length(measure.vars)的字符向量" -#: fmelt.c:277 +#: fmelt.c:291 msgid "" "When 'measure.vars' is either not specified or a character/integer vector, " "'value.name' must be a character vector of length =1." 
@@ -1388,22 +1471,22 @@ msgstr "" "当'measure.vars'未被指定或者是一个字符/整数向量时,'value.name'必须是一个长度" "1的字符/整数向量" -#: fmelt.c:280 +#: fmelt.c:294 msgid "'variable.name' must be a character/integer vector of length=1." msgstr "'variable.name' 必须是长度1的字符/整数向量。" -#: fmelt.c:329 +#: fmelt.c:343 msgid "" "Internal error: combineFactorLevels in fmelt.c expects all-character input" msgstr "内部错误:fmelt.c里的combineFactorLevels期望输入值为全字符" -#: fmelt.c:332 +#: fmelt.c:346 msgid "" "Internal error: combineFactorLevels in fmelt.c expects a character target to " "factorize" msgstr "内部错误:fmelt.c里的combineFactorLevels期望一个字符来分解" -#: fmelt.c:385 +#: fmelt.c:399 #, c-format msgid "" "'measure.vars' [%s] are not all of the same type. By order of hierarchy, the " @@ -1415,60 +1498,60 @@ msgstr "" "以变量中不是'%3$s'类型的数将被强制转换为'%2$s'类型,更多关于强制转换的信息请" "查看 ?melt.data.table.\n" -#: fmelt.c:387 +#: fmelt.c:401 #, c-format msgid "" "The molten data value type is a list at item %d. 'na.rm=TRUE' is ignored.\n" msgstr "在项目%d中,融合后的数值类型是列表,参数'na.rm = TRUE'被自动忽略\n" -#: fmelt.c:490 +#: fmelt.c:504 #, c-format msgid "Unknown column type '%s' for column '%s'." msgstr "'%s'列是未知的纵列类型: '%s'" -#: fmelt.c:514 +#: fmelt.c:528 #, c-format msgid "Internal error: fmelt.c:getvarcols %d %d" msgstr "内部错误:fmelt.c : getvarcols %d %d" -#: fmelt.c:662 +#: fmelt.c:676 #, c-format msgid "Unknown column type '%s' for column '%s' in 'data'" msgstr "'data' 中的'%s'列是未知列类型:'%s'" -#: fmelt.c:673 +#: fmelt.c:687 msgid "Input is not of type VECSXP, expected a data.table, data.frame or list" msgstr "输入类型不是 VECSXP,输入类型应该是 data.table,data.frame 或 list。" -#: fmelt.c:674 +#: fmelt.c:688 msgid "Argument 'value.factor' should be logical TRUE/FALSE" msgstr "'value.factor' 的参数是逻辑值,必须是 TRUE 或FALSE" -#: fmelt.c:675 +#: fmelt.c:689 msgid "Argument 'variable.factor' should be logical TRUE/FALSE" msgstr "'variable.factor' 的参数是逻辑值,必须是 TRUE 或FALSE" -#: fmelt.c:676 +#: fmelt.c:690 msgid "Argument 'na.rm' should be logical TRUE/FALSE." msgstr "'na.rm' 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: fmelt.c:677 +#: fmelt.c:691 msgid "Argument 'variable.name' must be a character vector" msgstr "'variable.name' 必须是字符串类型" -#: fmelt.c:678 +#: fmelt.c:692 msgid "Argument 'value.name' must be a character vector" msgstr "'value.name' 必须是字符串类型" -#: fmelt.c:679 +#: fmelt.c:693 msgid "Argument 'verbose' should be logical TRUE/FALSE" msgstr "'verbose' 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: fmelt.c:683 +#: fmelt.c:697 msgid "ncol(data) is 0. Nothing to melt. Returning original data.table." msgstr "ncol(data)为0,返回原 data.table" -#: fmelt.c:688 +#: fmelt.c:702 msgid "names(data) is NULL. Please report to data.table-help" msgstr "names(data)为NULL,请向 data.table-help 报告" @@ -1607,6 +1690,11 @@ msgstr "retGrp 和sort 的参数中,至少一个必须是 TRUE" msgid "na.last must be logical TRUE, FALSE or NA of length 1" msgstr "na.last 的参数必须是逻辑值 TRUE, FALSE 或 NA " +#: forder.c:504 forder.c:608 +#, c-format +msgid "Unable to allocate % bytes of working memory" +msgstr "无法分配%字节的工作内存" + #: forder.c:520 #, c-format msgid "Item %d of order (ascending/descending) is %d. Must be +1 or -1." @@ -1697,7 +1785,7 @@ msgstr "nrow 必须为长度为1的整型向量" #: forder.c:1392 #, c-format msgid "nrow==%d but must be >=0" -msgstr "nrow==%d 但是必须 >=0" +msgstr "nrow==%d 但是必须 >=0" #: forder.c:1409 msgid "x must be type 'double'" @@ -1723,7 +1811,7 @@ msgstr "'cols' 的 %d 项为 %d ,超出1的范围 [1,ncol(x)=%d]" msgid "" "Column %d of input list x is length %d, inconsistent with first column of " "that item which is length %d." 
-msgstr "输入列表x的列 %d 长度为 %d,不同于第一列的该项长度为 %d" +msgstr "输入列表x的列 %d 长度为 %d,不同于第一列的该项长度为 %d" #: frank.c:63 frank.c:211 transpose.c:88 #, c-format @@ -1787,219 +1875,219 @@ msgstr "可避免的 %.3f 秒。 %s 复制用时\n" #: fread.c:441 #, c-format msgid " File copy in RAM took %.3f seconds.\n" -msgstr "内存上的文件复制耗时 %.3f 秒\n" +msgstr " 内存上的文件复制耗时 %.3f 秒\n" -#: fread.c:1248 +#: fread.c:1249 msgid "" "Previous fread() session was not cleaned up properly. Cleaned up ok at the " "beginning of this fread() call.\n" msgstr "之前的会话fread()未正确清理。在当前 fread() 会话开始前清理好\n" -#: fread.c:1251 +#: fread.c:1252 msgid "[01] Check arguments\n" msgstr "[01] 参数检查\n" -#: fread.c:1258 +#: fread.c:1259 #, c-format msgid " Using %d threads (omp_get_max_threads()=%d, nth=%d)\n" -msgstr "使用 %d 线程 (omp_get_max_threads()=%d, nth=%d)\n" +msgstr " 使用 %d 线程 (omp_get_max_threads()=%d, nth=%d)\n" -#: fread.c:1266 +#: fread.c:1267 msgid "" "Internal error: NAstrings is itself NULL. When empty it should be pointer to " "NULL." msgstr "内部错误:NAstrings 自身为空值。当清空该项会指向NULL空值" -#: fread.c:1284 +#: fread.c:1285 #, c-format msgid "freadMain: NAstring <<%s>> has whitespace at the beginning or end" -msgstr "freadMain: NAstring <<%s>> 在开始或者结束处有空白" +msgstr "freadMain: NAstring <<%s>> 在开始或者结束处有空白" -#: fread.c:1289 +#: fread.c:1290 #, c-format msgid "" "freadMain: NAstring <<%s>> is recognized as type boolean, this is not " "permitted." msgstr "freadMain: NAstring <<%s>> 被识别为布尔型,这是不允许" -#: fread.c:1300 +#: fread.c:1301 msgid " No NAstrings provided.\n" -msgstr "未提供 NAstrings \n" +msgstr " 未提供 NAstrings \n" -#: fread.c:1302 +#: fread.c:1303 msgid " NAstrings = [" msgstr " NAstrings = [" -#: fread.c:1305 +#: fread.c:1306 msgid "]\n" msgstr "]\n" -#: fread.c:1307 +#: fread.c:1308 msgid " One or more of the NAstrings looks like a number.\n" -msgstr "一个或多个 NAstrings 类似数值\n" +msgstr " 一个或多个 NAstrings 类似数值\n" -#: fread.c:1309 +#: fread.c:1310 msgid " None of the NAstrings look like numbers.\n" -msgstr "没有 NAstrings 为数值\n" +msgstr " 没有 NAstrings 为数值\n" -#: fread.c:1311 +#: fread.c:1312 #, c-format msgid " skip num lines = %\n" -msgstr "跳过行数为 %\n" +msgstr " 跳过行数为 %\n" -#: fread.c:1312 +#: fread.c:1313 #, c-format msgid " skip to string = <<%s>>\n" -msgstr "跳转至 string = <<%s>>\n" +msgstr " 跳转至 string = <<%s>>\n" -#: fread.c:1313 +#: fread.c:1314 #, c-format msgid " show progress = %d\n" -msgstr "显示进程 %d\n" +msgstr " 显示进程 %d\n" -#: fread.c:1314 +#: fread.c:1315 #, c-format msgid " 0/1 column will be read as %s\n" -msgstr " 0/1 列被读取为 %s\n" +msgstr " 0/1 列被读取为 %s\n" -#: fread.c:1322 +#: fread.c:1323 #, c-format msgid "sep == quote ('%c') is not allowed" msgstr "sep == quote ('%c') 不被允许" -#: fread.c:1323 +#: fread.c:1324 msgid "dec='' not allowed. Should be '.' or ','" msgstr "dec='' 不允许,应该为 '.' 
或者 ','" -#: fread.c:1324 +#: fread.c:1325 #, c-format msgid "sep == dec ('%c') is not allowed" msgstr "sep == dec ('%c') 不允许" -#: fread.c:1325 +#: fread.c:1326 #, c-format msgid "quote == dec ('%c') is not allowed" msgstr "quote == dec ('%c') 不允许" -#: fread.c:1342 +#: fread.c:1343 msgid "[02] Opening the file\n" -msgstr "[02] 打开文件\n" +msgstr "[02] 打开文件\n" -#: fread.c:1345 +#: fread.c:1346 msgid "" " `input` argument is provided rather than a file name, interpreting as raw " "text to read\n" msgstr "提供 `input` 参数而非文件名,理解为原始的文本读取\n" -#: fread.c:1349 +#: fread.c:1350 msgid "Internal error: last byte of character input isn't \\0" msgstr "内部错误:字符输入的最后一个字节不是 \\0" -#: fread.c:1352 +#: fread.c:1353 #, c-format msgid " Opening file %s\n" -msgstr "打开文件 %s\n" +msgstr " 打开文件 %s\n" -#: fread.c:1356 +#: fread.c:1357 #, c-format msgid "file not found: %s" msgstr "文件未找到: %s" -#: fread.c:1360 +#: fread.c:1361 #, c-format msgid "Opened file ok but couldn't obtain its size: %s" msgstr "文件能够打开但无法获知其大小:%s" -#: fread.c:1363 fread.c:1391 +#: fread.c:1364 fread.c:1392 #, c-format msgid "File is empty: %s" msgstr "文件是空的:%s" -#: fread.c:1364 fread.c:1392 +#: fread.c:1365 fread.c:1393 #, c-format msgid " File opened, size = %s.\n" -msgstr "文件已打开,大小为 %s.\n" +msgstr " 文件已打开,大小为 %s.\n" -#: fread.c:1381 +#: fread.c:1382 #, c-format msgid "File not found: %s" msgstr "文件没有找到:%s" -#: fread.c:1387 +#: fread.c:1388 #, c-format msgid "Unable to open file after %d attempts (error %d): %s" msgstr "经过 %d 次尝试后仍无法打开文件(错误 %d):%s" -#: fread.c:1389 +#: fread.c:1390 #, c-format msgid "GetFileSizeEx failed (returned 0) on file: %s" msgstr "GetFileSizeEx 未能成功执行(返回值为0)于文件:%s" -#: fread.c:1394 +#: fread.c:1395 #, c-format msgid "This is Windows, CreateFileMapping returned error %d for file %s" msgstr "现在在Windows下,CreateFileMapping 返回错误 %d 于文件 %s" -#: fread.c:1401 +#: fread.c:1402 #, c-format msgid "" "Opened %s file ok but could not memory map it. This is a %dbit process. %s." msgstr "能够打开文件 %s 但不能创建内存映射。这是一个 %d 位进程。 %s." -#: fread.c:1402 +#: fread.c:1403 msgid "Please upgrade to 64bit" msgstr "请升级到64位" -#: fread.c:1402 +#: fread.c:1403 msgid "There is probably not enough contiguous virtual memory available" msgstr "多半没有足够的连续虚拟内存" -#: fread.c:1405 +#: fread.c:1406 msgid " Memory mapped ok\n" msgstr " 内存映射正常\n" -#: fread.c:1407 +#: fread.c:1408 msgid "" "Internal error: Neither `input` nor `filename` are given, nothing to read." msgstr "" "内部错误:既没有`input`(输入)也没有`filename`(文件名),没有什么可供读入。" -#: fread.c:1424 +#: fread.c:1425 msgid "[03] Detect and skip BOM\n" msgstr "[03] 检测并跳过字节顺序标记(BOM)\n" -#: fread.c:1428 +#: fread.c:1429 msgid "" " UTF-8 byte order mark EF BB BF found at the start of the file and " "skipped.\n" msgstr "在文件头发现了UTF-8 字节顺序标记(BOM)EF BB BF 并已跳过。\n" -#: fread.c:1433 +#: fread.c:1434 msgid "" "GB-18030 encoding detected, however fread() is unable to decode it. Some " "character fields may be garbled.\n" msgstr "检测到GB-18030 编码,但fread() 未能解码。某些 字符字段可能有乱码。\n" -#: fread.c:1436 +#: fread.c:1437 msgid "" "File is encoded in UTF-16, this encoding is not supported by fread(). Please " "recode the file to UTF-8." 
msgstr "文件编码是UTF-16,fread()不支持此编码。请 将文件转换为UTF-8。" -#: fread.c:1441 +#: fread.c:1442 #, c-format msgid " Last byte(s) of input found to be %s and removed.\n" msgstr " 发现输入的最后字节是 %s 并已去除。\n" -#: fread.c:1444 +#: fread.c:1445 msgid "Input is empty or only contains BOM or terminal control characters" msgstr "输入是空的或只有字节顺序标记(BOM)或终端控制字符" -#: fread.c:1451 +#: fread.c:1452 msgid "[04] Arrange mmap to be \\0 terminated\n" msgstr "[04] 设定mmap为 \\0 终止\n" -#: fread.c:1458 +#: fread.c:1459 msgid "" " No \\n exists in the file at all, so single \\r (if any) will be taken as " "one line ending. This is unusual but will happen normally when there is no " @@ -2008,7 +2096,7 @@ msgstr "" " 文件中完全没有换行符\\n,所以单个 \\r(如果有的话)将被当成一行的结束。这不" "太常见但如果没有\\r 的话属于正常;例如单个行没有行尾结束符。\n" -#: fread.c:1459 +#: fread.c:1460 msgid "" " \\n has been found in the input and different lines can end with different " "line endings (e.g. mixed \\n and \\r\\n in one file). This is common and " @@ -2017,7 +2105,7 @@ msgstr "" " 输入中有\\n 并且不同行可以有不同的 行尾结束符(如在一个文件中混合使用 \\n " "和\\r\\n)。这很常见也是理想情况。\n" -#: fread.c:1483 +#: fread.c:1484 #, c-format msgid "" " File ends abruptly with '%c'. Final end-of-line is missing. Using cow page " @@ -2026,7 +2114,7 @@ msgstr "" " 文件突然中止于 '%c'。没有最后一个行尾结束符。正使用写时复制页(cow, copy-" "on-write)写入 0 到最后一个字节。\n" -#: fread.c:1489 +#: fread.c:1490 msgid "" "This file is very unusual: it ends abruptly without a final newline, and " "also its size is a multiple of 4096 bytes. Please properly end the last row " @@ -2035,16 +2123,16 @@ msgstr "" "这个文件非常不正常:它突然中止而没有最后的换行,并且其大小是4096 字节的整数" "倍。请用一个换行(例如 'echo >> file')来恰当地结束最后一行以避免此错误" -#: fread.c:1490 +#: fread.c:1491 #, c-format msgid " File ends abruptly with '%c'. Copying file in RAM. %s copy.\n" msgstr " 文件突然中止于 '%c'。正在从内存中复制文件。%s 复制。\n" -#: fread.c:1524 +#: fread.c:1525 msgid "[05] Skipping initial rows if needed\n" msgstr "[05] 如需要的话跳过起始行\n" -#: fread.c:1530 +#: fread.c:1531 #, c-format msgid "" "skip='%s' not found in input (it is case sensitive and literal; i.e., no " @@ -2053,79 +2141,79 @@ msgstr "" "在输入中没有发现 skip='%s' (这里大小写敏感并需要是字面形式,也就是说不能使用" "模式,适配符或正则表达式)" -#: fread.c:1536 +#: fread.c:1537 #, c-format msgid "" "Found skip='%s' on line %. Taking this to be header row or first row " "of data.\n" msgstr "在行 %2$ 发现了 skip='%1$s'。将此当做表头或数据的第一行。\n" -#: fread.c:1549 +#: fread.c:1550 #, c-format msgid " Skipped to line % in the file" msgstr " 跳到文件的第 % 行" -#: fread.c:1550 +#: fread.c:1551 #, c-format msgid "skip=% but the input only has % line%s" msgstr "skip=% 但输入只有 % 行 %s" -#: fread.c:1559 +#: fread.c:1560 msgid "" "Input is either empty, fully whitespace, or skip has been set after the last " "non-whitespace." 
msgstr "输入是空,或全部为空白,或跳过设置是在最后一个非空白字符之后。" -#: fread.c:1561 +#: fread.c:1562 #, c-format msgid " Moved forward to first non-blank line (%d)\n" msgstr " 前移到第一个非空行 (%d)\n" -#: fread.c:1562 +#: fread.c:1563 #, c-format msgid " Positioned on line %d starting: <<%s>>\n" msgstr " 定位到行 %d 开始于: <<%s>>\n" -#: fread.c:1580 +#: fread.c:1581 msgid "[06] Detect separator, quoting rule, and ncolumns\n" msgstr "[06] 检测分隔符,引用规则,以及列数\n" -#: fread.c:1584 +#: fread.c:1585 msgid " sep='\\n' passed in meaning read lines as single character column\n" msgstr " sep='\\n' 设定意味着将把所有行读作一个字符列\n" -#: fread.c:1603 +#: fread.c:1604 msgid " Detecting sep automatically ...\n" msgstr " 自动检测分隔符中 ...\n" -#: fread.c:1610 +#: fread.c:1611 #, c-format msgid " Using supplied sep '%s'\n" msgstr " 使用提供的分隔符 '%s'\n" -#: fread.c:1644 +#: fread.c:1645 #, c-format msgid " with %d fields using quote rule %d\n" msgstr " 对 %d 个字段使用引用规则 %d\n" -#: fread.c:1694 +#: fread.c:1695 #, c-format msgid " with %d lines of %d fields using quote rule %d\n" msgstr " 对 %d 行的 %d 字段使用引用规则 %d\n" -#: fread.c:1701 +#: fread.c:1702 msgid "" " No sep and quote rule found a block of 2x2 or greater. Single column " "input.\n" msgstr " 没有分隔符并且引用规则发现了一个大于或等于2x2的区块。输入是单列。\n" -#: fread.c:1717 +#: fread.c:1718 msgid "" "Single column input contains invalid quotes. Self healing only effective " "when ncol>1" msgstr "单列输入包含了不合法的引用。自我修正只有在列数大于1(ncol>1)时才有效" -#: fread.c:1722 +#: fread.c:1723 #, c-format msgid "" "Found and resolved improper quoting in first %d rows. If the fields are not " @@ -2135,35 +2223,35 @@ msgstr "" "在前 %d 行中发现并修正了不合适的引号用法。如果字段没有加引号(例如字段间隔符" "没有在任何字段内出现),可以尝试使用 quote=\"\" 来避免此警告。" -#: fread.c:1738 +#: fread.c:1739 #, c-format msgid "" "Internal error: ncol==%d line==%d after detecting sep, ncol and first line" msgstr "内部错误:检测分隔符,列数和首行后,ncol==%d line==%d" -#: fread.c:1741 +#: fread.c:1742 #, c-format msgid "Internal error: first line has field count %d but expecting %d" msgstr "内部错误:首行有%d个字段,但应该有%d个" -#: fread.c:1743 +#: fread.c:1744 #, c-format msgid "" " Detected %d columns on line %d. This line is either column names or first " "data row. Line starts as: <<%s>>\n" msgstr "检测到第%2$d行有%1$d列。该行为列名或数据集首行。该行以<<%3$s>>开始\n" -#: fread.c:1745 +#: fread.c:1746 #, c-format msgid " Quote rule picked = %d\n" msgstr "标点符号规则 = %d\n" -#: fread.c:1746 +#: fread.c:1747 #, c-format msgid " fill=%s and the most number of columns found is %d\n" msgstr "fill=%s 且找到的最大列数为 %d\n" -#: fread.c:1752 +#: fread.c:1753 msgid "" "This file is very unusual: it's one single column, ends with 2 or more end-" "of-line (representing several NA at the end), and is a multiple of 4096, too." @@ -2171,12 +2259,12 @@ msgstr "" "该文件极为特殊,仅有一列数据,在结尾处包含多个行结束标记(表示多个空值),且" "长度为4096的整数倍。" -#: fread.c:1753 +#: fread.c:1754 #, c-format msgid " Copying file in RAM. %s\n" msgstr "正在将文件拷贝到RAM。%s\n" -#: fread.c:1759 +#: fread.c:1760 msgid "" " 1-column file ends with 2 or more end-of-line. 
Restoring last eol using " "extra byte in cow page.\n" @@ -2184,37 +2272,37 @@ msgstr "" "该文件包含一列数据,存在多个行结束标记(表示多个空值)。正在使用写时复制页" "(cow, copy-on-write)额外的字节恢复最后一个标记.\n" -#: fread.c:1778 +#: fread.c:1779 msgid "" "[07] Detect column types, good nrow estimate and whether first row is column " "names\n" msgstr "[07] 检测列类型,估计行数以及首行是否为列名\n" -#: fread.c:1779 +#: fread.c:1780 #, c-format msgid " 'header' changed by user from 'auto' to %s\n" msgstr " 用户已将'header'(列名)从 'auto' 改为 %s\n" -#: fread.c:1783 +#: fread.c:1784 #, c-format msgid "Failed to allocate 2 x %d bytes for type and tmpType: %s" msgstr "为 %2$s 类型分配 2 x %1$d bytes失败" -#: fread.c:1804 +#: fread.c:1805 #, c-format msgid " Number of sampling jump points = %d because " msgstr "采样跳点数 = %d 因为" -#: fread.c:1805 +#: fread.c:1806 #, c-format msgid "nrow limit (%) supplied\n" msgstr "指定了nrow 的最大值 (%) \n" -#: fread.c:1806 +#: fread.c:1807 msgid "jump0size==0\n" msgstr "jump0size==0\n" -#: fread.c:1807 +#: fread.c:1808 #, c-format msgid "" "(% bytes from row 1 to eof) / (2 * % jump0size) == " @@ -2222,32 +2310,32 @@ msgid "" msgstr "" "(从首行到结束共 % bytes) / (2 * % jump0size) == %\n" -#: fread.c:1845 +#: fread.c:1846 #, c-format msgid "" " A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. " "%s\n" msgstr "第%5$d个跳点所找到的第%4$d行,该行字段过于%1$s(%2$d/%3$d). %6$s\n" -#: fread.c:1846 +#: fread.c:1847 msgid "few" msgstr "少" -#: fread.c:1846 +#: fread.c:1847 msgid "many" msgstr "多" -#: fread.c:1846 +#: fread.c:1847 msgid "" "Most likely this jump landed awkwardly so type bumps here will be skipped." msgstr "很有可能这一跳点的位置并不合适,因此此处的类型转换将被跳过。" -#: fread.c:1872 +#: fread.c:1873 #, c-format msgid " Type codes (jump %03d) : %s Quote rule %d\n" msgstr " 类型码(跳点 %03d) : %s 引用规则 %d\n" -#: fread.c:1885 +#: fread.c:1886 #, c-format msgid "" " 'header' determined to be true due to column %d containing a string on row " @@ -2256,19 +2344,19 @@ msgstr "" " 'header' 参数设为真,原因是第%1$d列首行包含字符串,并且在样本中的另外%3$d行" "包含有较底层的数据类型(%2$s)\n" -#: fread.c:1897 +#: fread.c:1898 msgid "" "Internal error: row before first data row has the same number of fields but " "we're not using it." msgstr "内部错误:数据首行的前一行包含相同数量的字段但不会用到该行。" -#: fread.c:1898 +#: fread.c:1899 msgid "" "Internal error: ch!=pos after counting fields in the line before the first " "data row." msgstr "内部错误:对数据首行前一行的字段计数后,ch不等于pos" -#: fread.c:1899 +#: fread.c:1900 #, c-format msgid "" "Types in 1st data row match types in 2nd data row but previous row has %d " @@ -2277,7 +2365,7 @@ msgstr "" "数据第一行的类型与第二行相匹配,但是之前的行有 %d 个字段。故将第一行数据的前" "一行作为列名" -#: fread.c:1902 +#: fread.c:1903 #, c-format msgid "" "Detected %d column names but the data has %d columns (i.e. invalid file). " @@ -2285,7 +2373,7 @@ msgid "" msgstr "" "检测到 %d 个列名,然而数据共有 %d 列(文件不合法)。添加了 %d 个额外列名%s\n" -#: fread.c:1903 +#: fread.c:1904 msgid "" " for the first column which is guessed to be row names or an index. Use " "setnames() afterwards if this guess is not correct, or fix the file write " @@ -2294,17 +2382,17 @@ msgstr "" "作为第一列,并被用于猜测行名或索引。若上述猜测不正确,可在后续使用setnames()" "进行修改,或修复用于生成该文件的文件写入命令以生成有效的文件。" -#: fread.c:1903 +#: fread.c:1904 msgid "s at the end." msgstr "到结尾处" -#: fread.c:1905 +#: fread.c:1906 msgid "" "Internal error: fill=true but there is a previous row which should already " "have been filled." msgstr "内部错误:参数fill=true,但是在此之前有一行应当已经被填充。" -#: fread.c:1906 +#: fread.c:1907 #, c-format msgid "" "Detected %d column names but the data has %d columns. 
Filling rows " @@ -2313,74 +2401,74 @@ msgstr "" "检测到%d个列名,但数据共有%d列。已经自动填充。设置参数fill=TRUE以屏蔽此警" "告。\n" -#: fread.c:1910 +#: fread.c:1911 #, c-format msgid "Failed to realloc 2 x %d bytes for type and tmpType: %s" msgstr "为 %2$s 类型重新分配 2 x %1$d bytes失败" -#: fread.c:1930 +#: fread.c:1931 #, c-format msgid "" " 'header' determined to be %s because there are%s number fields in the " "first and only row\n" msgstr " 参数'header' 被设置为%s, 因为唯一的一行包含 %s 个字段\n" -#: fread.c:1930 +#: fread.c:1931 msgid " no" msgstr "0" -#: fread.c:1933 +#: fread.c:1934 msgid "" " 'header' determined to be true because all columns are type string and a " "better guess is not possible\n" msgstr "参数 'header' 被设置为true,因为所有列类型均为字符串\n" -#: fread.c:1935 +#: fread.c:1936 msgid "" " 'header' determined to be false because there are some number columns and " "those columns do not have a string field at the top of them\n" msgstr "参数 'header' 被设置为false,因为部分字段的首行不为字符串\n" -#: fread.c:1951 +#: fread.c:1952 #, c-format msgid " Type codes (first row) : %s Quote rule %d\n" msgstr " 类型码(第一行) : %s 引用规则 %d\n" -#: fread.c:1960 +#: fread.c:1961 #, c-format msgid "" " All rows were sampled since file is small so we know nrow=% " "exactly\n" msgstr " 文件太小,全部行均被采样到,所以 nrow=%\n" -#: fread.c:1972 fread.c:1979 +#: fread.c:1973 fread.c:1980 msgid " =====\n" msgstr " =====\n" -#: fread.c:1973 +#: fread.c:1974 #, c-format msgid "" " Sampled % rows (handled \\n inside quoted fields) at %d jump " "points\n" msgstr " 已使用了 %2$d个跳点抽样 %1$ 行(处理了字段间的分隔符\\n)\n" -#: fread.c:1974 +#: fread.c:1975 #, c-format msgid "" " Bytes from first data row on line %d to the end of last row: %\n" msgstr " 从第一个数据行(%d)到最后一行的字节: %\n" -#: fread.c:1975 +#: fread.c:1976 #, c-format msgid " Line length: mean=%.2f sd=%.2f min=%d max=%d\n" msgstr "文件每行长度的统计量:均值=%.2f,标准差=%.2f,最小值=%d ,最大值=%d\n" -#: fread.c:1976 +#: fread.c:1977 #, c-format msgid " Estimated number of rows: % / %.2f = %\n" msgstr "估计数据共有 % / %.2f = % 行\n" -#: fread.c:1977 +#: fread.c:1978 #, c-format msgid "" " Initial alloc = % rows (% + %d%%) using bytes/" @@ -2389,87 +2477,87 @@ msgstr "" "为 % 行 (% + %d%%)分配初始内存,大小为字节数/max(mean-2*sd," "min),并确保该数值落于区间[1.1*estn, 2.0*estn]中\n" -#: fread.c:1981 +#: fread.c:1982 #, c-format msgid "Internal error: sampleLines(%) > allocnrow(%)" msgstr "内部错误:sampleLines(%) > allocnrow(%)" -#: fread.c:1985 +#: fread.c:1986 #, c-format msgid " Alloc limited to lower nrows=% passed in.\n" msgstr " 分配被限制在输入的更小的 nrows=% 值上。\n" -#: fread.c:1997 +#: fread.c:1998 msgid "[08] Assign column names\n" msgstr "[08] 指定列名\n" -#: fread.c:2005 +#: fread.c:2006 #, c-format msgid "Unable to allocate %d*%d bytes for column name pointers: %s" msgstr "无法分配 %d*%d 字节给列名指针: %s" -#: fread.c:2027 +#: fread.c:2028 #, c-format msgid "Internal error: reading colnames ending on '%c'" msgstr "内部错误:读取列名终止于 '%c'" -#: fread.c:2045 +#: fread.c:2046 msgid "[09] Apply user overrides on column types\n" msgstr "[09] 使用用户指定的列类型\n" -#: fread.c:2049 +#: fread.c:2050 msgid " Cancelled by user: userOverride() returned false." msgstr " 用户已取消:userOverride() 返回 false。" -#: fread.c:2059 +#: fread.c:2060 #, c-format msgid "Failed to allocate %d bytes for size array: %s" msgstr "无法分配 %d 字节给 size 数组:%s" -#: fread.c:2066 +#: fread.c:2067 #, c-format msgid "" -"Attempt to override column %d <<%.*s>> of inherent type '%s' down to '%s' " +"Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' " "ignored. Only overrides to a higher type are currently supported. 
If this " "was intended, please coerce to the lower type afterwards." msgstr "" -"试图覆盖第 %d 列 <<%.*s>>,将内部类型 '%s' 降级为 '%s' 的操作被忽略。只支持将" +"试图覆盖第 %d 列 %s%.*s%s,将内部类型 '%s' 降级为 '%s' 的操作被忽略。只支持将" "列类型升为更高阶的类型。如果确定此操作,请完成之后再转换类型。" -#: fread.c:2080 +#: fread.c:2082 #, c-format msgid " After %d type and %d drop user overrides : %s\n" msgstr " 经过 %d 类型和 %d 丢弃用户覆盖:%s\n" -#: fread.c:2088 +#: fread.c:2090 msgid "[10] Allocate memory for the datatable\n" msgstr "[10] 分配内存给 datatable\n" -#: fread.c:2089 +#: fread.c:2091 #, c-format msgid " Allocating %d column slots (%d - %d dropped) with % rows\n" msgstr " 正在分配 %d 列位置(%d - %d 已丢弃),% 行\n" -#: fread.c:2143 +#: fread.c:2145 #, c-format msgid "Buffer size % is too large\n" msgstr "缓冲长度 % 过大\n" -#: fread.c:2146 +#: fread.c:2148 msgid "[11] Read the data\n" msgstr "[11] 读取数据\n" -#: fread.c:2149 +#: fread.c:2151 #, c-format msgid " jumps=[%d..%d), chunk_size=%, total_size=%\n" msgstr " jumps=[%d..%d),chunk_size=%,total_size=%\n" -#: fread.c:2161 +#: fread.c:2163 #, c-format msgid "Internal error: Master thread is not thread 0 but thread %d.\n" msgstr "内部错误:主线程并非线程0而是线程%d\n" -#: fread.c:2369 +#: fread.c:2371 #, c-format msgid "" "Column %d (\"%.*s\") bumped from '%s' to '%s' due to <<%.*s>> on row " @@ -2478,14 +2566,14 @@ msgstr "" "第 %d 列(\"%.*s\") 发生了从 '%s' 到 '%s' 的类型转换,由于 <<%.*s>> 出现在第 " "% 行\n" -#: fread.c:2418 +#: fread.c:2421 #, c-format msgid "" "Internal error: invalid head position. jump=%d, headPos=%p, thisJumpStart=" "%p, sof=%p" msgstr "内部错误:head 位置无效。jump=%d, headPos=%p, thisJumpStart=%p, sof=%p" -#: fread.c:2491 +#: fread.c:2494 #, c-format msgid "" " Too few rows allocated. Allocating additional % rows (now nrows=" @@ -2494,42 +2582,42 @@ msgstr "" " 分配的行数太少。正在分配额外的 % 行(当前 nrows=%),并从跳" "跃 %d 继续读取\n" -#: fread.c:2498 +#: fread.c:2501 #, c-format msgid " Restarting team from jump %d. nSwept==%d quoteRule==%d\n" msgstr " 从跳跃 %d 重启组。nSwept==%d quoteRule==%d\n" -#: fread.c:2518 +#: fread.c:2521 #, c-format msgid " %d out-of-sample type bumps: %s\n" msgstr " %d 样本外类型变更:%s\n" -#: fread.c:2554 +#: fread.c:2557 #, c-format msgid "" "Read % rows x %d columns from %s file in %02d:%06.3f wall clock " "time\n" msgstr "读取 % 行 x %d 列,从 %s 文件(时钟时间 %02d:%06.3f)\n" -#: fread.c:2561 +#: fread.c:2564 msgid "[12] Finalizing the datatable\n" msgstr "[12] 最后定型 datatable\n" -#: fread.c:2562 +#: fread.c:2565 msgid " Type counts:\n" msgstr " 类型数量:\n" -#: fread.c:2564 +#: fread.c:2567 #, c-format msgid "%10d : %-9s '%c'\n" msgstr "%10d : %-9s '%c'\n" -#: fread.c:2580 +#: fread.c:2583 #, c-format msgid "Discarded single-line footer: <<%s>>" msgstr "丢弃末尾行:<<%s>>" -#: fread.c:2585 +#: fread.c:2588 #, c-format msgid "" "Stopped early on line %. Expected %d fields but found %d. Consider " @@ -2538,7 +2626,7 @@ msgstr "" "在第 % 行提前终止。预期有 %d 个字段但只找到 %d 个。可以考虑设置 " "fill=TRUE 和 comment.char=。 首个丢弃的非空行:<<%s>>" -#: fread.c:2591 +#: fread.c:2594 #, c-format msgid "" "Found and resolved improper quoting out-of-sample. 
First healed line " @@ -2549,31 +2637,31 @@ msgstr "" "不在引号内(例如:字段间隔符没有在任何一个字段中出现),尝试用 quote=\"\" 来" "避免该警告。" -#: fread.c:2595 +#: fread.c:2598 msgid "=============================\n" msgstr "=============================\n" -#: fread.c:2597 +#: fread.c:2600 #, c-format msgid "%8.3fs (%3.0f%%) Memory map %.3fGB file\n" msgstr "%8.3fs (%3.0f%%) 内存映射 %.3fGB 文件\n" -#: fread.c:2598 +#: fread.c:2601 #, c-format msgid "%8.3fs (%3.0f%%) sep=" msgstr "%8.3fs (%3.0f%%) sep=" -#: fread.c:2600 +#: fread.c:2603 #, c-format msgid " ncol=%d and header detection\n" msgstr " ncol=%d 和表头检测\n" -#: fread.c:2601 +#: fread.c:2604 #, c-format msgid "%8.3fs (%3.0f%%) Column type detection using % sample rows\n" msgstr "%8.3fs (%3.0f%%) 列类型检测基于 % 个样本行\n" -#: fread.c:2603 +#: fread.c:2606 #, c-format msgid "" "%8.3fs (%3.0f%%) Allocation of % rows x %d cols (%.3fGB) of which " @@ -2582,7 +2670,7 @@ msgstr "" "%8.3fs (%3.0f%%) % 行 x %d 列 (%.3fGB) 的分配中已使用 % " "(%3.0f%%) 行\n" -#: fread.c:2607 +#: fread.c:2610 #, c-format msgid "" "%8.3fs (%3.0f%%) Reading %d chunks (%d swept) of %.3fMB (each chunk %d rows) " @@ -2591,34 +2679,34 @@ msgstr "" "%8.3fs (%3.0f%%) 正在读取 %d 个块 (%d 已扫描) of %.3fMB (每个块 %d 行) 使用 " "%d 个线程\n" -#: fread.c:2609 +#: fread.c:2612 #, c-format msgid "" " + %8.3fs (%3.0f%%) Parse to row-major thread buffers (grown %d times)\n" msgstr " + %8.3fs (%3.0f%%) 解析到行处理线程的缓冲区(已增长 %d 次)\n" -#: fread.c:2610 +#: fread.c:2613 #, c-format msgid " + %8.3fs (%3.0f%%) Transpose\n" msgstr " + %8.3fs (%3.0f%%) 转置\n" -#: fread.c:2611 +#: fread.c:2614 #, c-format msgid " + %8.3fs (%3.0f%%) Waiting\n" msgstr " + %8.3fs (%3.0f%%) 正在等待\n" -#: fread.c:2612 +#: fread.c:2615 #, c-format msgid "" "%8.3fs (%3.0f%%) Rereading %d columns due to out-of-sample type exceptions\n" msgstr "%8.3fs (%3.0f%%) 正在重读 %d 列,由于样本外类型异常\n" -#: fread.c:2614 +#: fread.c:2617 #, c-format msgid "%8.3fs Total\n" msgstr "%8.3fs 总计\n" -#: freadR.c:85 +#: freadR.c:86 msgid "" "Internal error: freadR input not a single character string: a filename or " "the data itself. Should have been caught at R level." @@ -2626,49 +2714,49 @@ msgstr "" "内部错误:freadR 输入的不是单个字符串:文件名或者数据文本。该错误本应在 R 中" "被捕获。" -#: freadR.c:93 +#: freadR.c:94 msgid "" "Input contains a \\n or is \")\". Taking this to be text input (not a " "filename)\n" msgstr "输入中包含 \\n 或者是 \")\"。输入将被当做数据文本(而非文件名)\n" -#: freadR.c:96 +#: freadR.c:97 msgid "Input contains no \\n. Taking this to be a filename to open\n" msgstr "输入中不包含 \\n。输入将被当做文件名打开。\n" -#: freadR.c:102 +#: freadR.c:103 msgid "" "Internal error: freadR sep not a single character. R level catches this." msgstr "内部错误:freadR sep 不是单个字符。R 中应该捕获此错误。" -#: freadR.c:106 +#: freadR.c:107 msgid "" "Internal error: freadR dec not a single character. R level catches this." msgstr "内部错误:freadR dec 不是单个字符。R 中应该捕获此错误。" -#: freadR.c:113 +#: freadR.c:114 msgid "quote= must be a single character, blank \"\", or FALSE" msgstr "quote= 必须是单个字符,空白 \"\",或者 FALSE" -#: freadR.c:143 +#: freadR.c:144 msgid "Internal error: skip not integer or string in freadR.c" msgstr "内部错误:freadR.c 中 skip 非整数或字符串" -#: freadR.c:146 +#: freadR.c:147 #, c-format msgid "Internal error: NAstringsArg is type '%s'. R level catches this" msgstr "内部错误:NAstringsArg是'%s'数据类型.R中能够捕获这个信息" -#: freadR.c:159 +#: freadR.c:160 #, c-format msgid "nThread(%d)<1" msgstr "nThread(%1$d)<1(线程数(%1$d)小于1)" -#: freadR.c:166 +#: freadR.c:168 msgid "'integer64' must be a single character string" msgstr "'64整数型'必须是单个字符串" -#: freadR.c:174 +#: freadR.c:176 #, c-format msgid "" "Invalid value integer64='%s'. 
Must be 'integer64', 'character', 'double' or " @@ -2677,11 +2765,11 @@ msgstr "" "64位整数型有效值='%s'.必须是'64位整数型','字符串','双精度浮点型'或者'数值" "型'" -#: freadR.c:182 +#: freadR.c:184 msgid "Use either select= or drop= but not both." msgstr "select=和drop=不可同时使用" -#: freadR.c:185 +#: freadR.c:187 msgid "" "select= is type list for specifying types in select=, but colClasses= has " "been provided as well. Please remove colClasses=." @@ -2689,7 +2777,7 @@ msgstr "" "select=是用于在select=中指定类型的类型列表,但是还提供了colClasses=。请删除" "colClasses=。" -#: freadR.c:187 +#: freadR.c:189 msgid "" "select= is type list but has no names; expecting list(type1=cols1, " "type2=cols2, ...)" @@ -2697,7 +2785,7 @@ msgstr "" "select =是类型列表,但没有名称; 期望列表(type1 = cols1,type2 = " "cols2,...)" -#: freadR.c:194 +#: freadR.c:196 msgid "" "select= is a named vector specifying the columns to select and their types, " "but colClasses= has been provided as well. Please remove colClasses=." @@ -2705,45 +2793,45 @@ msgstr "" "select =是一个命名向量,用于指定要选择的列及其类型,但是还提供了colClasses " "=。 请删除colClasses =。" -#: freadR.c:202 freadR.c:368 +#: freadR.c:204 freadR.c:370 msgid "colClasses is type list but has no names" msgstr "colClasses是类型列表,但没有名称" -#: freadR.c:212 +#: freadR.c:214 #, c-format msgid "encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'" msgstr "encoding ='%s'无效。 必须为'未知','Latin-1'或'UTF-8'" -#: freadR.c:235 +#: freadR.c:237 #, c-format msgid "Column name '%s' (%s) not found" msgstr "找不到列名'%s'(%s)" -#: freadR.c:237 +#: freadR.c:239 #, c-format msgid "%s is NA" msgstr "%s是缺失值" -#: freadR.c:239 +#: freadR.c:241 #, c-format msgid "%s is %d which is out of range [1,ncol=%d]" msgstr "%s是%d,超出范围[1,ncol =%d]" -#: freadR.c:253 +#: freadR.c:255 msgid "Internal error: typeSize[CT_BOOL8_N] != 1" msgstr "内部错误:类型大小[CT_BOOL8_N]不等于1" -#: freadR.c:254 +#: freadR.c:256 msgid "Internal error: typeSize[CT_STRING] != 1" msgstr "内部错误:类型大小[CT_STRING]不等于1" -#: freadR.c:288 +#: freadR.c:290 #, c-format msgid "" "Column name '%s' not found in column name header (case sensitive), skipping." msgstr "在列名标题中找不到列名'%s'(区分大小写),正在跳过。" -#: freadR.c:298 +#: freadR.c:300 #, c-format msgid "" "Column number %d (select[%d]) is negative but should be in the range [1,ncol=" @@ -2751,7 +2839,7 @@ msgid "" msgstr "" "列号%d(select [%d])为负,但应在[1,ncol =%d]范围内。考虑drop=用于排除列。" -#: freadR.c:299 +#: freadR.c:301 #, c-format msgid "" "select = 0 (select[%d]) has no meaning. All values of select should be in " @@ -2759,19 +2847,19 @@ msgid "" msgstr "" "select=0(select[%d])没有意义。select的所有值都应在[1,ncol=%d]范围内。" -#: freadR.c:300 +#: freadR.c:302 #, c-format msgid "" "Column number %d (select[%d]) is too large for this table, which only has %d " "columns." msgstr "对于此表(仅包含%d列,)列号%d(select [%d])太大。" -#: freadR.c:301 +#: freadR.c:303 #, c-format msgid "Column number %d ('%s') has been selected twice by select=" msgstr "列号%d('%s')已由select =选择两次" -#: freadR.c:324 +#: freadR.c:326 #, c-format msgid "" "colClasses= is an unnamed vector of types, length %d, but there are %d " @@ -2783,11 +2871,11 @@ msgstr "" "定类型,可以使用命名向量,列表格式或使用select=而不是colClasses=。请参阅'?" 
"fread'中的示例。" -#: freadR.c:344 +#: freadR.c:346 msgid "Internal error: selectInts is NULL but selectColClasses is true" msgstr "内部错误:selectInts为NULL,但selectColClasses为true" -#: freadR.c:346 +#: freadR.c:348 msgid "" "Internal error: length(selectSxp)!=length(colClassesSxp) but " "selectColClasses is true" @@ -2795,22 +2883,22 @@ msgstr "" "内部错误:length(select xp)!=length(colClasses xp),但select ColClasses" "为true" -#: freadR.c:366 +#: freadR.c:368 #, c-format msgid "colClasses is type '%s' but should be list or character" msgstr "colClasses是类型'%s',但应该是列表或字符" -#: freadR.c:390 +#: freadR.c:392 #, c-format msgid "Column name '%s' (colClasses[[%d]][%d]) not found" msgstr "找不到列名'%s'(colClasses[[%d]][%d])" -#: freadR.c:392 +#: freadR.c:394 #, c-format msgid "colClasses[[%d]][%d] is NA" msgstr "colClasses[[%d]][%d]是NA" -#: freadR.c:396 +#: freadR.c:398 #, c-format msgid "" "Column %d ('%s') appears more than once in colClasses. The second time is " @@ -2818,22 +2906,22 @@ msgid "" msgstr "" "Column %d ('%s')在colClasses中出现了多次。第二次是colClasses[[%d]][%d]." -#: freadR.c:408 +#: freadR.c:410 #, c-format msgid "Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]" msgstr "列号%d(colClasses[[%d]][%d])超出范围[1,ncol=%d]" -#: freadR.c:624 +#: freadR.c:626 #, c-format msgid "Field size is 1 but the field is of type %d\n" msgstr "字段大小为1,但字段类型为%d \n" -#: freadR.c:633 +#: freadR.c:635 #, c-format msgid "Internal error: unexpected field of size %d\n" msgstr "内部错误:大小为%d 的意外字段\n" -#: freadR.c:701 +#: freadR.c:703 #, c-format msgid "%s" msgstr "%s" @@ -3022,7 +3110,7 @@ msgstr "" "内部错误: 在 rolling 函数中无效的 fun 参数, 理应在更早阶段排除请向data.table " "issue tracker报告" -#: frollR.c:155 frollR.c:279 nafill.c:162 shift.c:21 +#: frollR.c:155 frollR.c:279 nafill.c:162 shift.c:19 msgid "fill must be a vector of length 1" msgstr "fill 必须是长度为1的向量" @@ -3146,7 +3234,7 @@ msgstr "前5个MSB counts:" msgid "% " msgstr "% " -#: fsort.c:247 fwrite.c:702 fwrite.c:966 +#: fsort.c:247 fwrite.c:702 msgid "\n" msgstr "\n" @@ -3165,6 +3253,19 @@ msgstr "%d 通过排除0和1的counts\n" msgid "%d: %.3f (%4.1f%%)\n" msgstr "%d: %.3f (%4.1f%%)\n" +#: fwrite.c:572 +#, c-format +msgid "deflate input stream: %p %d %p %d\n" +msgstr "deflate (压缩) 输入数据流:%p %d %p %d\n" + +#: fwrite.c:575 +#, c-format +msgid "" +"deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, " +"Z_STREAM_END==%d\n" +msgstr "deflate (压缩) 返回 %d,stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, " +"Z_STREAM_END==%d\n" + #: fwrite.c:613 #, c-format msgid "buffMB=%d outside [1,1024]" @@ -3244,6 +3345,11 @@ msgstr "无法为header: %2$s分配%1$d MiB" msgid "Can't allocate gzip stream structure" msgstr "无法分配gzip的流结构" +#: fwrite.c:743 fwrite.c:752 +#, c-format +msgid "z_stream for header (%d): " +msgstr "header (%d) 的 z_stream:" + #: fwrite.c:748 #, c-format msgid "Unable to allocate %d MiB for zbuffer: %s" @@ -3254,7 +3360,7 @@ msgstr "无法为zbuffer: %2$s分配%1$d MiB" msgid "Compress gzip error: %d" msgstr "解压gzip错误: %d" -#: fwrite.c:765 fwrite.c:773 fwrite.c:972 +#: fwrite.c:765 fwrite.c:773 #, c-format msgid "%s: '%s'" msgstr "%s: '%s'" @@ -3278,6 +3384,27 @@ msgstr "" "showProgress=%5$d, nth=%6$d)\n" ")\n" +#: fwrite.c:812 +#, c-format +msgid "" +"Unable to allocate %d MB * %d thread buffers; '%d: %s'. Please read ?fwrite " +"for nThread, buffMB and verbose options." +msgstr "无法分配 %d MB * %d 的线程缓存;'%d: %s'。请阅读 ?fwrite 中" +"对 nThread、buffMB 和 verbose 选项的说明。" + +#: fwrite.c:822 +#, c-format +msgid "" +"Unable to allocate %d MB * %d thread compressed buffers; '%d: %s'. 
Please " +"read ?fwrite for nThread, buffMB and verbose options." +msgstr "无法分配 %d MB * %d 的线程压缩缓存;'%d: %s'。请" +"阅读 ?fwrite 中对 nThread、buffMB 和 verbose 选项的说明。" + +#: fwrite.c:851 fwrite.c:883 fwrite.c:885 +#, c-format +msgid "z_stream for data (%d): " +msgstr "data (%d) 的 z_stream:" + #: fwrite.c:980 #, c-format msgid "" @@ -3461,7 +3588,7 @@ msgid "" "hold so the result has been coerced to 'numeric' automatically for " "convenience." msgstr "" -"某整数列分组求和的结果中,出现了超过了整型(interger)数值所允许最大值的情" +"某整数列分组求和的结果中,出现了超过了整型(integer)数值所允许最大值的情" "况,故结果被自动转换为数值类型(numeric)" #: gsumm.c:565 @@ -3999,6 +4126,10 @@ msgstr "" "内部错误:函数 nafillR 中有无效类型的参数, 该错误理应已被捕获,请向data.table" "的issue通道报告" +#: nafill.c:182 +msgid "nan_is_na must be TRUE or FALSE" +msgstr "nan_is_na 必须是 TRUE 或者 FALSE" + #: nafill.c:206 #, c-format msgid "%s: parallel processing of %d column(s) took %.3fs\n" @@ -4010,8 +4141,8 @@ msgid "" "Ignoring invalid %s==\"%s\". Not an integer >= 1. Please remove any " "characters that are not a digit [0-9]. See ?data.table::setDTthreads." msgstr "" -"忽略无效的 %s==\"%s\". 不是一个 >= 1 的整型. 请去除任何不是[0-9]数字的字" -"符。 查看?data.table::setDTthreads." +"忽略无效的 %s==\"%s\". 不是一个 >= 1 的整型. 请去除任何不是[0-9]数字的字符。 " +"查看?data.table::setDTthreads." #: openmp-utils.c:44 #, c-format @@ -4375,8 +4506,9 @@ msgstr "nrow(x)[%d] 不等于 length(order)[%d]" msgid "" "Item %d of order (%d) is either NA, out of range [1,%d], or is duplicated. " "The new order must be a strict permutation of 1:n" -msgstr "排序(%2$d)的 %1$d 项为 NA,超出范围 [1,%3$d],或与其他项重复。" -"新的排序必须为 1:n 的排列" +msgstr "" +"排序(%2$d)的 %1$d 项为 NA,超出范围 [1,%3$d],或与其他项重复。新的排序必须" +"为 1:n 的排列" #: reorder.c:105 msgid "dt passed to setcolorder has no names" @@ -4387,14 +4519,14 @@ msgstr "setcolorder读取到的dt并没有名字" msgid "Internal error: dt passed to setcolorder has %d columns but %d names" msgstr "内部错误: setcolorder读取到的dt有 %d 列但是有 %d 个名字。" -#: shift.c:17 +#: shift.c:15 #, c-format msgid "" "type '%s' passed to shift(). Must be a vector, list, data.frame or data.table" msgstr "" "传递给 shift() 的 '%s' 类型,必须是向量、列表、data.frame 或 data.table" -#: shift.c:24 shift.c:28 +#: shift.c:22 shift.c:26 msgid "" "Internal error: invalid type for shift(), should have been caught before. 
" "please report to data.table issue tracker" @@ -4402,20 +4534,28 @@ msgstr "" "内部错误:shift() 的类型无效,请提前排查。请向 data.table 提交问题追踪" "(issue tracker)报告" -#: shift.c:31 +#: shift.c:29 msgid "Internal error: k must be integer" msgstr "内部错误:k 必须是整数" -#: shift.c:33 +#: shift.c:31 #, c-format msgid "Item %d of n is NA" msgstr "n 的第 %d 项是NA" -#: shift.c:157 +#: shift.c:170 #, c-format msgid "Unsupported type '%s'" msgstr "不支持 '%s' 类型" +#: snprintf.c:192 snprintf.c:195 snprintf.c:198 snprintf.c:201 snprintf.c:204 +#: snprintf.c:207 snprintf.c:210 snprintf.c:213 snprintf.c:216 snprintf.c:217 +#: snprintf.c:220 snprintf.c:223 snprintf.c:226 snprintf.c:229 snprintf.c:232 +#: snprintf.c:235 snprintf.c:238 snprintf.c:241 snprintf.c:244 +#, c-format +msgid "dt_win_snprintf test %d failed: %s" +msgstr "dt_win_snprintf 测试 %d 失败了: %s" + #: subset.c:7 #, c-format msgid "Internal error: subsetVectorRaw length(ans)==%d n=%d" @@ -4669,20 +4809,17 @@ msgstr "%s:fill参数的长度必须为1" msgid "%s: fill argument must be numeric" msgstr "%s:fill参数必须为数值类型" -#: utils.c:280 +#: utils.c:281 #, c-format msgid "Internal error: unsupported type '%s' passed to copyAsPlain()" msgstr "内部错误:copyAsPlain()不支持类型为'%s'的参数" -#: utils.c:284 +#: utils.c:286 #, c-format -msgid "" -"Internal error: type '%s' passed to copyAsPlain() but it seems " -"copyMostAttrib() retains ALTREP attributes" -msgstr "" -"内部错误:copyAsPlain()中参数为'%s'类型,但copyMostAttrib() 保留了ALTREP属性" +msgid "Internal error: copyAsPlain returning ALTREP for type '%s'" +msgstr "内部错误:copyAsPlain 返回了类型为 '%s' 的 ALTREP" -#: utils.c:319 +#: utils.c:330 #, c-format msgid "Found and copied %d column%s with a shared memory address\n" msgstr "发现并拷贝了具有相同的内存地址的%d列%s\n" diff --git a/src/assign.c b/src/assign.c index 5a944b4993..5c0b808707 100644 --- a/src/assign.c +++ b/src/assign.c @@ -647,7 +647,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) R_isort(tt, ndelete); // sort the column-numbers-to-delete into ascending order for (int i=0; i=tt[i+1]) - error("Internal error: %d column numbers to delete not now in strictly increasing order. No-dups were checked earlier."); // # nocov + error(_("Internal error: %d column numbers to delete not now in strictly increasing order. No-dups were checked earlier.")); // # nocov } for (int i=tt[0], j=1, k=tt[0]+1; i0) { if (xlength(cons) != len0) { - error("Argument #%d has a different length than argument #1. " - "Please make sure all logical conditions have the same length.", - i*2+1); + error(_("Argument #%d has a different length than argument #1. " + "Please make sure all logical conditions have the same length."), + i*2+1); } if (TYPEOF(outs) != type0) { - error("Argument #%d is of type %s, however argument #2 is of type %s. " - "Please make sure all output values have the same type.", - i*2+2, type2char(TYPEOF(outs)), type2char(type0)); + error(_("Argument #%d is of type %s, however argument #2 is of type %s. 
" + "Please make sure all output values have the same type."), + i*2+2, type2char(TYPEOF(outs)), type2char(type0)); } if (!R_compute_identical(PROTECT(getAttrib(value0,R_ClassSymbol)), PROTECT(getAttrib(outs,R_ClassSymbol)), 0)) { - error("Argument #%d has different class than argument #2, " - "Please make sure all output values have the same class.", i*2+2); + error(_("Argument #%d has different class than argument #2, " + "Please make sure all output values have the same class."), i*2+2); } UNPROTECT(2); if (isFactor(value0)) { if (!R_compute_identical(PROTECT(getAttrib(value0,R_LevelsSymbol)), PROTECT(getAttrib(outs,R_LevelsSymbol)), 0)) { - error("Argument #2 and argument #%d are both factor but their levels are different.", i*2+2); + error(_("Argument #2 and argument #%d are both factor but their levels are different."), i*2+2); } UNPROTECT(2); } } int64_t len1 = xlength(outs); if (len1!=len0 && len1!=1) { - error("Length of output value #%d must either be 1 or length of logical condition.", i*2+2); + error(_("Length of output value #%d must either be 1 or length of logical condition."), i*2+2); } int64_t amask = len1>1 ? INT64_MAX : 0; const int *restrict pcons = LOGICAL(cons); @@ -333,7 +333,7 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { } } break; default: - error("Type %s is not supported.", type2char(TYPEOF(outs))); + error(_("Type %s is not supported."), type2char(TYPEOF(outs))); } UNPROTECT(2); // this cons and outs if (l==0) { diff --git a/src/forder.c b/src/forder.c index 79d126e4c2..16e40593e3 100644 --- a/src/forder.c +++ b/src/forder.c @@ -501,7 +501,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S int keyAlloc = (ncol+n_cplx)*8 + 1; // +1 for NULL to mark end; calloc to initialize with NULLs key = calloc(keyAlloc, sizeof(uint8_t *)); // needs to be before loop because part II relies on part I, column-by-column. 
if (!key) - STOP("Unable to allocate %"PRId64" bytes of working memory", (uint64_t)keyAlloc*sizeof(uint8_t *)); // # nocov + STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)keyAlloc*sizeof(uint8_t *)); // # nocov nradix=0; // the current byte we're writing this column to; might be squashing into it (spare>0) int spare=0; // the amount of bits remaining on the right of the current nradix byte bool isReal=false; @@ -605,7 +605,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S if (key[nradix+b]==NULL) { uint8_t *tt = calloc(nrow, sizeof(uint8_t)); // 0 initialize so that NA's can just skip (NA is always the 0 offset) if (!tt) - STOP("Unable to allocate %"PRIu64" bytes of working memory", (uint64_t)nrow*sizeof(uint8_t)); // # nocov + STOP(_("Unable to allocate %"PRIu64" bytes of working memory"), (uint64_t)nrow*sizeof(uint8_t)); // # nocov key[nradix+b] = tt; } } diff --git a/src/fwrite.c b/src/fwrite.c index 5c5e9eb579..7a79e6ad8c 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -569,10 +569,10 @@ int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* sour stream->avail_out = *destLen; stream->next_in = (Bytef *)source; // don't use z_const anywhere; #3939 stream->avail_in = sourceLen; - if (verbose) DTPRINT("deflate input stream: %p %d %p %d\n", stream->next_out, (int)(stream->avail_out), stream->next_in, (int)(stream->avail_in)); + if (verbose) DTPRINT(_("deflate input stream: %p %d %p %d\n"), stream->next_out, (int)(stream->avail_out), stream->next_in, (int)(stream->avail_in)); int err = deflate(stream, Z_FINISH); - if (verbose) DTPRINT("deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, Z_STREAM_END==%d\n", err, (int)(stream->total_out), Z_FINISH, Z_OK, Z_STREAM_END); + if (verbose) DTPRINT(_("deflate returned %d with stream->total_out==%d; Z_FINISH==%d, Z_OK==%d, Z_STREAM_END==%d\n"), err, (int)(stream->total_out), Z_FINISH, Z_OK, Z_STREAM_END); if (err == Z_OK) { // with Z_FINISH, deflate must return Z_STREAM_END if correct, otherwise it's an error and we shouldn't return Z_OK (0) err = -9; // # nocov @@ -740,7 +740,7 @@ void fwriteMain(fwriteMainArgs args) free(buff); // # nocov STOP(_("Can't allocate gzip stream structure")); // # nocov } - if (verbose) {DTPRINT("z_stream for header (1): "); print_z_stream(&stream);} + if (verbose) {DTPRINT(_("z_stream for header (%d): "), 1); print_z_stream(&stream);} size_t zbuffSize = deflateBound(&stream, headerLen); char *zbuff = malloc(zbuffSize); if (!zbuff) { @@ -749,7 +749,7 @@ void fwriteMain(fwriteMainArgs args) } size_t zbuffUsed = zbuffSize; ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (size_t)(ch-buff)); - if (verbose) {DTPRINT("z_stream for header (2): "); print_z_stream(&stream);} + if (verbose) {DTPRINT(_("z_stream for header (%d): "), 2); print_z_stream(&stream);} if (ret1==Z_OK) ret2 = WRITE(f, zbuff, (int)zbuffUsed); deflateEnd(&stream); free(zbuff); @@ -809,7 +809,7 @@ void fwriteMain(fwriteMainArgs args) char *buffPool = malloc(nth*(size_t)buffSize); if (!buffPool) { // # nocov start - STOP("Unable to allocate %d MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options.", + STOP(_("Unable to allocate %d MB * %d thread buffers; '%d: %s'. 
Please read ?fwrite for nThread, buffMB and verbose options."), (size_t)buffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } @@ -819,7 +819,7 @@ void fwriteMain(fwriteMainArgs args) if (!zbuffPool) { // # nocov start free(buffPool); - STOP("Unable to allocate %d MB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options.", + STOP(_("Unable to allocate %d MB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), (size_t)zbuffSize/(1024^2), nth, errno, strerror(errno)); // # nocov end } @@ -848,7 +848,7 @@ void fwriteMain(fwriteMainArgs args) failed = true; // # nocov my_failed_compress = -998; // # nocov } - if (verbose) {DTPRINT("z_stream for data (1): "); print_z_stream(&mystream);} + if (verbose) {DTPRINT(_("z_stream for data (%d): "), 1); print_z_stream(&mystream);} } #pragma omp for ordered schedule(dynamic) @@ -880,9 +880,9 @@ void fwriteMain(fwriteMainArgs args) // compress buffer if gzip if (args.is_gzip && !failed) { myzbuffUsed = zbuffSize; - if (verbose) {DTPRINT("z_stream for data (2): "); print_z_stream(&mystream);} + if (verbose) {DTPRINT(_("z_stream for data (%d): "), 2); print_z_stream(&mystream);} int ret = compressbuff(&mystream, myzBuff, &myzbuffUsed, myBuff, (size_t)(ch-myBuff)); - if (verbose) {DTPRINT("z_stream for data (3): "); print_z_stream(&mystream);} + if (verbose) {DTPRINT(_("z_stream for data (%d): "), 3); print_z_stream(&mystream);} if (ret) { failed=true; my_failed_compress=ret; } else deflateReset(&mystream); } @@ -963,13 +963,13 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("\r " " \r"); } else { // don't clear any potentially helpful output before error - DTPRINT(_("\n")); + DTPRINT("\n"); } // # nocov end } if (f!=-1 && CLOSE(f) && !failed) - STOP(_("%s: '%s'"), strerror(errno), args.filename); // # nocov + STOP("%s: '%s'", strerror(errno), args.filename); // # nocov // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. 
So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail diff --git a/src/nafill.c b/src/nafill.c index 84e603cc0e..d3da3c9c0b 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -179,7 +179,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], verbose); } else { if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) - error("nan_is_na must be TRUE or FALSE"); // # nocov + error(_("nan_is_na must be TRUE or FALSE")); // # nocov bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; nafillDouble(dx[i], inx[i], itype, dfill, nan_is_na, &vans[i], verbose); } diff --git a/src/snprintf.c b/src/snprintf.c index 52f7ea37c6..497437644d 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -189,60 +189,59 @@ SEXP test_dt_win_snprintf() char buff[50]; dt_win_snprintf(buff, 50, "No pos %d%%%d ok", 42, -84); - if (strcmp(buff, "No pos 42%-84 ok")) error("dt_win_snprintf test 1 failed: %s", buff); + if (strcmp(buff, "No pos 42%-84 ok")) error(_("dt_win_snprintf test %d failed: %s"), 1, buff); dt_win_snprintf(buff, 50, "With pos %1$d%%%2$d ok", 42, -84); - if (strcmp(buff, "With pos 42%-84 ok")) error("dt_win_snprintf test 2 failed: %s", buff); + if (strcmp(buff, "With pos 42%-84 ok")) error(_("dt_win_snprintf test %d failed: %s"), 2, buff); dt_win_snprintf(buff, 50, "With pos %2$d%%%1$d ok", 42, -84); - if (strcmp(buff, "With pos -84%42 ok")) error("dt_win_snprintf test 3 failed: %s", buff); + if (strcmp(buff, "With pos -84%42 ok")) error(_("dt_win_snprintf test %d failed: %s"), 3, buff); dt_win_snprintf(buff, 50, "%3$s %1$d %4$10s %2$03d$", -99, 12, "hello%2$d", "short"); - if (strcmp(buff, "hello%2$d -99 short 012$")) error("dt_win_snprintf test 4 failed: %s", buff); + if (strcmp(buff, "hello%2$d -99 short 012$")) error(_("dt_win_snprintf test %d failed: %s"), 4, buff); dt_win_snprintf(buff, 50, "%1$d %s", 9, "foo"); - if (strcmp(buff, "3 some %n$ but not all")) error("dt_win_snprintf test 5 failed: %s", buff); + if (strcmp(buff, "3 some %n$ but not all")) error(_("dt_win_snprintf test %d failed: %s"), 5, buff); dt_win_snprintf(buff, 50, "%%1$foo%d", 9); // The %1$f is not a specifier because % is doubled - if (strcmp(buff, "%1$foo9")) error("dt_win_snprintf test 6 failed: %s", buff); + if (strcmp(buff, "%1$foo9")) error(_("dt_win_snprintf test %d failed: %s"), 6, buff); dt_win_snprintf(buff, 40, "long format string more than n==%d chopped", 40); // regular library (no %n$) chops to 39 chars + '/0' - if (strlen(buff)!=39 || strcmp(buff, "long format string more than n==40 chop")) error("dt_win_snprintf test 7 failed: %s", buff); + if (strlen(buff)!=39 || strcmp(buff, "long format string more than n==40 chop")) error(_("dt_win_snprintf test %d failed: %s"), 7, buff); dt_win_snprintf(buff, 40, "long %3$s %2$s more than n==%1$d chopped", 40, "string", "format"); // same with dt_win_snprintf - if (strlen(buff)!=39 || strcmp(buff, "long format string more than n==40 chop")) error("dt_win_snprintf test 8 failed: %s", buff); + if (strlen(buff)!=39 || strcmp(buff, "long format string more than n==40 chop")) error(_("dt_win_snprintf test %d failed: %s"), 8, buff); int res = dt_win_snprintf(buff, 10, "%4$d%2$d%3$d%5$d%1$d", 111, 222, 33, 44, 555); // fmt longer than n - if (strlen(buff)!=9 || strcmp(buff, "442223355")) error("dt_win_snprintf test 9 failed: %s", buff); - if (res!=13) /* should return what would have been written if not chopped */ error("dt_win_snprintf test 10 failed: %d", res); + if 
(strlen(buff)!=9 || strcmp(buff, "442223355")) error(_("dt_win_snprintf test %d failed: %s"), 9, buff); + if (res!=13) /* should return what would have been written if not chopped */ error(_("dt_win_snprintf test %d failed: %s"), 10, res); dt_win_snprintf(buff, 39, "%l", 3); - if (strlen(buff)!=38 || strcmp(buff, "0 %l does not end with recognized t")) error("dt_win_snprintf test 11 failed: %s", buff); + if (strlen(buff)!=38 || strcmp(buff, "0 %l does not end with recognized t")) error(_("dt_win_snprintf test %d failed: %s"), 11, buff); dt_win_snprintf(buff, 19, "%l", 3); - if (strlen(buff)!=18 || strcmp(buff, "0 %l does not e")) error("dt_win_snprintf test 12 failed: %s", buff); + if (strlen(buff)!=18 || strcmp(buff, "0 %l does not e")) error(_("dt_win_snprintf test %d failed: %s"), 12, buff); dt_win_snprintf(buff, 50, "%1$d == %0$d", 1, 2); - if (strcmp(buff, "1 %0$ outside range [1,99]")) error("dt_win_snprintf test 13 failed: %s", buff); + if (strcmp(buff, "1 %0$ outside range [1,99]")) error(_("dt_win_snprintf test %d failed: %s"), 13, buff); dt_win_snprintf(buff, 50, "%1$d == %$d", 1, 2); - if (strcmp(buff, "1 %$ outside range [1,99]")) error("dt_win_snprintf test 14 failed: %s", buff); + if (strcmp(buff, "1 %$ outside range [1,99]")) error(_("dt_win_snprintf test %d failed: %s"), 14, buff); dt_win_snprintf(buff, 50, "%1$d == %100$d", 1, 2); - if (strcmp(buff, "1 %100$ outside range [1,99]")) error("dt_win_snprintf test 15 failed: %s", buff); + if (strcmp(buff, "1 %100$ outside range [1,99]")) error(_("dt_win_snprintf test %d failed: %s"), 15, buff); dt_win_snprintf(buff, 50, "%1$d == %-1$d", 1, 2); - if (strcmp(buff, "1 %-1$ outside range [1,99]")) error("dt_win_snprintf test 16 failed: %s", buff); + if (strcmp(buff, "1 %-1$ outside range [1,99]")) error(_("dt_win_snprintf test %d failed: %s"), 16, buff); dt_win_snprintf(buff, 50, "%1$d == %3$d", 1, 2, 3); - if (strcmp(buff, "5 %2$ missing")) error("dt_win_snprintf test 17 failed: %s", buff); + if (strcmp(buff, "5 %2$ missing")) error(_("dt_win_snprintf test %d failed: %s"), 17, buff); dt_win_snprintf(buff, 50, "%1$d == %1$d", 42); - if (strcmp(buff, "2 %1$ appears twice")) error("dt_win_snprintf test 18 failed: %s", buff); + if (strcmp(buff, "2 %1$ appears twice")) error(_("dt_win_snprintf test %d failed: %s"), 18, buff); dt_win_snprintf(buff, 50, "%1$d + %3$d - %2$d == %3$d", 1, 1, 2); - if (strcmp(buff, "2 %3$ appears twice")) error("dt_win_snprintf test 19 failed: %s", buff); + if (strcmp(buff, "2 %3$ appears twice")) error(_("dt_win_snprintf test %d failed: %s"), 19, buff); return R_NilValue; } - From 91df161c5f1f0b6b87a6ad2c415bf95f34003525 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 1 Nov 2020 12:10:15 -0700 Subject: [PATCH 116/588] as.matrix(empty DT) retains type (#4790) --- NEWS.md | 2 ++ R/data.table.R | 8 ++++++-- inst/tests/tests.Rraw | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index f7c46a17b2..a2318dda88 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ ## BUG FIXES +1. `as.matrix()` now retains the column type for the empty matrix result, [#4762](https://github.com/Rdatatable/data.table/issues/4762). Thus, for example, `min(DT[0])` where DT's columns are numeric, is now consistent with non-empty all-NA input and returns `Inf` with R's warning `no non-missing arguments to min; returning Inf` rather than R's error `only defined on a data frame with all numeric[-alike] variables`. Thanks to @mb706 for reporting. + ## NOTES 1. 
Continuous daily testing by CRAN using latest daily R-devel revealed, within one day of the change to R-devel, that a future version of R would break one of our tests, [#4769](https://github.com/Rdatatable/data.table/issues/4769). The characters "-alike" were added into one of R's error messages, so our too-strict test which expected the error `only defined on a data frame with all numeric variables` will fail when it sees the new error message `only defined on a data frame with all numeric-alike variables`. We have relaxed the pattern the test looks for to `data.*frame.*numeric` well in advance of the future version of R being released. Readers are reminded that CRAN is not just a host for packages. It is also a giant test suite for R-devel. For more information, [behind the scenes of cran, 2016](https://www.h2o.ai/blog/behind-the-scenes-of-cran/). diff --git a/R/data.table.R b/R/data.table.R index 99afcfb271..d513891b93 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1925,8 +1925,6 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { cn = names(x) X = x } - if (any(dm == 0L)) - return(array(NA, dim = dm, dimnames = list(rownames.value, cn))) p = dm[2L] n = dm[1L] collabs = as.list(cn) @@ -1973,6 +1971,12 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { } } X = unlist(X, recursive = FALSE, use.names = FALSE) + if (any(dm==0L)) { + # retain highest type of input for empty output, #4762 + if (length(X)!=0L) + stop("Internal error: as.matrix.data.table length(X)==", length(X), " but a dimension is zero") # nocov + return(array(if (is.null(X)) NA else X, dim = dm, dimnames = list(rownames.value, cn))) + } dim(X) <- c(n, length(X)/n) dimnames(X) <- list(rownames.value, unlist(collabs, use.names = FALSE)) X diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e3753ebd2d..e68b818aa8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13777,10 +13777,8 @@ test(1967.524, x[1:2, keyby=a], x[1:2,], warning="Ignoring keyby= because j= is test(1967.525, x[, keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future")) test(1967.526, x[keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.53, as.matrix(x, rownames = 2:3), - error = 'length(rownames)==2 but') -test(1967.54, as.matrix(x[0L]), - structure(logical(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b")))) +test(1967.53, as.matrix(x, rownames = 2:3), error='length(rownames)==2 but') +test(1967.54, as.matrix(x[0L]), structure(integer(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b")))) test(1967.55, subset(x, 5L), error = "'subset' must evaluate to logical") @@ -17185,4 +17183,32 @@ test(2158.1, DT[, .(value = list(value)), index], DT = data.table(value=as.list(1:6), index=rep(1:2, each=3)) test(2158.2, DT[, by="index", list(value=list(value))], data.table(index=1:2, value=list(as.list(1:3), as.list(4:6)))) + +# type consistency of empty input to as.matrix.data.table, #4762 +DT = data.table(x = 1) +test(2159.01, typeof(as.matrix(DT)), "double") +test(2159.02, typeof(as.matrix(DT[0L])), "double") +test(2159.03, min(DT[0L]), Inf, warning="missing") # R's warning message; use one word 'missing' to insulate from possible future changes to R's message +DT = data.table(x = 1L) +test(2159.04, typeof(as.matrix(DT)), "integer") +test(2159.05, typeof(as.matrix(DT[0L])), "integer") +test(2159.06, min(DT[0L]), 
Inf, warning="missing") +DT = data.table(x = TRUE) +test(2159.07, typeof(as.matrix(DT)), "logical") +test(2159.08, typeof(as.matrix(DT[0L])), "logical") +x = try(min(data.frame(X=c(TRUE,FALSE))), silent=TRUE) +if (inherits(x,"try-error")) { + # this version of R doesn't have the fix linked to from #4762. That fix was made to R-devel in Oct 2020 when R-release was 4.0.3 + test(2159.09, min(DT[0L]), error="only.*numeric") +} else { + test(2159.10, min(DT[0L]), Inf, warning="missing") +} +DT = data.table(x = c("a","b")) +test(2159.11, typeof(as.matrix(DT)), "character") +test(2159.12, typeof(as.matrix(DT[0L])), "character") +test(2159.13, min(DT[0L]), error="only.*numeric") # R's message 'only defined on a data frame with all numeric[-alike] variables' +DT = data.table(x=1, y="a") +test(2159.14, typeof(as.matrix(DT)), "character") +test(2159.15, typeof(as.matrix(DT[0L])), "character") +test(2159.16, min(DT[0L]), error="only.*numeric") From 1ab3d58e2373b8ffcfb2294aaa5a8c548bb82d77 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Tue, 3 Nov 2020 04:57:43 +0200 Subject: [PATCH 117/588] CI improvements (#4768) --- .ci/README.md | 3 +- .ci/publish.R | 192 +++++++++++++++++++++++++++++------ .gitlab-ci.yml | 156 +++++++++++++++++----------- inst/tests/other.Rraw | 15 +-- inst/tests/tests-DESCRIPTION | 2 +- 5 files changed, 258 insertions(+), 110 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index 7b8ea3d2a9..72568fd844 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -16,6 +16,7 @@ Test jobs: - `test-350-cran-lin` - R 3.5.0 on Linux, no `r-recommended` - `test-rel-win` - `r-release` on Windows - `test-dev-win` - `r-devel` on Windows +- `test-old-win` - `r-oldrel` on Windows - `test-rel-osx` - MacOSX build not yet deployed, see [#3326](https://github.com/Rdatatable/data.table/issues/3326) for status Artifacts: @@ -25,7 +26,7 @@ Artifacts: - [html vignettes](https://rdatatable.gitlab.io/data.table/library/data.table/doc/index.html) - R packages repository for `data.table` and all _Suggests_ dependencies, url: `https://Rdatatable.gitlab.io/data.table` - sources - - Windows binaries for `r-release` and `r-devel` + - Windows binaries for `r-release`, `r-devel` and `r-oldrel` - [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) - [CRAN-like checks results](https://rdatatable.gitlab.io/data.table/web/checks/check_results_data.table.html) - note that all artifacts, including check results page, are being published only when all test jobs successfully pass, thus one will not see an _ERROR_ status there (unless error happened on a job marked as `allow_failure`). 
- [docker images](https://gitlab.com/Rdatatable/data.table/container_registry) - copy/paste-able `docker pull` commands can be found at the bottom of our [CRAN-like homepage](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html) diff --git a/.ci/publish.R b/.ci/publish.R index fd95947ed4..526d9bd80d 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -1,12 +1,17 @@ format.deps <- function(file, which) { deps.raw = read.dcf(file, fields=which)[[1L]] if (all(is.na(deps.raw))) return(character()) + deps.raw = gsub("\n", " ", deps.raw, fixed=TRUE) deps.full = trimws(strsplit(deps.raw, ", ", fixed=TRUE)[[1L]]) deps = trimws(sapply(strsplit(deps.full, "(", fixed=TRUE), `[[`, 1L)) + deps.full = gsub(">=", "≥", deps.full, fixed=TRUE) + deps.full = gsub("<=", "≤", deps.full, fixed=TRUE) + if (any(grepl(">", deps.full, fixed=TRUE), grepl("<", deps.full, fixed=TRUE), grepl("=", deps.full, fixed=TRUE))) + stop("formatting dependencies version for CRAN-line package website failed because some dependencies have version defined using operators other than >= and <=") names(deps.full) <- deps base.deps = c("R", unlist(tools:::.get_standard_package_names(), use.names = FALSE)) ans = sapply(deps, function(x) { - if (x %in% base.deps) deps.full[[x]] + if (x %in% base.deps) deps.full[[x]] ## base R packages are not linked else sprintf("%s", x, deps.full[[x]]) }) sprintf("%s:%s", which, paste(ans, collapse=", ")) @@ -26,6 +31,39 @@ format.bins <- function(ver, bin_ver, cran.home, os.type, pkg, version, repodir) paste(ans[fe], collapse=", ") } +format.entry <- function(field, dcf, url=FALSE) { + if (field %in% colnames(dcf)) { + value = gsub("\n", " ", dcf[,field], fixed=TRUE) + if (url) { + urls = trimws(strsplit(value, ",", fixed=TRUE)[[1L]]) + value = paste(sprintf("%s", urls, urls), collapse=", ") + } + sprintf("%s:%s", field, value) + } +} +format.maintainer <- function(dcf) { + if ("Maintainer" %in% colnames(dcf)) { + text2html = function(x) { + # https://stackoverflow.com/a/64446320/2490497 + splitted <- strsplit(x, "")[[1L]] + intvalues <- as.hexmode(utf8ToInt(enc2utf8(x))) + paste(paste0("&#x", intvalues, ";"), collapse = "") + } + tmp = gsub("@", " at ", dcf[,"Maintainer"], fixed=TRUE) + sep = regexpr("<", tmp, fixed=TRUE) + name = trimws(substr(tmp, 1L, sep-1L)) + mail = text2html(trimws(substr(tmp, sep, nchar(tmp)))) + sprintf("Maintainer:%s %s", name, mail) + } +} +format.materials <- function() { + return(NULL) ## TODO + value = NA + #NEWS + #README + sprintf("Materials:%s", value) +} + package.index <- function(package, lib.loc, repodir="bus/integration/cran") { file = system.file("DESCRIPTION", package=package, lib.loc=lib.loc) dcf = read.dcf(file) @@ -40,21 +78,31 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { format.deps(file, "LinkingTo"), format.deps(file, "Suggests"), format.deps(file, "Enhances"), + if ("Built" %in% colnames(dcf)) sprintf("Built:%s", substr(trimws(strsplit(dcf[,"Built"], ";", fixed=TRUE)[[1L]][[3L]]), 1L, 10L)), + if ("Author" %in% colnames(dcf)) sprintf("Author:%s", dcf[,"Author"]), + format.maintainer(dcf), + format.entry("BugReports", dcf, url=TRUE), + format.entry("License", dcf), + format.entry("URL", dcf, url=TRUE), + format.entry("NeedsCompilation", dcf), + format.entry("SystemRequirements", dcf), + format.materials(), ## TODO if (pkg=="data.table") sprintf("Checks:%s results", pkg, pkg) ) vign = tools::getVignetteInfo(pkg, lib.loc=lib.loc) - r_bin_ver = Sys.getenv("R_BIN_VERSION") - r_devel_bin_ver = 
Sys.getenv("R_DEVEL_BIN_VERSION") - stopifnot(nzchar(r_bin_ver), nzchar(r_devel_bin_ver)) + r_rel_ver = Sys.getenv("R_REL_VERSION") + r_devel_ver = Sys.getenv("R_DEVEL_VERSION") + r_oldrel_ver = Sys.getenv("R_OLDREL_VERSION") + stopifnot(nzchar(r_rel_ver), nzchar(r_devel_ver), nzchar(r_oldrel_ver)) cran.home = "../../.." tbl.dl = c( sprintf(" Reference manual: %s.pdf, 00Index.html ", pkg, pkg, cran.home, pkg), if (nrow(vign)) sprintf("Vignettes:%s", paste(sprintf("%s
", cran.home, vign[,"PDF"], vign[,"Title"]), collapse="\n")), # location unline cran web/pkg/vignettes to not duplicate content, documentation is in ../../../library sprintf(" Package source: %s_%s.tar.gz ", cran.home,pkg, version, pkg, version), - sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release"), bin_ver=c(r_devel_bin_ver,r_bin_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), - sprintf(" OS X binaries: %s ", format.bins(ver=c("r-devel","r-release"), bin_ver=c(r_devel_bin_ver, r_bin_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) + sprintf(" Windows binaries: %s ", format.bins(ver=c("r-devel","r-release","r-oldrel"), bin_ver=c(r_devel_ver, r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="windows", pkg=pkg, version=version, repodir=repodir)), + sprintf(" macOS binaries: %s ", format.bins(ver=c("r-release","r-oldrel"), bin_ver=c(r_rel_ver, r_oldrel_ver), cran.home=cran.home, os.type="macosx", pkg=pkg, version=version, repodir=repodir)) ) - if (pkg=="data.table") { + if (pkg=="data.table") { ## docker images registry = Sys.getenv("CI_REGISTRY", "registry.gitlab.com") namespace = Sys.getenv("CI_PROJECT_NAMESPACE", "Rdatatable") project = Sys.getenv("CI_PROJECT_NAME", "data.table") @@ -74,7 +122,7 @@ package.index <- function(package, lib.loc, repodir="bus/integration/cran") { "", "", "", - sprintf("