From e4324cf814641167c60697802a1dbe89745c62da Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 6 Jan 2024 13:23:08 +0000 Subject: [PATCH 01/21] 1.15.0 on CRAN. Bump to 1.15.99 --- .dev/CRAN_Release.cmd | 32 ++++++++++++++++---------------- DESCRIPTION | 2 +- Makefile | 6 +++--- src/init.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 0be251e65..78c188de9 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -195,15 +195,15 @@ R CMD build . export GITHUB_PAT="f1c.. github personal access token ..7ad" # avoids many too-many-requests in --as-cran's ping-all-URLs step (20 mins) inside the `checking CRAN incoming feasibility...` step. # Many thanks to Dirk for the tipoff that setting this env variable solves the problem, #4832. -R CMD check data.table_1.14.99.tar.gz --as-cran -R CMD INSTALL data.table_1.14.99.tar.gz --html +R CMD check data.table_1.15.99.tar.gz --as-cran +R CMD INSTALL data.table_1.15.99.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.14.99.tar.gz +R CMD check data.table_1.15.99.tar.gz rm ~/.Renviron # Test non-English does not break test.data.table() due to translation of messages; #3039, #630 @@ -220,9 +220,9 @@ q("no") # User supplied PKG_CFLAGS and PKG_LIBS passed through, #4664 # Next line from https://mac.r-project.org/openmp/. Should see the arguments passed through and then fail with gcc on linux. -PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.14.99.tar.gz +PKG_CFLAGS='-Xclang -fopenmp' PKG_LIBS=-lomp R CMD INSTALL data.table_1.15.99.tar.gz # Next line should work on Linux, just using superfluous and duplicate but valid parameters here to see them retained and work -PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.14.99.tar.gz +PKG_CFLAGS='-fopenmp' PKG_LIBS=-lz R CMD INSTALL data.table_1.15.99.tar.gz R remove.packages("xml2") # we checked the URLs; don't need to do it again (many minutes) @@ -266,7 +266,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.14.99.tar.gz +R310 CMD INSTALL ./data.table_1.15.99.tar.gz R310 require(data.table) test.data.table(script="*.Rraw") @@ -278,7 +278,7 @@ test.data.table(script="*.Rraw") vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.14.99.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.15.99.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -286,7 +286,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.14.99.tar.gz +R CMD check data.table_1.15.99.tar.gz ##################################################### @@ -341,11 +341,11 @@ alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.14.99.tar.gz +Rdevel-strict-[gcc|clang] CMD INSTALL data.table_1.15.99.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so they should be # passed through to here. However, our configure script seems to get in the way and gets them from {R_HOME}/bin/R # So I needed to edit my ~/.R/Makevars to get CFLAGS the way I needed. -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.15.99.tar.gz # Use the (failed) output to get the list of currently needed packages and install them Rdevel-strict-[gcc|clang] isTRUE(.Machine$sizeof.longdouble==0) # check noLD is being tested @@ -354,7 +354,7 @@ install.packages(c("bit64", "bit", "R.utils", "xts", "zoo", "yaml", "knitr", "ma Ncpus=4) # Issue #5491 showed that CRAN is running UBSAN on .Rd examples which found an error so we now run full R CMD check q("no") -Rdevel-strict-[gcc|clang] CMD check data.table_1.14.99.tar.gz +Rdevel-strict-[gcc|clang] CMD check data.table_1.15.99.tar.gz # UBSAN errors occur on stderr and don't affect R CMD check result. Made many failed attempts to capture them. So grep for them. find data.table.Rcheck -name "*Rout*" -exec grep -H "runtime error" {} \; @@ -391,7 +391,7 @@ cd R-devel-valgrind make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O2 -g line active, for info on source lines with any problems -Rdevel-valgrind CMD INSTALL data.table_1.14.99.tar.gz +Rdevel-valgrind CMD INSTALL data.table_1.15.99.tar.gz R_DONT_USE_TK=true Rdevel-valgrind -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite,possible --gen-suppressions=all --suppressions=./.dev/valgrind.supp -s" # the default for --show-leak-kinds is 'definite,possible' which we're setting explicitly here as a reminder. CRAN uses the default too. # including 'reachable' (as 'all' does) generates too much output from R itself about by-design permanent blocks @@ -429,7 +429,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc vi ~/.R/Makevars # set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.14.99.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.15.99.tar.gz",repos=NULL)' | ./bin/R --slave # objcopy warnings (if any) can be ignored: https://github.com/kalibera/rchk/issues/17#issuecomment-497312504 . ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check @@ -622,10 +622,10 @@ bunzip2 inst/tests/*.Rraw.bz2 # decompress *.Rraw again so as not to commit com # 3. Add new heading in NEWS for the next dev version. Add "(submitted to CRAN on )" on the released heading. # 4. Bump minor version in dllVersion() in init.c # 5. Bump 3 minor version numbers in Makefile -# 6. Search and replace this .dev/CRAN_Release.cmd to update 1.14.99 to 1.15.99 inc below, 1.15.0 to 1.16.0 above, 1.14.0 to 1.15.0 below +# 6. Search and replace this .dev/CRAN_Release.cmd to update 1.15.99 to 1.16.99 inc below, 1.16.0 to 1.17.0 above, 1.15.0 to 1.16.0 below # 7. Another final gd to view all diffs using meld. (I have `alias gd='git difftool &> /dev/null'` and difftool meld: http://meldmerge.org/) -# 8. Push to master with this consistent commit message: "1.15.0 on CRAN. Bump to 1.14.10" -# 9. Take sha from step 8 and run `git tag 1.15.0 96c..sha..d77` then `git push origin 1.15.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) +# 8. Push to master with this consistent commit message: "1.16.0 on CRAN. Bump to 1.16.99" +# 9. Take sha from step 8 and run `git tag 1.16.0 96c..sha..d77` then `git push origin 1.16.0` (not `git push --tags` according to https://stackoverflow.com/a/5195913/403310) ###### ###### Bump dev for PATCH RELEASE diff --git a/DESCRIPTION b/DESCRIPTION index d4792140e..bcaa3b257 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: data.table -Version: 1.14.99 +Version: 1.15.99 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods diff --git a/Makefile b/Makefile index 45fb6203b..c5324d3c3 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ some: .PHONY: clean clean: - $(RM) data.table_1.14.99.tar.gz + $(RM) data.table_1.15.99.tar.gz $(RM) src/*.o $(RM) src/*.so @@ -28,7 +28,7 @@ build: .PHONY: install install: - $(R) CMD INSTALL data.table_1.14.99.tar.gz + $(R) CMD INSTALL data.table_1.15.99.tar.gz .PHONY: uninstall uninstall: @@ -40,7 +40,7 @@ test: .PHONY: check check: - _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.14.99.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error + _R_CHECK_CRAN_INCOMING_REMOTE_=false $(R) CMD check data.table_1.15.99.tar.gz --as-cran --ignore-vignettes --no-stop-on-test-error .PHONY: revision revision: diff --git a/src/init.c b/src/init.c index c8e8452ec..1dcc67dc6 100644 --- a/src/init.c +++ b/src/init.c @@ -353,6 +353,6 @@ SEXP initLastUpdated(SEXP var) { SEXP dllVersion(void) { // .onLoad calls this and checks the same as packageVersion() to ensure no R/C version mismatch, #3056 - return(ScalarString(mkChar("1.14.99"))); + return(ScalarString(mkChar("1.15.99"))); } From 18a7209b242fa2a784fe76be845106baacd8349f Mon Sep 17 00:00:00 2001 From: Ofek Date: Sat, 6 Jan 2024 12:49:20 +0200 Subject: [PATCH 02/21] Fix transform slowness (#5493) * Fix 5492 by limiting the costly deparse to `nlines=1` * Implementing PR feedbacks * Added inside * Fix typo in name * Idiomatic use of inside * Separating the deparse line limit to a different PR --------- Co-authored-by: Michael Chirico --- NEWS.md | 6 ++++++ R/data.table.R | 23 ++++------------------- man/transform.data.table.Rd | 4 +--- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index fa7deb3b9..3ef946575 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** +# data.table [v1.15.99]() (in development) + +## NOTES + +1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. + # data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) ## BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index a7009399f..d66773e3a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2345,25 +2345,10 @@ transform.data.table = function (`_data`, ...) # basically transform.data.frame with data.table instead of data.frame, and retains key { if (!cedta()) return(NextMethod()) # nocov - e = eval(substitute(list(...)), `_data`, parent.frame()) - tags = names(e) - inx = chmatch(tags, names(`_data`)) - matched = !is.na(inx) - if (any(matched)) { - .Call(C_unlock, `_data`) # fix for #1641, now covered by test 104.2 - `_data`[,inx[matched]] = e[matched] - `_data` = as.data.table(`_data`) - } - if (!all(matched)) { - ans = do.call("data.table", c(list(`_data`), e[!matched])) - } else { - ans = `_data` - } - key.cols = key(`_data`) - if (!any(tags %chin% key.cols)) { - setattr(ans, "sorted", key.cols) - } - ans + `_data` = copy(`_data`) + e = eval(substitute(list(...)), `_data`, parent.frame()) + set(`_data`, ,names(e), e) + `_data` } subset.data.table = function (x, subset, select, ...) diff --git a/man/transform.data.table.Rd b/man/transform.data.table.Rd index 4a173ce7a..1bf6fb551 100644 --- a/man/transform.data.table.Rd +++ b/man/transform.data.table.Rd @@ -7,9 +7,7 @@ \description{ Utilities for \code{data.table} transformation. - \strong{\code{transform} by group is particularly slow. Please use \code{:=} by group instead.} - - \code{within}, \code{transform} and other similar functions in \code{data.table} are not just provided for users who expect them to work, but for non-data.table-aware packages to retain keys, for example. Hopefully the (much) faster and more convenient \code{data.table} syntax will be used in time. See examples. + \code{within}, \code{transform} and other similar functions in \code{data.table} are not just provided for users who expect them to work, but for non-data.table-aware packages to retain keys, for example. Hopefully the faster and more convenient \code{data.table} syntax will be used in time. See examples. } \usage{ \method{transform}{data.table}(`_data`, \ldots) From b6bd964bb8504f6fe52552830492eaf624699818 Mon Sep 17 00:00:00 2001 From: Ani Date: Sat, 6 Jan 2024 05:37:12 -0800 Subject: [PATCH 03/21] Improvements to the introductory vignette (#5836) * Added my improvements to the intro vignette * Removed two lines I added extra as a mistake earlier * Requested changes --- vignettes/datatable-intro.Rmd | 80 +++++++++++++++++------------------ 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 3624a7c5b..81d4ca131 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -21,19 +21,19 @@ knitr::opts_chunk$set( .old.th = setDTthreads(1) ``` -This vignette introduces the `data.table` syntax, its general form, how to *subset* rows, *select and compute* on columns, and perform aggregations *by group*. Familiarity with `data.frame` data structure from base R is useful, but not essential to follow this vignette. +This vignette introduces the `data.table` syntax, its general form, how to *subset* rows, *select and compute* on columns, and perform aggregations *by group*. Familiarity with the `data.frame` data structure from base R is useful, but not essential to follow this vignette. *** ## Data analysis using `data.table` -Data manipulation operations such as *subset*, *group*, *update*, *join* etc., are all inherently related. Keeping these *related operations together* allows for: +Data manipulation operations such as *subset*, *group*, *update*, *join*, etc. are all inherently related. Keeping these *related operations together* allows for: * *concise* and *consistent* syntax irrespective of the set of operations you would like to perform to achieve your end goal. * performing analysis *fluidly* without the cognitive burden of having to map each operation to a particular function from a potentially huge set of functions available before performing the analysis. -* *automatically* optimising operations internally, and very effectively, by knowing precisely the data required for each operation, leading to very fast and memory efficient code. +* *automatically* optimising operations internally and very effectively by knowing precisely the data required for each operation, leading to very fast and memory-efficient code. Briefly, if you are interested in reducing *programming* and *compute* time tremendously, then this package is for you. The philosophy that `data.table` adheres to makes this possible. Our goal is to illustrate it through this series of vignettes. @@ -58,13 +58,13 @@ flights dim(flights) ``` -Aside: `fread` accepts `http` and `https` URLs directly as well as operating system commands such as `sed` and `awk` output. See `?fread` for examples. +Aside: `fread` accepts `http` and `https` URLs directly, as well as operating system commands such as `sed` and `awk` output. See `?fread` for examples. ## Introduction In this vignette, we will -1. Start with basics - what is a `data.table`, its general form, how to *subset* rows, how to *select and compute* on columns; +1. Start with the basics - what is a `data.table`, its general form, how to *subset* rows, how to *select and compute* on columns; 2. Then we will look at performing data aggregations by group @@ -72,7 +72,7 @@ In this vignette, we will ### a) What is `data.table`? {#what-is-datatable-1a} -`data.table` is an R package that provides **an enhanced version** of `data.frame`s, which are the standard data structure for storing data in `base` R. In the [Data](#data) section above, we already created a `data.table` using `fread()`. We can also create one using the `data.table()` function. Here is an example: +`data.table` is an R package that provides **an enhanced version** of a `data.frame`, the standard data structure for storing data in `base` R. In the [Data](#data) section above, we saw how to create a `data.table` using `fread()`, but alternatively we can also create one using the `data.table()` function. Here is an example: ```{r} DT = data.table( @@ -85,13 +85,13 @@ DT class(DT$ID) ``` -You can also convert existing objects to a `data.table` using `setDT()` (for `data.frame`s and `list`s) and `as.data.table()` (for other structures); the difference is beyond the scope of this vignette, see `?setDT` and `?as.data.table` for more details. +You can also convert existing objects to a `data.table` using `setDT()` (for `data.frame` and `list` structures) or `as.data.table()` (for other structures). For more details pertaining to the difference (goes beyond the scope of this vignette), please see `?setDT` and `?as.data.table`. #### Note that: * Row numbers are printed with a `:` in order to visually separate the row number from the first column. -* When the number of rows to print exceeds the global option `datatable.print.nrows` (default = `r getOption("datatable.print.nrows")`), it automatically prints only the top 5 and bottom 5 rows (as can be seen in the [Data](#data) section). If you've had a lot of experience with `data.frame`s, you may have found yourself waiting around while larger tables print-and-page, sometimes seemingly endlessly. You can query the default number like so: +* When the number of rows to print exceeds the global option `datatable.print.nrows` (default = `r getOption("datatable.print.nrows")`), it automatically prints only the top 5 and bottom 5 rows (as can be seen in the [Data](#data) section). For a large `data.frame`, you may have found yourself waiting around while larger tables print-and-page, sometimes seemingly endlessly. This restriction helps with that, and you can query the default number like so: ```{.r} getOption("datatable.print.nrows") @@ -101,7 +101,7 @@ You can also convert existing objects to a `data.table` using `setDT()` (for `da ### b) General form - in what way is a `data.table` *enhanced*? {#enhanced-1b} -In contrast to a `data.frame`, you can do *a lot more* than just subsetting rows and selecting columns within the frame of a `data.table`, i.e., within `[ ... ]` (NB: we might also refer to writing things inside `DT[...]` as "querying `DT`", in analogy to SQL). To understand it we will have to first look at the *general form* of `data.table` syntax, as shown below: +In contrast to a `data.frame`, you can do *a lot more* than just subsetting rows and selecting columns within the frame of a `data.table`, i.e., within `[ ... ]` (NB: we might also refer to writing things inside `DT[...]` as "querying `DT`", as an analogy or in relevance to SQL). To understand it we will have to first look at the *general form* of the `data.table` syntax, as shown below: ```{r eval = FALSE} DT[i, j, by] @@ -110,7 +110,7 @@ DT[i, j, by] ## SQL: where | order by select | update group by ``` -Users who have an SQL background might perhaps immediately relate to this syntax. +Users with an SQL background might perhaps immediately relate to this syntax. #### The way to read it (out loud) is: @@ -131,7 +131,7 @@ head(ans) * The *row indices* that satisfy the condition `origin == "JFK" & month == 6L` are computed, and since there is nothing else left to do, all columns from `flights` at rows corresponding to those *row indices* are simply returned as a `data.table`. -* A comma after the condition in `i` is not required. But `flights[origin == "JFK" & month == 6L, ]` would work just fine. In `data.frame`s, however, the comma is necessary. +* A comma after the condition in `i` is not required. But `flights[origin == "JFK" & month == 6L, ]` would work just fine. In a `data.frame`, however, the comma is necessary. #### -- Get the first two rows from `flights`. {#subset-rows-integer} @@ -153,9 +153,9 @@ head(ans) #### `order()` is internally optimised -* We can use "-" on a `character` columns within the frame of a `data.table` to sort in decreasing order. +* We can use "-" on `character` columns within the frame of a `data.table` to sort in decreasing order. -* In addition, `order(...)` within the frame of a `data.table` uses `data.table`'s internal fast radix order `forder()`. This sort provided such a compelling improvement over R's `base::order` that the R project adopted the `data.table` algorithm as its default sort in 2016 for R 3.3.0, see `?sort` and the [R Release NEWS](https://cran.r-project.org/doc/manuals/r-release/NEWS.pdf). +* In addition, `order(...)` within the frame of a `data.table` uses `data.table`'s internal fast radix order `forder()`. This sort provided such a compelling improvement over R's `base::order` that the R project adopted the `data.table` algorithm as its default sort in 2016 for R 3.3.0 (for reference, check `?sort` and the [R Release NEWS](https://cran.r-project.org/doc/manuals/r-release/NEWS.pdf)). We will discuss `data.table`'s fast order in more detail in the *`data.table` internals* vignette. @@ -168,7 +168,7 @@ ans <- flights[, arr_delay] head(ans) ``` -* Since columns can be referred to as if they are variables within the frame of `data.table`s, we directly refer to the *variable* we want to subset. Since we want *all the rows*, we simply skip `i`. +* Since columns can be referred to as if they are variables within the frame of a `data.table`, we directly refer to the *variable* we want to subset. Since we want *all the rows*, we simply skip `i`. * It returns *all* the rows for the column `arr_delay`. @@ -183,7 +183,7 @@ head(ans) * `data.table` also allows wrapping columns with `.()` instead of `list()`. It is an *alias* to `list()`; they both mean the same. Feel free to use whichever you prefer; we have noticed most users seem to prefer `.()` for conciseness, so we will continue to use `.()` hereafter. -`data.table`s (and `data.frame`s) are internally `list`s as well, with the stipulation that each element has the same length and the `list` has a `class` attribute. Allowing `j` to return a `list` enables converting and returning `data.table` very efficiently. +A `data.table` (and a `data.frame` too) is internally a `list` as well, with the stipulation that each element has the same length and the `list` has a `class` attribute. Allowing `j` to return a `list` enables converting and returning `data.table` very efficiently. #### Tip: {#tip-1} @@ -210,8 +210,6 @@ ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)] head(ans) ``` -That's it. - ### e) Compute or *do* in `j` #### -- How many trips have had total delay < 0? @@ -239,7 +237,7 @@ ans * Now, we look at `j` and find that it uses only *two columns*. And what we have to do is to compute their `mean()`. Therefore we subset just those columns corresponding to the matching rows, and compute their `mean()`. -Because the three main components of the query (`i`, `j` and `by`) are *together* inside `[...]`, `data.table` can see all three and optimise the query altogether *before evaluation*, not each separately. We are able to therefore avoid the entire subset (i.e., subsetting the columns _besides_ `arr_delay` and `dep_delay`), for both speed and memory efficiency. +Because the three main components of the query (`i`, `j` and `by`) are *together* inside `[...]`, `data.table` can see all three and optimise the query altogether *before evaluation*, rather than optimizing each separately. We are able to therefore avoid the entire subset (i.e., subsetting the columns _besides_ `arr_delay` and `dep_delay`), for both speed and memory efficiency. #### -- How many trips have been made in 2014 from "JFK" airport in the month of June? @@ -248,7 +246,7 @@ ans <- flights[origin == "JFK" & month == 6L, length(dest)] ans ``` -The function `length()` requires an input argument. We just needed to compute the number of rows in the subset. We could have used any other column as input argument to `length()` really. This approach is reminiscent of `SELECT COUNT(dest) FROM flights WHERE origin = 'JFK' AND month = 6` in SQL. +The function `length()` requires an input argument. We just need to compute the number of rows in the subset. We could have used any other column as the input argument to `length()`. This approach is reminiscent of `SELECT COUNT(dest) FROM flights WHERE origin = 'JFK' AND month = 6` in SQL. This type of operation occurs quite frequently, especially while grouping (as we will see in the next section), to the point where `data.table` provides a *special symbol* `.N` for it. @@ -256,7 +254,7 @@ This type of operation occurs quite frequently, especially while grouping (as we `.N` is a special built-in variable that holds the number of observations _in the current group_. It is particularly useful when combined with `by` as we'll see in the next section. In the absence of group by operations, it simply returns the number of rows in the subset. -So we can now accomplish the same task by using `.N` as follows: +Now that we now, we can now accomplish the same task by using `.N` as follows: ```{r} ans <- flights[origin == "JFK" & month == 6L, .N] @@ -273,7 +271,7 @@ We could have accomplished the same operation by doing `nrow(flights[origin == " ### g) Great! But how can I refer to columns by names in `j` (like in a `data.frame`)? {#refer_j} -If you're writing out the column names explicitly, there's no difference vis-a-vis `data.frame` (since v1.9.8). +If you're writing out the column names explicitly, there's no difference compared to a `data.frame` (since v1.9.8). #### -- Select both `arr_delay` and `dep_delay` columns the `data.frame` way. @@ -291,7 +289,7 @@ select_cols = c("arr_delay", "dep_delay") flights[ , ..select_cols] ``` -For those familiar with the Unix terminal, the `..` prefix should be reminiscent of the "up-one-level" command, which is analogous to what's happening here -- the `..` signals to `data.table` to look for the `select_cols` variable "up-one-level", i.e., in the global environment in this case. +For those familiar with the Unix terminal, the `..` prefix should be reminiscent of the "up-one-level" command, which is analogous to what's happening here -- the `..` signals to `data.table` to look for the `select_cols` variable "up-one-level", i.e., within the global environment in this case. #### -- Select columns named in a variable using `with = FALSE` @@ -362,11 +360,11 @@ ans * We know `.N` [is a special variable](#special-N) that holds the number of rows in the current group. Grouping by `origin` obtains the number of rows, `.N`, for each group. -* By doing `head(flights)` you can see that the origin airports occur in the order *"JFK"*, *"LGA"* and *"EWR"*. The original order of grouping variables is preserved in the result. _This is important to keep in mind!_ +* By doing `head(flights)` you can see that the origin airports occur in the order *"JFK"*, *"LGA"*, and *"EWR"*. The original order of grouping variables is preserved in the result. _This is important to keep in mind!_ -* Since we did not provide a name for the column returned in `j`, it was named `N` automatically by recognising the special symbol `.N`. +* Since we did not provide a name for the column returned in `j`, it was named `N` automatically by recognising the special symbol `.N`. -* `by` also accepts a character vector of column names. This is particularly useful for coding programmatically, e.g., designing a function with the grouping columns as a (`character` vector) function argument. +* `by` also accepts a character vector of column names. This is particularly useful for coding programmatically. For e.g., designing a function with the grouping columns (in the form of a `character` vector) as a function argument. * When there's only one column or expression to refer to in `j` and `by`, we can drop the `.()` notation. This is purely for convenience. We could instead do: @@ -430,7 +428,7 @@ ans <- flights[carrier == "AA", ans ``` -* All we did was to change `by` to `keyby`. This automatically orders the result by the grouping variables in increasing order. In fact, due to the internal implementation of `by` first requiring a sort before recovering the original table's order, `keyby` is typically faster than `by` because it doesn't require this second step. +* All we did was change `by` to `keyby`. This automatically orders the result by the grouping variables in increasing order. In fact, due to the internal implementation of `by` first requiring a sort before recovering the original table's order, `keyby` is typically faster than `by` because it doesn't require this second step. **Keys:** Actually `keyby` does a little more than *just ordering*. It also *sets a key* after ordering by setting an `attribute` called `sorted`. @@ -453,7 +451,7 @@ ans <- ans[order(origin, -dest)] head(ans) ``` -* Recall that we can use `-` on a `character` column in `order()` within the frame of a `data.table`. This is possible to due `data.table`'s internal query optimisation. +* Recall that we can use `-` on a `character` column in `order()` within the frame of a `data.table`. This is possible due to `data.table`'s internal query optimisation. * Also recall that `order(...)` with the frame of a `data.table` is *automatically optimised* to use `data.table`'s internal fast radix order `forder()` for speed. @@ -488,7 +486,7 @@ ans * The last row corresponds to `dep_delay > 0 = TRUE` and `arr_delay > 0 = FALSE`. We can see that `r flights[!is.na(arr_delay) & !is.na(dep_delay), .N, .(dep_delay>0, arr_delay>0)][, N[4L]]` flights started late but arrived early (or on time). -* Note that we did not provide any names to `by-expression`. Therefore, names have been automatically assigned in the result. As with `j`, you can name these expressions as you would elements of any `list`, e.g. `DT[, .N, .(dep_delayed = dep_delay>0, arr_delayed = arr_delay>0)]`. +* Note that we did not provide any names to `by-expression`. Therefore, names have been automatically assigned in the result. As with `j`, you can name these expressions as you would for elements of any `list`, like for e.g. `DT[, .N, .(dep_delayed = dep_delay>0, arr_delayed = arr_delay>0)]`. * You can provide other columns along with expressions, for example: `DT[, .N, by = .(a, b>0)]`. @@ -498,11 +496,11 @@ ans It is of course not practical to have to type `mean(myCol)` for every column one by one. What if you had 100 columns to average `mean()`? -How can we do this efficiently, concisely? To get there, refresh on [this tip](#tip-1) - *"As long as the `j`-expression returns a `list`, each element of the `list` will be converted to a column in the resulting `data.table`"*. Suppose we can refer to the *data subset for each group* as a variable *while grouping*, then we can loop through all the columns of that variable using the already- or soon-to-be-familiar base function `lapply()`. No new names to learn specific to `data.table`. +How can we do this efficiently and concisely? To get there, refresh on [this tip](#tip-1) - *"As long as the `j`-expression returns a `list`, each element of the `list` will be converted to a column in the resulting `data.table`"*. If we can refer to the *data subset for each group* as a variable *while grouping*, we can then loop through all the columns of that variable using the already- or soon-to-be-familiar base function `lapply()`. No new names to learn specific to `data.table`. #### Special symbol `.SD`: {#special-SD} -`data.table` provides a *special* symbol, called `.SD`. It stands for **S**ubset of **D**ata. It by itself is a `data.table` that holds the data for *the current group* defined using `by`. +`data.table` provides a *special* symbol called `.SD`. It stands for **S**ubset of **D**ata. It by itself is a `data.table` that holds the data for *the current group* defined using `by`. Recall that a `data.table` is internally a `list` as well with all its columns of equal length. @@ -530,7 +528,7 @@ DT[, lapply(.SD, mean), by = ID] * Since `lapply()` returns a `list`, so there is no need to wrap it with an additional `.()` (if necessary, refer to [this tip](#tip-1)). -We are almost there. There is one little thing left to address. In our `flights` `data.table`, we only wanted to calculate the `mean()` of two columns `arr_delay` and `dep_delay`. But `.SD` would contain all the columns other than the grouping variables by default. +We are almost there. There is one little thing left to address. In our `flights` `data.table`, we only wanted to calculate the `mean()` of the two columns `arr_delay` and `dep_delay`. But `.SD` would contain all the columns other than the grouping variables by default. #### -- How can we specify just the columns we would like to compute the `mean()` on? @@ -538,7 +536,7 @@ We are almost there. There is one little thing left to address. In our `flights` Using the argument `.SDcols`. It accepts either column names or column indices. For example, `.SDcols = c("arr_delay", "dep_delay")` ensures that `.SD` contains only these two columns for each group. -Similar to [part g)](#refer_j), you can also provide the columns to remove instead of columns to keep using `-` or `!` sign as well as select consecutive columns as `colA:colB` and deselect consecutive columns as `!(colA:colB)` or `-(colA:colB)`. +Similar to [part g)](#refer_j), you can also specify the columns to remove instead of columns to keep using `-` or `!`. Additionally, you can select consecutive columns as `colA:colB` and deselect them as `!(colA:colB)` or `-(colA:colB)`. Now let us try to use `.SD` along with `.SDcols` to get the `mean()` of `arr_delay` and `dep_delay` columns grouped by `origin`, `dest` and `month`. @@ -564,7 +562,7 @@ head(ans) ### g) Why keep `j` so flexible? -So that we have a consistent syntax and keep using already existing (and familiar) base functions instead of learning new functions. To illustrate, let us use the `data.table` `DT` that we created at the very beginning under [What is a data.table?](#what-is-datatable-1a) section. +So that we have a consistent syntax and keep using already existing (and familiar) base functions instead of learning new functions. To illustrate, let us use the `data.table` `DT` that we created at the very beginning under the section [What is a data.table?](#what-is-datatable-1a). #### -- How can we concatenate columns `a` and `b` for each group in `ID`? @@ -582,18 +580,18 @@ DT[, .(val = list(c(a,b))), by = ID] * Here, we first concatenate the values with `c(a,b)` for each group, and wrap that with `list()`. So for each group, we return a list of all concatenated values. -* Note those commas are for display only. A list column can contain any object in each cell, and in this example, each cell is itself a vector and some cells contain longer vectors than others. +* Note that those commas are for display only. A list column can contain any object in each cell, and in this example, each cell is itself a vector and some cells contain longer vectors than others. Once you start internalising usage in `j`, you will realise how powerful the syntax can be. A very useful way to understand it is by playing around, with the help of `print()`. For example: ```{r} -## (1) look at the difference between -DT[, print(c(a,b)), by = ID] +## look at the difference between +DT[, print(c(a,b)), by = ID] # (1) -## (2) and -DT[, print(list(c(a,b))), by = ID] +## and +DT[, print(list(c(a,b))), by = ID] # (2) ``` In (1), for each group, a vector is returned, with length = 6,4,2 here. However (2) returns a list of length 1 for each group, with its first element holding vectors of length 6,4,2. Therefore (1) results in a length of ` 6+4+2 = `r 6+4+2``, whereas (2) returns `1+1+1=`r 1+1+1``. @@ -612,9 +610,9 @@ We have seen so far that, * We can subset rows similar to a `data.frame`- except you don't have to use `DT$` repetitively since columns within the frame of a `data.table` are seen as if they are *variables*. -* We can also sort a `data.table` using `order()`, which internally uses `data.table`'s fast order for performance. +* We can also sort a `data.table` using `order()`, which internally uses data.table's fast order for better performance. -We can do much more in `i` by keying a `data.table`, which allows blazing fast subsets and joins. We will see this in the *"Keys and fast binary search based subsets"* and *"Joins and rolling joins"* vignette. +We can do much more in `i` by keying a `data.table`, which allows for blazing fast subsets and joins. We will see this in the *"Keys and fast binary search based subsets"* and *"Joins and rolling joins"* vignette. #### Using `j`: @@ -630,7 +628,7 @@ We can do much more in `i` by keying a `data.table`, which allows blazing fast s #### Using `by`: -* Using `by`, we can group by columns by specifying a *list of columns* or a *character vector of column names* or even *expressions*. The flexibility of `j`, combined with `by` and `i` makes for a very powerful syntax. +* Using `by`, we can group by columns by specifying a *list of columns* or a *character vector of column names* or even *expressions*. The flexibility of `j`, combined with `by` and `i`, makes for a very powerful syntax. * `by` can handle multiple columns and also *expressions*. From 68f0e416a2d40385cf2778e56c7dcbcc16cddafc Mon Sep 17 00:00:00 2001 From: David Budzynski <56514985+davidbudzynski@users.noreply.github.com> Date: Sat, 6 Jan 2024 14:30:35 +0000 Subject: [PATCH 04/21] Vignette typo patch (#5402) * fix typos and grammatical mistakes * fix typos and punctuation * remove double spaces where it wasn't necessary * fix typos and adhere to British English spelling * fix typos * fix typos * add missing closing bracket * fix typos * review fixes * Update vignettes/datatable-benchmarking.Rmd Co-authored-by: Michael Chirico * Update vignettes/datatable-benchmarking.Rmd Co-authored-by: Michael Chirico * Apply suggestions from code review benchmarking Co-authored-by: Michael Chirico * remove unnecessary [ ] from datatable-keys-fast-subset.Rmd * Update vignettes/datatable-programming.Rmd Co-authored-by: Michael Chirico * Update vignettes/datatable-reshape.Rmd Co-authored-by: Michael Chirico * One last batch of fine-tuning --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- vignettes/datatable-benchmarking.Rmd | 20 +++++++++---------- vignettes/datatable-importing.Rmd | 4 ++-- vignettes/datatable-intro.Rmd | 16 +++++++-------- vignettes/datatable-keys-fast-subset.Rmd | 8 ++++---- vignettes/datatable-programming.Rmd | 4 ++-- vignettes/datatable-reshape.Rmd | 4 ++-- vignettes/datatable-sd-usage.Rmd | 2 +- ...le-secondary-indices-and-auto-indexing.Rmd | 6 +++--- 8 files changed, 32 insertions(+), 32 deletions(-) diff --git a/vignettes/datatable-benchmarking.Rmd b/vignettes/datatable-benchmarking.Rmd index da580764b..66788d46d 100644 --- a/vignettes/datatable-benchmarking.Rmd +++ b/vignettes/datatable-benchmarking.Rmd @@ -33,7 +33,7 @@ sudo lshw -class disk sudo hdparm -t /dev/sda ``` -When comparing `fread` to non-R solutions be aware that R requires values of character columns to be added to _R's global string cache_. This takes time when reading data but later operations benefit since the character strings have already been cached. Consequently as well timing isolated tasks (such as `fread` alone), it's a good idea to benchmark a pipeline of tasks such as reading data, computing operators and producing final output and report the total time of the pipeline. +When comparing `fread` to non-R solutions be aware that R requires values of character columns to be added to _R's global string cache_. This takes time when reading data but later operations benefit since the character strings have already been cached. Consequently, in addition to timing isolated tasks (such as `fread` alone), it's a good idea to benchmark the total time of an end-to-end pipeline of tasks such as reading data, manipulating it, and producing final output. # subset: threshold for index optimization on compound queries @@ -68,7 +68,7 @@ options(datatable.auto.index=TRUE) options(datatable.use.index=TRUE) ``` -- `use.index=FALSE` will force query not to use indices even if they exists, but existing keys are still used for optimization. +- `use.index=FALSE` will force the query not to use indices even if they exist, but existing keys are still used for optimization. - `auto.index=FALSE` disables building index automatically when doing subset on non-indexed data, but if indices were created before this option was set, or explicitly by calling `setindex` they still will be used for optimization. Two other options control optimization globally, including use of indices: @@ -77,27 +77,27 @@ options(datatable.optimize=2L) options(datatable.optimize=3L) ``` `options(datatable.optimize=2L)` will turn off optimization of subsets completely, while `options(datatable.optimize=3L)` will switch it back on. -Those options affects much more optimizations thus should not be used when only control of index is needed. Read more in `?datatable.optimize`. +Those options affect many more optimizations and thus should not be used when only control of indices is needed. Read more in `?datatable.optimize`. # _by reference_ operations -When benchmarking `set*` functions it make sense to measure only first run. Those functions updates data.table by reference thus in subsequent runs they get already processed `data.table` on input. +When benchmarking `set*` functions it only makes sense to measure the first run. These functions update their input by reference, so subsequent runs will use the already-processed `data.table`, biasing the results. Protecting your `data.table` from being updated by reference operations can be achieved using `copy` or `data.table:::shallow` functions. Be aware `copy` might be very expensive as it needs to duplicate whole object. It is unlikely we want to include duplication time in time of the actual task we are benchmarking. # try to benchmark atomic processes If your benchmark is meant to be published it will be much more insightful if you will split it to measure time of atomic processes. This way your readers can see how much time was spent on reading data from source, cleaning, actual transformation, exporting results. -Of course if your benchmark is meant to present _full workflow_ then it perfectly make sense to present total timing, still splitting timings might give good insight into bottlenecks in such workflow. -There are another cases when it might not be desired, for example when benchmarking _reading csv_, followed by _grouping_. R requires to populate _R's global string cache_ which adds extra overhead when importing character data to R session. On the other hand _global string cache_ might speed up processes like _grouping_. In such cases when comparing R to other languages it might be useful to include total timing. +Of course if your benchmark is meant to present to present an _end-to-end workflow_, then it makes perfect sense to present the overall timing. Nevertheless, separating out timing of individual steps is useful for understanding which steps are the main bottlenecks of a workflow. +There are other cases when atomic benchmarking might not be desirable, for example when _reading a csv_, followed by _grouping_. R requires populating _R's global string cache_ which adds extra overhead when importing character data to an R session. On the other hand, the _global string cache_ might speed up processes like _grouping_. In such cases when comparing R to other languages it might be useful to include total timing. # avoid class coercion -Unless this is what you truly want to measure you should prepare input objects for every tools you are benchmarking in expected class. +Unless this is what you truly want to measure you should prepare input objects of the expected class for every tool you are benchmarking. # avoid `microbenchmark(..., times=100)` -Repeating benchmarking many times usually does not fit well for data processing tools. Of course it perfectly make sense for more atomic calculations. It does not well represent use case for common data processing tasks, which rather consists of batches sequentially provided transformations, each run once. +Repeating a benchmark many times usually does not give the clearest picture for data processing tools. Of course, it makes perfect sense for more atomic calculations, but this is not a good representation of the most common way these tools will actually be used, namely for data processing tasks, which consist of batches of sequentially provided transformations, each run once. Matt once said: > I'm very wary of benchmarks measured in anything under 1 second. Much prefer 10 seconds or more for a single run, achieved by increasing data size. A repetition count of 500 is setting off alarm bells. 3-5 runs should be enough to convince on larger data. Call overhead and time to GC affect inferences at this very small scale. @@ -106,8 +106,8 @@ This is very valid. The smaller time measurement is the relatively bigger noise # multithreaded processing -One of the main factor that is likely to impact timings is number of threads in your machine. In recent versions of `data.table` some of the functions has been parallelized. -You can control how much threads you want to use with `setDTthreads`. +One of the main factors that is likely to impact timings is the number of threads available to your R session. In recent versions of `data.table`, some functions are parallelized. +You can control the number of threads you want to use with `setDTthreads`. ```r setDTthreads(0) # use all available cores (default) diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index c37cd6f75..f1d1e9257 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -21,11 +21,11 @@ Importing `data.table` is no different from importing other R packages. This vig ## Why to import `data.table` -One of the biggest features of `data.table` is its concise syntax which makes exploratory analysis faster and easier to write and perceive; this convenience can drive packages authors to use `data.table` in their own packages. Another maybe even more important reason is high performance. When outsourcing heavy computing tasks from your package to `data.table`, you usually get top performance without needing to re-invent any high of these numerical optimization tricks on your own. +One of the biggest features of `data.table` is its concise syntax which makes exploratory analysis faster and easier to write and perceive; this convenience can drive package authors to use `data.table`. Another, perhaps more important reason is high performance. When outsourcing heavy computing tasks from your package to `data.table`, you usually get top performance without needing to re-invent any of these numerical optimization tricks on your own. ## Importing `data.table` is easy -It is very easy to use `data.table` as a dependency due to the fact that `data.table` does not have any of its own dependencies. This statement is valid for both operating system dependencies and R dependencies. It means that if you have R installed on your machine, it already has everything needed to install `data.table`. This also means that adding `data.table` as a dependency of your package will not result in a chain of other recursive dependencies to install, making it very convenient for offline installation. +It is very easy to use `data.table` as a dependency due to the fact that `data.table` does not have any of its own dependencies. This applies both to operating system and to R dependencies. It means that if you have R installed on your machine, it already has everything needed to install `data.table`. It also means that adding `data.table` as a dependency of your package will not result in a chain of other recursive dependencies to install, making it very convenient for offline installation. ## `DESCRIPTION` file {DESCRIPTION} diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index 81d4ca131..c783c3fa7 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -39,7 +39,7 @@ Briefly, if you are interested in reducing *programming* and *compute* time trem ## Data {#data} -In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained by [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transporation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/tidyverse/nycflights13)). The data is available only for Jan-Oct'14. +In this vignette, we will use [NYC-flights14](https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) data obtained from the [flights](https://github.com/arunsrinivasan/flights) package (available on GitHub only). It contains On-Time flights data from the Bureau of Transportation Statistics for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/tidyverse/nycflights13)). The data is available only for Jan-Oct'14. We can use `data.table`'s fast-and-friendly file reader `fread` to load `flights` directly as follows: @@ -179,7 +179,7 @@ ans <- flights[, list(arr_delay)] head(ans) ``` -* We wrap the *variables* (column names) within `list()`, which ensures that a `data.table` is returned. In case of a single column name, not wrapping with `list()` returns a vector instead, as seen in the [previous example](#select-j-1d). +* We wrap the *variables* (column names) within `list()`, which ensures that a `data.table` is returned. In the case of a single column name, not wrapping with `list()` returns a vector instead, as seen in the [previous example](#select-j-1d). * `data.table` also allows wrapping columns with `.()` instead of `list()`. It is an *alias* to `list()`; they both mean the same. Feel free to use whichever you prefer; we have noticed most users seem to prefer `.()` for conciseness, so we will continue to use `.()` hereafter. @@ -235,7 +235,7 @@ ans * We first subset in `i` to find matching *row indices* where `origin` airport equals `"JFK"`, and `month` equals `6L`. We *do not* subset the _entire_ `data.table` corresponding to those rows _yet_. -* Now, we look at `j` and find that it uses only *two columns*. And what we have to do is to compute their `mean()`. Therefore we subset just those columns corresponding to the matching rows, and compute their `mean()`. +* Now, we look at `j` and find that it uses only *two columns*. And what we have to do is to compute their `mean()`. Therefore, we subset just those columns corresponding to the matching rows, and compute their `mean()`. Because the three main components of the query (`i`, `j` and `by`) are *together* inside `[...]`, `data.table` can see all three and optimise the query altogether *before evaluation*, rather than optimizing each separately. We are able to therefore avoid the entire subset (i.e., subsetting the columns _besides_ `arr_delay` and `dep_delay`), for both speed and memory efficiency. @@ -263,9 +263,9 @@ ans * Once again, we subset in `i` to get the *row indices* where `origin` airport equals *"JFK"*, and `month` equals *6*. -* We see that `j` uses only `.N` and no other columns. Therefore the entire subset is not materialised. We simply return the number of rows in the subset (which is just the length of row indices). +* We see that `j` uses only `.N` and no other columns. Therefore, the entire subset is not materialised. We simply return the number of rows in the subset (which is just the length of row indices). -* Note that we did not wrap `.N` with `list()` or `.()`. Therefore a vector is returned. +* Note that we did not wrap `.N` with `list()` or `.()`. Therefore, a vector is returned. We could have accomplished the same operation by doing `nrow(flights[origin == "JFK" & month == 6L])`. However, it would have to subset the entire `data.table` first corresponding to the *row indices* in `i` *and then* return the rows using `nrow()`, which is unnecessary and inefficient. We will cover this and other optimisation aspects in detail under the *`data.table` design* vignette. @@ -311,7 +311,7 @@ DF[with(DF, x > 1), ] * Using `with()` in (2) allows using `DF`'s column `x` as if it were a variable. - Hence the argument name `with` in `data.table`. Setting `with = FALSE` disables the ability to refer to columns as if they are variables, thereby restoring the "`data.frame` mode". + Hence, the argument name `with` in `data.table`. Setting `with = FALSE` disables the ability to refer to columns as if they are variables, thereby restoring the "`data.frame` mode". * We can also *deselect* columns using `-` or `!`. For example: @@ -364,7 +364,7 @@ ans * Since we did not provide a name for the column returned in `j`, it was named `N` automatically by recognising the special symbol `.N`. -* `by` also accepts a character vector of column names. This is particularly useful for coding programmatically. For e.g., designing a function with the grouping columns (in the form of a `character` vector) as a function argument. +* `by` also accepts a character vector of column names. This is particularly useful for coding programmatically, e.g., designing a function with the grouping columns (in the form of a `character` vector) as a function argument. * When there's only one column or expression to refer to in `j` and `by`, we can drop the `.()` notation. This is purely for convenience. We could instead do: @@ -594,7 +594,7 @@ DT[, print(c(a,b)), by = ID] # (1) DT[, print(list(c(a,b))), by = ID] # (2) ``` -In (1), for each group, a vector is returned, with length = 6,4,2 here. However (2) returns a list of length 1 for each group, with its first element holding vectors of length 6,4,2. Therefore (1) results in a length of ` 6+4+2 = `r 6+4+2``, whereas (2) returns `1+1+1=`r 1+1+1``. +In (1), for each group, a vector is returned, with length = 6,4,2 here. However, (2) returns a list of length 1 for each group, with its first element holding vectors of length 6,4,2. Therefore, (1) results in a length of ` 6+4+2 = `r 6+4+2``, whereas (2) returns `1+1+1=`r 1+1+1``. ## Summary diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index e73b71b92..3ec50640c 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -20,7 +20,7 @@ knitr::opts_chunk$set( .old.th = setDTthreads(1) ``` -This vignette is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, add/modify/delete columns *by reference* in `j` and group by using `by`. If you're not familiar with these concepts, please read the *"Introduction to data.table"* and *"Reference semantics"* vignettes first. +This vignette is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, add/modify/delete columns *by reference* in `j` and group by using `by`. If you're not familiar with these concepts, please read the *"Introduction to data.table"* and *"Reference semantics"* vignettes first. *** @@ -147,7 +147,7 @@ head(flights) #### set* and `:=`: -In *data.table*, the `:=` operator and all the `set*` (e.g., `setkey`, `setorder`, `setnames` etc..) functions are the only ones which modify the input object *by reference*. +In *data.table*, the `:=` operator and all the `set*` (e.g., `setkey`, `setorder`, `setnames` etc.) functions are the only ones which modify the input object *by reference*. Once you *key* a *data.table* by certain columns, you can subset by querying those key columns using the `.()` notation in `i`. Recall that `.()` is an *alias to* `list()`. @@ -239,7 +239,7 @@ flights[.(unique(origin), "MIA")] #### What's happening here? -* Read [this](#multiple-key-point) again. The value provided for the second key column *"MIA"* has to find the matching values in `dest` key column *on the matching rows provided by the first key column `origin`*. We can not skip the values of key columns *before*. Therefore we provide *all* unique values from key column `origin`. +* Read [this](#multiple-key-point) again. The value provided for the second key column *"MIA"* has to find the matching values in `dest` key column *on the matching rows provided by the first key column `origin`*. We can not skip the values of key columns *before*. Therefore, we provide *all* unique values from key column `origin`. * *"MIA"* is automatically recycled to fit the length of `unique(origin)` which is *3*. @@ -308,7 +308,7 @@ key(flights) * And on those row indices, we replace the `key` column with the value `0`. -* Since we have replaced values on the *key* column, the *data.table* `flights` isn't sorted by `hour` any more. Therefore, the key has been automatically removed by setting to NULL. +* Since we have replaced values on the *key* column, the *data.table* `flights` isn't sorted by `hour` anymore. Therefore, the key has been automatically removed by setting to NULL. Now, there shouldn't be any *24* in the `hour` column. diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 4e8a99879..0536f11d6 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -88,7 +88,7 @@ my_subset = function(data, col, val) { my_subset(iris, Species, "setosa") ``` -We have to use `deparse(substitute(...))` to catch the actual names of objects passed to function so we can construct the `subset` function call using those original names. Although ths provides unlimited flexibility with relatively low complexity, **use of `eval(parse(...))` should be avoided**. The main reasons are: +We have to use `deparse(substitute(...))` to catch the actual names of objects passed to function, so we can construct the `subset` function call using those original names. Although this provides unlimited flexibility with relatively low complexity, **use of `eval(parse(...))` should be avoided**. The main reasons are: - lack of syntax validation - [vulnerability to code injection](https://github.com/Rdatatable/data.table/issues/2655#issuecomment-376781159) @@ -151,7 +151,7 @@ substitute2( ) ``` -We can see in the output that both the functions names, as well as the names of the variables passed to those functions, have been replaced.. We used `substitute2` for convenience. In this simple case, base R's `substitute` could have been used as well, though it would've required usage of `lapply(env, as.name)`. +We can see in the output that both the functions names, as well as the names of the variables passed to those functions, have been replaced. We used `substitute2` for convenience. In this simple case, base R's `substitute` could have been used as well, though it would've required usage of `lapply(env, as.name)`. Now, to use substitution inside `[.data.table`, we don't need to call the `substitute2` function. As it is now being used internally, all we have to do is to provide `env` argument, the same way as we've provided it to the `substitute2` function in the example above. Substitution can be applied to the `i`, `j` and `by` (or `keyby`) arguments of the `[.data.table` method. Note that setting the `verbose` argument to `TRUE` can be used to print expressions after substitution is applied. This is very useful for debugging. diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd index d282bc7de..d05cdb10c 100644 --- a/vignettes/datatable-reshape.Rmd +++ b/vignettes/datatable-reshape.Rmd @@ -133,7 +133,7 @@ Check `?dcast` for other useful arguments and additional examples. ## 2. Limitations in current `melt/dcast` approaches -So far we've seen features of `melt` and `dcast` that are implemented efficiently for `data.table`s, using internal `data.table` machinery (*fast radix ordering*, *binary search* etc..). +So far we've seen features of `melt` and `dcast` that are implemented efficiently for `data.table`s, using internal `data.table` machinery (*fast radix ordering*, *binary search* etc.). However, there are situations we might run into where the desired operation is not expressed in a straightforward manner. For example, consider the `data.table` shown below: @@ -162,7 +162,7 @@ str(DT.c1) ## gender column is character type now! #### Issues -1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. Instead we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient). +1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. Instead, we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient). As an analogy, imagine you've a closet with four shelves of clothes and you'd like to put together the clothes from shelves 1 and 2 together (in 1), and 3 and 4 together (in 3). What we are doing is more or less to combine all the clothes together, and then split them back on to shelves 1 and 3! diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index ae0b5a84a..5f0348e4f 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -197,7 +197,7 @@ Pitching[rank_in_team == 1, team_performance := Teams[.SD, Rank, on = c('teamID', 'yearID')]] ``` -Note that the `x[y]` syntax returns `nrow(y)` values (i.e., it's a right join), which is why `.SD` is on the right in `Teams[.SD]` (since the RHS of `:=` in this case requires `nrow(Pitching[rank_in_team == 1])` values. +Note that the `x[y]` syntax returns `nrow(y)` values (i.e., it's a right join), which is why `.SD` is on the right in `Teams[.SD]` (since the RHS of `:=` in this case requires `nrow(Pitching[rank_in_team == 1])` values). # Grouped `.SD` operations diff --git a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd index ff50ba97e..7d631fe93 100644 --- a/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd +++ b/vignettes/datatable-secondary-indices-and-auto-indexing.Rmd @@ -114,7 +114,7 @@ b) reordering the entire data.table, by reference, based on the order vector com # -Computing the order isn't the time consuming part, since data.table uses true radix sorting on integer, character and numeric vectors. However reordering the data.table could be time consuming (depending on the number of rows and columns). +Computing the order isn't the time consuming part, since data.table uses true radix sorting on integer, character and numeric vectors. However, reordering the data.table could be time consuming (depending on the number of rows and columns). Unless our task involves repeated subsetting on the same column, fast key based subsetting could effectively be nullified by the time to reorder, depending on our data.table dimensions. @@ -148,7 +148,7 @@ As we will see in the next section, the `on` argument provides several advantage * allows for a cleaner syntax by having the columns on which the subset is performed as part of the syntax. This makes the code easier to follow when looking at it at a later point. - Note that `on` argument can also be used on keyed subsets as well. In fact, we encourage to provide the `on` argument even when subsetting using keys for better readability. + Note that `on` argument can also be used on keyed subsets as well. In fact, we encourage providing the `on` argument even when subsetting using keys for better readability. # @@ -277,7 +277,7 @@ flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", on = c("origin", "dest" ## 3. Auto indexing -First we looked at how to fast subset using binary search using *keys*. Then we figured out that we could improve performance even further and have more cleaner syntax by using secondary indices. +First we looked at how to fast subset using binary search using *keys*. Then we figured out that we could improve performance even further and have cleaner syntax by using secondary indices. That is what *auto indexing* does. At the moment, it is only implemented for binary operators `==` and `%in%`. An index is automatically created *and* saved as an attribute. That is, unlike the `on` argument which computes the index on the fly each time (unless one already exists), a secondary index is created here. From 7e1a95068d1643bbe921a3765ce6992b6db336b4 Mon Sep 17 00:00:00 2001 From: Scott Ritchie Date: Sun, 7 Jan 2024 23:58:36 +0000 Subject: [PATCH 05/21] Improved handling of list columns with NULL entries (#4250) * Updated documentation for rbindlist(fill=TRUE) * Print NULL entries of list as NULL * Added news item * edit NEWS, use '[NULL]' not 'NULL' * fix test * split NEWS item * add example --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger --- NEWS.md | 14 ++++++++++++++ R/print.data.table.R | 2 +- inst/tests/tests.Rraw | 2 +- man/rbindlist.Rd | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3ef946575..7b7f45f63 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,10 +2,24 @@ # data.table [v1.15.99]() (in development) +## NEW FEATURES + +1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. + + ```R + data.table(a=list(NULL, "")) + # a + # + # 1: [NULL] + # 2: + ``` + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. +2. The documentation for the `fill` argument in `rbind()` and `rbindlist()` now notes the expected behaviour for missing `list` columns when `fill=TRUE`, namely to use `NULL` (not `NA`), [#4198](https://github.com/Rdatatable/data.table/pull/4198). Thanks @sritchie73 for the proposal and fix. + # data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) ## BREAKING CHANGE diff --git a/R/print.data.table.R b/R/print.data.table.R index 7271ac458..6588ca458 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -215,7 +215,7 @@ format_col.expression = function(x, ...) format(char.trunc(as.character(x)), ... format_list_item.default = function(x, ...) { if (is.null(x)) # NULL item in a list column - "" + "[NULL]" # not '' or 'NULL' to distinguish from those "common" string values in data else if (is.atomic(x) || inherits(x, "formula")) # FR #2591 - format.data.table issue with columns of class "formula" paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... else if (has_format_method(x) && length(formatted<-format(x, ...))==1L) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 935d63825..b0a6367d2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -320,7 +320,7 @@ test(69.4, names(tables(silent=TRUE, mb=FALSE, index=TRUE)), xenv = new.env() # to control testing tables() xenv$DT = data.table(a = 1) -test(69.5, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 1 1 0 a.*Total: 0MB") +test(69.5, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 1 1 0 a [NULL]\nTotal: 0MB") xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key="A,D,F,G") test(69.6, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 2 8 0 A,B,C,D,E,F,... A,D,F,G.*Total: 0MB") rm(xenv) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 2ba39a2a9..ffc79c61a 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -13,7 +13,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) \arguments{ \item{l}{ A list containing \code{data.table}, \code{data.frame} or \code{list} objects. \code{\dots} is the same but you pass the objects by name separately. } \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} - \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}.} + \item{fill}{\code{TRUE} fills missing columns with NAs, or NULL for missing list columns. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} } \details{ From d9d17a7c4e16c44c45014c24899958cfa2f60cbc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 8 Jan 2024 14:03:41 +0800 Subject: [PATCH 06/21] clarify that list input->unnamed list output (#5383) * clarify that list input->unnamed list output * Add example where make.names is used * mention role of make.names --- man/transpose.Rd | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/man/transpose.Rd b/man/transpose.Rd index 547688fd6..61a2d1dd1 100644 --- a/man/transpose.Rd +++ b/man/transpose.Rd @@ -2,7 +2,7 @@ \alias{transpose} \title{Efficient transpose of list} \description{ - \code{transpose} is an efficient way to transpose \code{lists}, \code{data frames} or \code{data tables}. + \code{transpose} is an efficient way to transpose \code{lists}, \code{data.frames} or \code{data.tables}. } \usage{ @@ -26,6 +26,8 @@ transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL) } \value{ A transposed \code{list}, \code{data.frame} or \code{data.table}. + + \code{list} outputs will only be named according to \code{make.names}. } \examples{ @@ -35,6 +37,9 @@ setDT(transpose(ll, fill=0))[] DT = data.table(x=1:5, y=6:10) transpose(DT) + +ll = list(nm=c('x', 'y'), 1:2, 3:4) +transpose(ll, make.names="nm") } \seealso{ \code{\link{data.table}}, \code{\link{tstrsplit}} From da24f85dbdf448eaa4e72251b30f03fe33678d30 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Jan 2024 00:34:28 +0800 Subject: [PATCH 07/21] fix subsetting issue in split.data.table (#5368) * fix subsetting issue in split.data.table * add a test * drop=FALSE on inner [ --- NEWS.md | 2 ++ R/data.table.R | 3 ++- inst/tests/other.Rraw | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7b7f45f63..6c1b89831 100644 --- a/NEWS.md +++ b/NEWS.md @@ -580,6 +580,8 @@ 55. `fread(URL)` with `https:` and `ftps:` could timeout if proxy settings were not guessed right by `curl::curl_download`, [#1686](https://github.com/Rdatatable/data.table/issues/1686). `fread(URL)` now uses `download.file()` as default for downloading files from urls. Thanks to @cderv for the report and Benjamin Schwendinger for the fix. +56. `split.data.table()` works for downstream methods that don't implement `DT[i]` form (i.e., requiring `DT[i, j]` form, like plain `data.frame`s), for example `sf`'s `[.sf`, [#5365](https://github.com/Rdatatable/data.table/issues/5365). Thanks @barryrowlingson for the report and @michaelchirico for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/data.table.R b/R/data.table.R index d66773e3a..9d62702aa 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2454,7 +2454,8 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR if (!missing(by)) stopf("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor") # same as split.data.frame - handling all exceptions, factor orders etc, in a single stream of processing was a nightmare in factor and drop consistency - return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind])) + # be sure to use x[ind, , drop = FALSE], not x[ind], in case downstream methods don't follow the same subsetting semantics (#5365) + return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind, , drop = FALSE])) } if (missing(by)) stopf("Either 'by' or 'f' argument must be supplied") # check reserved column names during processing diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 807a67c19..88593bcdf 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -211,6 +211,9 @@ test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE}) if (loaded[["sf"]]) { #2273 DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"))) test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4") + + dsf = sf::st_as_sf(data.table(x=1:10, y=1:10, s=sample(1:2, 10, TRUE)), coords=1:2) + test(16, split(dsf, dsf$s), list(`1` = dsf[dsf$s == 1, ], `2` = dsf[dsf$s == 2, ])) } if (loaded[["yaml"]]) { # csvy; #1701. Was 2032-2033 in tests.Rraw, #5516 From 58608a25bd1a93ed43f5949706e6f953d9bf0cfc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Jan 2024 10:45:39 +0800 Subject: [PATCH 08/21] switch to 3.2.0 R dep (#5905) --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bcaa3b257..7fff65b85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: data.table Version: 1.15.99 Title: Extension of `data.frame` -Depends: R (>= 3.1.0) +Depends: R (>= 3.2.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), R.utils, xts, zoo (>= 1.8-1), yaml, knitr, markdown Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. diff --git a/NEWS.md b/NEWS.md index 6c1b89831..84dcbc8ce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,8 @@ 2. The documentation for the `fill` argument in `rbind()` and `rbindlist()` now notes the expected behaviour for missing `list` columns when `fill=TRUE`, namely to use `NULL` (not `NA`), [#4198](https://github.com/Rdatatable/data.table/pull/4198). Thanks @sritchie73 for the proposal and fix. +3. data.table now depends on R 3.2.0 (2015) instead of 3.1.0 (2014). 1.17.0 will likely move to R 3.3.0 (2016). Recent versions of R have good features that we would gradually like to incorporate, and we see next to no usage of these very old versions of R. + # data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) ## BREAKING CHANGE From c84a123b6882fdaa1092541c2fa4f984b40d4e91 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Jan 2024 16:19:37 +0800 Subject: [PATCH 09/21] Allow early exit from check for eval/evalq in cedta (#5660) * Allow early exit from check for eval/evalq in cedta Done in the browser+untested, please take a second look :) * Use %chin% * nocov new code --- R/cedta.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/cedta.R b/R/cedta.R index 5d0a60c10..51dc0e9dd 100644 --- a/R/cedta.R +++ b/R/cedta.R @@ -22,6 +22,19 @@ cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdow # package authors could set it using assignInNamespace and then not revert its value properly which would # cause subsequent calls from other packages to fail. +# nocov start: very hard to reach this within our test suite -- the call stack within a call generated by e.g. knitr +# for loop, not any(vapply_1b(.)), to allow early exit +.any_eval_calls_in_stack <- function() { + calls = sys.calls() + # likelier to be close to the end of the call stack, right? + for (ii in length(calls):1) { # rev(seq_len(length(calls)))? See https://bugs.r-project.org/show_bug.cgi?id=18406. + the_call <- calls[[ii]][[1L]] + if (is.name(the_call) && (the_call %chin% c("eval", "evalq"))) return(TRUE) + } + return(FALSE) +} +# nocov end + # cedta = Calling Environment Data.Table-Aware cedta = function(n=2L) { # Calling Environment Data Table Aware @@ -41,7 +54,7 @@ cedta = function(n=2L) { (exists("debugger.look", parent.frame(n+1L)) || (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972 (nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply - (nsname %chin% cedta.pkgEvalsUserCode && any(vapply_1b(sys.calls(), function(x) is.name(x[[1L]]) && (x[[1L]]=="eval" || x[[1L]]=="evalq")))) || + (nsname %chin% cedta.pkgEvalsUserCode && .any_eval_calls_in_stack()) || nsname %chin% cedta.override || isTRUE(ns$.datatable.aware) || # As of Sep 2018: RCAS, caretEnsemble, dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist tryCatch("data.table" %chin% get(".Depends",paste("package",nsname,sep=":"),inherits=FALSE),error=function(e)FALSE) # both ns$.Depends and get(.Depends,ns) are not sufficient From 513f20fd6ce2ab3fbe4498948dfcbe916115019c Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Fri, 12 Jan 2024 14:50:40 +0100 Subject: [PATCH 10/21] frollmax1: frollmax, frollmax adaptive, left adaptive support (#5889) * frollmax exact, buggy fast, no fast adaptive * frollmax fast fixing bugs * frollmax man to fix CRAN check * frollmax fast adaptive non NA, dev * froll docs, adaptive left * no frollmax fast adaptive * frollmax adaptive exact NAs handling * PR summary in news * copy-edit changes from reviews Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * comment requested by Michael * update NEWS file * Apply suggestions from code review Co-authored-by: Michael Chirico * Apply suggestions from code review Co-authored-by: Michael Chirico * add comment requested by Michael * add comment about int iterator for loop over k-1 obs * extra comments * Revert "extra comments" This reverts commit 03af0e30f1a6a9e75f82b5827c1078f42db48e45. * add comments to frollmax and frollsum * typo fix --------- Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> --- NAMESPACE | 1 + NEWS.md | 2 + R/froll.R | 26 +++- inst/tests/froll.Rraw | 4 +- man/froll.Rd | 107 ++++++++++------ src/data.table.h | 6 + src/froll.c | 275 +++++++++++++++++++++++++++++++++++++++++- src/frollR.c | 16 ++- src/frolladaptive.c | 77 ++++++++++++ 9 files changed, 471 insertions(+), 43 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c5a67095a..9b5930ece 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,6 +51,7 @@ S3method(cube, data.table) S3method(rollup, data.table) export(frollmean) export(frollsum) +export(frollmax) export(frollapply) export(nafill) export(setnafill) diff --git a/NEWS.md b/NEWS.md index 84dcbc8ce..04d37bf40 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ # 2: ``` +2. New window function `frollmax` computes the rolling maximum. Request came from @gpierard who needs left-aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). Adaptive rolling functions did not have support for `align="left"`, therefore we added this feature as well for all adaptive rolling functions. We measure adaptive `frollmax` to be up to 50 times faster than the next fastest solution using `max` and grouping `by=.EACHI`. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/froll.R b/R/froll.R index df901f0b8..27085f99f 100644 --- a/R/froll.R +++ b/R/froll.R @@ -2,8 +2,25 @@ froll = function(fun, x, n, fill=NA, algo=c("fast", "exact"), align=c("right", " stopifnot(!missing(fun), is.character(fun), length(fun)==1L, !is.na(fun)) algo = match.arg(algo) align = match.arg(align) + leftadaptive = isTRUE(adaptive) && align=="left" ## support for left added in #5441 + if (leftadaptive) { + rev2 = function(x) if (is.list(x)) sapply(x, rev, simplify=FALSE) else rev(x) + verbose = getOption("datatable.verbose") + if (verbose) + catf("froll: adaptive=TRUE && align='left' pre-processing for align='right'\n") + ## TODO test atomic x but list of lists n (multiple windows)! + x = rev2(x) + n = rev2(n) + align = "right" + } ans = .Call(CfrollfunR, fun, x, n, fill, algo, align, na.rm, hasNA, adaptive) - ans + if (!leftadaptive) + ans + else { + if (verbose) + catf("froll: adaptive=TRUE && align='left' post-processing from align='right'\n") + rev2(ans) + } } frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { @@ -12,9 +29,14 @@ frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "l frollsum = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { froll(fun="sum", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, hasNA=hasNA, adaptive=adaptive) } -frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center")) { +frollmax = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { + froll(fun="max", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, hasNA=hasNA, adaptive=adaptive) +} +frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center"), adaptive) { FUN = match.fun(FUN) align = match.arg(align) + if (!missing(adaptive)) + stopf("frollapply does not support 'adaptive' argument") rho = new.env() ans = .Call(CfrollapplyR, FUN, x, n, fill, align, rho) ans diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index f6a4f96a8..54c67e562 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -563,8 +563,8 @@ if (FALSE) { #### adaptive limitations test(6000.145, frollmean(1:2, 1:2, adaptive=TRUE, align="right"), c(1, 1.5)) -test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align argument different than 'right' is not implemented") -test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented") +test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align 'center' is not implemented") +##test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented") ## support added in #5441, TODO add tests test(6000.148, frollmean(list(1:2, 1:3), list(1:2), adaptive=TRUE), error="adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame. If you want to call rolling function on list having variable length of elements call it for each field separately") #### adaptive exact diff --git a/man/froll.Rd b/man/froll.Rd index d6cb75067..c718bafa0 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -7,7 +7,9 @@ \alias{rollmean} \alias{frollmean} \alias{rollsum} +\alias{rollmax} \alias{frollsum} +\alias{frollmax} \alias{rollapply} \alias{frollapply} \title{Rolling functions} @@ -19,7 +21,9 @@ frollmean(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) frollsum(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) +frollmax(x, n, fill=NA, algo=c("fast","exact"), + align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) +frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"), adaptive) } \arguments{ \item{x}{ Vector, \code{data.frame} or \code{data.table} of integer, numeric or logical columns over which to calculate the windowed aggregations. May also be a list, in which case the rolling function is applied to each of its elements. } @@ -45,36 +49,69 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) conveniently within \code{data.table} syntax. Argument \code{n} allows multiple values to apply rolling functions on - multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list. - Each list element must be integer vector of window sizes corresponding - to every single observation in each column; see Examples. + multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list, + see \emph{Adaptive rolling functions} section below for details. - When \code{algo="fast"} an \emph{"on-line"} algorithm is used, and - all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. - Setting \code{algo="exact"} will make rolling functions to use a more - computationally-intensive algorithm that suffers less from floating point - rounding error (the same consideration applies to \code{\link[base]{mean}}). - \code{algo="exact"} also handles \code{NaN, +Inf, -Inf} consistently to - base R. In case of some functions (like \emph{mean}), it will additionally + When multiple columns or multiple window widths are provided, then they + are run in parallel. The exception is for \code{algo="exact"}, which runs in + parallel already. See \code{\link{setDTthreads}} for defaults and further details on parallelism in data.table. +} +\section{\code{hasNA} argument}{ + \code{hasNA} can be used to speed up processing in cases when it is known + whether \code{x} contains infinite values \code{NA, NaN, +Inf, -Inf}. + \itemize{ + \item{ Default \code{hasNA=NA} will use faster \code{NA} agnostic implementation, + but when \code{NA}s are detected it will re-run \code{NA} aware implementation. } + \item{ \code{hasNA=TRUE} will use \code{NA} aware implementation straightaway. } + \item{ \code{hasNA=FALSE} will use faster \code{NA} agnostic implementation. + Then depending on the rolling function it will either + \itemize{ + \item{ (\emph{mean, sum}) detect \code{NA}s, raise warning, re-run \code{NA} aware. } + \item{ (\emph{max}) not detect \code{NA}s and may silently produce an incorrect + answer. }} + Therefore \code{hasNA=FALSE} should be used with care. + } + } +} +\section{Implementation}{ + \itemize{ + \item{ \code{algo="fast"} uses \emph{"on-line"} algorithm, and + all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. Not all functions + have \emph{fast} implementation available. As of now + \emph{max} and \code{adaptive=TRUE} do not have it, therefore it will + automatically fall back to \emph{exact} implementation. \code{datatable.verbose} + option can be used to check that. } + \item{ \code{algo="exact"} will make rolling functions use a more + computationally-intensive algorithm. For each observation from input vector + it will compute a function on a window from scratch (complexity \eqn{O(n^2)}). + Depending on the function, this algorithm may suffers less from + floating point rounding error (the same consideration applies to base \code{\link[base]{mean}}). + Algorithm also handles \code{NaN, +Inf, -Inf} consistently to + base R, unless - for some functions (e.g. \emph{max}) - \code{hasNA} is \code{FALSE} + but NAs are present. In case of some functions (like \emph{mean}), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) - when using multiple threads. - + when using multiple threads. } + } +} +\section{Adaptive rolling functions}{ Adaptive rolling functions are a special case where each - observation has its own corresponding rolling window width. Due to the logic - of adaptive rolling functions, the following restrictions apply: + observation has its own corresponding rolling window width. \code{n} + argument must be a list, then each list element must be an integer vector + of window sizes corresponding to every single observation in each column; + see Examples. Due to the logic or implementation of adaptive rolling + functions, the following restrictions apply: \itemize{ - \item \code{align} only \code{"right"}. + \item \code{align} does not support \code{"center"}. \item if list of vectors is passed to \code{x}, then all vectors within it must have equal length. + \item functionality is not suported in \code{frollapply}. } - - When multiple columns or multiple windows width are provided, then they - are run in parallel. The exception is for \code{algo="exact"}, which runs in - parallel already. - +} +\section{\code{frollapply}}{ \code{frollapply} computes rolling aggregate on arbitrary R functions. - The input \code{x} (first argument) to the function \code{FUN} + \code{adaptive} argument is not supported. The input + \code{x} (first argument) to the function \code{FUN} is coerced to \emph{numeric} beforehand and \code{FUN} has to return a scalar \emph{numeric} value. Checks for that are made only during the first iteration when \code{FUN} is evaluated. Edge cases can be @@ -84,32 +121,34 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) because there is no thread-safe API to R's C \code{eval}. Nevertheless we've seen the computation speed up vis-a-vis versions implemented in base R. } -\value{ - A list except when the input is a \code{vector} and - \code{length(n)==1} in which case a \code{vector} is returned. -} -\note{ +\section{\code{zoo} package users notice}{ Users coming from most popular package for rolling functions \code{zoo} might expect following differences in \code{data.table} implementation. \itemize{ - \item rolling function will always return result of the same length as input. - \item \code{fill} defaults to \code{NA}. + \item rolling functions will always return result of the same length + as input. + \item \code{fill} defaults to \code{NA}. \item \code{fill} accepts only constant values. It does not support for \emph{na.locf} or other functions. - \item \code{align} defaults to \code{"right"}. + \item \code{align} defaults to \code{"right"}. \item \code{na.rm} is respected, and other functions are not needed when input contains \code{NA}. - \item integers and logical are always coerced to double. + \item integers and logical are always coerced to double. \item when \code{adaptive=FALSE} (default), then \code{n} must be a numeric vector. List is not accepted. \item when \code{adaptive=TRUE}, then \code{n} must be vector of length equal to \code{nrow(x)}, or list of such vectors. \item \code{partial} window feature is not supported, although it can - be accomplished by using \code{adaptive=TRUE}, see - examples. \code{NA} is always returned for incomplete windows. + be accomplished by using \code{adaptive=TRUE}, see examples. + \code{NA} is always returned for incomplete windows. } - +} +\value{ + A list except when the input is a \code{vector} and + \code{length(n)==1} in which case a \code{vector} is returned. +} +\note{ Be aware that rolling functions operates on the physical order of input. If the intent is to roll values in a vector by a logical window, for example an hour, or a day, one has to ensure that there are no gaps in diff --git a/src/data.table.h b/src/data.table.h index 0a6eb207a..a6213b731 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -206,6 +206,9 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); +void frollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); +void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); +void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); void frollapply(double *x, int64_t nx, double *w, int k, ans_t *ans, int align, double fill, SEXP call, SEXP rho, bool verbose); // frolladaptive.c @@ -215,6 +218,9 @@ void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f void fadaptiverollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); +void fadaptiverollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); +//void fadaptiverollmaxFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // does not exists as of now +void fadaptiverollmaxExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // frollR.c SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasNA, SEXP adaptive); diff --git a/src/froll.c b/src/froll.c index 3ab7bd927..1c0fa2537 100644 --- a/src/froll.c +++ b/src/froll.c @@ -49,7 +49,7 @@ void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool long double w = 0.0; // sliding window aggregate bool truehasna = hasna>0; // flag to re-run with NA support if NAs detected if (!truehasna) { - int i; // iterator declared here because it is being used after for loop + int i; // iterator declared here because it is being used after for loop, it is int32 rather than int64 because first loop over k-1 observations cannot exceed int32 range for (i=0; idbl_v[i] = fill; // answers are fill for partial window @@ -217,6 +217,7 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } /* fast rolling sum */ +// see frollmean code for comments describing online implementations for rolling functions void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose) { if (nx < k) { if (verbose) @@ -397,6 +398,278 @@ void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } } +/* fast rolling max */ +// see frollmean code for comments describing online implementations for rolling functions +void frollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose) { + if (nx < k) { + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: window width longer than input vector, returning all %f vector\n"), __func__, fill); + for (int i=0; idbl_v[i] = fill; + } + return; + } + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + if (algo==0) { + frollmaxFast(x, nx, ans, k, fill, narm, hasna, verbose); + } else { // algo==1 + frollmaxExact(x, nx, ans, k, fill, narm, hasna, verbose); + } + if (ans->status < 3 && align < 1) { + int k_ = align==-1 ? k-1 : floor(k/2); + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: align %d, shift answer by %d\n"), __func__, align, -k_); + memmove((char *)ans->dbl_v, (char *)ans->dbl_v + (k_*sizeof(double)), (nx-k_)*sizeof(double)); + for (uint64_t i=nx-k_; idbl_v[i] = fill; + } + } + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); +} +inline void windowmax(double *x, uint64_t o, int k, /* max of current window */ double *w, /* index of w within window */ uint64_t *iw) { + for (int i=0; i= w[0]: x[%d-%d+1] >= w[0]: %f >= %f: %d\n", i, o, x[o], i, k, x[o+i-k+1], w[0], x[o+i-k+1] >= w[0]); + if (x[o+i-k+1] >= w[0]) { // what if that is never satisfied? test! + iw[0] = o+i-k+1; + w[0] = x[iw[0]]; + } + } +} +inline void windowmaxnarm(double *x, uint64_t o, int k, bool narm, /* NA counter */ int *nc, double *w, uint64_t *iw) { + for (int i=0; i= w[0]: x[%d-%d+1] >= w[0]: %f >= %f: %d\n", i, o, x[o], i, k, x[o+i-k+1], w[0], x[o+i-k+1] >= w[0]); + if (R_FINITE(x[o+i-k+1])) { + if (x[o+i-k+1] >= w[0]) { // what if that is never satisfied? test! + iw[0] = o+i-k+1; + w[0] = x[iw[0]]; + } + } else if (narm) { + nc[0]++; + } else { + w[0] = NA_REAL; + } + } +} +/* fast rolling max - fast + * fast online algorithm do single pass over elements keeping track of recent max and its index + * if index of max is within progressing window then it keeps running single pass + * whenever max is leaving the window (index of max is outside of iterator minus window size) then new maximum is computed via nested loop on current complete window + * new max is used to continue outer single pass as long as new max index is not leaving the running window + * should scale well for bigger window size, may carry overhead for small window, needs benchmarking + * TODO NA handling + */ +void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollmaxFast", (uint64_t)nx, k, hasna, (int)narm); + double w = R_NegInf; // window max + uint64_t cmax = 0; // counter of nested loops for verbose + uint64_t iw = 0; // index of window max + bool truehasna = hasna>0; + if (!truehasna) { + int i; + for (i=0; i= w) { // >= rather than > because we track most recent maximum using iw + iw = i; + w = x[iw]; + } + ans->dbl_v[i] = fill; + } + if (!truehasna && !R_FINITE(x[i])) { + if (x[i] >= w) { + iw = i; + w = x[iw]; + } else { + // ensure iw ok, case when all window was -Inf or NA? + } + ans->dbl_v[i] = w; + } else { + truehasna = true; + } + if (!truehasna) { + for (uint64_t i=k; i= w) { + //Rprintf("iteration %d, new max %f at %d, old max %f at %d\n", i, x[i], i, w, iw); + iw = i; + w = x[iw]; + } else { + if (iw > i-k) { // max is still within window + // w and iw are still ok + //Rprintf("iteration %d, max %f at %d bigger than current value %f\n", i, w, iw, x[i]); + } else { // max left the window, need to find max in this window + //Rprintf("iteration %d, max %f at %d left the window, call windowmax from %d of size %d\n", i, w, iw, i, k); + w = R_NegInf; + iw = i-k; + windowmax(x, i, k, &w, &iw); + //Rprintf("iteration %d, windowmax found new max %f at %d\n", i, w, iw); + cmax++; + } + } + ans->dbl_v[i] = w; + } + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: windowmax called %"PRIu64" time(s)\n"), __func__, cmax); + if (truehasna) { + if (hasna==-1) { + ans->status = 2; + snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); + } + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); + w = R_NegInf; + cmax = 0; + } + } else { + if (hasna==-1) { + ans->status = 2; + snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); + } + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, skip non-NA attempt and run with extra care for NAs\n"), __func__); + w = R_NegInf; + cmax = 0; + } + } + if (truehasna) { + int nc = 0; + int i; + for (i=0; i= w) { + iw = i; + w = x[iw]; + } + } else { + nc++; + } + ans->dbl_v[i] = fill; + } + if (R_FINITE(x[i])) { + if (x[i] >= w) { + iw = i; + w = x[iw]; + } + } else { + nc++; + } + if (nc == 0) { + ans->dbl_v[i] = w; + } else if (nc == k) { + ans->dbl_v[i] = narm ? R_NegInf : NA_REAL; + } else { + ans->dbl_v[i] = narm ? w : NA_REAL; + } + for (uint64_t i=k; i= w) { + iw = i; + w = x[iw]; + } else { + if (iw > i-k) { // max is still within window + // w and iw are still ok + } else { // max left the window, need to find max in this window + w = R_NegInf; + iw = i-k; + windowmaxnarm(x, i, k, narm, &nc, &w, &iw); + //Rprintf("iteration %d, windowmaxnarm found new max %f at %d\n", i, w, iw); + cmax++; + } + } + ans->dbl_v[i] = w; + } else { + nc++; + } + if (!R_FINITE(x[i-k])) { // value leaving rolling window is non-finite, decrement NF counter + nc--; + } + if (nc == 0) { + ans->dbl_v[i] = w; + } else if (nc == k) { + ans->dbl_v[i] = narm ? R_NegInf : NA_REAL; + } else { + ans->dbl_v[i] = narm ? w : NA_REAL; + } + } + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: windowmaxnarm called %"PRIu64" time(s)\n"), __func__, cmax); + } +} +/* fast rolling max - exact + * for each observation in x compute max in window from scratch + * no proper support for NAs yet + */ +void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollmaxExact", (uint64_t)nx, k, hasna, (int)narm); + for (int i=0; idbl_v[i] = fill; + } + bool truehasna = hasna>0; + if (!truehasna || !narm) { + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=k-1; i w) + w = x[i+j]; + } + if (R_FINITE(w)) { + ans->dbl_v[i] = w; + } else { + if (!narm) { + ans->dbl_v[i] = w; + } + truehasna = true; + } + } + if (truehasna) { + if (hasna==-1) { + ans->status = 2; + snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); + } + if (verbose) { + if (narm) { + snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); + } else { + snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n"), __func__); + } + } + } + } + if (truehasna && narm) { + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=k-1; i w) { + w = x[i+j]; + } + } + if (nc < k) { + ans->dbl_v[i] = w; + } else { + ans->dbl_v[i] = R_NegInf; + } + } + } +} + /* fast rolling any R function * not plain C, not thread safe * R eval() allocates diff --git a/src/frollR.c b/src/frollR.c index 74cc7dd4e..ee4d7fd39 100644 --- a/src/frollR.c +++ b/src/frollR.c @@ -108,8 +108,8 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "align", "rolling"); // # nocov - if (badaptive && ialign!=1) - error(_("using adaptive TRUE and align argument different than 'right' is not implemented")); + if (badaptive && ialign==0) // support for left added in #5441 + error(_("using adaptive TRUE and align 'center' is not implemented")); SEXP ans = PROTECT(allocVector(VECSXP, nk * nx)); protecti++; // allocate list to keep results if (verbose) @@ -132,11 +132,13 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX dx[i] = REAL(VECTOR_ELT(x, i)); // assign source columns to C pointers } - enum {MEAN, SUM} sfun; + enum {MEAN, SUM, MAX} sfun; if (!strcmp(CHAR(STRING_ELT(fun, 0)), "mean")) { sfun = MEAN; } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "sum")) { sfun = SUM; + } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "max")) { + sfun = MAX; } else { error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "fun", "rolling"); // # nocov } @@ -152,7 +154,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX int ihasna = // plain C tri-state boolean as integer LOGICAL(hasna)[0]==NA_LOGICAL ? 0 : // hasna NA, default, no info about NA LOGICAL(hasna)[0]==TRUE ? 1 : // hasna TRUE, might be some NAs - -1; // hasna FALSE, there should be no NAs + -1; // hasna FALSE, there should be no NAs // or there must be no NAs for rollmax #5441 unsigned int ialgo; // decode algo to integer if (!strcmp(CHAR(STRING_ELT(algo, 0)), "fast")) @@ -193,6 +195,12 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else fadaptiverollsum(ialgo, dx[i], inx[i], &dans[i*nk+j], ikl[j], dfill, bnarm, ihasna, verbose); break; + case MAX : + if (!badaptive) + frollmax(ialgo, dx[i], inx[i], &dans[i*nk+j], iik[j], ialign, dfill, bnarm, ihasna, verbose); + else + fadaptiverollmax(ialgo, dx[i], inx[i], &dans[i*nk+j], ikl[j], dfill, bnarm, ihasna, verbose); + break; default: error(_("Internal error: Unknown sfun value in froll: %d"), sfun); // #nocov } diff --git a/src/frolladaptive.c b/src/frolladaptive.c index e005cef69..f0a40fb9c 100644 --- a/src/frolladaptive.c +++ b/src/frolladaptive.c @@ -364,3 +364,80 @@ void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fi } } } + +/* fast adaptive rolling max */ +void fadaptiverollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { + double tic = 0; + if (verbose) + tic = omp_get_wtime(); + if (algo==0 && verbose) { + //fadaptiverollmaxFast(x, nx, ans, k, fill, narm, hasna, verbose); // fadaptiverollmaxFast does not exists as of now + snprintf(end(ans->message[0]), 500, _("%s: algo %u not implemented, fall back to %u\n"), __func__, algo, (unsigned int) 1); + } + fadaptiverollmaxExact(x, nx, ans, k, fill, narm, hasna, verbose); + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); +} +//void fadaptiverollmaxFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // does not exists as of now +/* fast adaptive rolling max - exact + * for hasNA=FALSE it will not detect if any NAs were in the input, therefore could produce incorrect result, well documented + */ +void fadaptiverollmaxExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { + if (verbose) + snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollmaxExact", (uint64_t)nx, hasna, (int) narm); + if (hasna==-1) { // fastest we can get for adaptive max as there is no algo='fast', therefore we drop any NA checks when hasNA=FALSE + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=0; idbl_v[i] = fill; + } else { + double w = R_NegInf; + for (int j=-k[i]+1; j<=0; j++) { //Rprintf("x[%d+%d] > w: %f > %f: %d\n", i, j, x[i+j], w, x[i+j] > w); + if (x[i+j] > w) + w = x[i+j]; + } + ans->dbl_v[i] = w; + } + } + } else { + if (narm) { + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=0; idbl_v[i] = fill; + } else { + int nc = 0; + double w = R_NegInf; + for (int j=-k[i]+1; j<=0; j++) { + if (ISNAN(x[i+j])) + nc++; + else if (x[i+j] > w) + w = x[i+j]; + } + if (nc < k[i]) + ans->dbl_v[i] = w; + else + ans->dbl_v[i] = R_NegInf; + } + } + } else { + #pragma omp parallel for num_threads(getDTthreads(nx, true)) + for (uint64_t i=0; idbl_v[i] = fill; + } else { + double w = R_NegInf; + for (int j=-k[i]+1; j<=0; j++) { + if (ISNAN(x[i+j])) { + w = NA_REAL; + break; + } else if (x[i+j] > w) { + w = x[i+j]; + } + } + ans->dbl_v[i] = w; + } + } + } + } +} From daee139584e614d0b3bfc4d0ddd6984f43501841 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 14 Jan 2024 15:13:36 +0000 Subject: [PATCH 11/21] Friendlier error in assignment with trailing comma (#5467) * friendlier error in assignment with trailing comma e.g. `DT[, `:=`(a = 1, b = 2,)`. WIP. Need to add tests and such, but editing from browser before I forget. * Another pass * include unnamed indices on RHS too * tests * NEWS * test numbering * explicit example in NEWS --- NEWS.md | 2 ++ R/data.table.R | 11 ++++++++++- inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 04d37bf40..944241ba2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,8 @@ 3. data.table now depends on R 3.2.0 (2015) instead of 3.1.0 (2014). 1.17.0 will likely move to R 3.3.0 (2016). Recent versions of R have good features that we would gradually like to incorporate, and we see next to no usage of these very old versions of R. +4. Erroneous assignment calls in `[` with a trailing comma (e.g. ``DT[, `:=`(a = 1, b = 2,)]``) get a friendlier error since this situation is common during refactoring and easy to miss visually. Thanks @MichaelChirico for the fix. + # data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) ## BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index 9d62702aa..47e6080a0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1128,7 +1128,16 @@ replace_dot_alias = function(e) { } else { # `:=`(c2=1L,c3=2L,...) lhs = names(jsub)[-1L] - if (any(lhs=="")) stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named.", if (root == "let") "let" else "`:=`") + if (!all(named_idx <- nzchar(lhs))) { + # friendly error for common case: trailing terminal comma + n_lhs = length(lhs) + root_name <- if (root == "let") "let" else "`:=`" + if (!named_idx[n_lhs] && all(named_idx[-n_lhs])) { + stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named, but the last argument has no name. Did you forget a trailing comma?", root_name) + } else { + stopf("In %s(col1=val1, col2=val2, ...) form, all arguments must be named, but these arguments lack names: %s.", root_name, brackify(which(!named_idx))) + } + } names(jsub)="" jsub[[1L]]=as.name("list") } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0a6367d2..a5a7708bc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18245,6 +18245,7 @@ test(2243.35, dt[, min(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( test(2243.36, dt[, max(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(3L,4L)), output="GForce FALSE") test(2243.37, dt[, var(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,2)), output="GForce FALSE") test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(sqrt(c(2,2)))), output="GForce FALSE") + # revdeps dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) @@ -18260,3 +18261,10 @@ test(2243.54, dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") test(2243.55, dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") test(2243.56, dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") options(old) + +DT = data.table(1) +test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") +test(2244.2, DT[, let(a=1, )], error="let.*Did you forget a trailing comma\\?") +test(2244.3, DT[, `:=`(a=1, , b=2)], error="`:=`.*\\[2\\]") +test(2244.4, DT[, let(a=1, , b=2)], error="let.*\\[2\\]") +test(2244.5, DT[, `:=`(a=1, , b=2, )], error="[2, 4]") From f5ef168719a4dd8b9e779f2f26f36d9aa513ef1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20L=C3=B3pez-Ib=C3=A1=C3=B1ez?= <2620021+MLopez-Ibanez@users.noreply.github.com> Date: Sat, 13 Jan 2024 15:05:21 +0100 Subject: [PATCH 12/21] Link to ?read.delim in ?fread to give a closer analogue of expected behavior (#5635) * fread is similar to read.delim (#5634) * Use ?read.csv / ?read.delim --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index 4456e11d1..09ed80bd5 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -2,7 +2,7 @@ \alias{fread} \title{ Fast and friendly file finagler } \description{ - Similar to \code{read.table} but faster and more convenient. All controls such as \code{sep}, \code{colClasses} and \code{nrows} are automatically detected. + Similar to \code{\link[utils:read.csv]{read.csv()}} and \code{\link[utils:read.delim]{read.delim()}} but faster and more convenient. All controls such as \code{sep}, \code{colClasses} and \code{nrows} are automatically detected. \code{bit64::integer64}, \code{\link{IDate}}, and \code{\link{POSIXct}} types are also detected and read directly without needing to read as character before converting. From f658ff497247e97cf28c84c6d0ca6addf30d3a70 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 14 Jan 2024 19:47:02 +0800 Subject: [PATCH 13/21] Run GHA jobs on 1-15-99 dev branch (#5909) --- .github/workflows/R-CMD-check.yaml | 4 ++-- .github/workflows/test-coverage.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index fff8223d6..106021051 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -3,12 +3,12 @@ on: push: branches: - - main - master + - 1-15-99 pull_request: branches: - - main - master + - 1-15-99 name: R-CMD-check diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 7f5fb04cd..e85deaf2f 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -1,12 +1,12 @@ on: push: branches: - - main - master + - 1-15-99 pull_request: branches: - - main - master + - 1-15-99 name: test-coverage From a56b796ed334dcd58254ec443f0294095d289d92 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jan 2024 09:28:20 +0800 Subject: [PATCH 14/21] Make declarations static for covr (#5910) --- src/froll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/froll.c b/src/froll.c index 1c0fa2537..af72d2740 100644 --- a/src/froll.c +++ b/src/froll.c @@ -429,7 +429,7 @@ void frollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int if (verbose) snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); } -inline void windowmax(double *x, uint64_t o, int k, /* max of current window */ double *w, /* index of w within window */ uint64_t *iw) { +static inline void windowmax(double *x, uint64_t o, int k, /* max of current window */ double *w, /* index of w within window */ uint64_t *iw) { for (int i=0; i= w[0]: x[%d-%d+1] >= w[0]: %f >= %f: %d\n", i, o, x[o], i, k, x[o+i-k+1], w[0], x[o+i-k+1] >= w[0]); if (x[o+i-k+1] >= w[0]) { // what if that is never satisfied? test! @@ -438,7 +438,7 @@ inline void windowmax(double *x, uint64_t o, int k, /* max of current window */ } } } -inline void windowmaxnarm(double *x, uint64_t o, int k, bool narm, /* NA counter */ int *nc, double *w, uint64_t *iw) { +static inline void windowmaxnarm(double *x, uint64_t o, int k, bool narm, /* NA counter */ int *nc, double *w, uint64_t *iw) { for (int i=0; i= w[0]: x[%d-%d+1] >= w[0]: %f >= %f: %d\n", i, o, x[o], i, k, x[o+i-k+1], w[0], x[o+i-k+1] >= w[0]); if (R_FINITE(x[o+i-k+1])) { From 8ca0e0ac52af7ec16ddd6747b68087d9f9dafa23 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jan 2024 08:46:59 -0800 Subject: [PATCH 15/21] class= argument for condition calls --- R/translation.R | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/R/translation.R b/R/translation.R index 66faa9fe8..18407bc50 100644 --- a/R/translation.R +++ b/R/translation.R @@ -4,18 +4,48 @@ catf = function(fmt, ..., sep=" ", domain="R-data.table") { cat(gettextf(fmt, ..., domain=domain), sep=sep) } -stopf = function(fmt, ..., domain="R-data.table") { - stop(gettextf(fmt, ..., domain=domain), domain=NA, call. = FALSE) +stopf = function(fmt, ..., class=NULL, domain="R-data.table") { + obj = list( + message=gettextf(fmt, ..., domain=domain), + call=sys.call() + ) + setattr(obj, "class", c(class, "simpleError", "error", "condition")) + stop(obj) } -warningf = function(fmt, ..., immediate.=FALSE, noBreaks.=FALSE, domain="R-data.table") { - warning(gettextf(fmt, ..., domain=domain), domain=NA, call.=FALSE, immediate.=immediate., noBreaks.=noBreaks.) +warningf = function(fmt, ..., immediate.=FALSE, class=NULL, domain="R-data.table") { + # cannot set immediate.=TRUE through warning(), so use the description in ?warning to replicate this behavior ourselves + if (immediate.) { + old = options(warn = 1) + on.exit(options(old)) + } + obj = list( + message=gettextf(fmt, ..., domain=domain), + call=sys.call() + ) + setattr(obj, "class", c(class, "simpleWarning", "warning", "condition")) + warning(obj) } -messagef = function(fmt, ..., appendLF=TRUE, domain="R-data.table") { - message(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) +messagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { + obj = list( + message=gettextf(fmt, ..., domain=domain), + call=sys.call() + ) + # NB: append _after_ translation + if (appendLF) obj$message <- paste0(obj$message, "\n") + setattr(obj, "class", c(class, "simpleMessage", "message", "condition")) + message(obj) } -packageStartupMessagef = function(fmt, ..., appendLF=TRUE, domain="R-data.table") { - packageStartupMessage(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) +packageStartupMessagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { + obj = list( + message=gettextf(fmt, ..., domain=domain), + call=sys.call() + ) + # NB: append _after_ translation + if (appendLF) obj$message <- paste0(obj$message, "\n") + setattr(obj, "class", c(class, "packageStartupMessage", "simpleMessage", "message", "condition")) + # NB: packageStartupMessage() itself calls message(.packageStartupMessage(...)) + message(obj) } From b548ab2b7489f54e7a9851076b703c7b3fd1f85e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jan 2024 09:00:33 -0800 Subject: [PATCH 16/21] Unify logic with helper --- R/translation.R | 53 +++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/R/translation.R b/R/translation.R index 18407bc50..058851e90 100644 --- a/R/translation.R +++ b/R/translation.R @@ -4,48 +4,35 @@ catf = function(fmt, ..., sep=" ", domain="R-data.table") { cat(gettextf(fmt, ..., domain=domain), sep=sep) } +raise_condition = function(signal, message, call, classes, immediate=FALSE, appendLF=FALSE) { + obj = list(message=message, call=call) + # NB: append _after_ translation + if (appendLF) obj$message = paste0(obj$message, "\n") + setattr(obj, "class", classes) + # cannot set immediate.=TRUE through warning(), so use the description in ?warning to replicate this behavior ourselves + if (immediate) { + old = options(warn=1) + on.exit(options(old)) + } + signal(obj) +} + stopf = function(fmt, ..., class=NULL, domain="R-data.table") { - obj = list( - message=gettextf(fmt, ..., domain=domain), - call=sys.call() - ) - setattr(obj, "class", c(class, "simpleError", "error", "condition")) - stop(obj) + call = sys.call() + raise_condition(stop, gettextf(fmt, ..., domain=domain), call, c(class, "simpleError", "error", "condition")) } warningf = function(fmt, ..., immediate.=FALSE, class=NULL, domain="R-data.table") { - # cannot set immediate.=TRUE through warning(), so use the description in ?warning to replicate this behavior ourselves - if (immediate.) { - old = options(warn = 1) - on.exit(options(old)) - } - obj = list( - message=gettextf(fmt, ..., domain=domain), - call=sys.call() - ) - setattr(obj, "class", c(class, "simpleWarning", "warning", "condition")) - warning(obj) + call = sys.call() + raise_condition(warning, gettextf(fmt, ..., domain=domain), call, c(class, "simpleWarning", "warning", "condition"), immediate=immediate.) } messagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { - obj = list( - message=gettextf(fmt, ..., domain=domain), - call=sys.call() - ) - # NB: append _after_ translation - if (appendLF) obj$message <- paste0(obj$message, "\n") - setattr(obj, "class", c(class, "simpleMessage", "message", "condition")) - message(obj) + call = sys.call() + raise_condition(message, gettextf(fmt, ..., domain=domain), call, c(class, "simpleMessage", "message", "condition"), appendLF=appendLF) } packageStartupMessagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { - obj = list( - message=gettextf(fmt, ..., domain=domain), - call=sys.call() - ) - # NB: append _after_ translation - if (appendLF) obj$message <- paste0(obj$message, "\n") - setattr(obj, "class", c(class, "packageStartupMessage", "simpleMessage", "message", "condition")) # NB: packageStartupMessage() itself calls message(.packageStartupMessage(...)) - message(obj) + messagef(fmt, ..., appendLF=appendLF, class=c(class, "packageStartupMessage"), domain=domain) } From c1ce0716cb0b39ac11ceac329b70dc7ee947ceb2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 16 Jan 2024 09:03:35 +0800 Subject: [PATCH 17/21] Add tests --- R/translation.R | 2 +- inst/tests/tests.Rraw | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/R/translation.R b/R/translation.R index 058851e90..88daf403f 100644 --- a/R/translation.R +++ b/R/translation.R @@ -9,7 +9,7 @@ raise_condition = function(signal, message, call, classes, immediate=FALSE, appe # NB: append _after_ translation if (appendLF) obj$message = paste0(obj$message, "\n") setattr(obj, "class", classes) - # cannot set immediate.=TRUE through warning(), so use the description in ?warning to replicate this behavior ourselves + # cannot set immediate.=TRUE through warning(), so use the description in ?warning to replicate this behavior ourselves. tested manually. if (immediate) { old = options(warn=1) on.exit(options(old)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a5a7708bc..839cb6042 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -51,7 +51,9 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { isRealReallyInt = data.table:::isRealReallyInt is_utc = data.table:::is_utc melt.data.table = data.table:::melt.data.table # for test 1953.4 + messagef = data.table:::messagef null.data.table = data.table:::null.data.table + packageStartupMessagef = data.table:::packageStartupMessagef print.data.table = data.table:::print.data.table replace_dot_alias = data.table:::replace_dot_alias rollup.data.table = data.table:::rollup.data.table @@ -64,9 +66,11 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { .shallow = data.table:::.shallow split.data.table = data.table:::split.data.table if (!exists('startsWith', 'package:base', inherits=FALSE)) startsWith = data.table:::startsWith + stopf = data.table:::stopf test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist + warningf = data.table:::warningf which_ = data.table:::which_ which.first = data.table:::which.first which.last = data.table:::which.last @@ -18268,3 +18272,26 @@ test(2244.2, DT[, let(a=1, )], error="let.*Did you forget a trailing comma\\?") test(2244.3, DT[, `:=`(a=1, , b=2)], error="`:=`.*\\[2\\]") test(2244.4, DT[, let(a=1, , b=2)], error="let.*\\[2\\]") test(2244.5, DT[, `:=`(a=1, , b=2, )], error="[2, 4]") + +# custom signaling functions +## basics: default signals with/without formats +test(2245.01, tryCatch(stopf("%s", "abc"), error=function(x) conditionMessage(x)), "abc") +test(2245.02, tryCatch(stopf("abc"), error=function(x) conditionMessage(x)), "abc") +test(2245.03, tryCatch(warningf("%s", "abc"), warning=function(x) conditionMessage(x)), "abc") +test(2245.04, tryCatch(warningf("abc"), warning=function(x) conditionMessage(x)), "abc") +test(2245.05, tryCatch(messagef("%s", "abc"), message=function(x) conditionMessage(x)), "abc\n") +test(2245.06, tryCatch(messagef("abc"), message=function(x) conditionMessage(x)), "abc\n") +test(2245.07, tryCatch(messagef("abc", appendLF=FALSE), message=function(x) conditionMessage(x)), "abc") +test(2245.08, tryCatch(packageStartupMessagef("%s", "abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") +test(2245.09, tryCatch(packageStartupMessagef("abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") +test(2245.10, tryCatch(packageStartupMessagef("abc", appendLF=FALSE), packageStartupMessagef=function(x) conditionMessage(x)), "abc") + +## custom signal classes +test(2245.11, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "test_error")) +test(2245.12, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "error")) +test(2245.13, inherits(tryCatch(warningf("x", class="test_warning"), condition=identity), "test_warning")) +test(2245.14, inherits(tryCatch(warningf("x", class="test_warning"), condition=identity), "warning")) +test(2245.15, inherits(tryCatch(messagef("x", class="test_message"), condition=identity), "test_message")) +test(2245.16, inherits(tryCatch(messagef("x", class="test_message"), condition=identity), "message")) +test(2245.17, inherits(tryCatch(packageStartupMessagef("x", class="test_psm"), condition=identity), "test_psm")) +test(2245.18, inherits(tryCatch(packageStartupMessagef("x", class="test_psm"), condition=identity), "packageStartupMessage")) From ef538c23641f5624dc6cab5b1ea5a920b790c210 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jan 2024 17:13:21 -0800 Subject: [PATCH 18/21] Use call.=FALSE where possible --- R/translation.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/translation.R b/R/translation.R index 88daf403f..4bc56b686 100644 --- a/R/translation.R +++ b/R/translation.R @@ -4,7 +4,7 @@ catf = function(fmt, ..., sep=" ", domain="R-data.table") { cat(gettextf(fmt, ..., domain=domain), sep=sep) } -raise_condition = function(signal, message, call, classes, immediate=FALSE, appendLF=FALSE) { +raise_condition = function(signal, message, call, classes, call.=FALSE, immediate=FALSE, appendLF=FALSE) { obj = list(message=message, call=call) # NB: append _after_ translation if (appendLF) obj$message = paste0(obj$message, "\n") @@ -14,7 +14,11 @@ raise_condition = function(signal, message, call, classes, immediate=FALSE, appe old = options(warn=1) on.exit(options(old)) } - signal(obj) + if (is.null(call.)) { + signal(obj) # message() doesn't support call.=FALSE + } else { + signal(obj, call. = call.) + } } stopf = function(fmt, ..., class=NULL, domain="R-data.table") { @@ -29,7 +33,7 @@ warningf = function(fmt, ..., immediate.=FALSE, class=NULL, domain="R-data.table messagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { call = sys.call() - raise_condition(message, gettextf(fmt, ..., domain=domain), call, c(class, "simpleMessage", "message", "condition"), appendLF=appendLF) + raise_condition(message, gettextf(fmt, ..., domain=domain), call, c(class, "simpleMessage", "message", "condition"), appendLF=appendLF, call.=NULL) } packageStartupMessagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { From adaf7ea6793e6cb15b3d5e7b782aec93b31dc511 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jan 2024 17:14:04 -0800 Subject: [PATCH 19/21] correct caught class --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 839cb6042..8bde30f99 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18284,7 +18284,7 @@ test(2245.06, tryCatch(messagef("abc"), message=function(x) conditionMessage(x)) test(2245.07, tryCatch(messagef("abc", appendLF=FALSE), message=function(x) conditionMessage(x)), "abc") test(2245.08, tryCatch(packageStartupMessagef("%s", "abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") test(2245.09, tryCatch(packageStartupMessagef("abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") -test(2245.10, tryCatch(packageStartupMessagef("abc", appendLF=FALSE), packageStartupMessagef=function(x) conditionMessage(x)), "abc") +test(2245.10, tryCatch(packageStartupMessagef("abc", appendLF=FALSE), packageStartupMessage=function(x) conditionMessage(x)), "abc") ## custom signal classes test(2245.11, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "test_error")) From 7c199c571694698fab1c466108cd51cbb52ac9b7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jan 2024 17:28:48 -0800 Subject: [PATCH 20/21] strip call=/call.= handling --- R/translation.R | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/R/translation.R b/R/translation.R index 4bc56b686..42073aced 100644 --- a/R/translation.R +++ b/R/translation.R @@ -4,8 +4,8 @@ catf = function(fmt, ..., sep=" ", domain="R-data.table") { cat(gettextf(fmt, ..., domain=domain), sep=sep) } -raise_condition = function(signal, message, call, classes, call.=FALSE, immediate=FALSE, appendLF=FALSE) { - obj = list(message=message, call=call) +raise_condition = function(signal, message, classes, immediate=FALSE, appendLF=FALSE) { + obj = list(message=message, call=sys.call(2)) # NB: append _after_ translation if (appendLF) obj$message = paste0(obj$message, "\n") setattr(obj, "class", classes) @@ -14,26 +14,19 @@ raise_condition = function(signal, message, call, classes, call.=FALSE, immediat old = options(warn=1) on.exit(options(old)) } - if (is.null(call.)) { - signal(obj) # message() doesn't support call.=FALSE - } else { - signal(obj, call. = call.) - } + signal(obj) } stopf = function(fmt, ..., class=NULL, domain="R-data.table") { - call = sys.call() - raise_condition(stop, gettextf(fmt, ..., domain=domain), call, c(class, "simpleError", "error", "condition")) + raise_condition(stop, gettextf(fmt, ..., domain=domain), c(class, "simpleError", "error", "condition")) } warningf = function(fmt, ..., immediate.=FALSE, class=NULL, domain="R-data.table") { - call = sys.call() - raise_condition(warning, gettextf(fmt, ..., domain=domain), call, c(class, "simpleWarning", "warning", "condition"), immediate=immediate.) + raise_condition(warning, gettextf(fmt, ..., domain=domain), c(class, "simpleWarning", "warning", "condition"), immediate=immediate.) } messagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { - call = sys.call() - raise_condition(message, gettextf(fmt, ..., domain=domain), call, c(class, "simpleMessage", "message", "condition"), appendLF=appendLF, call.=NULL) + raise_condition(message, gettextf(fmt, ..., domain=domain), c(class, "simpleMessage", "message", "condition"), appendLF=appendLF) } packageStartupMessagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { From d7575c37efb5443b2808871e667a70e094e16f78 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 18 Feb 2024 19:41:26 -0800 Subject: [PATCH 21/21] botched merge --- .github/workflows/R-CMD-check.yaml | 4 +- .github/workflows/test-coverage.yaml | 4 +- NAMESPACE | 1 - R/froll.R | 26 +-- inst/tests/froll.Rraw | 4 +- man/froll.Rd | 107 ++++------- src/data.table.h | 6 - src/froll.c | 275 +-------------------------- src/frollR.c | 16 +- src/frolladaptive.c | 77 -------- 10 files changed, 47 insertions(+), 473 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 106021051..fff8223d6 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -3,12 +3,12 @@ on: push: branches: + - main - master - - 1-15-99 pull_request: branches: + - main - master - - 1-15-99 name: R-CMD-check diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index e85deaf2f..7f5fb04cd 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -1,12 +1,12 @@ on: push: branches: + - main - master - - 1-15-99 pull_request: branches: + - main - master - - 1-15-99 name: test-coverage diff --git a/NAMESPACE b/NAMESPACE index 9b5930ece..c5a67095a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,7 +51,6 @@ S3method(cube, data.table) S3method(rollup, data.table) export(frollmean) export(frollsum) -export(frollmax) export(frollapply) export(nafill) export(setnafill) diff --git a/R/froll.R b/R/froll.R index 27085f99f..df901f0b8 100644 --- a/R/froll.R +++ b/R/froll.R @@ -2,25 +2,8 @@ froll = function(fun, x, n, fill=NA, algo=c("fast", "exact"), align=c("right", " stopifnot(!missing(fun), is.character(fun), length(fun)==1L, !is.na(fun)) algo = match.arg(algo) align = match.arg(align) - leftadaptive = isTRUE(adaptive) && align=="left" ## support for left added in #5441 - if (leftadaptive) { - rev2 = function(x) if (is.list(x)) sapply(x, rev, simplify=FALSE) else rev(x) - verbose = getOption("datatable.verbose") - if (verbose) - catf("froll: adaptive=TRUE && align='left' pre-processing for align='right'\n") - ## TODO test atomic x but list of lists n (multiple windows)! - x = rev2(x) - n = rev2(n) - align = "right" - } ans = .Call(CfrollfunR, fun, x, n, fill, algo, align, na.rm, hasNA, adaptive) - if (!leftadaptive) - ans - else { - if (verbose) - catf("froll: adaptive=TRUE && align='left' post-processing from align='right'\n") - rev2(ans) - } + ans } frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { @@ -29,14 +12,9 @@ frollmean = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "l frollsum = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { froll(fun="sum", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, hasNA=hasNA, adaptive=adaptive) } -frollmax = function(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) { - froll(fun="max", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, hasNA=hasNA, adaptive=adaptive) -} -frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center"), adaptive) { +frollapply = function(x, n, FUN, ..., fill=NA, align=c("right", "left", "center")) { FUN = match.fun(FUN) align = match.arg(align) - if (!missing(adaptive)) - stopf("frollapply does not support 'adaptive' argument") rho = new.env() ans = .Call(CfrollapplyR, FUN, x, n, fill, align, rho) ans diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index 54c67e562..f6a4f96a8 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -563,8 +563,8 @@ if (FALSE) { #### adaptive limitations test(6000.145, frollmean(1:2, 1:2, adaptive=TRUE, align="right"), c(1, 1.5)) -test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align 'center' is not implemented") -##test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented") ## support added in #5441, TODO add tests +test(6000.146, frollmean(1:2, 1:2, adaptive=TRUE, align="center"), error="using adaptive TRUE and align argument different than 'right' is not implemented") +test(6000.147, frollmean(1:2, 1:2, adaptive=TRUE, align="left"), error="using adaptive TRUE and align argument different than 'right' is not implemented") test(6000.148, frollmean(list(1:2, 1:3), list(1:2), adaptive=TRUE), error="adaptive rolling function can only process 'x' having equal length of elements, like data.table or data.frame. If you want to call rolling function on list having variable length of elements call it for each field separately") #### adaptive exact diff --git a/man/froll.Rd b/man/froll.Rd index c718bafa0..d6cb75067 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -7,9 +7,7 @@ \alias{rollmean} \alias{frollmean} \alias{rollsum} -\alias{rollmax} \alias{frollsum} -\alias{frollmax} \alias{rollapply} \alias{frollapply} \title{Rolling functions} @@ -21,9 +19,7 @@ frollmean(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) frollsum(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollmax(x, n, fill=NA, algo=c("fast","exact"), - align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"), adaptive) +frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) } \arguments{ \item{x}{ Vector, \code{data.frame} or \code{data.table} of integer, numeric or logical columns over which to calculate the windowed aggregations. May also be a list, in which case the rolling function is applied to each of its elements. } @@ -49,69 +45,36 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"), adapti conveniently within \code{data.table} syntax. Argument \code{n} allows multiple values to apply rolling functions on - multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list, - see \emph{Adaptive rolling functions} section below for details. + multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list. + Each list element must be integer vector of window sizes corresponding + to every single observation in each column; see Examples. - When multiple columns or multiple window widths are provided, then they - are run in parallel. The exception is for \code{algo="exact"}, which runs in - parallel already. See \code{\link{setDTthreads}} for defaults and further details on parallelism in data.table. -} -\section{\code{hasNA} argument}{ - \code{hasNA} can be used to speed up processing in cases when it is known - whether \code{x} contains infinite values \code{NA, NaN, +Inf, -Inf}. - \itemize{ - \item{ Default \code{hasNA=NA} will use faster \code{NA} agnostic implementation, - but when \code{NA}s are detected it will re-run \code{NA} aware implementation. } - \item{ \code{hasNA=TRUE} will use \code{NA} aware implementation straightaway. } - \item{ \code{hasNA=FALSE} will use faster \code{NA} agnostic implementation. - Then depending on the rolling function it will either - \itemize{ - \item{ (\emph{mean, sum}) detect \code{NA}s, raise warning, re-run \code{NA} aware. } - \item{ (\emph{max}) not detect \code{NA}s and may silently produce an incorrect - answer. }} - Therefore \code{hasNA=FALSE} should be used with care. - } - } -} -\section{Implementation}{ - \itemize{ - \item{ \code{algo="fast"} uses \emph{"on-line"} algorithm, and - all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. Not all functions - have \emph{fast} implementation available. As of now - \emph{max} and \code{adaptive=TRUE} do not have it, therefore it will - automatically fall back to \emph{exact} implementation. \code{datatable.verbose} - option can be used to check that. } - \item{ \code{algo="exact"} will make rolling functions use a more - computationally-intensive algorithm. For each observation from input vector - it will compute a function on a window from scratch (complexity \eqn{O(n^2)}). - Depending on the function, this algorithm may suffers less from - floating point rounding error (the same consideration applies to base \code{\link[base]{mean}}). - Algorithm also handles \code{NaN, +Inf, -Inf} consistently to - base R, unless - for some functions (e.g. \emph{max}) - \code{hasNA} is \code{FALSE} - but NAs are present. In case of some functions (like \emph{mean}), it will additionally + When \code{algo="fast"} an \emph{"on-line"} algorithm is used, and + all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. + Setting \code{algo="exact"} will make rolling functions to use a more + computationally-intensive algorithm that suffers less from floating point + rounding error (the same consideration applies to \code{\link[base]{mean}}). + \code{algo="exact"} also handles \code{NaN, +Inf, -Inf} consistently to + base R. In case of some functions (like \emph{mean}), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) - when using multiple threads. } - } -} -\section{Adaptive rolling functions}{ + when using multiple threads. + Adaptive rolling functions are a special case where each - observation has its own corresponding rolling window width. \code{n} - argument must be a list, then each list element must be an integer vector - of window sizes corresponding to every single observation in each column; - see Examples. Due to the logic or implementation of adaptive rolling - functions, the following restrictions apply: + observation has its own corresponding rolling window width. Due to the logic + of adaptive rolling functions, the following restrictions apply: \itemize{ - \item \code{align} does not support \code{"center"}. + \item \code{align} only \code{"right"}. \item if list of vectors is passed to \code{x}, then all vectors within it must have equal length. - \item functionality is not suported in \code{frollapply}. } -} -\section{\code{frollapply}}{ + + When multiple columns or multiple windows width are provided, then they + are run in parallel. The exception is for \code{algo="exact"}, which runs in + parallel already. + \code{frollapply} computes rolling aggregate on arbitrary R functions. - \code{adaptive} argument is not supported. The input - \code{x} (first argument) to the function \code{FUN} + The input \code{x} (first argument) to the function \code{FUN} is coerced to \emph{numeric} beforehand and \code{FUN} has to return a scalar \emph{numeric} value. Checks for that are made only during the first iteration when \code{FUN} is evaluated. Edge cases can be @@ -121,34 +84,32 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center"), adapti because there is no thread-safe API to R's C \code{eval}. Nevertheless we've seen the computation speed up vis-a-vis versions implemented in base R. } -\section{\code{zoo} package users notice}{ +\value{ + A list except when the input is a \code{vector} and + \code{length(n)==1} in which case a \code{vector} is returned. +} +\note{ Users coming from most popular package for rolling functions \code{zoo} might expect following differences in \code{data.table} implementation. \itemize{ - \item rolling functions will always return result of the same length - as input. - \item \code{fill} defaults to \code{NA}. + \item rolling function will always return result of the same length as input. + \item \code{fill} defaults to \code{NA}. \item \code{fill} accepts only constant values. It does not support for \emph{na.locf} or other functions. - \item \code{align} defaults to \code{"right"}. + \item \code{align} defaults to \code{"right"}. \item \code{na.rm} is respected, and other functions are not needed when input contains \code{NA}. - \item integers and logical are always coerced to double. + \item integers and logical are always coerced to double. \item when \code{adaptive=FALSE} (default), then \code{n} must be a numeric vector. List is not accepted. \item when \code{adaptive=TRUE}, then \code{n} must be vector of length equal to \code{nrow(x)}, or list of such vectors. \item \code{partial} window feature is not supported, although it can - be accomplished by using \code{adaptive=TRUE}, see examples. - \code{NA} is always returned for incomplete windows. + be accomplished by using \code{adaptive=TRUE}, see + examples. \code{NA} is always returned for incomplete windows. } -} -\value{ - A list except when the input is a \code{vector} and - \code{length(n)==1} in which case a \code{vector} is returned. -} -\note{ + Be aware that rolling functions operates on the physical order of input. If the intent is to roll values in a vector by a logical window, for example an hour, or a day, one has to ensure that there are no gaps in diff --git a/src/data.table.h b/src/data.table.h index a6213b731..0a6eb207a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -206,9 +206,6 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); void frollsumFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); -void frollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose); -void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); -void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose); void frollapply(double *x, int64_t nx, double *w, int k, ans_t *ans, int align, double fill, SEXP call, SEXP rho, bool verbose); // frolladaptive.c @@ -218,9 +215,6 @@ void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f void fadaptiverollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); -void fadaptiverollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); -//void fadaptiverollmaxFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // does not exists as of now -void fadaptiverollmaxExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // frollR.c SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasNA, SEXP adaptive); diff --git a/src/froll.c b/src/froll.c index af72d2740..3ab7bd927 100644 --- a/src/froll.c +++ b/src/froll.c @@ -49,7 +49,7 @@ void frollmeanFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool long double w = 0.0; // sliding window aggregate bool truehasna = hasna>0; // flag to re-run with NA support if NAs detected if (!truehasna) { - int i; // iterator declared here because it is being used after for loop, it is int32 rather than int64 because first loop over k-1 observations cannot exceed int32 range + int i; // iterator declared here because it is being used after for loop for (i=0; idbl_v[i] = fill; // answers are fill for partial window @@ -217,7 +217,6 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } /* fast rolling sum */ -// see frollmean code for comments describing online implementations for rolling functions void frollsum(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose) { if (nx < k) { if (verbose) @@ -398,278 +397,6 @@ void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool } } -/* fast rolling max */ -// see frollmean code for comments describing online implementations for rolling functions -void frollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasna, bool verbose) { - if (nx < k) { - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: window width longer than input vector, returning all %f vector\n"), __func__, fill); - for (int i=0; idbl_v[i] = fill; - } - return; - } - double tic = 0; - if (verbose) - tic = omp_get_wtime(); - if (algo==0) { - frollmaxFast(x, nx, ans, k, fill, narm, hasna, verbose); - } else { // algo==1 - frollmaxExact(x, nx, ans, k, fill, narm, hasna, verbose); - } - if (ans->status < 3 && align < 1) { - int k_ = align==-1 ? k-1 : floor(k/2); - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: align %d, shift answer by %d\n"), __func__, align, -k_); - memmove((char *)ans->dbl_v, (char *)ans->dbl_v + (k_*sizeof(double)), (nx-k_)*sizeof(double)); - for (uint64_t i=nx-k_; idbl_v[i] = fill; - } - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); -} -static inline void windowmax(double *x, uint64_t o, int k, /* max of current window */ double *w, /* index of w within window */ uint64_t *iw) { - for (int i=0; i= w[0]: x[%d-%d+1] >= w[0]: %f >= %f: %d\n", i, o, x[o], i, k, x[o+i-k+1], w[0], x[o+i-k+1] >= w[0]); - if (x[o+i-k+1] >= w[0]) { // what if that is never satisfied? test! - iw[0] = o+i-k+1; - w[0] = x[iw[0]]; - } - } -} -static inline void windowmaxnarm(double *x, uint64_t o, int k, bool narm, /* NA counter */ int *nc, double *w, uint64_t *iw) { - for (int i=0; i= w[0]: x[%d-%d+1] >= w[0]: %f >= %f: %d\n", i, o, x[o], i, k, x[o+i-k+1], w[0], x[o+i-k+1] >= w[0]); - if (R_FINITE(x[o+i-k+1])) { - if (x[o+i-k+1] >= w[0]) { // what if that is never satisfied? test! - iw[0] = o+i-k+1; - w[0] = x[iw[0]]; - } - } else if (narm) { - nc[0]++; - } else { - w[0] = NA_REAL; - } - } -} -/* fast rolling max - fast - * fast online algorithm do single pass over elements keeping track of recent max and its index - * if index of max is within progressing window then it keeps running single pass - * whenever max is leaving the window (index of max is outside of iterator minus window size) then new maximum is computed via nested loop on current complete window - * new max is used to continue outer single pass as long as new max index is not leaving the running window - * should scale well for bigger window size, may carry overhead for small window, needs benchmarking - * TODO NA handling - */ -void frollmaxFast(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollmaxFast", (uint64_t)nx, k, hasna, (int)narm); - double w = R_NegInf; // window max - uint64_t cmax = 0; // counter of nested loops for verbose - uint64_t iw = 0; // index of window max - bool truehasna = hasna>0; - if (!truehasna) { - int i; - for (i=0; i= w) { // >= rather than > because we track most recent maximum using iw - iw = i; - w = x[iw]; - } - ans->dbl_v[i] = fill; - } - if (!truehasna && !R_FINITE(x[i])) { - if (x[i] >= w) { - iw = i; - w = x[iw]; - } else { - // ensure iw ok, case when all window was -Inf or NA? - } - ans->dbl_v[i] = w; - } else { - truehasna = true; - } - if (!truehasna) { - for (uint64_t i=k; i= w) { - //Rprintf("iteration %d, new max %f at %d, old max %f at %d\n", i, x[i], i, w, iw); - iw = i; - w = x[iw]; - } else { - if (iw > i-k) { // max is still within window - // w and iw are still ok - //Rprintf("iteration %d, max %f at %d bigger than current value %f\n", i, w, iw, x[i]); - } else { // max left the window, need to find max in this window - //Rprintf("iteration %d, max %f at %d left the window, call windowmax from %d of size %d\n", i, w, iw, i, k); - w = R_NegInf; - iw = i-k; - windowmax(x, i, k, &w, &iw); - //Rprintf("iteration %d, windowmax found new max %f at %d\n", i, w, iw); - cmax++; - } - } - ans->dbl_v[i] = w; - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: windowmax called %"PRIu64" time(s)\n"), __func__, cmax); - if (truehasna) { - if (hasna==-1) { - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - w = R_NegInf; - cmax = 0; - } - } else { - if (hasna==-1) { - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, skip non-NA attempt and run with extra care for NAs\n"), __func__); - w = R_NegInf; - cmax = 0; - } - } - if (truehasna) { - int nc = 0; - int i; - for (i=0; i= w) { - iw = i; - w = x[iw]; - } - } else { - nc++; - } - ans->dbl_v[i] = fill; - } - if (R_FINITE(x[i])) { - if (x[i] >= w) { - iw = i; - w = x[iw]; - } - } else { - nc++; - } - if (nc == 0) { - ans->dbl_v[i] = w; - } else if (nc == k) { - ans->dbl_v[i] = narm ? R_NegInf : NA_REAL; - } else { - ans->dbl_v[i] = narm ? w : NA_REAL; - } - for (uint64_t i=k; i= w) { - iw = i; - w = x[iw]; - } else { - if (iw > i-k) { // max is still within window - // w and iw are still ok - } else { // max left the window, need to find max in this window - w = R_NegInf; - iw = i-k; - windowmaxnarm(x, i, k, narm, &nc, &w, &iw); - //Rprintf("iteration %d, windowmaxnarm found new max %f at %d\n", i, w, iw); - cmax++; - } - } - ans->dbl_v[i] = w; - } else { - nc++; - } - if (!R_FINITE(x[i-k])) { // value leaving rolling window is non-finite, decrement NF counter - nc--; - } - if (nc == 0) { - ans->dbl_v[i] = w; - } else if (nc == k) { - ans->dbl_v[i] = narm ? R_NegInf : NA_REAL; - } else { - ans->dbl_v[i] = narm ? w : NA_REAL; - } - } - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: windowmaxnarm called %"PRIu64" time(s)\n"), __func__, cmax); - } -} -/* fast rolling max - exact - * for each observation in x compute max in window from scratch - * no proper support for NAs yet - */ -void frollmaxExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasna, bool verbose) { - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", window %d, hasna %d, narm %d\n"), "frollmaxExact", (uint64_t)nx, k, hasna, (int)narm); - for (int i=0; idbl_v[i] = fill; - } - bool truehasna = hasna>0; - if (!truehasna || !narm) { - #pragma omp parallel for num_threads(getDTthreads(nx, true)) - for (uint64_t i=k-1; i w) - w = x[i+j]; - } - if (R_FINITE(w)) { - ans->dbl_v[i] = w; - } else { - if (!narm) { - ans->dbl_v[i] = w; - } - truehasna = true; - } - } - if (truehasna) { - if (hasna==-1) { - ans->status = 2; - snprintf(end(ans->message[2]), 500, _("%s: hasNA=FALSE used but NA (or other non-finite) value(s) are present in input, use default hasNA=NA to avoid this warning"), __func__); - } - if (verbose) { - if (narm) { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, re-running with extra care for NAs\n"), __func__); - } else { - snprintf(end(ans->message[0]), 500, _("%s: NA (or other non-finite) value(s) are present in input, na.rm was FALSE so in 'exact' implementation NAs were handled already, no need to re-run\n"), __func__); - } - } - } - } - if (truehasna && narm) { - #pragma omp parallel for num_threads(getDTthreads(nx, true)) - for (uint64_t i=k-1; i w) { - w = x[i+j]; - } - } - if (nc < k) { - ans->dbl_v[i] = w; - } else { - ans->dbl_v[i] = R_NegInf; - } - } - } -} - /* fast rolling any R function * not plain C, not thread safe * R eval() allocates diff --git a/src/frollR.c b/src/frollR.c index ee4d7fd39..74cc7dd4e 100644 --- a/src/frollR.c +++ b/src/frollR.c @@ -108,8 +108,8 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "align", "rolling"); // # nocov - if (badaptive && ialign==0) // support for left added in #5441 - error(_("using adaptive TRUE and align 'center' is not implemented")); + if (badaptive && ialign!=1) + error(_("using adaptive TRUE and align argument different than 'right' is not implemented")); SEXP ans = PROTECT(allocVector(VECSXP, nk * nx)); protecti++; // allocate list to keep results if (verbose) @@ -132,13 +132,11 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX dx[i] = REAL(VECTOR_ELT(x, i)); // assign source columns to C pointers } - enum {MEAN, SUM, MAX} sfun; + enum {MEAN, SUM} sfun; if (!strcmp(CHAR(STRING_ELT(fun, 0)), "mean")) { sfun = MEAN; } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "sum")) { sfun = SUM; - } else if (!strcmp(CHAR(STRING_ELT(fun, 0)), "max")) { - sfun = MAX; } else { error(_("Internal error: invalid %s argument in %s function should have been caught earlier. Please report to the data.table issue tracker."), "fun", "rolling"); // # nocov } @@ -154,7 +152,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX int ihasna = // plain C tri-state boolean as integer LOGICAL(hasna)[0]==NA_LOGICAL ? 0 : // hasna NA, default, no info about NA LOGICAL(hasna)[0]==TRUE ? 1 : // hasna TRUE, might be some NAs - -1; // hasna FALSE, there should be no NAs // or there must be no NAs for rollmax #5441 + -1; // hasna FALSE, there should be no NAs unsigned int ialgo; // decode algo to integer if (!strcmp(CHAR(STRING_ELT(algo, 0)), "fast")) @@ -195,12 +193,6 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX else fadaptiverollsum(ialgo, dx[i], inx[i], &dans[i*nk+j], ikl[j], dfill, bnarm, ihasna, verbose); break; - case MAX : - if (!badaptive) - frollmax(ialgo, dx[i], inx[i], &dans[i*nk+j], iik[j], ialign, dfill, bnarm, ihasna, verbose); - else - fadaptiverollmax(ialgo, dx[i], inx[i], &dans[i*nk+j], ikl[j], dfill, bnarm, ihasna, verbose); - break; default: error(_("Internal error: Unknown sfun value in froll: %d"), sfun); // #nocov } diff --git a/src/frolladaptive.c b/src/frolladaptive.c index f0a40fb9c..e005cef69 100644 --- a/src/frolladaptive.c +++ b/src/frolladaptive.c @@ -364,80 +364,3 @@ void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fi } } } - -/* fast adaptive rolling max */ -void fadaptiverollmax(unsigned int algo, double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { - double tic = 0; - if (verbose) - tic = omp_get_wtime(); - if (algo==0 && verbose) { - //fadaptiverollmaxFast(x, nx, ans, k, fill, narm, hasna, verbose); // fadaptiverollmaxFast does not exists as of now - snprintf(end(ans->message[0]), 500, _("%s: algo %u not implemented, fall back to %u\n"), __func__, algo, (unsigned int) 1); - } - fadaptiverollmaxExact(x, nx, ans, k, fill, narm, hasna, verbose); - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: processing algo %u took %.3fs\n"), __func__, algo, omp_get_wtime()-tic); -} -//void fadaptiverollmaxFast(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose); // does not exists as of now -/* fast adaptive rolling max - exact - * for hasNA=FALSE it will not detect if any NAs were in the input, therefore could produce incorrect result, well documented - */ -void fadaptiverollmaxExact(double *x, uint64_t nx, ans_t *ans, int *k, double fill, bool narm, int hasna, bool verbose) { - if (verbose) - snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollmaxExact", (uint64_t)nx, hasna, (int) narm); - if (hasna==-1) { // fastest we can get for adaptive max as there is no algo='fast', therefore we drop any NA checks when hasNA=FALSE - #pragma omp parallel for num_threads(getDTthreads(nx, true)) - for (uint64_t i=0; idbl_v[i] = fill; - } else { - double w = R_NegInf; - for (int j=-k[i]+1; j<=0; j++) { //Rprintf("x[%d+%d] > w: %f > %f: %d\n", i, j, x[i+j], w, x[i+j] > w); - if (x[i+j] > w) - w = x[i+j]; - } - ans->dbl_v[i] = w; - } - } - } else { - if (narm) { - #pragma omp parallel for num_threads(getDTthreads(nx, true)) - for (uint64_t i=0; idbl_v[i] = fill; - } else { - int nc = 0; - double w = R_NegInf; - for (int j=-k[i]+1; j<=0; j++) { - if (ISNAN(x[i+j])) - nc++; - else if (x[i+j] > w) - w = x[i+j]; - } - if (nc < k[i]) - ans->dbl_v[i] = w; - else - ans->dbl_v[i] = R_NegInf; - } - } - } else { - #pragma omp parallel for num_threads(getDTthreads(nx, true)) - for (uint64_t i=0; idbl_v[i] = fill; - } else { - double w = R_NegInf; - for (int j=-k[i]+1; j<=0; j++) { - if (ISNAN(x[i+j])) { - w = NA_REAL; - break; - } else if (x[i+j] > w) { - w = x[i+j]; - } - } - ans->dbl_v[i] = w; - } - } - } - } -}