diff --git a/DESCRIPTION b/DESCRIPTION index 8937532..be46b5d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,27 +1,27 @@ Package: tdigest Type: Package -Title: Wicked Fast, Accurate Quantiles Using 't-Digests' +Title: Wicked Fast, Accurate Quantiles Using t-Digests Version: 0.3.0 -Date: 2019-07-21 +Date: 2019-07-25 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-5670-2640")), person("Ted", "Dunning", role = "aut", comment = "t-Digest algorithm; "), - person("ajwerner", "", role = "aut", + person("Andrew", "Werner", role = "aut", comment = "Original C+ code; ") ) -Maintainer: Bob Rudis -Description: The 't-digest' construction algorithm uses a variant of 1-dimensional - 'k-means' clustering to produce a very compact data structure that allows - accurate estimation of quantiles. This 't-digest' data structure can be used +Description: The t-Digest construction algorithm, by + Dunning et al., (2019) <arXiv:1902.04023>, uses a variant of 1-dimensional + k-means clustering to produce a very compact data structure that allows + accurate estimation of quantiles. This t-Digest data structure can be used to estimate quantiles, compute other rank statistics or even to estimate - related measures like trimmed means. The advantage of the 't-digest' over - previous digests for this purpose is that the 't-digest' handles data with - full floating point resolution. With small changes, the 't-digest' can handle - values from any ordered set for which we can compute something akin to a mean. - The accuracy of quantile estimates produced by 't-digests' can be orders of - magnitude more accurate than those produced by previous digest algorithms. + related measures like trimmed means. The advantage of the t-Digest over + previous digests for this purpose is that the t-Digest handles data with + full floating point resolution. 
The accuracy of quantile estimates produced + by t-Digests can be orders of magnitude more accurate than those produced + by previous digest algorithms. Methods are provided to create and update + t-Digests and retrieve quantiles from the accumulated distributions. URL: https://gitlab.com/hrbrmstr/tdigest BugReports: https://gitlab.com/hrbrmstr/tdigest/issues Copyright: file inst/COPYRIGHTS diff --git a/R/create.R b/R/create.R index ad2772f..63d9692 100644 --- a/R/create.R +++ b/R/create.R @@ -1,31 +1,31 @@ -#' Create a new t-digest histogram from a vector +#' Create a new t-Digest histogram from a vector #' -#' The t-digest construction algorithm uses a variant of 1-dimensional +#' The t-Digest construction algorithm, by Dunning et al., uses a variant of 1-dimensional #' k-means clustering to produce a very compact data structure that allows -#' accurate estimation of quantiles. This t-digest data structure can be used +#' accurate estimation of quantiles. This t-Digest data structure can be used #' to estimate quantiles, compute other rank statistics or even to estimate -#' related measures like trimmed means. The advantage of the t-digest over -#' previous digests for this purpose is that the t-digest handles data with -#' full floating point resolution. With small changes, the t-digest can handle -#' values from any ordered set for which we can compute something akin to a mean. -#' The accuracy of quantile estimates produced by t-digests can be orders of -#' magnitude more accurate than those produced by previous digest algorithms. +#' related measures like trimmed means. The advantage of the t-Digest over +#' previous digests for this purpose is that the t-Digest handles data with +#' full floating point resolution. The accuracy of quantile estimates produced +#' by t-Digests can be orders of magnitude more accurate than those produced +#' by previous digest algorithms. 
Methods are provided to create and update +#' t-Digests and retrieve quantiles from the accumulated distributions. #' -#' @param vec vector (will be converted to `double` if not already double). NOTE that this -#' is ALTREP-aware and will not materialize the passed-in object in order to -#' add the values to the t-Digest. +#' @param vec vector (will be converted to `double` if not already double). +#' NOTE that this is ALTREP-aware and will not materialize the passed-in +#' object in order to add the values to the t-Digest. #' @param compression the input compression value; should be >= 1.0; this -#' will control how aggressively the TDigest compresses data together. +#' will control how aggressively the t-Digest compresses data together. #' The original t-Digest paper suggests using a value of 100 for a good #' balance between precision and efficiency. It will land at very small #' (think like 1e-6 percentile points) errors at extreme points in the #' distribution, and compression ratios of around 500 for large data sets #' (~1 million datapoints). Defaults to 100. 
-#' @export -#' @return a tdigest object -#' @references +#' @return a `tdigest` object +#' @references [Computing Extremely Accurate Quantiles Using t-Digests](https://arxiv.org/abs/1902.04023) #' @importFrom stats quantile #' @useDynLib tdigest, .registration = TRUE +#' @export #' @examples #' set.seed(1492) #' x <- sample(0:100, 1000000, replace = TRUE) @@ -37,13 +37,13 @@ tdigest <- function(vec, compression=100) { .Call("Rtdig", vec=vec, compression=compression) } -#' Calculate sample quantiles from a t-digest +#' Calculate sample quantiles from a t-Digest #' -#' @param td t-digest object +#' @param td t-Digest object #' @param probs numeric vector of probabilities with values in range 0:1 #' @export -#' @return a numeric vector -#' @references +#' @return a `numeric` vector containing the requested quantile values +#' @references #' @examples #' set.seed(1492) #' x <- sample(0:100, 1000000, replace = TRUE) @@ -66,7 +66,7 @@ quantile.tdigest <- function(x, probs = seq(0, 1, 0.25), ...) { } #' @rdname tdigest -#' @param x t-tigest object +#' @param x `tdigest` object #' @param ... unused #' @keywords internal #' @export @@ -87,26 +87,28 @@ print.tdigest <- function(x, ...) { #' Allocate a new histogram #' #' @param compression the input compression value; should be >= 1.0; this -#' will control how aggressively the TDigest compresses data together. +#' will control how aggressively the t-Digest compresses data together. #' The original t-Digest paper suggests using a value of 100 for a good #' balance between precision and efficiency. It will land at very small #' (think like 1e-6 percentile points) errors at extreme points in the #' distribution, and compression ratios of around 500 for large data sets #' (~1 million datapoints). Defaults to 100. 
#' @export -#' @return a tdigest object -#' @references +#' @return a `tdigest` object +#' @references #' @examples #' td <- td_create(10) td_create <- function(compression=100) { + stopifnot(compression >= 1.0) compression <- as.double(compression[1]) .Call("Rtd_create", compression=compression, PACKAGE="tdigest") } -#' Total items contained in the t-digest +#' Total items contained in the t-Digest #' -#' @param td t-digest object +#' @param td t-Digest object #' @export +#' @return `double` containing the size of the t-Digest #' @examples #' td <- td_create(10) #' td_add(td, 0, 1) @@ -118,11 +120,12 @@ td_total_count <- function(td) { .Call("Rtd_total_count", td=td, PACKAGE="tdigest") } -#' Add a value to the t-digest with the specified count +#' Add a value to the t-Digest with the specified count #' -#' @param td t-digest object +#' @param td t-Digest object #' @param val value #' @param count count +#' @return the original, updated `tdigest` object #' @export #' @examples #' td <- td_create(10) @@ -138,9 +141,10 @@ td_add <- function(td, val, count) { #' Return the value at the specified quantile #' -#' @param td t-digest object +#' @param td t-Digest object #' @param q quantile (range 0:1) #' @export +#' @return the computed quantile (`double`) #' @examples #' td <- td_create(10) #' @@ -160,8 +164,9 @@ td_value_at <- function(td, q) { #' Return the quantile of the value #' -#' @param td t-digest object +#' @param td t-Digest object #' @param val value +#' @return the computed quantile (`double`) #' @export td_quantile_of <- function(td, val) { stopifnot(inherits(td, "tdigest")) @@ -170,11 +175,11 @@ td_quantile_of <- function(td, val) { .Call("Rtd_quantile_of", tdig=td, val=val, PACKAGE="tdigest") } -#' Merge one t-digest into another +#' Merge one t-Digest into another #' -#' @param from,into t-digests +#' @param from,into t-Digests #' @return `into` -#' @return a tdigest object +#' @return a `tdigest` object #' @export td_merge <- function(from, into) { 
stopifnot(inherits(from, "tdigest")) diff --git a/README.Rmd b/README.Rmd index 380d167..b4a9095 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,9 +19,18 @@ Wicked Fast, Accurate Quantiles Using 't-Digests' ## Description -The t-digest construction algorithm uses a variant of 1-dimensional k-means clustering to produce a very compact data structure that allows accurate estimation of quantiles. This t-digest data structure can be used to estimate quantiles, compute other rank statistics or even to estimate related measures like trimmed means. The advantage of the t-digest over previous digests for this purpose is that the t-digest handles data with full floating point resolution. With small changes, the t-digest can handle values from any ordered set for which we can compute something akin to a mean. The accuracy of quantile estimates produced by t-digests can be orders of magnitude more accurate than those produced by previous digest algorithms. - -See [the original paper by Ted Dunning](https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf) for more details on t-Digests. +The t-Digest construction algorithm uses a variant of 1-dimensional +k-means clustering to produce a very compact data structure that allows +accurate estimation of quantiles. This t-Digest data structure can be used +to estimate quantiles, compute other rank statistics or even to estimate +related measures like trimmed means. The advantage of the t-Digest over +previous digests for this purpose is that the t-Digest handles data with +full floating point resolution. The accuracy of quantile estimates produced +by t-Digests can be orders of magnitude more accurate than those produced +by previous digest algorithms. Methods are provided to create and update +t-Digests and retrieve quantiles from the accumulated distributions. + +See [the original paper by Ted Dunning & Otmar Ertl](https://arxiv.org/abs/1902.04023) for more details on t-Digests. 
## What's Inside The Tin diff --git a/inst/COPYRIGHTS b/inst/COPYRIGHTS index a5e5b0f..5e1a721 100644 --- a/inst/COPYRIGHTS +++ b/inst/COPYRIGHTS @@ -1,8 +1,15 @@ -The R code and src/tdigest-main.c, src/init.c are MIT-licensed by the package author. +The R code and src/tdigest-main.c, src/init.c are MIT-licensed +by the package author. -src/tdigest.h, src/tdigest.c are MIT-licensed & Copyright (c) 2018 ajwerner [REF: https://github.com/ajwerner/tdigestc; license below copied from that repository] +src/tdigest.h, src/tdigest.c are MIT-licensed & Copyright (c) 2018 +Andrew Werner [REF: https://github.com/ajwerner/tdigestc; license below +copied from that repository] -The original t-Digest implementation and algorithm are have the following license: +The t-Digest original algorithm and Java implementation +(which is the base of Andrew Werner's re-implementation) are +Copyright (c) 2015 Ted Dunning and use the same Apache License 2.0, below. + +------------ Apache License Version 2.0, January 2004 diff --git a/man/td_add.Rd b/man/td_add.Rd index 12426a2..4d04d58 100644 --- a/man/td_add.Rd +++ b/man/td_add.Rd @@ -2,19 +2,22 @@ % Please edit documentation in R/create.R \name{td_add} \alias{td_add} -\title{Add a value to the t-digest with the specified count} +\title{Add a value to the t-Digest with the specified count} \usage{ td_add(td, val, count) } \arguments{ -\item{td}{t-digest object} +\item{td}{t-Digest object} \item{val}{value} \item{count}{count} } +\value{ +the original, updated \code{tdigest} object +} \description{ -Add a value to the t-digest with the specified count +Add a value to the t-Digest with the specified count } \examples{ td <- td_create(10) diff --git a/man/td_create.Rd b/man/td_create.Rd index caf32b4..1efa60a 100644 --- a/man/td_create.Rd +++ b/man/td_create.Rd @@ -11,7 +11,7 @@ is_tdigest(td) } \arguments{ \item{compression}{the input compression value; should be >= 1.0; this -will control how aggressively the TDigest compresses data together. 
+will control how aggressively the t-Digest compresses data together. The original t-Digest paper suggests using a value of 100 for a good balance between precision and efficiency. It will land at very small (think like 1e-6 percentile points) errors at extreme points in the @@ -21,7 +21,7 @@ distribution, and compression ratios of around 500 for large data sets \item{td}{t-digest object} } \value{ -a tdigest object +a \code{tdigest} object } \description{ Allocate a new histogram @@ -30,5 +30,5 @@ Allocate a new histogram td <- td_create(10) } \references{ -\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf} +\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf} } diff --git a/man/td_merge.Rd b/man/td_merge.Rd index e512d43..e1bfaec 100644 --- a/man/td_merge.Rd +++ b/man/td_merge.Rd @@ -2,18 +2,18 @@ % Please edit documentation in R/create.R \name{td_merge} \alias{td_merge} -\title{Merge one t-digest into another} +\title{Merge one t-Digest into another} \usage{ td_merge(from, into) } \arguments{ -\item{from, into}{t-digests} +\item{from, into}{t-Digests} } \value{ \code{into} -a tdigest object +a \code{tdigest} object } \description{ -Merge one t-digest into another +Merge one t-Digest into another } diff --git a/man/td_quantile_of.Rd b/man/td_quantile_of.Rd index 09e87de..1aba486 100644 --- a/man/td_quantile_of.Rd +++ b/man/td_quantile_of.Rd @@ -7,10 +7,13 @@ td_quantile_of(td, val) } \arguments{ -\item{td}{t-digest object} +\item{td}{t-Digest object} \item{val}{value} } +\value{ +the computed quantile (\code{double}) +} \description{ Return the quantile of the value } diff --git a/man/td_total_count.Rd b/man/td_total_count.Rd index 971e7c3..689d8c4 100644 --- a/man/td_total_count.Rd +++ b/man/td_total_count.Rd @@ -3,19 +3,22 @@ \name{td_total_count} \alias{td_total_count} \alias{length.tdigest} -\title{Total items contained in the t-digest} +\title{Total items contained in the 
t-Digest} \usage{ td_total_count(td) \method{length}{tdigest}(x) } \arguments{ -\item{td}{t-digest object} +\item{td}{t-Digest object} \item{x}{a tdigest object} } +\value{ +\code{double} containing the size of the t-Digest +} \description{ -Total items contained in the t-digest +Total items contained in the t-Digest } \examples{ td <- td_create(10) diff --git a/man/td_value_at.Rd b/man/td_value_at.Rd index 3ab239d..ae34cb3 100644 --- a/man/td_value_at.Rd +++ b/man/td_value_at.Rd @@ -10,7 +10,7 @@ td_value_at(td, q) \method{[}{tdigest}(x, i, ...) } \arguments{ -\item{td}{t-digest object} +\item{td}{t-Digest object} \item{q}{quantile (range 0:1)} @@ -20,6 +20,9 @@ td_value_at(td, q) \item{...}{unused} } +\value{ +the computed quantile (\code{double}) +} \description{ Return the value at the specified quantile } diff --git a/man/tdigest.Rd b/man/tdigest.Rd index e2a073d..7a8b285 100644 --- a/man/tdigest.Rd +++ b/man/tdigest.Rd @@ -3,43 +3,43 @@ \name{tdigest} \alias{tdigest} \alias{print.tdigest} -\title{Create a new t-digest histogram from a vector} +\title{Create a new t-Digest histogram from a vector} \usage{ tdigest(vec, compression = 100) \method{print}{tdigest}(x, ...) } \arguments{ -\item{vec}{vector (will be converted to \code{double} if not already double). NOTE that this -is ALTREP-aware and will not materialize the passed-in object in order to -add the values to the t-Digest.} +\item{vec}{vector (will be converted to \code{double} if not already double). +NOTE that this is ALTREP-aware and will not materialize the passed-in +object in order to add the values to the t-Digest.} \item{compression}{the input compression value; should be >= 1.0; this -will control how aggressively the TDigest compresses data together. +will control how aggressively the t-Digest compresses data together. The original t-Digest paper suggests using a value of 100 for a good balance between precision and efficiency. 
It will land at very small (think like 1e-6 percentile points) errors at extreme points in the distribution, and compression ratios of around 500 for large data sets (~1 million datapoints). Defaults to 100.} -\item{x}{t-tigest object} +\item{x}{\code{tdigest} object} \item{...}{unused} } \value{ -a tdigest object +a \code{tdigest} object } \description{ -The t-digest construction algorithm uses a variant of 1-dimensional +The t-Digest construction algorithm, by Dunning et al., uses a variant of 1-dimensional k-means clustering to produce a very compact data structure that allows -accurate estimation of quantiles. This t-digest data structure can be used +accurate estimation of quantiles. This t-Digest data structure can be used to estimate quantiles, compute other rank statistics or even to estimate -related measures like trimmed means. The advantage of the t-digest over -previous digests for this purpose is that the t-digest handles data with -full floating point resolution. With small changes, the t-digest can handle -values from any ordered set for which we can compute something akin to a mean. -The accuracy of quantile estimates produced by t-digests can be orders of -magnitude more accurate than those produced by previous digest algorithms. +related measures like trimmed means. The advantage of the t-Digest over +previous digests for this purpose is that the t-Digest handles data with +full floating point resolution. The accuracy of quantile estimates produced +by t-Digests can be orders of magnitude more accurate than those produced +by previous digest algorithms. Methods are provided to create and update +t-Digests and retrieve quantiles from the accumulated distributions. 
} \examples{ set.seed(1492) @@ -49,6 +49,6 @@ tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1)) quantile(td) } \references{ -\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf} +\href{https://arxiv.org/abs/1902.04023}{Computing Extremely Accurate Quantiles Using t-Digests} } \keyword{internal} diff --git a/man/tquantile.Rd b/man/tquantile.Rd index e5e2cc1..5f9ee4a 100644 --- a/man/tquantile.Rd +++ b/man/tquantile.Rd @@ -3,14 +3,14 @@ \name{tquantile} \alias{tquantile} \alias{quantile.tdigest} -\title{Calculate sample quantiles from a t-digest} +\title{Calculate sample quantiles from a t-Digest} \usage{ tquantile(td, probs) \method{quantile}{tdigest}(x, probs = seq(0, 1, 0.25), ...) } \arguments{ -\item{td}{t-digest object} +\item{td}{t-Digest object} \item{probs}{numeric vector of probabilities with values in range 0:1} @@ -19,10 +19,10 @@ tquantile(td, probs) \item{...}{unused} } \value{ -a numeric vector +a \code{numeric} vector containing the requested quantile values } \description{ -Calculate sample quantiles from a t-digest +Calculate sample quantiles from a t-Digest } \examples{ set.seed(1492) @@ -32,5 +32,5 @@ tquantile(td, c(0, .01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1)) quantile(td) } \references{ -\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf} +\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf} } diff --git a/src/tdigest.h b/src/tdigest.h index 6310ec3..8357ee3 100644 --- a/src/tdigest.h +++ b/src/tdigest.h @@ -6,17 +6,15 @@ // Copyright (c) 2018 Andrew Werner, All rights reserved. // // tdigest is an implementation of Ted Dunning's streaming quantile estimation -// data structure. +// data structure. // This implementation is intended to be like the new MergingHistogram. // It focuses on being in portable C that should be easy to integrate into other -// languages. 
In particular it provides mechanisms to preallocate all memory +// languages. In particular it provides mechanisms to preallocate all memory // at construction time. // -// The implementation is a direct descendent of +// The implementation is a direct descendent of // https://github.com/tdunning/t-digest/ // -// TODO: add a Ted Dunning Copyright notice. -// //////////////////////////////////////////////////////////////////////////////// #include