diff --git a/DESCRIPTION b/DESCRIPTION
index 8937532..be46b5d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,27 +1,27 @@
Package: tdigest
Type: Package
-Title: Wicked Fast, Accurate Quantiles Using 't-Digests'
+Title: Wicked Fast, Accurate Quantiles Using t-Digests
Version: 0.3.0
-Date: 2019-07-21
+Date: 2019-07-25
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
person("Ted", "Dunning", role = "aut",
comment = "t-Digest algorithm; "),
- person("ajwerner", "", role = "aut",
+ person("Andrew", "Werner", role = "aut",
comment = "Original C+ code; ")
)
-Maintainer: Bob Rudis
-Description: The 't-digest' construction algorithm uses a variant of 1-dimensional
- 'k-means' clustering to produce a very compact data structure that allows
- accurate estimation of quantiles. This 't-digest' data structure can be used
+Description: The t-Digest construction algorithm, by
+ Dunning et al. (2019) <arXiv:1902.04023>, uses a variant of 1-dimensional
+ k-means clustering to produce a very compact data structure that allows
+ accurate estimation of quantiles. This t-Digest data structure can be used
to estimate quantiles, compute other rank statistics or even to estimate
- related measures like trimmed means. The advantage of the 't-digest' over
- previous digests for this purpose is that the 't-digest' handles data with
- full floating point resolution. With small changes, the 't-digest' can handle
- values from any ordered set for which we can compute something akin to a mean.
- The accuracy of quantile estimates produced by 't-digests' can be orders of
- magnitude more accurate than those produced by previous digest algorithms.
+ related measures like trimmed means. The advantage of the t-Digest over
+ previous digests for this purpose is that the t-Digest handles data with
+ full floating point resolution. The accuracy of quantile estimates produced
+ by t-Digests can be orders of magnitude more accurate than those produced
+ by previous digest algorithms. Methods are provided to create and update
+ t-Digests and retrieve quantiles from the accumulated distributions.
URL: https://gitlab.com/hrbrmstr/tdigest
BugReports: https://gitlab.com/hrbrmstr/tdigest/issues
Copyright: file inst/COPYRIGHTS
diff --git a/R/create.R b/R/create.R
index ad2772f..63d9692 100644
--- a/R/create.R
+++ b/R/create.R
@@ -1,31 +1,31 @@
-#' Create a new t-digest histogram from a vector
+#' Create a new t-Digest histogram from a vector
#'
-#' The t-digest construction algorithm uses a variant of 1-dimensional
+#' The t-Digest construction algorithm, by Dunning et al., uses a variant of 1-dimensional
#' k-means clustering to produce a very compact data structure that allows
-#' accurate estimation of quantiles. This t-digest data structure can be used
+#' accurate estimation of quantiles. This t-Digest data structure can be used
#' to estimate quantiles, compute other rank statistics or even to estimate
-#' related measures like trimmed means. The advantage of the t-digest over
-#' previous digests for this purpose is that the t-digest handles data with
-#' full floating point resolution. With small changes, the t-digest can handle
-#' values from any ordered set for which we can compute something akin to a mean.
-#' The accuracy of quantile estimates produced by t-digests can be orders of
-#' magnitude more accurate than those produced by previous digest algorithms.
+#' related measures like trimmed means. The advantage of the t-Digest over
+#' previous digests for this purpose is that the t-Digest handles data with
+#' full floating point resolution. The accuracy of quantile estimates produced
+#' by t-Digests can be orders of magnitude more accurate than those produced
+#' by previous digest algorithms. Methods are provided to create and update
+#' t-Digests and retrieve quantiles from the accumulated distributions.
#'
-#' @param vec vector (will be converted to `double` if not already double). NOTE that this
-#' is ALTREP-aware and will not materialize the passed-in object in order to
-#' add the values to the t-Digest.
+#' @param vec vector (will be converted to `double` if not already double).
+#' NOTE that this is ALTREP-aware and will not materialize the passed-in
+#' object in order to add the values to the t-Digest.
#' @param compression the input compression value; should be >= 1.0; this
-#' will control how aggressively the TDigest compresses data together.
+#' will control how aggressively the t-Digest compresses data together.
#' The original t-Digest paper suggests using a value of 100 for a good
#' balance between precision and efficiency. It will land at very small
#' (think like 1e-6 percentile points) errors at extreme points in the
#' distribution, and compression ratios of around 500 for large data sets
#' (~1 million datapoints). Defaults to 100.
-#' @export
-#' @return a tdigest object
-#' @references
+#' @return a `tdigest` object
+#' @references [Computing Extremely Accurate Quantiles Using t-Digests](https://arxiv.org/abs/1902.04023)
#' @importFrom stats quantile
#' @useDynLib tdigest, .registration = TRUE
+#' @export
#' @examples
#' set.seed(1492)
#' x <- sample(0:100, 1000000, replace = TRUE)
@@ -37,13 +37,13 @@ tdigest <- function(vec, compression=100) {
.Call("Rtdig", vec=vec, compression=compression)
}
-#' Calculate sample quantiles from a t-digest
+#' Calculate sample quantiles from a t-Digest
#'
-#' @param td t-digest object
+#' @param td t-Digest object
#' @param probs numeric vector of probabilities with values in range 0:1
#' @export
-#' @return a numeric vector
-#' @references
+#' @return a `numeric` vector containing the requested quantile values
+#' @references
#' @examples
#' set.seed(1492)
#' x <- sample(0:100, 1000000, replace = TRUE)
@@ -66,7 +66,7 @@ quantile.tdigest <- function(x, probs = seq(0, 1, 0.25), ...) {
}
#' @rdname tdigest
-#' @param x t-tigest object
+#' @param x `tdigest` object
#' @param ... unused
#' @keywords internal
#' @export
@@ -87,26 +87,28 @@ print.tdigest <- function(x, ...) {
#' Allocate a new histogram
#'
#' @param compression the input compression value; should be >= 1.0; this
-#' will control how aggressively the TDigest compresses data together.
+#' will control how aggressively the t-Digest compresses data together.
#' The original t-Digest paper suggests using a value of 100 for a good
#' balance between precision and efficiency. It will land at very small
#' (think like 1e-6 percentile points) errors at extreme points in the
#' distribution, and compression ratios of around 500 for large data sets
#' (~1 million datapoints). Defaults to 100.
#' @export
-#' @return a tdigest object
-#' @references
+#' @return a `tdigest` object
+#' @references
#' @examples
#' td <- td_create(10)
td_create <- function(compression=100) {
+ stopifnot(compression >= 1.0)
compression <- as.double(compression[1])
.Call("Rtd_create", compression=compression, PACKAGE="tdigest")
}
-#' Total items contained in the t-digest
+#' Total items contained in the t-Digest
#'
-#' @param td t-digest object
+#' @param td t-Digest object
#' @export
+#' @return `double` containing the size of the t-Digest
#' @examples
#' td <- td_create(10)
#' td_add(td, 0, 1)
@@ -118,11 +120,12 @@ td_total_count <- function(td) {
.Call("Rtd_total_count", td=td, PACKAGE="tdigest")
}
-#' Add a value to the t-digest with the specified count
+#' Add a value to the t-Digest with the specified count
#'
-#' @param td t-digest object
+#' @param td t-Digest object
#' @param val value
#' @param count count
+#' @return the original, updated `tdigest` object
#' @export
#' @examples
#' td <- td_create(10)
@@ -138,9 +141,10 @@ td_add <- function(td, val, count) {
#' Return the value at the specified quantile
#'
-#' @param td t-digest object
+#' @param td t-Digest object
#' @param q quantile (range 0:1)
#' @export
+#' @return the computed quantile (`double`)
#' @examples
#' td <- td_create(10)
#'
@@ -160,8 +164,9 @@ td_value_at <- function(td, q) {
#' Return the quantile of the value
#'
-#' @param td t-digest object
+#' @param td t-Digest object
#' @param val value
+#' @return the computed quantile (`double`)
#' @export
td_quantile_of <- function(td, val) {
stopifnot(inherits(td, "tdigest"))
@@ -170,11 +175,11 @@ td_quantile_of <- function(td, val) {
.Call("Rtd_quantile_of", tdig=td, val=val, PACKAGE="tdigest")
}
-#' Merge one t-digest into another
+#' Merge one t-Digest into another
#'
-#' @param from,into t-digests
+#' @param from,into t-Digests
#' @return `into`
-#' @return a tdigest object
+#' @return a `tdigest` object
#' @export
td_merge <- function(from, into) {
stopifnot(inherits(from, "tdigest"))
diff --git a/README.Rmd b/README.Rmd
index 380d167..b4a9095 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -19,9 +19,18 @@ Wicked Fast, Accurate Quantiles Using 't-Digests'
## Description
-The t-digest construction algorithm uses a variant of 1-dimensional k-means clustering to produce a very compact data structure that allows accurate estimation of quantiles. This t-digest data structure can be used to estimate quantiles, compute other rank statistics or even to estimate related measures like trimmed means. The advantage of the t-digest over previous digests for this purpose is that the t-digest handles data with full floating point resolution. With small changes, the t-digest can handle values from any ordered set for which we can compute something akin to a mean. The accuracy of quantile estimates produced by t-digests can be orders of magnitude more accurate than those produced by previous digest algorithms.
-
-See [the original paper by Ted Dunning](https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf) for more details on t-Digests.
+The t-Digest construction algorithm uses a variant of 1-dimensional
+k-means clustering to produce a very compact data structure that allows
+accurate estimation of quantiles. This t-Digest data structure can be used
+to estimate quantiles, compute other rank statistics or even to estimate
+related measures like trimmed means. The advantage of the t-Digest over
+previous digests for this purpose is that the t-Digest handles data with
+full floating point resolution. The accuracy of quantile estimates produced
+by t-Digests can be orders of magnitude more accurate than those produced
+by previous digest algorithms. Methods are provided to create and update
+t-Digests and retrieve quantiles from the accumulated distributions.
+
+See [the original paper by Ted Dunning & Otmar Ertl](https://arxiv.org/abs/1902.04023) for more details on t-Digests.
## What's Inside The Tin
diff --git a/inst/COPYRIGHTS b/inst/COPYRIGHTS
index a5e5b0f..5e1a721 100644
--- a/inst/COPYRIGHTS
+++ b/inst/COPYRIGHTS
@@ -1,8 +1,15 @@
-The R code and src/tdigest-main.c, src/init.c are MIT-licensed by the package author.
+The R code and src/tdigest-main.c, src/init.c are MIT-licensed
+by the package author.
-src/tdigest.h, src/tdigest.c are MIT-licensed & Copyright (c) 2018 ajwerner [REF: https://github.com/ajwerner/tdigestc; license below copied from that repository]
+src/tdigest.h, src/tdigest.c are MIT-licensed & Copyright (c) 2018
+Andrew Werner [REF: https://github.com/ajwerner/tdigestc; license below
+copied from that repository]
-The original t-Digest implementation and algorithm are have the following license:
+The original t-Digest algorithm and its Java implementation
+(which is the basis of Andrew Werner's re-implementation) are
+Copyright (c) 2015 Ted Dunning and use the Apache License 2.0, reproduced below.
+
+------------
Apache License
Version 2.0, January 2004
diff --git a/man/td_add.Rd b/man/td_add.Rd
index 12426a2..4d04d58 100644
--- a/man/td_add.Rd
+++ b/man/td_add.Rd
@@ -2,19 +2,22 @@
% Please edit documentation in R/create.R
\name{td_add}
\alias{td_add}
-\title{Add a value to the t-digest with the specified count}
+\title{Add a value to the t-Digest with the specified count}
\usage{
td_add(td, val, count)
}
\arguments{
-\item{td}{t-digest object}
+\item{td}{t-Digest object}
\item{val}{value}
\item{count}{count}
}
+\value{
+the original, updated \code{tdigest} object
+}
\description{
-Add a value to the t-digest with the specified count
+Add a value to the t-Digest with the specified count
}
\examples{
td <- td_create(10)
diff --git a/man/td_create.Rd b/man/td_create.Rd
index caf32b4..1efa60a 100644
--- a/man/td_create.Rd
+++ b/man/td_create.Rd
@@ -11,7 +11,7 @@ is_tdigest(td)
}
\arguments{
\item{compression}{the input compression value; should be >= 1.0; this
-will control how aggressively the TDigest compresses data together.
+will control how aggressively the t-Digest compresses data together.
The original t-Digest paper suggests using a value of 100 for a good
balance between precision and efficiency. It will land at very small
(think like 1e-6 percentile points) errors at extreme points in the
@@ -21,7 +21,7 @@ distribution, and compression ratios of around 500 for large data sets
\item{td}{t-digest object}
}
\value{
-a tdigest object
+a \code{tdigest} object
}
\description{
Allocate a new histogram
@@ -30,5 +30,5 @@ Allocate a new histogram
td <- td_create(10)
}
\references{
-\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf}
+\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf}
}
diff --git a/man/td_merge.Rd b/man/td_merge.Rd
index e512d43..e1bfaec 100644
--- a/man/td_merge.Rd
+++ b/man/td_merge.Rd
@@ -2,18 +2,18 @@
% Please edit documentation in R/create.R
\name{td_merge}
\alias{td_merge}
-\title{Merge one t-digest into another}
+\title{Merge one t-Digest into another}
\usage{
td_merge(from, into)
}
\arguments{
-\item{from, into}{t-digests}
+\item{from, into}{t-Digests}
}
\value{
\code{into}
-a tdigest object
+a \code{tdigest} object
}
\description{
-Merge one t-digest into another
+Merge one t-Digest into another
}
diff --git a/man/td_quantile_of.Rd b/man/td_quantile_of.Rd
index 09e87de..1aba486 100644
--- a/man/td_quantile_of.Rd
+++ b/man/td_quantile_of.Rd
@@ -7,10 +7,13 @@
td_quantile_of(td, val)
}
\arguments{
-\item{td}{t-digest object}
+\item{td}{t-Digest object}
\item{val}{value}
}
+\value{
+the computed quantile (\code{double})
+}
\description{
Return the quantile of the value
}
diff --git a/man/td_total_count.Rd b/man/td_total_count.Rd
index 971e7c3..689d8c4 100644
--- a/man/td_total_count.Rd
+++ b/man/td_total_count.Rd
@@ -3,19 +3,22 @@
\name{td_total_count}
\alias{td_total_count}
\alias{length.tdigest}
-\title{Total items contained in the t-digest}
+\title{Total items contained in the t-Digest}
\usage{
td_total_count(td)
\method{length}{tdigest}(x)
}
\arguments{
-\item{td}{t-digest object}
+\item{td}{t-Digest object}
\item{x}{a tdigest object}
}
+\value{
+\code{double} containing the size of the t-Digest
+}
\description{
-Total items contained in the t-digest
+Total items contained in the t-Digest
}
\examples{
td <- td_create(10)
diff --git a/man/td_value_at.Rd b/man/td_value_at.Rd
index 3ab239d..ae34cb3 100644
--- a/man/td_value_at.Rd
+++ b/man/td_value_at.Rd
@@ -10,7 +10,7 @@ td_value_at(td, q)
\method{[}{tdigest}(x, i, ...)
}
\arguments{
-\item{td}{t-digest object}
+\item{td}{t-Digest object}
\item{q}{quantile (range 0:1)}
@@ -20,6 +20,9 @@ td_value_at(td, q)
\item{...}{unused}
}
+\value{
+the computed quantile (\code{double})
+}
\description{
Return the value at the specified quantile
}
diff --git a/man/tdigest.Rd b/man/tdigest.Rd
index e2a073d..7a8b285 100644
--- a/man/tdigest.Rd
+++ b/man/tdigest.Rd
@@ -3,43 +3,43 @@
\name{tdigest}
\alias{tdigest}
\alias{print.tdigest}
-\title{Create a new t-digest histogram from a vector}
+\title{Create a new t-Digest histogram from a vector}
\usage{
tdigest(vec, compression = 100)
\method{print}{tdigest}(x, ...)
}
\arguments{
-\item{vec}{vector (will be converted to \code{double} if not already double). NOTE that this
-is ALTREP-aware and will not materialize the passed-in object in order to
-add the values to the t-Digest.}
+\item{vec}{vector (will be converted to \code{double} if not already double).
+NOTE that this is ALTREP-aware and will not materialize the passed-in
+object in order to add the values to the t-Digest.}
\item{compression}{the input compression value; should be >= 1.0; this
-will control how aggressively the TDigest compresses data together.
+will control how aggressively the t-Digest compresses data together.
The original t-Digest paper suggests using a value of 100 for a good
balance between precision and efficiency. It will land at very small
(think like 1e-6 percentile points) errors at extreme points in the
distribution, and compression ratios of around 500 for large data sets
(~1 million datapoints). Defaults to 100.}
-\item{x}{t-tigest object}
+\item{x}{\code{tdigest} object}
\item{...}{unused}
}
\value{
-a tdigest object
+a \code{tdigest} object
}
\description{
-The t-digest construction algorithm uses a variant of 1-dimensional
+The t-Digest construction algorithm, by Dunning et al., uses a variant of 1-dimensional
k-means clustering to produce a very compact data structure that allows
-accurate estimation of quantiles. This t-digest data structure can be used
+accurate estimation of quantiles. This t-Digest data structure can be used
to estimate quantiles, compute other rank statistics or even to estimate
-related measures like trimmed means. The advantage of the t-digest over
-previous digests for this purpose is that the t-digest handles data with
-full floating point resolution. With small changes, the t-digest can handle
-values from any ordered set for which we can compute something akin to a mean.
-The accuracy of quantile estimates produced by t-digests can be orders of
-magnitude more accurate than those produced by previous digest algorithms.
+related measures like trimmed means. The advantage of the t-Digest over
+previous digests for this purpose is that the t-Digest handles data with
+full floating point resolution. The accuracy of quantile estimates produced
+by t-Digests can be orders of magnitude more accurate than those produced
+by previous digest algorithms. Methods are provided to create and update
+t-Digests and retrieve quantiles from the accumulated distributions.
}
\examples{
set.seed(1492)
@@ -49,6 +49,6 @@ tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
quantile(td)
}
\references{
-\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf}
+\href{https://arxiv.org/abs/1902.04023}{Computing Extremely Accurate Quantiles Using t-Digests}
}
\keyword{internal}
diff --git a/man/tquantile.Rd b/man/tquantile.Rd
index e5e2cc1..5f9ee4a 100644
--- a/man/tquantile.Rd
+++ b/man/tquantile.Rd
@@ -3,14 +3,14 @@
\name{tquantile}
\alias{tquantile}
\alias{quantile.tdigest}
-\title{Calculate sample quantiles from a t-digest}
+\title{Calculate sample quantiles from a t-Digest}
\usage{
tquantile(td, probs)
\method{quantile}{tdigest}(x, probs = seq(0, 1, 0.25), ...)
}
\arguments{
-\item{td}{t-digest object}
+\item{td}{t-Digest object}
\item{probs}{numeric vector of probabilities with values in range 0:1}
@@ -19,10 +19,10 @@ tquantile(td, probs)
\item{...}{unused}
}
\value{
-a numeric vector
+a \code{numeric} vector containing the requested quantile values
}
\description{
-Calculate sample quantiles from a t-digest
+Calculate sample quantiles from a t-Digest
}
\examples{
set.seed(1492)
@@ -32,5 +32,5 @@ tquantile(td, c(0, .01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
quantile(td)
}
\references{
-\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf}
+\url{https://raw.githubusercontent.com/tdunning/t-digest/master/docs/t-digest-paper/histo.pdf}
}
diff --git a/src/tdigest.h b/src/tdigest.h
index 6310ec3..8357ee3 100644
--- a/src/tdigest.h
+++ b/src/tdigest.h
@@ -6,17 +6,15 @@
// Copyright (c) 2018 Andrew Werner, All rights reserved.
//
// tdigest is an implementation of Ted Dunning's streaming quantile estimation
-// data structure.
+// data structure.
// This implementation is intended to be like the new MergingHistogram.
// It focuses on being in portable C that should be easy to integrate into other
-// languages. In particular it provides mechanisms to preallocate all memory
+// languages. In particular it provides mechanisms to preallocate all memory
// at construction time.
//
-// The implementation is a direct descendent of
+// The implementation is a direct descendent of
// https://github.com/tdunning/t-digest/
//
-// TODO: add a Ted Dunning Copyright notice.
-//
////////////////////////////////////////////////////////////////////////////////
#include