From 5a43a390378ba29112eafa8df1206acbeaf2c81e Mon Sep 17 00:00:00 2001 From: chaodengusc Date: Wed, 27 Jun 2018 03:02:06 -0700 Subject: [PATCH] update the documents --- DESCRIPTION | 6 +-- NAMESPACE | 1 + R/kmer.R | 20 +++++----- R/sequencing.R | 9 ++--- inst/CITATION | 10 ++--- man/Dickens.Rd | 2 +- ...{ShakespeareWordHist.Rd => Shakespeare.Rd} | 0 man/Twitter.Rd | 2 +- man/WillButterfly.Rd | 2 +- man/bbc.rSAC.Rd | 6 ++- man/cs.rSAC.Rd | 6 ++- man/ds.rSAC.Rd | 7 ++-- man/ds.rSAC.bootsrap.Rd | 11 +++--- man/fisher.alpha.Rd | 2 +- man/fisher.rSAC.Rd | 6 ++- man/kmer.frac.curve.Rd | 13 +++---- man/kmer.frac.curve.bootstrap.Rd | 23 ++++++------ man/preseqR.interpolate.rSAC.Rd | 2 +- man/preseqR.nonreplace.sampling.Rd | 2 +- man/preseqR.optimal.sequencing.Rd | 37 ++++++++----------- man/preseqR.package.Rd | 35 ++++++------------ man/preseqR.rSAC.Rd | 7 ++-- man/preseqR.rSAC.bootstrap.Rd | 11 +++--- man/preseqR.rSAC.sequencing.rmdup.Rd | 15 ++++---- man/preseqR.sample.cov.Rd | 7 ++-- man/preseqR.sample.cov.bootstrap.Rd | 7 ++-- man/preseqR.simu.hist.Rd | 6 +-- man/preseqR.ztnb.em.Rd | 2 +- man/ztnb.rSAC.Rd | 5 ++- man/ztp.rSAC.Rd | 2 +- 30 files changed, 134 insertions(+), 130 deletions(-) rename man/{ShakespeareWordHist.Rd => Shakespeare.Rd} (100%) diff --git a/DESCRIPTION b/DESCRIPTION index 7a15845..bb66f60 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: preseqR Type: Package -Title: Predicting the Number of Species in a Random Sample +Title: Predicting Species Accumulation Curves Version: 4.0.0 -Date: 2017-12-26 +Date: 2018-06-27 Author: Chao Deng, Timothy Daley and Andrew D. Smith Maintainer: Chao Deng -Description: The relation between the number of species and the number of individuals in a random sample is a classic problem back to Fisher (1943) . We generalize this problem to predict the number of species represented at least r times in a random sample. In particular when r=1, it becomes the classic problem. We use a mixture of Poisson processes to model sampling procedures and apply an empirical Bayes approach to obtain a rational function estimator. The approach can be applied to assess the quality of DNA sequencing libraries and optimize depths of sequencing experiments. For more information on 'preseqR', see Deng C, Daley T and Smith AD (2015) and Deng C and Smith AD (2016) . +Description: Originally as an R version of Preseq , the package has extended its functionality to predict the r-species accumulation curve (r-SAC), which is the number of species represented at least r times as a function of the sampling effort. When r = 1, the curve is known as the species accumulation curve, or the library complexity curve in high-throughput genomic sequencing. The package includes both parametric and nonparametric methods, as described by Deng C, et al. (2018) . License: GPL-3 Imports: polynom, graphics, stats diff --git a/NAMESPACE b/NAMESPACE index 06aa4b7..56cec47 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(preseqR.interpolate.rSAC) export(preseqR.rSAC) export(preseqR.rSAC.bootstrap) export(ds.rSAC) +export(ds.rSAC.bootstrap) export(ztnb.rSAC) export(ztp.rSAC) export(bbc.rSAC) diff --git a/R/kmer.R b/R/kmer.R index 51870e9..88945ae 100644 --- a/R/kmer.R +++ b/R/kmer.R @@ -26,7 +26,7 @@ kmer.frac <- function(n, r=2, mt=20) { ## the fraction of k-mers represented at least r times as a function of ## sample sizes -kmer.frac.curve <- function(n, k, read.len, seq.gb, r=2, mt=20) { +kmer.frac.curve <- function(n, k, read.len, seq, r=2, mt=20) { f <- kmer.frac(n, r=r, mt=mt) if (is.null(f)) return(NULL) @@ -34,10 +34,10 @@ kmer.frac.curve <- function(n, k, read.len, seq.gb, r=2, mt=20) { N <- n[, 1] %*% n[, 2] ## average number of k-mers per read m <- read.len - k + 1 - unit.gb <- N / m * read.len / 1e9 - seq.effort <- seq.gb / unit.gb - result <- matrix(c(seq.gb, f(seq.effort)), ncol=2, byrow=FALSE) - colnames(result) <- c("bases(GB)", paste("frac(X>=", r, ")", sep="")) + unit <- N / m * read.len + seq.effort <- seq / unit + result <- matrix(c(seq, f(seq.effort)), ncol=2, byrow=FALSE) + colnames(result) <- c("bases", paste("frac(X>=", r, ")", sep="")) return(result) } @@ -50,7 +50,7 @@ kmer.frac.bootstrap <- function(n, r=2, mt=20, times=30, conf=0.95) { ## the fraction of k-mers represented at least r times as a function of ## sample sizes -kmer.frac.curve.bootstrap <- function(n, k, read.len, seq.gb, r=2, mt=20, +kmer.frac.curve.bootstrap <- function(n, k, read.len, seq, r=2, mt=20, times=30, conf=0.95) { f <- kmer.frac.bootstrap(n, r=r, mt=mt, times=times, conf=conf) @@ -60,11 +60,11 @@ kmer.frac.curve.bootstrap <- function(n, k, read.len, seq.gb, r=2, mt=20, N <- n[, 1] %*% n[, 2] ## average number of k-mers per read m <- read.len - k + 1 - unit.gb <- N / m * read.len / 1e9 - seq.effort <- seq.gb / unit.gb - result <- matrix(c(seq.gb, f$f(seq.effort), f$lb(seq.effort), + unit <- N / m * read.len + seq.effort <- seq / unit + result <- matrix(c(seq, f$f(seq.effort), f$lb(seq.effort), f$ub(seq.effort)), ncol=4, byrow=FALSE) - colnames(result) <- c("bases(GB)", paste("frac(X>=", r, ")", sep=""), + colnames(result) <- c("bases", paste("frac(X>=", r, ")", sep=""), "lb", "ub") return(result) } diff --git a/R/sequencing.R b/R/sequencing.R index ccaeec1..1c84588 100644 --- a/R/sequencing.R +++ b/R/sequencing.R @@ -20,8 +20,7 @@ ## predict the optimal number of sequenced bases using cost-benefit ratio preseqR.optimal.sequencing <- function( - n, efficiency=0.05, bin=1e8, r=1, mt=20, size=SIZE.INIT, - mu=MU.INIT, times=30, conf=0.95) + n, efficiency=0.05, bin=1e8, r=1, mt=20, times=30, conf=0.95) { find.start <- function(f, N, bin, efficiency) { y = sapply(1:100, function(x) (f(x + bin / N) - f(x)) / bin - efficiency) @@ -36,8 +35,8 @@ preseqR.optimal.sequencing <- function( N <- n[, 1] %*% n[, 2] ## r-species accumulation curve as a function of relative sample size - f.rSAC <- preseqR.rSAC.bootstrap( - n=n, r=r, mt=mt, size=size, mu=mu,times=times, conf=conf) + f.rSAC <- ds.rSAC.bootstrap( + n=n, r=r, mt=mt, times=times, conf=conf) ## hint: using r-SAC as a function of the number of sequenced bases f <- f.rSAC$f @@ -73,7 +72,7 @@ preseqR.optimal.sequencing <- function( ## the function is designed for EXOME sequencing, where aligned reads that ## map to the same location are removed to avoid potential duplicate preseqR.rSAC.sequencing.rmdup <- function( - n_base, n_read, r=1, mt=20, times=100, conf=0.95) + n_base, n_read, r=1, mt=20, times=30, conf=0.95) { checking.hist(n_read) checking.hist(n_base) diff --git a/inst/CITATION b/inst/CITATION index 262cdf8..2448606 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -19,14 +19,14 @@ citEntry(entry = "article", citEntry(entry = "article", title = "Estimating the number of species to attain sufficient representation in a random sample", - author = personList(as.person("Chao Deng"), as.person("Andrew D. Smith")), + author = personList(as.person("Chao Deng"), as.person("Timothy Daley"), as.person("Peter Calabrese"), as.person("Jie Ren"), as.person("Andrew D. Smith")), journal = "arXiv", - year = "2016", - url = "https://arxiv.org/abs/1607.02804v2", + year = "2018", + url = "https://arxiv.org/abs/1607.02804v3", textVersion = - paste("Deng C and Smith AD (2016).", + paste("Deng C, Daley T, Calabrese P, Ren J and Smith AD (2018).", "Estimating the number of species to attain sufficient representation in a random sample.", "arXiv preprint.", - "URL https://arxiv.org/abs/1607.02804v2.") + "URL https://arxiv.org/abs/1607.02804v3.") ) diff --git a/man/Dickens.Rd b/man/Dickens.Rd index f7b728b..f8d1fe5 100644 --- a/man/Dickens.Rd +++ b/man/Dickens.Rd @@ -8,7 +8,7 @@ \details{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of unique words appeared \eqn{j} + is \eqn{N_j}, the number of unique words appeared exactly \eqn{j} times in a collection of Charles Dickens. } diff --git a/man/ShakespeareWordHist.Rd b/man/Shakespeare.Rd similarity index 100% rename from man/ShakespeareWordHist.Rd rename to man/Shakespeare.Rd diff --git a/man/Twitter.Rd b/man/Twitter.Rd index 7579a22..85f1877 100644 --- a/man/Twitter.Rd +++ b/man/Twitter.Rd @@ -7,7 +7,7 @@ \details{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{n_j}, the number of users with \eqn{j} followers. + is \eqn{n_j}, the number of users with exactly \eqn{j} followers. } \references{ diff --git a/man/WillButterfly.Rd b/man/WillButterfly.Rd index 654d224..d2ed323 100644 --- a/man/WillButterfly.Rd +++ b/man/WillButterfly.Rd @@ -13,7 +13,7 @@ Animal Population, Journal of Animal Ecology, 12, 42-58, Table 3. \details{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{n_j}, the number of butterflies captured \eqn{j} + is \eqn{n_j}, the number of butterflies captured exactly \eqn{j} times in the sample. } diff --git a/man/bbc.rSAC.Rd b/man/bbc.rSAC.Rd index c68f20d..974e717 100644 --- a/man/bbc.rSAC.Rd +++ b/man/bbc.rSAC.Rd @@ -18,7 +18,7 @@ bbc.rSAC(n, r=1) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -41,6 +41,10 @@ bbc.rSAC(n, r=1) Boneh, S., Boneh, A., & Caron, R. J. (1998). Estimating the prediction function and the number of unseen species in sampling with replacement. Journal of the American Statistical Association, 93(441), 372-379. + +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \examples{ diff --git a/man/cs.rSAC.Rd b/man/cs.rSAC.Rd index 7d4d20e..7f9f190 100644 --- a/man/cs.rSAC.Rd +++ b/man/cs.rSAC.Rd @@ -18,7 +18,7 @@ cs.rSAC(n, r=1, k=10) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -43,6 +43,10 @@ cs.rSAC(n, r=1, k=10) \references{ Chao, A., & Shen, T. J. (2004). Nonparametric prediction in species sampling. Journal of agricultural, biological, and environmental statistics, 9(3), 253-269. + +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \examples{ diff --git a/man/ds.rSAC.Rd b/man/ds.rSAC.Rd index d414aac..15040d4 100644 --- a/man/ds.rSAC.Rd +++ b/man/ds.rSAC.Rd @@ -16,7 +16,7 @@ ds.rSAC(n, r=1, mt=20) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -45,8 +45,9 @@ ds.rSAC(n, r=1, mt=20) the initial sample. } \references{ -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ Chao Deng diff --git a/man/ds.rSAC.bootsrap.Rd b/man/ds.rSAC.bootsrap.Rd index b6f0bff..212d8a4 100644 --- a/man/ds.rSAC.bootsrap.Rd +++ b/man/ds.rSAC.bootsrap.Rd @@ -17,7 +17,7 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -29,7 +29,7 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95) approximations. Default is 20. } \item{times}{ - The number of bootstrap samples. + The number of bootstrap samples. Default is 30. } \item{conf}{ The confidence level. Default is 0.95 @@ -69,8 +69,9 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95) \references{ Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press. -Deng, C & Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ @@ -102,7 +103,7 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 ## when the sample size is 50 or 100 times of the initial sample # ds2$f(c(50, 100)) ## The standard error of the estiamtes -# ds2$se(c(50, 100))) +# ds2$se(c(50, 100)) ## The confidence interval of the estimates # lb <- ds2$lb(c(50, 100)) # ub <- ds2$ub(c(50, 100)) diff --git a/man/fisher.alpha.Rd b/man/fisher.alpha.Rd index ede4970..ffb299f 100644 --- a/man/fisher.alpha.Rd +++ b/man/fisher.alpha.Rd @@ -16,7 +16,7 @@ fisher.alpha(n) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } diff --git a/man/fisher.rSAC.Rd b/man/fisher.rSAC.Rd index d2d1f61..851b30b 100644 --- a/man/fisher.rSAC.Rd +++ b/man/fisher.rSAC.Rd @@ -18,7 +18,7 @@ fisher.rSAC(n, r=1) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -41,6 +41,10 @@ fisher.rSAC(n, r=1) Fisher, R., Corbet, A., & Williams, C. (1943). The Relation Between the Number of Species and the Number of Individuals in a Random Sample of an Animal Population. Journal of Animal Ecology, 12(1), 42-58. doi:10.2307/1411 + +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \examples{ diff --git a/man/kmer.frac.curve.Rd b/man/kmer.frac.curve.Rd index 1ff0944..d5fb0a1 100644 --- a/man/kmer.frac.curve.Rd +++ b/man/kmer.frac.curve.Rd @@ -10,7 +10,7 @@ least \eqn{r} times in a high-throughput sequencing experiment given the amount of sequencing } \usage{ - kmer.frac.curve(n, k, read.len, seq.gb, r=2, mt=20) + kmer.frac.curve(n, k, read.len, seq, r=2, mt=20) } %- maybe also 'usage' for other objects documented here. \arguments{ @@ -27,8 +27,8 @@ amount of sequencing \item{read.len}{ The average length of a read. } - \item{seq.gb}{ - The amount of sequencing in billions. + \item{seq}{ + The amount of nucleotides sequenced.. } \item{r}{ A positive integer. Default is 1. @@ -49,8 +49,7 @@ amount of sequencing } \value{ A two-column matrix. The first column is the amount of sequencing in an - experiment. The value is specified by the variable \eqn{seq.gb}. - The second column is the estimate of the fraction of \eqn{k}-mers observed at least + experiment. The second column is the estimate of the fraction of \eqn{k}-mers observed at least \eqn{r} times in the experiment. } \references{ @@ -68,9 +67,9 @@ library(preseqR) ## import data data(SRR061157_k31) -## the fraction of 31-mers represented at least twice in an experiment when +## the fraction of 31-mers represented at least 10 times in an experiment when ## sequencing 1M, 10M, 100M, 1G, 10G, 100G, 1T nucleotides -kmer.frac.curve(n=SRR061157_k31, k=31, read.len=200, seq.gb=10^(6:12), r=2, mt=20) +kmer.frac.curve(n=SRR061157_k31, k=31, read.len=100, seq=10^(6:12), r=10, mt=20) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. diff --git a/man/kmer.frac.curve.bootstrap.Rd b/man/kmer.frac.curve.bootstrap.Rd index 2a82700..6d1f2cc 100644 --- a/man/kmer.frac.curve.bootstrap.Rd +++ b/man/kmer.frac.curve.bootstrap.Rd @@ -10,7 +10,7 @@ least \eqn{r} times in a high-throughput sequencing experiment given the amount of sequencing } \usage{ -kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.95) +kmer.frac.curve.bootstrap(n, k, read.len, seq, r=2, mt=20, times=30, conf=0.95) } %- maybe also 'usage' for other objects documented here. \arguments{ @@ -27,8 +27,8 @@ kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.9 \item{read.len}{ The average length of a read. } - \item{seq.gb}{ - The amount of sequencing in billions. + \item{seq}{ + The amount of nucleotides sequenced. } \item{r}{ A positive integer. Default is 1. @@ -55,7 +55,7 @@ kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.9 } \value{ A four-column matrix. The first column is the amount of sequencing in an - experiment. The value is specified by the variable \eqn{seq.gb}. + experiment. The second column is the estimate of the fraction of \eqn{k}-mers observed at least \eqn{r} times in the experiment. The third and fourth columns are the lower bounds and the upper bounds of the confidence intervals. @@ -64,8 +64,9 @@ kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.9 \references{ Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press. -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ @@ -74,15 +75,15 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 \examples{ ## load library -library(preseqR) +# library(preseqR) ## import data -data(SRR061157_k31) +# data(SRR061157_k31) -## the fraction of 31-mers represented at least twice in an experiment when +## the fraction of 31-mers represented at least 10 times in an experiment when ## sequencing 1M, 10M, 100M, 1G, 10G, 100G, 1T nucleotides -kmer.frac.curve.bootstrap(n=SRR061157_k31, k=31, read.len=200, - seq.gb=10^(6:12), r=2, mt=20) +# kmer.frac.curve.bootstrap(n=SRR061157_k31, k=31, read.len=100, +# seq=10^(6:12), r=10, mt=20) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. diff --git a/man/preseqR.interpolate.rSAC.Rd b/man/preseqR.interpolate.rSAC.Rd index b99ee11..2ac9ca5 100644 --- a/man/preseqR.interpolate.rSAC.Rd +++ b/man/preseqR.interpolate.rSAC.Rd @@ -16,7 +16,7 @@ preseqR.interpolate.rSAC(n, ss, r=1) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } diff --git a/man/preseqR.nonreplace.sampling.Rd b/man/preseqR.nonreplace.sampling.Rd index 0409342..52d35fc 100644 --- a/man/preseqR.nonreplace.sampling.Rd +++ b/man/preseqR.nonreplace.sampling.Rd @@ -15,7 +15,7 @@ Sampling \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species represented \eqn{j} times in the initial + is \eqn{N_j}, the number of species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } \item{size}{ diff --git a/man/preseqR.optimal.sequencing.Rd b/man/preseqR.optimal.sequencing.Rd index 55e0d5b..b00a5b8 100644 --- a/man/preseqR.optimal.sequencing.Rd +++ b/man/preseqR.optimal.sequencing.Rd @@ -10,19 +10,19 @@ a single-cell whole-genome sequencing (scWGS) experiment based on a shallow sequ } \usage{ preseqR.optimal.sequencing(n, efficiency=0.05, bin=1e8, r=1, mt=20, - size=SIZE.INIT, mu=MU.INIT, times=30, conf=0.95) + times=30, conf=0.95) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{n_j}, the number of species with each species represented \eqn{j} + is \eqn{n_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } \item{efficiency}{ - The minimum cost-benefit ratio + The minimum benefit-cost ratio } \item{bin}{ One unit of sequencing effort. Default is 1e8. @@ -37,14 +37,6 @@ preseqR.optimal.sequencing(n, efficiency=0.05, bin=1e8, r=1, mt=20, \item{times}{ The number of bootstrap samples. } - \item{size}{ - A positive double, the initial value of the parameter \code{size} in - the negative binomial distribution for the EM algorithm. Default value is 1. - } - \item{mu}{ - A positive double, the initial value of the parameter \code{mu} in the - negative binomial distribution for the EM algorithm. Default value is 0.5. - } \item{conf}{ The confidence level. Default is 0.95 } @@ -52,15 +44,15 @@ preseqR.optimal.sequencing(n, efficiency=0.05, bin=1e8, r=1, mt=20, \details{ \code{preseqR.optimal.sequencing} predicts the optimal amount of sequencing in a scWGS experiment. The term optimal is interpreted as the maximum - amount of sequencing with its cost-benefit ratio greater than a given threshold. - The cost-benefit ratio is defined as the probability of a new nucleotide in the + amount of sequencing with its benefit-cost ratio greater than a given threshold. + The benefit-cost ratio is defined as the probability of a new nucleotide in the genome represented at least \eqn{r} times when one more base is sequenced. In order to improve the numeric stability, we use the mean of new nucleotdies with coverage at least \eqn{r} in one unit of sequencing effort to approximate the ratio. The amount of sequences in one unit of sequencing effort is defined by the variable \code{bin}. - Note that the cost-benefit ratio is not monotonic. The ratio first increases + Note that the benefit-cost ratio is not monotonic. The ratio first increases and then decrease as the amount of sequencing increase. To predicte the optimal amount of sequencing, we consider only the areas after the peak, where the ratio starts to decrease. @@ -72,8 +64,9 @@ preseqR.optimal.sequencing(n, efficiency=0.05, bin=1e8, r=1, mt=20, } \references{ -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ @@ -85,14 +78,14 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 #library(preseqR) ## import data -#data(SRR611492_5M) -## the optimal amount of sequencing with the cost-benefit ratio greater than +# data(SRR611492_5M) +## the optimal amount of sequencing with the benefit-cost ratio greater than ## 0.05 for r = 4 -#preseqR.optimal.sequencing(n=SRR611492_5M, efficiency=0.05, bin=1e8, r=4) -## the optimal amount of sequencing with the cost-benefit ratio greater than +# preseqR.optimal.sequencing(n=SRR611492_5M, efficiency=0.05, bin=1e8, r=4) +## the optimal amount of sequencing with the benefit-cost ratio greater than ## 0.05 for r = 10 -#preseqR.optimal.sequencing(n=SRR611492_5M, efficiency=0.05, bin=1e8, r=10) +# preseqR.optimal.sequencing(n=SRR611492_5M, efficiency=0.05, bin=1e8, r=10) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. -\keyword{WGS, high-throughput, cost-benefit} +\keyword{WGS, high-throughput, benefit-cost ratio} diff --git a/man/preseqR.package.Rd b/man/preseqR.package.Rd index 9b0f8b2..b48f677 100644 --- a/man/preseqR.package.Rd +++ b/man/preseqR.package.Rd @@ -2,30 +2,18 @@ \alias{preseqR-package} \docType{package} \title{ - Predicting species abundances in a random sample + Predicting \eqn{r}-species accumulation curves } \description{ - The functionality of this package is to predict the number of species - represented at least \eqn{r} times in a random sample, - based on an initial sample. - The prediction is based on an empirical Bayes - approach using rational function approximation. The estimator is + The functionality of this package is to predict \eqn{r}-species + accumulaiton curves. The method is based on a nonparametric empirical Bayes approach + with rational function approximation. The estimator is excellent in accuracy for both large values of \eqn{r} and long-range - extrapolations, which are essential to large-scale applications. This - package also generalizes other popular parametric and nonparametric - methods that were oringinally designed for estimating the number of - species in a random sample (the case \eqn{r=1}), - such as the log series estimator by Fisher and - zero-truncated negative binomial estimator, to address the general problems - of predicting the number of species represented at least \eqn{r} times for - \eqn{r > 1}. - - For the ease of reference, we call the quantity, the number of species - represented at least \eqn{r} times in a sample as a function of the sample - size, a \eqn{r}-species accumulation curve (\eqn{r}-SAC). In particular, - when \eqn{r = 1}, \eqn{r}-SAC is known as species accmulaton curve (SAC), - which has been widely studied in ecology. - + extrapolations, which are essential to large-scale applications. Some + examples are predicting the molecular complexity of sequencing + libraries, estimating the minimum sufficient sequencing depths for + whole-exome sequencing experiments and optimizing depths for single-cell + whole-genome sequencing experiments. } \details{ main functions: @@ -68,8 +56,9 @@ Deng C, Daley T & Smith AD (2015). Applications of species accumulation curves in large-scale biological data analysis. Quantitative Biology, 3(3), 135-144. URL http://dx.doi.org/10.1007/s40484-015-0049-7. -Deng, C & Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804. +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. Efron, B., & Thisted, R. (1976). Estimating the number of unseen species: How many words did Shakespeare know?. Biometrika, 63(3), 435-447. diff --git a/man/preseqR.rSAC.Rd b/man/preseqR.rSAC.Rd index b36ae4d..f45c2a2 100644 --- a/man/preseqR.rSAC.Rd +++ b/man/preseqR.rSAC.Rd @@ -16,7 +16,7 @@ preseqR.rSAC(n, r=1, mt=20, size=SIZE.INIT, mu=MU.INIT) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -57,8 +57,9 @@ preseqR.rSAC(n, r=1, mt=20, size=SIZE.INIT, mu=MU.INIT) the initial sample. } \references{ -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ Chao Deng diff --git a/man/preseqR.rSAC.bootstrap.Rd b/man/preseqR.rSAC.bootstrap.Rd index fe29d0c..ccc43ff 100644 --- a/man/preseqR.rSAC.bootstrap.Rd +++ b/man/preseqR.rSAC.bootstrap.Rd @@ -17,7 +17,7 @@ preseqR.rSAC.bootstrap(n, r=1, mt=20, size=SIZE.INIT, mu=MU.INIT, times=30, \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -76,8 +76,9 @@ preseqR.rSAC.bootstrap(n, r=1, mt=20, size=SIZE.INIT, mu=MU.INIT, times=30, \references{ Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press. -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ @@ -86,10 +87,10 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 \examples{ ## load library -#library(preseqR) +# library(preseqR) ## import data -#data(FisherButterfly) +# data(FisherButterfly) ## construct estimator for SAC # estimator1 <- preseqR.rSAC.bootstrap(FisherButterfly, r=1) diff --git a/man/preseqR.rSAC.sequencing.rmdup.Rd b/man/preseqR.rSAC.sequencing.rmdup.Rd index 9066e5b..1f375f4 100644 --- a/man/preseqR.rSAC.sequencing.rmdup.Rd +++ b/man/preseqR.rSAC.sequencing.rmdup.Rd @@ -10,7 +10,7 @@ experiment, based on a shallow sequencing experiment. } \usage{ -preseqR.rSAC.sequencing.rmdup(n_base, n_read, r=1, mt=20, times=100, conf=0.95) +preseqR.rSAC.sequencing.rmdup(n_base, n_read, r=1, mt=20, times=30, conf=0.95) } %- maybe also 'usage' for other objects documented here. \arguments{ @@ -24,7 +24,7 @@ preseqR.rSAC.sequencing.rmdup(n_base, n_read, r=1, mt=20, times=100, conf=0.95) \item{n_read}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N'_j}, the number of distinct reads with \eqn{j} duplicates + is \eqn{N'_j}, the number of distinct reads with exactly \eqn{j} duplicates in the initial experiment. The first column must be sorted in an ascending order. } @@ -36,7 +36,7 @@ preseqR.rSAC.sequencing.rmdup(n_base, n_read, r=1, mt=20, times=100, conf=0.95) approximations. Default is 20. } \item{times}{ - The number of bootstrap samples. + The number of bootstrap samples. Default is 30. } \item{conf}{ The confidence level. Default is 0.95 @@ -74,8 +74,9 @@ preseqR.rSAC.sequencing.rmdup(n_base, n_read, r=1, mt=20, times=100, conf=0.95) } \references{ -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ @@ -87,8 +88,8 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 #library(preseqR) ## import data -#data(SRR1301329_1M_base) -#data(SRR1301329_1M_read) +# data(SRR1301329_1M_base) +# data(SRR1301329_1M_read) # construct the estimator # estimator1 <- preseqR.rSAC.sequencing.rmdup( # n_base=SRR1301329_1M_base, n_read=SRR5365359_5M_read, diff --git a/man/preseqR.sample.cov.Rd b/man/preseqR.sample.cov.Rd index 6cfb267..c13a64d 100644 --- a/man/preseqR.sample.cov.Rd +++ b/man/preseqR.sample.cov.Rd @@ -16,7 +16,7 @@ \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -65,8 +65,9 @@ population parameters. Biometrika, 40(3-4), 237-264. Mao, C. X. and Lindsay, B. G. (2002). A Poisson model for the coverage problem with a genomic application. Biometrika, 89(3), 669-682. -Deng, C and Smith, A. D. (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ Chao Deng diff --git a/man/preseqR.sample.cov.bootstrap.Rd b/man/preseqR.sample.cov.bootstrap.Rd index 6fbecce..b348168 100644 --- a/man/preseqR.sample.cov.bootstrap.Rd +++ b/man/preseqR.sample.cov.bootstrap.Rd @@ -16,7 +16,7 @@ \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -69,8 +69,9 @@ \references{ Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press. -Deng, C and Smith, AD (2016). Estimating the number of species to attain -sufficient representation in a random sample. arXiv preprint arXiv:1607.02804 +Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating +the number of species to attain sufficient representation in a random sample. +arXiv preprint arXiv:1607.02804v3. } \author{ diff --git a/man/preseqR.simu.hist.Rd b/man/preseqR.simu.hist.Rd index f3db8a5..441f6e2 100644 --- a/man/preseqR.simu.hist.Rd +++ b/man/preseqR.simu.hist.Rd @@ -36,7 +36,7 @@ \value{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -52,8 +52,8 @@ library(preseqR) f <- function(n) { rgamma(n, shape=0.5, scale=1) } - -preseqR.simu.hist(L=1e5, N=1, f) +## sample 10,000 individuals +preseqR.simu.hist(L=1e5, N=10000, f) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. diff --git a/man/preseqR.ztnb.em.Rd b/man/preseqR.ztnb.em.Rd index 71962ec..5e80cd1 100644 --- a/man/preseqR.ztnb.em.Rd +++ b/man/preseqR.ztnb.em.Rd @@ -20,7 +20,7 @@ preseqR.ztnb.em(n, size = SIZE.INIT, mu = MU.INIT) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } diff --git a/man/ztnb.rSAC.Rd b/man/ztnb.rSAC.Rd index bd68491..2b890d4 100644 --- a/man/ztnb.rSAC.Rd +++ b/man/ztnb.rSAC.Rd @@ -16,7 +16,7 @@ ztnb.rSAC(n, r=1, size=SIZE.INIT, mu=MU.INIT) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. } @@ -62,6 +62,9 @@ ztnb.rSAC(n, r=1, size=SIZE.INIT, mu=MU.INIT) \references{ Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature methods, 10(4), 325-327. + +Deng C, Daley T & Smith AD (2015). Applications of species accumulation curves +in large-scale biological data analysis. Quantitative Biology, 3(3), 135-144. } \examples{ diff --git a/man/ztp.rSAC.Rd b/man/ztp.rSAC.Rd index bd87157..4cbc504 100644 --- a/man/ztp.rSAC.Rd +++ b/man/ztp.rSAC.Rd @@ -16,7 +16,7 @@ ztp.rSAC(n, r=1) \item{n}{ A two-column matrix. The first column is the frequency \eqn{j = 1,2,\dots}; and the second column - is \eqn{N_j}, the number of species with each species represented \eqn{j} + is \eqn{N_j}, the number of species with each species represented exactly \eqn{j} times in the initial sample. The first column must be sorted in an ascending order. }