update the documents

smithlabcode · Jan 1, 2018 · 4c5e34d · 4c5e34d
1 parent d33c7e9
commit 4c5e34d
Show file tree

Hide file tree

Showing 22 changed files with 253 additions and 323 deletions.
diff --git a/data/FisherButterflyHist.txt → data/FisherButterfly.txt b/data/FisherButterflyHist.txt → data/FisherButterfly.txt
diff --git a/man/Dickens.Rd b/man/Dickens.Rd
@@ -8,7 +8,7 @@
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of unique words appeared \eqn{j}
+    is \eqn{N_j}, the number of unique words that appeared \eqn{j}
     times in a collection of works by Charles Dickens.
 }
 
@@ -24,4 +24,4 @@ library(preseqR)
 data(Dickens)
 }
 
-\keyword{ data }
+\keyword{ datasets }
diff --git a/man/FisherButterflyHist.Rd → man/FisherButterfly.Rd b/man/FisherButterflyHist.Rd → man/FisherButterfly.Rd
@@ -1,5 +1,5 @@
-\name{FisherButterflyHist}
-\alias{FisherButterflyHist}
+\name{FisherButterfly}
+\alias{FisherButterfly}
 \docType{data}
 \title{Fisher's butterfly data}
 \description{Frequencies data of butterflies collected in the Malay peninsula
@@ -22,7 +22,7 @@ Animal Population, Journal of Animal Ecology, 12, 42-58, Table 1,2.
 library(preseqR)
 
 ##load data
-data(FisherButterflyHist)
+data(FisherButterfly)
 }
 
 \keyword{ datasets }
diff --git a/man/bbc.rSAC.Rd b/man/bbc.rSAC.Rd
@@ -28,8 +28,8 @@ bbc.rSAC(n, r=1)
 }
 \value{
   The estimator for the \eqn{r}-SAC. The input of the estimator is a vector of
-  sampling efforts t, i.e., the relative sample sizes comparing with the initial
-  sample. For example, t = 2 means a random sample that is twice the size of
+  sampling efforts \eqn{t}, i.e., the relative sample sizes compared with the initial
+  sample. For example, \eqn{t = 2} means a random sample that is twice the size of
   the initial sample.
 }
 \author{
@@ -48,16 +48,16 @@ Journal of the American Statistical Association, 93(441), 372-379.
 library(preseqR)
 
 ## import data
-data(FisherButterflyHist)
+data(FisherButterfly)
 
 ## construct the estimator for SAC
-bbc1 <- bbc.rSAC(FisherButterflyHist, r=1)
+bbc1 <- bbc.rSAC(FisherButterfly, r=1)
 ## The number of species represented at least once in a sample, 
 ## when the sample size is 10 or 20 times of the initial sample
 bbc1(c(10, 20))
 
 ## construct the estimator for r-SAC
-bbc2 <- bbc.rSAC(FisherButterflyHist, r=2)
+bbc2 <- bbc.rSAC(FisherButterfly, r=2)
 ## The number of species represented at least twice in a sample, 
 ## when the sample size is 50 or 100 times of the initial sample
 bbc2(c(50, 100))

diff --git a/man/cs.rSAC.Rd b/man/cs.rSAC.Rd
@@ -2,12 +2,13 @@
 \alias{cs.rSAC}
 %- Also NEED an '\alias' for EACH other topic documented here.
 \title{
-Estimating the expected number of species represented r or more times
+  CS estimator
 }
 \description{
-The function estimates the expected number of species represented at least 
-r times in a random sample based on the initial sample. The estimator proposed
-by Chao and Shen (2004) for SAC is generalized to estimate r-SAC for r > 1.
+\code{cs.rSAC} predicts the expected number of species represented at least 
+\eqn{r} times in a random sample, based on the initial sample. 
+The estimator was originally proposed by Chao and Shen (2004) for estimating
+the SAC. We generalize this estimator for predicting the \eqn{r}-SAC.
 }
 \usage{
 cs.rSAC(n, r=1, k=10)
@@ -17,7 +18,7 @@ cs.rSAC(n, r=1, k=10)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -29,11 +30,10 @@ cs.rSAC(n, r=1, k=10)
   }
 }
 \value{
-  The constructed estimator for the number of species represneted at least r
-  times in a sample. The input of the estimator is a vector of sampling 
-  efforts t, i.e. the relative sample sizes comparing with the initial sample.
-  For example, t = 2 means a random sample that is twice the size of the 
-  initial sample.
+  The estimator for the \eqn{r}-SAC. The input of the estimator is a vector of
+  sampling efforts \eqn{t}, i.e., the relative sample sizes compared with the initial
+  sample. For example, \eqn{t = 2} means a random sample that is twice the size of
+  the initial sample.
 }
 \author{
   Chao Deng
@@ -50,20 +50,18 @@ Journal of agricultural, biological, and environmental statistics, 9(3), 253-269
 library(preseqR)
 
 ## import data
-data(FisherButterflyHist)
+data(FisherButterfly)
 
-## construct the estimator for the number of species represented at least once
-## in a random sample
-chao1 <- cs.rSAC(FisherButterflyHist, r=1)
-## The number of species represented at least once, when the sample size is 
-## 10 or 20 times of the initial sample
+## construct the estimator for SAC
+chao1 <- cs.rSAC(FisherButterfly, r=1)
+## The number of species represented at least once in a sample, 
+## when the sample size is 10 or 20 times of the initial sample
 chao1(c(10, 20))
 
-## construct the estimator for the number of species represented at least twice
-## in a random sample
-chao2 <- cs.rSAC(FisherButterflyHist, r=2)
-## The number of species represented at least twice, when the sample size is 
-## 50 or 100 times of the initial sample
+## construct the estimator for r-SAC
+chao2 <- cs.rSAC(FisherButterfly, r=2)
+## The number of species represented at least twice in a sample, 
+## when the sample size is 50 or 100 times of the initial sample
 chao2(c(50, 100))
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the

diff --git a/man/ds.rSAC.Rd b/man/ds.rSAC.Rd
@@ -2,11 +2,11 @@
 \alias{ds.rSAC}
 %- Also NEED an '\alias' for EACH other topic documented here.
 \title{
-Estimating the expected number of species represented r or more times
+  RFA estimator
 }
 \description{
-The function estimates the expected number of species represented at least 
-r times in a random sample based on the initial sample.
+\code{ds.rSAC} predicts the expected number of species represented at least 
+\eqn{r} times in a random sample, based on the initial sample. 
 }
 \usage{
 ds.rSAC(n, r=1, mt=20)
@@ -16,7 +16,7 @@ ds.rSAC(n, r=1, mt=20)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -29,42 +29,22 @@ ds.rSAC(n, r=1, mt=20)
   }
 }
 \details{
-  Under a mixture of Poisson models, the expected number of
-  species represented at least r times in a random sample 
-  can be expressed as higher derivatives
-  of the expected number of species represented at least once per unit of 
-  sampling effort. 
-  We first use rational function approximations to the 
-  modified Good and Toulmin's (1956) non-parametric empirical Bayes power 
-  series to estimate the average discovery rate. By differentiating the
-  rational function approximation, we obtain an estimator 
-  for the number of species represented at least r times in a random
-  sample.
+  The estimator is based on an empirical Bayes approach using rational
+  function approximation (RFA), as described in the paper in the 
+  references section.
 
-  This function is the fast version of \code{\link{ds.rSAC.bootstrap}}.
-  The function does not provide an estimate for the confidence interval.
-  If one needs the confidence interval, please use 
-  \code{\link{ds.rSAC.bootstrap}} instead.
+  \code{ds.rSAC} is the fast version of \code{\link{ds.rSAC.bootstrap}}.
+  The function does not provide the confidence interval. To obtain the
+  confidence interval along with the estimates, one should use the function
+  \code{\link{ds.rSAC.bootstrap}}.
 }
 \value{
-  The estimator for the number of species represented at least r times in 
-  a random sample. The input of the estimator is a vector of sampling 
-  efforts t, i.e. the relative sample sizes comparing with the initial sample.
-  For example, t = 2 means a random sample that is twice the size of the 
-  initial sample.
+  The estimator for the \eqn{r}-SAC. The input of the estimator is a vector of
+  sampling efforts \eqn{t}, i.e., the relative sample sizes compared with the initial
+  sample. For example, \eqn{t = 2} means a random sample that is twice the size of
+  the initial sample.
 }
 \references{
-Kalinin V (1965). Functionals related to the poisson distribution and
-statistical structure of a text. Articles on Mathematical Statistics and
-the Theory of Probability pp. 202-220.
-
-Daley, T and Smith, AD. (2013). Predicting the molecular complexity of
-sequencing libraries. Nature methods, 10(4), 325-327.
-
-Deng, C., Daley, T., & Smith, A. (2015). Applications of species accumulation
-curves in large-scale biological data analysis. Quantitative Biology, 3(3),
-135-144.
-
 Deng, C and Smith, AD (2016). Estimating the number of species to attain 
 sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
 }
@@ -77,20 +57,18 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
 library(preseqR)
 
 ## import data
-data(FisherButterflyHist)
+data(FisherButterfly)
 
-## construct the estimator for the number of species represented at least once
-## in a random sample
-ds1 <- ds.rSAC(FisherButterflyHist, r=1)
-## The number of species represented at least once, when the sample size is 
-## 10 or 20 times of the initial sample
+## construct the estimator for SAC
+ds1 <- ds.rSAC(FisherButterfly, r=1)
+## The number of species represented at least once in a sample, 
+## when the sample size is 10 or 20 times of the initial sample
 ds1(c(10, 20))
 
-## construct the estimator for the number of species represented at least twice
-## in a random sample
-ds2 <- ds.rSAC(FisherButterflyHist, r=2)
-## The number of species represented at least twice, when the sample size is 
-## 50 or 100 times of the initial sample
+## construct the estimator for r-SAC
+ds2 <- ds.rSAC(FisherButterfly, r=2)
+## The number of species represented at least twice in a sample, 
+## when the sample size is 50 or 100 times of the initial sample
 ds2(c(50, 100))
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the

diff --git a/man/ds.rSAC.bootsrap.Rd b/man/ds.rSAC.bootsrap.Rd
@@ -2,11 +2,12 @@
 \alias{ds.rSAC.bootstrap}
 %- Also NEED an '\alias' for EACH other topic documented here.
 \title{
-Estimating the number of species represented r or more times
+  RFA estimator with bootstrap
 }
 \description{
-The function estimates the expected number of species represented at least 
-r times in a random sample based on the initial sample.
+\code{ds.rSAC.bootstrap} predicts the expected number of species 
+represented at least \eqn{r} times in a random sample, 
+based on the initial sample. 
 }
 \usage{
 ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
@@ -16,7 +17,7 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -35,10 +36,13 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
   }
 }
 \details{
-  This is the bootstrap version of \code{\link{ds.rSAC}}. The function
-  provides the confidence interval for estiamtes based on
-  \code{\link{ds.rSAC}}. The confidence interval is based on the lognormal
-  model proposed by Chao (1987).
+  This is the bootstrap version of \code{\link{ds.rSAC}}. The bootstrap
+  sample is generated by randomly sampling the initial sample with replacement.
+  For each bootstrap sample, we construct an estimator. The median of
+  estimates is used as the prediction for the number of species
+  represented at least \eqn{r} times in a random sample. 
+
+  The confidence interval is constructed based on a lognormal distribution.
 }
 \value{
   \item{f}{
@@ -50,33 +54,22 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
   }
   \item{se}{
     The standard error for the estimator. The input is a vector of sampling 
-    efforts t. The standard error depends on t.
+    efforts \eqn{t}.
   }
   \item{lb}{
     The lower bound of the confidence interval. The input is a vector of sampling 
-    efforts t.
+    efforts \eqn{t}.
   }
   \item{ub}{
     The upper bound of the confidence interval. The input is a vector of sampling 
-    efforts t.
+    efforts \eqn{t}.
   }
 }
 
 \references{
 Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press.
 
-Kalinin V (1965). Functionals related to the poisson distribution and 
-statistical structure of a text. Articles on Mathematical Statistics and 
-the Theory of Probability pp. 202-220.
-
-Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of
-sequencing libraries. Nature methods, 10(4), 325-327.
-
-Deng, C., Daley, T., & Smith, A. (2015). Applications of species accumulation
-curves in large-scale biological data analysis. Quantitative Biology, 3(3),
-135-144.
-
-Deng, C and Smith, AD (2016). Estimating the number of species to attain 
+Deng, C & Smith, AD (2016). Estimating the number of species to attain 
 sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
 }
 
@@ -86,35 +79,33 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
 
 \examples{
 ## load library
-#library(preseqR)
+# library(preseqR)
 
 ## import data
-#data(FisherButterflyHist)
+# data(FisherButterfly)
 
-## construct the estimator for the number of species represented at least once
-## in a random sample
-# estimator1 <- ds.rSAC.bootstrap(FisherButterflyHist, r=1)
-## The number of species represented at least once, when the sample size is 
-## 10 or 20 times of the initial sample
-# estimator1$f(c(10, 20))
+## construct the estimator for SAC
+# ds1 <- ds.rSAC.bootstrap(FisherButterfly, r=1)
+## The number of species represented at least once in a sample, 
+## when the sample size is 10 or 20 times of the initial sample
+# ds1$f(c(10, 20))
 ## The standard error of the estimates
-# estimator1$se(c(10, 20))
+# ds1$se(c(10, 20))
 ## The confidence interval of the estimates
-# lb <- estimator1$lb(c(10, 20))
-# ub <- estimator1$ub(c(10, 20))
+# lb <- ds1$lb(c(10, 20))
+# ub <- ds1$ub(c(10, 20))
 # matrix(c(lb, ub), byrow=FALSE, ncol=2)
 
-## construct the estimator for the number of species represented at least twice
-## in a random sample
-# estimator2 <- ds.rSAC.bootstrap(FisherButterflyHist, r=2)
-## The number of species represented at least twice, when the sample size is 
-## 50 or 100 times of the initial sample
-# estimator2$f(c(50, 100))
+## construct the estimator for r-SAC
+# ds2 <- ds.rSAC.bootstrap(FisherButterfly, r=2)
+## The number of species represented at least twice in a sample, 
+## when the sample size is 50 or 100 times of the initial sample
+# ds2$f(c(50, 100))
 ## The standard error of the estimates
-# estimator2$se(c(50, 100)))
+# ds2$se(c(50, 100))
 ## The confidence interval of the estimates
-# lb <- estimator2$lb(c(50, 100))
-# ub <- estimator2$ub(c(50, 100))
+# lb <- ds2$lb(c(50, 100))
+# ub <- ds2$ub(c(50, 100))
 # matrix(c(lb, ub), byrow=FALSE, ncol=2)
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the