added functions

Russel88 · Mar 6, 2019 · 40e64b5 · 40e64b5
1 parent c13b559
commit 40e64b5
Show file tree

Hide file tree

Showing 7 changed files with 355 additions and 4 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MicEco
 Title: Various functions for microbial community data
-Version: 0.9.3
+Version: 0.9.4
 Authors@R: person("Jakob", "Russel", email = "[email protected]", role = c("aut", "cre"))
 Description: Collection of functions for microbiome analyses. E.g. fitting neutral models and standardized effect sizes of phylogenetic beta diversities, and much more.
 Depends: R (>= 3.2.5)

diff --git a/NAMESPACE b/NAMESPACE
@@ -15,6 +15,8 @@ export(ses.comdist)
 export(ses.comdist2)
 export(ses.comdistnt)
 export(ses.comdistnt2)
+export(ses.mntd.par)
+export(ses.mpd.par)
 export(ses.permtest)
 import(Hmisc)
 import(bbmle)

diff --git a/R/ses.mntd.par.R b/R/ses.mntd.par.R
@@ -0,0 +1,111 @@
+#' Standardized effect size of MNTD
+#'
+#' Parallel calculation of standardized effect size of mean nearest taxon distances in communities. When used with a phylogenetic distance matrix, equivalent to -1 times the Nearest Taxon Index (NTI).
+#' 
+#' Faster than ses.mntd from \code{picante} when there are many samples and taxa.
+#' @param samp Community data matrix with samples as rows
+#' @param dis Distance matrix (generally a phylogenetic distance matrix)
+#' @param null.model Null model to use (see Details section for description)
+#' @param abundance.weighted Should mean nearest taxon distances for each species be weighted by species abundance? (default = FALSE)
+#' @param runs Number of randomizations
+#' @param iterations Number of iterations to use for each randomization (for independent swap and trial null models)
+#' @param cores Number of cores to use for parallel computing
+#' @details Currently implemented null models (arguments to null.model):
+#' \itemize{
+#'  \item taxa.labels - Shuffle distance matrix labels (across all taxa included in distance matrix) 
+#'  \item richness - Randomize community data matrix abundances within samples (maintains sample species richness) 
+#'  \item frequency - Randomize community data matrix abundances within species (maintains species occurence frequency) 
+#'  \item sample.pool - Randomize community data matrix by drawing species from pool of species occurring in at least one community (sample pool) with equal probability   
+#'  \item phylogeny.pool- Randomize community data matrix by drawing species from pool of species occurring in the distance matrix (phylogeny pool) with equal probability   
+#'  \item independentswap - Randomize community data matrix with the independent swap algorithm (Gotelli 2000) maintaining species occurrence frequency and sample species richness  
+#'  \item trialswap - Randomize community data matrix with the trial-swap algorithm (Miklos & Podani 2004) maintaining species occurrence frequency and sample species richness  
+#' }
+#' @keywords ses MNTD NTI
+#' @return
+#' A data frame of results for each community
+#' \itemize{
+#' \item{ntaxa}{Number of taxa in community}
+#' \item{mntd.obs}{Observed mntd in community}
+#' \item{mntd.rand.mean}{Mean mntd in null communities}
+#' \item{mntd.rand.sd}{Standard deviation of mntd in null communities}
+#' \item{mntd.obs.rank}{Rank of observed mntd vs. null communities}
+#' \item{mntd.obs.z}{Standardized effect size of mntd vs. null communities (= (mntd.obs - mntd.rand.mean) / mntd.rand.sd, equivalent to -NRI)}
+#' \item{mntd.obs.p}{P-value (quantile) of observed mntd vs. null communities (= mntd.obs.rank / runs + 1)}
+#' \item{runs}{Number of randomizations}  
+#' }
+#' @import picante
+#' @import doSNOW
+#' @export
+
+ses.mntd.par <- function(samp, dis, null.model = c("taxa.labels", "richness", 
+                                                  "frequency", "sample.pool", "phylogeny.pool", "independentswap", 
+                                                  "trialswap"), abundance.weighted = FALSE, runs = 999, iterations = 1000, cores = 1){
+    dis <- as.matrix(dis)
+    mntd.obs <- mntd(samp, dis, abundance.weighted = abundance.weighted)
+    null.model <- match.arg(null.model)
+    N <- nrow(samp)
+
+    # MNTD function on single sample
+    mntd.single <- function(samp, dis, abundance.weighted, i){
+        sppInSample <- names(samp[i, samp[i, ] > 0])
+        if (length(sppInSample) > 1) {
+            sample.dis <- dis[sppInSample, sppInSample]
+            diag(sample.dis) <- NA
+            if (abundance.weighted) {
+                mntds <- apply(sample.dis, 2, min, na.rm = TRUE)
+                sample.weights <- samp[i, sppInSample]
+                mntd <- weighted.mean(mntds, sample.weights)
+            }
+            else {
+                mntd <- mean(apply(sample.dis, 2, min, na.rm = TRUE))
+            }
+        }
+        else {
+            mntd <- NA
+        }
+    }
+
+    # Progress bar
+    pb <- txtProgressBar(max = N, style = 3)
+    progress <- function(n) setTxtProgressBar(pb, n)
+    opts <- list(progress = progress)
+
+    # Start parallel
+    if(cores == 1) {
+        registerDoSEQ() 
+    } else {
+        cl <- parallel::makeCluster(cores)
+        registerDoSNOW(cl)
+        on.exit(stopCluster(cl))
+    }
+
+    # Run parallel
+    i <- NULL
+    mntd.rand <- foreach(i = seq_len(N), .options.snow = opts, .combine = cbind, .packages = "picante") %dopar% {
+
+        rand.sub <- switch(null.model, 
+                           taxa.labels = replicate(runs, mntd.single(samp, taxaShuffle(dis), abundance.weighted = abundance.weighted, i)), 
+                           richness = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), 
+                           frequency = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "frequency"), dis, abundance.weighted, i)), 
+                           sample.pool = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), 
+                           phylogeny.pool = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "richness"), taxaShuffle(dis), abundance.weighted, i)), 
+                           independentswap = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "independentswap", iterations), dis, abundance.weighted, i)), 
+                           trialswap = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "trialswap", iterations), dis, abundance.weighted, i)))
+
+        return(rand.sub)
+
+    }
+
+    mntd.rand.mean <- apply(X = mntd.rand, MARGIN = 2, FUN = mean, na.rm = TRUE)
+    mntd.rand.sd <- apply(X = mntd.rand, MARGIN = 2, FUN = sd, na.rm = TRUE)
+    mntd.obs.z <- (mntd.obs - mntd.rand.mean)/mntd.rand.sd
+    mntd.obs.rank <- apply(X = rbind(mntd.obs, mntd.rand), MARGIN = 2,   FUN = rank)[1, ]
+    mntd.obs.rank <- ifelse(is.na(mntd.rand.mean), NA, mntd.obs.rank)
+    data.frame(ntaxa = specnumber(samp), 
+               mntd.obs, 
+               mntd.rand.mean, 
+               mntd.rand.sd, 
+               mntd.obs.rank, 
+               mntd.obs.z, 
+               mntd.obs.p = mntd.obs.rank/(runs + 1), runs = runs, row.names = row.names(samp))
+}
diff --git a/R/ses.mpd.par.R b/R/ses.mpd.par.R
@@ -0,0 +1,112 @@
+#' Standardized effect size of MPD
+#'
+#' Parallel calculation of standardized effect size of mean pairwise distances in communities. When used with a phylogenetic distance matrix, equivalent to -1 times the Nearest Relative Index (NRI).
+#' 
+#' Faster than ses.mpd from \code{picante} when there are many samples and taxa.
+#' @param samp Community data matrix with samples as rows
+#' @param dis Distance matrix (generally a phylogenetic distance matrix)
+#' @param null.model Null model to use (see Details section for description)
+#' @param abundance.weighted Should mean nearest taxon distances for each species be weighted by species abundance? (default = FALSE)
+#' @param runs Number of randomizations
+#' @param iterations Number of iterations to use for each randomization (for independent swap and trial null models)
+#' @param cores Number of cores to use for parallel computing
+#' @details Currently implemented null models (arguments to null.model):
+#' \itemize{
+#'  \item taxa.labels - Shuffle distance matrix labels (across all taxa included in distance matrix) 
+#'  \item richness - Randomize community data matrix abundances within samples (maintains sample species richness) 
+#'  \item frequency - Randomize community data matrix abundances within species (maintains species occurence frequency) 
+#'  \item sample.pool - Randomize community data matrix by drawing species from pool of species occurring in at least one community (sample pool) with equal probability   
+#'  \item phylogeny.pool- Randomize community data matrix by drawing species from pool of species occurring in the distance matrix (phylogeny pool) with equal probability   
+#'  \item independentswap - Randomize community data matrix with the independent swap algorithm (Gotelli 2000) maintaining species occurrence frequency and sample species richness  
+#'  \item trialswap - Randomize community data matrix with the trial-swap algorithm (Miklos & Podani 2004) maintaining species occurrence frequency and sample species richness  
+#' }
+#' @keywords ses MPD NRI
+#' @return
+#' A data frame of results for each community
+#' \itemize{
+#' \item{ntaxa}{Number of taxa in community}
+#' \item{mpd.obs}{Observed mpd in community}
+#' \item{mpd.rand.mean}{Mean mpd in null communities}
+#' \item{mpd.rand.sd}{Standard deviation of mpd in null communities}
+#' \item{mpd.obs.rank}{Rank of observed mpd vs. null communities}
+#' \item{mpd.obs.z}{Standardized effect size of mpd vs. null communities (= (mpd.obs - mpd.rand.mean) / mpd.rand.sd, equivalent to -NRI)}
+#' \item{mpd.obs.p}{P-value (quantile) of observed mpd vs. null communities (= mpd.obs.rank / runs + 1)}
+#' \item{runs}{Number of randomizations}  
+#' }
+#' @import picante
+#' @import doSNOW
+#' @export
+
+ses.mpd.par <- function(samp, dis, null.model = c("taxa.labels", "richness", 
+                                                  "frequency", "sample.pool", "phylogeny.pool", "independentswap", 
+                                                  "trialswap"), abundance.weighted = FALSE, runs = 999, iterations = 1000, cores = 1){
+    dis <- as.matrix(dis)
+    mpd.obs <- mpd(samp, dis, abundance.weighted = abundance.weighted)
+    null.model <- match.arg(null.model)
+    N <- nrow(samp)
+
+    # MPD function on single sample
+    mpd.single <- function(samp, dis, abundance.weighted, i){
+        sppInSample <- names(samp[i, samp[i, ] > 0])
+        if (length(sppInSample) > 1) {
+            sample.dis <- dis[sppInSample, sppInSample]
+            if (abundance.weighted) {
+                sample.weights <- t(as.matrix(samp[i, sppInSample, 
+                                                   drop = FALSE])) %*% as.matrix(samp[i, sppInSample, 
+                                                                                      drop = FALSE])
+                mpd <- weighted.mean(sample.dis, sample.weights)
+            }
+            else {
+                mpd <- mean(sample.dis[lower.tri(sample.dis)])
+            }
+        }
+        else {
+            mpd <- NA
+        }
+        mpd
+    }
+
+    # Progress bar
+    pb <- txtProgressBar(max = N, style = 3)
+    progress <- function(n) setTxtProgressBar(pb, n)
+    opts <- list(progress = progress)
+
+    # Start parallel
+    if(cores == 1) {
+        registerDoSEQ() 
+    } else {
+        cl <- parallel::makeCluster(cores)
+        registerDoSNOW(cl)
+        on.exit(stopCluster(cl))
+    }
+
+    # Run parallel
+    i <- NULL
+    mpd.rand <- foreach(i = seq_len(N), .options.snow = opts, .combine = cbind, .packages = "picante") %dopar% {
+
+        rand.sub <- switch(null.model, 
+                           taxa.labels = replicate(runs, mpd.single(samp, taxaShuffle(dis), abundance.weighted = abundance.weighted, i)), 
+                           richness = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), 
+                           frequency = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "frequency"), dis, abundance.weighted, i)), 
+                           sample.pool = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), 
+                           phylogeny.pool = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "richness"), taxaShuffle(dis), abundance.weighted, i)), 
+                           independentswap = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "independentswap", iterations), dis, abundance.weighted, i)), 
+                           trialswap = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "trialswap", iterations), dis, abundance.weighted, i)))
+
+        return(rand.sub)
+
+    }
+
+    mpd.rand.mean <- apply(X = mpd.rand, MARGIN = 2, FUN = mean, na.rm = TRUE)
+    mpd.rand.sd <- apply(X = mpd.rand, MARGIN = 2, FUN = sd, na.rm = TRUE)
+    mpd.obs.z <- (mpd.obs - mpd.rand.mean)/mpd.rand.sd
+    mpd.obs.rank <- apply(X = rbind(mpd.obs, mpd.rand), MARGIN = 2,   FUN = rank)[1, ]
+    mpd.obs.rank <- ifelse(is.na(mpd.rand.mean), NA, mpd.obs.rank)
+    data.frame(ntaxa = specnumber(samp), 
+               mpd.obs, 
+               mpd.rand.mean, 
+               mpd.rand.sd, 
+               mpd.obs.rank, 
+               mpd.obs.z, 
+               mpd.obs.p = mpd.obs.rank/(runs + 1), runs = runs, row.names = row.names(samp))
+}
diff --git a/README.md b/README.md
@@ -79,11 +79,19 @@ permatfull/permatswap from the vegan package
 
 #### comdist.par
 
-A parallel version of the comdist function from the picante package for significant speedup
+A parallel version of the comdist function from the picante package for significant speedup on large datasets
 
 #### comdistnt.par
 
-A parallel version of the comdistnt function from the picante package for significant speedup
+A parallel version of the comdistnt function from the picante package for significant speedup on large datasets
+
+#### ses.mpd.par
+
+A parallel version of the ses.mpd function from the picante package for significant speedup on large datasets
+
+#### ses.mntd.par
+
+A parallel version of the ses.mntd function from the picante package for significant speedup on large datasets
 
 #### ses.permtest
 
@@ -94,6 +102,6 @@ Permutation test of z-matrix from `ses.comdist`, `ses.comdist2`, `ses.comdistnt`
 `rarefy_rrna`: Some code is from vegan licensed under GPL-2
 (<https://github.com/vegandevs/vegan>)
 
-`comdist.par`, `comdistnt.par`, `ses.comdist`, `ses.comdist2`, `ses.comdistnt` and `ses.comdistnt2`: Some
+`ses.mpd.par`, `ses.mntd.par`, `comdist.par`, `comdistnt.par`, `ses.comdist`, `ses.comdist2`, `ses.comdistnt` and `ses.comdistnt2`: Some
 code is from picante licensed under GPL-2
 (<https://github.com/skembel/picante>)
diff --git a/man/ses.mntd.par.Rd b/man/ses.mntd.par.Rd