From 40e64b5829d5e3c00a35144833f5efb1d27aa237 Mon Sep 17 00:00:00 2001 From: Jakob Russel Date: Wed, 6 Mar 2019 11:19:20 +0100 Subject: [PATCH] added functions --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/ses.mntd.par.R | 111 +++++++++++++++++++++++++++++++++++++++++++ R/ses.mpd.par.R | 112 ++++++++++++++++++++++++++++++++++++++++++++ README.md | 14 ++++-- man/ses.mntd.par.Rd | 59 +++++++++++++++++++++++ man/ses.mpd.par.Rd | 59 +++++++++++++++++++++++ 7 files changed, 355 insertions(+), 4 deletions(-) create mode 100644 R/ses.mntd.par.R create mode 100644 R/ses.mpd.par.R create mode 100644 man/ses.mntd.par.Rd create mode 100644 man/ses.mpd.par.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 7f83899..454285a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: MicEco Title: Various functions for microbial community data -Version: 0.9.3 +Version: 0.9.4 Authors@R: person("Jakob", "Russel", email = "russel2620@gmail.com", role = c("aut", "cre")) Description: Collection of functions for microbiome analyses. E.g. fitting neutral models and standardized effect sizes of phylogenetic beta diversities, and much more. Depends: R (>= 3.2.5) diff --git a/NAMESPACE b/NAMESPACE index b499e47..b9ab347 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,8 @@ export(ses.comdist) export(ses.comdist2) export(ses.comdistnt) export(ses.comdistnt2) +export(ses.mntd.par) +export(ses.mpd.par) export(ses.permtest) import(Hmisc) import(bbmle) diff --git a/R/ses.mntd.par.R b/R/ses.mntd.par.R new file mode 100644 index 0000000..07bb122 --- /dev/null +++ b/R/ses.mntd.par.R @@ -0,0 +1,111 @@ +#' Standardized effect size of MNTD +#' +#' Parallel calculation of standardized effect size of mean nearest taxon distances in communities. When used with a phylogenetic distance matrix, equivalent to -1 times the Nearest Taxon Index (NTI). +#' +#' Faster than ses.mntd from \code{picante} when there are many samples and taxa. +#' @param samp Community data matrix with samples as rows +#' @param dis Distance matrix (generally a phylogenetic distance matrix) +#' @param null.model Null model to use (see Details section for description) +#' @param abundance.weighted Should mean nearest taxon distances for each species be weighted by species abundance? (default = FALSE) +#' @param runs Number of randomizations +#' @param iterations Number of iterations to use for each randomization (for independent swap and trial null models) +#' @param cores Number of cores to use for parallel computing +#' @details Currently implemented null models (arguments to null.model): +#' \itemize{ +#' \item taxa.labels - Shuffle distance matrix labels (across all taxa included in distance matrix) +#' \item richness - Randomize community data matrix abundances within samples (maintains sample species richness) +#' \item frequency - Randomize community data matrix abundances within species (maintains species occurence frequency) +#' \item sample.pool - Randomize community data matrix by drawing species from pool of species occurring in at least one community (sample pool) with equal probability +#' \item phylogeny.pool- Randomize community data matrix by drawing species from pool of species occurring in the distance matrix (phylogeny pool) with equal probability +#' \item independentswap - Randomize community data matrix with the independent swap algorithm (Gotelli 2000) maintaining species occurrence frequency and sample species richness +#' \item trialswap - Randomize community data matrix with the trial-swap algorithm (Miklos & Podani 2004) maintaining species occurrence frequency and sample species richness +#' } +#' @keywords ses MNTD NTI +#' @return +#' A data frame of results for each community +#' \itemize{ +#' \item{ntaxa}{Number of taxa in community} +#' \item{mntd.obs}{Observed mntd in community} +#' \item{mntd.rand.mean}{Mean mntd in null communities} +#' \item{mntd.rand.sd}{Standard deviation of mntd in null communities} +#' \item{mntd.obs.rank}{Rank of observed mntd vs. null communities} +#' \item{mntd.obs.z}{Standardized effect size of mntd vs. null communities (= (mntd.obs - mntd.rand.mean) / mntd.rand.sd, equivalent to -NRI)} +#' \item{mntd.obs.p}{P-value (quantile) of observed mntd vs. null communities (= mntd.obs.rank / runs + 1)} +#' \item{runs}{Number of randomizations} +#' } +#' @import picante +#' @import doSNOW +#' @export + +ses.mntd.par <- function(samp, dis, null.model = c("taxa.labels", "richness", + "frequency", "sample.pool", "phylogeny.pool", "independentswap", + "trialswap"), abundance.weighted = FALSE, runs = 999, iterations = 1000, cores = 1){ + dis <- as.matrix(dis) + mntd.obs <- mntd(samp, dis, abundance.weighted = abundance.weighted) + null.model <- match.arg(null.model) + N <- nrow(samp) + + # MNTD function on single sample + mntd.single <- function(samp, dis, abundance.weighted, i){ + sppInSample <- names(samp[i, samp[i, ] > 0]) + if (length(sppInSample) > 1) { + sample.dis <- dis[sppInSample, sppInSample] + diag(sample.dis) <- NA + if (abundance.weighted) { + mntds <- apply(sample.dis, 2, min, na.rm = TRUE) + sample.weights <- samp[i, sppInSample] + mntd <- weighted.mean(mntds, sample.weights) + } + else { + mntd <- mean(apply(sample.dis, 2, min, na.rm = TRUE)) + } + } + else { + mntd <- NA + } + } + + # Progress bar + pb <- txtProgressBar(max = N, style = 3) + progress <- function(n) setTxtProgressBar(pb, n) + opts <- list(progress = progress) + + # Start parallel + if(cores == 1) { + registerDoSEQ() + } else { + cl <- parallel::makeCluster(cores) + registerDoSNOW(cl) + on.exit(stopCluster(cl)) + } + + # Run parallel + i <- NULL + mntd.rand <- foreach(i = seq_len(N), .options.snow = opts, .combine = cbind, .packages = "picante") %dopar% { + + rand.sub <- switch(null.model, + taxa.labels = replicate(runs, mntd.single(samp, taxaShuffle(dis), abundance.weighted = abundance.weighted, i)), + richness = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), + frequency = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "frequency"), dis, abundance.weighted, i)), + sample.pool = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), + phylogeny.pool = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "richness"), taxaShuffle(dis), abundance.weighted, i)), + independentswap = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "independentswap", iterations), dis, abundance.weighted, i)), + trialswap = replicate(runs, mntd.single(randomizeMatrix(samp, null.model = "trialswap", iterations), dis, abundance.weighted, i))) + + return(rand.sub) + + } + + mntd.rand.mean <- apply(X = mntd.rand, MARGIN = 2, FUN = mean, na.rm = TRUE) + mntd.rand.sd <- apply(X = mntd.rand, MARGIN = 2, FUN = sd, na.rm = TRUE) + mntd.obs.z <- (mntd.obs - mntd.rand.mean)/mntd.rand.sd + mntd.obs.rank <- apply(X = rbind(mntd.obs, mntd.rand), MARGIN = 2, FUN = rank)[1, ] + mntd.obs.rank <- ifelse(is.na(mntd.rand.mean), NA, mntd.obs.rank) + data.frame(ntaxa = specnumber(samp), + mntd.obs, + mntd.rand.mean, + mntd.rand.sd, + mntd.obs.rank, + mntd.obs.z, + mntd.obs.p = mntd.obs.rank/(runs + 1), runs = runs, row.names = row.names(samp)) +} diff --git a/R/ses.mpd.par.R b/R/ses.mpd.par.R new file mode 100644 index 0000000..a8e0a78 --- /dev/null +++ b/R/ses.mpd.par.R @@ -0,0 +1,112 @@ +#' Standardized effect size of MPD +#' +#' Parallel calculation of standardized effect size of mean pairwise distances in communities. When used with a phylogenetic distance matrix, equivalent to -1 times the Nearest Relative Index (NRI). +#' +#' Faster than ses.mpd from \code{picante} when there are many samples and taxa. +#' @param samp Community data matrix with samples as rows +#' @param dis Distance matrix (generally a phylogenetic distance matrix) +#' @param null.model Null model to use (see Details section for description) +#' @param abundance.weighted Should mean nearest taxon distances for each species be weighted by species abundance? (default = FALSE) +#' @param runs Number of randomizations +#' @param iterations Number of iterations to use for each randomization (for independent swap and trial null models) +#' @param cores Number of cores to use for parallel computing +#' @details Currently implemented null models (arguments to null.model): +#' \itemize{ +#' \item taxa.labels - Shuffle distance matrix labels (across all taxa included in distance matrix) +#' \item richness - Randomize community data matrix abundances within samples (maintains sample species richness) +#' \item frequency - Randomize community data matrix abundances within species (maintains species occurence frequency) +#' \item sample.pool - Randomize community data matrix by drawing species from pool of species occurring in at least one community (sample pool) with equal probability +#' \item phylogeny.pool- Randomize community data matrix by drawing species from pool of species occurring in the distance matrix (phylogeny pool) with equal probability +#' \item independentswap - Randomize community data matrix with the independent swap algorithm (Gotelli 2000) maintaining species occurrence frequency and sample species richness +#' \item trialswap - Randomize community data matrix with the trial-swap algorithm (Miklos & Podani 2004) maintaining species occurrence frequency and sample species richness +#' } +#' @keywords ses MPD NRI +#' @return +#' A data frame of results for each community +#' \itemize{ +#' \item{ntaxa}{Number of taxa in community} +#' \item{mpd.obs}{Observed mpd in community} +#' \item{mpd.rand.mean}{Mean mpd in null communities} +#' \item{mpd.rand.sd}{Standard deviation of mpd in null communities} +#' \item{mpd.obs.rank}{Rank of observed mpd vs. null communities} +#' \item{mpd.obs.z}{Standardized effect size of mpd vs. null communities (= (mpd.obs - mpd.rand.mean) / mpd.rand.sd, equivalent to -NRI)} +#' \item{mpd.obs.p}{P-value (quantile) of observed mpd vs. null communities (= mpd.obs.rank / runs + 1)} +#' \item{runs}{Number of randomizations} +#' } +#' @import picante +#' @import doSNOW +#' @export + +ses.mpd.par <- function(samp, dis, null.model = c("taxa.labels", "richness", + "frequency", "sample.pool", "phylogeny.pool", "independentswap", + "trialswap"), abundance.weighted = FALSE, runs = 999, iterations = 1000, cores = 1){ + dis <- as.matrix(dis) + mpd.obs <- mpd(samp, dis, abundance.weighted = abundance.weighted) + null.model <- match.arg(null.model) + N <- nrow(samp) + + # MPD function on single sample + mpd.single <- function(samp, dis, abundance.weighted, i){ + sppInSample <- names(samp[i, samp[i, ] > 0]) + if (length(sppInSample) > 1) { + sample.dis <- dis[sppInSample, sppInSample] + if (abundance.weighted) { + sample.weights <- t(as.matrix(samp[i, sppInSample, + drop = FALSE])) %*% as.matrix(samp[i, sppInSample, + drop = FALSE]) + mpd <- weighted.mean(sample.dis, sample.weights) + } + else { + mpd <- mean(sample.dis[lower.tri(sample.dis)]) + } + } + else { + mpd <- NA + } + mpd + } + + # Progress bar + pb <- txtProgressBar(max = N, style = 3) + progress <- function(n) setTxtProgressBar(pb, n) + opts <- list(progress = progress) + + # Start parallel + if(cores == 1) { + registerDoSEQ() + } else { + cl <- parallel::makeCluster(cores) + registerDoSNOW(cl) + on.exit(stopCluster(cl)) + } + + # Run parallel + i <- NULL + mpd.rand <- foreach(i = seq_len(N), .options.snow = opts, .combine = cbind, .packages = "picante") %dopar% { + + rand.sub <- switch(null.model, + taxa.labels = replicate(runs, mpd.single(samp, taxaShuffle(dis), abundance.weighted = abundance.weighted, i)), + richness = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), + frequency = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "frequency"), dis, abundance.weighted, i)), + sample.pool = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "richness"), dis, abundance.weighted, i)), + phylogeny.pool = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "richness"), taxaShuffle(dis), abundance.weighted, i)), + independentswap = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "independentswap", iterations), dis, abundance.weighted, i)), + trialswap = replicate(runs, mpd.single(randomizeMatrix(samp, null.model = "trialswap", iterations), dis, abundance.weighted, i))) + + return(rand.sub) + + } + + mpd.rand.mean <- apply(X = mpd.rand, MARGIN = 2, FUN = mean, na.rm = TRUE) + mpd.rand.sd <- apply(X = mpd.rand, MARGIN = 2, FUN = sd, na.rm = TRUE) + mpd.obs.z <- (mpd.obs - mpd.rand.mean)/mpd.rand.sd + mpd.obs.rank <- apply(X = rbind(mpd.obs, mpd.rand), MARGIN = 2, FUN = rank)[1, ] + mpd.obs.rank <- ifelse(is.na(mpd.rand.mean), NA, mpd.obs.rank) + data.frame(ntaxa = specnumber(samp), + mpd.obs, + mpd.rand.mean, + mpd.rand.sd, + mpd.obs.rank, + mpd.obs.z, + mpd.obs.p = mpd.obs.rank/(runs + 1), runs = runs, row.names = row.names(samp)) +} diff --git a/README.md b/README.md index 317aef3..42541d8 100644 --- a/README.md +++ b/README.md @@ -79,11 +79,19 @@ permatfull/permatswap from the vegan package #### comdist.par -A parallel version of the comdist function from the picante package for significant speedup +A parallel version of the comdist function from the picante package for significant speedup on large datasets #### comdistnt.par -A parallel version of the comdistnt function from the picante package for significant speedup +A parallel version of the comdistnt function from the picante package for significant speedup on large datasets + +#### ses.mpd.par + +A parallel version of the ses.mpd function from the picante package for significant speedup on large datasets + +#### ses.mntd.par + +A parallel version of the ses.mntd function from the picante package for significant speedup on large datasets #### ses.permtest @@ -94,6 +102,6 @@ Permutation test of z-matrix from `ses.comdist`, `ses.comdist2`, `ses.comdistnt` `rarefy_rrna`: Some code is from vegan licensed under GPL-2 () -`comdist.par`, `comdistnt.par`, `ses.comdist`, `ses.comdist2`, `ses.comdistnt` and `ses.comdistnt2`: Some +`ses.mpd.par`, `ses.mntd.par`, `comdist.par`, `comdistnt.par`, `ses.comdist`, `ses.comdist2`, `ses.comdistnt` and `ses.comdistnt2`: Some code is from picante licensed under GPL-2 () diff --git a/man/ses.mntd.par.Rd b/man/ses.mntd.par.Rd new file mode 100644 index 0000000..202c1b8 --- /dev/null +++ b/man/ses.mntd.par.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ses.mntd.par.R +\name{ses.mntd.par} +\alias{ses.mntd.par} +\title{Standardized effect size of MNTD} +\usage{ +ses.mntd.par(samp, dis, null.model = c("taxa.labels", "richness", + "frequency", "sample.pool", "phylogeny.pool", "independentswap", + "trialswap"), abundance.weighted = FALSE, runs = 999, + iterations = 1000, cores = 1) +} +\arguments{ +\item{samp}{Community data matrix with samples as rows} + +\item{dis}{Distance matrix (generally a phylogenetic distance matrix)} + +\item{null.model}{Null model to use (see Details section for description)} + +\item{abundance.weighted}{Should mean nearest taxon distances for each species be weighted by species abundance? (default = FALSE)} + +\item{runs}{Number of randomizations} + +\item{iterations}{Number of iterations to use for each randomization (for independent swap and trial null models)} + +\item{cores}{Number of cores to use for parallel computing} +} +\value{ +A data frame of results for each community +\itemize{ +\item{ntaxa}{Number of taxa in community} +\item{mntd.obs}{Observed mntd in community} +\item{mntd.rand.mean}{Mean mntd in null communities} +\item{mntd.rand.sd}{Standard deviation of mntd in null communities} +\item{mntd.obs.rank}{Rank of observed mntd vs. null communities} +\item{mntd.obs.z}{Standardized effect size of mntd vs. null communities (= (mntd.obs - mntd.rand.mean) / mntd.rand.sd, equivalent to -NRI)} +\item{mntd.obs.p}{P-value (quantile) of observed mntd vs. null communities (= mntd.obs.rank / runs + 1)} +\item{runs}{Number of randomizations} +} +} +\description{ +Parallel calculation of standardized effect size of mean nearest taxon distances in communities. When used with a phylogenetic distance matrix, equivalent to -1 times the Nearest Taxon Index (NTI). +} +\details{ +Faster than ses.mntd from \code{picante} when there are many samples and taxa. + +Currently implemented null models (arguments to null.model): +\itemize{ + \item taxa.labels - Shuffle distance matrix labels (across all taxa included in distance matrix) + \item richness - Randomize community data matrix abundances within samples (maintains sample species richness) + \item frequency - Randomize community data matrix abundances within species (maintains species occurence frequency) + \item sample.pool - Randomize community data matrix by drawing species from pool of species occurring in at least one community (sample pool) with equal probability + \item phylogeny.pool- Randomize community data matrix by drawing species from pool of species occurring in the distance matrix (phylogeny pool) with equal probability + \item independentswap - Randomize community data matrix with the independent swap algorithm (Gotelli 2000) maintaining species occurrence frequency and sample species richness + \item trialswap - Randomize community data matrix with the trial-swap algorithm (Miklos & Podani 2004) maintaining species occurrence frequency and sample species richness +} +} +\keyword{MNTD} +\keyword{NTI} +\keyword{ses} diff --git a/man/ses.mpd.par.Rd b/man/ses.mpd.par.Rd new file mode 100644 index 0000000..1181358 --- /dev/null +++ b/man/ses.mpd.par.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ses.mpd.par.R +\name{ses.mpd.par} +\alias{ses.mpd.par} +\title{Standardized effect size of MPD} +\usage{ +ses.mpd.par(samp, dis, null.model = c("taxa.labels", "richness", + "frequency", "sample.pool", "phylogeny.pool", "independentswap", + "trialswap"), abundance.weighted = FALSE, runs = 999, + iterations = 1000, cores = 1) +} +\arguments{ +\item{samp}{Community data matrix with samples as rows} + +\item{dis}{Distance matrix (generally a phylogenetic distance matrix)} + +\item{null.model}{Null model to use (see Details section for description)} + +\item{abundance.weighted}{Should mean nearest taxon distances for each species be weighted by species abundance? (default = FALSE)} + +\item{runs}{Number of randomizations} + +\item{iterations}{Number of iterations to use for each randomization (for independent swap and trial null models)} + +\item{cores}{Number of cores to use for parallel computing} +} +\value{ +A data frame of results for each community +\itemize{ +\item{ntaxa}{Number of taxa in community} +\item{mpd.obs}{Observed mpd in community} +\item{mpd.rand.mean}{Mean mpd in null communities} +\item{mpd.rand.sd}{Standard deviation of mpd in null communities} +\item{mpd.obs.rank}{Rank of observed mpd vs. null communities} +\item{mpd.obs.z}{Standardized effect size of mpd vs. null communities (= (mpd.obs - mpd.rand.mean) / mpd.rand.sd, equivalent to -NRI)} +\item{mpd.obs.p}{P-value (quantile) of observed mpd vs. null communities (= mpd.obs.rank / runs + 1)} +\item{runs}{Number of randomizations} +} +} +\description{ +Parallel calculation of standardized effect size of mean pairwise distances in communities. When used with a phylogenetic distance matrix, equivalent to -1 times the Nearest Relative Index (NRI). +} +\details{ +Faster than ses.mpd from \code{picante} when there are many samples and taxa. + +Currently implemented null models (arguments to null.model): +\itemize{ + \item taxa.labels - Shuffle distance matrix labels (across all taxa included in distance matrix) + \item richness - Randomize community data matrix abundances within samples (maintains sample species richness) + \item frequency - Randomize community data matrix abundances within species (maintains species occurence frequency) + \item sample.pool - Randomize community data matrix by drawing species from pool of species occurring in at least one community (sample pool) with equal probability + \item phylogeny.pool- Randomize community data matrix by drawing species from pool of species occurring in the distance matrix (phylogeny pool) with equal probability + \item independentswap - Randomize community data matrix with the independent swap algorithm (Gotelli 2000) maintaining species occurrence frequency and sample species richness + \item trialswap - Randomize community data matrix with the trial-swap algorithm (Miklos & Podani 2004) maintaining species occurrence frequency and sample species richness +} +} +\keyword{MPD} +\keyword{NRI} +\keyword{ses}