From 24f1075abda69d4c7b3382ca4ebe2739706dab9c Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Mon, 13 Jul 2020 11:03:11 -0400 Subject: [PATCH] Add sparse+low rank and other models (#134) * fix pr and mvdistr * add pulsar + glmnets * added cpp admm * updated wrappers and src * update exports * added slr to spiec.easi * added npn option * added lambda.max option * init ebic methods * many changes * fixed imports * update README * update some docs * rebase and cleanup docs * update docs and roxygen comments * fix readme Co-authored-by: Zachary Kurtz --- DESCRIPTION | 11 +- NAMESPACE | 16 + R/RcppExports.R | 8 + R/SparseICov.R | 113 +++++- R/SparseLowRankICov.R | 304 +++++++++++++++ R/coat.R | 175 +++++++++ R/community_graph.R | 27 +- R/fitdistr.R | 10 +- R/mvdistributions.R | 14 +- R/normalization.R | 9 + R/roc.R | 53 ++- R/select.R | 191 ++++++++++ R/spaRcc.R | 13 +- R/spiec-easi-pkg.R | 8 +- R/spiec-easi.R | 92 +++-- R/utilities.R | 69 +++- README.Rmd | 68 +++- README.md | 691 +++++++++++++++++++--------------- man/AGP.Rd | 6 +- man/adj2igraph.Rd | 9 +- man/alr.Rd | 23 +- man/clr.Rd | 3 +- man/coat.Rd | 44 +++ man/ebic.Rd | 20 + man/edge.diss.Rd | 27 ++ man/graph2prec.Rd | 10 +- man/hmp2.Rd | 11 +- man/multi.spiec.easi.Rd | 11 +- man/neighborhood.net.Rd | 24 ++ man/norm_rdiric.Rd | 14 + man/qqdplot.Rd | 12 - man/rmvnorm.Rd | 9 +- man/robustPCA.Rd | 21 ++ man/sparccboot.Rd | 16 +- man/sparseLowRankiCov.Rd | 34 ++ man/sparseiCov.Rd | 3 +- man/spiec.easi.Rd | 18 +- man/synth_comm_from_counts.Rd | 12 +- man/triu.Rd | 24 ++ src/ADMM.cpp | 240 ++++++++++++ src/Makevars | 7 + src/RcppExports.cpp | 43 +++ src/matops.cpp | 15 + src/sqrtNewton.cpp | 58 +++ src/svthresh.cpp | 34 ++ 45 files changed, 2184 insertions(+), 436 deletions(-) create mode 100644 R/RcppExports.R create mode 100644 R/SparseLowRankICov.R create mode 100644 R/coat.R create mode 100644 R/select.R create mode 100644 man/coat.Rd create mode 100644 man/ebic.Rd create mode 100644 man/edge.diss.Rd create mode 100644 man/neighborhood.net.Rd create mode 100644 man/norm_rdiric.Rd delete mode 100644 man/qqdplot.Rd create mode 100644 man/robustPCA.Rd create mode 100644 man/sparseLowRankiCov.Rd create mode 100644 man/triu.Rd create mode 100644 src/ADMM.cpp create mode 100644 src/Makevars create mode 100644 src/RcppExports.cpp create mode 100644 src/matops.cpp create mode 100644 src/sqrtNewton.cpp create mode 100644 src/svthresh.cpp diff --git a/DESCRIPTION b/DESCRIPTION index 3d1e6df..48e1063 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: SpiecEasi Title: Sparse Inverse Covariance for Ecological Statistical Inference -Version: 1.0.7 +Version: 1.1.0 Authors@R: c( person("Zachary", "Kurtz", role = c("aut", "cre"), email="zdkurtz@gmail.com"), person("Christian", "Mueller", role = "aut"), @@ -10,7 +10,7 @@ Authors@R: c( Description: Estimate networks from the precision matrix of compositional microbial abundance data. 
Encoding: UTF-8 Depends: - R (>= 3.3.0), + R (>= 3.6.0), Imports: stats, methods, @@ -20,7 +20,8 @@ Imports: pulsar (>= 0.3.4), MASS, VGAM, - Matrix + Matrix, + glmnet Suggests: parallel, boot, @@ -30,5 +31,5 @@ Suggests: testthat biocViews: License: GPL (>= 2) -LazyData: true -RoxygenNote: 6.1.1 +LinkingTo: Rcpp, RcppArmadillo +RoxygenNote: 7.1.0 diff --git a/NAMESPACE b/NAMESPACE index b2967c2..25c459b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method('$',Matrix) +S3method('[[',Matrix) S3method(alr,data.frame) S3method(alr,default) S3method(alr,matrix) @@ -8,6 +10,7 @@ S3method(as.matrix,graph) S3method(clr,data.frame) S3method(clr,default) S3method(clr,matrix) +S3method(getOptX,pulsar.refit) S3method(spiec.easi,default) S3method(spiec.easi,list) S3method(spiec.easi,otu_table) @@ -15,8 +18,11 @@ S3method(spiec.easi,phyloseq) export(adj2igraph) export(alr) export(clr) +export(coat) export(cor2cov) export(cov2prec) +export(ebic) +export(edge.diss) export(fitdistr) export(getOptBeta) export(getOptCov) @@ -33,6 +39,7 @@ export(make_graph) export(multi.spiec.easi) export(neff) export(norm_pseudo) +export(norm_rdiric) export(norm_to_total) export(prec2cov) export(pval.sparccboot) @@ -42,6 +49,7 @@ export(rmvnorm) export(rmvpois) export(rmvzinegbin) export(rmvzipois) +export(robustPCA) export(rzipois) export(shannon) export(sparcc) @@ -52,12 +60,17 @@ export(stars.pr) export(stars.roc) export(symBeta) export(synth_comm_from_counts) +export(tril) +export(triu) +export(triu2diag) importFrom(MASS,ginv) +importFrom(Matrix,t) importFrom(VGAM,dzinegbin) importFrom(VGAM,dzipois) importFrom(VGAM,qzinegbin) importFrom(VGAM,qzipois) importFrom(VGAM,rdiric) +importFrom(glmnet,glmnet) importFrom(grDevices,dev.off) importFrom(grDevices,png) importFrom(graphics,abline) @@ -68,6 +81,8 @@ importFrom(huge,huge) importFrom(huge,huge.npn) importFrom(methods,as) importFrom(pulsar,batch.pulsar) +importFrom(pulsar,getLamPath) +importFrom(pulsar,getMaxCov) importFrom(pulsar,pulsar) importFrom(stats,cor) importFrom(stats,cov) @@ -90,3 +105,4 @@ importFrom(stats,rpois) importFrom(stats,runif) importFrom(stats,sd) importFrom(stats,var) +useDynLib(SpiecEasi) diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..4223a02 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,8 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' @noRd +ADMM <- function(SigmaO, lambda, I, Lambda, Y, beta = 0, r = 0L, mu = 0, eta = 4/5, muf = 1e-6, maxiter = 500L, newtol = 1e-5, tol = 1e-5, over_relax_par = 1.6, shrinkDiag = TRUE) { + .Call('_SpiecEasi_ADMM', PACKAGE = 'SpiecEasi', SigmaO, lambda, I, Lambda, Y, beta, r, mu, eta, muf, maxiter, newtol, tol, over_relax_par, shrinkDiag) +} + diff --git a/R/SparseICov.R b/R/SparseICov.R index d6d9e7f..a07950d 100644 --- a/R/SparseICov.R +++ b/R/SparseICov.R @@ -47,13 +47,9 @@ sparseiCov <- function(data, method, npn=FALSE, verbose=FALSE, cov.output = TRUE if (npn) data <- huge::huge.npn(data, verbose=verbose) args <- list(...) 
+  method <- switch(method, glasso = "glasso", mb = "mb", stop("Method not supported"))
-  method <- switch(method, glasso = "glasso", mb = "mb",
-                   stop("Method not supported"))
-
-  if (is.null(args$lambda.min.ratio))
-    args$lambda.min.ratio <- 1e-3
-
+  if (is.null(args$lambda.min.ratio)) args$lambda.min.ratio <- 1e-3
   est <- do.call(huge::huge,
              c(args,
              list(x=data,
                   method=method, verbose=verbose,
@@ -69,3 +65,108 @@ sparseiCov <- function(data, method, npn=FALSE, verbose=FALSE, cov.output = TRUE
 #   }
   return(est)
 }
+
+
+#' Neighborhood net estimates
+#'
+#' Select a sparse inverse covariance matrix using neighborhood selection and glmnet from various exponential models.
+#' @param data n x p input (pre-transformed) data
+#' @param lambda the lambda path
+#' @param method the model family; 'ising' and 'poisson' models are currently supported.
+#' @param ncores number of cores for distributing the model fitting
+#' @param sym symmetrize the neighborhood using the 'or' (default)/'and' rule
+#' @param ... further arguments to glmnet
+#' @importFrom Matrix t
+neighborhood.net <- function(data, lambda, method="ising", ncores=1, sym='or', ...) {
+  p <- ncol(data)
+  l <- length(lambda)
+  args <- list(...)
+  match.fun(switch(method,
+    ising   ={ nbFun <- glm.neighborhood ; args$link <- 'binomial' },
+    poisson ={ nbFun <- glm.neighborhood ; args$link <- 'poisson' }
+    # loglin ={ nbFun <- llgm.neighborhood}
+  ))
+
+  ### Remove 0/1 binomials/ zero variance features
+  if (method=='ising') {
+    mintab <- function(x) {
+      xtab <- table(x)
+      length(xtab) == 1 || min(xtab) == 1
+    }
+    zvind <- which(apply(data, 2, mintab))
+  } else {
+    zvind <- which(apply(data, 2, var)==0)
+  }
+
+  estFun <- function(i) {
+    betamat <- matrix(0, p, l)
+    if (!(i %in% zvind)) {
+      suppressWarnings(
+        out <- do.call(nbFun,
+               c(list(data[,-c(i, zvind)], data[,i,drop=FALSE], lambda), args)))
+      lsub <- ncol(out)
+      if (lsubth)
+# }
+# return(Bmat)
+# }
+
+dclr <- function(x) t(clr(apply(x, 1, norm_diric),2))
+dclrNPN <- function(x) huge::huge.npn(t(clr(apply(x, 1, norm_diric),2)), verbose=FALSE)
diff --git a/R/SparseLowRankICov.R b/R/SparseLowRankICov.R
new file mode 100644
index 0000000..32a9752
--- /dev/null
+++ b/R/SparseLowRankICov.R
@@ -0,0 +1,304 @@
+#' Sparse plus Low Rank inverse covariance
+#'
+#' Select an inverse covariance matrix that is a sparse plus low rank decomposition.
+#'
+#' @param data the n x p data matrix
+#' @param npn flag to first fit nonparametric normal transform to the data
+#' @param verbose flag to turn on verbose output
+#' @param cor flag to use correlation matrix as the input (default: false - uses covariance)
+#' @param ... arguments to override default algorithm settings (see details)
+#' @details
+#' This is a wrapper function for sparse plus low rank iCov estimations performed by a custom ADMM algorithm.
+#'
+#' Therefore, arguments \code{...} should be named. Typically, these are for specifying a penalty parameter, \code{lambda}, or the number of penalties to use.
+#' By default 10 penalties are used, ranging logarithmically between \code{lambda.min.ratio}*MAX and MAX.
+#' MAX is the theoretical upper bound on \code{lambda} and is \code{max|S|}, the maximum absolute value in the data correlation matrix.
+#' \code{lambda.min.ratio} is 1e-3 by default. Lower values of \code{lambda} require more memory/cpu time to compute, and sometimes huge will throw an error.
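+#'
+#' A minimal sketch of the default path construction (assuming the defaults
+#' visible in the function below: \code{lambda.max=1}, \code{lambda.min.ratio=1e-3}
+#' and \code{nlambda=10}):
+#' \preformatted{
+#'   maxlam <- 1
+#'   lambda <- pulsar::getLamPath(maxlam, maxlam * 1e-3, len = 10, log = TRUE)
+#' }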
+#'
+#' The argument \code{nlambda} determines the number of penalties - somewhere between 10-100 is usually good, depending on how the values of empirical correlation are distributed.
+#'
+#' One of \code{beta} (penalty for the nuclear norm) or \code{r} (number of ranks) should be supplied, or \code{r=2} is chosen by default.
+#' @export
+sparseLowRankiCov <- function(data, npn=FALSE, verbose=FALSE, cor=FALSE, ...) {
+## TODO: make args to admm2 explicit
+  args <- list(...)
+  if (length(args$r) > 1 || length(args$beta) > 1)
+    stop("Only single value allowed for \'r\' or \'beta\'")
+
+  if (npn) data <- huge::huge.npn(data, verbose=verbose)
+  if (isSymmetric(data)) SigmaO <- data
+  else SigmaO <- cov(data)
+  if (cor) SigmaO <- cov2cor(SigmaO)
+
+  if (!is.null(args[[ "lambda.max" ]])) maxlam <- args$lambda.max
+  else maxlam <- 1
+  if (is.null(args[[ "lambda" ]])) {
+    if (is.null(args[[ "lambda.min.ratio" ]])) args$lambda.min.ratio <- 1e-3
+    if (is.null(args[[ "nlambda" ]])) args$nlambda <- 10
+    lambda <- getLamPath(maxlam, maxlam*args$lambda.min.ratio, args$nlambda, log=TRUE)
+    args$lambda.min.ratio <- NULL ; args$nlambda <- NULL
+  } else lambda <- args$lambda
+  args$lambda.min.ratio <- args$nlambda <- args$lambda <- args$lambda.max <- NULL
+
+  if (is.null(args[[ "beta" ]])) {
+    if (is.null(args[[ "r" ]]))
+      args$r <- 2
+  }
+
+  n <- length(lambda)
+  args$SigmaO <- SigmaO
+### args$r <- Lrank
+
+# lest <- parallel::mclapply(lambda, mc.cores=ncores, FUN=function(lam)
+#           do.call('admm2', c(list(lambda=lam), args)))
+  p <- ncol(SigmaO)
+  I <- diag(p)
+  args$opts <- c(args$opts, list(I=I))
+  args$opts$tol <- 1e-3
+## lest <- vector('list', n)
+  loglik <- vector('numeric', n)
+  path <- vector('list', n) ; icov <- vector('list', n) ; resid <- vector('list', n)
+  for (i in n:1) {
+    est <- do.call('admm2', c(list(lambda=lambda[i]), args))
+    tmp <- est$S
+    icov [[i]] <- tmp
+    resid[[i]] <- est$L
+    tmp <- Matrix::forceSymmetric(Matrix::triu(tmp,k=1))
+    path [[i]] <- as(tmp, 'lsCMatrix')
+    args$opts$Lambda <- est$Lambda
+    args$opts$Y <- est$Y
+    ## lest[[i]] <- est
+    args$opts$tol <- 1
+    z <- Matrix::rowSums(path[[i]])!=0 #1:p #
+    q <- sum(!z) #p #
+    R <- icov[[i]] #- resid[[i]]
+    loglik[[i]] <- log(Matrix::det(R[z,z])) - sum(Matrix::diag(R[z,z] %*% SigmaO[z,z])) - (p-q)
+#    args$opts$eta <- max(.5, (p-(n-i))/p)
+  }
+  list(icov=icov, path=path, resid=resid, lambda=lambda, loglik=loglik, data=data)
+}
+
+#' @useDynLib SpiecEasi
+#' @noRd
+admm2 <- function(SigmaO, lambda, beta, r, tol=1e-2, shrinkDiag=TRUE, opts) {
+  n <- nrow(SigmaO)
+  defopts <- list(mu=n, eta=75/100, muf=1e-4, maxiter=100, newtol=1e-4)
+  if (!missing(opts)) for (o in names(opts)) defopts[[ o ]] <- opts [[ o ]]
+  if (missing(beta)) beta <- 0
+  if (missing(r)) r <- 0
+  opts <- defopts
+  over_relax_par <- 1.6
+
+  if (is.null(opts[[ 'I' ]]))
+    I <- diag(n)
+  else I <- opts$I
+  if (is.null(opts[[ 'Lambda' ]]))
+    Lambda <- matrix(0, n, n*3)
+  else Lambda <- opts$Lambda
+  if (is.null(opts[[ 'Y' ]]))
+    Y <- cbind(I, I, matrix(0, n, n))
+  else Y <- opts$Y
+  ADMM(SigmaO=SigmaO, lambda=lambda, I=I, Lambda=Lambda, Y=Y, beta=beta, r=r, shrinkDiag=shrinkDiag,
+       maxiter=opts$maxiter, mu=opts$mu, eta=opts$eta, newtol=opts$newtol, muf=opts$muf)
+}
+
+#' robust PCA
+#'
+#' Form a robust PCA from clr-transformed data and [the low rank component of] an inverse covariance matrix
+#'
+#' @param X the n x p [clr-transformed] data
+#' @param L the p x p rank-r ('residual') inverse covariance matrix from a \code{spiec.easi} run with argument method='slr'.
+#' @param inverse flag to indicate the L is the inverse covariance matrix +#' @returns a named list with n x r matrix of scores and r x r matrix of loadings +#' @export +robustPCA <- function(X, L, inverse=TRUE) { + Lsvd <- svd(L) + ind <- Lsvd$d>1e-9 + if (inverse) { + loadings <- diag(sqrt(1/Lsvd$d[ind])) %*% t(Lsvd$v[,ind]) + } else { + loadings <- diag(sqrt(Lsvd$d[ind])) %*% t(Lsvd$v[,ind]) + } + + scores <- X %*% t(loadings) + return(list(scores=scores, loadings=loadings)) +} +# #' @importFrom Matrix sparseMatrix forceSymmetric +# admm <- function(SigmaO, lambda, beta, r, LPD=FALSE, eigsolve=FALSE, tol=1e-5, shrinkDiag=TRUE, opts) { +# n <- nrow(SigmaO) +# defopts <- list(mu=n, eta=99/100, muf=1e-4, maxiter=500, newtol=1e-4) +# if (!missing(opts)) for (o in names(opts)) defopts[[ o ]] <- opts [[ o ]] +# opts <- defopts +# ABSTOL <- tol +# RELTOL <- tol +# over_relax_par <- 1.6 +# Ip <- sparseDiag(n) +# Id <- diag(n) +# J <- matrix(1/n, n, n) +# R <- S <- Ip +# L <- zeros(n,n) +# RY <- Id; SY <- Id; LY <- L +# Y <- cbind(RY, SY, LY) +# Lambda1 <- zeros(ncol(R)); Lambda2 <- Lambda1; Lambda3 <- Lambda1 +# mu <- opts$mu +# eta <- opts$eta +# objval <- vector('numeric', opts$maxiter) +# r_norm <- vector('numeric', opts$maxiter) +# # s_norm <- vector('numeric', opts$maxiter) +# eps_pri <- vector('numeric', opts$maxiter) +# # eps_dual <- vector('numeric', opts$maxiter) +# for (iter in 1:opts$maxiter) { +# ## update X = (R,S,L) +# RA <- RY + mu*Lambda1 +# SA <- SY + mu*Lambda2 +# LA <- LY + mu*Lambda3 +# if (eigsolve) { +# tmp <- mu*SigmaO-RA +# tmp <- (tmp+t(tmp))/2 +# eV <- eigen(tmp) ; U <- eV$vectors ; D <- eV$values ; d = sparseDiag(D) +# eigR <- (-d + sqrt(d^2+4*mu))/2 +# R <- U%*%sparseDiag(Matrix::diag(eigR))%*%t(U) +# R <- (R+t(R))/2 +# } else { +# K <- mu*SigmaO - RA +# KI <- sqrtmNewt(as(K %*% K, 'matrix') + 4*mu*Id, Id, errTol=opts$newtol) +# R <- MATAVE2(KI, as.matrix(-K)) #.5*(KI-K) +# } +# S <- SOFTTHRESH(as(SA, 'matrix'), lambda*mu, shrinkDiag=shrinkDiag) +# # S <- forceSymmetric(S) +# # LA <- forceSymmetric(LA) +# if (LPD) { +# # if (iter>1) { +# eV <- eigen(LA) ; U <- Re(eV$vectors) ; d <- Re(eV$values) +# if (missing(beta)) +# beta <- d[r+1]/mu #*1.01 +# eigL <- sparseDiag(pmax(d-mu*beta,0)) +# L <- U %*% eigL %*% t(U) +# L <- as(forceSymmetric(L), 'sparseMatrix') +# # } else L <- LA +# } else { +# if (iter==1) { +# L <- LA +# } else { +# if (missing(beta)) { +# tmp <- softSVT(as(LA, 'matrix'), k=r) +# # beta <- tmp$tau/mu +# L <- tmp$M +# } else { +# tmp <- softSVT(as(LA, 'matrix'), tau=mu*beta) +# L <- tmp$M +# } +# } +# } +# X <- cbind(R, S, L) +# RO <- over_relax_par*R + (1-over_relax_par)*RY +# SO <- over_relax_par*S + (1-over_relax_par)*SY +# LO <- over_relax_par*L + (1-over_relax_par)*LY +# ## update Y = (RY,SY,LY) +# Y_old <- Y +# RA <- RO - mu*Lambda1 +# SA <- SO - mu*Lambda2 +# LA <- LO - mu*Lambda3 +# TA <- (RA-SA+LA)/3 +# RY <- RA - TA ; SY <- SA + TA ; LY <- LA - TA +# +# ## update Lambda +# Lambda1 <- (Lambda1 - (RO-RY)/mu); # Lambda1 = forceSymmetric(Lambda1) +# Lambda2 <- (Lambda2 - (SO-SY)/mu); #Lambda2 = forceSymmetric(Lambda2) +# Lambda3 <- (Lambda3 - (LO-LY)/mu); #Lambda3 = forceSymmetric(Lambda3) +# Lambda <- cbind(Lambda1, Lambda2, Lambda3) +# +# Y <- cbind(RY, SY, LY) +# ## diagnostics, reporting, termination checks +# # k = iter +# # objval[iter] = objective(R,SigmaO,eigR,S,eigL,lambda,beta); +# +# r_norm[iter] <- Matrix::norm(X - Y, 'F') +# # s_norm[iter] = max(svd(-(Y - Y_old)/mu, nu=0, nv=0)$d) +# eps_pri[iter] <- sqrt(3*n*n)*ABSTOL + +# 
RELTOL*max(Matrix::norm(X,'F'), Matrix::norm(Y,'F'))
#     # eps_dual[iter]= sqrt(3*n*n)*ABSTOL + RELTOL*Matrix::norm(Lambda,'F')
#     # print(r_norm[iter] - eps_pri[iter])
#     if (r_norm[iter] < eps_pri[iter]) # & s_norm[k] < eps_dual[k])
#       break
#     ## TODO: make this optional
#     mu <- max(mu*eta, opts$muf)
#   }
#
#   history <- list(objval=objval, r_norm=r_norm,eps_pri=eps_pri)
#             # eigR = eigR, eigL = eigL,
#   list(R=R, S = S, L = L, RA=RA, LA=LA, SA=SA, mu=mu, obj = objval[iter], iter = iter, history=history)
# }
#
# zeros <- function(n, p=n, sparse=TRUE) {
#   if (!sparse)
#     matrix(0, nrow=n, ncol=p)
#   else
#     Matrix::sparseMatrix(i=NULL, j=NULL, dims=c(n,p), symmetric=TRUE)
# }
#
# sparseDiag <- function(x) {
#   if (length(x) == 1 && as.integer(x) == x) {
#     M <- zeros(x, sparse=TRUE)
#     diag(M) <- 1
#   } else {
#     M <- zeros(length(x), sparse=TRUE)
#     diag(M) <- x
#   }
#   as(M, 'dsCMatrix')
# }
#
#
# .sqrtNewton <- function(C, sqrtC0=diag(ncol(C)), errTol=5e-1) {
#   # compute square root of symmetric matrix C
#   # init
#   n <- ncol(C)
#   X <- sqrtC0
#   err <- Inf
#   while (err > errTol) {
#     X_new <- .5*(X + Matrix::solve(X, C)) #.5*sqrt(mu)*Id)))
#     err <- Matrix::norm(X_new - X, 'F')
#     X <- X_new
#   }
#   X
# }
#
#
# objective <- function(R,SigmaO,eigR,S,eigL,lambda,beta) {
#   sum(sum(R*SigmaO)) - sum(log(eigR)) + lambda*sum(abs(S)) + beta*sum(eigL)
# }
#
#
# hardsvdthresh <- function(M, tau, k=ncol(M)) {
#   Msvd <- svdPow(M, min(ncol(M), k+1), k+2)
#   # Msvd <- svd(M)
#   if (k < ncol(M))
#     tau <- Msvd$d[k+1]*1.01 #min(tau, Msvd$d[k+1])
#   ind <- which(Msvd$d - tau > 0)
#   tmpd <- sparseDiag(Msvd$d[ind])
#   M <- Msvd$u[,ind,drop=FALSE] %*% tmpd %*% t(Msvd$v[,ind,drop=FALSE])
#   return(list(M=M, tau=tau, d=tmpd))
# }
#
#
# svdPow <- function(A, k, q) {
#   l <- k
#   m <- nrow(A)
#   n <- ncol(A)
#   P <- matrix(rnorm(m*l), nrow=m, ncol=l)
#   QR <- qr(t(t(P)%*%A))
#   Q <- qr.Q(QR)
#   R <- qr.R(QR)
#   # return(list(Q=Q, R=R))
#   for (j in 1:q) {
#     PR <- qr(A%*%Q)
#     P <- qr.Q(PR)
#     R <- qr.R(PR)
#     QR <- qr(t(t(P)%*%A))
#     Q <- qr.Q(QR)
#     R <- qr.R(QR)
#   }
#   Asvd <- svd(A%*%Q)
#   list(u=Asvd$u, #[,1:k,drop=FALSE],
#        d=Asvd$d, #[1:k,drop=FALSE],
#        v=(Q%*%Asvd$v), P=P, Q=Q) #[,1:k,drop=FALSE])
# }
diff --git a/R/coat.R b/R/coat.R
new file mode 100644
index 0000000..54b439b
--- /dev/null
+++ b/R/coat.R
@@ -0,0 +1,175 @@
+#' COAT
+#'
+#' Composition-adjusted thresholding (COAT) of Cao, Lin & Li (2018), doi.org/10.1080/01621459.2018.1442340
+#' @param data a clr-transformed data or covariance matrix
+#' @param lambda threshold parameter(s)
+#' @param thresh "soft" or "hard" thresholding
+#' @param adaptive use the adaptive version of lambda as in the original COAT paper. See details.
+#' @param shrinkDiag flag to exclude the covariance diagonal from the shrinkage operation
+#' @param ret.icov flag to also return the inverse covariance matrix (inverse of all thresholded COAT matrices)
+#' @param ... Arguments for automatically calculating the lambda path. See details.
+#' @details If \code{adaptive=TRUE}, and \code{data} is a covariance matrix, the adaptive penalty is calculated by assuming the underlying data is jointly Gaussian in the infinite sample setting. The results may differ from the 'empirical' adaptive setting.
+#'
+#' There are a few undocumented arguments useful for computing a lambda path on the fly:
+#' \itemize{
+#' \item{lambda.max: }{Maximum lambda.
Default: max absolute covariance}
+#' \item{lambda.min.ratio: }{lambda.min.ratio*lambda.max is the smallest lambda evaluated. Default: 1e-3}
+#' \item{nlambda: }{Number of values of lambda between lambda.max and lambda.min. Default: 10}
+#'}
+#' @export
+coat <- function(data, lambda, thresh="soft", adaptive=TRUE, shrinkDiag=TRUE, ret.icov=FALSE, ...) {
+
+  if (isSymmetric(data)) {
+    S <- data
+    adapt <- 2
+  } else {
+    S <- cov(data)
+    adapt <- 1
+  }
+  if (adaptive) {
+    theta <- switch(adapt,
+             '1'=.getThetaMat(data),
+             '2'=sqrt(S^2 + diag(S)%*%t(diag(S))) ## assume joint gaussian model
+           )
+  } else
+    theta <- 1
+
+  args <- list(...)
+  if (missing(lambda)) {
+    if (!is.null(args[[ "lambda.max" ]])) maxlam <- args$lambda.max
+    else maxlam <- pulsar::getMaxCov(abs(S)/theta)
+    if (is.null(args[[ "lambda" ]])) {
+      if (is.null(args[[ "lambda.min.ratio" ]])) args$lambda.min.ratio <- 1e-3
+      if (is.null(args[[ "nlambda" ]])) args$nlambda <- 10
+      lambda <- pulsar::getLamPath(maxlam, maxlam*args$lambda.min.ratio, args$nlambda, log=FALSE)
+      args$lambda.min.ratio <- NULL ; args$nlambda <- NULL
+    } else lambda <- args$lambda
+  }
+  args$lambda.min.ratio <- args$nlambda <- args$lambda <- args$lambda.max <- NULL
+
+
+
+  threshfun <-
+    switch(thresh,
+      soft = soft.thresh,
+      hard = hard.thresh,
+      adapt = adaptive.thresh)
+
+
+  if (length(lambda) > 1) {
+    n <- length(lambda)
+    path <- vector('list', n)
+    cov  <- vector('list', n)
+    if (ret.icov) icov <- vector('list', n)
+    Stmp <- S
+    for (i in n:1) {
+      cov[[i]] <- threshfun(Stmp, theta*lambda[i], shrinkDiag)
+      if (ret.icov) icov[[i]] <- tryCatch(solve(cov[[i]]), error=function(e) MASS::ginv(cov[[i]]))
+      path[[i]] <- spMat2Adj(cov[[i]])
+      Stmp <- path[[i]]*S
+      Matrix::diag(Stmp) <- diag(S)
+    }
+    est <- list(path=path, cov=cov)
+  } else {
+    Stmp <- threshfun(S, theta*lambda, shrinkDiag)
+    if (ret.icov) icov <- tryCatch(solve(Stmp), error=function(e) MASS::ginv(Stmp))
+    est <- list(
+         cov = Stmp,
+         path = spMat2Adj(Stmp)
+       )
+  }
+  est$lambda <- lambda
+  if (ret.icov) est$icov <- icov
+  est
+}
+
+#' @noRd
+spMat2Adj <- function(A, class="lsCMatrix") {
+  if (inherits(A, "sparseMatrix")) {
+    return(as(Matrix::forceSymmetric(Matrix::triu(A, k=1)), class))
+  } else if (inherits(A, "matrix")) {
+    return(as(Matrix::forceSymmetric(Matrix::triu(A, k=1)), 'lsyMatrix'))
+  } else
+    stop("Class is not recognized")
+}
+
+#' @noRd
+soft.thresh <- function(S, lam, shrinkDiag=FALSE) {
+  if (inherits(S, "sparseMatrix")) {
+    return(.sparseThresh(S, lam, shrinkDiag, thresh="soft"))
+  } else if (inherits(S, "matrix")) {
+    Sign <- sign(S)
+    M <- as(Sign*pmax(abs(S) - lam, 0), "sparseMatrix")
+    if (!shrinkDiag) Matrix::diag(M) <- diag(S)
+    return(M)
+  } else if (inherits(S, 'denseMatrix')) {
+    return(soft.thresh(as(S, 'matrix'), lam, shrinkDiag))
+  } else
+    stop('Class not recognized')
+}
+
+#' @noRd
+hard.thresh <- function(S, lam, shrinkDiag=FALSE) {
+  if (inherits(S, "matrix")) {
+    M <- Matrix::Matrix(S*(abs(S) >= lam), sparse=TRUE)
+    if (!shrinkDiag) diag(M) <- diag(S)
+    return(M)
+  } else if (inherits(S, "sparseMatrix")) {
+    return(.sparseThresh(S, lam, shrinkDiag, thresh="hard"))
+  } else if (inherits(S, 'denseMatrix')) {
+    return(hard.thresh(as(S, 'sparseMatrix'), lam, shrinkDiag))
+  } else
+    stop('Class not recognized')
+}
+
+#' @noRd
+adaptive.thresh <- function(S, lam, shrinkDiag=FALSE, eta=1) {
+  if (inherits(S, "matrix")) {
+    p <- ncol(S)
+    eta <- pmax(eta, 1)
+    M <- as(S*pmax(1 - abs(lam/S)^eta, 0), "sparseMatrix")
+    if (!shrinkDiag) Matrix::diag(M) <-
diag(S) + return(M) + } else if (inherits(S, "sparseMatrix")) { + return(.sparseThresh(S, lam, shrinkDiag, thresh="adapt", eta)) + } else if (inherits(S, 'denseMatrix')) { + return(adaptive.thresh(as(S, 'sparseMatrix'), lam, shrinkDiag, eta)) + } else + stop('Class not recognized') +} + +#' @noRd +.sparseThresh <- function(S, lam, shrinkDiag, thresh="soft", eta=1) { + k <- if (shrinkDiag) 0 else 1 + p <- ncol(S) + ilist <- Matrix::summary(Matrix::triu(S, k=k)) + ind <- (ilist[,2]-1)*p + ilist[,1] + if (inherits(lam, "matrix")) + Lam <- lam[ind] + else + Lam <- lam + newx <- switch(thresh, + soft = pmax(abs(ilist[,3]) - Lam, 0), + hard = S[ind]*(abs(ilist[,3]) >= Lam), + adapt= S[ind]*pmax(1 - abs(Lam/ilist[,3])^eta, 0) + ) + + nzind <- which(newx != 0) + newx <- if (thresh=='soft') sign(ilist[nzind,3])*newx[nzind] else newx[nzind] + M <- Matrix::sparseMatrix(ilist[nzind,1], ilist[nzind,2], x=newx, dims=dim(S)) + if (!shrinkDiag) Matrix::diag(M) <- Matrix::diag(S) + return(Matrix::forceSymmetric(M)) + +} + +#' @noRd +.getThetaMat <- function(X) { + n <- ncol(X) + theta <- matrix(0, n,n) + for (i in 1:n) { + for (j in i:n) { + theta[j,i] <- theta[i,j] <- sd(X[,i]*X[,j]) + } + } + theta +} diff --git a/R/community_graph.R b/R/community_graph.R index 50ad6b6..ad52ea9 100644 --- a/R/community_graph.R +++ b/R/community_graph.R @@ -57,7 +57,7 @@ graph2prec <- function(Graph, posThetaLims=c(2,3), negThetaLims=-posThetaLims, t degVec <- colSums(Graph) # add random edge weights uniformly from theta_min to theta_max - utri <- triu(Theta) + utri <- triu(Theta, k=1) nzind <- which(utri != 0) if (length(posThetaLims) > 2 || length(negThetaLims) > 2) @@ -88,7 +88,7 @@ graph2prec <- function(Graph, posThetaLims=c(2,3), negThetaLims=-posThetaLims, t return(Theta) } - +#' @noRd .binSearchCond <- function(Theta, condTheta, numBinSearch, epsBin) { # Internal function that determines the constant in the diagonal to satisfy the # condition constraint on the Precision/Covariance matrix @@ -157,8 +157,7 @@ make_graph <- function(method, D, e, enforce=TRUE, ...) 
{ return(structure(Graph, class='graph')) } - -#' @keywords internal +#' @noRd enforceE <- function(Graph, e) { nedges <- edge_count(Graph) @@ -193,8 +192,7 @@ enforceE <- function(Graph, e) { else Graph } - -#' @keywords internal +#' @noRd scale_free <- function(D, e, pfun) { # Make a scale free graph # Args: @@ -249,7 +247,7 @@ scale_free <- function(D, e, pfun) { return(Graph) } -#' @keywords internal +#' @noRd erdos_renyi <- function(D, e, p=e/(D*(D-1)/2)) { # Make a random graph: # Args: @@ -264,7 +262,7 @@ erdos_renyi <- function(D, e, p=e/(D*(D-1)/2)) { return(Graph) } -#' @keywords internal +#' @noRd hub <- function(D, e, numHubs=ceiling(D/20)) { # Make hub graph # Args: @@ -287,7 +285,7 @@ hub <- function(D, e, numHubs=ceiling(D/20)) { return(Graph) } -#' @keywords internal +#' @noRd cluster <- function(D, e, numHubs=floor((D/15)+(e/D))-1) { # Make a cluster graph (groups of random graphs) # Args: @@ -309,7 +307,7 @@ cluster <- function(D, e, numHubs=floor((D/15)+(e/D))-1) { return(Graph) } -#' @keywords internal +#' @noRd band <- function(D, e) { # Make a banded graph # Args: @@ -334,7 +332,7 @@ band <- function(D, e) { return(Graph) } -#' @keywords internal +#' @noRd block <- function(D, e, numHubs) { #blocksize=20, p=D/((D/blocksize)*(blocksize*(blocksize)/2)), u=NULL, v=NULL) { # Make precision matrix for block graph (note the difference from other functions) # Args: @@ -371,13 +369,13 @@ block <- function(D, e, numHubs) { #blocksize=20, p=D/((D/blocksize)*(blocksize } -#' @keywords internal +#' @noRd edge_count <- function(Graph) { # return number of edges in a [square matrix] graph length(which(Graph[upper.tri(Graph)] != 0)) } -#' @keywords internal +#' @noRd eigGraph <- function(G, tol=1e-12) { D <- diag(rowSums(G)) L <- D-G @@ -387,14 +385,13 @@ eigGraph <- function(G, tol=1e-12) { return(list(specGap=specGap, nCC=nCC, eigsG=eigsG)) } -#' @keywords internal graphReport <- function(Graph) { nedges <- edge_count(Graph) eigGraph <- eigGraph(Graph) return(c(list(NumEdges=nedges), eigGraph)) } -#' @keywords internal +#' @noRd covReport <- function(Cov, Prec) { condCov <- kappa(Cov) condPrec <- kappa(Prec) diff --git a/R/fitdistr.R b/R/fitdistr.R index eda16e5..f946aa3 100644 --- a/R/fitdistr.R +++ b/R/fitdistr.R @@ -69,7 +69,7 @@ fitdistr <- function (x, densfun, start, control, ...) { Call <- match.call(expand.dots = TRUE) if (missing(start)) start <- NULL - if (missing(control)) + if (missing(control)) control <- list(fnscale=1e12, factr=1e-2, maxit=10) dots <- names(list(...)) @@ -179,7 +179,7 @@ fitdistr <- function (x, densfun, start, control, ...) { } -#' @keywords internal +#' @noRd #' @importFrom VGAM dzinegbin logLikzinb <- function(param,x,...) { param <- abs(param) @@ -189,7 +189,7 @@ logLikzinb <- function(param,x,...) { (-sum(VGAM::dzinegbin(x, munb=munb, pstr0=pstr0, size=size, log=TRUE, ...))) } -#' @keywords internal +#' @noRd #' @importFrom stats dnbinom logLiknb <- function(param, x, ...) { param <- abs(param) @@ -197,7 +197,7 @@ logLiknb <- function(param, x, ...) { size <- param['size'] (-sum(dnbinom(x, mu=munb, size=size, log=TRUE, ...))) } -#' @keywords internal +#' @noRd #' @importFrom VGAM dzipois logLikzip <- function(param, x, ddistr, ...) { pstr0 <- param['pstr0'] @@ -240,7 +240,7 @@ qqdplot_comm <- function(comm, distr, param, plot=TRUE, ...) { } #' deprecated?? -#' @keywords internal +#' @noRd qqdplot <- function(y, distr, param, plot=TRUE, ...) 
{
   if ((missing(param) || is.null(param)) &&
       !(distr %in% c('lnorm', 'pois', 'negbin'))) {
diff --git a/R/mvdistributions.R b/R/mvdistributions.R
index 15141e4..173135e 100644
--- a/R/mvdistributions.R
+++ b/R/mvdistributions.R
@@ -24,12 +24,14 @@ rzipois <- function(n, lambda, pstr0 = 0) {
 }
 
 
+#' @noRd
 #' @keywords internal
 .zipois_getLam <- function(mu, S) {
   S <- max(sqrt(mu), S)
   (S^2/mu) + mu - 1
 }
 
+#' @noRd
 #' @keywords internal
 .zipois_getP <- function(mu, S) {
   S <- max(sqrt(mu), S)
@@ -64,7 +66,7 @@ rmvzipois <- function(n, mu, Sigma=diag(length(mu)), lambdas, ps, ...) {
 
   normd  <- rmvnorm(n, rep(0, d), Cor)
   unif   <- pnorm(normd)
-  data   <- matrix(VGAM::qzipois(unif, lambdas, pstr0=ps, ...), n, d)
+  data   <- t(matrix(VGAM::qzipois(t(unif), lambdas, pstr0=ps, ...), d, n))
   data <- .fixInf(data)
   return(data)
 }
@@ -88,11 +90,12 @@ rmvpois <- function(n, mu, Sigma, ...) {
   if (length(mu) == 1) stop("Need more than 1 variable")
   normd <- rmvnorm(n, rep(0, d), Cor)
   unif  <- pnorm(normd)
-  data  <- matrix(qpois(unif, mu, ...), n, d)
+  data  <- t(matrix(qpois(t(unif), mu, ...), d, n))
   data <- .fixInf(data)
   return(data)
 }
 
+#' @noRd
 #' @keywords internal
 .negbin_getK <- function(mu, S) {
   S <- max(sqrt(mu), S)
@@ -129,17 +132,20 @@ rmvnegbin <- function(n, mu, Sigma, ks, ...) {
 }
 
 
+#' @noRd
 #' @keywords internal
 .zinegbin_getLam <- function(mu, S) {
   S <- max(sqrt(mu)+1e-3, S)
   (mu + (mu^2 - mu + S^2) / mu) / 2
 }
 
+#' @noRd
 #' @keywords internal
 .zinegbin_getP <- function(mu, lam) {
   1 - (mu / lam)
 }
 
+#' @noRd
 #' @keywords internal
 .zinegbin_getK <- function(mu, S, lam) {
   S <- max(sqrt(mu)+1e-3, S)
@@ -175,7 +181,7 @@ rmvzinegbin <- function(n, mu, Sigma, munbs, ks, ps, ...) {
   d <- length(munbs)
   normd  <- rmvnorm(n, rep(0, d), Sigma=Cor)
   unif   <- pnorm(normd)
-  data   <- matrix(VGAM::qzinegbin(unif, munb=munbs, size=ks, pstr0=ps, ...), n, d)
+  data   <- t(matrix(VGAM::qzinegbin(t(unif), munb=munbs, size=ks, pstr0=ps, ...), d, n))
   data <- .fixInf(data)
   return(data)
 }
@@ -216,7 +222,7 @@ rmvnorm <- function(n=100, mu=rep(0,10), Sigma=diag(10), tol=1e-6, empirical=TRU
   ev <- eS$values
   if (!all(ev >= -tol * abs(ev[1L])))
     stop("'Sigma' is not positive definite")
-  X <- matrix(rnorm(p * n), n)
+  X <- matrix(rnorm(p * n), n, p)
   if (empirical) {
     X <- scale(X, TRUE, FALSE)
     X <- X %*% svd(X, nu = 0, nv = length(mu))$v
diff --git a/R/normalization.R b/R/normalization.R
index 92248c0..9f0d23a 100644
--- a/R/normalization.R
+++ b/R/normalization.R
@@ -21,6 +21,15 @@ norm_pseudo <- function(x) norm_to_total(x+1)
 norm_to_total <- function(x) x/sum(x)
 
 
+#' Normalize via Dirichlet sampling
+#'
+#' "Normalize" a count vector by drawing a single sample from a Dirichlet distribution, using the count vector as the prior.
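+#' For example (with a hypothetical count vector), \code{norm_rdiric(c(5, 2, 10))}
+#' draws one random composition of length 3 whose entries sum to one.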
+#' @param x count data vector
+#' @export
+norm_rdiric <- function(x) {
+  VGAM::rdiric(n=1, shape=x)[1,]
+}
+
 #' compute the shannon entropy from a vector (normalized internally)
 #'
 #' Shannon entropy is:
diff --git a/R/roc.R b/R/roc.R
index e04fd72..61ce395 100644
--- a/R/roc.R
+++ b/R/roc.R
@@ -34,7 +34,7 @@ merge2path <- function(merge, length.out) {
   lapply(path, function(i) merge > i)
 }
 
-
+#' @noRd
 huge.pr <- function (path, theta, verbose = TRUE, plot = TRUE) {
     gcinfo(verbose = FALSE)
     ROC = list()
@@ -68,17 +68,56 @@ huge.pr <- function (path, theta, verbose = TRUE, plot = TRUE) {
         message("done.")
     rm(precision, recall, tp.all, fp.all, path, theta, fn)
     gc()
-    ord.p = order(ROC$prec, na.last=NA)
-    tmp1 = ROC$prec[ord.p]
-    tmp2 = ROC$rec[ord.p]
+    ord.p = order(ROC$prec, ROC$rec, na.last=NA)
+    ROC$prec <- ROC$prec[ord.p]
+    ROC$rec  <- ROC$rec[ord.p]
+    tmp2 = c(min(c(4/(d-1), ROC$prec)), ROC$prec[ord.p], 1)
+    tmp1 = c(1, ROC$rec[ord.p], 0)
     if (plot) {
         par(mfrow = c(1, 1))
-        plot(tmp1, tmp2, type = "b", main = "PR Curve", xlab = "Precision",
-            ylab = "Recall", ylim = c(0, 1))
+        plot(tmp1, tmp2, type = "b", main = "PR Curve", xlab = "Recall",
+            ylab = "Precision", ylim = c(0, 1))
     }
-    ROC$AUC = sum(diff(tmp1) * (tmp2[-1] + tmp2[-length(tmp2)]))/2
+    tmax <- diff(range(tmp2))*diff(range(tmp1))
+    ROC$AUC = sum(diff(tmp2) * (tmp1[-1] + tmp1[-length(tmp1)]))/(2*tmax)
     rm(ord.p, tmp1, tmp2)
    gc()
     class(ROC) = "roc"
     return(ROC)
 }
+
+#' Edge set similarity
+#'
+#' Compute the similarity between the edge sets of two networks via:
+#' \enumerate{
+#'  \item maximum overlap: |x ∩ y| / max\{|x|,|y|\}
+#'  \item Jaccard index (default): |x ∩ y| / |x ∪ y|
+#' }
+#' Input networks do not have to have the same node sets.
+#' @param x pxp adjacency matrix (\code{Matrix::sparseMatrix} class)
+#' @param y other qxq adjacency matrix (\code{Matrix::sparseMatrix} class)
+#' @param metric 'jaccard' or 'max'
+#' @param otux taxa names of adjacency x
+#' @param otuy taxa names of adjacency y
+#' @export
+edge.diss <- function(x, y, metric='jaccard', otux=NULL, otuy=NULL) {
+  stopifnot(inherits(x, 'sparseMatrix'), TRUE)
+  stopifnot(inherits(y, 'sparseMatrix'), TRUE)
+  xli <- Matrix::summary(Matrix::triu(x))
+  yli <- Matrix::summary(Matrix::triu(y))
+  if (!is.null(otux)) {
+    xli[,1] <- otux[xli[,1]]
+    xli[,2] <- otux[xli[,2]]
+  }
+  if (!is.null(otuy)) {
+    yli[,1] <- otuy[yli[,1]]
+    yli[,2] <- otuy[yli[,2]]
+  }
+  xedges <- apply(xli[,1:2], 1, paste, collapse="-")
+  yedges <- apply(yli[,1:2], 1, paste, collapse="-")
+  if (metric=="jaccard") {
+    return(length(intersect(xedges, yedges)) / length(unique(c(xedges, yedges))))
+  } else {
+    return(length(intersect(xedges, yedges)) / max(length(xedges), length(yedges)))
+  }
+}
\ No newline at end of file
diff --git a/R/select.R b/R/select.R
new file mode 100644
index 0000000..844e30c
--- /dev/null
+++ b/R/select.R
@@ -0,0 +1,191 @@
+#' Extended BIC
+#'
+#' Calculate the extended BIC criterion on a sparse (refit) network and the input data
+#'
+#' @param refit adjacency matrix, e.g. from \code{getRefit} on SpiecEasi output
+#' @param data input data set used to get the network
+#' @param loglik log likelihood of the graphical model
+#' @param gamma the model likelihood/complexity tradeoff parameter
+#' @export
+ebic <- function(refit, data, loglik, gamma=.5) {
+  df <- sum(refit)/2
+  n <- nrow(data)
+  d <- ncol(data)
+  -n * loglik + log(n) * df + 4 * gamma * log(d) * df
+}
+
+## DEPRECATED ##
+# #' Model selection for picking the right \code{lambda} penalty.
+# #' This is identical to huge::huge.stars except that the subsampling loop is replaced with an mclapply function to add parallelization capabilities. +# #' +# #' @param est an estimate/model as produced by the sparseiCov function +# #' @param criterion character string specifying criterion/method for model selection accepts 'stars' [default], 'ric', 'ebic' +# #' @param stars.thresh variability threshold for stars selection +# #' @param ebic.gamma tuning parameter for ebic +# #' @param stars.subsample.ratio The default value 'is 10*sqrt(n)/n' when 'n>144' and '0.8' when 'n<=144', where 'n' is the sample size. +# #' @param rep.num number of subsamplings when \code{criterion} = stars. +# #' @param ncores number of cores to use. Need multiple processers if \code{ncores > 1} +# #' @param normfun normalize internally if data should be renormalized +# #' @importFrom parallel mclapply +# #' @export +# icov.select <- function(est, criterion = 'stars', stars.thresh = 0.05, ebic.gamma = 0.5, +# stars.subsample.ratio = NULL, rep.num = 20, ncores=1, normfun=function(x) x, verbose=FALSE) { +# gcinfo(FALSE) +# if (est$cov.input) { +# message("Model selection is not available when using the covariance matrix as input.") +# class(est) = "select" +# return(est) +# } +# if (!est$cov.input) { +# if (est$method == "mb" && is.null(criterion)) +# criterion = "stars" +# if (est$method == "ct" && is.null(criterion)) +# criterion = "ebic" +# n = nrow(est$data) +# d = ncol(est$data) +# nlambda = length(est$lambda) +# if (criterion == "ric") { +# if (verbose) { +# message("Conducting rotation information criterion (ric) selection....") +# } +# if (n > rep.num) { +# nr = rep.num +# r = sample(n, rep.num) +# } +# if (n <= rep.num) { +# nr = n +# r = 1:n +# } +# out = .C("RIC", X = as.double(est$data), dd = as.integer(d), +# nn = as.integer(n), r = as.integer(r), nr = as.integer(nr), +# lambda_opt = as.double(0), PACKAGE = "huge") +# est$opt.lambda = out$lambda_opt/n +# rm(out) +# gc() +# if (verbose) { +# message("done\n") +# } +# if (verbose) { +# message("Computing the optimal graph....") +# } +# if (est$opt.lambda > max(cor(est$data))) +# est$refit = Matrix(0, d, d) +# else { +# if (est$method == "mb") +# est$refit = huge::huge.mb(est$data, lambda = est$opt.lambda, +# sym = est$sym, idx.mat = est$idx.mat, verbose = FALSE)$path[[1]] +# if (est$method == "glasso") { +# if (!is.null(est$cov)) { +# tmp = huge::huge.glasso(est$data, lambda = est$opt.lambda, +# scr = est$scr, cov.output = TRUE, verbose = FALSE) +# est$opt.cov = tmp$cov[[1]] +# } +# if (is.null(est$cov)) +# tmp = huge::huge.glasso(est$data, lambda = est$opt.lambda, +# verbose = FALSE) +# est$refit = tmp$path[[1]] +# est$opt.icov = tmp$icov[[1]] +# rm(tmp) +# gc() +# } +# if (est$method == "ct") +# est$refit = huge::huge.ct(est$data, lambda = est$opt.lambda, +# verbose = FALSE)$path[[1]] +# } +# est$opt.sparsity = sum(est$refit)/d/(d - 1) +# if (verbose) { +# message("done") +# } +# } +# if (criterion == "ebic" && est$method == "glasso") { +# if (verbose) { +# cat("Conducting extended Bayesian information criterion (ebic) selection....") +# # flush.console() +# } +# est$ebic.score = -n * est$loglik + log(n) * est$df + 4 * ebic.gamma * log(d) * est$df +# est$opt.index = which.min(est$ebic.score) +# est$refit = est$path[[est$opt.index]] +# est$opt.icov = est$icov[[est$opt.index]] +# if (est$cov.output) +# est$opt.cov = est$cov[[est$opt.index]] +# est$opt.lambda = est$lambda[est$opt.index] +# est$opt.sparsity = est$sparsity[est$opt.index] +# if (verbose) { 
+# message("done\n") +# # flush.console() +# } +# } +# if (criterion == "stars") { +# if (is.null(stars.subsample.ratio)) { +# if (n > 144) +# stars.subsample.ratio = 10 * sqrt(n)/n +# if (n <= 144) +# stars.subsample.ratio = 0.8 +# } +# +# # for (i in 1:nlambda) merge[[i]] <- Matrix(0, d, d) +# +# if (verbose) { +# mes = "Conducting Subsampling....." +# message(mes, appendLF = FALSE) +# } +# # for (i in 1:rep.num) { +# premerge <- parallel::mclapply(1:rep.num, function(i) { +# # if (verbose) { +# # mes <- paste(c("Conducting Subsampling....in progress:", +# # floor(100 * i/rep.num), "%"), collapse = "") +# # cat(mes, "\r") +# # flush.console() +# # } +# # merge <- replicate(nlambda, Matrix(0, d,d)) +# ind.sample = sample(c(1:n), floor(n * stars.subsample.ratio), +# replace = FALSE) +# if (est$method == "mb") +# tmp = huge::huge.mb(normfun(est$data[ind.sample, ]), lambda = est$lambda, +# scr = est$scr, idx.mat = est$idx.mat, sym = est$sym, +# verbose = FALSE)$path +# if (est$method == "ct") +# tmp = huge::huge.ct(normfun(est$data[ind.sample, ]), lambda = est$lambda, +# verbose = FALSE)$path +# if (est$method == "glasso") +# tmp = huge::huge.glasso(normfun(est$data[ind.sample, ]), lambda = est$lambda, +# scr = est$scr, verbose = FALSE)$path +# # for (j in 1:nlambda) merge[[j]] <- merge[[j]] + tmp[[j]] +# +# rm(ind.sample) +# gc() +# return(tmp) +# }, mc.cores=ncores) +# # } +# # merge <- lapply(merge, as.matrix) +# # merge <- lapply(merge, simplify2array) +# # est$merge <- lapply(1:dim(merge)[3], function(i) merge[,,i]/rep.num) +# +# merge <- Reduce(function(l1, l2) lapply(1:length(l1), +# function(i) l1[[i]] + l2[[i]]), premerge, accumulate=FALSE) +# +# if (verbose) { +# message("done") +# } +# est$variability = rep(0, nlambda) +# est$merge <- vector('list', nlambda) +# for (i in 1:nlambda) { +# est$merge[[i]] <- merge[[i]]/rep.num +# est$variability[i] <- 4 * sum(est$merge[[i]] * (1 - est$merge[[i]]))/(d * (d - 1)) +# } +# est$opt.index = max(which.max(est$variability >= +# stars.thresh)[1] - 1, 1) +# est$refit = est$path[[est$opt.index]] +# est$opt.lambda = est$lambda[est$opt.index] +# est$opt.sparsity = est$sparsity[est$opt.index] +# if (est$method == "glasso") { +# est$opt.icov = est$icov[[est$opt.index]] +# if (!is.null(est$cov)) +# est$opt.cov = est$cov[[est$opt.index]] +# } +# } +# est$criterion = criterion +# class(est) = "select" +# return(est) +# } +# } diff --git a/R/spaRcc.R b/R/spaRcc.R index ed39a01..e473cfe 100644 --- a/R/spaRcc.R +++ b/R/spaRcc.R @@ -90,7 +90,7 @@ pval.sparccboot <- function(x, sided='both') { -#' @keywords internal +#' @noRd sparccinner <- function(data.f, T=NULL, iter=10, th=0.1) { if (is.null(T)) T <- av(data.f) res.bv <- basis_var(T) @@ -119,7 +119,7 @@ sparccinner <- function(data.f, T=NULL, iter=10, th=0.1) { list(Cov=Cov, Cor=Cor, i=i, M=M, excluded=excluded) } -#' @keywords internal +#' @noRd exclude_pairs <- function(Cor, M, th=0.1, excluded=NULL) { # exclude pairs with high correlations break_flag <- FALSE @@ -137,7 +137,7 @@ exclude_pairs <- function(Cor, M, th=0.1, excluded=NULL) { list(M=M, excluded=excluded_new, break_flag=break_flag) } -#' @keywords internal +#' @noRd basis_cov <- function(data.f) { # data.f -> relative abundance data # OTUs in columns, samples in rows (yes, I know this is transpose of normal) @@ -152,7 +152,7 @@ basis_cov <- function(data.f) { list(Cov=Cov, M=M) } -#' @keywords internal +#' @noRd basis_var <- function(T, CovMat = matrix(0, nrow(T), ncol(T)), M = matrix(1, nrow(T), ncol(T)) + (diag(ncol(T))*(ncol(T)-2)), 
                      excluded = NULL, Vmin=1e-4) {
@@ -169,7 +169,6 @@ basis_var <- function(T, CovMat = matrix(0, nrow(T), ncol(T)),
   list(Vbase=Vbase, M=M)
 }
 
-#' @keywords internal
 C_from_V <- function(T, Vbase) {
   J <- matrix(1, nrow(T), ncol(T))
   Vdiag <- diag(c(Vbase))
@@ -182,7 +181,7 @@ C_from_V <- function(T, Vbase) {
   list(Cov=CovMat, Cor=CorMat)
 }
 
-#' @keywords internal
+#' @noRd
 av <- function(data) {
   cov.clr <- cov(clr(data))
   J <- matrix(1, ncol(data), ncol(data))
@@ -191,7 +190,7 @@ av <- function(data) {
 
 
 #' @importFrom VGAM rdiric
-#' @keywords internal
+#' @noRd
 norm_diric <- function(x, rep=1) {
   dmat <- VGAM::rdiric(rep, x+1)
   norm_to_total(colMeans(dmat))
diff --git a/R/spiec-easi-pkg.R b/R/spiec-easi-pkg.R
index d3dd70f..f41debb 100644
--- a/R/spiec-easi-pkg.R
+++ b/R/spiec-easi-pkg.R
@@ -1,6 +1,6 @@
 #' @importFrom graphics abline legend par plot
 #' @importFrom methods as
-#' @importFrom stats cor cov cov2cor lm median na.exclude na.omit optim pnorm ppoints qlnorm rmultinom rnorm rpois sd var 
+#' @importFrom stats cor cov cov2cor lm median na.exclude na.omit optim pnorm ppoints qlnorm rmultinom rnorm rpois sd var
 #' @keywords internal
 "_PACKAGE"
 
@@ -39,8 +39,10 @@ NULL
 #' @source https://www.hmpdacc.org/ihmp/
 NULL
 
+#' @name hmp216S
 #' @rdname hmp2
-"hmp216S"
+NULL
 
+#' @name hmp2prot
 #' @rdname hmp2
-"hmp2prot"
+NULL
diff --git a/R/spiec-easi.R b/R/spiec-easi.R
index 92a5d7d..be2bac7 100644
--- a/R/spiec-easi.R
+++ b/R/spiec-easi.R
@@ -3,12 +3,11 @@
 #' Run the whole SPIEC-EASI pipeline, from data transformation, sparse inverse covariance estimation and model selection.
 #' Inputs are a non-normalized OTU table and pipeline options.
 #' @export
-#' @importFrom pulsar pulsar batch.pulsar
+#' @importFrom pulsar pulsar batch.pulsar getMaxCov getLamPath
 spiec.easi <- function(data, ...) {
   UseMethod('spiec.easi', data)
 }
 
-#' @keywords internal
 .phy2mat <- function(OTU) {
   if (inherits(OTU, 'phyloseq'))
     OTU <- OTU@otu_table
@@ -19,7 +18,6 @@ spiec.easi <- function(data, ...) {
   return(OTU)
 }
 
-#' @keywords internal
 .data.checks <- function(data) {
   ## data checks ##
   if (inherits(data, 'list')) {
@@ -58,8 +56,7 @@ spiec.easi.otu_table <- function(data, ...) {
 }
 
 
-
-#' @keywords internal
+#' @noRd
 .spiec.easi.norm <- function(data) {
   # internal function to normalize a data matrix
   if (inherits(data, 'matrix')) {
@@ -98,7 +95,7 @@ spiec.easi.otu_table <- function(data, ...) {
 #' @seealso \code{\link[pulsar]{pulsar}} \code{\link[pulsar]{batch.pulsar}} \code{\link{spiec.easi}}
 NULL
 
-#' @keywords internal
+#' @noRd
 .check_pulsar_params <- function(fun, args=list()) {
   if (!inherits(args, 'list') || (length(args) >0 && is.null(names(args))) || any('' %in% names(args))) {
     stop('pulsar.params must be a named list')
@@ -135,6 +132,7 @@ NULL
 #' @param pulsar.params list of further arguments to \code{\link{pulsar}} or \code{\link{batch.pulsar}}. See the documentation for \code{\link{pulsar.params}}.
 #' @param icov.select deprecated.
 #' @param icov.select.params deprecated.
+#' @param lambda.log should values of lambda be distributed logarithmically (\code{TRUE}) or linearly (\code{FALSE}) between \code{lambda.min} and \code{lambda.max}?
 #' @param ... further arguments to \code{\link{sparseiCov}} / \code{huge}
 #' @method spiec.easi default
 #' @rdname spiec.easi
@@ -143,12 +141,10 @@ NULL
 spiec.easi.default <- function(data, method='glasso', sel.criterion='stars',
                                verbose=TRUE, pulsar.select=TRUE, pulsar.params=list(),
                                icov.select=pulsar.select,
-                               icov.select.params=pulsar.params, ...)
{ + icov.select.params=pulsar.params, + lambda.log=TRUE, ...) { args <- list(...) - - .data.checks(data) - if (verbose) msg <- .makeMessage("Applying data transformations...") else msg <- .makeMessage('') @@ -159,7 +155,7 @@ spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', args$method <- method X <- .spiec.easi.norm(data) if (is.null(args[['lambda.max']])) - args$lambda.max <- pulsar::getMaxCov(cor(X)) + args$lambda.max <- getMaxCov(cor(X)) }, mb = { @@ -168,16 +164,71 @@ spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', args$method <- method X <- .spiec.easi.norm(data) if (is.null(args[['lambda.max']])) - args$lambda.max <- pulsar::getMaxCov(cor(X)) + args$lambda.max <- getMaxCov(cor(X)) + }, + + slr = { + # if (!require('irlba')) + # stop('irlba package required') + if (length(args$r) > 1) { #TODO: add beta vector option + tmp <- lapply(args$r, function(r) { + if (verbose) + message(sprintf("SPIEC-EASI SLR, with rank r=%s", r)) + args$r <- r + args2 <- c(list(data=data, method='slr', + sel.criterion=sel.criterion, verbose=verbose, + pulsar.params=pulsar.params, + pulsar.select=pulsar.select), args) + do.call(spiec.easi, args2) + }) + names(tmp) <- paste0("rank", args$r) + return(tmp) + } + message(msg, appendLF=verbose) + estFun <- "sparseLowRankiCov" + X <- .spiec.easi.norm(data) + if (is.null(args[['lambda.max']])) + args$lambda.max <- getMaxCov(cov(X)) }, + coat = { + message(msg, appendLF=verbose) + estFun <- "coat" + X <- .spiec.easi.norm(data) + if (is.null(args[['lambda.max']])) + args$lambda.max <- getMaxCov(X) + }, + + ising = { + if (inherits(data, 'list')) + stop('method "ising" does not support list data') + + message(msg, appendLF=verbose) + estFun <- "neighborhood.net" + args$method <- method + X <- sign(data) ; + if (is.null(args[['lambda.max']])) + args$lambda.max <- max(abs(t(scale(X)) %*% X)) / nrow(X) + }, + + poisson= { + if (inherits(data, 'list')) + stop('method "poisson" does not support list data') + + message(msg, appendLF=verbose) + estFun <- "neighborhood.net" + args$method <- method + X <- data ; + if (is.null(args[['lambda.max']])) + args$lambda.max <- max(abs(t(scale(X)) %*% X)) / nrow(X) + } ) if (is.null(args[[ "lambda" ]])) { if (is.null(args[[ "lambda.min.ratio" ]])) args$lambda.min.ratio <- 1e-3 if (is.null(args[[ "nlambda" ]])) args$nlambda <- 20 - args$lambda <- pulsar::getLamPath(args$lambda.max, args$lambda.max*args$lambda.min.ratio, - args$nlambda, log=TRUE) + args$lambda <- getLamPath(args$lambda.max, args$lambda.max*args$lambda.min.ratio, + args$nlambda, log=lambda.log) args$lambda.min.ratio <- args$nlambda <- args$lambda.max <- NULL } @@ -218,10 +269,8 @@ spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', pulsar.params$lb.stars <- pulsar.params$ub.stars <- TRUE if (is.null(pulsar.params[[ "thresh" ]])) pulsar.params$thresh <- 0.05 - obj <- list(call=call) - class(obj) <- 'pulsar' call <- do.call('update', - c(pulsar.params, list(object=obj, evaluate=FALSE))) + c(pulsar.params, list(object=list(call=call), evaluate=FALSE))) if (verbose) message(sprintf("Selecting model with %s using ", fun), sel.criterion, "...") @@ -239,15 +288,6 @@ spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', ) if (pulsar.select) { fit$select <- est -# list( -# merge = est$stars$merge, -# summary = est$stars$summary, -# opt.index = opt.index, -# lb.index = est$stars$lb.index, -# ub.index = est$stars$ub.index -# ) -# fit$refit <- if (sel.criterion=="gstars") 
fit$refit$gcd else fit$refit$stars
-#    attr(fit$refit, 'names') <- sel.criterion
   }
   fit$lambda <- args$lambda
   fit$fun <- call(estFun)[[1]]
diff --git a/R/utilities.R b/R/utilities.R
index 36bddfc..4bd5609 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -18,11 +18,11 @@
 #'  \item getOptBeta: the optimal coefficient matrix (mb only)
 #'}
 #' @export
-getOptInd <- function(est) getOptX(est, 'index')
+getOptInd    <- function(est) getOptX(est, 'index')
 
 #' @rdname getOptInd
 #' @export
-getOptLambda <- function(est) getOptX(est, 'lambda')
+getOptLambda <- function(est) getOptX(est, 'lambda')
 
 #' @rdname getOptInd
 #' @export
@@ -30,34 +30,35 @@ getOptMerge <- function(est) getOptX(est, 'merge')
 
 #' @rdname getOptInd
 #' @export
-getStability <- function(est) getOptX(est, 'stars')
+getStability <- function(est) getOptX(est, 'stars')
 
 #' @rdname getOptInd
 #' @export
-getOptNet <- function(est) getOptX(est, 'refit')
+getOptNet    <- function(est) getOptX(est, 'refit')
 
 #' @rdname getOptInd
 #' @export
-getRefit <- function(est) getOptNet(est)
+getRefit     <- function(est) getOptNet(est)
 
 #' @rdname getOptInd
 #' @export
-getOptBeta <- function(est) getOptX(est, 'beta')
+getOptBeta   <- function(est) getOptX(est, 'beta')
 
 #' @rdname getOptInd
 #' @export
-getOptCov <- function(est) getOptX(est, 'cov')
+getOptCov    <- function(est) getOptX(est, 'cov')
 
 #' @rdname getOptInd
 #' @export
-getOptiCov <- function(est) getOptX(est, 'icov')
+getOptiCov   <- function(est) getOptX(est, 'icov')
 
-#' @keywords internal
-getOptX <- function(est, ...) {
-  UseMethod('getOptX', est)
+#' @noRd
+getOptX <- function(est, getter) {
+  UseMethod('getOptX')
 }
 
-#' @keywords internal
+#' @exportS3Method
+#' @noRd
 getOptX.pulsar.refit <- function(est, getter='index') {
   if (length(est$refit) > 0) {
     if (names(est$refit) %in% "stars") {
@@ -103,7 +104,6 @@ getOptX.pulsar.refit <- function(est, getter='index') {
 }
 
 
-
 #' sym beta
 #'
 #' Symmetrize a beta (coefficient) matrix, ie. selected from MB neighborhood selection
@@ -146,3 +146,46 @@ symBeta <- function(beta, mode='ave') {
     stop ("mode not recognized")
   as(symbeta, 'symmetricMatrix')
 }
+
+#' Functions for triangular matrices
+#'
+#' Get or symmetrize the upper/lower triangle of a symmetric matrix with the other side zeroed out
+#' @param x the data matrix or vector
+#' @param k 0/1 flag indicating whether the diagonal should be included
+#' @param diagval value to be added to the diagonal if converting from upper triangular matrix.
+#' @export
+triu <- function(x, k=1) x[upper.tri(x, !k)]
+#' @export
+#' @rdname triu
+tril <- function(x, k=1) x[lower.tri(x, !k)]
+
+#' @rdname triu
+#' @export
+triu2diag <- function(x, diagval=0) {
+  stopifnot(is.null(dim(x)), TRUE)
+  e <- length(x)
+  n <- .5 * (sqrt(8*e + 1)+1)
+  mat <- matrix(0, n, n)
+  mat[upper.tri(mat)] <- x
+  mat <- mat + t(mat)
+  diag(mat) <- diagval
+  mat
+}
+
+
+#' @exportS3Method '[[' Matrix
+#' @noRd
+'[[.Matrix' <- function(x, i, exact=TRUE) {
+  if (exact) name <- attr(x, 'names')
+  else name <- substr(attr(x, 'names'), 1, nchar(i))
+  if (name == i)
+    return(x)
+  else return(NULL)
+}
+
+
+#' @exportS3Method '$' Matrix
+#' @noRd
+'$.Matrix' <- function(x, name) {
+  x[[ name, exact=FALSE]]
+}
diff --git a/README.Rmd b/README.Rmd
index 56624e1..d1e9181 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -29,6 +29,7 @@ One small point on notation: we refer to the method as "SPIEC-EASI" and reserve
 2. [Basic Usage](#basic-usage)
 3. [American Gut Data](#analysis-of-american-gut-data)
 4. [Using phyloseq](#working-with-phyloseq)
+5. [Learning latent variable graphical models](#learning-latent-variable-graphical-models)
 5. [Cross Domain SPIEC-EASI](#cross-domain-interactions)
 6. [pulsar & batch options](#pulsar-parallel-utilities-for-model-selection)
 7. [Troubleshooting](#troubleshooting)
@@ -184,9 +185,71 @@ ig2.mb <- adj2igraph(getRefit(se.mb.amgut2), vertex.attr=list(name=taxa_names(a
 plot_network(ig2.mb, amgut2.filt.phy, type='taxa', color="Rank3")
 ```
 
+## Learning latent variable graphical models ##
+It can be shown that unobserved latent variables introduce
+artifacts into empirical estimates of OTU-OTU associations. These effects can be
+removed from the network by treating the inverse covariance selection problem as
+a sparse + low-rank decomposition (SPIEC-EASI slr), where the sparse component holds the
+associations, encoded as a conditional independence graph, and the low-rank component
+captures the isolated latent effects.
+
+Please see the [preprint](https://www.biorxiv.org/content/10.1101/2019.12.21.885889v1.full)
+and the manuscript [Synapse project](https://www.synapse.org/#!Synapse:syn20843558)
+or [github repository](https://github.com/zdk123/SpiecEasiSLR_manuscript) for more details.
+
+To demonstrate this in action, we show that removing latent effects improves
+a consistency measure between round 1 and round 2 of the American Gut project data.
+
+First, we fit the networks, assuming that there are 10 latent components in the dataset:
+```{r, eval=FALSE}
+se1.mb.amgut <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-2,
+                           nlambda=20, pulsar.params=list(rep.num=20, ncores=4))
+se2.mb.amgut <- spiec.easi(amgut2.filt.phy, method='mb', lambda.min.ratio=1e-2,
+                           nlambda=20, pulsar.params=list(rep.num=20, ncores=4))
+
+
+se1.slr.amgut <- spiec.easi(amgut1.filt, method='slr', r=10, lambda.min.ratio=1e-2,
+                            nlambda=20, pulsar.params=list(rep.num=20, ncores=4))
+se2.slr.amgut <- spiec.easi(amgut2.filt.phy, method='slr', r=10, lambda.min.ratio=1e-2,
+                            nlambda=20, pulsar.params=list(rep.num=20, ncores=4))
+```
+
+Then we compare the consistency between the edge sets within each method using
+the Jaccard index.
+```{r, eval=FALSE}
+otu1 <- colnames(amgut1.filt)
+otu2 <- taxa_names(amgut2.filt.phy)
+edge.diss(getRefit(se1.mb.amgut), getRefit(se2.mb.amgut), 'jaccard', otu1, otu2)
+edge.diss(getRefit(se1.slr.amgut), getRefit(se2.slr.amgut), 'jaccard', otu1, otu2)
+```
+Consistency should be a bit better for the slr networks.
+
+
+Construct the robust PCA from the amgut2 data:
+```{r, eval=FALSE}
+X <- se2.slr.amgut$est$data
+L <- se2.slr.amgut$est$resid[[getOptInd(se2.slr.amgut)]]
+pca <- robustPCA(X, L)
+```
+
+We can also check the correlation between AGP meta-data and the latent factors
+(scores of the robust PCA).
+```{r, eval=FALSE}
+age <- as.numeric(as.character(sample_data(amgut2.filt.phy)$AGE))
+bmi <- as.numeric(as.character(sample_data(amgut2.filt.phy)$BMI))
+depth <- colSums(otu_table(amgut2.filt.phy))
+
+cor(age, pca$scores, use='pairwise')
+cor(bmi, pca$scores, use='pairwise')
+cor(depth, pca$scores, use='pairwise')
+
+```
+
 ## Cross domain interactions ##
 
-SpiecEasi now includes a convenience wrapper for dealing with multiple taxa sequenced on the same samples, such as 16S and ITS, as seen in [Tipton, Müller, et. al. (2018)](https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0393-0). It assumes that each taxa is in it's own data matrix and that all samples are in all data matrices in the same order.
+SpiecEasi now includes a convenience wrapper for dealing with multiple taxa sequenced on the same samples,
+such as 16S and ITS, as seen in [Tipton, Müller, et al. (2018)](https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0393-0).
+It assumes that each taxon is in its own data matrix and that all samples are in all data matrices in the same order.
 
 Here's an example run from the [HMP2 project](https://ibdmdb.org/tunnel/public/summary.html) with 16S and Proteomics data.
@@ -249,7 +312,7 @@ t2[3] > t4[3]
 
 Pulsar gives us the option of running stability selection in batch mode, using the [batchtools](https://mllg.github.io/batchtools/) package. This will be useful to anyone with access to an hpc/distributing computing system. Each subsample will be independently executed using a system-specific cluster function. This requires an external config file which will instruct the batchtools registry how to construct the cluster function which will execute the individual jobs. `batch.pulsar` has some built in config files that are useful for testing purposes (serial mode, "parallel", "snow", etc), but it is advisable to create your own config file and pass in the absolute path. See the [batchtools docs](https://mllg.github.io/batchtools/articles/batchtools.html#configuration-file) for instructions on how to construct config file and template files (i.e. to interact with a queueing system such as TORQUE or SGE).
 
-```{r, run=FALSE}
+```{r, eval=FALSE}
 ## bargs <- list(rep.num=50, seed=10010, conffile="path/to/conf.txt")
 bargs <- list(rep.num=50, seed=10010, conffile="parallel")
 
@@ -262,6 +325,7 @@ se5 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30,
               sel.criterion='stars', pulsar.select='batch', pulsar.params=bargs)
 ```
+
 
 ## Troubleshooting ##
 
 A common issue that comes up with when running `spiec.easi` is coming up with an empty network after running StARS.
diff --git a/README.md b/README.md
index c992057..c8f288b 100644
--- a/README.md
+++ b/README.md
@@ -1,334 +1,437 @@
-
 SpiecEasi
 =========
-[![Build Status](https://travis-ci.org/zdk123/SpiecEasi.svg?branch=pulsar)](https://travis-ci.org/zdk123/SpiecEasi)
-
-Sparse InversE Covariance estimation for Ecological Association and Statistical Inference
-
-This package will be useful to anybody who wants to infer graphical models for all sorts of compositional data, though primarily intended for microbiome relative abundance data (generated from 16S amplicon sequence data). It also includes a generator for [overdispersed, zero inflated] multivariate, correlated count data. Please see the paper published in [PLoS Comp Bio](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004226).
-
-One small point on notation: we refer to the method as "SPIEC-EASI" and reserve "SpiecEasi" for this package.
-
-## TOC ##
-0. [Installation](#installation)
-1. [News](#news)
-2. [Basic Usage](#basic-usage)
-3. [American Gut Data](#analysis-of-american-gut-data)
-4. [Using phyloseq](#working-with-phyloseq)
-5. [Cross Domain SPIEC-EASI](#cross-domain-interactions)
-6. [pulsar & batch options](#pulsar-parallel-utilities-for-model-selection)
-7. [Troubleshooting](#troubleshooting)
-
-## Installation ##
-
-## Installation ##
-I assume that all auxiliary packages are already installed - for example huge, MASS, etc. If you get an unexpected error, you may need to download and install a missing dependency.
+[![Build +Status](https://travis-ci.org/zdk123/SpiecEasi.svg?branch=pulsar)](https://travis-ci.org/zdk123/SpiecEasi) + +Sparse InversE Covariance estimation for Ecological Association and +Statistical Inference + +This package will be useful to anybody who wants to infer graphical +models for all sorts of compositional data, though primarily intended +for microbiome relative abundance data (generated from 16S amplicon +sequence data). It also includes a generator for \[overdispersed, zero +inflated\] multivariate, correlated count data. Please see the paper +published in [PLoS Comp +Bio](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004226). + +One small point on notation: we refer to the method as “SPIEC-EASI” and +reserve “SpiecEasi” for this package. + +TOC +--- + +1. [Installation](#installation) +2. [News](#news) +3. [Basic Usage](#basic-usage) +4. [American Gut Data](#analysis-of-american-gut-data) +5. [Using phyloseq](#working-with-phyloseq) +6. [Learning latent variable graphical + models](#learning-latent-variable-graphical-models) +7. [Cross Domain SPIEC-EASI](#cross-domain-interactions) +8. [pulsar & batch + options](#pulsar-parallel-utilities-for-model-selection) +9. [Troubleshooting](#troubleshooting) + +Installation +------------ + +I assume that all auxiliary packages are already installed - for example +pulsar, huge, MASS, etc. If you get an unexpected error, you may need to +download and install a missing dependency. From an interactive R session: + library(devtools) + install_github("zdk123/SpiecEasi") + library(SpiecEasi) -```r -library(devtools) -install_github("zdk123/SpiecEasi") -library(SpiecEasi) -``` +News +---- +The latest SpiecEasi (version 1.0.0 and higher) now uses the [pulsar +package](https://cran.r-project.org/package=pulsar) for stability-based +model selection. The methods are similar to previous releases, but +contains some additional methods for [speeding up +computations](#pulsar-parallel-utilities-for-model-selection) -## News ## +The input arguments have changed slightly (but are backwards compatible) +but the data structure that is returned from `spiec.easi` has changed. -The latest SpiecEasi (version 1.0.0 and higher) now uses the [pulsar package](https://cran.r-project.org/package=pulsar) for stability-based model selection. The methods are similar to previous releases, but contains some additional methods for [speeding up computations](#pulsar-parallel-utilities-for-model-selection) +The output to spiec.easi-fit models structure can be easily processed +using new getter functions. See `?getOptInd` for usage. -The input arguments have changed slightly (but are backwards compatible) but the data structure that is returned from `spiec.easi` has changed. +You can revert to the previous release +([0.1.4](https://github.com/zdk123/SpiecEasi/releases/tag/v0.1.4)) to +avoid code-breaking changes. -The output to spiec.easi-fit models structure can be easily processed using new getter functions. See `?getOptInd` for usage. +Basic Usage +----------- -You can revert to the previous release ([0.1.4](https://github.com/zdk123/SpiecEasi/releases/tag/v0.1.4)) to avoid code-breaking changes. +Lets simulate some multivariate data under zero-inflated negative +binomial model, based on (high depth/count) round 1 of the American gut +project, with a sparse network. 
The basic steps are -## Basic Usage ## - -Lets simulate some multivariate data under zero-inflated negative binomial model, based on (high depth/count) round 1 of the American gut project, with a sparse network. The basic steps are - -1. load the data and normalize counts to to common scale (min depth) -2. fit count margins to the model -3. generate a synthetic network -4. generate some synthetic data -5. clr transformation -6. inverse covariance estimation along a lambda (sparsity) path -7. stability selection using the StARS criterion -8. evaluate performance +1. load the data and normalize counts to to common scale (min depth) +2. fit count margins to the model +3. generate a synthetic network +4. generate some synthetic data +5. clr transformation +6. inverse covariance estimation along a lambda (sparsity) path +7. stability selection using the StARS criterion +8. evaluate performance Obviously, for real data, skip 1-4. + data(amgut1.filt) + depths <- rowSums(amgut1.filt) + amgut1.filt.n <- t(apply(amgut1.filt, 1, norm_to_total)) + amgut1.filt.cs <- round(amgut1.filt.n * min(depths)) -```r -data(amgut1.filt) -depths <- rowSums(amgut1.filt) -amgut1.filt.n <- t(apply(amgut1.filt, 1, norm_to_total)) -amgut1.filt.cs <- round(amgut1.filt.n * min(depths)) + d <- ncol(amgut1.filt.cs) + n <- nrow(amgut1.filt.cs) + e <- d -d <- ncol(amgut1.filt.cs) -n <- nrow(amgut1.filt.cs) -e <- d -``` Synthesize the data -```r -set.seed(10010) -graph <- make_graph('cluster', d, e) -Prec <- graph2prec(graph) -Cor <- cov2cor(prec2cov(Prec)) - -X <- synth_comm_from_counts(amgut1.filt.cs, mar=2, distr='zinegbin', Sigma=Cor, n=n) -``` - -the main SPIEC-EASI pipeline: Data transformation, sparse inverse covariance estimation and model selection - -```r -se <- spiec.easi(X, method='mb', lambda.min.ratio=1e-2, nlambda=15) -# Applying data transformations... -# Selecting model with pulsar using stars... -# Fitting final estimate with mb... -# done -``` - -examine ROC over lambda path and PR over the stars index for the selected graph - -```r -huge::huge.roc(se$est$path, graph, verbose=FALSE) -stars.pr(getOptMerge(se), graph, verbose=FALSE) -# stars selected final network under: se.est$refit$stars -``` - -The above example does not cover all possible options and parameters. For example, other generative network models are available, the lambda.min.ratio (the scaling factor that determines the minimum sparsity/lambda parameter) shown here might not be right for your dataset, and its possible that you'll want more repetitions (number of subsamples) for StARS. - - -## Analysis of American Gut data ## - - -Now let's apply SpiecEasi directly to the American Gut data. Don't forget that the normalization is performed internally in the `spiec.easi` function. Also, we should use a larger number of stars repetitions for real data. We can pass in arguments to the inner stars selection function as a list via the parameter `pulsar.params`. If you have more than one processor available, you can also supply a number to `ncores`. Also, let's compare results from the MB and glasso methods as well as SparCC (correlation). 
- - -```r -se.mb.amgut <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-2, - nlambda=20, pulsar.params=list(rep.num=50)) -se.gl.amgut <- spiec.easi(amgut1.filt, method='glasso', lambda.min.ratio=1e-2, - nlambda=20, pulsar.params=list(rep.num=50)) -sparcc.amgut <- sparcc(amgut1.filt) -## Define arbitrary threshold for SparCC correlation matrix for the graph -sparcc.graph <- abs(sparcc.amgut$Cor) >= 0.3 -diag(sparcc.graph) <- 0 -sparcc.graph <- Matrix(sparcc.graph, sparse=TRUE) -## Create igraph objects -ig.mb <- adj2igraph(getRefit(se.mb.amgut)) -ig.gl <- adj2igraph(getRefit(se.gl.amgut)) -ig.sparcc <- adj2igraph(sparcc.graph) -``` + set.seed(10010) + graph <- make_graph('cluster', d, e) + Prec <- graph2prec(graph) + Cor <- cov2cor(prec2cov(Prec)) + + X <- synth_comm_from_counts(amgut1.filt.cs, mar=2, distr='zinegbin', Sigma=Cor, n=n) + +the main SPIEC-EASI pipeline: Data transformation, sparse inverse +covariance estimation and model selection + + se <- spiec.easi(X, method='mb', lambda.min.ratio=1e-2, nlambda=15) + # Applying data transformations... + # Selecting model with pulsar using stars... + # Fitting final estimate with mb... + # done + +examine ROC over lambda path and PR over the stars index for the +selected graph + + huge::huge.roc(se$est$path, graph, verbose=FALSE) + stars.pr(getOptMerge(se), graph, verbose=FALSE) + # stars selected final network under: getRefit(se) + +The above example does not cover all possible options and parameters. +For example, other generative network models are available, the +lambda.min.ratio (the scaling factor that determines the minimum +sparsity/lambda parameter) shown here might not be right for your +dataset, and its possible that you’ll want more repetitions (number of +subsamples) for StARS. + +Analysis of American Gut data +----------------------------- + +Now let’s apply SpiecEasi directly to the American Gut data. Don’t +forget that the normalization is performed internally in the +`spiec.easi` function. Also, we should use a larger number of stars +repetitions for real data. We can pass in arguments to the inner stars +selection function as a list via the parameter `pulsar.params`. If you +have more than one processor available, you can also supply a number to +`ncores`. Also, let’s compare results from the MB and glasso methods as +well as SparCC (correlation). 
+ + se.mb.amgut <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=50)) + se.gl.amgut <- spiec.easi(amgut1.filt, method='glasso', lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=50)) + sparcc.amgut <- sparcc(amgut1.filt) + ## Define arbitrary threshold for SparCC correlation matrix for the graph + sparcc.graph <- abs(sparcc.amgut$Cor) >= 0.3 + diag(sparcc.graph) <- 0 + library(Matrix) + sparcc.graph <- Matrix(sparcc.graph, sparse=TRUE) + ## Create igraph objects + ig.mb <- adj2igraph(getRefit(se.mb.amgut)) + ig.gl <- adj2igraph(getRefit(se.gl.amgut)) + ig.sparcc <- adj2igraph(sparcc.graph) Visualize using igraph plotting: -```r -library(igraph) -## set size of vertex proportional to clr-mean -vsize <- rowMeans(clr(amgut1.filt, 1))+6 -am.coord <- layout.fruchterman.reingold(ig.mb) - -par(mfrow=c(1,3)) -plot(ig.mb, layout=am.coord, vertex.size=vsize, vertex.label=NA, main="MB") -plot(ig.gl, layout=am.coord, vertex.size=vsize, vertex.label=NA, main="glasso") -plot(ig.sparcc, layout=am.coord, vertex.size=vsize, vertex.label=NA, main="sparcc") -``` - -![plot of chunk unnamed-chunk-7](https://i.imgur.com/SUUK3RV.png) - -We can evaluate the weights on edges networks using the terms from the underlying model. SparCC correlations can be used directly, while SpiecEasi networks need to be massaged a bit. Note that since SPIEC-EASI is based on penalized estimators, the edge weights are not directly comparable to SparCC (or Pearson/Spearman correlation coefficients) - - -```r -library(Matrix) -secor <- cov2cor(getOptCov(se.gl.amgut)) -sebeta <- symBeta(getOptBeta(se.mb.amgut), mode='maxabs') -elist.gl <- summary(triu(secor*getRefit(se.gl.amgut), k=1)) -elist.mb <- summary(sebeta) -elist.sparcc <- summary(sparcc.graph*sparcc.amgut$Cor) - -hist(elist.sparcc[,3], main='', xlab='edge weights') -hist(elist.mb[,3], add=TRUE, col='forestgreen') -hist(elist.gl[,3], add=TRUE, col='red') -``` + library(igraph) + ## set size of vertex proportional to clr-mean + vsize <- rowMeans(clr(amgut1.filt, 1))+6 + am.coord <- layout.fruchterman.reingold(ig.mb) + + par(mfrow=c(1,3)) + plot(ig.mb, layout=am.coord, vertex.size=vsize, vertex.label=NA, main="MB") + plot(ig.gl, layout=am.coord, vertex.size=vsize, vertex.label=NA, main="glasso") + plot(ig.sparcc, layout=am.coord, vertex.size=vsize, vertex.label=NA, main="sparcc") -![plot of chunk unnamed-chunk-8](https://i.imgur.com/Wx2u0il.png) - -Lets look at the degree statistics from the networks inferred by each method. +![](https://i.imgur.com/husP6HN.png) +We can evaluate the weights on edges networks using the terms from the +underlying model. SparCC correlations can be used directly, while +SpiecEasi networks need to be massaged a bit. 
Note that since SPIEC-EASI +is based on penalized estimators, the edge weights are not directly +comparable to SparCC (or Pearson/Spearman correlation coefficients) -```r -dd.gl <- degree.distribution(ig.gl) -dd.mb <- degree.distribution(ig.mb) -dd.sparcc <- degree.distribution(ig.sparcc) + library(Matrix) + secor <- cov2cor(getOptCov(se.gl.amgut)) + sebeta <- symBeta(getOptBeta(se.mb.amgut), mode='maxabs') + elist.gl <- summary(triu(secor*getRefit(se.gl.amgut), k=1)) + elist.mb <- summary(sebeta) + elist.sparcc <- summary(sparcc.graph*sparcc.amgut$Cor) -plot(0:(length(dd.sparcc)-1), dd.sparcc, ylim=c(0,.35), type='b', - ylab="Frequency", xlab="Degree", main="Degree Distributions") -points(0:(length(dd.gl)-1), dd.gl, col="red" , type='b') -points(0:(length(dd.mb)-1), dd.mb, col="forestgreen", type='b') -legend("topright", c("MB", "glasso", "sparcc"), - col=c("forestgreen", "red", "black"), pch=1, lty=1) -``` + hist(elist.sparcc[,3], main='', xlab='edge weights') + hist(elist.mb[,3], add=TRUE, col='forestgreen') + hist(elist.gl[,3], add=TRUE, col='red') -![plot of chunk unnamed-chunk-9](https://i.imgur.com/jFXxt9x.png) +![](https://i.imgur.com/xQb3LyF.png) +Lets look at the degree statistics from the networks inferred by each +method. -## Working with phyloseq ## + dd.gl <- degree.distribution(ig.gl) + dd.mb <- degree.distribution(ig.mb) + dd.sparcc <- degree.distribution(ig.sparcc) -SpiecEasi includes some convience wrappers to work directly with `phyloseq` objects. - -```r -library(phyloseq) -## Load round 2 of American gut project -data('amgut2.filt.phy') -se.mb.amgut2 <- spiec.easi(amgut2.filt.phy, method='mb', lambda.min.ratio=1e-2, - nlambda=20, pulsar.params=list(rep.num=50)) -ig2.mb <- adj2igraph(getRefit(se.mb.amgut2), vertex.attr=list(name=taxa_names(amgut2.filt.phy))) -plot_network(ig2.mb, amgut2.filt.phy, type='taxa', color="Rank3") -``` + plot(0:(length(dd.sparcc)-1), dd.sparcc, ylim=c(0,.35), type='b', + ylab="Frequency", xlab="Degree", main="Degree Distributions") + points(0:(length(dd.gl)-1), dd.gl, col="red" , type='b') + points(0:(length(dd.mb)-1), dd.mb, col="forestgreen", type='b') + legend("topright", c("MB", "glasso", "sparcc"), + col=c("forestgreen", "red", "black"), pch=1, lty=1) -![plot of chunk unnamed-chunk-10](https://i.imgur.com/U5YLDyB.png) +![](https://i.imgur.com/mheniLP.png) + +Working with phyloseq +--------------------- -## Cross domain interactions ## +SpiecEasi includes some convience wrappers to work directly with +`phyloseq` objects. -SpiecEasi now includes a convenience wrapper for dealing with multiple taxa sequenced on the same samples, such as 16S and ITS, as seen in [Tipton, Müller, et. al. (2018)](https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0393-0). It assumes that each taxa is in it's own data matrix and that all samples are in all data matrices in the same order. - -Here's an example run from the [HMP2 project](https://ibdmdb.org/tunnel/public/summary.html) with 16S and Proteomics data. 
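+A toy sketch of that input contract (the objects here are made-up
+stand-ins, not package data; the point is only that the row order must
+agree across matrices before calling `spiec.easi` on a list):
+
+    ## two toy data sets measured on the same 5 samples
+    x16s <- matrix(rpois(5*4, 10), nrow=5,
+                   dimnames=list(paste0("s", 1:5), paste0("otu", 1:4)))
+    xits <- matrix(rpois(5*3, 10), nrow=5,
+                   dimnames=list(paste0("s", 1:5), paste0("its", 1:3)))
+    stopifnot(identical(rownames(x16s), rownames(xits)))
+    ## se.toy <- spiec.easi(list(x16s, xits), method='mb')  # as in the HMP2 run below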
- - -```r -library(phyloseq) -data(hmp2) -se.hmp2 <- spiec.easi(list(hmp216S, hmp2prot), method='mb', nlambda=40, - lambda.min.ratio=1e-2, pulsar.params = list(thresh = 0.05)) - -dtype <- c(rep(1,ntaxa(hmp216S)), rep(2,ntaxa(hmp2prot))) -plot(adj2igraph(getRefit(se.hmp2)), vertex.color=dtype+1, vertex.size=9) -``` - -![plot of chunk unnamed-chunk-11](https://i.imgur.com/2DiRAgO.png) - - -## pulsar: parallel utilities for model selection ## -SpiecEasi is now using the [pulsar package](https://cran.r-project.org/package=pulsar) as the backend for performing model selection. In the default parameter setting, this uses the same [StARS](https://arxiv.org/abs/1006.3316) procedure as previous versions. -As in the previous version of SpiecEasi, we can supply the `ncores` argument to the pulsar.params list to break up the subsampled computations into parallel tasks. -In this example, we set the random seed to make consistent comparison across experiments. - -```r -## Default settings ## -pargs1 <- list(rep.num=50, seed=10010) -t1 <- system.time( -se1 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30, - sel.criterion='stars', pulsar.select=TRUE, pulsar.params=pargs1) -) -## Parallel multicore ## -pargs2 <- list(rep.num=50, seed=10010, ncores=4) -t2 <- system.time( -se2 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30, - sel.criterion='stars', pulsar.select=TRUE, pulsar.params=pargs2) -) -``` - -We can further speed up StARS using the [bounded-StARS](https://arxiv.org/abs/1605.07072) ('bstars') method. The B-StARS approach computes network stability across the whole lambda path, but only for the first 2 subsamples. This is used to build an initial estimate of the summary statistic, which in turn gives us a lower/upper bound on the optimal lambda. The remaining subsamples are used to compute the stability over the restricted path. Since denser networks are more computational expensive to compute, this can significantly reduce computational time for datasets with many variables. - -```r -t3 <- system.time( -se3 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30, - sel.criterion='bstars', pulsar.select=TRUE, pulsar.params=pargs1) -) -t4 <- system.time( -se4 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30, - sel.criterion='bstars', pulsar.select=TRUE, pulsar.params=pargs2) -) -``` - -We can see that in addition to the computational savings, the refit networks are identical. - -```r -## serial vs parallel -identical(getRefit(se1), getRefit(se2)) -t1[3] > t2[3] -## stars vs bstars -identical(getRefit(se1), getRefit(se3)) -t1[3] > t3[3] -identical(getRefit(se2), getRefit(se4)) -t2[3] > t4[3] -``` - -### Batch Mode ### - -Pulsar gives us the option of running stability selection in batch mode, using the [batchtools](https://mllg.github.io/batchtools/) package. This will be useful to anyone with access to an hpc/distributing computing system. Each subsample will be independently executed using a system-specific cluster function. - -This requires an external config file which will instruct the batchtools registry how to construct the cluster function which will execute the individual jobs. `batch.pulsar` has some built in config files that are useful for testing purposes (serial mode, "parallel", "snow", etc), but it is advisable to create your own config file and pass in the absolute path. 
See the [batchtools docs](https://mllg.github.io/batchtools/articles/batchtools.html#configuration-file) for instructions on how to construct config file and template files (i.e. to interact with a queueing system such as TORQUE or SGE). - -```r - -## bargs <- list(rep.num=50, seed=10010, conffile="path/to/conf.txt") -bargs <- list(rep.num=50, seed=10010, conffile="parallel") -## See the config file stores: -pulsar::findConfFile('parallel') -# [1] "/usr/local/lib/R/3.4/site-library/pulsar/config/batchtools.conf.parallel.R" - -## uncomment line below to turn off batchtools reporting -# options(batchtools.verbose=FALSE) -se5 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30, - sel.criterion='stars', pulsar.select='batch', pulsar.params=bargs) -# Applying data transformations... -# Selecting model with batch.pulsar using stars... -# Fitting final estimate with mb... -# done -``` - -## Troubleshooting ## - -A common issue that comes up with when running `spiec.easi` is coming up with an empty network after running StARS. + library(phyloseq) + ## Load round 2 of American gut project + data('amgut2.filt.phy') + se.mb.amgut2 <- spiec.easi(amgut2.filt.phy, method='mb', lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=50)) + ig2.mb <- adj2igraph(getRefit(se.mb.amgut2), vertex.attr=list(name=taxa_names(amgut2.filt.phy))) + plot_network(ig2.mb, amgut2.filt.phy, type='taxa', color="Rank3") + +![](https://i.imgur.com/EuIv2Q6.png) + +Learning latent variable graphical models +----------------------------------------- + +It can be shown that unobserved, latent variables introduce artifacts +into empirical estimates of OTU-OTU associations. These effects can be +removed from the network by treating the inverse covariance selection +problem as a sparse + low-rank decomposition (SPIEC-EASI slr), where the +sparse component are the associations encoded as a conditional +independence graph, and the low-rank components are the isolated latent +effects. + +Please see the +[preprint](https://www.biorxiv.org/content/10.1101/2019.12.21.885889v1.full) +and the manuscript [Synapse +project](https://www.synapse.org/#!Synapse:syn20843558) or [github +repository](https://github.com/zdk123/SpiecEasiSLR_manuscript) for more +details. + +To demonstrate this in action we can show that removing latent effects +improves a consistency measure between round 1 and round 2 of the +American Gut project data. + +First we fit the networks, assuming that there are 10 latent components +in the dataset: + + se1.mb.amgut <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=20, ncores=4)) + se2.mb.amgut <- spiec.easi(amgut2.filt.phy, method='mb', lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=20, ncores=4)) + + + se1.slr.amgut <- spiec.easi(amgut1.filt, method='slr', r=10, lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=20, ncores=4)) + se2.slr.amgut <- spiec.easi(amgut2.filt.phy, method='slr', r=10, lambda.min.ratio=1e-2, + nlambda=20, pulsar.params=list(rep.num=20, ncores=4)) + +Then we compare the consistency between the edge sets within each method +using the Jaccard index. + + otu1 <- colnames(amgut1.filt) + otu2 <- taxa_names(amgut2.filt.phy) + edge.diss(getRefit(se1.mb.amgut), getRefit(se2.mb.amgut), 'jaccard', otu1, otu2) + edge.diss(getRefit(se1.slr.amgut), getRefit(se2.slr.amgut), 'jaccard', otu1, otu2) + +Consistency should be a bit better for the slr networks. 
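+For intuition about what `edge.diss` computes, here is a minimal base-R
+sketch of the Jaccard index between two edge sets (a toy stand-in only:
+`edge.diss` works on sparse adjacency matrices and, via the taxa names,
+handles node sets that do not fully overlap; one minus this quantity is
+a natural dissimilarity):
+
+    ## Jaccard index of the upper-triangle edge sets of two 0/1 matrices
+    jaccard.edges <- function(A, B) {
+      ea <- which(A != 0 & upper.tri(A))
+      eb <- which(B != 0 & upper.tri(B))
+      length(intersect(ea, eb)) / length(union(ea, eb))
+    }
+    set.seed(10010)
+    A <- matrix(rbinom(25, 1, 0.3), 5, 5)
+    A <- pmax(A, t(A)); diag(A) <- 0           # symmetric, hollow adjacency
+    B <- A; B[2, 4] <- B[4, 2] <- 1 - B[2, 4]  # flip a single edge
+    jaccard.edges(A, B)                        # 1 would mean identical edge sets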
+
+Construct the robust PCA from the amgut2 data:
+
+    X <- se2.slr.amgut$est$data
+    L <- se2.slr.amgut$est$resid[[getOptInd(se2.slr.amgut)]]
+    pca <- robustPCA(X, L)
+
+We can also check the correlation between AGP meta-data and the latent
+factors (scores of the robust PCA).
+
+    age <- as.numeric(as.character(sample_data(amgut2.filt.phy)$AGE))
+    bmi <- as.numeric(as.character(sample_data(amgut2.filt.phy)$BMI))
+    depth <- colSums(otu_table(amgut2.filt.phy))
+
+    cor(age, pca$scores, use='pairwise')
+    cor(bmi, pca$scores, use='pairwise')
+    cor(depth, pca$scores, use='pairwise')
+
+Cross domain interactions
+-------------------------
+
+SpiecEasi now includes a convenience wrapper for dealing with multiple
+taxa sequenced on the same samples, such as 16S and ITS, as seen in
+[Tipton, Müller, et
+al. (2018)](https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0393-0).
+It assumes that each taxa is in its own data matrix and that all
+samples are in all data matrices in the same order.
+
+Here’s an example run from the [HMP2
+project](https://ibdmdb.org/tunnel/public/summary.html) with 16S and
+Proteomics data.
+
+    library(phyloseq)
+    data(hmp2)
+    se.hmp2 <- spiec.easi(list(hmp216S, hmp2prot), method='mb', nlambda=40,
+                  lambda.min.ratio=1e-2, pulsar.params = list(thresh = 0.05))
+
+    dtype <- c(rep(1,ntaxa(hmp216S)), rep(2,ntaxa(hmp2prot)))
+    plot(adj2igraph(getRefit(se.hmp2)), vertex.color=dtype+1, vertex.size=9)
+
+![](https://i.imgur.com/86OwfM2.png)
+
+pulsar: parallel utilities for model selection
+----------------------------------------------
+
+SpiecEasi is now using the [pulsar
+package](https://cran.r-project.org/package=pulsar) as the backend for
+performing model selection. In the default parameter setting, this uses
+the same [StARS](https://arxiv.org/abs/1006.3316) procedure as previous
+versions. As in the previous version of SpiecEasi, we can supply the
+`ncores` argument to the pulsar.params list to break up the subsampled
+computations into parallel tasks. In this example, we set the random
+seed to make consistent comparisons across experiments.
+
+    ## Default settings ##
+    pargs1 <- list(rep.num=50, seed=10010)
+    t1 <- system.time(
+    se1 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30,
+              sel.criterion='stars', pulsar.select=TRUE, pulsar.params=pargs1)
+    )
+    ## Parallel multicore ##
+    pargs2 <- list(rep.num=50, seed=10010, ncores=4)
+    t2 <- system.time(
+    se2 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30,
+              sel.criterion='stars', pulsar.select=TRUE, pulsar.params=pargs2)
+    )
+
+We can further speed up StARS using the
+[bounded-StARS](https://arxiv.org/abs/1605.07072) (‘bstars’) method. The
+B-StARS approach computes network stability across the whole lambda
+path, but only for the first 2 subsamples. This is used to build an
+initial estimate of the summary statistic, which in turn gives us a
+lower/upper bound on the optimal lambda. The remaining subsamples are
+used to compute the stability over the restricted path. Since denser
+networks are more computationally expensive to compute, this can
+significantly reduce computational time for datasets with many
+variables.
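+As a brief aside before the timing comparison below: the lambda path
+that StARS/B-StARS searches is controlled by `nlambda` and
+`lambda.min.ratio`. A minimal sketch of a comparable log-spaced path,
+using pulsar's helpers `getMaxCov` and `getLamPath` (the clr +
+correlation step here is only a stand-in for the internal transform, so
+the exact values may differ from what `spiec.easi` builds):
+
+    library(pulsar)
+    ## upper bound on lambda: max absolute off-diagonal correlation
+    lmax <- getMaxCov(cor(t(clr(amgut1.filt, 1))))
+    lams <- getLamPath(lmax, lmax*1e-3, len=30, log=TRUE)
+    range(lams)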
+
+    t3 <- system.time(
+    se3 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30,
+              sel.criterion='bstars', pulsar.select=TRUE, pulsar.params=pargs1)
+    )
+    t4 <- system.time(
+    se4 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30,
+              sel.criterion='bstars', pulsar.select=TRUE, pulsar.params=pargs2)
+    )
+
+We can see that in addition to the computational savings, the refit
+networks are identical.
+
+    ## serial vs parallel
+    identical(getRefit(se1), getRefit(se2))
+    t1[3] > t2[3]
+    ## stars vs bstars
+    identical(getRefit(se1), getRefit(se3))
+    t1[3] > t3[3]
+    identical(getRefit(se2), getRefit(se4))
+    t2[3] > t4[3]
+
+### Batch Mode
+
+Pulsar gives us the option of running stability selection in batch mode,
+using the [batchtools](https://mllg.github.io/batchtools/) package. This
+will be useful to anyone with access to an HPC/distributed computing
+system. Each subsample will be independently executed using a
+system-specific cluster function.
+
+This requires an external config file which will instruct the batchtools
+registry how to construct the cluster function which will execute the
+individual jobs. `batch.pulsar` has some built-in config files that are
+useful for testing purposes (serial mode, “parallel”, “snow”, etc), but
+it is advisable to create your own config file and pass in the absolute
+path. See the [batchtools
+docs](https://mllg.github.io/batchtools/articles/batchtools.html#configuration-file)
+for instructions on how to construct config and template files
+(i.e. to interact with a queueing system such as TORQUE or SGE).
+
+
+    ## bargs <- list(rep.num=50, seed=10010, conffile="path/to/conf.txt")
+    bargs <- list(rep.num=50, seed=10010, conffile="parallel")
+    ## See where the config file is stored:
+    pulsar::findConfFile('parallel')
+
+    ## uncomment line below to turn off batchtools reporting
+    # options(batchtools.verbose=FALSE)
+    se5 <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-3, nlambda=30,
+               sel.criterion='stars', pulsar.select='batch', pulsar.params=bargs)
+
+Troubleshooting
+---------------
+
+A common issue that comes up when running `spiec.easi` is ending up
+with an empty network after running StARS.
+
+For example:
+
+    pargs <- list(seed=10010)
+    se <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=5e-1, nlambda=10, pulsar.params=pargs)
+    # Warning in pulsar(data = X, fun = match.fun(estFun), fargs = args, seed =
+    # 10010, : Optimal lambda may be smaller than the supplied values
+    getOptInd(se)
+    # [1] 1
+    sum(getRefit(se))/2
+    # [1] 0
+
+As the warning indicates, the network stability could not be determined
+from the lambda path. Looking at the stability along the lambda path,
+`se$select$stars$summary`, we can see that the maximum value of the
+StARS summary statistic never crosses the default threshold (0.05).
+
+We can fix this problem by lowering `lambda.min.ratio` to explore denser
+networks.
+
+    se <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-1, nlambda=10, pulsar.params=pargs)
+
+We have now fit a network, but since we have only a rough, discrete
+sampling of networks along the lambda path, we should check how far we
+are from the target stability threshold (0.05).
+
+    getStability(se)
+    # [1] 0.034032
+    sum(getRefit(se))/2
+    # [1] 158
+
+To get closer to the mark, we should bump up `nlambda` to sample the
+lambda path more finely, which gives a denser network.
- - -```r -se <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-1, nlambda=100, pulsar.params=pargs) -getStability(se) -# [1] 0.04798275 -sum(getRefit(se))/2 -# [1] 206 -``` + pargs <- list(seed=10010) + se <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=5e-1, nlambda=10, pulsar.params=pargs) + # Warning in pulsar(data = X, fun = match.fun(estFun), fargs = args, seed = + # 10010, : Optimal lambda may be smaller than the supplied values + getOptInd(se) + # [1] 1 + sum(getRefit(se))/2 + # [1] 0 + +As the warning indicates, the network stability could not be determined +from the lambda path. Looking at the stability along the lambda path, +`se$select$stars$summary`, we can see that the maximum value of the +StARS summary statistic never crosses the default threshold (0.05). + +This problem we can fix by lowering `lambda.min.ratio` to explore denser +networks. + + se <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-1, nlambda=10, pulsar.params=pargs) + +We have now fit a network, but since we have only a rough, discrete +sampling of networks along the lambda path, we should check how far we +are from the target stability threshold (0.05). + + getStability(se) + # [1] 0.034032 + sum(getRefit(se))/2 + # [1] 158 + +To get closer to the mark, we should bump up `nlambda` to more finely +sample of the lambda path, which gives a denser network. + + se <- spiec.easi(amgut1.filt, method='mb', lambda.min.ratio=1e-1, nlambda=100, pulsar.params=pargs) + getStability(se) + # [1] 0.04946882 + sum(getRefit(se))/2 + # [1] 210 diff --git a/man/AGP.Rd b/man/AGP.Rd index 9fbf53f..86a27fa 100644 --- a/man/AGP.Rd +++ b/man/AGP.Rd @@ -5,10 +5,12 @@ \alias{amgut1.filt} \alias{amgut2.filt.phy} \title{American Gut Project} -\format{\enumerate{ +\format{ +\enumerate{ \item amgut1.filt: A matrix with 289 samples (rows) and 127 OTUs (cols). \item amgut2.filt.phy: A phyloseq object -}} +} +} \source{ http://humanfoodproject.com/americangut/ } diff --git a/man/adj2igraph.Rd b/man/adj2igraph.Rd index dc95892..07eaa64 100644 --- a/man/adj2igraph.Rd +++ b/man/adj2igraph.Rd @@ -4,8 +4,13 @@ \alias{adj2igraph} \title{Adjacency to igraph} \usage{ -adj2igraph(Adj, rmEmptyNodes = FALSE, diag = FALSE, - edge.attr = list(), vertex.attr = list(name = 1:ncol(Adj))) +adj2igraph( + Adj, + rmEmptyNodes = FALSE, + diag = FALSE, + edge.attr = list(), + vertex.attr = list(name = 1:ncol(Adj)) +) } \arguments{ \item{Adj}{an Adjacency matrix} diff --git a/man/alr.Rd b/man/alr.Rd index a0054b5..2183f66 100644 --- a/man/alr.Rd +++ b/man/alr.Rd @@ -9,11 +9,24 @@ \usage{ alr(x.f, ...) -\method{alr}{default}(x.f, divcomp = 1, base = exp(1), - removeDivComp = TRUE, tol = .Machine$double.eps, ...) - -\method{alr}{matrix}(x.f, mar = 2, divcomp = 1, base = exp(1), - removeDivComp = TRUE, tol = .Machine$double.eps, ...) +\method{alr}{default}( + x.f, + divcomp = 1, + base = exp(1), + removeDivComp = TRUE, + tol = .Machine$double.eps, + ... +) + +\method{alr}{matrix}( + x.f, + mar = 2, + divcomp = 1, + base = exp(1), + removeDivComp = TRUE, + tol = .Machine$double.eps, + ... +) \method{alr}{data.frame}(x.f, mar = 2, ...) } diff --git a/man/clr.Rd b/man/clr.Rd index 2bb7ae3..13e5e1b 100644 --- a/man/clr.Rd +++ b/man/clr.Rd @@ -9,8 +9,7 @@ \usage{ clr(x.f, ...) -\method{clr}{default}(x.f, base = exp(1), tol = .Machine$double.eps, - ...) +\method{clr}{default}(x.f, base = exp(1), tol = .Machine$double.eps, ...) \method{clr}{matrix}(x.f, mar = 2, ...) 
diff --git a/man/coat.Rd b/man/coat.Rd
new file mode 100644
index 0000000..a67886d
--- /dev/null
+++ b/man/coat.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/coat.R
+\name{coat}
+\alias{coat}
+\title{COAT}
+\usage{
+coat(
+  data,
+  lambda,
+  thresh = "soft",
+  adaptive = TRUE,
+  shrinkDiag = TRUE,
+  ret.icov = FALSE,
+  ...
+)
+}
+\arguments{
+\item{data}{clr-transformed data or a covariance matrix}
+
+\item{lambda}{threshold parameter(s)}
+
+\item{thresh}{"soft" or "hard" thresholding}
+
+\item{adaptive}{use the adaptive version of lambda as in the original COAT paper. See details.}
+
+\item{shrinkDiag}{flag to exclude the covariance diagonal from the shrinkage operation}
+
+\item{ret.icov}{flag to also return the inverse covariance matrix (inverse of all thresholded COAT matrices)}
+
+\item{...}{Arguments for automatically calculating the lambda path. See details.}
+}
+\description{
+Composition-adjusted thresholding (COAT) by Cao, Lin & Li (2018), doi.org/10.1080/01621459.2018.1442340
+}
+\details{
+If \code{adaptive=TRUE}, and \code{data} is a covariance matrix, the adaptive penalty is calculated by assuming the underlying data is jointly Gaussian in the infinite sample setting. The results may differ from the 'empirical' adaptive setting.
+
+There are a few undocumented arguments useful for computing a lambda path on the fly:
+\itemize{
+  \item{lambda.max: }{Maximum lambda. Default: max absolute covariance}
+  \item{lambda.min.ratio: }{lambda.min = lambda.min.ratio*lambda.max is the smallest lambda evaluated. Default: 1e-3}
+  \item{nlambda: }{Number of values of lambda between lambda.max and lambda.min. Default: 30}
+}
+}
diff --git a/man/ebic.Rd b/man/ebic.Rd
new file mode 100644
index 0000000..1709371
--- /dev/null
+++ b/man/ebic.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/select.R
+\name{ebic}
+\alias{ebic}
+\title{Extended BIC}
+\usage{
+ebic(refit, data, loglik, gamma = 0.5)
+}
+\arguments{
+\item{refit}{adjacency matrix, e.g. the getRefit/getOpt output from SpiecEasi}
+
+\item{data}{input data set used to get the network}
+
+\item{loglik}{log likelihood of the graphical model}
+
+\item{gamma}{the model likelihood/complexity tradeoff parameter}
+}
+\description{
+Calculate the extended BIC criterion on a sparse (refit) network and the input data
+}
diff --git a/man/edge.diss.Rd b/man/edge.diss.Rd
new file mode 100644
index 0000000..39fcfd1
--- /dev/null
+++ b/man/edge.diss.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/roc.R
+\name{edge.diss}
+\alias{edge.diss}
+\title{Edge set dissimilarity}
+\usage{
+edge.diss(x, y, metric = "jaccard", otux = NULL, otuy = NULL)
+}
+\arguments{
+\item{x}{pxp adjacency matrix (\code{Matrix::sparseMatrix} class)}
+
+\item{y}{other qxq adjacency matrix (\code{Matrix::sparseMatrix} class)}
+
+\item{metric}{'jaccard' or 'max'}
+
+\item{otux}{taxa names of adjacency x}
+
+\item{otuy}{taxa names of adjacency y}
+}
+\description{
+Compute the dissimilarity between the edge sets of two networks via:
+\enumerate{
+  \item maximum overlap: |x ∩ y| / max\{|x|,|y|\}
+  \item jaccard index (default): |x ∩ y| / |x U y|
+}
+Input networks do not have to have the same node sets.
+} diff --git a/man/graph2prec.Rd b/man/graph2prec.Rd index b99bcc1..2237823 100644 --- a/man/graph2prec.Rd +++ b/man/graph2prec.Rd @@ -4,8 +4,14 @@ \alias{graph2prec} \title{Convert a symmetric graph (extension of R matrix class)} \usage{ -graph2prec(Graph, posThetaLims = c(2, 3), negThetaLims = -posThetaLims, - targetCondition = 100, epsBin = 0.01, numBinSearch = 100) +graph2prec( + Graph, + posThetaLims = c(2, 3), + negThetaLims = -posThetaLims, + targetCondition = 100, + epsBin = 0.01, + numBinSearch = 100 +) } \arguments{ \item{Graph}{graph adjacency matrix} diff --git a/man/hmp2.Rd b/man/hmp2.Rd index 847b0a8..7c37988 100644 --- a/man/hmp2.Rd +++ b/man/hmp2.Rd @@ -6,21 +6,18 @@ \alias{hmp216S} \alias{hmp2prot} \title{Human Microbiome Project 2} -\format{\enumerate{ +\format{ +\enumerate{ \item hmp216S: 16S data, phyloseq object 45 taxa and 47 samples. \item hmp2prot: protein data, A phyloseq object, 43 'taxa' and 47 samples. - }} + } +} \source{ https://www.hmpdacc.org/ihmp/ } \usage{ data(hmp2) - -hmp216S - -hmp2prot } \description{ Pre-filtered data from the intregrated human microbiome project. } -\keyword{datasets} diff --git a/man/multi.spiec.easi.Rd b/man/multi.spiec.easi.Rd index cbc47ee..7b61bc3 100644 --- a/man/multi.spiec.easi.Rd +++ b/man/multi.spiec.easi.Rd @@ -5,8 +5,15 @@ \alias{spiec.easi.list} \title{multi domain SPIEC-EASI} \usage{ -multi.spiec.easi(datalist, method = "glasso", sel.criterion = "stars", - verbose = TRUE, pulsar.select = TRUE, pulsar.params = list(), ...) +multi.spiec.easi( + datalist, + method = "glasso", + sel.criterion = "stars", + verbose = TRUE, + pulsar.select = TRUE, + pulsar.params = list(), + ... +) \method{spiec.easi}{list}(data, ...) } diff --git a/man/neighborhood.net.Rd b/man/neighborhood.net.Rd new file mode 100644 index 0000000..fd9e02d --- /dev/null +++ b/man/neighborhood.net.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SparseICov.R +\name{neighborhood.net} +\alias{neighborhood.net} +\title{Neighborhood net estimates} +\usage{ +neighborhood.net(data, lambda, method = "ising", ncores = 1, sym = "or", ...) +} +\arguments{ +\item{data}{n x p input (pre-transformed) data} + +\item{lambda}{the lambda path} + +\item{method}{ising and poisson models currently supported.} + +\item{ncores}{number of cores for distributing the model fitting} + +\item{sym}{symmetrize the neighborhood using the 'or' (default)/'and' rule} + +\item{...}{further arguments to glmnet} +} +\description{ +Select a sparse inverse covariance matrix using neighborhood selection and glmnet from various exponential models. +} diff --git a/man/norm_rdiric.Rd b/man/norm_rdiric.Rd new file mode 100644 index 0000000..bc3da0f --- /dev/null +++ b/man/norm_rdiric.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalization.R +\name{norm_rdiric} +\alias{norm_rdiric} +\title{Normalize via dirichlet sampling} +\usage{ +norm_rdiric(x) +} +\arguments{ +\item{x}{count data vector} +} +\description{ +"Normalize" a count vector by drawing a single sample from a Dirichlet distribution, using the count vector as the prior. +} diff --git a/man/qqdplot.Rd b/man/qqdplot.Rd deleted file mode 100644 index 6d73058..0000000 --- a/man/qqdplot.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fitdistr.R -\name{qqdplot} -\alias{qqdplot} -\title{deprecated??} -\usage{ -qqdplot(y, distr, param, plot = TRUE, ...) 
-} -\description{ -deprecated?? -} -\keyword{internal} diff --git a/man/rmvnorm.Rd b/man/rmvnorm.Rd index 99bc556..40acd92 100644 --- a/man/rmvnorm.Rd +++ b/man/rmvnorm.Rd @@ -5,8 +5,13 @@ \title{Draw samples from multivariate, correlated normal distribution with counts correlated according to Sigma} \usage{ -rmvnorm(n = 100, mu = rep(0, 10), Sigma = diag(10), tol = 1e-06, - empirical = TRUE) +rmvnorm( + n = 100, + mu = rep(0, 10), + Sigma = diag(10), + tol = 1e-06, + empirical = TRUE +) } \arguments{ \item{n}{number of samples to draw} diff --git a/man/robustPCA.Rd b/man/robustPCA.Rd new file mode 100644 index 0000000..bc240e9 --- /dev/null +++ b/man/robustPCA.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SparseLowRankICov.R +\name{robustPCA} +\alias{robustPCA} +\title{robust PCA} +\usage{ +robustPCA(X, L, inverse = TRUE) +} +\arguments{ +\item{X}{the n x p [clr-transformed] data} + +\item{L}{the p x p rank-r ('residual') inverse covariance matrix from \code{spiec.easi} run argument method='slr'.} + +\item{inverse}{flag to indicate the L is the inverse covariance matrix} +} +\value{ +a named list with n x r matrix of scores and r x r matrix of loadings +} +\description{ +Form a robust PCA from clr-transformed data and [the low rank component of] an inverse covariance matrix +} diff --git a/man/sparccboot.Rd b/man/sparccboot.Rd index 5181a4e..b9ea811 100644 --- a/man/sparccboot.Rd +++ b/man/sparccboot.Rd @@ -4,11 +4,17 @@ \alias{sparccboot} \title{Bootstrap SparCC} \usage{ -sparccboot(data, sparcc.params = list(), statisticboot = function(data, - indices) triu(do.call("sparcc", c(list(data[indices, , drop = FALSE]), - sparcc.params))$Cor), statisticperm = function(data, indices) - triu(do.call("sparcc", c(list(apply(data[indices, ], 2, sample)), - sparcc.params))$Cor), R, ncpus = 1, ...) +sparccboot( + data, + sparcc.params = list(), + statisticboot = function(data, indices) triu(do.call("sparcc", c(list(data[indices, , + drop = FALSE]), sparcc.params))$Cor), + statisticperm = function(data, indices) triu(do.call("sparcc", + c(list(apply(data[indices, ], 2, sample)), sparcc.params))$Cor), + R, + ncpus = 1, + ... +) } \arguments{ \item{data}{Community count data} diff --git a/man/sparseLowRankiCov.Rd b/man/sparseLowRankiCov.Rd new file mode 100644 index 0000000..b0045a1 --- /dev/null +++ b/man/sparseLowRankiCov.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SparseLowRankICov.R +\name{sparseLowRankiCov} +\alias{sparseLowRankiCov} +\title{Sparse plus Low Rank inverse covariance} +\usage{ +sparseLowRankiCov(data, npn = FALSE, verbose = FALSE, cor = FALSE, ...) +} +\arguments{ +\item{data}{the n x p data matrix} + +\item{npn}{flag to first fit nonparametric normal transform to the data} + +\item{verbose}{flag to turn on verbose output} + +\item{cor}{flag to use correlation matrix as the input (default: false - uses covariance)} + +\item{...}{arguments to override default algorithm settings (see details)} +} +\description{ +Select an inverse covariance matrix that is a sparse plus low rank decomposition. +} +\details{ +This is a wrapper function for sparse plus low rank iCov estimations performed by a custom ADMM algorithm. + +Therefore, arguments \code{...} should be named. Typically, these are for specifying a penalty parameter, \code{lambda}, or the number of penalties to use. +By default 10 pentalties are used, ranging logarithmically between \code{lambda.min.ratio}*MAX and MAX. 
+MAX is the theoretical upper bound on \code{lambda} and is \code{max|S|}, the maximum absolute value in the data correlation matrix.
+\code{lambda.min.ratio} is 1e-3 by default. Lower values of \code{lambda} require more memory/CPU time to compute, and sometimes huge will throw an error.
+
+The argument \code{nlambda} determines the number of penalties - somewhere between 10-100 is usually good, depending on how the values of empirical correlation are distributed.
+
+One of \code{beta} (penalty for the nuclear norm) or \code{r} (number of ranks) should be supplied, otherwise \code{r=2} is chosen by default.
+}
diff --git a/man/sparseiCov.Rd b/man/sparseiCov.Rd
index 9f71148..5d9e6cf 100644
--- a/man/sparseiCov.Rd
+++ b/man/sparseiCov.Rd
@@ -4,8 +4,7 @@
 \alias{sparseiCov}
 \title{Sparse/penalized estimators of covariance matrices}
 \usage{
-sparseiCov(data, method, npn = FALSE, verbose = FALSE,
-  cov.output = TRUE, ...)
+sparseiCov(data, method, npn = FALSE, verbose = FALSE, cov.output = TRUE, ...)
 }
 \arguments{
 \item{data}{data matrix with features/OTUs in the columns and samples in the rows. Should be transformed by clr for meaningful results, if the data is compositional}
diff --git a/man/spiec.easi.Rd b/man/spiec.easi.Rd
index 3d27baf..e68884c 100644
--- a/man/spiec.easi.Rd
+++ b/man/spiec.easi.Rd
@@ -13,10 +13,18 @@ spiec.easi(data, ...)
 
 \method{spiec.easi}{otu_table}(data, ...)
 
-\method{spiec.easi}{default}(data, method = "glasso",
-  sel.criterion = "stars", verbose = TRUE, pulsar.select = TRUE,
-  pulsar.params = list(), icov.select = pulsar.select,
-  icov.select.params = pulsar.params, ...)
+\method{spiec.easi}{default}(
+  data,
+  method = "glasso",
+  sel.criterion = "stars",
+  verbose = TRUE,
+  pulsar.select = TRUE,
+  pulsar.params = list(),
+  icov.select = pulsar.select,
+  icov.select.params = pulsar.params,
+  lambda.log = TRUE,
+  ...
+)
 }
 \arguments{
 \item{data}{For a matrix, non-normalized count OTU/data table with samples on rows and features/OTUs in columns. Can also by phyloseq or otu_table object.}
@@ -36,6 +44,8 @@ spiec.easi(data, ...)
 \item{icov.select}{deprecated.}
 
 \item{icov.select.params}{deprecated.}
+
+\item{lambda.log}{should values of lambda be distributed logarithmically (\code{TRUE}) or linearly (\code{FALSE}) between \code{lambda.min} and \code{lambda.max}?}
 }
 \description{
 Run the whole SPIEC-EASI pipeline, from data transformation, sparse inverse covariance estimation and model selection.
diff --git a/man/synth_comm_from_counts.Rd b/man/synth_comm_from_counts.Rd
index 8868f7d..9b609f2 100644
--- a/man/synth_comm_from_counts.Rd
+++ b/man/synth_comm_from_counts.Rd
@@ -4,8 +4,16 @@
 \alias{synth_comm_from_counts}
 \title{synth_comm_from_counts}
 \usage{
-synth_comm_from_counts(comm, mar = 2, distr, Sigma = cov(comm), params,
-  n = nrow(comm), retParams = FALSE, ...)
+synth_comm_from_counts(
+  comm,
+  mar = 2,
+  distr,
+  Sigma = cov(comm),
+  params,
+  n = nrow(comm),
+  retParams = FALSE,
+  ...
+) } \arguments{ \item{comm}{community: matrix of counts} diff --git a/man/triu.Rd b/man/triu.Rd new file mode 100644 index 0000000..171a1f3 --- /dev/null +++ b/man/triu.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{triu} +\alias{triu} +\alias{tril} +\alias{triu2diag} +\title{Functions for triangular matrices} +\usage{ +triu(x, k = 1) + +tril(x, k = 1) + +triu2diag(x, diagval = 0) +} +\arguments{ +\item{x}{the data matrix or vector} + +\item{k}{(0/1 flag indicate diagonal should be selected)} + +\item{diagval}{value to be added to the diagonal if converting from upper triangular matrix.} +} +\description{ +Get or symmeterize the upper/lower triangle of a symmetric matrix with the other side zeroed out +} diff --git a/src/ADMM.cpp b/src/ADMM.cpp new file mode 100644 index 0000000..ea2f57d --- /dev/null +++ b/src/ADMM.cpp @@ -0,0 +1,240 @@ +#include +// [[Rcpp::depends(RcppArmadillo)]] + +using namespace Rcpp; +using namespace arma; + + +arma::mat sqrtmNewt2(arma::mat& C, arma::mat& sqrt0, const double& errTol) { + arma::mat X(sqrt0); + arma::mat X_new; +// arma::mat XC; + double err = std::numeric_limits::infinity(); + while (err > errTol) { + X_new = 0.5*(X + solve(X, C, solve_opts::fast)); + err = norm(X_new - X, "fro"); + X = X_new; + } + return X_new; +} + + + +arma::sp_mat SOFTTHRESH2(arma::mat& Sig, const float& lambda, const bool& shrinkDiag = true) { + arma::mat A = symmatu(Sig); + int n = A.n_cols; + arma::sp_mat S(sign(A)); + arma::mat M(abs(A)-lambda); + M.elem(find(M < 0)).zeros(); + if (!shrinkDiag) + M.diag() = A.diag(); + return M % S; +} + + +void svdPowSym(arma::mat& U, arma::vec& d, arma::mat& V, arma::mat& A, int k, int q) { + int l = k; + int m = A.n_rows; + int n = A.n_cols; + arma::mat P = randu(m,k-1); + arma::mat Q, R; + qr(Q, R, (P.t()*A).t()); + for (int j = 0; j( R_ExternalPtrAddr(xp) ) ; +// +// // int maxit=1000; +// // double tol=1e-5; +// // double eps2=1e-12; +// // int SP; +// // int RESTART = 0; +// // // SEXP RV = Rcpp::Nullable<>(); +// // // SEXP RW = Rcpp::Nullable<>(); +// // // SEXP RS = Rcpp::Nullable<>(); +// // // SEXP SCALE = Rcpp::Nullable<>(); +// // // SEXP SHIFT = Rcpp::Nullable<>(); +// // // SEXP CENTER = Rcpp::Nullable<>(); +// // Rcpp::Nullable RV = R_NilValue; +// // Rcpp::Nullable RW = R_NilValue; +// // Rcpp::Nullable RS = R_NilValue; +// // Rcpp::Nullable SCALE = R_NilValue; +// // Rcpp::Nullable SHIFT = R_NilValue; +// // Rcpp::Nullable CENTER = R_NilValue; +// // +// // double svtol = tol; +// // // RESTART <- 0L +// // //RV <- RW <- RS <- NULL +// // +// // int n = A.n_rows; +// // int work = k + 7; +// // +// // arma::vec v = randn(n); +// +// return IRLB; +// //A, k, v, work, maxit, tol, eps2, SP, RESTART, RV, RW, RS, SCALE, SHIFT, CENTER, svtol); +// +// } + + +// // [[Rcpp::export]] +// arma::mat softSVT4(arma::mat& M, int k, double beta=0) { +// List sout = SVD3(M, k); +// arma::mat U = as(sout["u"]); +// arma::mat V = as(sout["v"]); +// arma::vec d = as(sout["d"]); +// if (beta == 0) +// beta = d(k-1); +// arma::vec tmpd = d - beta; +// return U*diagmat(tmpd)*V.t(); +// } + +//' @noRd +// [[Rcpp::export]] +List ADMM(const arma::mat& SigmaO, const double& lambda, arma::mat& I, + arma::mat& Lambda, arma::mat& Y, double beta=0, int r=0, double mu=0, + const double& eta=4/5, const double& muf=1e-6, const int& + maxiter=500, const double& newtol=1e-5, const double& tol=1e-5, + const double& over_relax_par=1.6, bool shrinkDiag=true) { + + int n = SigmaO.n_rows; 
+ if (mu == 0) mu = n; + arma::vec r_norm(maxiter), eps_pri(maxiter); + arma::mat Lambda1 = Lambda.cols(0 , n-1 ); + arma::mat Lambda2 = Lambda.cols(n , n*2-1); + arma::mat Lambda3 = Lambda.cols(n*2, n*3-1); + arma::mat R = Y.cols(0,n-1), L = Y.cols(n*2,n*3-1); + arma::sp_mat S(Y.cols(n,n*2-1)); + arma::mat RY(R), LY(L), SY(S); + arma::mat RO(size(R)), SO(size(S)), LO(size(L)); + arma::mat RA(size(R)), SA(size(S)), LA(size(L)), TA(size(L)); + arma::mat X(size(Y)); +// arma::mat Y_old(Y); + arma::mat K, K2, KI; + int iter; + for (iter = 0; iter < maxiter; iter++) { + // update X = (R,S,L) +// if (iter > 0) { + RA = RY + mu*Lambda1; + SA = SY + mu*Lambda2; + LA = LY + mu*Lambda3; +// } + // update R + K = mu*SigmaO - RA; + K2 = K*K + 4*mu*I; +// KI = sqrtmNewt2(K2, I, newtol); + arma::sqrtmat_sympd(KI, K2); + R = (KI-K)/2; + + // update S + S = SOFTTHRESH2(SA, lambda*mu, shrinkDiag); + if (iter == 0) { + L = LA; + } else { + L = softSVT2(LA, r, mu*beta); +// L = softSVT4(LA, r+1, mu*beta); + } + + X.cols(0 , n-1 ) = R; + X.cols(n , n*2-1) = S; + X.cols(n*2, n*3-1) = L; + + RO = over_relax_par*R + (1-over_relax_par)*RY; + SO = over_relax_par*S + (1-over_relax_par)*SY; + LO = over_relax_par*L + (1-over_relax_par)*LY; + + RA = RO - mu*Lambda1; + SA = SO - mu*Lambda2; + LA = LO - mu*Lambda3; + TA = (RA-SA+LA)/3; + RY = RA - TA; + SY = SA + TA; + LY = LA - TA; + + // update Lambda + Lambda1 = Lambda1 - (RO-RY)/mu; + Lambda2 = Lambda2 - (SO-SY)/mu; + Lambda3 = Lambda3 - (LO-LY)/mu; + + Y.cols(0 , n-1 ) = RY; + Y.cols(n , n*2-1) = SY; + Y.cols(n*2, n*3-1) = LY; + + r_norm(iter) = norm(X - Y, "fro"); + eps_pri(iter) = sqrt(3*n*n)*tol + tol*std::max(norm(X,"fro"), norm(Y,"fro")); + + if (r_norm(iter) < eps_pri(iter)) { + break; + } + mu = std::max(mu*eta, muf); + + } + + Lambda.cols(0 , n-1 ) = Lambda1; + Lambda.cols(n , n*2-1) = Lambda2; + Lambda.cols(n*2, n*3-1) = Lambda3; + + return List::create(_["R"] = R, _["S"] = S, _["L"] = L, + _["Y"] = Y, + _["Lambda"] = Lambda, +// _["RA"] = RA, _["SA"] = SA, _["LA"] = LA, + _["iter"] = iter+1, _["beta"] = beta, _["mu"] = mu, + _["history"] = List::create( + _["r_norm"] = r_norm, + _["eps_pri"] = eps_pri) + ); +} diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..15e29ba --- /dev/null +++ b/src/Makevars @@ -0,0 +1,7 @@ + +#PKG_CXXFLAGS=-I../inst/include +PKG_LIBS= $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) + +## With R 3.1.0 or later, you can uncomment the following line to tell R to +## enable compilation with C++11 (where available) +CXX_STD = CXX11 diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..c214a8f --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,43 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include +#include + +using namespace Rcpp; + +// ADMM +List ADMM(const arma::mat& SigmaO, const double& lambda, arma::mat& I, arma::mat& Lambda, arma::mat& Y, double beta, int r, double mu, const double& eta, const double& muf, const int& maxiter, const double& newtol, const double& tol, const double& over_relax_par, bool shrinkDiag); +RcppExport SEXP _SpiecEasi_ADMM(SEXP SigmaOSEXP, SEXP lambdaSEXP, SEXP ISEXP, SEXP LambdaSEXP, SEXP YSEXP, SEXP betaSEXP, SEXP rSEXP, SEXP muSEXP, SEXP etaSEXP, SEXP mufSEXP, SEXP maxiterSEXP, SEXP newtolSEXP, SEXP tolSEXP, SEXP over_relax_parSEXP, SEXP shrinkDiagSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + 
Rcpp::traits::input_parameter< const arma::mat& >::type SigmaO(SigmaOSEXP); + Rcpp::traits::input_parameter< const double& >::type lambda(lambdaSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type I(ISEXP); + Rcpp::traits::input_parameter< arma::mat& >::type Lambda(LambdaSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type Y(YSEXP); + Rcpp::traits::input_parameter< double >::type beta(betaSEXP); + Rcpp::traits::input_parameter< int >::type r(rSEXP); + Rcpp::traits::input_parameter< double >::type mu(muSEXP); + Rcpp::traits::input_parameter< const double& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const double& >::type muf(mufSEXP); + Rcpp::traits::input_parameter< const int& >::type maxiter(maxiterSEXP); + Rcpp::traits::input_parameter< const double& >::type newtol(newtolSEXP); + Rcpp::traits::input_parameter< const double& >::type tol(tolSEXP); + Rcpp::traits::input_parameter< const double& >::type over_relax_par(over_relax_parSEXP); + Rcpp::traits::input_parameter< bool >::type shrinkDiag(shrinkDiagSEXP); + rcpp_result_gen = Rcpp::wrap(ADMM(SigmaO, lambda, I, Lambda, Y, beta, r, mu, eta, muf, maxiter, newtol, tol, over_relax_par, shrinkDiag)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_SpiecEasi_ADMM", (DL_FUNC) &_SpiecEasi_ADMM, 15}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_SpiecEasi(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/matops.cpp b/src/matops.cpp new file mode 100644 index 0000000..7b283e8 --- /dev/null +++ b/src/matops.cpp @@ -0,0 +1,15 @@ +// #include +// // [[Rcpp::depends(RcppArmadillo)]] +// +// using namespace Rcpp; +// using namespace arma; +// +// // [[Rcpp::export]] +// arma::mat MATAVE2(arma::mat A, arma::mat B) { +// return (A + B)/2; +// } +// +// // [[Rcpp::export]] +// arma::mat MATAVE3(arma::mat A, arma::mat B, arma::mat C) { +// return (A + B + C)/3; +// } diff --git a/src/sqrtNewton.cpp b/src/sqrtNewton.cpp new file mode 100644 index 0000000..331aa13 --- /dev/null +++ b/src/sqrtNewton.cpp @@ -0,0 +1,58 @@ +#include +// [[Rcpp::depends(RcppArmadillo)]] + +using namespace Rcpp; +using namespace arma; + + +// Compute the square root of a matrix using Newton's method +// does arma has a faster implementation? (yes) +arma::mat sqrtmNewt_dense(arma::mat C, arma::mat sqrt0, double errTol) { + arma::mat X = sqrt0; + arma::mat X_new; +// arma::mat XC; + double err = std::numeric_limits::infinity(); + while (err > errTol) { + X_new = 0.5*(X + solve(X, C, solve_opts::fast)); +// X_new = 0.5*(X + XC); + err = norm(X_new - X, "fro"); + X = X_new; + } + return X_new; +} + + +arma::mat sqrtmNewt_sp(arma::mat C, arma::sp_mat sqrt0, double errTol) { + arma::sp_mat X = sqrt0; + arma::mat X_new(X); +// arma::mat XC; + double err = std::numeric_limits::infinity(); + while (err > errTol) { + X_new = 0.5*(X_new + spsolve(X, C, "lapack")); +// X_new = + XC); + err = norm(X_new - X, "fro"); + X = sp_mat(X_new); + } + return X_new; +} + + +arma::mat sqrtmNewt(arma::mat C, SEXP sqrt0, double errTol = 1e-3) { + if (Rf_isS4(sqrt0)) { + if (Rf_inherits(sqrt0, "sparseMatrix")) { + //TODO: make this faster! 
+      return sqrtmNewt_sp(C, as<arma::sp_mat>(sqrt0), errTol);
+    } ;
+    stop("matrix has unknown class") ;
+  } else {
+    return sqrtmNewt_dense(C, as<arma::mat>(sqrt0), errTol);
+  }
+}
+
+
+
+arma::mat solveCpp(arma::mat C, arma::mat X) {
+  arma::mat out;
+  out = solve(C, X);
+  return out;
+}
diff --git a/src/svthresh.cpp b/src/svthresh.cpp
new file mode 100644
index 0000000..0d57d17
--- /dev/null
+++ b/src/svthresh.cpp
@@ -0,0 +1,34 @@
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+
+using namespace Rcpp;
+using namespace arma;
+
+
+// soft-thresholding operator for a symmetric matrix
+arma::sp_mat SOFTTHRESH(arma::mat Sig, float lambda, bool shrinkDiag = true) {
+  arma::mat A = symmatu(Sig);
+  int n = A.n_cols;
+  arma::sp_mat S(sign(A));
+  arma::mat M(abs(A)-lambda);
+  M.elem(find(M < 0)).zeros();
+  if (!shrinkDiag)
+    M.diag() = A.diag();
+  return M % S;
+}
+
+
+// singular-value soft-thresholding: shrink singular values by tau
+// (or by the k-th singular value when k != 0) and zero out negatives
+List softSVT(arma::mat M, float tau=0, int k=0) {
+  arma::mat U, V;
+  arma::vec d;
+
+  svd(U, d, V, M);
+  if (k != 0)
+    tau = d(k); //*1.01;
+  arma::vec tmpd = d - tau;
+  tmpd.elem(find(tmpd < 0)).zeros();
+  M = U*diagmat(tmpd)*V.t();
+  return List::create(Named("M") = M,
+                      Named("tau") = tau,
+                      Named("d") = tmpd);
+}
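To sanity-check the singular-value thresholding used by the slr ADMM solver from the R side, here is a minimal base-R sketch mirroring `softSVT` above (a sketch of the operator only, not an exported SpiecEasi function; note that the C++ `d(k)` is zero-based, so the equivalent R index is `k + 1`):

```r
# Singular-value soft-thresholding: shrink singular values by tau and
# zero out any that become negative (cf. softSVT in src/svthresh.cpp).
soft.svt <- function(M, tau=0, k=0) {
  s <- svd(M)
  if (k != 0) tau <- s$d[k + 1]   # threshold at the (k+1)-th singular value
  d <- pmax(s$d - tau, 0)
  list(M = s$u %*% diag(d) %*% t(s$v), tau = tau, d = d)
}
set.seed(10010)
M <- tcrossprod(matrix(rnorm(20), 10, 2))  # a rank-2, 10x10 matrix
out <- soft.svt(M, k=2)
sum(out$d > 1e-8)                          # effective rank: 2
```

The elementwise analogue for the sparse component is the `SOFTTHRESH` operator above.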