From 81f6f02bbd17539ba6f21a0e04c8587447cddb55 Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Tue, 8 Aug 2023 16:36:46 -0700 Subject: [PATCH 01/19] fixing errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I updated the function to include the code where I load the model object (which yes, I think it basically replaces the equivalent of `data(ethnicityCpGs, envir=environment())` in the ethnicity function) and to require the mixOmics and ExperimentHub packages, ran `devtools::document()`, then `devtools::check()`. I think everything with the function is working now - the error that I am getting now with `devtools::check()` is about the vignette; `valMeta` in the vignette is supposed to be the pData of the `val` object, but we didn't actually upload it to the ExperimentHub so I can't load it. If you could update the ExperimentHub to 1) include the `valMeta` object, which I have added [to the dropbox](https://www.dropbox.com/scl/fi/byok0mkwl7iqwssexg5d2/valMeta.csv?rlkey=c0hle2lwgost83u6o6u4iwzku&dl=0) and 2) delete the `x_test` object and instead include a new `val` object valBMIQ, also [now in the dropbox](https://www.dropbox.com/scl/fi/uteq2joahew6wm9i0wl35/valBMIQ.rds?rlkey=er4trp4bg4xkskvey4yihgw4d&dl=0), then I can try running `devtools::check()` again and hopefully everything works! Last but not least, I downgraded to Bioconductor 3.17 but still getting this warning when I try to load the data from ExperimentHub() > Warning message: > package ‘eoPredData’ is not available for Bioconductor version '3.17' Even though it does seem to install - i.e., I am able to run `mod = eh[["EH8090"]]` and it works to load the data. Anyway, it doesn't seem to be a problem? --- DESCRIPTION | 3 +- NAMESPACE | 3 + R/predictPreeclampsia.R | 184 ++++++++++++++++++------------------- man/predictEthnicity.Rd | 4 +- man/predictPreeclampsia.Rd | 35 +++++++ vignettes/planet.Rmd | 14 ++- 6 files changed, 146 insertions(+), 97 deletions(-) create mode 100644 man/predictPreeclampsia.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2600499..428f8f3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,8 @@ Imports: methods, tibble, magrittr, - dplyr + dplyr, + ExperimentHub Suggests: badger, ggplot2, diff --git a/NAMESPACE b/NAMESPACE index 291a69b..7437be4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,5 +5,8 @@ export(pl_infer_age) export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) +export(predictPreeclampsia) +import(ExperimentHub) +import(mixOmics) importFrom(magrittr,"%>%") importFrom(tibble,tibble) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 51c7732..de13ce8 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -1,93 +1,91 @@ -#' @title predictPreeclampsia -#' -#' @description Uses 45 CpGs to predict early preeclampsia (PE delivered before or at 34 weeks of gestation) -#' on placental DNA methylation microarray data. -#' -#' @details Assigns the class labels "early-PE" or "normotensive" to each sample -#' and returns a class probability. -#' -#' It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data -#' prior to prediction. This was the normalization method used on the training data. -#' -#' @param betas matrix or array of methylation values on the beta scale (0, 1), -#' where the variables are arranged in rows, and samples in columns. 
-#' -#' @return produces a list with components detailed in the `mixOmics::predict` R documentation -#' -#' @examples -#' -#' To predict early preeclampsia on 450k/850k samples -#' -#' Load data -#' data(peBetas) -#' predictPreeclampsia(peBetas, dist = "max.dist") -#' -#' @export predictPreeclampsia -#' - -predictPreeclampsia <- function(betas, ...){ - - # read in data to generate model - data(trainBetas, envir=environment()) - data(trainLabels, envir=environment()) - - # model - set.seed(2022) - mod = mixOmics::splsda(trainBetas, trainLabels, ncomp = 1, keepX = 45) - trainCpGs = colnames(mod)$X - peCpGs = mixOmics::selectVar(mod)$name - - # check that there are no NAs in the predictors (or if there are, how many) - pp <- intersect(colnames(betas), peCpGs) - - if(length(pp) < length(peCpGs)){ - stop(paste( - "Only", length(pp), "out of 45 predictive CpGs present. All 45 predictive CpGs are needed to run the function." - )) - } else { - message(paste(length(pp), "of 45 predictive CpGs present.")) - message("BMIQ normalization is recommended for best results. If choosing other method, it is recommended to compare results to predictions on BMIQ normalized data.") - } - - # set up data for prediction - - # if input data is missing any of the cpgs present in the training data, this function - # adds the ones that are missing as NAs - # necessary for `mixOmics::predict` to work - - outersect = function(x, y) { - sort(c(x[!x%in%y], - y[!y%in%x])) - } - - if(inherits(betas, 'matrix')){ - } else if (inherits(betas, 'array')) { - } else { - - # throw an error - print(paste0("Input data must be a matrix or an array")) - } - - subset <- betas[,colnames(betas) %in% trainCpGs] - - # order - subset <- subset[drop=FALSE,, trainCpGs] - - if(all(colnames(subset) == trainCpGs) == FALSE){ - stop() - } else - - # predict - out <- mixOmics:::predict.mixo_spls(mod, subset) - - # get class probabilities - CP <- out$predict[,,1] - CP <- t(apply(as.matrix(CP), 1, function(data) exp(data)/sum(exp(data)))) - CP <- as.data.frame(CP) %>% tibble::rownames_to_column("Sample_ID") - CP$Pred_Class <- CP$comp1 - CP <- CP %>% - dplyr::mutate(Pred_Class = dplyr::case_when(EOPE > 0.55 ~ "EOPE", - EOPE < 0.55 ~ "Normotensive")) - - return(CP) -} +#' @title predictPreeclampsia +#' +#' @description Uses 45 CpGs to predict early preeclampsia (PE delivered before or at 34 weeks of gestation) +#' on placental DNA methylation microarray data. +#' +#' @details Assigns the class labels "early-PE" or "normotensive" to each sample +#' and returns a class probability. +#' +#' It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +#' prior to prediction. This was the normalization method used on the training data. +#' +#' @param betas matrix or array of methylation values on the beta scale (0, 1), +#' where the variables are arranged in rows, and samples in columns. 
+#' +#' @return produces a list with components detailed in the `mixOmics::predict` R documentation +#' +#' @examples +#' +#' To predict early preeclampsia on 450k/850k samples +#' +#' Load data +#' data(peBetas) +#' predictPreeclampsia(peBetas, dist = "max.dist") +#' +#' @import mixOmics +#' @import ExperimentHub +#' @export predictPreeclampsia +#' + +predictPreeclampsia <- function(betas, ...){ + + # read in data to generate model + eh = ExperimentHub() + mod = eh[['EH8090']] + trainCpGs = colnames(mod$X) + + # check that there are no NAs in the predictors (or if there are, how many) + peCpGs = mixOmics::selectVar(mod)$name + pp <- intersect(colnames(betas), peCpGs) + + if(length(pp) < length(peCpGs)){ + stop(paste( + "Only", length(pp), "out of 45 predictive CpGs present. All 45 predictive CpGs are needed to run the function." + )) + } else { + message(paste(length(pp), "of 45 predictive CpGs present.")) + message("BMIQ normalization is recommended for best results. If choosing other method, it is recommended to compare results to predictions on BMIQ normalized data.") + } + + # set up data for prediction + + # if input data is missing any of the cpgs present in the training data, this function + # adds the ones that are missing as NAs + # necessary for `mixOmics::predict` to work + + outersect = function(x, y) { + sort(c(x[!x%in%y], + y[!y%in%x])) + } + + if(inherits(betas, 'matrix')){ + } else if (inherits(betas, 'array')) { + } else { + + # throw an error + print(paste0("Input data must be a matrix or an array")) + } + + subset <- betas[,colnames(betas) %in% trainCpGs] + + # order + subset <- subset[drop=FALSE,, trainCpGs] + + if(all(colnames(subset) == trainCpGs) == FALSE){ + stop() + } else + + # predict + out <- mixOmics:::predict.mixo_spls(mod, subset) + + # get class probabilities + CP <- out$predict[,,1] + CP <- t(apply(as.matrix(CP), 1, function(data) exp(data)/sum(exp(data)))) + CP <- as.data.frame(CP) %>% tibble::rownames_to_column("Sample_ID") + CP$Pred_Class <- CP$comp1 + CP <- CP %>% + dplyr::mutate(Pred_Class = dplyr::case_when(EOPE > 0.55 ~ "EOPE", + EOPE < 0.55 ~ "Normotensive")) + + return(CP) +} diff --git a/man/predictEthnicity.Rd b/man/predictEthnicity.Rd index 7068ea0..61ecd28 100644 --- a/man/predictEthnicity.Rd +++ b/man/predictEthnicity.Rd @@ -23,8 +23,8 @@ Uses 1860 CpGs to predict self-reported ethnicity on placental microarray data. } \details{ -Predicts self-reported ethnicity from 3 classes: Africans, Asians, -and Caucasians, using placental DNA methylation data measured on the Infinium +Predicts self-reported ethnicity from 3 groups: African, Asian, +and European, using placental DNA methylation data measured on the Infinium 450k/EPIC methylation array. Will return membership probabilities that often reflect genetic ancestry composition. diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd new file mode 100644 index 0000000..7972990 --- /dev/null +++ b/man/predictPreeclampsia.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predictPreeclampsia.R +\name{predictPreeclampsia} +\alias{predictPreeclampsia} +\title{predictPreeclampsia} +\usage{ +predictPreeclampsia(betas, ...) 
+} +\arguments{ +\item{betas}{matrix or array of methylation values on the beta scale (0, 1), +where the variables are arranged in rows, and samples in columns.} +} +\value{ +produces a list with components detailed in the \code{mixOmics::predict} R documentation +} +\description{ +Uses 45 CpGs to predict early preeclampsia (PE delivered before or at 34 weeks of gestation) +on placental DNA methylation microarray data. +} +\details{ +Assigns the class labels "early-PE" or "normotensive" to each sample +and returns a class probability. + +It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +prior to prediction. This was the normalization method used on the training data. +} +\examples{ + +To predict early preeclampsia on 450k/850k samples + +Load data +data(peBetas) +predictPreeclampsia(peBetas, dist = "max.dist") + +} diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 98cc5f7..ca6a1bd 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -329,9 +329,21 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. +```{r include=FALSE} + +library(ExperimentHub) +eh = ExperimentHub() +query(eh, c('eoPred')) +val = eh[["EH8091"]] +# add line to load valMeta + +``` + ```{r} -preds <- predictPreeclampsia(t(val)) +dim(val) # 48 by 341,281 + +preds <- predictPreeclampsia(val) # have a quick look head(preds) From 83876d1caf8f50a8bf5d66e59d9df8a78a717d0e Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Sun, 13 Aug 2023 18:12:18 -0700 Subject: [PATCH 02/19] updating code to fix errors --- DESCRIPTION | 3 ++- R/predictPreeclampsia.R | 22 +++++++++++----------- man/predictPreeclampsia.Rd | 13 +++++++------ vignettes/planet.Rmd | 4 ++-- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 428f8f3..6b40ad6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,8 @@ Imports: tibble, magrittr, dplyr, - ExperimentHub + ExperimentHub, + mixOmics Suggests: badger, ggplot2, diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index de13ce8..82faea7 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -6,7 +6,7 @@ #' @details Assigns the class labels "early-PE" or "normotensive" to each sample #' and returns a class probability. #' -#' It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +#' # It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data #' prior to prediction. This was the normalization method used on the training data. 
#' #' @param betas matrix or array of methylation values on the beta scale (0, 1), @@ -16,11 +16,11 @@ #' #' @examples #' -#' To predict early preeclampsia on 450k/850k samples +#' # To predict early preeclampsia on 450k/850k samples #' -#' Load data -#' data(peBetas) -#' predictPreeclampsia(peBetas, dist = "max.dist") +#' # Load data +#' # data(peBetas) +#' # predictPreeclampsia(peBetas, dist = "max.dist") #' #' @import mixOmics #' @import ExperimentHub @@ -66,25 +66,25 @@ predictPreeclampsia <- function(betas, ...){ print(paste0("Input data must be a matrix or an array")) } - subset <- betas[,colnames(betas) %in% trainCpGs] + betasSubset <- betas[,colnames(betas) %in% trainCpGs] # order - subset <- subset[drop=FALSE,, trainCpGs] + betasSubset <- betasSubset[drop=FALSE,, trainCpGs] - if(all(colnames(subset) == trainCpGs) == FALSE){ + if(all(colnames(betasSubset) == trainCpGs) == FALSE){ stop() } else # predict - out <- mixOmics:::predict.mixo_spls(mod, subset) + out <- mixOmics:::predict.mixo_spls(mod, betasSubset) # get class probabilities CP <- out$predict[,,1] CP <- t(apply(as.matrix(CP), 1, function(data) exp(data)/sum(exp(data)))) CP <- as.data.frame(CP) %>% tibble::rownames_to_column("Sample_ID") - CP$Pred_Class <- CP$comp1 + CP$PE_Status <- CP$comp1 CP <- CP %>% - dplyr::mutate(Pred_Class = dplyr::case_when(EOPE > 0.55 ~ "EOPE", + dplyr::mutate(PE_Status = dplyr::case_when(EOPE > 0.55 ~ "EOPE", EOPE < 0.55 ~ "Normotensive")) return(CP) diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index 7972990..41ee2ee 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -20,16 +20,17 @@ on placental DNA methylation microarray data. \details{ Assigns the class labels "early-PE" or "normotensive" to each sample and returns a class probability. - -It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +} +\section{It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data}{ prior to prediction. This was the normalization method used on the training data. } + \examples{ -To predict early preeclampsia on 450k/850k samples +# To predict early preeclampsia on 450k/850k samples -Load data -data(peBetas) -predictPreeclampsia(peBetas, dist = "max.dist") +# Load data +# data(peBetas) +# predictPreeclampsia(peBetas, dist = "max.dist") } diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index ca6a1bd..4b81626 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -329,7 +329,7 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. 
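For instance (a sketch, assuming `preds` holds the data frame returned by `predictPreeclampsia()`, which includes an `EOPE` probability column; the new column name is only illustrative), a stricter 70% cutoff could be applied like so:

```{r, eval = FALSE}
library(dplyr)

preds %>%
  dplyr::mutate(
    Pred_Class_70 = dplyr::case_when(
      EOPE > 0.7 ~ "EOPE",
      EOPE <= 0.7 ~ "Normotensive"
    )
  )
```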
-```{r include=FALSE} +```{r include=FALSE, eval = FALSE} library(ExperimentHub) eh = ExperimentHub() @@ -339,7 +339,7 @@ val = eh[["EH8091"]] ``` -```{r} +```{r include=FALSE, eval = FALSE} dim(val) # 48 by 341,281 From fe5ed86dfbf2dd71ca7b3df172aa2b0e579da036 Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Wed, 12 Jun 2024 16:23:47 +0200 Subject: [PATCH 03/19] in response to pull request --- DESCRIPTION | 6 ++++-- R/predictPreeclampsia.R | 4 +++- man/planet-package.Rd | 5 +++++ man/predictPreeclampsia.Rd | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6b40ad6..06d6e06 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,8 +4,10 @@ Version: 1.7.0 Authors@R: c(person("Victor", "Yuan", email = "victor.2wy@gmail.com", role = c("aut", "cre")), - person(c("Wendy", "P."), "Robinson", - role = "ctb")) + person(c("Wendy", "P."), "Robinson", + role = "ctb"), + person("Icíar", "Fernández-Boyano", email = "iciarfernandez@outlook.com", + role = c("aut", "ctb"))) URL: https://victor.rbind.io/planet, http://github.com/wvictor14/planet BugReports: diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 82faea7..13de998 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -11,6 +11,8 @@ #' #' @param betas matrix or array of methylation values on the beta scale (0, 1), #' where the variables are arranged in rows, and samples in columns. +#' +#' @param ... feeds into outersect function #' #' @return produces a list with components detailed in the `mixOmics::predict` R documentation #' @@ -76,7 +78,7 @@ predictPreeclampsia <- function(betas, ...){ } else # predict - out <- mixOmics:::predict.mixo_spls(mod, betasSubset) + out <- mixOmics::predict.mixo_spls(mod, betasSubset) # get class probabilities CP <- out$predict[,,1] diff --git a/man/planet-package.Rd b/man/planet-package.Rd index 2f60eed..e80dbb0 100644 --- a/man/planet-package.Rd +++ b/man/planet-package.Rd @@ -22,6 +22,11 @@ Useful links: \author{ \strong{Maintainer}: Victor Yuan \email{victor.2wy@gmail.com} +Authors: +\itemize{ + \item Icíar Fernández-Boyano \email{iciarfernandez@outlook.com} [contributor] +} + Other contributors: \itemize{ \item Wendy P. Robinson [contributor] diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index 41ee2ee..cf8e51a 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -9,6 +9,8 @@ predictPreeclampsia(betas, ...) 
\arguments{ \item{betas}{matrix or array of methylation values on the beta scale (0, 1), where the variables are arranged in rows, and samples in columns.} + +\item{...}{feeds into outersect function} } \value{ produces a list with components detailed in the \code{mixOmics::predict} R documentation From 77a385da7b4bedced737d6e6f152b245be780e18 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 14 Jun 2024 16:16:07 -0700 Subject: [PATCH 04/19] Add Wendy as author / contributor --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 06d6e06..6f832b3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,7 @@ Authors@R: c(person("Victor", "Yuan", email = "victor.2wy@gmail.com", role = c("aut", "cre")), person(c("Wendy", "P."), "Robinson", - role = "ctb"), + role = c("aut", "ctb")), person("Icíar", "Fernández-Boyano", email = "iciarfernandez@outlook.com", role = c("aut", "ctb"))) URL: From bd4b806bf348a7ed629c2acb9e9397be89de4ded Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Mon, 17 Jun 2024 14:23:00 +0200 Subject: [PATCH 05/19] fixed ::, DS Store file and dependencies --- .gitignore | 1 + NAMESPACE | 2 -- R/predictPreeclampsia.R | 6 ++---- man/planet-package.Rd | 6 +----- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 760c4d3..373bcc4 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ Meta pkgdown/ *.rds .vscode/settings.json +.DS_Store diff --git a/NAMESPACE b/NAMESPACE index 7437be4..b601849 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,5 @@ export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) -import(ExperimentHub) -import(mixOmics) importFrom(magrittr,"%>%") importFrom(tibble,tibble) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 13de998..e8d948e 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -24,15 +24,13 @@ #' # data(peBetas) #' # predictPreeclampsia(peBetas, dist = "max.dist") #' -#' @import mixOmics -#' @import ExperimentHub #' @export predictPreeclampsia #' predictPreeclampsia <- function(betas, ...){ # read in data to generate model - eh = ExperimentHub() + eh = ExperimentHub::ExperimentHub() mod = eh[['EH8090']] trainCpGs = colnames(mod$X) @@ -78,7 +76,7 @@ predictPreeclampsia <- function(betas, ...){ } else # predict - out <- mixOmics::predict.mixo_spls(mod, betasSubset) + out <- mixOmics:::predict.mixo_spls(mod, betasSubset) # get class probabilities CP <- out$predict[,,1] diff --git a/man/planet-package.Rd b/man/planet-package.Rd index e80dbb0..4d8fb55 100644 --- a/man/planet-package.Rd +++ b/man/planet-package.Rd @@ -23,13 +23,9 @@ Useful links: \strong{Maintainer}: Victor Yuan \email{victor.2wy@gmail.com} Authors: -\itemize{ - \item Icíar Fernández-Boyano \email{iciarfernandez@outlook.com} [contributor] -} - -Other contributors: \itemize{ \item Wendy P. 
Robinson [contributor] + \item Icíar Fernández-Boyano \email{iciarfernandez@outlook.com} [contributor] } } From a5afdc9c16941ea38d2f3ae9743ca07d85b4df65 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 24 Jan 2025 15:47:03 -0800 Subject: [PATCH 06/19] move heavy bioconductor packages to suggests --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 87414d7..ad739ce 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,9 +23,9 @@ Imports: tibble, magrittr, dplyr, - ExperimentHub, - mixOmics Suggests: + ExperimentHub, + mixOmics, ggplot2, testthat, tidyr, From 96150ba6bfa3bf7455b703a0206bde94d8cb1682 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Wed, 29 Jan 2025 23:24:31 -0800 Subject: [PATCH 07/19] betas input is not transposed --- R/predictPreeclampsia.R | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index e8d948e..41db153 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -21,22 +21,23 @@ #' # To predict early preeclampsia on 450k/850k samples #' #' # Load data -#' # data(peBetas) -#' # predictPreeclampsia(peBetas, dist = "max.dist") +#' library(ExperimentHub) #' +#' # test object +#' x_test <- eh[['EH8403']] +#' x_test %>% predictPreeclampsia() #' @export predictPreeclampsia #' - predictPreeclampsia <- function(betas, ...){ # read in data to generate model - eh = ExperimentHub::ExperimentHub() - mod = eh[['EH8090']] - trainCpGs = colnames(mod$X) + eh <- ExperimentHub::ExperimentHub() + mod <- eh[['EH8090']] + trainCpGs <- colnames(mod$X) # check that there are no NAs in the predictors (or if there are, how many) - peCpGs = mixOmics::selectVar(mod)$name - pp <- intersect(colnames(betas), peCpGs) + peCpGs <- mixOmics::selectVar(mod)$name + pp <- intersect(rownames(betas), peCpGs) if(length(pp) < length(peCpGs)){ stop(paste( @@ -53,30 +54,29 @@ predictPreeclampsia <- function(betas, ...){ # adds the ones that are missing as NAs # necessary for `mixOmics::predict` to work - outersect = function(x, y) { + outersect <- function(x, y) { sort(c(x[!x%in%y], y[!y%in%x])) } if(inherits(betas, 'matrix')){ + } else if (inherits(betas, 'array')) { - } else { + } else { # throw an error print(paste0("Input data must be a matrix or an array")) } - betasSubset <- betas[,colnames(betas) %in% trainCpGs] + betasSubset <- betas[rownames(betas) %in% trainCpGs,] # order - betasSubset <- betasSubset[drop=FALSE,, trainCpGs] + betasSubset <- betasSubset[drop=FALSE,trainCpGs, ] - if(all(colnames(betasSubset) == trainCpGs) == FALSE){ - stop() - } else - - # predict - out <- mixOmics:::predict.mixo_spls(mod, betasSubset) + stopifnot(all(rownames(betasSubset) == trainCpGs)) + + # predict + out <- mixOmics:::predict.mixo_spls(mod, t(betasSubset)) # get class probabilities CP <- out$predict[,,1] @@ -85,7 +85,6 @@ predictPreeclampsia <- function(betas, ...){ CP$PE_Status <- CP$comp1 CP <- CP %>% dplyr::mutate(PE_Status = dplyr::case_when(EOPE > 0.55 ~ "EOPE", - EOPE < 0.55 ~ "Normotensive")) - - return(CP) + EOPE < 0.55 ~ "Normotensive")) + return(tibble::as_tibble(CP)) } From 64709dc520e4afd1992eb2de6666688978b91efa Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Wed, 29 Jan 2025 23:24:37 -0800 Subject: [PATCH 08/19] docs add eopreddata to imports --- DESCRIPTION | 3 ++- man/predictEthnicity.Rd | 4 ++-- man/predictPreeclampsia.Rd | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION 
b/DESCRIPTION index ad739ce..9f2b89e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,7 @@ Imports: tibble, magrittr, dplyr, + eoPredData Suggests: ExperimentHub, mixOmics, @@ -37,7 +38,7 @@ Suggests: License: GPL-2 Encoding: UTF-8 LazyData: false -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 VignetteBuilder: knitr biocViews: Software, DifferentialMethylation, Epigenetics, Microarray, MethylationArray, DNAMethylation, CpGIsland diff --git a/man/predictEthnicity.Rd b/man/predictEthnicity.Rd index 6b088b7..289882d 100644 --- a/man/predictEthnicity.Rd +++ b/man/predictEthnicity.Rd @@ -25,8 +25,8 @@ Uses 1860 CpGs to predict self-reported ethnicity on placental microarray data. } \details{ -Predicts self-reported ethnicity from 3 groups: African, Asian, -and European, using placental DNA methylation data measured on the Infinium +Predicts self-reported ethnicity from 3 classes: Africans, Asians, +and Caucasians, using placental DNA methylation data measured on the Infinium 450k/EPIC methylation array. Will return membership probabilities that often reflect genetic ancestry composition. diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index cf8e51a..b67de41 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -32,7 +32,9 @@ prior to prediction. This was the normalization method used on the training data # To predict early preeclampsia on 450k/850k samples # Load data -# data(peBetas) -# predictPreeclampsia(peBetas, dist = "max.dist") +library(ExperimentHub) +# test object +x_test <- eh[['EH8403']] +x_test \%>\% predictPreeclampsia() } From 363aeba5f727a3dbc57a78ccba00588d0fc2dca1 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 22:40:47 -0800 Subject: [PATCH 09/19] fix eopred vignette section --- vignettes/planet.Rmd | 258 ++++++++++++++++++++++--------------------- 1 file changed, 135 insertions(+), 123 deletions(-) diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 15bcc7d..7244acc 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -8,18 +8,18 @@ output: pkgdown: as_is: true vignette: > - %\VignetteIndexEntry{planet} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} -editor_options: - chunk_output_type: console + %\VignetteIndexEntry{planet} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: source +chunk_output_type: console --- ## Installation ```{r, message = FALSE, warning = FALSE, eval = FALSE} if(!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") + install.packages("BiocManager") BiocManager::install("planet") ``` @@ -60,9 +60,9 @@ how to do both. 
```{r} houseman_estimates <- minfi:::projectCellType( - plBetas[rownames(plCellCpGsThird), ], - plCellCpGsThird, - lessThanOne = FALSE + plBetas[rownames(plCellCpGsThird), ], + plCellCpGsThird, + lessThanOne = FALSE ) head(houseman_estimates) @@ -73,23 +73,23 @@ head(houseman_estimates) ```{r} # robust partial correlations epidish_RPC <- epidish( - beta.m = plBetas[rownames(plCellCpGsThird), ], - ref.m = plCellCpGsThird, - method = "RPC" + beta.m = plBetas[rownames(plCellCpGsThird), ], + ref.m = plCellCpGsThird, + method = "RPC" ) # CIBERSORT epidish_CBS <- epidish( - beta.m = plBetas[rownames(plCellCpGsThird), ], - ref.m = plCellCpGsThird, - method = "CBS" + beta.m = plBetas[rownames(plCellCpGsThird), ], + ref.m = plCellCpGsThird, + method = "CBS" ) # constrained projection (houseman 2012) epidish_CP <- epidish( - beta.m = plBetas[rownames(plCellCpGsThird), ], - ref.m = plCellCpGsThird, - method = "CP" + beta.m = plBetas[rownames(plCellCpGsThird), ], + ref.m = plCellCpGsThird, + method = "CP" ) ``` @@ -103,40 +103,40 @@ data("plColors") # bind estimate data frames and reshape for plotting bind_rows( - houseman_estimates %>% as.data.frame() %>% mutate(algorithm = "CP (Houseman)"), - epidish_RPC$estF %>% as.data.frame() %>% mutate(algorithm = "RPC"), - epidish_CBS$estF %>% as.data.frame() %>% mutate(algorithm = "CBS"), - epidish_CP$estF %>% as.data.frame() %>% mutate(algorithm = "CP (EpiDISH)") + houseman_estimates %>% as.data.frame() %>% mutate(algorithm = "CP (Houseman)"), + epidish_RPC$estF %>% as.data.frame() %>% mutate(algorithm = "RPC"), + epidish_CBS$estF %>% as.data.frame() %>% mutate(algorithm = "CBS"), + epidish_CP$estF %>% as.data.frame() %>% mutate(algorithm = "CP (EpiDISH)") ) %>% - mutate(sample = rep(rownames(houseman_estimates), 4)) %>% - as_tibble() %>% - pivot_longer( - cols = -c(algorithm, sample), - names_to = "component", - values_to = "estimate" - ) %>% - - # relevel for plot - mutate(component = factor(component, - levels = c( - "nRBC", "Endothelial", "Hofbauer", - "Stromal", "Trophoblasts", - "Syncytiotrophoblast" - ) - )) %>% - - # plot - ggplot(aes(x = sample, y = estimate, fill = component)) + - geom_bar(stat = "identity") + - facet_wrap(~algorithm, ncol = 1) + - scale_fill_manual(values = plColors) + - scale_y_continuous( - limits = c(-0.1, 1.1), breaks = c(0, 0.5, 1), - labels = scales::percent - ) + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + - coord_cartesian(ylim = c(0, 1)) + - labs(x = "", fill = "") + mutate(sample = rep(rownames(houseman_estimates), 4)) %>% + as_tibble() %>% + pivot_longer( + cols = -c(algorithm, sample), + names_to = "component", + values_to = "estimate" + ) %>% + + # relevel for plot + mutate(component = factor(component, + levels = c( + "nRBC", "Endothelial", "Hofbauer", + "Stromal", "Trophoblasts", + "Syncytiotrophoblast" + ) + )) %>% + + # plot + ggplot(aes(x = sample, y = estimate, fill = component)) + + geom_bar(stat = "identity") + + facet_wrap(~algorithm, ncol = 1) + + scale_fill_manual(values = plColors) + + scale_y_continuous( + limits = c(-0.1, 1.1), breaks = c(0, 0.5, 1), + labels = scales::percent + ) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + + coord_cartesian(ylim = c(0, 1)) + + labs(x = "", fill = "") ``` Some notes: @@ -197,11 +197,11 @@ sample information data.frame, `plPhenoData`. 
```{r} plPhenoData <- plPhenoData %>% - mutate( - ga_RPC = predictAge(plBetas, type = "RPC"), - ga_CPC = predictAge(plBetas, type = "CPC"), - ga_RRPC = predictAge(plBetas, type = "RRPC") - ) + mutate( + ga_RPC = predictAge(plBetas, type = "RPC"), + ga_CPC = predictAge(plBetas, type = "CPC"), + ga_RRPC = predictAge(plBetas, type = "RRPC") + ) ``` Note that the number of predictors (CpGs) that were used in our data are @@ -215,21 +215,21 @@ each of the 3 gestational age predictors. ```{r, fig.width = 7, fig.height = 5} plPhenoData %>% - # reshape, to plot - pivot_longer( - cols = contains("ga"), - names_to = "clock_type", - names_prefix = "ga_", - values_to = "ga" - ) %>% - - # plot code - ggplot(aes(x = gestation_wk, y = ga, col = disease)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~clock_type) + - theme(legend.position = "top") + - labs(x = "Reported GA (weeks)", y = "Inferred GA (weeks)", col = "") + # reshape, to plot + pivot_longer( + cols = contains("ga"), + names_to = "clock_type", + names_prefix = "ga_", + values_to = "ga" + ) %>% + + # plot code + ggplot(aes(x = gestation_wk, y = ga, col = disease)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~clock_type) + + theme(legend.position = "top") + + labs(x = "Reported GA (weeks)", y = "Inferred GA (weeks)", col = "") ``` ## Ethnicity @@ -243,7 +243,7 @@ all(ethnicityCpGs %in% rownames(plBetas)) results <- predictEthnicity(plBetas) results %>% - tail(8) + tail(8) ``` `predictEthnicity` returns probabilities corresponding to each ethnicity for @@ -261,26 +261,26 @@ require special attention in downstream analyses. ```{r, fig.width = 7} results %>% - ggplot(aes( - x = Prob_Caucasian, y = Prob_African, - col = Predicted_ethnicity - )) + - geom_point(alpha = 0.7) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - scale_x_continuous(labels = scales::percent) + - scale_y_continuous(labels = scales::percent) + - labs(x = "P(Caucasian)", y = "P(African)") + ggplot(aes( + x = Prob_Caucasian, y = Prob_African, + col = Predicted_ethnicity + )) + + geom_point(alpha = 0.7) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + scale_x_continuous(labels = scales::percent) + + scale_y_continuous(labels = scales::percent) + + labs(x = "P(Caucasian)", y = "P(African)") results %>% - ggplot(aes( - x = Prob_Caucasian, y = Prob_Asian, - col = Predicted_ethnicity - )) + - geom_point(alpha = 0.7) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - scale_x_continuous(labels = scales::percent) + - scale_y_continuous(labels = scales::percent) + - labs(x = "P(Caucasian)", y = "P(Asian)") + ggplot(aes( + x = Prob_Caucasian, y = Prob_Asian, + col = Predicted_ethnicity + )) + + geom_point(alpha = 0.7) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + scale_x_continuous(labels = scales::percent) + + scale_y_continuous(labels = scales::percent) + + labs(x = "P(Caucasian)", y = "P(Asian)") ``` We can't compare this to self-reported ethnicity as it is unavailable. But we @@ -321,51 +321,63 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. 
-```{r include=FALSE, eval = FALSE} +```{r include=TRUE, eval = TRUE} library(ExperimentHub) -eh = ExperimentHub() -query(eh, c('eoPred')) -val = eh[["EH8091"]] -# add line to load valMeta +eh <- ExperimentHub() +query(eh, "eoPredData") +# download BMIQ normalized 450k data +x_test <- eh[['EH8403']] +valMeta <- eh[['EH8404']] +preds <- x_test %>% predictPreeclampsia() ``` -```{r include=FALSE, eval = FALSE} - -dim(val) # 48 by 341,281 - -preds <- predictPreeclampsia(val) +Inspect the results: -# have a quick look +```{r include=FALSE, eval = TRUE} head(preds) # join with metadata valMeta <- left_join(valMeta, preds, by="Sample_ID") # visualize results -valMeta %>% - ggplot() + - geom_boxplot(aes(x = Class, - y = EOPE), - width=0.3, - alpha=0.5, - color="darkgrey") + - geom_jitter(aes(x = Class, - y = EOPE, - color = Pred_Class), - alpha = 0.5, - position = position_jitter(width = .1)) + - coord_flip() + - ylab("Class Probability of EOPE") + - xlab("True Class") + - ylim(0,1) + - scale_color_manual(name = "Predicted Class", values=c("#E69F00", "#999999")) + - theme_minimal() - -# if user wishes to use different threshold from 55% probability, as an example 70% -EOpreds %>% mutate(Pred_Class = case_when(EOPE > 0.7 ~ "EOPE", - EOPE < 0.7 ~ "Non-PE Preterm")) +plot_predictions <- function(df, predicted_class_column) { + df %>% + ggplot() + + geom_boxplot( + aes(x = Class, y = EOPE), + width = 0.3, + alpha = 0.5, + color = "darkgrey" + ) + + geom_jitter( + aes(x = Class, y = EOPE, color = {{predicted_class_column}}), + alpha = 0.75, + position = position_jitter(width = .1) + ) + + coord_flip() + + ylab("Class Probability of EOPE") + + xlab("True Class") + + ylim(0,1) + + scale_color_manual( + name = "Predicted Class", + values = c("#E69F00", "skyblue", "#999999") + ) + + theme_minimal() +} +valMeta %>% plot_predictions(PE_Status) +``` + +if user wishes to use different threshold from 55% probability, as an example 70% + +```{r} +valMeta %>% mutate( + Pred_Class = case_when( + EOPE > 0.7 ~ "EOPE", + `Non-PE Preterm` > 0.7 ~ "Non-PE Preterm", + .default = 'low-confidence' + )) %>% plot_predictions(Pred_Class) ``` From 4d1ee7ebb05f6d85dbfdd4fd6f70fafc596d840a Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:00:57 -0800 Subject: [PATCH 10/19] adjust figure dimensions --- vignettes/planet.Rmd | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 7244acc..85fa734 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -5,6 +5,8 @@ output: df_print: "kable" fig_caption: FALSE toc: TRUE + fig.width: 7 + fig.height: 7 pkgdown: as_is: true vignette: > @@ -34,7 +36,7 @@ In this example we are using term villi DNAm data, so we first load the reference cpgs `plCellCpGsThird`. This is a data frame of 600 cpgs, with mean methylation levels for each cell type. -```{r, message = FALSE, warning = FALSE} +```{r load_libraries, message = FALSE, warning = FALSE} # cell deconvolution packages library(minfi) library(EpiDISH) @@ -58,7 +60,7 @@ how to do both. 
#### Minfi -```{r} +```{r houseman} houseman_estimates <- minfi:::projectCellType( plBetas[rownames(plCellCpGsThird), ], plCellCpGsThird, @@ -70,7 +72,7 @@ head(houseman_estimates) #### EpiDISH -```{r} +```{r epidish echo=FALSE} # robust partial correlations epidish_RPC <- epidish( beta.m = plBetas[rownames(plCellCpGsThird), ], @@ -98,7 +100,7 @@ epidish_CP <- epidish( Below, I demonstrate how we can visually compare the different cell composition estimates. -```{r, fig.width = 7, fig.height = 7} +```{r compare_deconvolution, fig.width = 7, fig.height = 7} data("plColors") # bind estimate data frames and reshape for plotting @@ -158,7 +160,7 @@ information from `data(plPhenoData)`. Note that for demonstration purposes, the cpgs have been filtered to a random \~10,000 CpGs, plus the CpGs used in all of the functions from this package. -```{r} +```{r gestational_age} # load example data data(plBetas) data(plPhenoData) @@ -195,7 +197,7 @@ To select the type of clock, we can specify the `type` argument in `predictAge`. We will apply all three clocks on this data, and add the predicted age to the sample information data.frame, `plPhenoData`. -```{r} +```{r predict_ga} plPhenoData <- plPhenoData %>% mutate( ga_RPC = predictAge(plBetas, type = "RPC"), @@ -212,7 +214,7 @@ accuracy. Next, I plot the difference between predicted and reported gestational age, for each of the 3 gestational age predictors. -```{r, fig.width = 7, fig.height = 5} +```{r view_ga, fig.width = 7, fig.height = 5} plPhenoData %>% # reshape, to plot @@ -237,7 +239,7 @@ plPhenoData %>% Before predicting ethnicity You can ensure that you have all features using the `ethnicityCpGs` vector: -```{r} +```{r ethnicity} data(ethnicityCpGs) all(ethnicityCpGs %in% rownames(plBetas)) From 7261f52f4d2c92e3bdc698e26d3fa071bec511e0 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:13:44 -0800 Subject: [PATCH 11/19] copy predict functions over from mixomics --- NAMESPACE | 5 + R/predictPreeclampsia.R | 8 +- R/predict_mixOmics.R | 773 +++++++++++++++++++++++++++++++++++++ man/predict.Rd | 203 ++++++++++ man/predictPreeclampsia.Rd | 4 + 5 files changed, 991 insertions(+), 2 deletions(-) create mode 100644 R/predict_mixOmics.R create mode 100644 man/predict.Rd diff --git a/NAMESPACE b/NAMESPACE index b601849..9702d9d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method(predict,mixo_pls) +S3method(predict,mixo_spls) export("%>%") export(pl_infer_age) export(pl_infer_ethnicity) @@ -7,4 +9,7 @@ export(predictAge) export(predictEthnicity) export(predictPreeclampsia) importFrom(magrittr,"%>%") +importFrom(methods,hasArg) +importFrom(methods,is) +importFrom(stats,setNames) importFrom(tibble,tibble) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 41db153..c07c037 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -16,16 +16,19 @@ #' #' @return produces a list with components detailed in the `mixOmics::predict` R documentation #' -#' @examples +#' @examples \dontrun{ #' #' # To predict early preeclampsia on 450k/850k samples #' #' # Load data #' library(ExperimentHub) +#' eh <- ExperimentHub() +#' query(eh, "eoPredData") #' #' # test object #' x_test <- eh[['EH8403']] #' x_test %>% predictPreeclampsia() +#' } #' @export predictPreeclampsia #' predictPreeclampsia <- function(betas, ...){ @@ -76,7 +79,7 @@ predictPreeclampsia <- function(betas, ...){ stopifnot(all(rownames(betasSubset) == trainCpGs)) # predict - out <- 
mixOmics:::predict.mixo_spls(mod, t(betasSubset)) + out <- predict.mixo_spls(mod, t(betasSubset)) # get class probabilities CP <- out$predict[,,1] @@ -88,3 +91,4 @@ predictPreeclampsia <- function(betas, ...){ EOPE < 0.55 ~ "Normotensive")) return(tibble::as_tibble(CP)) } + diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R new file mode 100644 index 0000000..9efcb9c --- /dev/null +++ b/R/predict_mixOmics.R @@ -0,0 +1,773 @@ +# copied from https://raw.githubusercontent.com/mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R +# on 2025 Jan 30. Some components omitted that are not used in planet. + +# ======================================================================================================== +# This function makes a prediction of a 'newdata' by using the results of 'object'. +# Depending on the class of the object (mint).(block).(s)pls(da) (16 classes so far), the input data is different +# and the preparation of the data is different - different scaling for instance if object="mint...." +# However, the prediction formula is the same for all classes, thus only one code +# ======================================================================================================== + +#' Predict Method for (mint).(block).(s)pls(da) methods +#' +#' Predicted values based on PLS models. New responses and variates are +#' predicted using a fitted model and a new matrix of observations. +#' +#' \code{predict} produces predicted values, obtained by evaluating the +#' PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the +#' frame \code{newdata}. Variates for \code{newdata} are also returned. Please +#' note that this method performs multilevel decomposition and/or log ratio +#' transformations if needed (\code{multilevel} is an input parameter while +#' \code{logratio} is extracted from \code{object}). +#' +#' Different prediction distances are proposed for discriminant analysis. The +#' reason is that our supervised models work with a dummy indicator matrix of +#' \code{Y} to indicate the class membership of each sample. The prediction of +#' a new observation results in either a predicted dummy variable (output +#' \code{object$predict}), or a predicted variate (output +#' \code{object$variates}). Therefore, an appropriate distance needs to be +#' applied to those predicted values to assign the predicted class. We propose +#' distances such as `maximum distance' for the predicted dummy variables, +#' `Mahalanobis distance' and `Centroids distance' for the predicted variates. +#' +#' \code{"max.dist"} is the simplest method to predict the class of a test +#' sample. For each new individual, the class with the largest predicted dummy +#' variable is the predicted class. This distance performs well in single data +#' set analysis with multiclass problems (PLS-DA). +#' +#' \code{"centroids.dist"} allocates to the new observation the class that +#' mimimises the distance between the predicted score and the centroids of the +#' classes calculated on the latent components or variates of the trained +#' model. +#' +#' \code{"mahalanobis.dist"} allocates the new sample the class defined as the +#' centroid distance, but using the Mahalanobis metric in the calculation of +#' the distance. 
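+#'
+#' For example (a hypothetical call, assuming a fitted \code{(s)plsda} model
+#' \code{mod} and a test matrix \code{x_new} whose columns match the training
+#' data), \code{predict(mod, x_new, dist = "max.dist")$class$max.dist} returns
+#' the class assigned to each test sample for each number of components.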
+#' +#' In practice we found that the centroid-based distances +#' (\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically +#' the Mahalanobis distance led to more accurate predictions than the maximum +#' distance for complex classification problems and N-integration problems +#' (block.splsda). The centroid distances consider the prediction in +#' dimensional space spanned by the predicted variates, while the maximum +#' distance considers a single point estimate using the predicted scores on the +#' last dimension of the model. The user can assess the different distances, +#' and choose the prediction distance that leads to the best performance of the +#' model, as highlighted from the tune and perf outputs +#' +#' More (mathematical) details about the prediction distances are available in +#' the supplemental of the mixOmics article (Rohart et al 2017). +#' +#' For a visualisation of those prediction distances, see +#' \code{background.predict} that overlays the prediction area in +#' \code{plotIndiv} for a sPLS-DA object. +#' +#' Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing +#' \eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are +#' the centroids of the classes calculated on the \eqn{X}-variates of the +#' model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the +#' class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis +#' metric in the calculation of the distance. +#' +#' For MINT objects, the \code{study.test} argument is required and provides +#' the grouping factor of \code{newdata}. +#' +#' For multi block analysis (thus block objects), \code{newdata} is a list of +#' matrices whose names are a subset of \code{names(object$X)} and missing +#' blocks are allowed. Several predictions are returned, either for each block +#' or for all blocks. For non discriminant analysis, the predicted values +#' (\code{predict}) are returned for each block and these values are combined +#' by average (\code{AveragedPredict}) or weighted average +#' (\code{WeightedPredict}), using the weights of the blocks that are +#' calculated as the correlation between a block's components and the outcome's +#' components. +#' +#' For discriminant analysis, the predicted class is returned for each block +#' (\code{class}) and each distance (\code{dist}) and these predictions are +#' combined by majority vote (\code{MajorityVote}) or weighted majority vote +#' (\code{WeightedVote}), using the weights of the blocks that are calculated +#' as the correlation between a block's components and the outcome's +#' components. NA means that there is no consensus among the block. For PLS-DA +#' and sPLS-DA objects, the prediction area can be visualised in plotIndiv via +#' the \code{background.predict} function. +#' +#' @aliases predict predict.pls predict.spls predict.plsda predict.splsda +#' predict.mint.pls predict.mint.spls predict.mint.plsda predict.mint.splsda +#' predict.mint.block.pls predict.mint.block.spls predict.mint.block.plsda +#' predict.mint.block.splsda +#' @param object object of class inheriting from +#' \code{"(mint).(block).(s)pls(da)"}. +#' @param newdata data matrix in which to look for for explanatory variables to +#' be used for prediction. Please note that this method does not perform +#' multilevel decomposition or log ratio transformations, which need to be +#' processed beforehand. 
+#' @param study.test For MINT objects, grouping factor indicating which samples +#' of \code{newdata} are from the same study. Overlap with \code{object$study} +#' are allowed. +#' @param dist distance to be applied for discriminant methods to predict the +#' class of new data, should be a subset of \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). Defaults to +#' \code{"all"}. +#' @param multilevel Design matrix for multilevel analysis (for repeated +#' measurements). A numeric matrix or data frame. For a one level factor +#' decomposition, the input is a vector indicating the repeated measures on +#' each individual, i.e. the individuals ID. For a two level decomposition with +#' splsda models, the two factors are included in Y. Finally for a two level +#' decomposition with spls models, 2nd AND 3rd columns in design indicate those +#' factors (see example in \code{?splsda} and \code{?spls}). +#' @param ... not used currently. +#' @return \code{predict} produces a list with the following components: +#' \item{predict}{predicted response values. The dimensions correspond to the +#' observations, the response variables and the model dimension, respectively. +#' For a supervised model, it corresponds to the predicted dummy variables.} +#' \item{variates}{matrix of predicted variates.} \item{B.hat}{matrix of +#' regression coefficients (without the intercept).} +#' +#' \item{AveragedPredict}{if more than one block, returns the average predicted +#' values over the blocks (using the \code{predict} output)} +#' \item{WeightedPredict}{if more than one block, returns the weighted average +#' of the predicted values over the blocks (using the \code{predict} and +#' \code{weights} outputs)} +#' +#' \item{class}{predicted class of \code{newdata} for each +#' \eqn{1,...,}\code{ncomp} components.} +#' +#' \item{MajorityVote}{if more than one block, returns the majority class over +#' the blocks. NA for a sample means that there is no consensus on the +#' predicted class for this particular sample over the blocks.} +#' \item{WeightedVote}{if more than one block, returns the weighted majority +#' class over the blocks. NA for a sample means that there is no consensus on +#' the predicted class for this particular sample over the blocks.} +#' \item{weights}{Returns the weights of each block used for the weighted +#' predictions, for each nrepeat and each fold} +#' +#' \item{centroids}{matrix of coordinates for centroids.} \item{dist}{type of +#' distance requested.} \item{vote}{majority vote result for multi block +#' analysis (see details above).} +#' @author Florian Rohart, Sébastien Déjean, Ignacio González, Kim-Anh Lê Cao, +#' Al J Abadi +#' @seealso \code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, +#' \code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, +#' \code{\link{mint.plsda}}, \code{\link{mint.splsda}}, +#' \code{\link{block.pls}}, \code{\link{block.spls}}, +#' \code{\link{block.plsda}}, \code{\link{block.splsda}}, +#' \code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, +#' \code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and +#' visualisation with \code{\link{background.predict}} and +#' http://www.mixOmics.org for more details. +#' @references +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' +#' Tenenhaus, M. (1998). \emph{La regression PLS: theorie et pratique}. 
Paris: +#' Editions Technic. +#' @keywords regression multivariate +#' @name predict +#' @rdname predict +#' @method predict mixo_pls +#' @importFrom methods hasArg is +#' @importFrom stats setNames +#' @export +predict.mixo_pls <- + function(object, + newdata, + study.test, + dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), + multilevel = NULL, + ... + ) + { + + time=FALSE + + # pass R check + newdata.scale = misdata.all = is.na.X = is.na.newdata = noAveragePredict = NULL + + # input parameter: noAveragePredict=> no averagePredict calculation, used in tune.block.splsda + + if(inherits(object, c("rgcca","sparse.rgcca"))) + stop("no prediction for RGCCA methods") + + #check on dist + if (!any(dist %in% c("all", "max.dist", "centroids.dist", "mahalanobis.dist")) & is(object, "DA")) + stop("ERROR : choose one of the four following modes: 'all', 'max.dist', 'centroids.dist' or 'mahalanobis.dist'") + #end general checks + + ncomp = object$ncomp + + if(is(object, "DA")) # a DA analysis (mint).(block).(s)plsda + { + #if DA analysis, the unmap Y is in ind.mat + Y.factor=object$Y + Y=object$ind.mat + }else{ + #if not DA, Y is in object$Y + Y=object$Y + if(is.null(Y)) # block analysis + { + Y=object$X[[object$indY]] + } + } + q=ncol(Y) + + if(time) time1 = proc.time() + #print(names(list(...))) + + if(hasArg(newdata.scale)) # if an input `newdata.scale' is given, we use that one and we don't scale the data and don't do the logratio or multilevel + { + newdata = list(...)$newdata.scale + object$logratio = NULL + multilevel = NULL + } + + mint.object = c("mint.pls", "mint.spls", "mint.plsda", "mint.splsda") + block.object = c("block.pls", "block.spls", "block.plsda", "block.splsda") + ### if the object is a block, the input newdata is different, we check newdata, make sure it's a list and check newdata/X + if(!inherits(object, block.object)) # not a block (pls/spls/plsda/splsda/mint...) 
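+    # in this (single-block) branch, the data matrix X and newdata are later
+    # wrapped into one-element lists so that the same code path can handle
+    # single- and multi-block objects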
+ { + p=ncol(object$X) + if(is.list(object$X)) + stop("Something is wrong, object$X should be a matrix and it appears to be a list") #this should never happen/intern check + + if(is.list(newdata) & !is.data.frame(newdata)) + stop("'newdata' must be a numeric matrix") + + # deal with near.zero.var in object, to remove the same variable in newdata as in object$X (already removed in object$X) + if(length(object$nzv$Position) > 0) + newdata = newdata[, -object$nzv$Position,drop=FALSE] + + if(all.equal(colnames(newdata),colnames(object$X))!=TRUE) + stop("'newdata' must include all the variables of 'object$X'") + + #not a block, the input newdata should be a matrix + if (length(dim(newdata)) == 2) { + if (ncol(newdata) != p) + stop("'newdata' must be a numeric matrix with ncol = ", p, + " or a vector of length = ", p, ".") + } + + if (length(dim(newdata)) == 0) { + if (length(newdata) != p) + stop("'newdata' must be a numeric matrix with ncol = ", p, + " or a vector of length = ", p, ".") + dim(newdata) = c(1, p) + } + + #check col/rownames of newdata + check=Check.entry.single(newdata, ncomp,q=1) + newdata=check$X + + if(length(rownames(newdata))==0) rownames(newdata)=1:nrow(newdata) + if(max(table(rownames(newdata)))>1) stop('samples should have a unique identifier/rowname') + + # we transform everything in lists + X=list(X=object$X) + object$X=X + newdata=list(newdata=newdata) + + object$indY=2 + ind.match = 1 + + }else{ + + # a block, newdata should be a list, each blocks should have the same number of samples + if(!is.list(newdata)) + stop("'newdata' should be a list") + + if(!is.null(object$indY) && !is(object, "DA")) #if DA, object$X is already without Y + { + X = object$X[-object$indY] + }else{ + X=object$X + } + object$X=X + + p = lapply(X, ncol) + + #matching newdata and X + + # error if no names on keepX or not matching the names in X + if(length(unique(names(newdata)))!=length(newdata) | sum(is.na(match(names(newdata),names(X)))) > 0) + stop("Each entry of 'newdata' must have a unique name corresponding to a block of 'X'") + + # error if not same number of samples in each block of newdata + if (length(unique(sapply(newdata,nrow))) != 1) + stop("All entries of 'newdata' must have the same number of rows") + + # I want to match keepX to X by names + ind.match = match(names(X), names(newdata)) + if(any(is.na(ind.match))) + warning("Some blocks are missing in 'newdata'; the prediction is based on the following blocks only: ", paste(names(X)[!is.na(ind.match)],collapse=", ")) + + #replace missing blocks by 0 + newdataA = list() + for (q in 1:length(X)) + { + + if (!is.na(ind.match[q])) # means there is a newdata with the same name as X[q] #(q <= length(newdata)) + { + newdataA[[q]] = newdata[[ind.match[q]]] + }else{ + newdataA[[q]] = matrix(0,nrow = unique(sapply(newdata,nrow)), ncol = ncol(X[[q]]), dimnames = list(rownames(newdata[[1]]), colnames(X[[q]]))) # if missing, replaced by a 0 matrix of correct size + } + } + names(newdataA) = names(X) + newdata = newdataA + + # deal with near.zero.var in object, to remove the same variable in newdata as in object$X (already removed in object$X) + if(!is.null(object$nzv)) + { + # for each of the input blocks, checks to see if the nzv features have already been removed + # if not, then these features are removed here + for (x in 1:length(newdata)) { + if (nrow(object$nzv[[x]]$Metrics) == 0) { next } + if (all(!(rownames(object$nzv[[x]]$Metrics) %in% colnames(newdata[[x]])))) { next } + + newdata[[x]] <- newdata[[x]][, 
-object$nzv[[x]]$Position,drop=FALSE] + } + } + if(length(newdata)!=length(object$X)) stop("'newdata' must have as many blocks as 'object$X'") + + names(newdata)=names(X) + + if (any(lapply(newdata, function(x){length(dim(x))}) != 2)) { + if (any(unlist(lapply(newdata, ncol)) != unlist(p))) + stop("'newdata' must be a list with ", length(p), " numeric matrix and ncol respectively equal to ", paste(p, collapse = ", "), ".") + } + + if (any(lapply(newdata, function(x){length(dim(x))}) == 0)) { + if (any(unlist(lapply(newdata, ncol)) != unlist(p))) + stop("'newdata' must be a list with ", length(p), " numeric matrix and ncol respectively equal to ", paste(p, collapse = ", "), ".") + dim(newdata) = c(1, p) #don't understand that, Benoit? + } + + #check dimnames and ncomp per block of A + for(q in 1:length(newdata)) + { + check=Check.entry.single(newdata[[q]], ncomp[q],q=q) + newdata[[q]]=check$X + } + names(newdata)=names(X) + + #check that newdata and X have the same variables and that they are in the same order + for (i in 1:length(newdata)) { + X.names <- colnames(X[[i]]) + newdata.names <- colnames(newdata[[i]]) + + if (any(X.names != newdata.names)) { # if there is any difference between colnames + if (length(setdiff(X.names, newdata.names)) > 0) { # if there is presence of novel feature/absence of existing feature + stop(paste("The set of features in 'object$X$", + names(X)[i], "' is different to 'newdata$", + names(newdata)[i], "'. Please ensure they have the same features.", sep = "")) + } + else if (all(sort(X.names) == sort(newdata.names))) { # if it is only a difference in order + stop(paste("The order of features in 'object$X$", + names(X)[i], "' is different to 'newdata$", + names(newdata)[i], "'. Please ensure that you adjust these to the same order", sep = "")) + } + } + } + + #reorder variables in the newdata as in X if needed + if (all.equal(lapply(newdata, colnames), lapply(X, colnames)) != TRUE) { + newdata <- setNames(lapply(seq_along(X), function(i) {newdata[[i]][, colnames(X[[i]])]}), nm = names(X)) + } + #need to reorder variates and loadings to put 'Y' in last + if(!is.null(object$indY)) + { + indY=object$indY + object$variates=c(object$variates[-indY],object$variates[indY]) + object$loadings=c(object$loadings[-indY],object$loadings[indY]) + } + + + } + + # logratio and multilevel transform if necessary + if (!is.null(object$logratio)) + newdata = lapply(newdata, logratio.transfo, logratio = object$logratio) + + if(!is.null(multilevel)) + newdata = lapply(newdata, withinVariation, design = data.frame(multilevel)) + + p = lapply(X, ncol) + q = ncol(Y) + J = length(X) #at this stage we have a list of blocks + variatesX = object$variates[-(J + 1)]; + loadingsX = object$loadings[-(J + 1)] + + scale = object$scale # X and Y are both mean centered by groups and if scale=TRUE they are scaled by groups + + + ### if the object is not a mint analysis, the input study.test is missing and we can go faster to scale the data + + # we only scale the data if the input `newdata.scale' is missing. Only use in tune functions where we do not need to scale for every prediction as we predict on the same newdata for a grid of keepX + if(!hasArg(newdata.scale)) + { + if(!inherits(object, mint.object))#| nlevels(factor(object$study))<=1) #not a mint object or just one level in the study + { # not a mint (pls/spls/plsda/splsda/block...) 
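+        # the sweep() calls below center (and, if the model was fitted with
+        # scale = TRUE, scale) newdata using the training-set column means and
+        # standard deviations stored as "scaled:center" / "scaled:scale"
+        # attributes on each block of object$X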
+ + # scale newdata if just one study + if (!is.null(attr(X[[1]], "scaled:center"))) + newdata[which(!is.na(ind.match))] = lapply(which(!is.na(ind.match)), function(x){sweep(newdata[[x]], 2, STATS = attr(X[[x]], "scaled:center"))}) + if (scale) + newdata[which(!is.na(ind.match))] = lapply(which(!is.na(ind.match)), function(x){sweep(newdata[[x]], 2, FUN = "/", STATS = attr(X[[x]], "scaled:scale"))}) + if (any(unlist(lapply(newdata[which(!is.na(ind.match))], function(x){any(is.infinite(x))})))) { + newdata[which(!is.na(ind.match))] <- lapply(which(!is.na(ind.match)), function(x){df <- newdata[[x]]; df[which(is.infinite(df))] <- NaN; return(df)}) + } + + means.Y = matrix(attr(Y, "scaled:center"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE); + if (scale) + {sigma.Y = matrix(attr(Y, "scaled:scale"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE)}else{sigma.Y=matrix(1,nrow=nrow(newdata[[1]]),ncol=q)} + concat.newdata=newdata + names(concat.newdata)=names(X) + + }else{ + # a mint analysis + + #check study.test + if(missing(study.test)) + { + if(nlevels(object$study)==1) + {study.test=factor(rep(1,nrow(newdata[[1]])))}else{ + stop("'study.test' is missing")} + }else{ + study.test=as.factor(study.test) + } + if (any(unlist(lapply(newdata, nrow)) != length(study.test))) + stop(paste0("'study' must be a factor of length ",nrow(newdata[[1]]),".")) + + #scale newdata if more than one study. If some levels of study.test are included in study.learn, the means/sigmas of study.learn are used to scale + M=nlevels(study.test) + study.learn=factor(object$study) + names.study.learn=levels(study.learn);names.study.test=levels(study.test) + match.study=match(names.study.test,names.study.learn) + match.study.indice=which(!is.na(match.study)) + + newdata.list.study = lapply(newdata,study_split,study.test) #a list of lists. 
[[j]][[m]]: j is for the blocks, m is for the levels(study) + + # each study is normalized, depending on how the normalization was performed on the learning set (center/scale) + newdata.list.study.scale.temp =NULL#vector("list",length=J) #list(list()) + concat.newdata = vector("list",length=J) + for(j in 1:J) # loop on the blocks + { + for (m in 1:M) #loop on the groups of study.test + { + # case 1: sample test is part of the learning data set + if(m%in%match.study.indice) #if some study of study.test were already in the learning set + { + + if(scale==TRUE) + { + if(nlevels(object$study)>1) + { + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],paste0("means:", levels(study.test)[m])), scale = attr(X[[j]],paste0("sigma:", levels(study.test)[m]))) + }else{#if just one level in object$study, the normalisation attributes are not named the same + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],"scaled:center"), scale = attr(X[[j]],"scaled:scale")) + } + } + + if(scale==FALSE) + { + if(nlevels(object$study)>1) + { + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],paste0("means:", levels(study.test)[m])),scale=FALSE) + }else{#if just one level in object$study, the normalisation attributes are not named the same + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],"scaled:center"),scale=FALSE) + } + + + } + + }else{ + # case 2: sample test is a new study # a new study which was not in the learning set + + ## if there is one sample from a new study to be scaled throw a condition and do not scale + if (scale == TRUE & dim(newdata.list.study[[j]][[m]])[1] == 1 ) { + warning("Train data are scaled but the test data include a single sample from a new study (which cannot be scaled). Mkaing prediction without scaling this test sample and thus prediction for this sample should be given with care. Consider scale=FALSE for model, or using more samples for prediction, or using a sample from model studies.\n") + newdata.list.study.scale.temp = newdata.list.study[[j]][[m]] + } else { + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]],center=TRUE,scale=scale) + } + } + #concatenation of the scaled data + concat.newdata[[j]] = rbind(concat.newdata[[j]], unlist(newdata.list.study.scale.temp))#[[j]][[m]])) + } + } + + # we now need to reorganise concat.newdata as newdata. Indeed, the concatenation was done without taking care of the order of the samples in newdata + for(j in 1:J) # loop on the blocks + { + indice.match=match(rownames(newdata[[j]]),rownames(concat.newdata[[j]])) + #match indice + concat.newdata[[j]]=concat.newdata[[j]][indice.match,, drop=FALSE] + concat.newdata[[j]][which(is.na(concat.newdata[[j]]))]=0 # taking care of the NA due to normalisation: put to 0 so they don't influence the product below (in Y.hat), reviens au meme que de supprimer les colonnes sans variances au depart. 
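# (In English: this has the same effect as dropping the zero-variance columns
# at the outset.) Illustrative sketch, not from the upstream code: the
# re-ordering above relies on match() over the rownames, e.g.
#     match(c("s1", "s2"), c("s2", "s1"))   # returns 2 1, the indices that restore the original order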
+ + } + + names(concat.newdata)=names(X) + + means.Y=matrix(0,nrow=nrow(concat.newdata[[1]]),ncol=q) + sigma.Y=matrix(1,nrow=nrow(concat.newdata[[1]]),ncol=q) + + #loop on the blocks to define means.Y and sigma.Y for mint analysis + for(m in 1:M) + { + if(m%in%match.study.indice) #if some study of study.test were already in the learning set + { + if(nlevels(object$study)>1) + { + means.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,paste0("means:", levels(study.test)[m])),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + }else{#if just one level in object$study, the normalisation attributes are not named the same + means.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,"scaled:center"),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + } + + + if(scale==TRUE) + { + # I want to build a vector with sigma.Y for each group + if(nlevels(object$study)>1) + { + sigma.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,paste0("sigma:", levels(study.test)[m])),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + }else{#if just one level in object$study, the normalisation attributes are not named the same + sigma.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,"scaled:scale"),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + } + + + } + } + } + + } + ### end if object is a mint analysis + } else { + means.Y = matrix(attr(Y, "scaled:center"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE); + if (scale) + {sigma.Y = matrix(attr(Y, "scaled:scale"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE)}else{sigma.Y=matrix(1,nrow=nrow(newdata[[1]]),ncol=q)} + concat.newdata=newdata + names(concat.newdata)=names(X) + } + if(time) time2 = proc.time() + if(time) print("scaling") + if(time) print(time2-time1) + + + ### at this stage we have + # X # list of blocks + # Y # observation + # newdata #list of blocks for the prediction, same length as A, scaled + + ### replace NA by 0 in the training data + if( (hasArg(misdata.all) & hasArg(is.na.X)) && any(list(...)$misdata.all)) # ind.na.X: all blocks except Y + { # if misdata.all and ind.na.X are provided, we don't calculate the is.na(X) as it takes time. Used in tune functions. + for(j in c(1:J)[list(...)$misdata.all]) + X[[j]][list(...)$is.na.X[[j]]]=0 # faster than using replace + } else { + # replace NA by 0 + X = lapply(X,function(x) + { + if (anyNA(x)){ + ind = is.na(x) + x[ind] = 0 + } + x + }) + } + + ### replace NA by 0 in the test data + + if( (hasArg(misdata.all) & hasArg(is.na.newdata)) && any(list(...)$misdata.all)) # ind.na.newdata: all blocks of newdata + { + # if misdata.all and ind.na.X are provided, we don't calculate the is.na(X) as it takes time. Used in tune functions. 
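# Illustrative sketch (not part of the upstream code): replace() with a
# precomputed or logical index is what zeroes the missing cells here, e.g.
#     m <- matrix(c(1, NA, 3, NA), 2, 2)
#     replace(m, is.na(m), 0)   # NAs become 0 so they drop out of the matrix products below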
+ concat.newdata = lapply(1:J, function(q){replace(concat.newdata[[q]], list(...)$is.na.newdata[[q]], 0)}) + + } + + if (any(sapply(concat.newdata, anyNA))) + { + concat.newdata = lapply(concat.newdata,function(x) + { + if (anyNA(x)){ + ind = is.na(x) + x[ind] = 0 + } + x + }) + } + + + # replace NA by 0 in Y + Y[is.na(Y)] = 0 + + if(time) time3 = proc.time() + if(time) print("NA") + if(time) print(time3-time2) + + # ----------------------- + # prediction + # ----------------------- + + B.hat = t.pred = Y.hat = list() #= betay + for (i in 1 : J) + { + Pmat = Cmat = Wmat = NULL + + ### Start estimation using formula Y = XW(P'W)C (+ Yr, residuals on Y) See page 136 La regression PLS Theorie et pratique Tenenhaus + # Estimation matrix W, P and C + Pmat = crossprod(X[[i]], variatesX[[i]]) + Cmat = crossprod(Y, variatesX[[i]]) + Wmat = loadingsX[[i]] + + # Prediction Y.hat, B.hat and t.pred + Ypred = lapply(1 : ncomp[i], function(x){concat.newdata[[i]] %*% Wmat[, 1:x] %*% solve(t(Pmat[, 1:x]) %*% Wmat[, 1:x]) %*% t(Cmat)[1:x, ]}) + Ypred = sapply(Ypred, function(x){x*sigma.Y + means.Y}, simplify = "array") + + Y.hat[[i]] = array(Ypred, c(nrow(newdata[[i]]), ncol(Y), ncomp[i])) # in case one observation and only one Y, we need array() to keep it an array with a third dimension being ncomp + + t.pred[[i]] = concat.newdata[[i]] %*% Wmat %*% solve(t(Pmat) %*% Wmat) + t.pred[[i]] = matrix(data = sapply(1:ncol(t.pred[[i]]), + function(x) {t.pred[[i]][, x] * apply(variatesX[[i]], 2, + function(y){(norm(y, type = "2"))^2})[x]}), nrow = nrow(concat.newdata[[i]]), ncol = ncol(t.pred[[i]])) + + B.hat[[i]] = sapply(1 : ncomp[i], function(x){Wmat[, 1:x] %*% solve(t(Pmat[, 1:x]) %*% Wmat[, 1:x]) %*% t(Cmat)[1:x, ]}, simplify = "array") + ### End estimation using formula Y = XW(P'W)C (+ Yr, residuals on Y) See page 136 La regression PLS Theorie et pratique Tenenhaus + + rownames(t.pred[[i]]) = rownames(newdata[[i]]) + colnames(t.pred[[i]]) = paste0("dim", c(1:ncomp[i])) + rownames(Y.hat[[i]]) = rownames(newdata[[i]]) + colnames(Y.hat[[i]]) = colnames(Y) + dimnames(Y.hat[[i]])[[3]]=paste0("dim", c(1:ncomp[i])) + rownames(B.hat[[i]]) = colnames(newdata[[i]]) + colnames(B.hat[[i]]) = colnames(Y) + dimnames(B.hat[[i]])[[3]]=paste0("dim", c(1:ncomp[i])) + + } + + if(time) time4 = proc.time() + if(time) print("Prediction") + if(time) print(time4-time3) + + #-- valeurs sortantes --# + names(Y.hat)=names(t.pred)=names(B.hat)=names(object$X) + + if(time) time4 = proc.time() + + # basic prediction results + if(inherits(object, block.object) & length(object$X)>1 ) + { + out=list(predict=Y.hat[which(!is.na(ind.match))],variates=t.pred[which(!is.na(ind.match))],B.hat=B.hat[which(!is.na(ind.match))]) + + # average prediction over the blocks + temp.all =list() + for(comp in 1:min(ncomp[-object$indY])) #note: all ncomp are the same in v6 as the input parameter is a single value + { + temp = array(0, c(nrow(Y.hat[[1]]), ncol(Y.hat[[1]]), J), dimnames = list(rownames(newdata[[1]]), colnames(Y),names(object$X))) + for(i in 1 : J) + temp[, , i] = Y.hat[[i]][, , comp] + + temp.all[[comp]] = temp + } + names(temp.all) = paste0("dim", c(1:min(ncomp[-object$indY]))) + + if(!hasArg(noAveragePredict)) + { + out$AveragedPredict = array(unlist(lapply(temp.all, function(x){ + apply(x, c(1,2), mean) + + })), dim(Y.hat[[1]]), dimnames = list(rownames(newdata[[1]]), colnames(Y), paste0("dim", c(1:min(ncomp[-object$indY]))))) + + out$WeightedPredict = array(unlist(lapply(temp.all, function(x){ + apply(x, c(1,2), function(z){ + temp = 
aggregate(rowMeans(object$weights),list(z),sum) + ind = which(temp[,2]== max (temp[,2]))# if two max, then NA + if(length(ind) == 1) + { + res = temp[ind, 1] + } else { + res = NA + } + res + })})), dim(Y.hat[[1]]), dimnames = list(rownames(newdata[[1]]), colnames(Y), paste0("dim", c(1:min(ncomp[-object$indY]))))) + } + + + #out$newdata=concat.newdata + }else if(inherits(object, block.object)){ # a block but can have only one block (so e.g. a pls done with a block.pls) + out=list(predict=Y.hat,variates=t.pred,B.hat=B.hat) + + } else {# not a block (pls/spls/plsda/splsda/mint...) + out=list(predict=Y.hat[[1]],variates=t.pred[[1]],B.hat=B.hat[[1]]) + } + + if(time) time5 = proc.time() + if(time) print("Y.hat") + if(time) print(time5-time4) + + + # get the classification for each new sample if the object is a DA + if(is(object, "DA")) # a DA analysis (mint).(block).(s)plsda + { + + if(inherits(object, block.object) & length(object$X)>1 ) + { + if(!hasArg(noAveragePredict)) + { + # predict class of AveragePredict, only with max.dist + out$AveragedPredict.class$max.dist = matrix(sapply(1:ncomp[1], ### List level + function(y){apply(out$AveragedPredict[, , y, drop = FALSE], 1, ### component level + function(z){ + paste(levels(Y.factor)[which(z == max(z))], collapse = "/") + }) ### matrix level + }), nrow = nrow(newdata[[1]]), ncol = ncomp[1]) + + + # predict class of WeightedPredict, only with max.dist + out$WeightedPredict.class$max.dist = matrix(sapply(1:ncomp[1], ### List level + function(y){apply(out$WeightedPredict[, , y, drop = FALSE], 1, ### component level + function(z){ + paste(levels(Y.factor)[which(z == max(z))], collapse = "/") + }) ### matrix level + }), nrow = nrow(newdata[[1]]), ncol = ncomp[1]) + + rownames(out$AveragedPredict.class$max.dist) = rownames(out$WeightedPredict.class$max.dist) = rownames(newdata[[1]]) + colnames(out$AveragedPredict.class$max.dist) = colnames(out$WeightedPredict.class$max.dist) = paste0("dim", c(1:min(ncomp[-object$indY]))) + } + } + + + # creating temporary 'blocks' outputs to pass into the internal_predict.DA function + out.temp=list(predict=Y.hat[which(!is.na(ind.match))],variates=t.pred[which(!is.na(ind.match))],B.hat=B.hat[which(!is.na(ind.match))]) + out.temp$newdata=concat.newdata[which(!is.na(ind.match))] + + # getting classification for each new sample + object.temp = object + object.temp$X = object.temp$X[which(!is.na(ind.match))] + object.temp$variates = object.temp$variates[c(which(!is.na(ind.match)),J+1)] #J+1 is Y + if (!is.null(object$weights)) + weights <- rowMeans(object$weights)[which(!is.na(ind.match))] + else + weights <- NULL + + classif.DA=internal_predict.DA(object=object.temp, q=q, out=out.temp, dist=dist, weights = weights) + out=c(out,classif.DA) + + } + + if(time) time6 = proc.time() + if(time) print("DA") + if(time) print(time6-time5) + + out$call = match.call() + class(out) = paste("predict") + + out + + } + +#' @rdname predict +#' @method predict mixo_spls +#' @export +predict.mixo_spls <- predict.mixo_pls + diff --git a/man/predict.Rd b/man/predict.Rd new file mode 100644 index 0000000..bc64d95 --- /dev/null +++ b/man/predict.Rd @@ -0,0 +1,203 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_mixOmics.R +\name{predict} +\alias{predict} +\alias{predict.mixo_pls} +\alias{predict.pls} +\alias{predict.spls} +\alias{predict.plsda} +\alias{predict.splsda} +\alias{predict.mint.pls} +\alias{predict.mint.spls} +\alias{predict.mint.plsda} +\alias{predict.mint.splsda} 
+\alias{predict.mint.block.pls} +\alias{predict.mint.block.spls} +\alias{predict.mint.block.plsda} +\alias{predict.mint.block.splsda} +\alias{predict.mixo_spls} +\title{Predict Method for (mint).(block).(s)pls(da) methods} +\usage{ +\method{predict}{mixo_pls}( + object, + newdata, + study.test, + dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), + multilevel = NULL, + ... +) + +\method{predict}{mixo_spls}( + object, + newdata, + study.test, + dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), + multilevel = NULL, + ... +) +} +\arguments{ +\item{object}{object of class inheriting from +\code{"(mint).(block).(s)pls(da)"}.} + +\item{newdata}{data matrix in which to look for for explanatory variables to +be used for prediction. Please note that this method does not perform +multilevel decomposition or log ratio transformations, which need to be +processed beforehand.} + +\item{study.test}{For MINT objects, grouping factor indicating which samples +of \code{newdata} are from the same study. Overlap with \code{object$study} +are allowed.} + +\item{dist}{distance to be applied for discriminant methods to predict the +class of new data, should be a subset of \code{"centroids.dist"}, +\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). Defaults to +\code{"all"}.} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements). A numeric matrix or data frame. For a one level factor +decomposition, the input is a vector indicating the repeated measures on +each individual, i.e. the individuals ID. For a two level decomposition with +splsda models, the two factors are included in Y. Finally for a two level +decomposition with spls models, 2nd AND 3rd columns in design indicate those +factors (see example in \code{?splsda} and \code{?spls}).} + +\item{...}{not used currently.} +} +\value{ +\code{predict} produces a list with the following components: +\item{predict}{predicted response values. The dimensions correspond to the +observations, the response variables and the model dimension, respectively. +For a supervised model, it corresponds to the predicted dummy variables.} +\item{variates}{matrix of predicted variates.} \item{B.hat}{matrix of +regression coefficients (without the intercept).} + +\item{AveragedPredict}{if more than one block, returns the average predicted +values over the blocks (using the \code{predict} output)} +\item{WeightedPredict}{if more than one block, returns the weighted average +of the predicted values over the blocks (using the \code{predict} and +\code{weights} outputs)} + +\item{class}{predicted class of \code{newdata} for each +\eqn{1,...,}\code{ncomp} components.} + +\item{MajorityVote}{if more than one block, returns the majority class over +the blocks. NA for a sample means that there is no consensus on the +predicted class for this particular sample over the blocks.} +\item{WeightedVote}{if more than one block, returns the weighted majority +class over the blocks. NA for a sample means that there is no consensus on +the predicted class for this particular sample over the blocks.} +\item{weights}{Returns the weights of each block used for the weighted +predictions, for each nrepeat and each fold} + +\item{centroids}{matrix of coordinates for centroids.} \item{dist}{type of +distance requested.} \item{vote}{majority vote result for multi block +analysis (see details above).} +} +\description{ +Predicted values based on PLS models. 
New responses and variates are +predicted using a fitted model and a new matrix of observations. +} +\details{ +\code{predict} produces predicted values, obtained by evaluating the +PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the +frame \code{newdata}. Variates for \code{newdata} are also returned. Please +note that this method performs multilevel decomposition and/or log ratio +transformations if needed (\code{multilevel} is an input parameter while +\code{logratio} is extracted from \code{object}). + +Different prediction distances are proposed for discriminant analysis. The +reason is that our supervised models work with a dummy indicator matrix of +\code{Y} to indicate the class membership of each sample. The prediction of +a new observation results in either a predicted dummy variable (output +\code{object$predict}), or a predicted variate (output +\code{object$variates}). Therefore, an appropriate distance needs to be +applied to those predicted values to assign the predicted class. We propose +distances such as \verb{maximum distance' for the predicted dummy variables, }Mahalanobis distance' and `Centroids distance' for the predicted variates. + +\code{"max.dist"} is the simplest method to predict the class of a test +sample. For each new individual, the class with the largest predicted dummy +variable is the predicted class. This distance performs well in single data +set analysis with multiclass problems (PLS-DA). + +\code{"centroids.dist"} allocates to the new observation the class that +mimimises the distance between the predicted score and the centroids of the +classes calculated on the latent components or variates of the trained +model. + +\code{"mahalanobis.dist"} allocates the new sample the class defined as the +centroid distance, but using the Mahalanobis metric in the calculation of +the distance. + +In practice we found that the centroid-based distances +(\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically +the Mahalanobis distance led to more accurate predictions than the maximum +distance for complex classification problems and N-integration problems +(block.splsda). The centroid distances consider the prediction in +dimensional space spanned by the predicted variates, while the maximum +distance considers a single point estimate using the predicted scores on the +last dimension of the model. The user can assess the different distances, +and choose the prediction distance that leads to the best performance of the +model, as highlighted from the tune and perf outputs + +More (mathematical) details about the prediction distances are available in +the supplemental of the mixOmics article (Rohart et al 2017). + +For a visualisation of those prediction distances, see +\code{background.predict} that overlays the prediction area in +\code{plotIndiv} for a sPLS-DA object. + +Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing +\eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are +the centroids of the classes calculated on the \eqn{X}-variates of the +model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the +class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis +metric in the calculation of the distance. + +For MINT objects, the \code{study.test} argument is required and provides +the grouping factor of \code{newdata}. 
+ +For multi block analysis (thus block objects), \code{newdata} is a list of +matrices whose names are a subset of \code{names(object$X)} and missing +blocks are allowed. Several predictions are returned, either for each block +or for all blocks. For non discriminant analysis, the predicted values +(\code{predict}) are returned for each block and these values are combined +by average (\code{AveragedPredict}) or weighted average +(\code{WeightedPredict}), using the weights of the blocks that are +calculated as the correlation between a block's components and the outcome's +components. + +For discriminant analysis, the predicted class is returned for each block +(\code{class}) and each distance (\code{dist}) and these predictions are +combined by majority vote (\code{MajorityVote}) or weighted majority vote +(\code{WeightedVote}), using the weights of the blocks that are calculated +as the correlation between a block's components and the outcome's +components. NA means that there is no consensus among the block. For PLS-DA +and sPLS-DA objects, the prediction area can be visualised in plotIndiv via +the \code{background.predict} function. +} +\references{ +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 + +Tenenhaus, M. (1998). \emph{La regression PLS: theorie et pratique}. Paris: +Editions Technic. +} +\seealso{ +\code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, +\code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, +\code{\link{mint.plsda}}, \code{\link{mint.splsda}}, +\code{\link{block.pls}}, \code{\link{block.spls}}, +\code{\link{block.plsda}}, \code{\link{block.splsda}}, +\code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, +\code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and +visualisation with \code{\link{background.predict}} and +http://www.mixOmics.org for more details. +} +\author{ +Florian Rohart, Sébastien Déjean, Ignacio González, Kim-Anh Lê Cao, +Al J Abadi +} +\keyword{multivariate} +\keyword{regression} diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index b67de41..ea612d7 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -28,13 +28,17 @@ prior to prediction. 
This was the normalization method used on the training data } \examples{ +\dontrun{ # To predict early preeclampsia on 450k/850k samples # Load data library(ExperimentHub) +eh <- ExperimentHub() +query(eh, "eoPredData") # test object x_test <- eh[['EH8403']] x_test \%>\% predictPreeclampsia() } +} From d32e51d6e3c15a6f3e27ff28e7a1d2d19b507f05 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:23:39 -0800 Subject: [PATCH 12/19] add other functions from mixOmics --- NAMESPACE | 6 + R/predict_mixOmics.R | 469 ++++++++++++++++++++++++++++++++ man/logratio-transformations.Rd | 58 ++++ man/unmap.Rd | 59 ++++ 4 files changed, 592 insertions(+) create mode 100644 man/logratio-transformations.Rd create mode 100644 man/unmap.Rd diff --git a/NAMESPACE b/NAMESPACE index 9702d9d..24492c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,11 +3,17 @@ S3method(predict,mixo_pls) S3method(predict,mixo_spls) export("%>%") +export(logratio.transfo) export(pl_infer_age) export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) +export(unmap) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,n) +importFrom(dplyr,row_number) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(methods,is) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 9efcb9c..5f29025 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -771,3 +771,472 @@ predict.mixo_pls <- #' @export predict.mixo_spls <- predict.mixo_pls + +# from https://github.com/mixOmicsTeam/mixOmics/blob/master/R/check_entry.R#L111 +################################################################################ +# -------------------------------------- +# Check.entry.single: check input parameter for a single matrix that comes +# from a list of matrix +# -------------------------------------- +# X: a single numeric matrix +# ncomp: the number of components to include in the model +# q: position of X in a list of matrix, as used in horizontal analysis +# (e.g. block.pls) +Check.entry.single = function(X, ncomp, q) +{ + + #-- validation des arguments --# + if (length(dim(X)) != 2) + stop(paste0("'X[[", q, "]]' must be a numeric matrix.")) + + if(! any(class(X) %in% "matrix")) + X = as.matrix(X) + + if (!is.numeric(X)) + stop(paste0("'X[[", q, "]]' must be a numeric matrix.")) + + N = nrow(X) + P = ncol(X) + + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop(paste0( + "invalid number of variates 'ncomp' for matrix 'X[[", q, "]]'.")) + + ncomp = round(ncomp) + + # add colnames and rownames if missing + X.names = dimnames(X)[[2]] + if (is.null(X.names)) + { + X.names = paste("X", 1:P, sep = "") + dimnames(X)[[2]] = X.names + } + + ind.names = dimnames(X)[[1]] + if (is.null(ind.names)) + { + ind.names = 1:N + rownames(X) = ind.names + } + + if (length(unique(rownames(X))) != nrow(X)) + stop("samples should have a unique identifier/rowname") + if (length(unique(X.names)) != P) + stop("Unique indentifier is needed for the columns of X") + + return(list(X=X, ncomp=ncomp, X.names=X.names, ind.names=ind.names)) +} + +# from https://github.com/mixOmicsTeam/mixOmics/blob/master/R/logratio-transformations.R +#' Log-ratio transformation +#' +#' This function applies a log transformation to the data, either CLR or ILR +#' +#' \code{logratio.transfo} applies a log transformation to the data, either CLR +#' (centered log ratio transformation) or ILR (Isometric Log Ratio +#' transformation). 
In the case of CLR log-transformation, X needs to be a +#' matrix of non-negative values and \code{offset} is used to shift the values +#' away from 0, as commonly done with counts data. +#' +#' @param X numeric matrix of predictors +#' @param logratio log-ratio transform to apply, one of "none", "CLR" or "ILR" +#' @param offset Value that is added to X for CLR and ILR log transformation. +#' Default to 0. +#' @return \code{logratio.transfo} simply returns the log-ratio transformed +#' data. +#' @author Florian Rohart, Kim-Anh Lê Cao, Al J Abadi +#' @seealso \code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, +#' \code{\link{plsda}}, \code{\link{splsda}}. +#' @references Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, +#' Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a +#' multivariate statistical framework to gain insight into Microbial +#' Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 +#' +#' John Aitchison. The statistical analysis of compositional data. Journal of +#' the Royal Statistical Society. Series B (Methodological), pages 139-177, +#' 1982. +#' +#' Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component +#' analysis for compositional data with outliers. Environmetrics, +#' 20(6):621-632, 2009. +#' @examples +#' +#' data(diverse.16S) +#' CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') +#' # no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC +#' @name logratio-transformations +NULL +#' @rdname logratio-transformations +#' @export +logratio.transfo <- function(X, + logratio = c('none','CLR','ILR'), + offset = 0) +{ + X <- as.matrix(X) + if (!is.numeric(X)) + stop("X must be a numeric matrix", call. = FALSE) + + logratio <- match.arg(logratio) + if (logratio == 'ILR') + { + if (!is(X, 'ilr')) + { # data are ilr transformed, then the data lose 1 variable, but we'll use V to reconstruct the matrix + X = ilr.transfo(X, offset = offset) + } + } else if (logratio == 'CLR') { + X = clr.transfo(X, offset = offset) + } + return(X) +} + + +#' @importFrom dplyr group_by row_number filter n +internal_predict.DA <- + function(object, out, q, dist, weights) + { + # a DA analysis (mint).(block).(s)plsda + if (!is(object, "DA")) + stop("'Object' is not from a Discriminant Analysis", call.=FALSE) + + out.DA = list() + J = length(object$X) #at this stage we have a list of blocks + p = lapply(object$X, ncol) + t.pred = out$variates + Y.hat = out$predict + newdata = out$newdata #actually concat.newdata + variatesX = object$variates[-(J + 1)]; + ncomp = object$ncomp + + Y = object$Y + Y.prim = unmap(object$Y) + G = cls = list() + for (i in 1 : J) + { + G[[i]] = sapply(1:q, function(x) + {apply(as.matrix(variatesX[[i]][Y.prim[, x] == 1, + ,drop=FALSE]), 2, mean)}) + if (ncomp[i] == 1) + G[[i]] = t(t(G[[i]])) + else + G[[i]] = t(G[[i]]) + colnames(G[[i]]) = paste0("dim", c(1:ncomp[i])) + rownames(G[[i]]) = colnames(object$ind.mat) + + } + names(G)=names(object$X) + + ### Start: Maximum distance + if (any(dist == "all") || any(dist == "max.dist")) + { + cls$max.dist = lapply(1:J, function(x){matrix(sapply(1:ncomp[x], + ### List level + function(y){apply(Y.hat[[x]][, , y, drop = FALSE], 1, + ### component level + function(z){ + paste(levels(Y)[which(z == max(z))], collapse = "/") + }) ### matrix level + }), nrow = nrow(newdata[[x]]), ncol = ncomp[x]) + }) + cls$max.dist = lapply(1:J, function(x){colnames(cls$max.dist[[x]]) = + paste0(rep("comp", ncomp[x]), 1 : ncomp[[x]]); + 
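# Illustrative sketch (not part of the upstream code): ties aside, the
# "max.dist" rule above boils down to picking the class with the largest
# predicted dummy value, e.g.
#     dummy <- c(EOPE = 0.8, Normotensive = 0.2)
#     names(dummy)[which.max(dummy)]   # "EOPE"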
rownames(cls$max.dist[[x]]) = rownames(newdata[[x]]); + return(cls$max.dist[[x]])}) + names(cls$max.dist)=names(object$X) + } + + + ### Start: Centroids distance + if (any(dist == "all") || any(dist == "centroids.dist")) + { + cl = list() + centroids.fun = function(x, G, h, i) { + q = nrow(G[[i]]) + x = matrix(x, nrow = q, ncol = h, byrow = TRUE) + + if (h > 1) { + d = apply((x - G[[i]][, 1:h])^2, 1, sum) + } + else { + d = (x - G[[i]][, 1])^2 + } + cl.id = paste(levels(Y)[which(d == min(d))], collapse = "/") + } + + for (i in 1 : J) + { + cl[[i]] = matrix(nrow = nrow(newdata[[1]]), ncol = ncomp[i]) + + for (h in 1 : ncomp[[i]]) + { + cl.id = apply(matrix(t.pred[[i]][, 1:h], ncol = h), 1, + function(x) {centroids.fun(x = x, G = G, h = h, i = i)}) + cl[[i]][, h] = cl.id + } + } + + cls$centroids.dist = lapply(1:J, function(x){colnames(cl[[x]]) = + paste0(rep("comp", ncomp[x]), 1 : ncomp[[x]]); + rownames(cl[[x]]) = rownames(newdata[[x]]); return(cl[[x]])}) + names(cls$centroids.dist)=names(object$X) + }### End: Centroids distance + + + ### Start: Mahalanobis distance + if (any(dist == "all") || any(dist == "mahalanobis.dist")) + { + cl = list() + Sr.fun = function(x, G, Yprim, h, i) { + q = nrow(G[[i]]) + Xe = Yprim %*% G[[i]][, 1:h] + #Xr = object$variates$X[, 1:h] - Xe + Xr = variatesX[[i]][, 1:h] - Xe + Sr = t(Xr) %*% Xr/nrow(Yprim) + Sr.inv = solve(Sr) + x = matrix(x, nrow = q, ncol = h, byrow = TRUE) + if (h > 1) { + mat = (x - G[[i]][, 1:h]) %*% Sr.inv %*% t(x - G[[i]][, 1:h]) + d = apply(mat^2, 1, sum) + } else { + d = drop(Sr.inv) * (x - G[[i]][, 1])^2 + } + cl.id = paste(levels(Y)[which(d == min(d))], collapse = "/") + } + + for (i in 1 : J){ + cl[[i]] = matrix(nrow = nrow(newdata[[1]]), ncol = ncomp[i]) + + for (h in 1:ncomp[[i]]) { + cl.id = apply(matrix(t.pred[[i]][, 1:h], ncol = h), 1, Sr.fun, + G = G, Yprim = Y.prim, h = h, i = i) + cl[[i]][, h] = cl.id + } + } + + cls$mahalanobis.dist = lapply(1:J, function(x){colnames(cl[[x]]) = + paste0(rep("comp", ncomp[x]), 1 : ncomp[[x]]); + rownames(cl[[x]]) = rownames(newdata[[x]]);return(cl[[x]])}) + names(cls$mahalanobis.dist)=names(object$X) + } ### End: Mahalanobis distance + + out.DA$class = cls + + ### End if discriminant analysis is performed + + # at this stage, we have the classification of each sample for each dataset + # of object$X + # now we need to combine the classification by vote (majority wins), + # only when more than one block, otherwise 'vote' is classic classification + if (length(object$X)>1) + { + for (ijk in 1:length(out.DA$class))# loop on the dist + { + # create a temporary array to make computation on the lists easier + temp=array(, c(nrow(newdata[[1]]), min(ncomp), J)) + for(i in 1:J) + { + temp[, , i] = out.DA$class[[ijk]][[i]][, 1:min(ncomp)] + + } + # look at the majority vote for all dataset of object$X (with table) + # if more than a unique max, we put NA + table.temp = apply(temp,c(1,2), function(x) + {a=table(x); if (length(which(a==max(a)))==1) + {b=names(which.max(a))}else{b=NA}; b}) + colnames(table.temp) = + colnames(out.DA$class[[ijk]][[i]])[1:min(ncomp)] + rownames(table.temp) = rownames(out.DA$class[[ijk]][[i]]) + out.DA$MajorityVote[[ijk]] = table.temp + } + names(out.DA$MajorityVote) = names(out.DA$class) + + #save(list=ls(),file="temp.Rdata") + #stop("") + # weighted vote for each distance, each comp + if(!is.null(weights)) + { + out.WeightedVote = vector("list",length=length(out.DA$class)) + Group.2 = n = x =NULL #CRAN check + for(i in 1:length(out.DA$class)){ #i distance + out = 
matrix(NA_real_,nrow=nrow(newdata[[1]]), ncol=min(ncomp)) + rownames(out) = rownames(newdata[[1]]) + colnames(out) = paste0("comp",1:min(ncomp)) + + for(comp in 1:min(ncomp)){ #comp + data.temp=NULL + for(j in 1:J){ #block + data.temp = rbind(data.temp, + out.DA$class[[i]][[j]][,comp,drop=FALSE]) + } + colnames(data.temp)="pred" + temp=data.frame(data.temp,indiv=rownames(data.temp), + weights=rep(weights,each=nrow(out.DA$class[[1]][[1]]))) + ag = aggregate(temp$weights, + by=list(temp$pred, temp$indiv), FUN=sum) + data_max <- group_by(.data = ag, Group.2) + data_max <- filter(.data = data_max, row_number(x)==n()) + out.comp = as.matrix(data_max[,1]) + rownames(out.comp) = as.matrix(data_max[,2]) + colnames(out.comp) = paste0("comp",comp) + + out[,comp] = out.comp[match(rownames(out), + rownames(out.comp)),] + } + + out.WeightedVote[[i]] = out + } + names(out.WeightedVote) = names(out.DA$class) + out.DA$WeightedVote = out.WeightedVote + + + + if(FALSE){ + out.DA$WeightedVote = lapply(out.DA$class, function(x){ + # x is a distance + class.per.comp = lapply(1:min(ncomp), function(y) { + matrix(sapply(x, function(z) + z[,y, drop = FALSE]),ncol=J)}) + # combine the results per component + names(class.per.comp) = paste0("comp",1:min(ncomp)) + class.weighted.per.comp = sapply(class.per.comp, function(y){ + # for each component + apply(y,1,function(z){ + # we aggregate the results of each individuals using the 'weights' + temp = aggregate(weights,list(as.character(z)),sum) + ind = which(temp[,2]== max (temp[,2])) + # if two max, then NA + if(length(ind) == 1) + { + res = as.character(temp[ind, 1]) + } else { + res = NA + } + res + + }) + + }) + class.weighted.per.comp = matrix(class.weighted.per.comp, + nrow = nrow(class.per.comp[[1]])) + colnames(class.weighted.per.comp) = names(class.per.comp) + rownames(class.weighted.per.comp) = + rownames(out.DA$MajorityVote[[1]]) + class.weighted.per.comp + + }) + } + out.DA$weights = weights + + } + }else{ + out.DA$MajorityVote = lapply(out.DA$class,function(x){x[[1]]}) + } + + block.object = c("block.pls", "block.spls", "block.plsda", "block.spsda") + if (inherits(object, block.object) & J>1) # a block + { + out.DA$centroids = G + }else{ #not a block + out.DA$centroids = G[[1]] + out.DA$class = out.DA$MajorityVote + } + if (any(dist == "all")) + dist = "all" + + out.DA$dist = dist + + out.DA + + } + +# from https://github.com/mixOmicsTeam/mixOmics/blob/766521eaff07b25c9c2ceccdabdf281dd0ef735c/R/unmap.R#L47 +# --------------------------------------------------- +# unmap variates.A variable for (s)plsda +# --------------------------------------------------- +#' Dummy matrix for an outcome factor +#' +#' Converts a class or group vector or factor into a matrix of indicator +#' variables. +#' +#' @param classification A numeric or character vector or factor. Typically the +#' distinct entries of this vector would represent a classification of +#' observations in a data set. +#' @param groups A numeric or character vector indicating the groups from which +#' \code{classification} is drawn. If not supplied, the default is to assumed +#' to be the unique entries of classification. +#' @param noise A single numeric or character value used to indicate the value +#' of \code{groups} corresponding to noise. +#' @return An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, +#' where \emph{n} is the length of samples and \emph{K} the number of classes +#' in the outcome. 
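#'
#' For example, \code{unmap(c("a", "b", "a"))} returns a 3 x 2 indicator
#' matrix in which rows 1 and 3 flag the first class and row 2 flags the
#' second class.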
+#' +#' If a \code{noise} value of symbol is designated, the corresponding indicator +#' variables are relocated to the last column of the matrix. +#' +#' Note: - you can remap an unmap vector using the function \code{map} from the +#' package \pkg{mclust}. - this function should be used to unmap an outcome +#' vector as in the non-supervised methods of mixOmics. For other supervised +#' analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. +#' @author Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi +#' @references +#' C. Fraley and A. E. Raftery (2002). Model-based +#' clustering, discriminant analysis, and density estimation. \emph{Journal of +#' the American Statistical Association 97:611-631}. +#' +#' C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). mclust Version +#' 4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, +#' and Density Estimation. Technical Report No. 597, Department of Statistics, +#' University of Washington. +#' @keywords cluster +#' @export +#' @examples +#' data(nutrimouse) +#' Y = unmap(nutrimouse$diet) +#' Y +#' data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) +#' # data could then used as an input in wrapper.rgcca, which is not, technically, +#' # a supervised method, see ??wrapper.rgcca +unmap <- + function (classification, groups = NULL, noise = NULL) + { + n = length(classification) + u = sort(unique(classification)) + levels = levels(classification)### Add levels + + if (is.null(groups)) + { + groups = u + } else { + if (any(match(u, groups, nomatch = 0) == 0)) + stop("groups incompatible with classification") + miss = match(groups, u, nomatch = 0) == 0 + } + + cgroups = as.character(groups) + if (!is.null(noise)) + { + noiz = match(noise, groups, nomatch = 0) + if (any(noiz == 0)) + stop("noise incompatible with classification") + + groups = c(groups[groups != noise], groups[groups == noise]) + noise = as.numeric(factor(as.character(noise), levels = unique(groups))) + } + + groups = as.numeric(factor(cgroups, levels = unique(cgroups))) + classification = as.numeric(factor(as.character(classification), levels = unique(cgroups))) + k = length(groups) - length(noise) + nam = levels(groups) + + if (!is.null(noise)) + { + k = k + 1 + nam = nam[1:k] + nam[k] = "noise" + } + + z = matrix(0, n, k, dimnames = c(names(classification), nam)) + for (j in 1:k) z[classification == groups[j], j] = 1 + attr(z, "levels") = levels + z + } + diff --git a/man/logratio-transformations.Rd b/man/logratio-transformations.Rd new file mode 100644 index 0000000..a230237 --- /dev/null +++ b/man/logratio-transformations.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_mixOmics.R +\name{logratio-transformations} +\alias{logratio-transformations} +\alias{logratio.transfo} +\title{Log-ratio transformation} +\usage{ +logratio.transfo(X, logratio = c("none", "CLR", "ILR"), offset = 0) +} +\arguments{ +\item{X}{numeric matrix of predictors} + +\item{logratio}{log-ratio transform to apply, one of "none", "CLR" or "ILR"} + +\item{offset}{Value that is added to X for CLR and ILR log transformation. +Default to 0.} +} +\value{ +\code{logratio.transfo} simply returns the log-ratio transformed +data. 
+} +\description{ +This function applies a log transformation to the data, either CLR or ILR +} +\details{ +\code{logratio.transfo} applies a log transformation to the data, either CLR +(centered log ratio transformation) or ILR (Isometric Log Ratio +transformation). In the case of CLR log-transformation, X needs to be a +matrix of non-negative values and \code{offset} is used to shift the values +away from 0, as commonly done with counts data. +} +\examples{ + +data(diverse.16S) +CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') +# no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC +} +\references{ +Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, +Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a +multivariate statistical framework to gain insight into Microbial +Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 + +John Aitchison. The statistical analysis of compositional data. Journal of +the Royal Statistical Society. Series B (Methodological), pages 139-177, +1982. + +Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component +analysis for compositional data with outliers. Environmetrics, +20(6):621-632, 2009. +} +\seealso{ +\code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, +\code{\link{plsda}}, \code{\link{splsda}}. +} +\author{ +Florian Rohart, Kim-Anh Lê Cao, Al J Abadi +} diff --git a/man/unmap.Rd b/man/unmap.Rd new file mode 100644 index 0000000..4afdb73 --- /dev/null +++ b/man/unmap.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_mixOmics.R +\name{unmap} +\alias{unmap} +\title{Dummy matrix for an outcome factor} +\usage{ +unmap(classification, groups = NULL, noise = NULL) +} +\arguments{ +\item{classification}{A numeric or character vector or factor. Typically the +distinct entries of this vector would represent a classification of +observations in a data set.} + +\item{groups}{A numeric or character vector indicating the groups from which +\code{classification} is drawn. If not supplied, the default is to assumed +to be the unique entries of classification.} + +\item{noise}{A single numeric or character value used to indicate the value +of \code{groups} corresponding to noise.} +} +\value{ +An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, +where \emph{n} is the length of samples and \emph{K} the number of classes +in the outcome. + +If a \code{noise} value of symbol is designated, the corresponding indicator +variables are relocated to the last column of the matrix. + +Note: - you can remap an unmap vector using the function \code{map} from the +package \pkg{mclust}. - this function should be used to unmap an outcome +vector as in the non-supervised methods of mixOmics. For other supervised +analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. +} +\description{ +Converts a class or group vector or factor into a matrix of indicator +variables. +} +\examples{ +data(nutrimouse) +Y = unmap(nutrimouse$diet) +Y +data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) +# data could then used as an input in wrapper.rgcca, which is not, technically, +# a supervised method, see ??wrapper.rgcca +} +\references{ +C. Fraley and A. E. Raftery (2002). Model-based +clustering, discriminant analysis, and density estimation. \emph{Journal of +the American Statistical Association 97:611-631}. + +C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). 
mclust Version +4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, +and Density Estimation. Technical Report No. 597, Department of Statistics, +University of Washington. +} +\author{ +Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi +} +\keyword{cluster} From 47b0abedfb9e8a5e39c1c424871e7756f58ec16e Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:31:01 -0800 Subject: [PATCH 13/19] remove links in documention --- R/predict_mixOmics.R | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 5f29025..3e55e8c 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -146,15 +146,7 @@ #' analysis (see details above).} #' @author Florian Rohart, Sébastien Déjean, Ignacio González, Kim-Anh Lê Cao, #' Al J Abadi -#' @seealso \code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, -#' \code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, -#' \code{\link{mint.plsda}}, \code{\link{mint.splsda}}, -#' \code{\link{block.pls}}, \code{\link{block.spls}}, -#' \code{\link{block.plsda}}, \code{\link{block.splsda}}, -#' \code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, -#' \code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and -#' visualisation with \code{\link{background.predict}} and -#' http://www.mixOmics.org for more details. +#' @seealso http://www.mixOmics.org for more details. #' @references #' #' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics @@ -859,11 +851,8 @@ Check.entry.single = function(X, ncomp, q) #' Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component #' analysis for compositional data with outliers. Environmetrics, #' 20(6):621-632, 2009. -#' @examples #' -#' data(diverse.16S) -#' CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') -#' # no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC +#' See mixOmics documentation for more information #' @name logratio-transformations NULL #' @rdname logratio-transformations From d94565b33e06fefe0627d31019b97cea08fa3618 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:31:15 -0800 Subject: [PATCH 14/19] fix chunk error --- vignettes/planet.Rmd | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 85fa734..7c06a70 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -72,7 +72,7 @@ head(houseman_estimates) #### EpiDISH -```{r epidish echo=FALSE} +```{r epidish, results='hide'} # robust partial correlations epidish_RPC <- epidish( beta.m = plBetas[rownames(plCellCpGsThird), ], @@ -323,15 +323,13 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. 
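For instance, a minimal sketch of re-labelling with a stricter cutoff (this
assumes the `preds` data frame returned by `predictPreeclampsia()` in the next
chunk, which carries an `EOPE` class probability column):

```{r relabel, eval=FALSE}
# relabel with a stricter 0.75 cutoff instead of the default 0.55
preds$Pred_Class_strict <- ifelse(preds$EOPE > 0.75, "EOPE", "Normotensive")
```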
-```{r include=TRUE, eval = TRUE} - +```{r predict_pe, include=TRUE, eval = TRUE} library(ExperimentHub) eh <- ExperimentHub() query(eh, "eoPredData") # download BMIQ normalized 450k data x_test <- eh[['EH8403']] -valMeta <- eh[['EH8404']] preds <- x_test %>% predictPreeclampsia() ``` @@ -341,6 +339,7 @@ Inspect the results: head(preds) # join with metadata +valMeta <- eh[['EH8404']] valMeta <- left_join(valMeta, preds, by="Sample_ID") # visualize results From 9f88b161b2a2effc5ca77f8ebbcb7337947e452b Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:53:05 -0800 Subject: [PATCH 15/19] fix cran check errors --- .gitignore | 1 + NAMESPACE | 6 ++---- R/predict_mixOmics.R | 15 ++------------- man/logratio-transformations.Rd | 12 ++---------- man/predict.Rd | 8 -------- man/unmap.Rd | 8 -------- 6 files changed, 7 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index f1697a9..ad3e8c4 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ data-raw/mod.rds data-raw/x_test.rds data-raw/x_train.rds data-raw/y_train.rds +R/bioc-devel.sh diff --git a/NAMESPACE b/NAMESPACE index 24492c1..6ac8431 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,15 +1,11 @@ # Generated by roxygen2: do not edit by hand -S3method(predict,mixo_pls) -S3method(predict,mixo_spls) export("%>%") -export(logratio.transfo) export(pl_infer_age) export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) -export(unmap) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,n) @@ -17,5 +13,7 @@ importFrom(dplyr,row_number) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(methods,is) +importFrom(stats,aggregate) importFrom(stats,setNames) importFrom(tibble,tibble) +importFrom(utils,data) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 3e55e8c..7ba170f 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -161,7 +161,6 @@ #' @method predict mixo_pls #' @importFrom methods hasArg is #' @importFrom stats setNames -#' @export predict.mixo_pls <- function(object, newdata, @@ -760,7 +759,6 @@ predict.mixo_pls <- #' @rdname predict #' @method predict mixo_spls -#' @export predict.mixo_spls <- predict.mixo_pls @@ -837,8 +835,6 @@ Check.entry.single = function(X, ncomp, q) #' @return \code{logratio.transfo} simply returns the log-ratio transformed #' data. #' @author Florian Rohart, Kim-Anh Lê Cao, Al J Abadi -#' @seealso \code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, -#' \code{\link{plsda}}, \code{\link{splsda}}. #' @references Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, #' Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a #' multivariate statistical framework to gain insight into Microbial @@ -856,7 +852,8 @@ Check.entry.single = function(X, ncomp, q) #' @name logratio-transformations NULL #' @rdname logratio-transformations -#' @export +#' @importFrom stats aggregate +#' @importFrom utils data logratio.transfo <- function(X, logratio = c('none','CLR','ILR'), offset = 0) @@ -1176,14 +1173,6 @@ internal_predict.DA <- #' and Density Estimation. Technical Report No. 597, Department of Statistics, #' University of Washington. 
#' @keywords cluster -#' @export -#' @examples -#' data(nutrimouse) -#' Y = unmap(nutrimouse$diet) -#' Y -#' data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) -#' # data could then used as an input in wrapper.rgcca, which is not, technically, -#' # a supervised method, see ??wrapper.rgcca unmap <- function (classification, groups = NULL, noise = NULL) { diff --git a/man/logratio-transformations.Rd b/man/logratio-transformations.Rd index a230237..dbe9c12 100644 --- a/man/logratio-transformations.Rd +++ b/man/logratio-transformations.Rd @@ -29,12 +29,6 @@ transformation). In the case of CLR log-transformation, X needs to be a matrix of non-negative values and \code{offset} is used to shift the values away from 0, as commonly done with counts data. } -\examples{ - -data(diverse.16S) -CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') -# no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC -} \references{ Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a @@ -48,10 +42,8 @@ the Royal Statistical Society. Series B (Methodological), pages 139-177, Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component analysis for compositional data with outliers. Environmetrics, 20(6):621-632, 2009. -} -\seealso{ -\code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, -\code{\link{plsda}}, \code{\link{splsda}}. + +See mixOmics documentation for more information } \author{ Florian Rohart, Kim-Anh Lê Cao, Al J Abadi diff --git a/man/predict.Rd b/man/predict.Rd index bc64d95..88f8bff 100644 --- a/man/predict.Rd +++ b/man/predict.Rd @@ -185,14 +185,6 @@ Tenenhaus, M. (1998). \emph{La regression PLS: theorie et pratique}. Paris: Editions Technic. } \seealso{ -\code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, -\code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, -\code{\link{mint.plsda}}, \code{\link{mint.splsda}}, -\code{\link{block.pls}}, \code{\link{block.spls}}, -\code{\link{block.plsda}}, \code{\link{block.splsda}}, -\code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, -\code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and -visualisation with \code{\link{background.predict}} and http://www.mixOmics.org for more details. } \author{ diff --git a/man/unmap.Rd b/man/unmap.Rd index 4afdb73..3df86b8 100644 --- a/man/unmap.Rd +++ b/man/unmap.Rd @@ -35,14 +35,6 @@ analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. Converts a class or group vector or factor into a matrix of indicator variables. } -\examples{ -data(nutrimouse) -Y = unmap(nutrimouse$diet) -Y -data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) -# data could then used as an input in wrapper.rgcca, which is not, technically, -# a supervised method, see ??wrapper.rgcca -} \references{ C. Fraley and A. E. Raftery (2002). Model-based clustering, discriminant analysis, and density estimation. 
\emph{Journal of From a47a86872cae204087e3fd900a1898bc72446d47 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:59:37 -0800 Subject: [PATCH 16/19] remove documnetation and exports --- NAMESPACE | 2 - R/predict_mixOmics.R | 176 ++------------------------------ man/logratio-transformations.Rd | 50 --------- man/predict.Rd | 96 +---------------- man/unmap.Rd | 51 --------- 5 files changed, 11 insertions(+), 364 deletions(-) delete mode 100644 man/logratio-transformations.Rd delete mode 100644 man/unmap.Rd diff --git a/NAMESPACE b/NAMESPACE index 6ac8431..e40a48a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,7 +13,5 @@ importFrom(dplyr,row_number) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(methods,is) -importFrom(stats,aggregate) importFrom(stats,setNames) importFrom(tibble,tibble) -importFrom(utils,data) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 7ba170f..97b5cb8 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -1,100 +1,9 @@ -# copied from https://raw.githubusercontent.com/mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R -# on 2025 Jan 30. Some components omitted that are not used in planet. -# ======================================================================================================== -# This function makes a prediction of a 'newdata' by using the results of 'object'. -# Depending on the class of the object (mint).(block).(s)pls(da) (16 classes so far), the input data is different -# and the preparation of the data is different - different scaling for instance if object="mint...." -# However, the prediction formula is the same for all classes, thus only one code -# ======================================================================================================== - -#' Predict Method for (mint).(block).(s)pls(da) methods -#' -#' Predicted values based on PLS models. New responses and variates are -#' predicted using a fitted model and a new matrix of observations. -#' -#' \code{predict} produces predicted values, obtained by evaluating the -#' PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the -#' frame \code{newdata}. Variates for \code{newdata} are also returned. Please -#' note that this method performs multilevel decomposition and/or log ratio -#' transformations if needed (\code{multilevel} is an input parameter while -#' \code{logratio} is extracted from \code{object}). -#' -#' Different prediction distances are proposed for discriminant analysis. The -#' reason is that our supervised models work with a dummy indicator matrix of -#' \code{Y} to indicate the class membership of each sample. The prediction of -#' a new observation results in either a predicted dummy variable (output -#' \code{object$predict}), or a predicted variate (output -#' \code{object$variates}). Therefore, an appropriate distance needs to be -#' applied to those predicted values to assign the predicted class. We propose -#' distances such as `maximum distance' for the predicted dummy variables, -#' `Mahalanobis distance' and `Centroids distance' for the predicted variates. -#' -#' \code{"max.dist"} is the simplest method to predict the class of a test -#' sample. For each new individual, the class with the largest predicted dummy -#' variable is the predicted class. This distance performs well in single data -#' set analysis with multiclass problems (PLS-DA). 
-#' -#' \code{"centroids.dist"} allocates to the new observation the class that -#' mimimises the distance between the predicted score and the centroids of the -#' classes calculated on the latent components or variates of the trained -#' model. -#' -#' \code{"mahalanobis.dist"} allocates the new sample the class defined as the -#' centroid distance, but using the Mahalanobis metric in the calculation of -#' the distance. -#' -#' In practice we found that the centroid-based distances -#' (\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically -#' the Mahalanobis distance led to more accurate predictions than the maximum -#' distance for complex classification problems and N-integration problems -#' (block.splsda). The centroid distances consider the prediction in -#' dimensional space spanned by the predicted variates, while the maximum -#' distance considers a single point estimate using the predicted scores on the -#' last dimension of the model. The user can assess the different distances, -#' and choose the prediction distance that leads to the best performance of the -#' model, as highlighted from the tune and perf outputs -#' -#' More (mathematical) details about the prediction distances are available in -#' the supplemental of the mixOmics article (Rohart et al 2017). -#' -#' For a visualisation of those prediction distances, see -#' \code{background.predict} that overlays the prediction area in -#' \code{plotIndiv} for a sPLS-DA object. -#' -#' Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing -#' \eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are -#' the centroids of the classes calculated on the \eqn{X}-variates of the -#' model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the -#' class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis -#' metric in the calculation of the distance. -#' -#' For MINT objects, the \code{study.test} argument is required and provides -#' the grouping factor of \code{newdata}. -#' -#' For multi block analysis (thus block objects), \code{newdata} is a list of -#' matrices whose names are a subset of \code{names(object$X)} and missing -#' blocks are allowed. Several predictions are returned, either for each block -#' or for all blocks. For non discriminant analysis, the predicted values -#' (\code{predict}) are returned for each block and these values are combined -#' by average (\code{AveragedPredict}) or weighted average -#' (\code{WeightedPredict}), using the weights of the blocks that are -#' calculated as the correlation between a block's components and the outcome's -#' components. +#' predict mixxo from #' -#' For discriminant analysis, the predicted class is returned for each block -#' (\code{class}) and each distance (\code{dist}) and these predictions are -#' combined by majority vote (\code{MajorityVote}) or weighted majority vote -#' (\code{WeightedVote}), using the weights of the blocks that are calculated -#' as the correlation between a block's components and the outcome's -#' components. NA means that there is no consensus among the block. For PLS-DA -#' and sPLS-DA objects, the prediction area can be visualised in plotIndiv via -#' the \code{background.predict} function. +#' copied from mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R +#' on 2025 Jan 30. Some components omitted that are not used in planet. 
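# Illustrative sketch, not part of the patch: the MajorityVote/WeightedVote
# outputs that the removed text describes for multi-block (block.splsda)
# models; that documentation is dropped from this copy since planet does not
# need it. Assumes mixOmics and its nutrimouse example data are available.
library(mixOmics)
data(nutrimouse)
blocks <- list(gene = nutrimouse$gene, lipid = nutrimouse$lipid)
fit <- block.splsda(X = blocks, Y = nutrimouse$diet, ncomp = 2,
                    keepX = list(gene = c(10, 10), lipid = c(5, 5)))
pred <- predict(fit, newdata = blocks, dist = "centroids.dist")
pred$WeightedVote$centroids.dist   # consensus class across blocks, per component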
#' -#' @aliases predict predict.pls predict.spls predict.plsda predict.splsda -#' predict.mint.pls predict.mint.spls predict.mint.plsda predict.mint.splsda -#' predict.mint.block.pls predict.mint.block.spls predict.mint.block.plsda -#' predict.mint.block.splsda #' @param object object of class inheriting from #' \code{"(mint).(block).(s)pls(da)"}. #' @param newdata data matrix in which to look for for explanatory variables to @@ -161,6 +70,9 @@ #' @method predict mixo_pls #' @importFrom methods hasArg is #' @importFrom stats setNames +#' @examples +#' # example code +#' predict.mixo_pls <- function(object, newdata, @@ -817,43 +729,6 @@ Check.entry.single = function(X, ncomp, q) return(list(X=X, ncomp=ncomp, X.names=X.names, ind.names=ind.names)) } -# from https://github.com/mixOmicsTeam/mixOmics/blob/master/R/logratio-transformations.R -#' Log-ratio transformation -#' -#' This function applies a log transformation to the data, either CLR or ILR -#' -#' \code{logratio.transfo} applies a log transformation to the data, either CLR -#' (centered log ratio transformation) or ILR (Isometric Log Ratio -#' transformation). In the case of CLR log-transformation, X needs to be a -#' matrix of non-negative values and \code{offset} is used to shift the values -#' away from 0, as commonly done with counts data. -#' -#' @param X numeric matrix of predictors -#' @param logratio log-ratio transform to apply, one of "none", "CLR" or "ILR" -#' @param offset Value that is added to X for CLR and ILR log transformation. -#' Default to 0. -#' @return \code{logratio.transfo} simply returns the log-ratio transformed -#' data. -#' @author Florian Rohart, Kim-Anh Lê Cao, Al J Abadi -#' @references Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, -#' Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a -#' multivariate statistical framework to gain insight into Microbial -#' Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 -#' -#' John Aitchison. The statistical analysis of compositional data. Journal of -#' the Royal Statistical Society. Series B (Methodological), pages 139-177, -#' 1982. -#' -#' Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component -#' analysis for compositional data with outliers. Environmetrics, -#' 20(6):621-632, 2009. -#' -#' See mixOmics documentation for more information -#' @name logratio-transformations -NULL -#' @rdname logratio-transformations -#' @importFrom stats aggregate -#' @importFrom utils data logratio.transfo <- function(X, logratio = c('none','CLR','ILR'), offset = 0) @@ -1134,45 +1009,6 @@ internal_predict.DA <- } -# from https://github.com/mixOmicsTeam/mixOmics/blob/766521eaff07b25c9c2ceccdabdf281dd0ef735c/R/unmap.R#L47 -# --------------------------------------------------- -# unmap variates.A variable for (s)plsda -# --------------------------------------------------- -#' Dummy matrix for an outcome factor -#' -#' Converts a class or group vector or factor into a matrix of indicator -#' variables. -#' -#' @param classification A numeric or character vector or factor. Typically the -#' distinct entries of this vector would represent a classification of -#' observations in a data set. -#' @param groups A numeric or character vector indicating the groups from which -#' \code{classification} is drawn. If not supplied, the default is to assumed -#' to be the unique entries of classification. -#' @param noise A single numeric or character value used to indicate the value -#' of \code{groups} corresponding to noise. 
-#' @return An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, -#' where \emph{n} is the length of samples and \emph{K} the number of classes -#' in the outcome. -#' -#' If a \code{noise} value of symbol is designated, the corresponding indicator -#' variables are relocated to the last column of the matrix. -#' -#' Note: - you can remap an unmap vector using the function \code{map} from the -#' package \pkg{mclust}. - this function should be used to unmap an outcome -#' vector as in the non-supervised methods of mixOmics. For other supervised -#' analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. -#' @author Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi -#' @references -#' C. Fraley and A. E. Raftery (2002). Model-based -#' clustering, discriminant analysis, and density estimation. \emph{Journal of -#' the American Statistical Association 97:611-631}. -#' -#' C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). mclust Version -#' 4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, -#' and Density Estimation. Technical Report No. 597, Department of Statistics, -#' University of Washington. -#' @keywords cluster unmap <- function (classification, groups = NULL, noise = NULL) { diff --git a/man/logratio-transformations.Rd b/man/logratio-transformations.Rd deleted file mode 100644 index dbe9c12..0000000 --- a/man/logratio-transformations.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict_mixOmics.R -\name{logratio-transformations} -\alias{logratio-transformations} -\alias{logratio.transfo} -\title{Log-ratio transformation} -\usage{ -logratio.transfo(X, logratio = c("none", "CLR", "ILR"), offset = 0) -} -\arguments{ -\item{X}{numeric matrix of predictors} - -\item{logratio}{log-ratio transform to apply, one of "none", "CLR" or "ILR"} - -\item{offset}{Value that is added to X for CLR and ILR log transformation. -Default to 0.} -} -\value{ -\code{logratio.transfo} simply returns the log-ratio transformed -data. -} -\description{ -This function applies a log transformation to the data, either CLR or ILR -} -\details{ -\code{logratio.transfo} applies a log transformation to the data, either CLR -(centered log ratio transformation) or ILR (Isometric Log Ratio -transformation). In the case of CLR log-transformation, X needs to be a -matrix of non-negative values and \code{offset} is used to shift the values -away from 0, as commonly done with counts data. -} -\references{ -Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, -Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a -multivariate statistical framework to gain insight into Microbial -Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 - -John Aitchison. The statistical analysis of compositional data. Journal of -the Royal Statistical Society. Series B (Methodological), pages 139-177, -1982. - -Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component -analysis for compositional data with outliers. Environmetrics, -20(6):621-632, 2009. 
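# Illustrative sketch, not part of the patch: the CLR log-ratio transform
# documented above, reusing the diverse.16S data from the example removed in
# this patch (assumes mixOmics is installed; per that example, no offset is
# passed because one was already applied before the TSS scaling).
library(mixOmics)
data(diverse.16S)
clr <- logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR')
clr[1:3, 1:3]   # CLR-transformed values for the first few samples and taxa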
- -See mixOmics documentation for more information -} -\author{ -Florian Rohart, Kim-Anh Lê Cao, Al J Abadi -} diff --git a/man/predict.Rd b/man/predict.Rd index 88f8bff..0d836a2 100644 --- a/man/predict.Rd +++ b/man/predict.Rd @@ -3,20 +3,8 @@ \name{predict} \alias{predict} \alias{predict.mixo_pls} -\alias{predict.pls} -\alias{predict.spls} -\alias{predict.plsda} -\alias{predict.splsda} -\alias{predict.mint.pls} -\alias{predict.mint.spls} -\alias{predict.mint.plsda} -\alias{predict.mint.splsda} -\alias{predict.mint.block.pls} -\alias{predict.mint.block.spls} -\alias{predict.mint.block.plsda} -\alias{predict.mint.block.splsda} \alias{predict.mixo_spls} -\title{Predict Method for (mint).(block).(s)pls(da) methods} +\title{predict mixxo from} \usage{ \method{predict}{mixo_pls}( object, @@ -95,86 +83,12 @@ distance requested.} \item{vote}{majority vote result for multi block analysis (see details above).} } \description{ -Predicted values based on PLS models. New responses and variates are -predicted using a fitted model and a new matrix of observations. +copied from mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R +on 2025 Jan 30. Some components omitted that are not used in planet. } -\details{ -\code{predict} produces predicted values, obtained by evaluating the -PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the -frame \code{newdata}. Variates for \code{newdata} are also returned. Please -note that this method performs multilevel decomposition and/or log ratio -transformations if needed (\code{multilevel} is an input parameter while -\code{logratio} is extracted from \code{object}). +\examples{ +# example code -Different prediction distances are proposed for discriminant analysis. The -reason is that our supervised models work with a dummy indicator matrix of -\code{Y} to indicate the class membership of each sample. The prediction of -a new observation results in either a predicted dummy variable (output -\code{object$predict}), or a predicted variate (output -\code{object$variates}). Therefore, an appropriate distance needs to be -applied to those predicted values to assign the predicted class. We propose -distances such as \verb{maximum distance' for the predicted dummy variables, }Mahalanobis distance' and `Centroids distance' for the predicted variates. - -\code{"max.dist"} is the simplest method to predict the class of a test -sample. For each new individual, the class with the largest predicted dummy -variable is the predicted class. This distance performs well in single data -set analysis with multiclass problems (PLS-DA). - -\code{"centroids.dist"} allocates to the new observation the class that -mimimises the distance between the predicted score and the centroids of the -classes calculated on the latent components or variates of the trained -model. - -\code{"mahalanobis.dist"} allocates the new sample the class defined as the -centroid distance, but using the Mahalanobis metric in the calculation of -the distance. - -In practice we found that the centroid-based distances -(\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically -the Mahalanobis distance led to more accurate predictions than the maximum -distance for complex classification problems and N-integration problems -(block.splsda). The centroid distances consider the prediction in -dimensional space spanned by the predicted variates, while the maximum -distance considers a single point estimate using the predicted scores on the -last dimension of the model. 
The user can assess the different distances, -and choose the prediction distance that leads to the best performance of the -model, as highlighted from the tune and perf outputs - -More (mathematical) details about the prediction distances are available in -the supplemental of the mixOmics article (Rohart et al 2017). - -For a visualisation of those prediction distances, see -\code{background.predict} that overlays the prediction area in -\code{plotIndiv} for a sPLS-DA object. - -Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing -\eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are -the centroids of the classes calculated on the \eqn{X}-variates of the -model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the -class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis -metric in the calculation of the distance. - -For MINT objects, the \code{study.test} argument is required and provides -the grouping factor of \code{newdata}. - -For multi block analysis (thus block objects), \code{newdata} is a list of -matrices whose names are a subset of \code{names(object$X)} and missing -blocks are allowed. Several predictions are returned, either for each block -or for all blocks. For non discriminant analysis, the predicted values -(\code{predict}) are returned for each block and these values are combined -by average (\code{AveragedPredict}) or weighted average -(\code{WeightedPredict}), using the weights of the blocks that are -calculated as the correlation between a block's components and the outcome's -components. - -For discriminant analysis, the predicted class is returned for each block -(\code{class}) and each distance (\code{dist}) and these predictions are -combined by majority vote (\code{MajorityVote}) or weighted majority vote -(\code{WeightedVote}), using the weights of the blocks that are calculated -as the correlation between a block's components and the outcome's -components. NA means that there is no consensus among the block. For PLS-DA -and sPLS-DA objects, the prediction area can be visualised in plotIndiv via -the \code{background.predict} function. } \references{ Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics diff --git a/man/unmap.Rd b/man/unmap.Rd deleted file mode 100644 index 3df86b8..0000000 --- a/man/unmap.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict_mixOmics.R -\name{unmap} -\alias{unmap} -\title{Dummy matrix for an outcome factor} -\usage{ -unmap(classification, groups = NULL, noise = NULL) -} -\arguments{ -\item{classification}{A numeric or character vector or factor. Typically the -distinct entries of this vector would represent a classification of -observations in a data set.} - -\item{groups}{A numeric or character vector indicating the groups from which -\code{classification} is drawn. If not supplied, the default is to assumed -to be the unique entries of classification.} - -\item{noise}{A single numeric or character value used to indicate the value -of \code{groups} corresponding to noise.} -} -\value{ -An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, -where \emph{n} is the length of samples and \emph{K} the number of classes -in the outcome. - -If a \code{noise} value of symbol is designated, the corresponding indicator -variables are relocated to the last column of the matrix. 
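# Illustrative sketch, not part of the patch: the 0/1 indicator matrix that
# unmap() builds from a class vector, as documented above (assumes mixOmics
# is installed; planet carries its own internal copy of unmap()).
library(mixOmics)
data(nutrimouse)
Y <- unmap(nutrimouse$diet)
dim(Y)    # 40 samples by 5 diet classes
head(Y)   # one column per class, a single 1 per row
# (s)PLS-DA fits use a dummy Y like this internally in place of the factor.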
- -Note: - you can remap an unmap vector using the function \code{map} from the -package \pkg{mclust}. - this function should be used to unmap an outcome -vector as in the non-supervised methods of mixOmics. For other supervised -analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. -} -\description{ -Converts a class or group vector or factor into a matrix of indicator -variables. -} -\references{ -C. Fraley and A. E. Raftery (2002). Model-based -clustering, discriminant analysis, and density estimation. \emph{Journal of -the American Statistical Association 97:611-631}. - -C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). mclust Version -4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, -and Density Estimation. Technical Report No. 597, Department of Statistics, -University of Washington. -} -\author{ -Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi -} -\keyword{cluster} From 098e38d8a649322713c10d2db1818cfa9ab6e92e Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 31 Jan 2025 00:02:32 -0800 Subject: [PATCH 17/19] remove deprecations --- R/planet-deprecated.R | 14 -------------- R/predictAge.R | 5 ----- R/predictEthnicity.R | 5 ----- R/predictPreeclampsia.R | 4 ++-- man/planet-deprecated.Rd | 17 ----------------- man/predictPreeclampsia.Rd | 3 +-- 6 files changed, 3 insertions(+), 45 deletions(-) delete mode 100644 man/planet-deprecated.Rd diff --git a/R/planet-deprecated.R b/R/planet-deprecated.R index 13c3583..e69de29 100644 --- a/R/planet-deprecated.R +++ b/R/planet-deprecated.R @@ -1,14 +0,0 @@ -#' Deprecated functions in \pkg{planet} -#' -#' These functions still work but will be removed (defunct) in the next version. -#' -#' \itemize{ -#' \item \code{\link{pl_infer_ethnicity}}: This function has been renamed -#' \code{\link{predictEthnicity}} -#' -#' \item \code{\link{pl_infer_age}}: This function has been renamed -#' \code{\link{predictAge}} -#' } -#' -#' @name planet-deprecated -NULL \ No newline at end of file diff --git a/R/predictAge.R b/R/predictAge.R index d5959fe..1645365 100644 --- a/R/predictAge.R +++ b/R/predictAge.R @@ -89,9 +89,4 @@ predictAge <- function(betas, type = "RPC") { as.vector() return(age) -} - -pl_infer_age <- function(betas, type = 'RPC'){ - .Deprecated('predictAge') - predictAge(betas = betas, type = type) } \ No newline at end of file diff --git a/R/predictEthnicity.R b/R/predictEthnicity.R index a8165da..860f65f 100644 --- a/R/predictEthnicity.R +++ b/R/predictEthnicity.R @@ -147,8 +147,3 @@ glmnet_softmax <- function(x, ignore_labels = FALSE) { } pclass } - -pl_infer_ethnicity <- function(betas, threshold = 0.75) { - .Deprecated("predictEthnicity") - predictEthnicity(betas = betas, threshold = threshold) -} diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index c07c037..375e4c4 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -16,7 +16,7 @@ #' #' @return produces a list with components detailed in the `mixOmics::predict` R documentation #' -#' @examples \dontrun{ +#' @examples #' #' # To predict early preeclampsia on 450k/850k samples #' @@ -28,7 +28,7 @@ #' # test object #' x_test <- eh[['EH8403']] #' x_test %>% predictPreeclampsia() -#' } +#' #' @export predictPreeclampsia #' predictPreeclampsia <- function(betas, ...){ diff --git a/man/planet-deprecated.Rd b/man/planet-deprecated.Rd deleted file mode 100644 index 8625f0d..0000000 --- a/man/planet-deprecated.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% 
Please edit documentation in R/planet-deprecated.R -\name{planet-deprecated} -\alias{planet-deprecated} -\title{Deprecated functions in \pkg{planet}} -\description{ -These functions still work but will be removed (defunct) in the next version. -} -\details{ -\itemize{ -\item \code{\link{pl_infer_ethnicity}}: This function has been renamed -\code{\link{predictEthnicity}} - -\item \code{\link{pl_infer_age}}: This function has been renamed -\code{\link{predictAge}} -} -} diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index ea612d7..e1e90a9 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -28,7 +28,6 @@ prior to prediction. This was the normalization method used on the training data } \examples{ -\dontrun{ # To predict early preeclampsia on 450k/850k samples @@ -40,5 +39,5 @@ query(eh, "eoPredData") # test object x_test <- eh[['EH8403']] x_test \%>\% predictPreeclampsia() -} + } From 474e4dbe21d2b922114a3cd1422b2cb5bdffff98 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 31 Jan 2025 08:52:18 -0800 Subject: [PATCH 18/19] remove deprecated exports --- NAMESPACE | 2 -- R/predictAge.R | 2 -- R/predictEthnicity.R | 3 --- man/predictAge.Rd | 1 - man/predictEthnicity.Rd | 1 - 5 files changed, 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e40a48a..6ce34a1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,6 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(pl_infer_age) -export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) diff --git a/R/predictAge.R b/R/predictAge.R index 1645365..58d17cf 100644 --- a/R/predictAge.R +++ b/R/predictAge.R @@ -31,8 +31,6 @@ #' mutate(inferred_ga = predictAge(plBetas, type = "RPC")) #' #' @export predictAge -#' @export pl_infer_age -#' @aliases pl_infer_age #' predictAge <- function(betas, type = "RPC") { data(ageCpGs, envir = environment()) diff --git a/R/predictEthnicity.R b/R/predictEthnicity.R index 860f65f..b7e15b3 100644 --- a/R/predictEthnicity.R +++ b/R/predictEthnicity.R @@ -30,9 +30,6 @@ #' predictEthnicity(plBetas) #' #' @export predictEthnicity -#' @export pl_infer_ethnicity -#' @aliases pl_infer_ethnicity - predictEthnicity <- function(betas, threshold = 0.75, force = FALSE) { data(ethnicityCpGs, envir=environment()) pf <- intersect(ethnicityCpGs, rownames(betas)) diff --git a/man/predictAge.Rd b/man/predictAge.Rd index c9085f5..fca97d2 100644 --- a/man/predictAge.Rd +++ b/man/predictAge.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/predictAge.R \name{predictAge} \alias{predictAge} -\alias{pl_infer_age} \title{Predicts gestational age using placental DNA methylation microarray data} \usage{ diff --git a/man/predictEthnicity.Rd b/man/predictEthnicity.Rd index 289882d..7ffe88f 100644 --- a/man/predictEthnicity.Rd +++ b/man/predictEthnicity.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/predictEthnicity.R \name{predictEthnicity} \alias{predictEthnicity} -\alias{pl_infer_ethnicity} \title{Predicts ethnicity using placental DNA methylation microarray data} \usage{ predictEthnicity(betas, threshold = 0.75, force = FALSE) From b64fbf934164056c1d708598f5b4fb807a7308e1 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Sat, 1 Feb 2025 08:59:36 -0800 Subject: [PATCH 19/19] remove eopreddata warning --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9f2b89e..25a326e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,8 +22,7 @@ Imports: methods, tibble, magrittr, - 
dplyr, - eoPredData + dplyr Suggests: ExperimentHub, mixOmics,