From 81f6f02bbd17539ba6f21a0e04c8587447cddb55 Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Tue, 8 Aug 2023 16:36:46 -0700 Subject: [PATCH 01/19] fixing errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I updated the function to include the code where I load the model object (which yes, I think it basically replaces the equivalent of `data(ethnicityCpGs, envir=environment())` in the ethnicity function) and to require the mixOmics and ExperimentHub packages, ran `devtools::document()`, then `devtools::check()`. I think everything with the function is working now - the error that I am getting now with `devtools::check()` is about the vignette; `valMeta` in the vignette is supposed to be the pData of the `val` object, but we didn't actually upload it to the ExperimentHub so I can't load it. If you could update the ExperimentHub to 1) include the `valMeta` object, which I have added [to the dropbox](https://www.dropbox.com/scl/fi/byok0mkwl7iqwssexg5d2/valMeta.csv?rlkey=c0hle2lwgost83u6o6u4iwzku&dl=0) and 2) delete the `x_test` object and instead include a new `val` object valBMIQ, also [now in the dropbox](https://www.dropbox.com/scl/fi/uteq2joahew6wm9i0wl35/valBMIQ.rds?rlkey=er4trp4bg4xkskvey4yihgw4d&dl=0), then I can try running `devtools::check()` again and hopefully everything works! Last but not least, I downgraded to Bioconductor 3.17 but still getting this warning when I try to load the data from ExperimentHub() > Warning message: > package ‘eoPredData’ is not available for Bioconductor version '3.17' Even though it does seem to install - i.e., I am able to run `mod = eh[["EH8090"]]` and it works to load the data. Anyway, it doesn't seem to be a problem? --- DESCRIPTION | 3 +- NAMESPACE | 3 + R/predictPreeclampsia.R | 184 ++++++++++++++++++------------------- man/predictEthnicity.Rd | 4 +- man/predictPreeclampsia.Rd | 35 +++++++ vignettes/planet.Rmd | 14 ++- 6 files changed, 146 insertions(+), 97 deletions(-) create mode 100644 man/predictPreeclampsia.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2600499..428f8f3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,8 @@ Imports: methods, tibble, magrittr, - dplyr + dplyr, + ExperimentHub Suggests: badger, ggplot2, diff --git a/NAMESPACE b/NAMESPACE index 291a69b..7437be4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,5 +5,8 @@ export(pl_infer_age) export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) +export(predictPreeclampsia) +import(ExperimentHub) +import(mixOmics) importFrom(magrittr,"%>%") importFrom(tibble,tibble) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 51c7732..de13ce8 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -1,93 +1,91 @@ -#' @title predictPreeclampsia -#' -#' @description Uses 45 CpGs to predict early preeclampsia (PE delivered before or at 34 weeks of gestation) -#' on placental DNA methylation microarray data. -#' -#' @details Assigns the class labels "early-PE" or "normotensive" to each sample -#' and returns a class probability. -#' -#' It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data -#' prior to prediction. This was the normalization method used on the training data. -#' -#' @param betas matrix or array of methylation values on the beta scale (0, 1), -#' where the variables are arranged in rows, and samples in columns. 
-#' -#' @return produces a list with components detailed in the `mixOmics::predict` R documentation -#' -#' @examples -#' -#' To predict early preeclampsia on 450k/850k samples -#' -#' Load data -#' data(peBetas) -#' predictPreeclampsia(peBetas, dist = "max.dist") -#' -#' @export predictPreeclampsia -#' - -predictPreeclampsia <- function(betas, ...){ - - # read in data to generate model - data(trainBetas, envir=environment()) - data(trainLabels, envir=environment()) - - # model - set.seed(2022) - mod = mixOmics::splsda(trainBetas, trainLabels, ncomp = 1, keepX = 45) - trainCpGs = colnames(mod)$X - peCpGs = mixOmics::selectVar(mod)$name - - # check that there are no NAs in the predictors (or if there are, how many) - pp <- intersect(colnames(betas), peCpGs) - - if(length(pp) < length(peCpGs)){ - stop(paste( - "Only", length(pp), "out of 45 predictive CpGs present. All 45 predictive CpGs are needed to run the function." - )) - } else { - message(paste(length(pp), "of 45 predictive CpGs present.")) - message("BMIQ normalization is recommended for best results. If choosing other method, it is recommended to compare results to predictions on BMIQ normalized data.") - } - - # set up data for prediction - - # if input data is missing any of the cpgs present in the training data, this function - # adds the ones that are missing as NAs - # necessary for `mixOmics::predict` to work - - outersect = function(x, y) { - sort(c(x[!x%in%y], - y[!y%in%x])) - } - - if(inherits(betas, 'matrix')){ - } else if (inherits(betas, 'array')) { - } else { - - # throw an error - print(paste0("Input data must be a matrix or an array")) - } - - subset <- betas[,colnames(betas) %in% trainCpGs] - - # order - subset <- subset[drop=FALSE,, trainCpGs] - - if(all(colnames(subset) == trainCpGs) == FALSE){ - stop() - } else - - # predict - out <- mixOmics:::predict.mixo_spls(mod, subset) - - # get class probabilities - CP <- out$predict[,,1] - CP <- t(apply(as.matrix(CP), 1, function(data) exp(data)/sum(exp(data)))) - CP <- as.data.frame(CP) %>% tibble::rownames_to_column("Sample_ID") - CP$Pred_Class <- CP$comp1 - CP <- CP %>% - dplyr::mutate(Pred_Class = dplyr::case_when(EOPE > 0.55 ~ "EOPE", - EOPE < 0.55 ~ "Normotensive")) - - return(CP) -} +#' @title predictPreeclampsia +#' +#' @description Uses 45 CpGs to predict early preeclampsia (PE delivered before or at 34 weeks of gestation) +#' on placental DNA methylation microarray data. +#' +#' @details Assigns the class labels "early-PE" or "normotensive" to each sample +#' and returns a class probability. +#' +#' It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +#' prior to prediction. This was the normalization method used on the training data. +#' +#' @param betas matrix or array of methylation values on the beta scale (0, 1), +#' where the variables are arranged in rows, and samples in columns. 
+#' +#' @return produces a list with components detailed in the `mixOmics::predict` R documentation +#' +#' @examples +#' +#' To predict early preeclampsia on 450k/850k samples +#' +#' Load data +#' data(peBetas) +#' predictPreeclampsia(peBetas, dist = "max.dist") +#' +#' @import mixOmics +#' @import ExperimentHub +#' @export predictPreeclampsia +#' + +predictPreeclampsia <- function(betas, ...){ + + # read in data to generate model + eh = ExperimentHub() + mod = eh[['EH8090']] + trainCpGs = colnames(mod$X) + + # check that there are no NAs in the predictors (or if there are, how many) + peCpGs = mixOmics::selectVar(mod)$name + pp <- intersect(colnames(betas), peCpGs) + + if(length(pp) < length(peCpGs)){ + stop(paste( + "Only", length(pp), "out of 45 predictive CpGs present. All 45 predictive CpGs are needed to run the function." + )) + } else { + message(paste(length(pp), "of 45 predictive CpGs present.")) + message("BMIQ normalization is recommended for best results. If choosing other method, it is recommended to compare results to predictions on BMIQ normalized data.") + } + + # set up data for prediction + + # if input data is missing any of the cpgs present in the training data, this function + # adds the ones that are missing as NAs + # necessary for `mixOmics::predict` to work + + outersect = function(x, y) { + sort(c(x[!x%in%y], + y[!y%in%x])) + } + + if(inherits(betas, 'matrix')){ + } else if (inherits(betas, 'array')) { + } else { + + # throw an error + print(paste0("Input data must be a matrix or an array")) + } + + subset <- betas[,colnames(betas) %in% trainCpGs] + + # order + subset <- subset[drop=FALSE,, trainCpGs] + + if(all(colnames(subset) == trainCpGs) == FALSE){ + stop() + } else + + # predict + out <- mixOmics:::predict.mixo_spls(mod, subset) + + # get class probabilities + CP <- out$predict[,,1] + CP <- t(apply(as.matrix(CP), 1, function(data) exp(data)/sum(exp(data)))) + CP <- as.data.frame(CP) %>% tibble::rownames_to_column("Sample_ID") + CP$Pred_Class <- CP$comp1 + CP <- CP %>% + dplyr::mutate(Pred_Class = dplyr::case_when(EOPE > 0.55 ~ "EOPE", + EOPE < 0.55 ~ "Normotensive")) + + return(CP) +} diff --git a/man/predictEthnicity.Rd b/man/predictEthnicity.Rd index 7068ea0..61ecd28 100644 --- a/man/predictEthnicity.Rd +++ b/man/predictEthnicity.Rd @@ -23,8 +23,8 @@ Uses 1860 CpGs to predict self-reported ethnicity on placental microarray data. } \details{ -Predicts self-reported ethnicity from 3 classes: Africans, Asians, -and Caucasians, using placental DNA methylation data measured on the Infinium +Predicts self-reported ethnicity from 3 groups: African, Asian, +and European, using placental DNA methylation data measured on the Infinium 450k/EPIC methylation array. Will return membership probabilities that often reflect genetic ancestry composition. diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd new file mode 100644 index 0000000..7972990 --- /dev/null +++ b/man/predictPreeclampsia.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predictPreeclampsia.R +\name{predictPreeclampsia} +\alias{predictPreeclampsia} +\title{predictPreeclampsia} +\usage{ +predictPreeclampsia(betas, ...) 
+} +\arguments{ +\item{betas}{matrix or array of methylation values on the beta scale (0, 1), +where the variables are arranged in rows, and samples in columns.} +} +\value{ +produces a list with components detailed in the \code{mixOmics::predict} R documentation +} +\description{ +Uses 45 CpGs to predict early preeclampsia (PE delivered before or at 34 weeks of gestation) +on placental DNA methylation microarray data. +} +\details{ +Assigns the class labels "early-PE" or "normotensive" to each sample +and returns a class probability. + +It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +prior to prediction. This was the normalization method used on the training data. +} +\examples{ + +To predict early preeclampsia on 450k/850k samples + +Load data +data(peBetas) +predictPreeclampsia(peBetas, dist = "max.dist") + +} diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 98cc5f7..ca6a1bd 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -329,9 +329,21 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. +```{r include=FALSE} + +library(ExperimentHub) +eh = ExperimentHub() +query(eh, c('eoPred')) +val = eh[["EH8091"]] +# add line to load valMeta + +``` + ```{r} -preds <- predictPreeclampsia(t(val)) +dim(val) # 48 by 341,281 + +preds <- predictPreeclampsia(val) # have a quick look head(preds) From 83876d1caf8f50a8bf5d66e59d9df8a78a717d0e Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Sun, 13 Aug 2023 18:12:18 -0700 Subject: [PATCH 02/19] updating code to fix errors --- DESCRIPTION | 3 ++- R/predictPreeclampsia.R | 22 +++++++++++----------- man/predictPreeclampsia.Rd | 13 +++++++------ vignettes/planet.Rmd | 4 ++-- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 428f8f3..6b40ad6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,8 @@ Imports: tibble, magrittr, dplyr, - ExperimentHub + ExperimentHub, + mixOmics Suggests: badger, ggplot2, diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index de13ce8..82faea7 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -6,7 +6,7 @@ #' @details Assigns the class labels "early-PE" or "normotensive" to each sample #' and returns a class probability. #' -#' It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +#' # It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data #' prior to prediction. This was the normalization method used on the training data. 
#' #' @param betas matrix or array of methylation values on the beta scale (0, 1), @@ -16,11 +16,11 @@ #' #' @examples #' -#' To predict early preeclampsia on 450k/850k samples +#' # To predict early preeclampsia on 450k/850k samples #' -#' Load data -#' data(peBetas) -#' predictPreeclampsia(peBetas, dist = "max.dist") +#' # Load data +#' # data(peBetas) +#' # predictPreeclampsia(peBetas, dist = "max.dist") #' #' @import mixOmics #' @import ExperimentHub @@ -66,25 +66,25 @@ predictPreeclampsia <- function(betas, ...){ print(paste0("Input data must be a matrix or an array")) } - subset <- betas[,colnames(betas) %in% trainCpGs] + betasSubset <- betas[,colnames(betas) %in% trainCpGs] # order - subset <- subset[drop=FALSE,, trainCpGs] + betasSubset <- betasSubset[drop=FALSE,, trainCpGs] - if(all(colnames(subset) == trainCpGs) == FALSE){ + if(all(colnames(betasSubset) == trainCpGs) == FALSE){ stop() } else # predict - out <- mixOmics:::predict.mixo_spls(mod, subset) + out <- mixOmics:::predict.mixo_spls(mod, betasSubset) # get class probabilities CP <- out$predict[,,1] CP <- t(apply(as.matrix(CP), 1, function(data) exp(data)/sum(exp(data)))) CP <- as.data.frame(CP) %>% tibble::rownames_to_column("Sample_ID") - CP$Pred_Class <- CP$comp1 + CP$PE_Status <- CP$comp1 CP <- CP %>% - dplyr::mutate(Pred_Class = dplyr::case_when(EOPE > 0.55 ~ "EOPE", + dplyr::mutate(PE_Status = dplyr::case_when(EOPE > 0.55 ~ "EOPE", EOPE < 0.55 ~ "Normotensive")) return(CP) diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index 7972990..41ee2ee 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -20,16 +20,17 @@ on placental DNA methylation microarray data. \details{ Assigns the class labels "early-PE" or "normotensive" to each sample and returns a class probability. - -It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data +} +\section{It is recommended that users apply beta-mixture quantile normalization (BMIQ) to their data}{ prior to prediction. This was the normalization method used on the training data. } + \examples{ -To predict early preeclampsia on 450k/850k samples +# To predict early preeclampsia on 450k/850k samples -Load data -data(peBetas) -predictPreeclampsia(peBetas, dist = "max.dist") +# Load data +# data(peBetas) +# predictPreeclampsia(peBetas, dist = "max.dist") } diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index ca6a1bd..4b81626 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -329,7 +329,7 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. 
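For instance (a sketch, assuming `preds` holds the data frame returned by `predictPreeclampsia()`, which includes an `EOPE` probability column; the new column name is only illustrative), a stricter 70% cutoff could be applied like so:

```{r, eval = FALSE}
library(dplyr)

preds %>%
  dplyr::mutate(
    Pred_Class_70 = dplyr::case_when(
      EOPE > 0.7 ~ "EOPE",
      EOPE <= 0.7 ~ "Normotensive"
    )
  )
```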
-```{r include=FALSE} +```{r include=FALSE, eval = FALSE} library(ExperimentHub) eh = ExperimentHub() @@ -339,7 +339,7 @@ val = eh[["EH8091"]] ``` -```{r} +```{r include=FALSE, eval = FALSE} dim(val) # 48 by 341,281 From fe5ed86dfbf2dd71ca7b3df172aa2b0e579da036 Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Wed, 12 Jun 2024 16:23:47 +0200 Subject: [PATCH 03/19] in response to pull request --- DESCRIPTION | 6 ++++-- R/predictPreeclampsia.R | 4 +++- man/planet-package.Rd | 5 +++++ man/predictPreeclampsia.Rd | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6b40ad6..06d6e06 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,8 +4,10 @@ Version: 1.7.0 Authors@R: c(person("Victor", "Yuan", email = "victor.2wy@gmail.com", role = c("aut", "cre")), - person(c("Wendy", "P."), "Robinson", - role = "ctb")) + person(c("Wendy", "P."), "Robinson", + role = "ctb"), + person("Icíar", "Fernández-Boyano", email = "iciarfernandez@outlook.com", + role = c("aut", "ctb"))) URL: https://victor.rbind.io/planet, http://github.com/wvictor14/planet BugReports: diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 82faea7..13de998 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -11,6 +11,8 @@ #' #' @param betas matrix or array of methylation values on the beta scale (0, 1), #' where the variables are arranged in rows, and samples in columns. +#' +#' @param ... feeds into outersect function #' #' @return produces a list with components detailed in the `mixOmics::predict` R documentation #' @@ -76,7 +78,7 @@ predictPreeclampsia <- function(betas, ...){ } else # predict - out <- mixOmics:::predict.mixo_spls(mod, betasSubset) + out <- mixOmics::predict.mixo_spls(mod, betasSubset) # get class probabilities CP <- out$predict[,,1] diff --git a/man/planet-package.Rd b/man/planet-package.Rd index 2f60eed..e80dbb0 100644 --- a/man/planet-package.Rd +++ b/man/planet-package.Rd @@ -22,6 +22,11 @@ Useful links: \author{ \strong{Maintainer}: Victor Yuan \email{victor.2wy@gmail.com} +Authors: +\itemize{ + \item Icíar Fernández-Boyano \email{iciarfernandez@outlook.com} [contributor] +} + Other contributors: \itemize{ \item Wendy P. Robinson [contributor] diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index 41ee2ee..cf8e51a 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -9,6 +9,8 @@ predictPreeclampsia(betas, ...) 
\arguments{ \item{betas}{matrix or array of methylation values on the beta scale (0, 1), where the variables are arranged in rows, and samples in columns.} + +\item{...}{feeds into outersect function} } \value{ produces a list with components detailed in the \code{mixOmics::predict} R documentation From 77a385da7b4bedced737d6e6f152b245be780e18 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 14 Jun 2024 16:16:07 -0700 Subject: [PATCH 04/19] Add Wendy as author / contributor --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 06d6e06..6f832b3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,7 @@ Authors@R: c(person("Victor", "Yuan", email = "victor.2wy@gmail.com", role = c("aut", "cre")), person(c("Wendy", "P."), "Robinson", - role = "ctb"), + role = c("aut", "ctb")), person("Icíar", "Fernández-Boyano", email = "iciarfernandez@outlook.com", role = c("aut", "ctb"))) URL: From bd4b806bf348a7ed629c2acb9e9397be89de4ded Mon Sep 17 00:00:00 2001 From: iciarfernandez Date: Mon, 17 Jun 2024 14:23:00 +0200 Subject: [PATCH 05/19] fixed ::, DS Store file and dependencies --- .gitignore | 1 + NAMESPACE | 2 -- R/predictPreeclampsia.R | 6 ++---- man/planet-package.Rd | 6 +----- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 760c4d3..373bcc4 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ Meta pkgdown/ *.rds .vscode/settings.json +.DS_Store diff --git a/NAMESPACE b/NAMESPACE index 7437be4..b601849 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,5 @@ export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) -import(ExperimentHub) -import(mixOmics) importFrom(magrittr,"%>%") importFrom(tibble,tibble) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 13de998..e8d948e 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -24,15 +24,13 @@ #' # data(peBetas) #' # predictPreeclampsia(peBetas, dist = "max.dist") #' -#' @import mixOmics -#' @import ExperimentHub #' @export predictPreeclampsia #' predictPreeclampsia <- function(betas, ...){ # read in data to generate model - eh = ExperimentHub() + eh = ExperimentHub::ExperimentHub() mod = eh[['EH8090']] trainCpGs = colnames(mod$X) @@ -78,7 +76,7 @@ predictPreeclampsia <- function(betas, ...){ } else # predict - out <- mixOmics::predict.mixo_spls(mod, betasSubset) + out <- mixOmics:::predict.mixo_spls(mod, betasSubset) # get class probabilities CP <- out$predict[,,1] diff --git a/man/planet-package.Rd b/man/planet-package.Rd index e80dbb0..4d8fb55 100644 --- a/man/planet-package.Rd +++ b/man/planet-package.Rd @@ -23,13 +23,9 @@ Useful links: \strong{Maintainer}: Victor Yuan \email{victor.2wy@gmail.com} Authors: -\itemize{ - \item Icíar Fernández-Boyano \email{iciarfernandez@outlook.com} [contributor] -} - -Other contributors: \itemize{ \item Wendy P. 
Robinson [contributor] + \item Icíar Fernández-Boyano \email{iciarfernandez@outlook.com} [contributor] } } From a5afdc9c16941ea38d2f3ae9743ca07d85b4df65 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 24 Jan 2025 15:47:03 -0800 Subject: [PATCH 06/19] move heavy bioconductor packages to suggests --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 87414d7..ad739ce 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,9 +23,9 @@ Imports: tibble, magrittr, dplyr, - ExperimentHub, - mixOmics Suggests: + ExperimentHub, + mixOmics, ggplot2, testthat, tidyr, From 96150ba6bfa3bf7455b703a0206bde94d8cb1682 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Wed, 29 Jan 2025 23:24:31 -0800 Subject: [PATCH 07/19] betas input is not transposed --- R/predictPreeclampsia.R | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index e8d948e..41db153 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -21,22 +21,23 @@ #' # To predict early preeclampsia on 450k/850k samples #' #' # Load data -#' # data(peBetas) -#' # predictPreeclampsia(peBetas, dist = "max.dist") +#' library(ExperimentHub) #' +#' # test object +#' x_test <- eh[['EH8403']] +#' x_test %>% predictPreeclampsia() #' @export predictPreeclampsia #' - predictPreeclampsia <- function(betas, ...){ # read in data to generate model - eh = ExperimentHub::ExperimentHub() - mod = eh[['EH8090']] - trainCpGs = colnames(mod$X) + eh <- ExperimentHub::ExperimentHub() + mod <- eh[['EH8090']] + trainCpGs <- colnames(mod$X) # check that there are no NAs in the predictors (or if there are, how many) - peCpGs = mixOmics::selectVar(mod)$name - pp <- intersect(colnames(betas), peCpGs) + peCpGs <- mixOmics::selectVar(mod)$name + pp <- intersect(rownames(betas), peCpGs) if(length(pp) < length(peCpGs)){ stop(paste( @@ -53,30 +54,29 @@ predictPreeclampsia <- function(betas, ...){ # adds the ones that are missing as NAs # necessary for `mixOmics::predict` to work - outersect = function(x, y) { + outersect <- function(x, y) { sort(c(x[!x%in%y], y[!y%in%x])) } if(inherits(betas, 'matrix')){ + } else if (inherits(betas, 'array')) { - } else { + } else { # throw an error print(paste0("Input data must be a matrix or an array")) } - betasSubset <- betas[,colnames(betas) %in% trainCpGs] + betasSubset <- betas[rownames(betas) %in% trainCpGs,] # order - betasSubset <- betasSubset[drop=FALSE,, trainCpGs] + betasSubset <- betasSubset[drop=FALSE,trainCpGs, ] - if(all(colnames(betasSubset) == trainCpGs) == FALSE){ - stop() - } else - - # predict - out <- mixOmics:::predict.mixo_spls(mod, betasSubset) + stopifnot(all(rownames(betasSubset) == trainCpGs)) + + # predict + out <- mixOmics:::predict.mixo_spls(mod, t(betasSubset)) # get class probabilities CP <- out$predict[,,1] @@ -85,7 +85,6 @@ predictPreeclampsia <- function(betas, ...){ CP$PE_Status <- CP$comp1 CP <- CP %>% dplyr::mutate(PE_Status = dplyr::case_when(EOPE > 0.55 ~ "EOPE", - EOPE < 0.55 ~ "Normotensive")) - - return(CP) + EOPE < 0.55 ~ "Normotensive")) + return(tibble::as_tibble(CP)) } From 64709dc520e4afd1992eb2de6666688978b91efa Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Wed, 29 Jan 2025 23:24:37 -0800 Subject: [PATCH 08/19] docs add eopreddata to imports --- DESCRIPTION | 3 ++- man/predictEthnicity.Rd | 4 ++-- man/predictPreeclampsia.Rd | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION 
b/DESCRIPTION index ad739ce..9f2b89e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,7 @@ Imports: tibble, magrittr, dplyr, + eoPredData Suggests: ExperimentHub, mixOmics, @@ -37,7 +38,7 @@ Suggests: License: GPL-2 Encoding: UTF-8 LazyData: false -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 VignetteBuilder: knitr biocViews: Software, DifferentialMethylation, Epigenetics, Microarray, MethylationArray, DNAMethylation, CpGIsland diff --git a/man/predictEthnicity.Rd b/man/predictEthnicity.Rd index 6b088b7..289882d 100644 --- a/man/predictEthnicity.Rd +++ b/man/predictEthnicity.Rd @@ -25,8 +25,8 @@ Uses 1860 CpGs to predict self-reported ethnicity on placental microarray data. } \details{ -Predicts self-reported ethnicity from 3 groups: African, Asian, -and European, using placental DNA methylation data measured on the Infinium +Predicts self-reported ethnicity from 3 classes: Africans, Asians, +and Caucasians, using placental DNA methylation data measured on the Infinium 450k/EPIC methylation array. Will return membership probabilities that often reflect genetic ancestry composition. diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index cf8e51a..b67de41 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -32,7 +32,9 @@ prior to prediction. This was the normalization method used on the training data # To predict early preeclampsia on 450k/850k samples # Load data -# data(peBetas) -# predictPreeclampsia(peBetas, dist = "max.dist") +library(ExperimentHub) +# test object +x_test <- eh[['EH8403']] +x_test \%>\% predictPreeclampsia() } From 363aeba5f727a3dbc57a78ccba00588d0fc2dca1 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 22:40:47 -0800 Subject: [PATCH 09/19] fix eopred vignette section --- vignettes/planet.Rmd | 258 ++++++++++++++++++++++--------------------- 1 file changed, 135 insertions(+), 123 deletions(-) diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 15bcc7d..7244acc 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -8,18 +8,18 @@ output: pkgdown: as_is: true vignette: > - %\VignetteIndexEntry{planet} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} -editor_options: - chunk_output_type: console + %\VignetteIndexEntry{planet} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: source +chunk_output_type: console --- ## Installation ```{r, message = FALSE, warning = FALSE, eval = FALSE} if(!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") + install.packages("BiocManager") BiocManager::install("planet") ``` @@ -60,9 +60,9 @@ how to do both. 
```{r} houseman_estimates <- minfi:::projectCellType( - plBetas[rownames(plCellCpGsThird), ], - plCellCpGsThird, - lessThanOne = FALSE + plBetas[rownames(plCellCpGsThird), ], + plCellCpGsThird, + lessThanOne = FALSE ) head(houseman_estimates) @@ -73,23 +73,23 @@ head(houseman_estimates) ```{r} # robust partial correlations epidish_RPC <- epidish( - beta.m = plBetas[rownames(plCellCpGsThird), ], - ref.m = plCellCpGsThird, - method = "RPC" + beta.m = plBetas[rownames(plCellCpGsThird), ], + ref.m = plCellCpGsThird, + method = "RPC" ) # CIBERSORT epidish_CBS <- epidish( - beta.m = plBetas[rownames(plCellCpGsThird), ], - ref.m = plCellCpGsThird, - method = "CBS" + beta.m = plBetas[rownames(plCellCpGsThird), ], + ref.m = plCellCpGsThird, + method = "CBS" ) # constrained projection (houseman 2012) epidish_CP <- epidish( - beta.m = plBetas[rownames(plCellCpGsThird), ], - ref.m = plCellCpGsThird, - method = "CP" + beta.m = plBetas[rownames(plCellCpGsThird), ], + ref.m = plCellCpGsThird, + method = "CP" ) ``` @@ -103,40 +103,40 @@ data("plColors") # bind estimate data frames and reshape for plotting bind_rows( - houseman_estimates %>% as.data.frame() %>% mutate(algorithm = "CP (Houseman)"), - epidish_RPC$estF %>% as.data.frame() %>% mutate(algorithm = "RPC"), - epidish_CBS$estF %>% as.data.frame() %>% mutate(algorithm = "CBS"), - epidish_CP$estF %>% as.data.frame() %>% mutate(algorithm = "CP (EpiDISH)") + houseman_estimates %>% as.data.frame() %>% mutate(algorithm = "CP (Houseman)"), + epidish_RPC$estF %>% as.data.frame() %>% mutate(algorithm = "RPC"), + epidish_CBS$estF %>% as.data.frame() %>% mutate(algorithm = "CBS"), + epidish_CP$estF %>% as.data.frame() %>% mutate(algorithm = "CP (EpiDISH)") ) %>% - mutate(sample = rep(rownames(houseman_estimates), 4)) %>% - as_tibble() %>% - pivot_longer( - cols = -c(algorithm, sample), - names_to = "component", - values_to = "estimate" - ) %>% - - # relevel for plot - mutate(component = factor(component, - levels = c( - "nRBC", "Endothelial", "Hofbauer", - "Stromal", "Trophoblasts", - "Syncytiotrophoblast" - ) - )) %>% - - # plot - ggplot(aes(x = sample, y = estimate, fill = component)) + - geom_bar(stat = "identity") + - facet_wrap(~algorithm, ncol = 1) + - scale_fill_manual(values = plColors) + - scale_y_continuous( - limits = c(-0.1, 1.1), breaks = c(0, 0.5, 1), - labels = scales::percent - ) + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + - coord_cartesian(ylim = c(0, 1)) + - labs(x = "", fill = "") + mutate(sample = rep(rownames(houseman_estimates), 4)) %>% + as_tibble() %>% + pivot_longer( + cols = -c(algorithm, sample), + names_to = "component", + values_to = "estimate" + ) %>% + + # relevel for plot + mutate(component = factor(component, + levels = c( + "nRBC", "Endothelial", "Hofbauer", + "Stromal", "Trophoblasts", + "Syncytiotrophoblast" + ) + )) %>% + + # plot + ggplot(aes(x = sample, y = estimate, fill = component)) + + geom_bar(stat = "identity") + + facet_wrap(~algorithm, ncol = 1) + + scale_fill_manual(values = plColors) + + scale_y_continuous( + limits = c(-0.1, 1.1), breaks = c(0, 0.5, 1), + labels = scales::percent + ) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + + coord_cartesian(ylim = c(0, 1)) + + labs(x = "", fill = "") ``` Some notes: @@ -197,11 +197,11 @@ sample information data.frame, `plPhenoData`. 
```{r} plPhenoData <- plPhenoData %>% - mutate( - ga_RPC = predictAge(plBetas, type = "RPC"), - ga_CPC = predictAge(plBetas, type = "CPC"), - ga_RRPC = predictAge(plBetas, type = "RRPC") - ) + mutate( + ga_RPC = predictAge(plBetas, type = "RPC"), + ga_CPC = predictAge(plBetas, type = "CPC"), + ga_RRPC = predictAge(plBetas, type = "RRPC") + ) ``` Note that the number of predictors (CpGs) that were used in our data are @@ -215,21 +215,21 @@ each of the 3 gestational age predictors. ```{r, fig.width = 7, fig.height = 5} plPhenoData %>% - # reshape, to plot - pivot_longer( - cols = contains("ga"), - names_to = "clock_type", - names_prefix = "ga_", - values_to = "ga" - ) %>% - - # plot code - ggplot(aes(x = gestation_wk, y = ga, col = disease)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~clock_type) + - theme(legend.position = "top") + - labs(x = "Reported GA (weeks)", y = "Inferred GA (weeks)", col = "") + # reshape, to plot + pivot_longer( + cols = contains("ga"), + names_to = "clock_type", + names_prefix = "ga_", + values_to = "ga" + ) %>% + + # plot code + ggplot(aes(x = gestation_wk, y = ga, col = disease)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + facet_wrap(~clock_type) + + theme(legend.position = "top") + + labs(x = "Reported GA (weeks)", y = "Inferred GA (weeks)", col = "") ``` ## Ethnicity @@ -243,7 +243,7 @@ all(ethnicityCpGs %in% rownames(plBetas)) results <- predictEthnicity(plBetas) results %>% - tail(8) + tail(8) ``` `predictEthnicity` returns probabilities corresponding to each ethnicity for @@ -261,26 +261,26 @@ require special attention in downstream analyses. ```{r, fig.width = 7} results %>% - ggplot(aes( - x = Prob_Caucasian, y = Prob_African, - col = Predicted_ethnicity - )) + - geom_point(alpha = 0.7) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - scale_x_continuous(labels = scales::percent) + - scale_y_continuous(labels = scales::percent) + - labs(x = "P(Caucasian)", y = "P(African)") + ggplot(aes( + x = Prob_Caucasian, y = Prob_African, + col = Predicted_ethnicity + )) + + geom_point(alpha = 0.7) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + scale_x_continuous(labels = scales::percent) + + scale_y_continuous(labels = scales::percent) + + labs(x = "P(Caucasian)", y = "P(African)") results %>% - ggplot(aes( - x = Prob_Caucasian, y = Prob_Asian, - col = Predicted_ethnicity - )) + - geom_point(alpha = 0.7) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - scale_x_continuous(labels = scales::percent) + - scale_y_continuous(labels = scales::percent) + - labs(x = "P(Caucasian)", y = "P(Asian)") + ggplot(aes( + x = Prob_Caucasian, y = Prob_Asian, + col = Predicted_ethnicity + )) + + geom_point(alpha = 0.7) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + scale_x_continuous(labels = scales::percent) + + scale_y_continuous(labels = scales::percent) + + labs(x = "P(Caucasian)", y = "P(Asian)") ``` We can't compare this to self-reported ethnicity as it is unavailable. But we @@ -321,51 +321,63 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. 
-```{r include=FALSE, eval = FALSE} +```{r include=TRUE, eval = TRUE} library(ExperimentHub) -eh = ExperimentHub() -query(eh, c('eoPred')) -val = eh[["EH8091"]] -# add line to load valMeta +eh <- ExperimentHub() +query(eh, "eoPredData") +# download BMIQ normalized 450k data +x_test <- eh[['EH8403']] +valMeta <- eh[['EH8404']] +preds <- x_test %>% predictPreeclampsia() ``` -```{r include=FALSE, eval = FALSE} - -dim(val) # 48 by 341,281 - -preds <- predictPreeclampsia(val) +Inspect the results: -# have a quick look +```{r include=FALSE, eval = TRUE} head(preds) # join with metadata valMeta <- left_join(valMeta, preds, by="Sample_ID") # visualize results -valMeta %>% - ggplot() + - geom_boxplot(aes(x = Class, - y = EOPE), - width=0.3, - alpha=0.5, - color="darkgrey") + - geom_jitter(aes(x = Class, - y = EOPE, - color = Pred_Class), - alpha = 0.5, - position = position_jitter(width = .1)) + - coord_flip() + - ylab("Class Probability of EOPE") + - xlab("True Class") + - ylim(0,1) + - scale_color_manual(name = "Predicted Class", values=c("#E69F00", "#999999")) + - theme_minimal() - -# if user wishes to use different threshold from 55% probability, as an example 70% -EOpreds %>% mutate(Pred_Class = case_when(EOPE > 0.7 ~ "EOPE", - EOPE < 0.7 ~ "Non-PE Preterm")) +plot_predictions <- function(df, predicted_class_column) { + df %>% + ggplot() + + geom_boxplot( + aes(x = Class, y = EOPE), + width = 0.3, + alpha = 0.5, + color = "darkgrey" + ) + + geom_jitter( + aes(x = Class, y = EOPE, color = {{predicted_class_column}}), + alpha = 0.75, + position = position_jitter(width = .1) + ) + + coord_flip() + + ylab("Class Probability of EOPE") + + xlab("True Class") + + ylim(0,1) + + scale_color_manual( + name = "Predicted Class", + values = c("#E69F00", "skyblue", "#999999") + ) + + theme_minimal() +} +valMeta %>% plot_predictions(PE_Status) +``` + +if user wishes to use different threshold from 55% probability, as an example 70% + +```{r} +valMeta %>% mutate( + Pred_Class = case_when( + EOPE > 0.7 ~ "EOPE", + `Non-PE Preterm` > 0.7 ~ "Non-PE Preterm", + .default = 'low-confidence' + )) %>% plot_predictions(Pred_Class) ``` From 4d1ee7ebb05f6d85dbfdd4fd6f70fafc596d840a Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:00:57 -0800 Subject: [PATCH 10/19] adjust figure dimensions --- vignettes/planet.Rmd | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 7244acc..85fa734 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -5,6 +5,8 @@ output: df_print: "kable" fig_caption: FALSE toc: TRUE + fig.width: 7 + fig.height: 7 pkgdown: as_is: true vignette: > @@ -34,7 +36,7 @@ In this example we are using term villi DNAm data, so we first load the reference cpgs `plCellCpGsThird`. This is a data frame of 600 cpgs, with mean methylation levels for each cell type. -```{r, message = FALSE, warning = FALSE} +```{r load_libraries, message = FALSE, warning = FALSE} # cell deconvolution packages library(minfi) library(EpiDISH) @@ -58,7 +60,7 @@ how to do both. 
#### Minfi -```{r} +```{r houseman} houseman_estimates <- minfi:::projectCellType( plBetas[rownames(plCellCpGsThird), ], plCellCpGsThird, @@ -70,7 +72,7 @@ head(houseman_estimates) #### EpiDISH -```{r} +```{r epidish echo=FALSE} # robust partial correlations epidish_RPC <- epidish( beta.m = plBetas[rownames(plCellCpGsThird), ], @@ -98,7 +100,7 @@ epidish_CP <- epidish( Below, I demonstrate how we can visually compare the different cell composition estimates. -```{r, fig.width = 7, fig.height = 7} +```{r compare_deconvolution, fig.width = 7, fig.height = 7} data("plColors") # bind estimate data frames and reshape for plotting @@ -158,7 +160,7 @@ information from `data(plPhenoData)`. Note that for demonstration purposes, the cpgs have been filtered to a random \~10,000 CpGs, plus the CpGs used in all of the functions from this package. -```{r} +```{r gestational_age} # load example data data(plBetas) data(plPhenoData) @@ -195,7 +197,7 @@ To select the type of clock, we can specify the `type` argument in `predictAge`. We will apply all three clocks on this data, and add the predicted age to the sample information data.frame, `plPhenoData`. -```{r} +```{r predict_ga} plPhenoData <- plPhenoData %>% mutate( ga_RPC = predictAge(plBetas, type = "RPC"), @@ -212,7 +214,7 @@ accuracy. Next, I plot the difference between predicted and reported gestational age, for each of the 3 gestational age predictors. -```{r, fig.width = 7, fig.height = 5} +```{r view_ga, fig.width = 7, fig.height = 5} plPhenoData %>% # reshape, to plot @@ -237,7 +239,7 @@ plPhenoData %>% Before predicting ethnicity You can ensure that you have all features using the `ethnicityCpGs` vector: -```{r} +```{r ethnicity} data(ethnicityCpGs) all(ethnicityCpGs %in% rownames(plBetas)) From 7261f52f4d2c92e3bdc698e26d3fa071bec511e0 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:13:44 -0800 Subject: [PATCH 11/19] copy predict functions over from mixomics --- NAMESPACE | 5 + R/predictPreeclampsia.R | 8 +- R/predict_mixOmics.R | 773 +++++++++++++++++++++++++++++++++++++ man/predict.Rd | 203 ++++++++++ man/predictPreeclampsia.Rd | 4 + 5 files changed, 991 insertions(+), 2 deletions(-) create mode 100644 R/predict_mixOmics.R create mode 100644 man/predict.Rd diff --git a/NAMESPACE b/NAMESPACE index b601849..9702d9d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method(predict,mixo_pls) +S3method(predict,mixo_spls) export("%>%") export(pl_infer_age) export(pl_infer_ethnicity) @@ -7,4 +9,7 @@ export(predictAge) export(predictEthnicity) export(predictPreeclampsia) importFrom(magrittr,"%>%") +importFrom(methods,hasArg) +importFrom(methods,is) +importFrom(stats,setNames) importFrom(tibble,tibble) diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index 41db153..c07c037 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -16,16 +16,19 @@ #' #' @return produces a list with components detailed in the `mixOmics::predict` R documentation #' -#' @examples +#' @examples \dontrun{ #' #' # To predict early preeclampsia on 450k/850k samples #' #' # Load data #' library(ExperimentHub) +#' eh <- ExperimentHub() +#' query(eh, "eoPredData") #' #' # test object #' x_test <- eh[['EH8403']] #' x_test %>% predictPreeclampsia() +#' } #' @export predictPreeclampsia #' predictPreeclampsia <- function(betas, ...){ @@ -76,7 +79,7 @@ predictPreeclampsia <- function(betas, ...){ stopifnot(all(rownames(betasSubset) == trainCpGs)) # predict - out <- 
mixOmics:::predict.mixo_spls(mod, t(betasSubset)) + out <- predict.mixo_spls(mod, t(betasSubset)) # get class probabilities CP <- out$predict[,,1] @@ -88,3 +91,4 @@ predictPreeclampsia <- function(betas, ...){ EOPE < 0.55 ~ "Normotensive")) return(tibble::as_tibble(CP)) } + diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R new file mode 100644 index 0000000..9efcb9c --- /dev/null +++ b/R/predict_mixOmics.R @@ -0,0 +1,773 @@ +# copied from https://raw.githubusercontent.com/mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R +# on 2025 Jan 30. Some components omitted that are not used in planet. + +# ======================================================================================================== +# This function makes a prediction of a 'newdata' by using the results of 'object'. +# Depending on the class of the object (mint).(block).(s)pls(da) (16 classes so far), the input data is different +# and the preparation of the data is different - different scaling for instance if object="mint...." +# However, the prediction formula is the same for all classes, thus only one code +# ======================================================================================================== + +#' Predict Method for (mint).(block).(s)pls(da) methods +#' +#' Predicted values based on PLS models. New responses and variates are +#' predicted using a fitted model and a new matrix of observations. +#' +#' \code{predict} produces predicted values, obtained by evaluating the +#' PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the +#' frame \code{newdata}. Variates for \code{newdata} are also returned. Please +#' note that this method performs multilevel decomposition and/or log ratio +#' transformations if needed (\code{multilevel} is an input parameter while +#' \code{logratio} is extracted from \code{object}). +#' +#' Different prediction distances are proposed for discriminant analysis. The +#' reason is that our supervised models work with a dummy indicator matrix of +#' \code{Y} to indicate the class membership of each sample. The prediction of +#' a new observation results in either a predicted dummy variable (output +#' \code{object$predict}), or a predicted variate (output +#' \code{object$variates}). Therefore, an appropriate distance needs to be +#' applied to those predicted values to assign the predicted class. We propose +#' distances such as `maximum distance' for the predicted dummy variables, +#' `Mahalanobis distance' and `Centroids distance' for the predicted variates. +#' +#' \code{"max.dist"} is the simplest method to predict the class of a test +#' sample. For each new individual, the class with the largest predicted dummy +#' variable is the predicted class. This distance performs well in single data +#' set analysis with multiclass problems (PLS-DA). +#' +#' \code{"centroids.dist"} allocates to the new observation the class that +#' mimimises the distance between the predicted score and the centroids of the +#' classes calculated on the latent components or variates of the trained +#' model. +#' +#' \code{"mahalanobis.dist"} allocates the new sample the class defined as the +#' centroid distance, but using the Mahalanobis metric in the calculation of +#' the distance. 
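+#'
+#' For example (a hypothetical call, assuming a fitted \code{(s)plsda} model
+#' \code{mod} and a test matrix \code{x_new} whose columns match the training
+#' data), \code{predict(mod, x_new, dist = "max.dist")$class$max.dist} returns
+#' the class assigned to each test sample for each number of components.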
+#' +#' In practice we found that the centroid-based distances +#' (\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically +#' the Mahalanobis distance led to more accurate predictions than the maximum +#' distance for complex classification problems and N-integration problems +#' (block.splsda). The centroid distances consider the prediction in +#' dimensional space spanned by the predicted variates, while the maximum +#' distance considers a single point estimate using the predicted scores on the +#' last dimension of the model. The user can assess the different distances, +#' and choose the prediction distance that leads to the best performance of the +#' model, as highlighted from the tune and perf outputs +#' +#' More (mathematical) details about the prediction distances are available in +#' the supplemental of the mixOmics article (Rohart et al 2017). +#' +#' For a visualisation of those prediction distances, see +#' \code{background.predict} that overlays the prediction area in +#' \code{plotIndiv} for a sPLS-DA object. +#' +#' Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing +#' \eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are +#' the centroids of the classes calculated on the \eqn{X}-variates of the +#' model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the +#' class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis +#' metric in the calculation of the distance. +#' +#' For MINT objects, the \code{study.test} argument is required and provides +#' the grouping factor of \code{newdata}. +#' +#' For multi block analysis (thus block objects), \code{newdata} is a list of +#' matrices whose names are a subset of \code{names(object$X)} and missing +#' blocks are allowed. Several predictions are returned, either for each block +#' or for all blocks. For non discriminant analysis, the predicted values +#' (\code{predict}) are returned for each block and these values are combined +#' by average (\code{AveragedPredict}) or weighted average +#' (\code{WeightedPredict}), using the weights of the blocks that are +#' calculated as the correlation between a block's components and the outcome's +#' components. +#' +#' For discriminant analysis, the predicted class is returned for each block +#' (\code{class}) and each distance (\code{dist}) and these predictions are +#' combined by majority vote (\code{MajorityVote}) or weighted majority vote +#' (\code{WeightedVote}), using the weights of the blocks that are calculated +#' as the correlation between a block's components and the outcome's +#' components. NA means that there is no consensus among the block. For PLS-DA +#' and sPLS-DA objects, the prediction area can be visualised in plotIndiv via +#' the \code{background.predict} function. +#' +#' @aliases predict predict.pls predict.spls predict.plsda predict.splsda +#' predict.mint.pls predict.mint.spls predict.mint.plsda predict.mint.splsda +#' predict.mint.block.pls predict.mint.block.spls predict.mint.block.plsda +#' predict.mint.block.splsda +#' @param object object of class inheriting from +#' \code{"(mint).(block).(s)pls(da)"}. +#' @param newdata data matrix in which to look for for explanatory variables to +#' be used for prediction. Please note that this method does not perform +#' multilevel decomposition or log ratio transformations, which need to be +#' processed beforehand. 
+#' @param study.test For MINT objects, grouping factor indicating which samples +#' of \code{newdata} are from the same study. Overlap with \code{object$study} +#' are allowed. +#' @param dist distance to be applied for discriminant methods to predict the +#' class of new data, should be a subset of \code{"centroids.dist"}, +#' \code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). Defaults to +#' \code{"all"}. +#' @param multilevel Design matrix for multilevel analysis (for repeated +#' measurements). A numeric matrix or data frame. For a one level factor +#' decomposition, the input is a vector indicating the repeated measures on +#' each individual, i.e. the individuals ID. For a two level decomposition with +#' splsda models, the two factors are included in Y. Finally for a two level +#' decomposition with spls models, 2nd AND 3rd columns in design indicate those +#' factors (see example in \code{?splsda} and \code{?spls}). +#' @param ... not used currently. +#' @return \code{predict} produces a list with the following components: +#' \item{predict}{predicted response values. The dimensions correspond to the +#' observations, the response variables and the model dimension, respectively. +#' For a supervised model, it corresponds to the predicted dummy variables.} +#' \item{variates}{matrix of predicted variates.} \item{B.hat}{matrix of +#' regression coefficients (without the intercept).} +#' +#' \item{AveragedPredict}{if more than one block, returns the average predicted +#' values over the blocks (using the \code{predict} output)} +#' \item{WeightedPredict}{if more than one block, returns the weighted average +#' of the predicted values over the blocks (using the \code{predict} and +#' \code{weights} outputs)} +#' +#' \item{class}{predicted class of \code{newdata} for each +#' \eqn{1,...,}\code{ncomp} components.} +#' +#' \item{MajorityVote}{if more than one block, returns the majority class over +#' the blocks. NA for a sample means that there is no consensus on the +#' predicted class for this particular sample over the blocks.} +#' \item{WeightedVote}{if more than one block, returns the weighted majority +#' class over the blocks. NA for a sample means that there is no consensus on +#' the predicted class for this particular sample over the blocks.} +#' \item{weights}{Returns the weights of each block used for the weighted +#' predictions, for each nrepeat and each fold} +#' +#' \item{centroids}{matrix of coordinates for centroids.} \item{dist}{type of +#' distance requested.} \item{vote}{majority vote result for multi block +#' analysis (see details above).} +#' @author Florian Rohart, Sébastien Déjean, Ignacio González, Kim-Anh Lê Cao, +#' Al J Abadi +#' @seealso \code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, +#' \code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, +#' \code{\link{mint.plsda}}, \code{\link{mint.splsda}}, +#' \code{\link{block.pls}}, \code{\link{block.spls}}, +#' \code{\link{block.plsda}}, \code{\link{block.splsda}}, +#' \code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, +#' \code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and +#' visualisation with \code{\link{background.predict}} and +#' http://www.mixOmics.org for more details. +#' @references +#' +#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +#' feature selection and multiple data integration. PLoS Comput Biol 13(11): +#' e1005752 +#' +#' Tenenhaus, M. (1998). \emph{La regression PLS: theorie et pratique}. 
Paris: +#' Editions Technic. +#' @keywords regression multivariate +#' @name predict +#' @rdname predict +#' @method predict mixo_pls +#' @importFrom methods hasArg is +#' @importFrom stats setNames +#' @export +predict.mixo_pls <- + function(object, + newdata, + study.test, + dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), + multilevel = NULL, + ... + ) + { + + time=FALSE + + # pass R check + newdata.scale = misdata.all = is.na.X = is.na.newdata = noAveragePredict = NULL + + # input parameter: noAveragePredict=> no averagePredict calculation, used in tune.block.splsda + + if(inherits(object, c("rgcca","sparse.rgcca"))) + stop("no prediction for RGCCA methods") + + #check on dist + if (!any(dist %in% c("all", "max.dist", "centroids.dist", "mahalanobis.dist")) & is(object, "DA")) + stop("ERROR : choose one of the four following modes: 'all', 'max.dist', 'centroids.dist' or 'mahalanobis.dist'") + #end general checks + + ncomp = object$ncomp + + if(is(object, "DA")) # a DA analysis (mint).(block).(s)plsda + { + #if DA analysis, the unmap Y is in ind.mat + Y.factor=object$Y + Y=object$ind.mat + }else{ + #if not DA, Y is in object$Y + Y=object$Y + if(is.null(Y)) # block analysis + { + Y=object$X[[object$indY]] + } + } + q=ncol(Y) + + if(time) time1 = proc.time() + #print(names(list(...))) + + if(hasArg(newdata.scale)) # if an input `newdata.scale' is given, we use that one and we don't scale the data and don't do the logratio or multilevel + { + newdata = list(...)$newdata.scale + object$logratio = NULL + multilevel = NULL + } + + mint.object = c("mint.pls", "mint.spls", "mint.plsda", "mint.splsda") + block.object = c("block.pls", "block.spls", "block.plsda", "block.splsda") + ### if the object is a block, the input newdata is different, we check newdata, make sure it's a list and check newdata/X + if(!inherits(object, block.object)) # not a block (pls/spls/plsda/splsda/mint...) 
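+    # in this (single-block) branch, the data matrix X and newdata are later
+    # wrapped into one-element lists so that the same code path can handle
+    # single- and multi-block objects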
+ { + p=ncol(object$X) + if(is.list(object$X)) + stop("Something is wrong, object$X should be a matrix and it appears to be a list") #this should never happen/intern check + + if(is.list(newdata) & !is.data.frame(newdata)) + stop("'newdata' must be a numeric matrix") + + # deal with near.zero.var in object, to remove the same variable in newdata as in object$X (already removed in object$X) + if(length(object$nzv$Position) > 0) + newdata = newdata[, -object$nzv$Position,drop=FALSE] + + if(all.equal(colnames(newdata),colnames(object$X))!=TRUE) + stop("'newdata' must include all the variables of 'object$X'") + + #not a block, the input newdata should be a matrix + if (length(dim(newdata)) == 2) { + if (ncol(newdata) != p) + stop("'newdata' must be a numeric matrix with ncol = ", p, + " or a vector of length = ", p, ".") + } + + if (length(dim(newdata)) == 0) { + if (length(newdata) != p) + stop("'newdata' must be a numeric matrix with ncol = ", p, + " or a vector of length = ", p, ".") + dim(newdata) = c(1, p) + } + + #check col/rownames of newdata + check=Check.entry.single(newdata, ncomp,q=1) + newdata=check$X + + if(length(rownames(newdata))==0) rownames(newdata)=1:nrow(newdata) + if(max(table(rownames(newdata)))>1) stop('samples should have a unique identifier/rowname') + + # we transform everything in lists + X=list(X=object$X) + object$X=X + newdata=list(newdata=newdata) + + object$indY=2 + ind.match = 1 + + }else{ + + # a block, newdata should be a list, each blocks should have the same number of samples + if(!is.list(newdata)) + stop("'newdata' should be a list") + + if(!is.null(object$indY) && !is(object, "DA")) #if DA, object$X is already without Y + { + X = object$X[-object$indY] + }else{ + X=object$X + } + object$X=X + + p = lapply(X, ncol) + + #matching newdata and X + + # error if no names on keepX or not matching the names in X + if(length(unique(names(newdata)))!=length(newdata) | sum(is.na(match(names(newdata),names(X)))) > 0) + stop("Each entry of 'newdata' must have a unique name corresponding to a block of 'X'") + + # error if not same number of samples in each block of newdata + if (length(unique(sapply(newdata,nrow))) != 1) + stop("All entries of 'newdata' must have the same number of rows") + + # I want to match keepX to X by names + ind.match = match(names(X), names(newdata)) + if(any(is.na(ind.match))) + warning("Some blocks are missing in 'newdata'; the prediction is based on the following blocks only: ", paste(names(X)[!is.na(ind.match)],collapse=", ")) + + #replace missing blocks by 0 + newdataA = list() + for (q in 1:length(X)) + { + + if (!is.na(ind.match[q])) # means there is a newdata with the same name as X[q] #(q <= length(newdata)) + { + newdataA[[q]] = newdata[[ind.match[q]]] + }else{ + newdataA[[q]] = matrix(0,nrow = unique(sapply(newdata,nrow)), ncol = ncol(X[[q]]), dimnames = list(rownames(newdata[[1]]), colnames(X[[q]]))) # if missing, replaced by a 0 matrix of correct size + } + } + names(newdataA) = names(X) + newdata = newdataA + + # deal with near.zero.var in object, to remove the same variable in newdata as in object$X (already removed in object$X) + if(!is.null(object$nzv)) + { + # for each of the input blocks, checks to see if the nzv features have already been removed + # if not, then these features are removed here + for (x in 1:length(newdata)) { + if (nrow(object$nzv[[x]]$Metrics) == 0) { next } + if (all(!(rownames(object$nzv[[x]]$Metrics) %in% colnames(newdata[[x]])))) { next } + + newdata[[x]] <- newdata[[x]][, 
-object$nzv[[x]]$Position,drop=FALSE] + } + } + if(length(newdata)!=length(object$X)) stop("'newdata' must have as many blocks as 'object$X'") + + names(newdata)=names(X) + + if (any(lapply(newdata, function(x){length(dim(x))}) != 2)) { + if (any(unlist(lapply(newdata, ncol)) != unlist(p))) + stop("'newdata' must be a list with ", length(p), " numeric matrix and ncol respectively equal to ", paste(p, collapse = ", "), ".") + } + + if (any(lapply(newdata, function(x){length(dim(x))}) == 0)) { + if (any(unlist(lapply(newdata, ncol)) != unlist(p))) + stop("'newdata' must be a list with ", length(p), " numeric matrix and ncol respectively equal to ", paste(p, collapse = ", "), ".") + dim(newdata) = c(1, p) #don't understand that, Benoit? + } + + #check dimnames and ncomp per block of A + for(q in 1:length(newdata)) + { + check=Check.entry.single(newdata[[q]], ncomp[q],q=q) + newdata[[q]]=check$X + } + names(newdata)=names(X) + + #check that newdata and X have the same variables and that they are in the same order + for (i in 1:length(newdata)) { + X.names <- colnames(X[[i]]) + newdata.names <- colnames(newdata[[i]]) + + if (any(X.names != newdata.names)) { # if there is any difference between colnames + if (length(setdiff(X.names, newdata.names)) > 0) { # if there is presence of novel feature/absence of existing feature + stop(paste("The set of features in 'object$X$", + names(X)[i], "' is different to 'newdata$", + names(newdata)[i], "'. Please ensure they have the same features.", sep = "")) + } + else if (all(sort(X.names) == sort(newdata.names))) { # if it is only a difference in order + stop(paste("The order of features in 'object$X$", + names(X)[i], "' is different to 'newdata$", + names(newdata)[i], "'. Please ensure that you adjust these to the same order", sep = "")) + } + } + } + + #reorder variables in the newdata as in X if needed + if (all.equal(lapply(newdata, colnames), lapply(X, colnames)) != TRUE) { + newdata <- setNames(lapply(seq_along(X), function(i) {newdata[[i]][, colnames(X[[i]])]}), nm = names(X)) + } + #need to reorder variates and loadings to put 'Y' in last + if(!is.null(object$indY)) + { + indY=object$indY + object$variates=c(object$variates[-indY],object$variates[indY]) + object$loadings=c(object$loadings[-indY],object$loadings[indY]) + } + + + } + + # logratio and multilevel transform if necessary + if (!is.null(object$logratio)) + newdata = lapply(newdata, logratio.transfo, logratio = object$logratio) + + if(!is.null(multilevel)) + newdata = lapply(newdata, withinVariation, design = data.frame(multilevel)) + + p = lapply(X, ncol) + q = ncol(Y) + J = length(X) #at this stage we have a list of blocks + variatesX = object$variates[-(J + 1)]; + loadingsX = object$loadings[-(J + 1)] + + scale = object$scale # X and Y are both mean centered by groups and if scale=TRUE they are scaled by groups + + + ### if the object is not a mint analysis, the input study.test is missing and we can go faster to scale the data + + # we only scale the data if the input `newdata.scale' is missing. Only use in tune functions where we do not need to scale for every prediction as we predict on the same newdata for a grid of keepX + if(!hasArg(newdata.scale)) + { + if(!inherits(object, mint.object))#| nlevels(factor(object$study))<=1) #not a mint object or just one level in the study + { # not a mint (pls/spls/plsda/splsda/block...) 
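+        # the sweep() calls below center (and, if the model was fitted with
+        # scale = TRUE, scale) newdata using the training-set column means and
+        # standard deviations stored as "scaled:center" / "scaled:scale"
+        # attributes on each block of object$X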
+ + # scale newdata if just one study + if (!is.null(attr(X[[1]], "scaled:center"))) + newdata[which(!is.na(ind.match))] = lapply(which(!is.na(ind.match)), function(x){sweep(newdata[[x]], 2, STATS = attr(X[[x]], "scaled:center"))}) + if (scale) + newdata[which(!is.na(ind.match))] = lapply(which(!is.na(ind.match)), function(x){sweep(newdata[[x]], 2, FUN = "/", STATS = attr(X[[x]], "scaled:scale"))}) + if (any(unlist(lapply(newdata[which(!is.na(ind.match))], function(x){any(is.infinite(x))})))) { + newdata[which(!is.na(ind.match))] <- lapply(which(!is.na(ind.match)), function(x){df <- newdata[[x]]; df[which(is.infinite(df))] <- NaN; return(df)}) + } + + means.Y = matrix(attr(Y, "scaled:center"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE); + if (scale) + {sigma.Y = matrix(attr(Y, "scaled:scale"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE)}else{sigma.Y=matrix(1,nrow=nrow(newdata[[1]]),ncol=q)} + concat.newdata=newdata + names(concat.newdata)=names(X) + + }else{ + # a mint analysis + + #check study.test + if(missing(study.test)) + { + if(nlevels(object$study)==1) + {study.test=factor(rep(1,nrow(newdata[[1]])))}else{ + stop("'study.test' is missing")} + }else{ + study.test=as.factor(study.test) + } + if (any(unlist(lapply(newdata, nrow)) != length(study.test))) + stop(paste0("'study' must be a factor of length ",nrow(newdata[[1]]),".")) + + #scale newdata if more than one study. If some levels of study.test are included in study.learn, the means/sigmas of study.learn are used to scale + M=nlevels(study.test) + study.learn=factor(object$study) + names.study.learn=levels(study.learn);names.study.test=levels(study.test) + match.study=match(names.study.test,names.study.learn) + match.study.indice=which(!is.na(match.study)) + + newdata.list.study = lapply(newdata,study_split,study.test) #a list of lists. 
[[j]][[m]]: j is for the blocks, m is for the levels(study) + + # each study is normalized, depending on how the normalization was performed on the learning set (center/scale) + newdata.list.study.scale.temp =NULL#vector("list",length=J) #list(list()) + concat.newdata = vector("list",length=J) + for(j in 1:J) # loop on the blocks + { + for (m in 1:M) #loop on the groups of study.test + { + # case 1: sample test is part of the learning data set + if(m%in%match.study.indice) #if some study of study.test were already in the learning set + { + + if(scale==TRUE) + { + if(nlevels(object$study)>1) + { + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],paste0("means:", levels(study.test)[m])), scale = attr(X[[j]],paste0("sigma:", levels(study.test)[m]))) + }else{#if just one level in object$study, the normalisation attributes are not named the same + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],"scaled:center"), scale = attr(X[[j]],"scaled:scale")) + } + } + + if(scale==FALSE) + { + if(nlevels(object$study)>1) + { + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],paste0("means:", levels(study.test)[m])),scale=FALSE) + }else{#if just one level in object$study, the normalisation attributes are not named the same + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]], center = attr(X[[j]],"scaled:center"),scale=FALSE) + } + + + } + + }else{ + # case 2: sample test is a new study # a new study which was not in the learning set + + ## if there is one sample from a new study to be scaled throw a condition and do not scale + if (scale == TRUE & dim(newdata.list.study[[j]][[m]])[1] == 1 ) { + warning("Train data are scaled but the test data include a single sample from a new study (which cannot be scaled). Mkaing prediction without scaling this test sample and thus prediction for this sample should be given with care. Consider scale=FALSE for model, or using more samples for prediction, or using a sample from model studies.\n") + newdata.list.study.scale.temp = newdata.list.study[[j]][[m]] + } else { + newdata.list.study.scale.temp=scale(newdata.list.study[[j]][[m]],center=TRUE,scale=scale) + } + } + #concatenation of the scaled data + concat.newdata[[j]] = rbind(concat.newdata[[j]], unlist(newdata.list.study.scale.temp))#[[j]][[m]])) + } + } + + # we now need to reorganise concat.newdata as newdata. Indeed, the concatenation was done without taking care of the order of the samples in newdata + for(j in 1:J) # loop on the blocks + { + indice.match=match(rownames(newdata[[j]]),rownames(concat.newdata[[j]])) + #match indice + concat.newdata[[j]]=concat.newdata[[j]][indice.match,, drop=FALSE] + concat.newdata[[j]][which(is.na(concat.newdata[[j]]))]=0 # taking care of the NA due to normalisation: put to 0 so they don't influence the product below (in Y.hat), reviens au meme que de supprimer les colonnes sans variances au depart. 
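# (In English: this has the same effect as dropping the zero-variance columns
# at the outset.) Illustrative sketch, not from the upstream code: the
# re-ordering above relies on match() over the rownames, e.g.
#     match(c("s1", "s2"), c("s2", "s1"))   # returns 2 1, the indices that restore the original order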
+ + } + + names(concat.newdata)=names(X) + + means.Y=matrix(0,nrow=nrow(concat.newdata[[1]]),ncol=q) + sigma.Y=matrix(1,nrow=nrow(concat.newdata[[1]]),ncol=q) + + #loop on the blocks to define means.Y and sigma.Y for mint analysis + for(m in 1:M) + { + if(m%in%match.study.indice) #if some study of study.test were already in the learning set + { + if(nlevels(object$study)>1) + { + means.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,paste0("means:", levels(study.test)[m])),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + }else{#if just one level in object$study, the normalisation attributes are not named the same + means.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,"scaled:center"),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + } + + + if(scale==TRUE) + { + # I want to build a vector with sigma.Y for each group + if(nlevels(object$study)>1) + { + sigma.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,paste0("sigma:", levels(study.test)[m])),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + }else{#if just one level in object$study, the normalisation attributes are not named the same + sigma.Y[which(study.test%in%levels(study.learn)[match.study[m]]),]=matrix(attr(Y,"scaled:scale"),nrow=length(which(study.test%in%levels(study.learn)[match.study[m]])),ncol=q,byrow=TRUE) + } + + + } + } + } + + } + ### end if object is a mint analysis + } else { + means.Y = matrix(attr(Y, "scaled:center"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE); + if (scale) + {sigma.Y = matrix(attr(Y, "scaled:scale"),nrow=nrow(newdata[[1]]),ncol=q,byrow=TRUE)}else{sigma.Y=matrix(1,nrow=nrow(newdata[[1]]),ncol=q)} + concat.newdata=newdata + names(concat.newdata)=names(X) + } + if(time) time2 = proc.time() + if(time) print("scaling") + if(time) print(time2-time1) + + + ### at this stage we have + # X # list of blocks + # Y # observation + # newdata #list of blocks for the prediction, same length as A, scaled + + ### replace NA by 0 in the training data + if( (hasArg(misdata.all) & hasArg(is.na.X)) && any(list(...)$misdata.all)) # ind.na.X: all blocks except Y + { # if misdata.all and ind.na.X are provided, we don't calculate the is.na(X) as it takes time. Used in tune functions. + for(j in c(1:J)[list(...)$misdata.all]) + X[[j]][list(...)$is.na.X[[j]]]=0 # faster than using replace + } else { + # replace NA by 0 + X = lapply(X,function(x) + { + if (anyNA(x)){ + ind = is.na(x) + x[ind] = 0 + } + x + }) + } + + ### replace NA by 0 in the test data + + if( (hasArg(misdata.all) & hasArg(is.na.newdata)) && any(list(...)$misdata.all)) # ind.na.newdata: all blocks of newdata + { + # if misdata.all and ind.na.X are provided, we don't calculate the is.na(X) as it takes time. Used in tune functions. 
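# Illustrative sketch (not part of the upstream code): replace() with a
# precomputed or logical index is what zeroes the missing cells here, e.g.
#     m <- matrix(c(1, NA, 3, NA), 2, 2)
#     replace(m, is.na(m), 0)   # NAs become 0 so they drop out of the matrix products below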
+ concat.newdata = lapply(1:J, function(q){replace(concat.newdata[[q]], list(...)$is.na.newdata[[q]], 0)}) + + } + + if (any(sapply(concat.newdata, anyNA))) + { + concat.newdata = lapply(concat.newdata,function(x) + { + if (anyNA(x)){ + ind = is.na(x) + x[ind] = 0 + } + x + }) + } + + + # replace NA by 0 in Y + Y[is.na(Y)] = 0 + + if(time) time3 = proc.time() + if(time) print("NA") + if(time) print(time3-time2) + + # ----------------------- + # prediction + # ----------------------- + + B.hat = t.pred = Y.hat = list() #= betay + for (i in 1 : J) + { + Pmat = Cmat = Wmat = NULL + + ### Start estimation using formula Y = XW(P'W)C (+ Yr, residuals on Y) See page 136 La regression PLS Theorie et pratique Tenenhaus + # Estimation matrix W, P and C + Pmat = crossprod(X[[i]], variatesX[[i]]) + Cmat = crossprod(Y, variatesX[[i]]) + Wmat = loadingsX[[i]] + + # Prediction Y.hat, B.hat and t.pred + Ypred = lapply(1 : ncomp[i], function(x){concat.newdata[[i]] %*% Wmat[, 1:x] %*% solve(t(Pmat[, 1:x]) %*% Wmat[, 1:x]) %*% t(Cmat)[1:x, ]}) + Ypred = sapply(Ypred, function(x){x*sigma.Y + means.Y}, simplify = "array") + + Y.hat[[i]] = array(Ypred, c(nrow(newdata[[i]]), ncol(Y), ncomp[i])) # in case one observation and only one Y, we need array() to keep it an array with a third dimension being ncomp + + t.pred[[i]] = concat.newdata[[i]] %*% Wmat %*% solve(t(Pmat) %*% Wmat) + t.pred[[i]] = matrix(data = sapply(1:ncol(t.pred[[i]]), + function(x) {t.pred[[i]][, x] * apply(variatesX[[i]], 2, + function(y){(norm(y, type = "2"))^2})[x]}), nrow = nrow(concat.newdata[[i]]), ncol = ncol(t.pred[[i]])) + + B.hat[[i]] = sapply(1 : ncomp[i], function(x){Wmat[, 1:x] %*% solve(t(Pmat[, 1:x]) %*% Wmat[, 1:x]) %*% t(Cmat)[1:x, ]}, simplify = "array") + ### End estimation using formula Y = XW(P'W)C (+ Yr, residuals on Y) See page 136 La regression PLS Theorie et pratique Tenenhaus + + rownames(t.pred[[i]]) = rownames(newdata[[i]]) + colnames(t.pred[[i]]) = paste0("dim", c(1:ncomp[i])) + rownames(Y.hat[[i]]) = rownames(newdata[[i]]) + colnames(Y.hat[[i]]) = colnames(Y) + dimnames(Y.hat[[i]])[[3]]=paste0("dim", c(1:ncomp[i])) + rownames(B.hat[[i]]) = colnames(newdata[[i]]) + colnames(B.hat[[i]]) = colnames(Y) + dimnames(B.hat[[i]])[[3]]=paste0("dim", c(1:ncomp[i])) + + } + + if(time) time4 = proc.time() + if(time) print("Prediction") + if(time) print(time4-time3) + + #-- valeurs sortantes --# + names(Y.hat)=names(t.pred)=names(B.hat)=names(object$X) + + if(time) time4 = proc.time() + + # basic prediction results + if(inherits(object, block.object) & length(object$X)>1 ) + { + out=list(predict=Y.hat[which(!is.na(ind.match))],variates=t.pred[which(!is.na(ind.match))],B.hat=B.hat[which(!is.na(ind.match))]) + + # average prediction over the blocks + temp.all =list() + for(comp in 1:min(ncomp[-object$indY])) #note: all ncomp are the same in v6 as the input parameter is a single value + { + temp = array(0, c(nrow(Y.hat[[1]]), ncol(Y.hat[[1]]), J), dimnames = list(rownames(newdata[[1]]), colnames(Y),names(object$X))) + for(i in 1 : J) + temp[, , i] = Y.hat[[i]][, , comp] + + temp.all[[comp]] = temp + } + names(temp.all) = paste0("dim", c(1:min(ncomp[-object$indY]))) + + if(!hasArg(noAveragePredict)) + { + out$AveragedPredict = array(unlist(lapply(temp.all, function(x){ + apply(x, c(1,2), mean) + + })), dim(Y.hat[[1]]), dimnames = list(rownames(newdata[[1]]), colnames(Y), paste0("dim", c(1:min(ncomp[-object$indY]))))) + + out$WeightedPredict = array(unlist(lapply(temp.all, function(x){ + apply(x, c(1,2), function(z){ + temp = 
aggregate(rowMeans(object$weights),list(z),sum) + ind = which(temp[,2]== max (temp[,2]))# if two max, then NA + if(length(ind) == 1) + { + res = temp[ind, 1] + } else { + res = NA + } + res + })})), dim(Y.hat[[1]]), dimnames = list(rownames(newdata[[1]]), colnames(Y), paste0("dim", c(1:min(ncomp[-object$indY]))))) + } + + + #out$newdata=concat.newdata + }else if(inherits(object, block.object)){ # a block but can have only one block (so e.g. a pls done with a block.pls) + out=list(predict=Y.hat,variates=t.pred,B.hat=B.hat) + + } else {# not a block (pls/spls/plsda/splsda/mint...) + out=list(predict=Y.hat[[1]],variates=t.pred[[1]],B.hat=B.hat[[1]]) + } + + if(time) time5 = proc.time() + if(time) print("Y.hat") + if(time) print(time5-time4) + + + # get the classification for each new sample if the object is a DA + if(is(object, "DA")) # a DA analysis (mint).(block).(s)plsda + { + + if(inherits(object, block.object) & length(object$X)>1 ) + { + if(!hasArg(noAveragePredict)) + { + # predict class of AveragePredict, only with max.dist + out$AveragedPredict.class$max.dist = matrix(sapply(1:ncomp[1], ### List level + function(y){apply(out$AveragedPredict[, , y, drop = FALSE], 1, ### component level + function(z){ + paste(levels(Y.factor)[which(z == max(z))], collapse = "/") + }) ### matrix level + }), nrow = nrow(newdata[[1]]), ncol = ncomp[1]) + + + # predict class of WeightedPredict, only with max.dist + out$WeightedPredict.class$max.dist = matrix(sapply(1:ncomp[1], ### List level + function(y){apply(out$WeightedPredict[, , y, drop = FALSE], 1, ### component level + function(z){ + paste(levels(Y.factor)[which(z == max(z))], collapse = "/") + }) ### matrix level + }), nrow = nrow(newdata[[1]]), ncol = ncomp[1]) + + rownames(out$AveragedPredict.class$max.dist) = rownames(out$WeightedPredict.class$max.dist) = rownames(newdata[[1]]) + colnames(out$AveragedPredict.class$max.dist) = colnames(out$WeightedPredict.class$max.dist) = paste0("dim", c(1:min(ncomp[-object$indY]))) + } + } + + + # creating temporary 'blocks' outputs to pass into the internal_predict.DA function + out.temp=list(predict=Y.hat[which(!is.na(ind.match))],variates=t.pred[which(!is.na(ind.match))],B.hat=B.hat[which(!is.na(ind.match))]) + out.temp$newdata=concat.newdata[which(!is.na(ind.match))] + + # getting classification for each new sample + object.temp = object + object.temp$X = object.temp$X[which(!is.na(ind.match))] + object.temp$variates = object.temp$variates[c(which(!is.na(ind.match)),J+1)] #J+1 is Y + if (!is.null(object$weights)) + weights <- rowMeans(object$weights)[which(!is.na(ind.match))] + else + weights <- NULL + + classif.DA=internal_predict.DA(object=object.temp, q=q, out=out.temp, dist=dist, weights = weights) + out=c(out,classif.DA) + + } + + if(time) time6 = proc.time() + if(time) print("DA") + if(time) print(time6-time5) + + out$call = match.call() + class(out) = paste("predict") + + out + + } + +#' @rdname predict +#' @method predict mixo_spls +#' @export +predict.mixo_spls <- predict.mixo_pls + diff --git a/man/predict.Rd b/man/predict.Rd new file mode 100644 index 0000000..bc64d95 --- /dev/null +++ b/man/predict.Rd @@ -0,0 +1,203 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_mixOmics.R +\name{predict} +\alias{predict} +\alias{predict.mixo_pls} +\alias{predict.pls} +\alias{predict.spls} +\alias{predict.plsda} +\alias{predict.splsda} +\alias{predict.mint.pls} +\alias{predict.mint.spls} +\alias{predict.mint.plsda} +\alias{predict.mint.splsda} 
+\alias{predict.mint.block.pls} +\alias{predict.mint.block.spls} +\alias{predict.mint.block.plsda} +\alias{predict.mint.block.splsda} +\alias{predict.mixo_spls} +\title{Predict Method for (mint).(block).(s)pls(da) methods} +\usage{ +\method{predict}{mixo_pls}( + object, + newdata, + study.test, + dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), + multilevel = NULL, + ... +) + +\method{predict}{mixo_spls}( + object, + newdata, + study.test, + dist = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), + multilevel = NULL, + ... +) +} +\arguments{ +\item{object}{object of class inheriting from +\code{"(mint).(block).(s)pls(da)"}.} + +\item{newdata}{data matrix in which to look for for explanatory variables to +be used for prediction. Please note that this method does not perform +multilevel decomposition or log ratio transformations, which need to be +processed beforehand.} + +\item{study.test}{For MINT objects, grouping factor indicating which samples +of \code{newdata} are from the same study. Overlap with \code{object$study} +are allowed.} + +\item{dist}{distance to be applied for discriminant methods to predict the +class of new data, should be a subset of \code{"centroids.dist"}, +\code{"mahalanobis.dist"} or \code{"max.dist"} (see Details). Defaults to +\code{"all"}.} + +\item{multilevel}{Design matrix for multilevel analysis (for repeated +measurements). A numeric matrix or data frame. For a one level factor +decomposition, the input is a vector indicating the repeated measures on +each individual, i.e. the individuals ID. For a two level decomposition with +splsda models, the two factors are included in Y. Finally for a two level +decomposition with spls models, 2nd AND 3rd columns in design indicate those +factors (see example in \code{?splsda} and \code{?spls}).} + +\item{...}{not used currently.} +} +\value{ +\code{predict} produces a list with the following components: +\item{predict}{predicted response values. The dimensions correspond to the +observations, the response variables and the model dimension, respectively. +For a supervised model, it corresponds to the predicted dummy variables.} +\item{variates}{matrix of predicted variates.} \item{B.hat}{matrix of +regression coefficients (without the intercept).} + +\item{AveragedPredict}{if more than one block, returns the average predicted +values over the blocks (using the \code{predict} output)} +\item{WeightedPredict}{if more than one block, returns the weighted average +of the predicted values over the blocks (using the \code{predict} and +\code{weights} outputs)} + +\item{class}{predicted class of \code{newdata} for each +\eqn{1,...,}\code{ncomp} components.} + +\item{MajorityVote}{if more than one block, returns the majority class over +the blocks. NA for a sample means that there is no consensus on the +predicted class for this particular sample over the blocks.} +\item{WeightedVote}{if more than one block, returns the weighted majority +class over the blocks. NA for a sample means that there is no consensus on +the predicted class for this particular sample over the blocks.} +\item{weights}{Returns the weights of each block used for the weighted +predictions, for each nrepeat and each fold} + +\item{centroids}{matrix of coordinates for centroids.} \item{dist}{type of +distance requested.} \item{vote}{majority vote result for multi block +analysis (see details above).} +} +\description{ +Predicted values based on PLS models. 
New responses and variates are +predicted using a fitted model and a new matrix of observations. +} +\details{ +\code{predict} produces predicted values, obtained by evaluating the +PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the +frame \code{newdata}. Variates for \code{newdata} are also returned. Please +note that this method performs multilevel decomposition and/or log ratio +transformations if needed (\code{multilevel} is an input parameter while +\code{logratio} is extracted from \code{object}). + +Different prediction distances are proposed for discriminant analysis. The +reason is that our supervised models work with a dummy indicator matrix of +\code{Y} to indicate the class membership of each sample. The prediction of +a new observation results in either a predicted dummy variable (output +\code{object$predict}), or a predicted variate (output +\code{object$variates}). Therefore, an appropriate distance needs to be +applied to those predicted values to assign the predicted class. We propose +distances such as \verb{maximum distance' for the predicted dummy variables, }Mahalanobis distance' and `Centroids distance' for the predicted variates. + +\code{"max.dist"} is the simplest method to predict the class of a test +sample. For each new individual, the class with the largest predicted dummy +variable is the predicted class. This distance performs well in single data +set analysis with multiclass problems (PLS-DA). + +\code{"centroids.dist"} allocates to the new observation the class that +mimimises the distance between the predicted score and the centroids of the +classes calculated on the latent components or variates of the trained +model. + +\code{"mahalanobis.dist"} allocates the new sample the class defined as the +centroid distance, but using the Mahalanobis metric in the calculation of +the distance. + +In practice we found that the centroid-based distances +(\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically +the Mahalanobis distance led to more accurate predictions than the maximum +distance for complex classification problems and N-integration problems +(block.splsda). The centroid distances consider the prediction in +dimensional space spanned by the predicted variates, while the maximum +distance considers a single point estimate using the predicted scores on the +last dimension of the model. The user can assess the different distances, +and choose the prediction distance that leads to the best performance of the +model, as highlighted from the tune and perf outputs + +More (mathematical) details about the prediction distances are available in +the supplemental of the mixOmics article (Rohart et al 2017). + +For a visualisation of those prediction distances, see +\code{background.predict} that overlays the prediction area in +\code{plotIndiv} for a sPLS-DA object. + +Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing +\eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are +the centroids of the classes calculated on the \eqn{X}-variates of the +model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the +class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis +metric in the calculation of the distance. + +For MINT objects, the \code{study.test} argument is required and provides +the grouping factor of \code{newdata}. 
+ +For multi block analysis (thus block objects), \code{newdata} is a list of +matrices whose names are a subset of \code{names(object$X)} and missing +blocks are allowed. Several predictions are returned, either for each block +or for all blocks. For non discriminant analysis, the predicted values +(\code{predict}) are returned for each block and these values are combined +by average (\code{AveragedPredict}) or weighted average +(\code{WeightedPredict}), using the weights of the blocks that are +calculated as the correlation between a block's components and the outcome's +components. + +For discriminant analysis, the predicted class is returned for each block +(\code{class}) and each distance (\code{dist}) and these predictions are +combined by majority vote (\code{MajorityVote}) or weighted majority vote +(\code{WeightedVote}), using the weights of the blocks that are calculated +as the correlation between a block's components and the outcome's +components. NA means that there is no consensus among the block. For PLS-DA +and sPLS-DA objects, the prediction area can be visualised in plotIndiv via +the \code{background.predict} function. +} +\references{ +Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics +feature selection and multiple data integration. PLoS Comput Biol 13(11): +e1005752 + +Tenenhaus, M. (1998). \emph{La regression PLS: theorie et pratique}. Paris: +Editions Technic. +} +\seealso{ +\code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, +\code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, +\code{\link{mint.plsda}}, \code{\link{mint.splsda}}, +\code{\link{block.pls}}, \code{\link{block.spls}}, +\code{\link{block.plsda}}, \code{\link{block.splsda}}, +\code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, +\code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and +visualisation with \code{\link{background.predict}} and +http://www.mixOmics.org for more details. +} +\author{ +Florian Rohart, Sébastien Déjean, Ignacio González, Kim-Anh Lê Cao, +Al J Abadi +} +\keyword{multivariate} +\keyword{regression} diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index b67de41..ea612d7 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -28,13 +28,17 @@ prior to prediction. 
This was the normalization method used on the training data } \examples{ +\dontrun{ # To predict early preeclampsia on 450k/850k samples # Load data library(ExperimentHub) +eh <- ExperimentHub() +query(eh, "eoPredData") # test object x_test <- eh[['EH8403']] x_test \%>\% predictPreeclampsia() } +} From d32e51d6e3c15a6f3e27ff28e7a1d2d19b507f05 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:23:39 -0800 Subject: [PATCH 12/19] add other functions from mixOmics --- NAMESPACE | 6 + R/predict_mixOmics.R | 469 ++++++++++++++++++++++++++++++++ man/logratio-transformations.Rd | 58 ++++ man/unmap.Rd | 59 ++++ 4 files changed, 592 insertions(+) create mode 100644 man/logratio-transformations.Rd create mode 100644 man/unmap.Rd diff --git a/NAMESPACE b/NAMESPACE index 9702d9d..24492c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,11 +3,17 @@ S3method(predict,mixo_pls) S3method(predict,mixo_spls) export("%>%") +export(logratio.transfo) export(pl_infer_age) export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) +export(unmap) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,n) +importFrom(dplyr,row_number) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(methods,is) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 9efcb9c..5f29025 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -771,3 +771,472 @@ predict.mixo_pls <- #' @export predict.mixo_spls <- predict.mixo_pls + +# from https://github.com/mixOmicsTeam/mixOmics/blob/master/R/check_entry.R#L111 +################################################################################ +# -------------------------------------- +# Check.entry.single: check input parameter for a single matrix that comes +# from a list of matrix +# -------------------------------------- +# X: a single numeric matrix +# ncomp: the number of components to include in the model +# q: position of X in a list of matrix, as used in horizontal analysis +# (e.g. block.pls) +Check.entry.single = function(X, ncomp, q) +{ + + #-- validation des arguments --# + if (length(dim(X)) != 2) + stop(paste0("'X[[", q, "]]' must be a numeric matrix.")) + + if(! any(class(X) %in% "matrix")) + X = as.matrix(X) + + if (!is.numeric(X)) + stop(paste0("'X[[", q, "]]' must be a numeric matrix.")) + + N = nrow(X) + P = ncol(X) + + if (is.null(ncomp) || !is.numeric(ncomp) || ncomp <= 0) + stop(paste0( + "invalid number of variates 'ncomp' for matrix 'X[[", q, "]]'.")) + + ncomp = round(ncomp) + + # add colnames and rownames if missing + X.names = dimnames(X)[[2]] + if (is.null(X.names)) + { + X.names = paste("X", 1:P, sep = "") + dimnames(X)[[2]] = X.names + } + + ind.names = dimnames(X)[[1]] + if (is.null(ind.names)) + { + ind.names = 1:N + rownames(X) = ind.names + } + + if (length(unique(rownames(X))) != nrow(X)) + stop("samples should have a unique identifier/rowname") + if (length(unique(X.names)) != P) + stop("Unique indentifier is needed for the columns of X") + + return(list(X=X, ncomp=ncomp, X.names=X.names, ind.names=ind.names)) +} + +# from https://github.com/mixOmicsTeam/mixOmics/blob/master/R/logratio-transformations.R +#' Log-ratio transformation +#' +#' This function applies a log transformation to the data, either CLR or ILR +#' +#' \code{logratio.transfo} applies a log transformation to the data, either CLR +#' (centered log ratio transformation) or ILR (Isometric Log Ratio +#' transformation). 
In the case of CLR log-transformation, X needs to be a +#' matrix of non-negative values and \code{offset} is used to shift the values +#' away from 0, as commonly done with counts data. +#' +#' @param X numeric matrix of predictors +#' @param logratio log-ratio transform to apply, one of "none", "CLR" or "ILR" +#' @param offset Value that is added to X for CLR and ILR log transformation. +#' Default to 0. +#' @return \code{logratio.transfo} simply returns the log-ratio transformed +#' data. +#' @author Florian Rohart, Kim-Anh Lê Cao, Al J Abadi +#' @seealso \code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, +#' \code{\link{plsda}}, \code{\link{splsda}}. +#' @references Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, +#' Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a +#' multivariate statistical framework to gain insight into Microbial +#' Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 +#' +#' John Aitchison. The statistical analysis of compositional data. Journal of +#' the Royal Statistical Society. Series B (Methodological), pages 139-177, +#' 1982. +#' +#' Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component +#' analysis for compositional data with outliers. Environmetrics, +#' 20(6):621-632, 2009. +#' @examples +#' +#' data(diverse.16S) +#' CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') +#' # no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC +#' @name logratio-transformations +NULL +#' @rdname logratio-transformations +#' @export +logratio.transfo <- function(X, + logratio = c('none','CLR','ILR'), + offset = 0) +{ + X <- as.matrix(X) + if (!is.numeric(X)) + stop("X must be a numeric matrix", call. = FALSE) + + logratio <- match.arg(logratio) + if (logratio == 'ILR') + { + if (!is(X, 'ilr')) + { # data are ilr transformed, then the data lose 1 variable, but we'll use V to reconstruct the matrix + X = ilr.transfo(X, offset = offset) + } + } else if (logratio == 'CLR') { + X = clr.transfo(X, offset = offset) + } + return(X) +} + + +#' @importFrom dplyr group_by row_number filter n +internal_predict.DA <- + function(object, out, q, dist, weights) + { + # a DA analysis (mint).(block).(s)plsda + if (!is(object, "DA")) + stop("'Object' is not from a Discriminant Analysis", call.=FALSE) + + out.DA = list() + J = length(object$X) #at this stage we have a list of blocks + p = lapply(object$X, ncol) + t.pred = out$variates + Y.hat = out$predict + newdata = out$newdata #actually concat.newdata + variatesX = object$variates[-(J + 1)]; + ncomp = object$ncomp + + Y = object$Y + Y.prim = unmap(object$Y) + G = cls = list() + for (i in 1 : J) + { + G[[i]] = sapply(1:q, function(x) + {apply(as.matrix(variatesX[[i]][Y.prim[, x] == 1, + ,drop=FALSE]), 2, mean)}) + if (ncomp[i] == 1) + G[[i]] = t(t(G[[i]])) + else + G[[i]] = t(G[[i]]) + colnames(G[[i]]) = paste0("dim", c(1:ncomp[i])) + rownames(G[[i]]) = colnames(object$ind.mat) + + } + names(G)=names(object$X) + + ### Start: Maximum distance + if (any(dist == "all") || any(dist == "max.dist")) + { + cls$max.dist = lapply(1:J, function(x){matrix(sapply(1:ncomp[x], + ### List level + function(y){apply(Y.hat[[x]][, , y, drop = FALSE], 1, + ### component level + function(z){ + paste(levels(Y)[which(z == max(z))], collapse = "/") + }) ### matrix level + }), nrow = nrow(newdata[[x]]), ncol = ncomp[x]) + }) + cls$max.dist = lapply(1:J, function(x){colnames(cls$max.dist[[x]]) = + paste0(rep("comp", ncomp[x]), 1 : ncomp[[x]]); + 
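# Illustrative sketch (not part of the upstream code): ties aside, the
# "max.dist" rule above boils down to picking the class with the largest
# predicted dummy value, e.g.
#     dummy <- c(EOPE = 0.8, Normotensive = 0.2)
#     names(dummy)[which.max(dummy)]   # "EOPE"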
rownames(cls$max.dist[[x]]) = rownames(newdata[[x]]); + return(cls$max.dist[[x]])}) + names(cls$max.dist)=names(object$X) + } + + + ### Start: Centroids distance + if (any(dist == "all") || any(dist == "centroids.dist")) + { + cl = list() + centroids.fun = function(x, G, h, i) { + q = nrow(G[[i]]) + x = matrix(x, nrow = q, ncol = h, byrow = TRUE) + + if (h > 1) { + d = apply((x - G[[i]][, 1:h])^2, 1, sum) + } + else { + d = (x - G[[i]][, 1])^2 + } + cl.id = paste(levels(Y)[which(d == min(d))], collapse = "/") + } + + for (i in 1 : J) + { + cl[[i]] = matrix(nrow = nrow(newdata[[1]]), ncol = ncomp[i]) + + for (h in 1 : ncomp[[i]]) + { + cl.id = apply(matrix(t.pred[[i]][, 1:h], ncol = h), 1, + function(x) {centroids.fun(x = x, G = G, h = h, i = i)}) + cl[[i]][, h] = cl.id + } + } + + cls$centroids.dist = lapply(1:J, function(x){colnames(cl[[x]]) = + paste0(rep("comp", ncomp[x]), 1 : ncomp[[x]]); + rownames(cl[[x]]) = rownames(newdata[[x]]); return(cl[[x]])}) + names(cls$centroids.dist)=names(object$X) + }### End: Centroids distance + + + ### Start: Mahalanobis distance + if (any(dist == "all") || any(dist == "mahalanobis.dist")) + { + cl = list() + Sr.fun = function(x, G, Yprim, h, i) { + q = nrow(G[[i]]) + Xe = Yprim %*% G[[i]][, 1:h] + #Xr = object$variates$X[, 1:h] - Xe + Xr = variatesX[[i]][, 1:h] - Xe + Sr = t(Xr) %*% Xr/nrow(Yprim) + Sr.inv = solve(Sr) + x = matrix(x, nrow = q, ncol = h, byrow = TRUE) + if (h > 1) { + mat = (x - G[[i]][, 1:h]) %*% Sr.inv %*% t(x - G[[i]][, 1:h]) + d = apply(mat^2, 1, sum) + } else { + d = drop(Sr.inv) * (x - G[[i]][, 1])^2 + } + cl.id = paste(levels(Y)[which(d == min(d))], collapse = "/") + } + + for (i in 1 : J){ + cl[[i]] = matrix(nrow = nrow(newdata[[1]]), ncol = ncomp[i]) + + for (h in 1:ncomp[[i]]) { + cl.id = apply(matrix(t.pred[[i]][, 1:h], ncol = h), 1, Sr.fun, + G = G, Yprim = Y.prim, h = h, i = i) + cl[[i]][, h] = cl.id + } + } + + cls$mahalanobis.dist = lapply(1:J, function(x){colnames(cl[[x]]) = + paste0(rep("comp", ncomp[x]), 1 : ncomp[[x]]); + rownames(cl[[x]]) = rownames(newdata[[x]]);return(cl[[x]])}) + names(cls$mahalanobis.dist)=names(object$X) + } ### End: Mahalanobis distance + + out.DA$class = cls + + ### End if discriminant analysis is performed + + # at this stage, we have the classification of each sample for each dataset + # of object$X + # now we need to combine the classification by vote (majority wins), + # only when more than one block, otherwise 'vote' is classic classification + if (length(object$X)>1) + { + for (ijk in 1:length(out.DA$class))# loop on the dist + { + # create a temporary array to make computation on the lists easier + temp=array(, c(nrow(newdata[[1]]), min(ncomp), J)) + for(i in 1:J) + { + temp[, , i] = out.DA$class[[ijk]][[i]][, 1:min(ncomp)] + + } + # look at the majority vote for all dataset of object$X (with table) + # if more than a unique max, we put NA + table.temp = apply(temp,c(1,2), function(x) + {a=table(x); if (length(which(a==max(a)))==1) + {b=names(which.max(a))}else{b=NA}; b}) + colnames(table.temp) = + colnames(out.DA$class[[ijk]][[i]])[1:min(ncomp)] + rownames(table.temp) = rownames(out.DA$class[[ijk]][[i]]) + out.DA$MajorityVote[[ijk]] = table.temp + } + names(out.DA$MajorityVote) = names(out.DA$class) + + #save(list=ls(),file="temp.Rdata") + #stop("") + # weighted vote for each distance, each comp + if(!is.null(weights)) + { + out.WeightedVote = vector("list",length=length(out.DA$class)) + Group.2 = n = x =NULL #CRAN check + for(i in 1:length(out.DA$class)){ #i distance + out = 
matrix(NA_real_,nrow=nrow(newdata[[1]]), ncol=min(ncomp)) + rownames(out) = rownames(newdata[[1]]) + colnames(out) = paste0("comp",1:min(ncomp)) + + for(comp in 1:min(ncomp)){ #comp + data.temp=NULL + for(j in 1:J){ #block + data.temp = rbind(data.temp, + out.DA$class[[i]][[j]][,comp,drop=FALSE]) + } + colnames(data.temp)="pred" + temp=data.frame(data.temp,indiv=rownames(data.temp), + weights=rep(weights,each=nrow(out.DA$class[[1]][[1]]))) + ag = aggregate(temp$weights, + by=list(temp$pred, temp$indiv), FUN=sum) + data_max <- group_by(.data = ag, Group.2) + data_max <- filter(.data = data_max, row_number(x)==n()) + out.comp = as.matrix(data_max[,1]) + rownames(out.comp) = as.matrix(data_max[,2]) + colnames(out.comp) = paste0("comp",comp) + + out[,comp] = out.comp[match(rownames(out), + rownames(out.comp)),] + } + + out.WeightedVote[[i]] = out + } + names(out.WeightedVote) = names(out.DA$class) + out.DA$WeightedVote = out.WeightedVote + + + + if(FALSE){ + out.DA$WeightedVote = lapply(out.DA$class, function(x){ + # x is a distance + class.per.comp = lapply(1:min(ncomp), function(y) { + matrix(sapply(x, function(z) + z[,y, drop = FALSE]),ncol=J)}) + # combine the results per component + names(class.per.comp) = paste0("comp",1:min(ncomp)) + class.weighted.per.comp = sapply(class.per.comp, function(y){ + # for each component + apply(y,1,function(z){ + # we aggregate the results of each individuals using the 'weights' + temp = aggregate(weights,list(as.character(z)),sum) + ind = which(temp[,2]== max (temp[,2])) + # if two max, then NA + if(length(ind) == 1) + { + res = as.character(temp[ind, 1]) + } else { + res = NA + } + res + + }) + + }) + class.weighted.per.comp = matrix(class.weighted.per.comp, + nrow = nrow(class.per.comp[[1]])) + colnames(class.weighted.per.comp) = names(class.per.comp) + rownames(class.weighted.per.comp) = + rownames(out.DA$MajorityVote[[1]]) + class.weighted.per.comp + + }) + } + out.DA$weights = weights + + } + }else{ + out.DA$MajorityVote = lapply(out.DA$class,function(x){x[[1]]}) + } + + block.object = c("block.pls", "block.spls", "block.plsda", "block.spsda") + if (inherits(object, block.object) & J>1) # a block + { + out.DA$centroids = G + }else{ #not a block + out.DA$centroids = G[[1]] + out.DA$class = out.DA$MajorityVote + } + if (any(dist == "all")) + dist = "all" + + out.DA$dist = dist + + out.DA + + } + +# from https://github.com/mixOmicsTeam/mixOmics/blob/766521eaff07b25c9c2ceccdabdf281dd0ef735c/R/unmap.R#L47 +# --------------------------------------------------- +# unmap variates.A variable for (s)plsda +# --------------------------------------------------- +#' Dummy matrix for an outcome factor +#' +#' Converts a class or group vector or factor into a matrix of indicator +#' variables. +#' +#' @param classification A numeric or character vector or factor. Typically the +#' distinct entries of this vector would represent a classification of +#' observations in a data set. +#' @param groups A numeric or character vector indicating the groups from which +#' \code{classification} is drawn. If not supplied, the default is to assumed +#' to be the unique entries of classification. +#' @param noise A single numeric or character value used to indicate the value +#' of \code{groups} corresponding to noise. +#' @return An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, +#' where \emph{n} is the length of samples and \emph{K} the number of classes +#' in the outcome. 
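#'
#' For example, \code{unmap(c("a", "b", "a"))} returns a 3 x 2 indicator
#' matrix in which rows 1 and 3 flag the first class and row 2 flags the
#' second class.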
+#' +#' If a \code{noise} value of symbol is designated, the corresponding indicator +#' variables are relocated to the last column of the matrix. +#' +#' Note: - you can remap an unmap vector using the function \code{map} from the +#' package \pkg{mclust}. - this function should be used to unmap an outcome +#' vector as in the non-supervised methods of mixOmics. For other supervised +#' analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. +#' @author Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi +#' @references +#' C. Fraley and A. E. Raftery (2002). Model-based +#' clustering, discriminant analysis, and density estimation. \emph{Journal of +#' the American Statistical Association 97:611-631}. +#' +#' C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). mclust Version +#' 4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, +#' and Density Estimation. Technical Report No. 597, Department of Statistics, +#' University of Washington. +#' @keywords cluster +#' @export +#' @examples +#' data(nutrimouse) +#' Y = unmap(nutrimouse$diet) +#' Y +#' data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) +#' # data could then used as an input in wrapper.rgcca, which is not, technically, +#' # a supervised method, see ??wrapper.rgcca +unmap <- + function (classification, groups = NULL, noise = NULL) + { + n = length(classification) + u = sort(unique(classification)) + levels = levels(classification)### Add levels + + if (is.null(groups)) + { + groups = u + } else { + if (any(match(u, groups, nomatch = 0) == 0)) + stop("groups incompatible with classification") + miss = match(groups, u, nomatch = 0) == 0 + } + + cgroups = as.character(groups) + if (!is.null(noise)) + { + noiz = match(noise, groups, nomatch = 0) + if (any(noiz == 0)) + stop("noise incompatible with classification") + + groups = c(groups[groups != noise], groups[groups == noise]) + noise = as.numeric(factor(as.character(noise), levels = unique(groups))) + } + + groups = as.numeric(factor(cgroups, levels = unique(cgroups))) + classification = as.numeric(factor(as.character(classification), levels = unique(cgroups))) + k = length(groups) - length(noise) + nam = levels(groups) + + if (!is.null(noise)) + { + k = k + 1 + nam = nam[1:k] + nam[k] = "noise" + } + + z = matrix(0, n, k, dimnames = c(names(classification), nam)) + for (j in 1:k) z[classification == groups[j], j] = 1 + attr(z, "levels") = levels + z + } + diff --git a/man/logratio-transformations.Rd b/man/logratio-transformations.Rd new file mode 100644 index 0000000..a230237 --- /dev/null +++ b/man/logratio-transformations.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_mixOmics.R +\name{logratio-transformations} +\alias{logratio-transformations} +\alias{logratio.transfo} +\title{Log-ratio transformation} +\usage{ +logratio.transfo(X, logratio = c("none", "CLR", "ILR"), offset = 0) +} +\arguments{ +\item{X}{numeric matrix of predictors} + +\item{logratio}{log-ratio transform to apply, one of "none", "CLR" or "ILR"} + +\item{offset}{Value that is added to X for CLR and ILR log transformation. +Default to 0.} +} +\value{ +\code{logratio.transfo} simply returns the log-ratio transformed +data. 
+} +\description{ +This function applies a log transformation to the data, either CLR or ILR +} +\details{ +\code{logratio.transfo} applies a log transformation to the data, either CLR +(centered log ratio transformation) or ILR (Isometric Log Ratio +transformation). In the case of CLR log-transformation, X needs to be a +matrix of non-negative values and \code{offset} is used to shift the values +away from 0, as commonly done with counts data. +} +\examples{ + +data(diverse.16S) +CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') +# no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC +} +\references{ +Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, +Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a +multivariate statistical framework to gain insight into Microbial +Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 + +John Aitchison. The statistical analysis of compositional data. Journal of +the Royal Statistical Society. Series B (Methodological), pages 139-177, +1982. + +Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component +analysis for compositional data with outliers. Environmetrics, +20(6):621-632, 2009. +} +\seealso{ +\code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, +\code{\link{plsda}}, \code{\link{splsda}}. +} +\author{ +Florian Rohart, Kim-Anh Lê Cao, Al J Abadi +} diff --git a/man/unmap.Rd b/man/unmap.Rd new file mode 100644 index 0000000..4afdb73 --- /dev/null +++ b/man/unmap.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_mixOmics.R +\name{unmap} +\alias{unmap} +\title{Dummy matrix for an outcome factor} +\usage{ +unmap(classification, groups = NULL, noise = NULL) +} +\arguments{ +\item{classification}{A numeric or character vector or factor. Typically the +distinct entries of this vector would represent a classification of +observations in a data set.} + +\item{groups}{A numeric or character vector indicating the groups from which +\code{classification} is drawn. If not supplied, the default is to assumed +to be the unique entries of classification.} + +\item{noise}{A single numeric or character value used to indicate the value +of \code{groups} corresponding to noise.} +} +\value{ +An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, +where \emph{n} is the length of samples and \emph{K} the number of classes +in the outcome. + +If a \code{noise} value of symbol is designated, the corresponding indicator +variables are relocated to the last column of the matrix. + +Note: - you can remap an unmap vector using the function \code{map} from the +package \pkg{mclust}. - this function should be used to unmap an outcome +vector as in the non-supervised methods of mixOmics. For other supervised +analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. +} +\description{ +Converts a class or group vector or factor into a matrix of indicator +variables. +} +\examples{ +data(nutrimouse) +Y = unmap(nutrimouse$diet) +Y +data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) +# data could then used as an input in wrapper.rgcca, which is not, technically, +# a supervised method, see ??wrapper.rgcca +} +\references{ +C. Fraley and A. E. Raftery (2002). Model-based +clustering, discriminant analysis, and density estimation. \emph{Journal of +the American Statistical Association 97:611-631}. + +C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). 
mclust Version +4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, +and Density Estimation. Technical Report No. 597, Department of Statistics, +University of Washington. +} +\author{ +Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi +} +\keyword{cluster} From 47b0abedfb9e8a5e39c1c424871e7756f58ec16e Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:31:01 -0800 Subject: [PATCH 13/19] remove links in documention --- R/predict_mixOmics.R | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 5f29025..3e55e8c 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -146,15 +146,7 @@ #' analysis (see details above).} #' @author Florian Rohart, Sébastien Déjean, Ignacio González, Kim-Anh Lê Cao, #' Al J Abadi -#' @seealso \code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, -#' \code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, -#' \code{\link{mint.plsda}}, \code{\link{mint.splsda}}, -#' \code{\link{block.pls}}, \code{\link{block.spls}}, -#' \code{\link{block.plsda}}, \code{\link{block.splsda}}, -#' \code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, -#' \code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and -#' visualisation with \code{\link{background.predict}} and -#' http://www.mixOmics.org for more details. +#' @seealso http://www.mixOmics.org for more details. #' @references #' #' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics @@ -859,11 +851,8 @@ Check.entry.single = function(X, ncomp, q) #' Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component #' analysis for compositional data with outliers. Environmetrics, #' 20(6):621-632, 2009. -#' @examples #' -#' data(diverse.16S) -#' CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') -#' # no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC +#' See mixOmics documentation for more information #' @name logratio-transformations NULL #' @rdname logratio-transformations From d94565b33e06fefe0627d31019b97cea08fa3618 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:31:15 -0800 Subject: [PATCH 14/19] fix chunk error --- vignettes/planet.Rmd | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vignettes/planet.Rmd b/vignettes/planet.Rmd index 85fa734..7c06a70 100644 --- a/vignettes/planet.Rmd +++ b/vignettes/planet.Rmd @@ -72,7 +72,7 @@ head(houseman_estimates) #### EpiDISH -```{r epidish echo=FALSE} +```{r epidish, results='hide'} # robust partial correlations epidish_RPC <- epidish( beta.m = plBetas[rownames(plCellCpGsThird), ], @@ -323,15 +323,13 @@ for classification used to assign labels is 55%; if the users wishes to use other threshold, different labels can be assigned based on the output probabilities. 
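For instance, a minimal sketch of re-labelling with a stricter cutoff (this
assumes the `preds` data frame returned by `predictPreeclampsia()` in the next
chunk, which carries an `EOPE` class probability column):

```{r relabel, eval=FALSE}
# relabel with a stricter 0.75 cutoff instead of the default 0.55
preds$Pred_Class_strict <- ifelse(preds$EOPE > 0.75, "EOPE", "Normotensive")
```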
-```{r include=TRUE, eval = TRUE} - +```{r predict_pe, include=TRUE, eval = TRUE} library(ExperimentHub) eh <- ExperimentHub() query(eh, "eoPredData") # download BMIQ normalized 450k data x_test <- eh[['EH8403']] -valMeta <- eh[['EH8404']] preds <- x_test %>% predictPreeclampsia() ``` @@ -341,6 +339,7 @@ Inspect the results: head(preds) # join with metadata +valMeta <- eh[['EH8404']] valMeta <- left_join(valMeta, preds, by="Sample_ID") # visualize results From 9f88b161b2a2effc5ca77f8ebbcb7337947e452b Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:53:05 -0800 Subject: [PATCH 15/19] fix cran check errors --- .gitignore | 1 + NAMESPACE | 6 ++---- R/predict_mixOmics.R | 15 ++------------- man/logratio-transformations.Rd | 12 ++---------- man/predict.Rd | 8 -------- man/unmap.Rd | 8 -------- 6 files changed, 7 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index f1697a9..ad3e8c4 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ data-raw/mod.rds data-raw/x_test.rds data-raw/x_train.rds data-raw/y_train.rds +R/bioc-devel.sh diff --git a/NAMESPACE b/NAMESPACE index 24492c1..6ac8431 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,15 +1,11 @@ # Generated by roxygen2: do not edit by hand -S3method(predict,mixo_pls) -S3method(predict,mixo_spls) export("%>%") -export(logratio.transfo) export(pl_infer_age) export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) -export(unmap) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,n) @@ -17,5 +13,7 @@ importFrom(dplyr,row_number) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(methods,is) +importFrom(stats,aggregate) importFrom(stats,setNames) importFrom(tibble,tibble) +importFrom(utils,data) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 3e55e8c..7ba170f 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -161,7 +161,6 @@ #' @method predict mixo_pls #' @importFrom methods hasArg is #' @importFrom stats setNames -#' @export predict.mixo_pls <- function(object, newdata, @@ -760,7 +759,6 @@ predict.mixo_pls <- #' @rdname predict #' @method predict mixo_spls -#' @export predict.mixo_spls <- predict.mixo_pls @@ -837,8 +835,6 @@ Check.entry.single = function(X, ncomp, q) #' @return \code{logratio.transfo} simply returns the log-ratio transformed #' data. #' @author Florian Rohart, Kim-Anh Lê Cao, Al J Abadi -#' @seealso \code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, -#' \code{\link{plsda}}, \code{\link{splsda}}. #' @references Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, #' Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a #' multivariate statistical framework to gain insight into Microbial @@ -856,7 +852,8 @@ Check.entry.single = function(X, ncomp, q) #' @name logratio-transformations NULL #' @rdname logratio-transformations -#' @export +#' @importFrom stats aggregate +#' @importFrom utils data logratio.transfo <- function(X, logratio = c('none','CLR','ILR'), offset = 0) @@ -1176,14 +1173,6 @@ internal_predict.DA <- #' and Density Estimation. Technical Report No. 597, Department of Statistics, #' University of Washington. 
#' @keywords cluster -#' @export -#' @examples -#' data(nutrimouse) -#' Y = unmap(nutrimouse$diet) -#' Y -#' data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) -#' # data could then used as an input in wrapper.rgcca, which is not, technically, -#' # a supervised method, see ??wrapper.rgcca unmap <- function (classification, groups = NULL, noise = NULL) { diff --git a/man/logratio-transformations.Rd b/man/logratio-transformations.Rd index a230237..dbe9c12 100644 --- a/man/logratio-transformations.Rd +++ b/man/logratio-transformations.Rd @@ -29,12 +29,6 @@ transformation). In the case of CLR log-transformation, X needs to be a matrix of non-negative values and \code{offset} is used to shift the values away from 0, as commonly done with counts data. } -\examples{ - -data(diverse.16S) -CLR = logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR') -# no offset needed here as we have put it prior to the TSS, see www.mixOmics.org/mixMC -} \references{ Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a @@ -48,10 +42,8 @@ the Royal Statistical Society. Series B (Methodological), pages 139-177, Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component analysis for compositional data with outliers. Environmetrics, 20(6):621-632, 2009. -} -\seealso{ -\code{\link{pca}}, \code{\link{pls}}, \code{\link{spls}}, -\code{\link{plsda}}, \code{\link{splsda}}. + +See mixOmics documentation for more information } \author{ Florian Rohart, Kim-Anh Lê Cao, Al J Abadi diff --git a/man/predict.Rd b/man/predict.Rd index bc64d95..88f8bff 100644 --- a/man/predict.Rd +++ b/man/predict.Rd @@ -185,14 +185,6 @@ Tenenhaus, M. (1998). \emph{La regression PLS: theorie et pratique}. Paris: Editions Technic. } \seealso{ -\code{\link{pls}}, \code{\link{spls}}, \code{\link{plsda}}, -\code{\link{splsda}}, \code{\link{mint.pls}}, \code{\link{mint.spls}}, -\code{\link{mint.plsda}}, \code{\link{mint.splsda}}, -\code{\link{block.pls}}, \code{\link{block.spls}}, -\code{\link{block.plsda}}, \code{\link{block.splsda}}, -\code{\link{mint.block.pls}}, \code{\link{mint.block.spls}}, -\code{\link{mint.block.plsda}}, \code{\link{mint.block.splsda}} and -visualisation with \code{\link{background.predict}} and http://www.mixOmics.org for more details. } \author{ diff --git a/man/unmap.Rd b/man/unmap.Rd index 4afdb73..3df86b8 100644 --- a/man/unmap.Rd +++ b/man/unmap.Rd @@ -35,14 +35,6 @@ analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. Converts a class or group vector or factor into a matrix of indicator variables. } -\examples{ -data(nutrimouse) -Y = unmap(nutrimouse$diet) -Y -data = list(gene = nutrimouse$gene, lipid = nutrimouse$lipid, Y = Y) -# data could then used as an input in wrapper.rgcca, which is not, technically, -# a supervised method, see ??wrapper.rgcca -} \references{ C. Fraley and A. E. Raftery (2002). Model-based clustering, discriminant analysis, and density estimation. 
\emph{Journal of From a47a86872cae204087e3fd900a1898bc72446d47 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Thu, 30 Jan 2025 23:59:37 -0800 Subject: [PATCH 16/19] remove documnetation and exports --- NAMESPACE | 2 - R/predict_mixOmics.R | 176 ++------------------------------ man/logratio-transformations.Rd | 50 --------- man/predict.Rd | 96 +---------------- man/unmap.Rd | 51 --------- 5 files changed, 11 insertions(+), 364 deletions(-) delete mode 100644 man/logratio-transformations.Rd delete mode 100644 man/unmap.Rd diff --git a/NAMESPACE b/NAMESPACE index 6ac8431..e40a48a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,7 +13,5 @@ importFrom(dplyr,row_number) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(methods,is) -importFrom(stats,aggregate) importFrom(stats,setNames) importFrom(tibble,tibble) -importFrom(utils,data) diff --git a/R/predict_mixOmics.R b/R/predict_mixOmics.R index 7ba170f..97b5cb8 100644 --- a/R/predict_mixOmics.R +++ b/R/predict_mixOmics.R @@ -1,100 +1,9 @@ -# copied from https://raw.githubusercontent.com/mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R -# on 2025 Jan 30. Some components omitted that are not used in planet. -# ======================================================================================================== -# This function makes a prediction of a 'newdata' by using the results of 'object'. -# Depending on the class of the object (mint).(block).(s)pls(da) (16 classes so far), the input data is different -# and the preparation of the data is different - different scaling for instance if object="mint...." -# However, the prediction formula is the same for all classes, thus only one code -# ======================================================================================================== - -#' Predict Method for (mint).(block).(s)pls(da) methods -#' -#' Predicted values based on PLS models. New responses and variates are -#' predicted using a fitted model and a new matrix of observations. -#' -#' \code{predict} produces predicted values, obtained by evaluating the -#' PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the -#' frame \code{newdata}. Variates for \code{newdata} are also returned. Please -#' note that this method performs multilevel decomposition and/or log ratio -#' transformations if needed (\code{multilevel} is an input parameter while -#' \code{logratio} is extracted from \code{object}). -#' -#' Different prediction distances are proposed for discriminant analysis. The -#' reason is that our supervised models work with a dummy indicator matrix of -#' \code{Y} to indicate the class membership of each sample. The prediction of -#' a new observation results in either a predicted dummy variable (output -#' \code{object$predict}), or a predicted variate (output -#' \code{object$variates}). Therefore, an appropriate distance needs to be -#' applied to those predicted values to assign the predicted class. We propose -#' distances such as `maximum distance' for the predicted dummy variables, -#' `Mahalanobis distance' and `Centroids distance' for the predicted variates. -#' -#' \code{"max.dist"} is the simplest method to predict the class of a test -#' sample. For each new individual, the class with the largest predicted dummy -#' variable is the predicted class. This distance performs well in single data -#' set analysis with multiclass problems (PLS-DA). 
-#' -#' \code{"centroids.dist"} allocates to the new observation the class that -#' mimimises the distance between the predicted score and the centroids of the -#' classes calculated on the latent components or variates of the trained -#' model. -#' -#' \code{"mahalanobis.dist"} allocates the new sample the class defined as the -#' centroid distance, but using the Mahalanobis metric in the calculation of -#' the distance. -#' -#' In practice we found that the centroid-based distances -#' (\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically -#' the Mahalanobis distance led to more accurate predictions than the maximum -#' distance for complex classification problems and N-integration problems -#' (block.splsda). The centroid distances consider the prediction in -#' dimensional space spanned by the predicted variates, while the maximum -#' distance considers a single point estimate using the predicted scores on the -#' last dimension of the model. The user can assess the different distances, -#' and choose the prediction distance that leads to the best performance of the -#' model, as highlighted from the tune and perf outputs -#' -#' More (mathematical) details about the prediction distances are available in -#' the supplemental of the mixOmics article (Rohart et al 2017). -#' -#' For a visualisation of those prediction distances, see -#' \code{background.predict} that overlays the prediction area in -#' \code{plotIndiv} for a sPLS-DA object. -#' -#' Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing -#' \eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are -#' the centroids of the classes calculated on the \eqn{X}-variates of the -#' model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the -#' class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis -#' metric in the calculation of the distance. -#' -#' For MINT objects, the \code{study.test} argument is required and provides -#' the grouping factor of \code{newdata}. -#' -#' For multi block analysis (thus block objects), \code{newdata} is a list of -#' matrices whose names are a subset of \code{names(object$X)} and missing -#' blocks are allowed. Several predictions are returned, either for each block -#' or for all blocks. For non discriminant analysis, the predicted values -#' (\code{predict}) are returned for each block and these values are combined -#' by average (\code{AveragedPredict}) or weighted average -#' (\code{WeightedPredict}), using the weights of the blocks that are -#' calculated as the correlation between a block's components and the outcome's -#' components. +#' predict mixxo from #' -#' For discriminant analysis, the predicted class is returned for each block -#' (\code{class}) and each distance (\code{dist}) and these predictions are -#' combined by majority vote (\code{MajorityVote}) or weighted majority vote -#' (\code{WeightedVote}), using the weights of the blocks that are calculated -#' as the correlation between a block's components and the outcome's -#' components. NA means that there is no consensus among the block. For PLS-DA -#' and sPLS-DA objects, the prediction area can be visualised in plotIndiv via -#' the \code{background.predict} function. +#' copied from mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R +#' on 2025 Jan 30. Some components omitted that are not used in planet. 
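# Illustrative sketch, not part of the patch: the MajorityVote/WeightedVote
# outputs that the removed text describes for multi-block (block.splsda)
# models; that documentation is dropped from this copy since planet does not
# need it. Assumes mixOmics and its nutrimouse example data are available.
library(mixOmics)
data(nutrimouse)
blocks <- list(gene = nutrimouse$gene, lipid = nutrimouse$lipid)
fit <- block.splsda(X = blocks, Y = nutrimouse$diet, ncomp = 2,
                    keepX = list(gene = c(10, 10), lipid = c(5, 5)))
pred <- predict(fit, newdata = blocks, dist = "centroids.dist")
pred$WeightedVote$centroids.dist   # consensus class across blocks, per component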
#' -#' @aliases predict predict.pls predict.spls predict.plsda predict.splsda -#' predict.mint.pls predict.mint.spls predict.mint.plsda predict.mint.splsda -#' predict.mint.block.pls predict.mint.block.spls predict.mint.block.plsda -#' predict.mint.block.splsda #' @param object object of class inheriting from #' \code{"(mint).(block).(s)pls(da)"}. #' @param newdata data matrix in which to look for for explanatory variables to @@ -161,6 +70,9 @@ #' @method predict mixo_pls #' @importFrom methods hasArg is #' @importFrom stats setNames +#' @examples +#' # example code +#' predict.mixo_pls <- function(object, newdata, @@ -817,43 +729,6 @@ Check.entry.single = function(X, ncomp, q) return(list(X=X, ncomp=ncomp, X.names=X.names, ind.names=ind.names)) } -# from https://github.com/mixOmicsTeam/mixOmics/blob/master/R/logratio-transformations.R -#' Log-ratio transformation -#' -#' This function applies a log transformation to the data, either CLR or ILR -#' -#' \code{logratio.transfo} applies a log transformation to the data, either CLR -#' (centered log ratio transformation) or ILR (Isometric Log Ratio -#' transformation). In the case of CLR log-transformation, X needs to be a -#' matrix of non-negative values and \code{offset} is used to shift the values -#' away from 0, as commonly done with counts data. -#' -#' @param X numeric matrix of predictors -#' @param logratio log-ratio transform to apply, one of "none", "CLR" or "ILR" -#' @param offset Value that is added to X for CLR and ILR log transformation. -#' Default to 0. -#' @return \code{logratio.transfo} simply returns the log-ratio transformed -#' data. -#' @author Florian Rohart, Kim-Anh Lê Cao, Al J Abadi -#' @references Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, -#' Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a -#' multivariate statistical framework to gain insight into Microbial -#' Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 -#' -#' John Aitchison. The statistical analysis of compositional data. Journal of -#' the Royal Statistical Society. Series B (Methodological), pages 139-177, -#' 1982. -#' -#' Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component -#' analysis for compositional data with outliers. Environmetrics, -#' 20(6):621-632, 2009. -#' -#' See mixOmics documentation for more information -#' @name logratio-transformations -NULL -#' @rdname logratio-transformations -#' @importFrom stats aggregate -#' @importFrom utils data logratio.transfo <- function(X, logratio = c('none','CLR','ILR'), offset = 0) @@ -1134,45 +1009,6 @@ internal_predict.DA <- } -# from https://github.com/mixOmicsTeam/mixOmics/blob/766521eaff07b25c9c2ceccdabdf281dd0ef735c/R/unmap.R#L47 -# --------------------------------------------------- -# unmap variates.A variable for (s)plsda -# --------------------------------------------------- -#' Dummy matrix for an outcome factor -#' -#' Converts a class or group vector or factor into a matrix of indicator -#' variables. -#' -#' @param classification A numeric or character vector or factor. Typically the -#' distinct entries of this vector would represent a classification of -#' observations in a data set. -#' @param groups A numeric or character vector indicating the groups from which -#' \code{classification} is drawn. If not supplied, the default is to assumed -#' to be the unique entries of classification. -#' @param noise A single numeric or character value used to indicate the value -#' of \code{groups} corresponding to noise. 
-#' @return An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, -#' where \emph{n} is the length of samples and \emph{K} the number of classes -#' in the outcome. -#' -#' If a \code{noise} value of symbol is designated, the corresponding indicator -#' variables are relocated to the last column of the matrix. -#' -#' Note: - you can remap an unmap vector using the function \code{map} from the -#' package \pkg{mclust}. - this function should be used to unmap an outcome -#' vector as in the non-supervised methods of mixOmics. For other supervised -#' analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. -#' @author Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi -#' @references -#' C. Fraley and A. E. Raftery (2002). Model-based -#' clustering, discriminant analysis, and density estimation. \emph{Journal of -#' the American Statistical Association 97:611-631}. -#' -#' C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). mclust Version -#' 4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, -#' and Density Estimation. Technical Report No. 597, Department of Statistics, -#' University of Washington. -#' @keywords cluster unmap <- function (classification, groups = NULL, noise = NULL) { diff --git a/man/logratio-transformations.Rd b/man/logratio-transformations.Rd deleted file mode 100644 index dbe9c12..0000000 --- a/man/logratio-transformations.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict_mixOmics.R -\name{logratio-transformations} -\alias{logratio-transformations} -\alias{logratio.transfo} -\title{Log-ratio transformation} -\usage{ -logratio.transfo(X, logratio = c("none", "CLR", "ILR"), offset = 0) -} -\arguments{ -\item{X}{numeric matrix of predictors} - -\item{logratio}{log-ratio transform to apply, one of "none", "CLR" or "ILR"} - -\item{offset}{Value that is added to X for CLR and ILR log transformation. -Default to 0.} -} -\value{ -\code{logratio.transfo} simply returns the log-ratio transformed -data. -} -\description{ -This function applies a log transformation to the data, either CLR or ILR -} -\details{ -\code{logratio.transfo} applies a log transformation to the data, either CLR -(centered log ratio transformation) or ILR (Isometric Log Ratio -transformation). In the case of CLR log-transformation, X needs to be a -matrix of non-negative values and \code{offset} is used to shift the values -away from 0, as commonly done with counts data. -} -\references{ -Kim-Anh Lê Cao, Mary-Ellen Costello, Vanessa Anne Lakis, -Francois Bartolo, Xin-Yi Chua, Remi Brazeilles, Pascale Rondeau mixMC: a -multivariate statistical framework to gain insight into Microbial -Communities bioRxiv 044206; doi: http://dx.doi.org/10.1101/044206 - -John Aitchison. The statistical analysis of compositional data. Journal of -the Royal Statistical Society. Series B (Methodological), pages 139-177, -1982. - -Peter Filzmoser, Karel Hron, and Clemens Reimann. Principal component -analysis for compositional data with outliers. Environmetrics, -20(6):621-632, 2009. 
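# Illustrative sketch, not part of the patch: the CLR log-ratio transform
# documented above, reusing the diverse.16S data from the example removed in
# this patch (assumes mixOmics is installed; per that example, no offset is
# passed because one was already applied before the TSS scaling).
library(mixOmics)
data(diverse.16S)
clr <- logratio.transfo(X = diverse.16S$data.TSS, logratio = 'CLR')
clr[1:3, 1:3]   # CLR-transformed values for the first few samples and taxa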
- -See mixOmics documentation for more information -} -\author{ -Florian Rohart, Kim-Anh Lê Cao, Al J Abadi -} diff --git a/man/predict.Rd b/man/predict.Rd index 88f8bff..0d836a2 100644 --- a/man/predict.Rd +++ b/man/predict.Rd @@ -3,20 +3,8 @@ \name{predict} \alias{predict} \alias{predict.mixo_pls} -\alias{predict.pls} -\alias{predict.spls} -\alias{predict.plsda} -\alias{predict.splsda} -\alias{predict.mint.pls} -\alias{predict.mint.spls} -\alias{predict.mint.plsda} -\alias{predict.mint.splsda} -\alias{predict.mint.block.pls} -\alias{predict.mint.block.spls} -\alias{predict.mint.block.plsda} -\alias{predict.mint.block.splsda} \alias{predict.mixo_spls} -\title{Predict Method for (mint).(block).(s)pls(da) methods} +\title{predict mixxo from} \usage{ \method{predict}{mixo_pls}( object, @@ -95,86 +83,12 @@ distance requested.} \item{vote}{majority vote result for multi block analysis (see details above).} } \description{ -Predicted values based on PLS models. New responses and variates are -predicted using a fitted model and a new matrix of observations. +copied from mixOmicsTeam/mixOmics/refs/heads/master/R/predict.R +on 2025 Jan 30. Some components omitted that are not used in planet. } -\details{ -\code{predict} produces predicted values, obtained by evaluating the -PLS-derived methods, returned by \code{(mint).(block).(s)pls(da)} in the -frame \code{newdata}. Variates for \code{newdata} are also returned. Please -note that this method performs multilevel decomposition and/or log ratio -transformations if needed (\code{multilevel} is an input parameter while -\code{logratio} is extracted from \code{object}). +\examples{ +# example code -Different prediction distances are proposed for discriminant analysis. The -reason is that our supervised models work with a dummy indicator matrix of -\code{Y} to indicate the class membership of each sample. The prediction of -a new observation results in either a predicted dummy variable (output -\code{object$predict}), or a predicted variate (output -\code{object$variates}). Therefore, an appropriate distance needs to be -applied to those predicted values to assign the predicted class. We propose -distances such as \verb{maximum distance' for the predicted dummy variables, }Mahalanobis distance' and `Centroids distance' for the predicted variates. - -\code{"max.dist"} is the simplest method to predict the class of a test -sample. For each new individual, the class with the largest predicted dummy -variable is the predicted class. This distance performs well in single data -set analysis with multiclass problems (PLS-DA). - -\code{"centroids.dist"} allocates to the new observation the class that -mimimises the distance between the predicted score and the centroids of the -classes calculated on the latent components or variates of the trained -model. - -\code{"mahalanobis.dist"} allocates the new sample the class defined as the -centroid distance, but using the Mahalanobis metric in the calculation of -the distance. - -In practice we found that the centroid-based distances -(\code{"centroids.dist"} and \code{"mahalanobis.dist"}), and specifically -the Mahalanobis distance led to more accurate predictions than the maximum -distance for complex classification problems and N-integration problems -(block.splsda). The centroid distances consider the prediction in -dimensional space spanned by the predicted variates, while the maximum -distance considers a single point estimate using the predicted scores on the -last dimension of the model. 
The user can assess the different distances, -and choose the prediction distance that leads to the best performance of the -model, as highlighted from the tune and perf outputs - -More (mathematical) details about the prediction distances are available in -the supplemental of the mixOmics article (Rohart et al 2017). - -For a visualisation of those prediction distances, see -\code{background.predict} that overlays the prediction area in -\code{plotIndiv} for a sPLS-DA object. - -Allocates the individual \eqn{x} to the class of \eqn{Y} minimizing -\eqn{dist(\code{x-variate}, G_l)}, where \eqn{G_l}, \eqn{l = 1,...,L} are -the centroids of the classes calculated on the \eqn{X}-variates of the -model. \code{"mahalanobis.dist"} allocates the individual \eqn{x} to the -class of \eqn{Y} as in \code{"centroids.dist"} but by using the Mahalanobis -metric in the calculation of the distance. - -For MINT objects, the \code{study.test} argument is required and provides -the grouping factor of \code{newdata}. - -For multi block analysis (thus block objects), \code{newdata} is a list of -matrices whose names are a subset of \code{names(object$X)} and missing -blocks are allowed. Several predictions are returned, either for each block -or for all blocks. For non discriminant analysis, the predicted values -(\code{predict}) are returned for each block and these values are combined -by average (\code{AveragedPredict}) or weighted average -(\code{WeightedPredict}), using the weights of the blocks that are -calculated as the correlation between a block's components and the outcome's -components. - -For discriminant analysis, the predicted class is returned for each block -(\code{class}) and each distance (\code{dist}) and these predictions are -combined by majority vote (\code{MajorityVote}) or weighted majority vote -(\code{WeightedVote}), using the weights of the blocks that are calculated -as the correlation between a block's components and the outcome's -components. NA means that there is no consensus among the block. For PLS-DA -and sPLS-DA objects, the prediction area can be visualised in plotIndiv via -the \code{background.predict} function. } \references{ Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics diff --git a/man/unmap.Rd b/man/unmap.Rd deleted file mode 100644 index 3df86b8..0000000 --- a/man/unmap.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict_mixOmics.R -\name{unmap} -\alias{unmap} -\title{Dummy matrix for an outcome factor} -\usage{ -unmap(classification, groups = NULL, noise = NULL) -} -\arguments{ -\item{classification}{A numeric or character vector or factor. Typically the -distinct entries of this vector would represent a classification of -observations in a data set.} - -\item{groups}{A numeric or character vector indicating the groups from which -\code{classification} is drawn. If not supplied, the default is to assumed -to be the unique entries of classification.} - -\item{noise}{A single numeric or character value used to indicate the value -of \code{groups} corresponding to noise.} -} -\value{ -An \emph{n} by \emph{K} matrix of \emph{(0,1)} indicator variables, -where \emph{n} is the length of samples and \emph{K} the number of classes -in the outcome. - -If a \code{noise} value of symbol is designated, the corresponding indicator -variables are relocated to the last column of the matrix. 
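# Illustrative sketch, not part of the patch: the 0/1 indicator matrix that
# unmap() builds from a class vector, as documented above (assumes mixOmics
# is installed; planet carries its own internal copy of unmap()).
library(mixOmics)
data(nutrimouse)
Y <- unmap(nutrimouse$diet)
dim(Y)    # 40 samples by 5 diet classes
head(Y)   # one column per class, a single 1 per row
# (s)PLS-DA fits use a dummy Y like this internally in place of the factor.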
- -Note: - you can remap an unmap vector using the function \code{map} from the -package \pkg{mclust}. - this function should be used to unmap an outcome -vector as in the non-supervised methods of mixOmics. For other supervised -analyses such as (s)PLS-DA, (s)gccaDA this function is used internally. -} -\description{ -Converts a class or group vector or factor into a matrix of indicator -variables. -} -\references{ -C. Fraley and A. E. Raftery (2002). Model-based -clustering, discriminant analysis, and density estimation. \emph{Journal of -the American Statistical Association 97:611-631}. - -C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). mclust Version -4 for R: Normal Mixture Modeling for Model-Based Clustering, Classification, -and Density Estimation. Technical Report No. 597, Department of Statistics, -University of Washington. -} -\author{ -Ignacio Gonzalez, Kim-Anh Le Cao, Pierre Monget, AL J Abadi -} -\keyword{cluster} From 098e38d8a649322713c10d2db1818cfa9ab6e92e Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 31 Jan 2025 00:02:32 -0800 Subject: [PATCH 17/19] remove deprecations --- R/planet-deprecated.R | 14 -------------- R/predictAge.R | 5 ----- R/predictEthnicity.R | 5 ----- R/predictPreeclampsia.R | 4 ++-- man/planet-deprecated.Rd | 17 ----------------- man/predictPreeclampsia.Rd | 3 +-- 6 files changed, 3 insertions(+), 45 deletions(-) delete mode 100644 man/planet-deprecated.Rd diff --git a/R/planet-deprecated.R b/R/planet-deprecated.R index 13c3583..e69de29 100644 --- a/R/planet-deprecated.R +++ b/R/planet-deprecated.R @@ -1,14 +0,0 @@ -#' Deprecated functions in \pkg{planet} -#' -#' These functions still work but will be removed (defunct) in the next version. -#' -#' \itemize{ -#' \item \code{\link{pl_infer_ethnicity}}: This function has been renamed -#' \code{\link{predictEthnicity}} -#' -#' \item \code{\link{pl_infer_age}}: This function has been renamed -#' \code{\link{predictAge}} -#' } -#' -#' @name planet-deprecated -NULL \ No newline at end of file diff --git a/R/predictAge.R b/R/predictAge.R index d5959fe..1645365 100644 --- a/R/predictAge.R +++ b/R/predictAge.R @@ -89,9 +89,4 @@ predictAge <- function(betas, type = "RPC") { as.vector() return(age) -} - -pl_infer_age <- function(betas, type = 'RPC'){ - .Deprecated('predictAge') - predictAge(betas = betas, type = type) } \ No newline at end of file diff --git a/R/predictEthnicity.R b/R/predictEthnicity.R index a8165da..860f65f 100644 --- a/R/predictEthnicity.R +++ b/R/predictEthnicity.R @@ -147,8 +147,3 @@ glmnet_softmax <- function(x, ignore_labels = FALSE) { } pclass } - -pl_infer_ethnicity <- function(betas, threshold = 0.75) { - .Deprecated("predictEthnicity") - predictEthnicity(betas = betas, threshold = threshold) -} diff --git a/R/predictPreeclampsia.R b/R/predictPreeclampsia.R index c07c037..375e4c4 100644 --- a/R/predictPreeclampsia.R +++ b/R/predictPreeclampsia.R @@ -16,7 +16,7 @@ #' #' @return produces a list with components detailed in the `mixOmics::predict` R documentation #' -#' @examples \dontrun{ +#' @examples #' #' # To predict early preeclampsia on 450k/850k samples #' @@ -28,7 +28,7 @@ #' # test object #' x_test <- eh[['EH8403']] #' x_test %>% predictPreeclampsia() -#' } +#' #' @export predictPreeclampsia #' predictPreeclampsia <- function(betas, ...){ diff --git a/man/planet-deprecated.Rd b/man/planet-deprecated.Rd deleted file mode 100644 index 8625f0d..0000000 --- a/man/planet-deprecated.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% 
Please edit documentation in R/planet-deprecated.R -\name{planet-deprecated} -\alias{planet-deprecated} -\title{Deprecated functions in \pkg{planet}} -\description{ -These functions still work but will be removed (defunct) in the next version. -} -\details{ -\itemize{ -\item \code{\link{pl_infer_ethnicity}}: This function has been renamed -\code{\link{predictEthnicity}} - -\item \code{\link{pl_infer_age}}: This function has been renamed -\code{\link{predictAge}} -} -} diff --git a/man/predictPreeclampsia.Rd b/man/predictPreeclampsia.Rd index ea612d7..e1e90a9 100644 --- a/man/predictPreeclampsia.Rd +++ b/man/predictPreeclampsia.Rd @@ -28,7 +28,6 @@ prior to prediction. This was the normalization method used on the training data } \examples{ -\dontrun{ # To predict early preeclampsia on 450k/850k samples @@ -40,5 +39,5 @@ query(eh, "eoPredData") # test object x_test <- eh[['EH8403']] x_test \%>\% predictPreeclampsia() -} + } From 474e4dbe21d2b922114a3cd1422b2cb5bdffff98 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Fri, 31 Jan 2025 08:52:18 -0800 Subject: [PATCH 18/19] remove deprecated exports --- NAMESPACE | 2 -- R/predictAge.R | 2 -- R/predictEthnicity.R | 3 --- man/predictAge.Rd | 1 - man/predictEthnicity.Rd | 1 - 5 files changed, 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e40a48a..6ce34a1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,6 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(pl_infer_age) -export(pl_infer_ethnicity) export(predictAge) export(predictEthnicity) export(predictPreeclampsia) diff --git a/R/predictAge.R b/R/predictAge.R index 1645365..58d17cf 100644 --- a/R/predictAge.R +++ b/R/predictAge.R @@ -31,8 +31,6 @@ #' mutate(inferred_ga = predictAge(plBetas, type = "RPC")) #' #' @export predictAge -#' @export pl_infer_age -#' @aliases pl_infer_age #' predictAge <- function(betas, type = "RPC") { data(ageCpGs, envir = environment()) diff --git a/R/predictEthnicity.R b/R/predictEthnicity.R index 860f65f..b7e15b3 100644 --- a/R/predictEthnicity.R +++ b/R/predictEthnicity.R @@ -30,9 +30,6 @@ #' predictEthnicity(plBetas) #' #' @export predictEthnicity -#' @export pl_infer_ethnicity -#' @aliases pl_infer_ethnicity - predictEthnicity <- function(betas, threshold = 0.75, force = FALSE) { data(ethnicityCpGs, envir=environment()) pf <- intersect(ethnicityCpGs, rownames(betas)) diff --git a/man/predictAge.Rd b/man/predictAge.Rd index c9085f5..fca97d2 100644 --- a/man/predictAge.Rd +++ b/man/predictAge.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/predictAge.R \name{predictAge} \alias{predictAge} -\alias{pl_infer_age} \title{Predicts gestational age using placental DNA methylation microarray data} \usage{ diff --git a/man/predictEthnicity.Rd b/man/predictEthnicity.Rd index 289882d..7ffe88f 100644 --- a/man/predictEthnicity.Rd +++ b/man/predictEthnicity.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/predictEthnicity.R \name{predictEthnicity} \alias{predictEthnicity} -\alias{pl_infer_ethnicity} \title{Predicts ethnicity using placental DNA methylation microarray data} \usage{ predictEthnicity(betas, threshold = 0.75, force = FALSE) From b64fbf934164056c1d708598f5b4fb807a7308e1 Mon Sep 17 00:00:00 2001 From: Victor Yuan Date: Sat, 1 Feb 2025 08:59:36 -0800 Subject: [PATCH 19/19] remove eopreddata warning --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9f2b89e..25a326e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,8 +22,7 @@ Imports: methods, tibble, magrittr, - 
dplyr, - eoPredData + dplyr Suggests: ExperimentHub, mixOmics,