expect_classifiers.qmd

---
title: "The Early Prediction of prEgnancy Complications Testing or ExPECT study"
subtitle: "Machine learning classification"
author: "Bram Van Gaever"

output-dir: quarto_output
format: html
code-fold: true
warning: false
message: false

toc: true
toc-expand: 2
toc-location: left
---

# Introduction

This notebook contains the code for the creation of the machine learning classifiers of the ExPECT study. The code is kept separate mainly for readability and to keep the code in check.

```{r}
#| label: "libs_and_utils"
#| output: false
# Chunk containing the used libraries, colors and some utility functions

# Fix the random seed so steps that draw random numbers are reproducible
set.seed(2341)

library(readxl)
library(limma)
library(bsseq)
library(RColorBrewer)
library(Rtsne)
library(pheatmap)
library(GenomicRanges)
library(biomaRt)
library(HDF5Array)
library(kableExtra)
library(patchwork)
library(ggrepel)
library(ggbeeswarm)
library(seqsetvis)
library(biomaRt)
library(fuzzyjoin)
library(ggpubr)
library(tidyverse)
library(tidymodels)
library(themis)


# Hex codes for the plotting colors; names ending in "2" form a second set
Green <- "#00A08A"
Orange <- "#F98300"
Blue <- "#5BBCD6"
Purple <- "#BF8AC2"
Brown <- "#B86D33"
Orange2 <- "#CC9F68"
Green2 <- "#7BB6AE"
Blue2 <- "#00778A"
Purple2 <- "#741078"

# Palette indexed by position in the plots (first five colors, then the second set)
color_palette <- c(
  Green, Orange, Blue, Purple, Brown,
  Green2, Purple2, Blue2, Orange2
)

# 1000-step gradient running from Blue through white to Purple
make_gradient <- colorRampPalette(c(Blue, "#FFFFFF", Purple))
gradient <- make_gradient(1000)

# "not in" operator: the logical complement of %in%
`%nin%` <- function(x, table) !(x %in% table)

# Ranges to id function
# Converts a GRanges object into a character vector of "seqname:start-end" ids,
# where end is the inclusive end coordinate (start + width - 1).
# Uses the public accessors instead of reading S4 slots directly and returns
# the vector visibly.
ranges2id <- function(genomicRanges) {
  paste0(
    as.character(seqnames(genomicRanges)), ":",
    start(genomicRanges), "-",
    end(genomicRanges)
  )
}
# Id to ranges function
# Converts a character vector of "seqname:start-end" ids back into a GRanges
# object and returns it visibly.
# NOTE: ids are split on every ":" or "-", so sequence names that themselves
# contain one of these characters will not be parsed correctly.
id2ranges <- function(idVector) {
  # convert = TRUE lets separate() turn the start/end pieces into numbers
  idTib <- tibble(value = as.character(idVector)) %>%
    separate(value, into = c("seqnames", "start", "end"), sep = "[:-]", convert = TRUE)

  GRanges(
    seqnames = idTib$seqnames,
    ranges = IRanges(start = idTib$start, end = idTib$end)
  )
}

# Limma model fitting and result gathering function
# Fits a linear model to `matrix` with the given `design`, applies empirical
# Bayes moderation and collects, for every design column except the first, the
# topTable() rows with a BH-adjusted p-value below 0.01.
# Returns a list of data frames named after those design columns.
train_limma <- function(matrix, design) {
  moderated_fit <- eBayes(lmFit(matrix, design = design))

  # The first design column is skipped (no table is produced for it)
  coef_names <- colnames(design)[-1]

  hits <- list()
  for (coef_name in coef_names) {
    hits[[coef_name]] <- topTable(
      moderated_fit,
      coef = coef_name,
      number = Inf,
      adjust.method = "BH",
      p.value = 0.01
    )
  }
  return(hits)
}

# Draw two ROC curves in a single panel, each annotated with its AUC.
#
# fit_prediction, test_predictions: data frames holding predictions.
# fit_truth, test_truth: name (string) of the column with the true class.
# fit_control, test_control: name (string) of the class-probability column
#   handed to yardstick as the estimate.
# The "fit" curve and label use color_palette[1], the "test" curve and label
# use color_palette[2]. Returns a ggplot object.
plot_roc <- function(fit_prediction, fit_truth, fit_control, test_predictions, test_truth, test_control) {
  # Column names arrive as strings, so select them explicitly with all_of()
  fit_auc <- roc_auc(fit_prediction, all_of(fit_truth), all_of(fit_control))
  test_auc <- roc_auc(test_predictions, all_of(test_truth), all_of(test_control))

  fit_roc <- roc_curve(fit_prediction, all_of(fit_truth), all_of(fit_control))
  test_roc <- roc_curve(test_predictions, all_of(test_truth), all_of(test_control))

  fit_label <- paste0("AUC:", round(fit_auc$.estimate, 4))
  test_label <- paste0("AUC:", round(test_auc$.estimate, 4))

  ggplot(mapping = aes(x = 1 - specificity, y = sensitivity)) +
    # linewidth replaces the deprecated `size` argument for line geoms
    geom_path(data = fit_roc, color = color_palette[1], linewidth = 1.5) +
    geom_path(data = test_roc, color = color_palette[2], linewidth = 1.5) +
    geom_abline(lty = 3) +
    # Fixed-position labels are annotations, not data mapped through aes()
    ggplot2::annotate("text", x = 0.30, y = 0.80, label = fit_label,
                      color = color_palette[1], size = 4) +
    ggplot2::annotate("text", x = 0.80, y = 0.30, label = test_label,
                      color = color_palette[2], size = 4) +
    coord_equal() +
    theme_classic()
}

#Set metric types to be collected
# event_level = "second" makes the second factor level of the truth column the
# event of interest (presumably "PE", given the Control/PE categories used
# below -- confirm the factor level order)
sensPE <- metric_tweak("sensPE", sensitivity, event_level = "second")
specPE <- metric_tweak("specPE", specificity, event_level = "second")
# Metric set passed to tune_grid(), fit_resamples() and last_fit() further down
cls_metrics <- metric_set(sensPE, specPE, accuracy)

```

# Data loading and prepping

Data is loaded in the same manner as for the exploratory analysis. However this time, additional splits are made. The symptomatic and pre-symptomatic data sets are divided into a training and test set (each containing an equal proportion of cases and control samples). The training set is used for DMR identification and model training while the test sets are used for model validation.

```{r}
#| label: data_loading 
# NOTE(review): hard-coded, machine-specific working directory; every relative
# path in this notebook resolves against it
setwd("~/Drive/ExPECT paper Symptomatic analysis/Code/Git_repo")

# bsseq object stored as an HDF5-backed SummarizedExperiment
bs_expect <- loadHDF5SummarizedExperiment(dir = "Data/Methylation/Total")

# Study annotation and the table listing the longitudinal samples
annotation <- read_csv("Data/expect_annotation.csv")
long_samples <- read_table("Data/longitudinal_samples.txt")

# Annotation subsets used in the rest of the notebook:
#  - longitudinal samples
#  - samples taken before 14 weeks of gestation (pre-symptomatic set)
#  - samples of the "Symptomatic" group that are not longitudinal samples
long_annotation <- filter(annotation, Sample_id %in% long_samples$Sample_id)
presympt_annotation <- filter(annotation, Gest_age_weeks < 14)
sympt_annotation <- filter(
  annotation,
  Group == "Symptomatic",
  Sample_id %nin% long_annotation$Sample_id
)
```

The presymptomatic set is first divided into a training and a test set, after which DMRs are recalculated using `dmrFinder` on the samples included in the training set alone. A set of 50 DMRs is then selected for further modelling to limit overfitting of the models. However, because our cohort is fairly limited, some overfitting is impossible to prevent.

```{r}
#| label: prep_presympt
#| output: false

#Prepare a dataset containing the pre-symptomatic samples
# Keep only annotated samples that are present in the bsseq object; the bsseq
# subset and the annotation end up in the same sample order
sample_select <- presympt_annotation$Sample_id[presympt_annotation$Sample_id %in% rownames(pData(bs_expect))]
presympt_annotation <- filter(presympt_annotation, Sample_id %in% sample_select)
bs_presympt <- bs_expect[, sample_select]

# Attach the study annotation to the sample metadata
# NOTE(review): this join uses every column shared by both tables; confirm that
# Sample_id is the only intended key
pData(bs_presympt) <- pData(bs_presympt) %>% as_tibble(rownames = NA) %>%
  rownames_to_column(var = "Sample_id") %>%
  left_join(presympt_annotation)
rownames(pData(bs_presympt)) <- bs_presympt$Sample_id

# Keep positions with a coverage of at least 2 in at least 2 samples of each category
cov_presympt <- getCoverage(bs_presympt, type = "Cov", what = "perBase")
keep <- which(rowSums(cov_presympt[, bs_presympt$Category == "Control"] >= 2) >= 2 &
                rowSums(cov_presympt[, bs_presympt$Category == "PE"] >= 2) >= 2)
bs_presympt_filtered <- bs_presympt[keep, ]

#Create a training and test set
set.seed(2341)
presympt_data_split <- initial_split(presympt_annotation, strata = Category)
presympt_train <- training(presympt_data_split)
presympt_test <- testing(presympt_data_split)

#Calculate DMRs on the training set
pe_ids <- presympt_train[presympt_train$Category == "PE", ]$Sample_id
control_ids <- presympt_train[presympt_train$Category == "Control", ]$Sample_id

presympt_t_stat <- BSmooth.tstat(bs_presympt_filtered,
                                 group1 = pe_ids,
                                 group2 = control_ids,
                                 estimate.var = "group2",
                                 local.correct = TRUE,
                                 verbose = TRUE)

dmr_presympt_finder_train <- dmrFinder(presympt_t_stat, qcutoff = c(0.01, 0.99))
dmr_presympt_finder_train_filtered <- subset(dmr_presympt_finder_train, n >= 3 & abs(meanDiff) >= 0.1)

# Use at most the first 50 filtered DMRs. head() is used because `[1:50, ]`
# silently produces NA rows when fewer than 50 regions pass the filter.
presympt_model_dmrs <- head(dmr_presympt_finder_train_filtered, 50)

#Get methylation values for the testing and training set
# Rows are samples after t(); the annotation is bound column-wise, which relies
# on the annotation rows being in the same order as the bsseq samples
model_meth_presympt <- getMeth(bs_presympt_filtered, what = "perRegion", type = "smooth", regions = presympt_model_dmrs) %>%
  t(.) %>%
  as_tibble(.) %>%
  bind_cols(dplyr::select(presympt_annotation, Category, Sample_id)) %>%
  mutate(Category = as.factor(Category)) %>%
  relocate(Category)

# Replace the annotation inside the split object by the modelling data, so that
# training()/testing() return methylation values for the same samples
presympt_data_split$data <- presympt_data_split$data %>%
  dplyr::select(Sample_id) %>%
  left_join(model_meth_presympt, by = "Sample_id") %>%
  dplyr::select(-Sample_id)
presympt_train_data <- training(presympt_data_split)
presympt_test_data <- testing(presympt_data_split)

#Set validation folds
presympt_folds <- vfold_cv(presympt_train_data, v = 5, strata = Category)
presympt_loo_folds <- vfold_cv(presympt_train_data, v = nrow(presympt_train_data)) #using a workaround here to get loocv as it's not really supported by tidymodels

#Prepare a dataset of the symptomatic samples to test the pre-symptomatic model on
bs_sympt <- bs_expect[, sympt_annotation$Sample_id]

test_set_sympt <- getMeth(bs_sympt, type = "smooth", what = "perRegion", regions = presympt_model_dmrs) %>%
  t(.) %>%
  as_tibble(.) %>%
  bind_cols(dplyr::select(sympt_annotation, Category)) %>%
  mutate(Category = as.factor(Category)) %>%
  relocate(Category)
```

The same approach is used for the symptomatic cohort. Here, the top 25 most significant DMRs are used for model creation.

```{r}
# Attach the study annotation to the sample metadata of the symptomatic set
# NOTE(review): this join uses every column shared by both tables; confirm that
# Sample_id is the only intended key
pData(bs_sympt) <- pData(bs_sympt) %>% as_tibble(rownames = NA) %>%
  rownames_to_column(var = "Sample_id") %>%
  left_join(sympt_annotation)
rownames(pData(bs_sympt)) <- bs_sympt$Sample_id

# Keep positions with a coverage of at least 2 in at least 2 samples of each category
cov_sympt <- getCoverage(bs_sympt, type = "Cov", what = "perBase")
keep <- which(rowSums(cov_sympt[, bs_sympt$Category == "Control"] >= 2) >= 2 &
                rowSums(cov_sympt[, bs_sympt$Category == "PE"] >= 2) >= 2)
bs_sympt_filtered <- bs_sympt[keep, ]

#Create a training and test set
set.seed(2341)
sympt_data_split <- initial_split(sympt_annotation, strata = Category)
sympt_train <- training(sympt_data_split)
sympt_test <- testing(sympt_data_split)

#Calculate DMRs on the training set
pe_ids_sympt <- sympt_train[sympt_train$Category == "PE", ]$Sample_id
control_ids_sympt <- sympt_train[sympt_train$Category == "Control", ]$Sample_id

sympt_t_stat <- BSmooth.tstat(bs_sympt_filtered,
                              group1 = pe_ids_sympt,
                              group2 = control_ids_sympt,
                              estimate.var = "group2",
                              local.correct = TRUE,
                              verbose = TRUE)
dmr_sympt_finder_train <- dmrFinder(sympt_t_stat, qcutoff = c(0.01, 0.99))
dmr_sympt_finder_train_filtered <- subset(dmr_sympt_finder_train, n >= 3 & abs(meanDiff) >= 0.1)

# Use at most the first 25 filtered DMRs. head() is used because `[1:25, ]`
# silently produces NA rows when fewer than 25 regions pass the filter.
sympt_model_dmrs <- head(dmr_sympt_finder_train_filtered, 25)

# Rows are samples after t(); the annotation is bound column-wise, which relies
# on the annotation rows being in the same order as the bsseq samples
model_meth_sympt <- getMeth(bs_sympt_filtered, what = "perRegion", type = "smooth", regions = sympt_model_dmrs) %>%
  t(.) %>%
  as_tibble(.) %>%
  bind_cols(dplyr::select(sympt_annotation, Category, Sample_id)) %>%
  mutate(Category = as.factor(Category)) %>%
  relocate(Category)

# Replace the annotation inside the split object by the modelling data; from
# here on sympt_train/sympt_test hold methylation values, not annotation rows
sympt_data_split$data <- sympt_data_split$data %>%
  dplyr::select(Sample_id) %>%
  left_join(model_meth_sympt, by = "Sample_id") %>%
  dplyr::select(-Sample_id)
sympt_train <- training(sympt_data_split)
sympt_test <- testing(sympt_data_split)

#Set validation folds
sympt_folds <- vfold_cv(sympt_train, v = 5, strata = Category)
sympt_loo_folds <- vfold_cv(sympt_train, v = nrow(sympt_train)) #using a workaround here to get loocv as it's not really supported by tidymodels

# Pre-symptomatic samples measured on the symptomatic DMRs, used to test the
# symptomatic models on the other cohort
test_set_presympt <- getMeth(bs_presympt, type = "smooth", what = "perRegion", regions = sympt_model_dmrs) %>%
  t(.) %>%
  as_tibble(.) %>%
  bind_cols(dplyr::select(presympt_annotation, Category)) %>%
  mutate(Category = as.factor(Category)) %>%
  relocate(Category)
```

# Machine learning

Three types of classical machine learning classifiers are tested on the dataset: a random forest classifier, a support vector machine and a logistic regression model. Each model type is trained on the training set of each cohort and then tested on its respective test set as well as on the entire other cohort. For example, a random forest classifier is trained on the training set of the symptomatic cohort, and the obtained model is tested on the test set of the symptomatic cohort as well as on the full presymptomatic cohort. For the test on the other cohort, the tuned model is refit on all samples of its own cohort before predicting.

In the following section the terms "Diagnostic" and "Predictive" model are used for models trained on symptomatic and pre-symptomatic data respectively. In ROC plots the green curve shows testing results on the testing set from the same cohort as the training data (intended use) while the orange curve shows results from testing with the other cohort.

## Random forest classifiers

A random forest classifier from the `ranger` package is built. Hyperparameter tuning is first performed on the number of trees, the number of randomly selected predictors and the minimal node size. More info on random forest classifiers can be found on [Wikipedia](https://en.wikipedia.org/wiki/Random_forest).

::: panel-tabset
### Diagnostic model

```{r}
#| label: sympt_rf

#Set a sampling recipe for the symptomatic data
# step_rose() is applied to Category; this recipe is reused by the diagnostic
# SVM and regression workflows
sampling_rec <- recipe(Category ~ ., data = sympt_train) %>% step_rose(Category)

#make a rf model
rf_mod <- rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")

#make a workflow for resampling
rf_wf_sympt <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(sampling_rec)

#Make a grid for a parameter search
# mtry() has no upper bound until it is finalized on the predictor columns.
# Passing the resampling object here would cap mtry at the number of columns of
# that object instead of the number of predictors.
rf_grid <- grid_regular(
  trees(),
  finalize(mtry(), x = dplyr::select(sympt_train, -Category)),
  min_n(),
  levels = 5
)

#fit the resamplings
sympt_rf_tune <- rf_wf_sympt %>%
  tune_grid(resamples = sympt_folds, grid = rf_grid)

#get the best model
sympt_best_rf <- sympt_rf_tune %>%
  select_best(metric = "accuracy")

#make the final fit and evaluate the test set
sympt_final_rf_wf <- rf_wf_sympt %>%
  finalize_workflow(sympt_best_rf)

sympt_rf_resample_fit <- fit_resamples(sympt_final_rf_wf, resamples = sympt_folds, metrics = cls_metrics)

# Fit with the default metrics; its predictions feed the ROC curve
sympt_final_rf_fit <- sympt_final_rf_wf %>%
  last_fit(sympt_data_split)

#Get metrics
# NOTE(review): this is a second, separate last_fit(); the ROC curve and the
# metrics table therefore come from two independent fits
sympt_rf_metrics <- sympt_final_rf_wf %>%
  last_fit(sympt_data_split, metrics = cls_metrics) %>% collect_metrics()

#Test the model on the pre-symptomatic data
# The workflow is refit on all symptomatic samples (training and test set)
sympt_rf_fit <- sympt_final_rf_wf %>% fit(select(model_meth_sympt, -Sample_id))
presympt_rf_predictions <- predict(sympt_rf_fit, test_set_presympt, type = "prob") %>%
  bind_cols(Truth = test_set_presympt$Category)

plot_roc(sympt_final_rf_fit$.predictions[[1]], "Category", ".pred_Control",
         presympt_rf_predictions, "Truth", ".pred_Control")

```

The best performing random forest model has the following parameters `r kable(select(sympt_best_rf, -.config))`

### Predictive model

```{r}
#| label: presympt_rf

# Sampling recipe for the pre-symptomatic data; step_rose() is applied to
# Category. Reused by the predictive SVM and regression workflows.
sampling_rec_presympt <- recipe(Category ~ ., data = presympt_train_data) %>% step_rose(Category)

rf_wf_presympt <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(sampling_rec_presympt)

# mtry() has no upper bound until it is finalized on the predictor columns.
# Passing the resampling object here would cap mtry at the number of columns of
# that object instead of the number of predictors.
rf_grid <- grid_regular(
  trees(),
  finalize(mtry(), x = dplyr::select(presympt_train_data, -Category)),
  min_n(),
  levels = 5
)

#fit the pre symptomatic resamplings
presympt_rf_tune <- rf_wf_presympt %>%
  tune_grid(resamples = presympt_folds, grid = rf_grid)

#get the best model
presympt_best_rf <- presympt_rf_tune %>%
  select_best(metric = "accuracy")

#make the final fit and evaluate the test set
presympt_final_rf_wf <- rf_wf_presympt %>%
  finalize_workflow(presympt_best_rf)

presympt_rf_resample_fit <- fit_resamples(presympt_final_rf_wf, resamples = presympt_folds, metrics = cls_metrics)

# Fit with the default metrics; its predictions feed the ROC curve
presympt_final_rf_fit <- presympt_final_rf_wf %>%
  last_fit(presympt_data_split)

#Get metrics
# NOTE(review): this is a second, separate last_fit(); the ROC curve and the
# metrics table therefore come from two independent fits
presympt_rf_metrics <- presympt_final_rf_wf %>%
  last_fit(presympt_data_split, metrics = cls_metrics) %>% collect_metrics()

#Test the model on the symptomatic data
# The workflow is refit on all pre-symptomatic samples (training and test set)
presympt_rf_fit <- presympt_final_rf_wf %>% fit(select(model_meth_presympt, -Sample_id))
sympt_rf_predictions <- predict(presympt_rf_fit, test_set_sympt, type = "prob") %>%
  bind_cols(Truth = test_set_sympt$Category)

plot_roc(presympt_final_rf_fit$.predictions[[1]], "Category", ".pred_Control",
         sympt_rf_predictions, "Truth", ".pred_Control")
```

The best performing random forest model has the following parameters `r kable(select(presympt_best_rf, -.config))`
:::

## Support vector machine

For the support vector machine, the `kernlab` package is used in classification mode. The only hyperparameter that is tuned is the **cost** parameter. More info on support vector machines can be read [here](https://en.wikipedia.org/wiki/Support_vector_machine).

::: panel-tabset
### Diagnostic model

```{r}
#| label: sympt_svm

# Linear SVM specification (kernlab engine, classification) with a tunable cost
svm_mod <- set_mode(
  set_engine(svm_linear(cost = tune()), "kernlab"),
  "classification"
)

# Workflow: SVM model plus the symptomatic sampling recipe
svm_wf_sympt <- add_recipe(add_model(workflow(), svm_mod), sampling_rec)

# Regular grid with five levels of the cost parameter
svm_grid <- grid_regular(cost(), levels = 5)

# Grid search over the symptomatic cross-validation folds
svm_tune <- tune_grid(
  svm_wf_sympt,
  resamples = sympt_folds,
  grid = svm_grid,
  metrics = cls_metrics
)

# Pick the cost value with the best accuracy and lock it into the workflow
sympt_best_svm <- select_best(svm_tune, metric = "accuracy")
sympt_final_svm_wf <- finalize_workflow(svm_wf_sympt, sympt_best_svm)

# Resampled performance of the finalized workflow
sympt_svm_resample_fit <- fit_resamples(
  sympt_final_svm_wf,
  resamples = sympt_folds,
  metrics = cls_metrics
)

# Fit on the training set and evaluate on the test set (default metrics);
# the predictions of this fit are used for the ROC curve
sympt_final_svm_fit <- last_fit(sympt_final_svm_wf, sympt_data_split)

# Second last_fit() collecting sensitivity, specificity and accuracy
sympt_svm_metrics <- collect_metrics(
  last_fit(sympt_final_svm_wf, sympt_data_split, metrics = cls_metrics)
)

# Refit on all symptomatic samples and predict the pre-symptomatic cohort
sympt_svm_fit <- fit(sympt_final_svm_wf, select(model_meth_sympt, -Sample_id))
presympt_svm_predictions <- bind_cols(
  predict(sympt_svm_fit, test_set_presympt, type = "prob"),
  Truth = test_set_presympt$Category
)

plot_roc(
  sympt_final_svm_fit$.predictions[[1]], "Category", ".pred_Control",
  presympt_svm_predictions, "Truth", ".pred_Control"
)

```

The best performing support vector machine has the following parameters `r kable(select(sympt_best_svm, -.config))`

### Predictive model

```{r}
#| label: presympt_svm

# Workflow: the SVM specification (svm_mod, defined in the diagnostic SVM chunk)
# plus the pre-symptomatic sampling recipe
svm_wf_presympt <- workflow() %>%
  add_model(svm_mod) %>%
  add_recipe(sampling_rec_presympt)

#Fit the resamples and do a grid search for pre-symptomatic samples
presympt_svm_tune <- svm_wf_presympt %>%
  tune_grid(resamples = presympt_folds, grid = svm_grid, metrics = cls_metrics)

# Select the best cost from the pre-symptomatic tuning results. The original
# code read `svm_tune`, i.e. the tuning results of the symptomatic model.
presympt_best_svm <- presympt_svm_tune %>%
  select_best(metric = "accuracy")

presympt_final_svm_wf <- svm_wf_presympt %>%
  finalize_workflow(presympt_best_svm)

presympt_svm_resample_fit <- fit_resamples(presympt_final_svm_wf, resamples = presympt_folds, metrics = cls_metrics)

# Fit with the default metrics; its predictions feed the ROC curve
presympt_final_svm_fit <- presympt_final_svm_wf %>%
  last_fit(presympt_data_split)

#Get metrics
presympt_svm_metrics <- presympt_final_svm_wf %>%
  last_fit(presympt_data_split, metrics = cls_metrics) %>% collect_metrics()

#Test the model on the symptomatic data
# The workflow is refit on all pre-symptomatic samples (training and test set)
presympt_svm_fit <- presympt_final_svm_wf %>% fit(select(model_meth_presympt, -Sample_id))
sympt_svm_predictions <- predict(presympt_svm_fit, test_set_sympt, type = "prob") %>%
  bind_cols(Truth = test_set_sympt$Category)

plot_roc(presympt_final_svm_fit$.predictions[[1]], "Category", ".pred_Control",
         sympt_svm_predictions, "Truth", ".pred_Control")
```

The best performing support vector machine has the following parameters `r kable(select(presympt_best_svm, -.config))`
:::

## Regression model

A logistic regression model is created with elastic net regularisation. The model family is set to binomial and the `glmnet` package is used for modelling. Both the model penalty and the regularisation mixture are tuned during hyperparameter tuning. More about logistic regression can be read [here](https://en.wikipedia.org/wiki/Logistic_regression).

::: panel-tabset
### Diagnostic model

```{r}
#| label: sympt_reg

# Elastic net logistic regression specification (glmnet engine, binomial
# family) with tunable penalty and mixture
reg_mod <- set_mode(
  set_engine(
    logistic_reg(penalty = tune(), mixture = tune()),
    "glmnet",
    family = "binomial"
  ),
  "classification"
)

# Workflow: regression model plus the symptomatic sampling recipe
reg_wf_sympt <- add_recipe(add_model(workflow(), reg_mod), sampling_rec)

# Regular grid with five levels each for penalty and mixture
reg_grid <- grid_regular(penalty(), mixture(), levels = 5)

# Grid search over the symptomatic cross-validation folds
sympt_reg_tune <- tune_grid(
  reg_wf_sympt,
  resamples = sympt_folds,
  grid = reg_grid,
  metrics = cls_metrics
)

# Pick the parameters with the best accuracy and lock them into the workflow
sympt_best_reg <- select_best(sympt_reg_tune, metric = "accuracy")
sympt_final_reg_wf <- finalize_workflow(reg_wf_sympt, sympt_best_reg)

# Resampled performance of the finalized workflow
sympt_reg_resample_fit <- fit_resamples(
  sympt_final_reg_wf,
  resamples = sympt_folds,
  metrics = cls_metrics
)

# Fit on the training set and evaluate on the test set (default metrics);
# the predictions of this fit are used for the ROC curve
sympt_final_reg_fit <- last_fit(sympt_final_reg_wf, sympt_data_split)

# Second last_fit() collecting sensitivity, specificity and accuracy
sympt_reg_metrics <- collect_metrics(
  last_fit(sympt_final_reg_wf, sympt_data_split, metrics = cls_metrics)
)

# Refit on all symptomatic samples and predict the pre-symptomatic cohort
sympt_reg_fit <- fit(sympt_final_reg_wf, select(model_meth_sympt, -Sample_id))
presympt_reg_predictions <- bind_cols(
  predict(sympt_reg_fit, test_set_presympt, type = "prob"),
  Truth = test_set_presympt$Category
)

plot_roc(
  sympt_final_reg_fit$.predictions[[1]], "Category", ".pred_Control",
  presympt_reg_predictions, "Truth", ".pred_Control"
)

```

The best performing regression model has the following parameters `r kable(select(sympt_best_reg, -.config))`

### Predictive model

```{r}
#| label: presympt_reg

# Workflow: the regression specification (reg_mod) plus the pre-symptomatic
# sampling recipe
reg_wf_presympt <- add_recipe(add_model(workflow(), reg_mod), sampling_rec_presympt)

# Grid search over the pre-symptomatic cross-validation folds
presympt_reg_tune <- tune_grid(
  reg_wf_presympt,
  resamples = presympt_folds,
  grid = reg_grid,
  metrics = cls_metrics
)

# Pick the parameters with the best accuracy and lock them into the workflow
presympt_best_reg <- select_best(presympt_reg_tune, metric = "accuracy")
presympt_final_reg_wf <- finalize_workflow(reg_wf_presympt, presympt_best_reg)

# Resampled performance of the finalized workflow
presympt_reg_resample_fit <- fit_resamples(
  presympt_final_reg_wf,
  resamples = presympt_folds,
  metrics = cls_metrics
)

# Fit on the training set and evaluate on the test set (default metrics);
# the predictions of this fit are used for the ROC curve
presympt_final_reg_fit <- last_fit(presympt_final_reg_wf, presympt_data_split)

# Second last_fit() collecting sensitivity, specificity and accuracy
presympt_reg_metrics <- collect_metrics(
  last_fit(presympt_final_reg_wf, presympt_data_split, metrics = cls_metrics)
)

# Refit on all pre-symptomatic samples and predict the symptomatic cohort
presympt_reg_fit <- fit(presympt_final_reg_wf, select(model_meth_presympt, -Sample_id))
sympt_reg_predictions <- bind_cols(
  predict(presympt_reg_fit, test_set_sympt, type = "prob"),
  Truth = test_set_sympt$Category
)

plot_roc(
  presympt_final_reg_fit$.predictions[[1]], "Category", ".pred_Control",
  sympt_reg_predictions, "Truth", ".pred_Control"
)
```

The best performing regression model has the following parameters `r kable(select(presympt_best_reg, -.config))`
:::

The following table contains specificity and sensitivity metrics for each of the created models (tested on their respective test set). Most models overclassify samples as controls; symptomatic models are more powerful at differentiating than pre-symptomatic models.

```{r}
#| label: tbl-metrics

# Extract one metric by name from a collect_metrics() result.
# Looking the value up by name (instead of by row position) keeps the table
# correct if the row order of the metrics ever changes; vapply() below fails
# loudly when a metric is missing or duplicated.
get_metric <- function(metrics, metric_name) {
  metrics$.estimate[metrics$.metric == metric_name]
}

# Test-set metrics of all models, in the row order of the table
model_metrics <- list(
  sympt_rf_metrics, presympt_rf_metrics,
  sympt_svm_metrics, presympt_svm_metrics,
  sympt_reg_metrics, presympt_reg_metrics
)

total_metrics <- tibble(
  Model = rep(c("Random forest", "Support Vector Machine", "Logistic regression"), each = 2),
  Dataset = rep(c("Symptomatic", "Pre-symptomatic"), times = 3),
  Sensitivity = vapply(model_metrics, get_metric, numeric(1), metric_name = "sensPE"),
  Specificity = vapply(model_metrics, get_metric, numeric(1), metric_name = "specPE")
)

kable(total_metrics)
```

```{r}
#| include: false
#| label: fig-roc_paper

#Make combined roc curve plots
# Naming used below: "*_train" objects hold the last_fit() predictions on the
# test set of the model's own cohort, "*_pred" objects hold the predictions of
# the model on the other cohort.

#Symptomatic models

roc_reg_symp_train <- roc_curve(sympt_final_reg_fit$.predictions[[1]], Category, .pred_Control)
auc_reg_symp_train <- roc_auc(sympt_final_reg_fit$.predictions[[1]], Category, .pred_Control)
roc_svm_symp_train <- roc_curve(sympt_final_svm_fit$.predictions[[1]], Category, .pred_Control)
auc_svm_symp_train <- roc_auc(sympt_final_svm_fit$.predictions[[1]], Category, .pred_Control)
roc_rf_symp_train <- roc_curve(sympt_final_rf_fit$.predictions[[1]], Category, .pred_Control)
auc_rf_symp_train <- roc_auc(sympt_final_rf_fit$.predictions[[1]], Category, .pred_Control)

#Symptomatic models predicting presympt

roc_reg_symp_pred <- roc_curve(presympt_reg_predictions, Truth, .pred_Control)
auc_reg_symp_pred <- roc_auc(presympt_reg_predictions, Truth, .pred_Control)
roc_svm_symp_pred <- roc_curve(presympt_svm_predictions, Truth, .pred_Control)
auc_svm_symp_pred <- roc_auc(presympt_svm_predictions, Truth, .pred_Control)
roc_rf_symp_pred <- roc_curve(presympt_rf_predictions, Truth, .pred_Control)
auc_rf_symp_pred <- roc_auc(presympt_rf_predictions, Truth, .pred_Control)

#Presympt models
roc_reg_presymp_train <- roc_curve(presympt_final_reg_fit$.predictions[[1]], Category, .pred_Control)
auc_reg_presymp_train <- roc_auc(presympt_final_reg_fit$.predictions[[1]], Category, .pred_Control)
roc_svm_presymp_train <- roc_curve(presympt_final_svm_fit$.predictions[[1]], Category, .pred_Control)
auc_svm_presymp_train <- roc_auc(presympt_final_svm_fit$.predictions[[1]], Category, .pred_Control)
roc_rf_presymp_train <- roc_curve(presympt_final_rf_fit$.predictions[[1]], Category, .pred_Control)
auc_rf_presymp_train <- roc_auc(presympt_final_rf_fit$.predictions[[1]], Category, .pred_Control)

#Presympt models predicting sympt
roc_reg_presymp_pred <- roc_curve(sympt_reg_predictions, Truth, .pred_Control)
auc_reg_presymp_pred <- roc_auc(sympt_reg_predictions, Truth, .pred_Control)
roc_svm_presymp_pred <- roc_curve(sympt_svm_predictions, Truth, .pred_Control)
auc_svm_presymp_pred <- roc_auc(sympt_svm_predictions, Truth, .pred_Control)
roc_rf_presymp_pred <- roc_curve(sympt_rf_predictions, Truth, .pred_Control)
auc_rf_presymp_pred <- roc_auc(sympt_rf_predictions, Truth, .pred_Control)

# Build one ROC panel holding two curves and their AUC labels.
#   pred_roc / pred_auc:   roc_curve() and roc_auc() results for the other cohort
#   train_roc / train_auc: the same for the model's own test set
#   pred_color / train_color: color of each curve and its label
#   pred_xy / train_xy:    c(x, y) position of each AUC label
# Layers are added in the same order as in the original hand-written panels.
roc_panel <- function(pred_roc, train_roc, pred_auc, train_auc,
                      pred_color, train_color, pred_xy, train_xy) {
  ggplot(mapping = aes(x = 1 - specificity, y = sensitivity)) +
    # linewidth replaces the deprecated `size` argument for line geoms
    geom_path(data = pred_roc, color = pred_color, linewidth = 1.5) +
    geom_path(data = train_roc, color = train_color, linewidth = 1.5) +
    geom_abline(lty = 3) +
    ggplot2::annotate("text", x = pred_xy[1], y = pred_xy[2],
                      label = paste0("AUC:", round(pred_auc$.estimate, 4)),
                      color = pred_color, size = 4) +
    ggplot2::annotate("text", x = train_xy[1], y = train_xy[2],
                      label = paste0("AUC:", round(train_auc$.estimate, 4)),
                      color = train_color, size = 4) +
    coord_equal() +
    theme_classic()
}

# Top row: symptomatic models (other cohort in color 1, own test set in color 2)
sympt_row <-
  roc_panel(roc_rf_symp_pred, roc_rf_symp_train, auc_rf_symp_pred, auc_rf_symp_train,
            color_palette[1], color_palette[2], c(0.85, 0.50), c(0.2, 0.90)) +
  roc_panel(roc_svm_symp_pred, roc_svm_symp_train, auc_svm_symp_pred, auc_svm_symp_train,
            color_palette[1], color_palette[2], c(0.85, 0.50), c(0.2, 0.90)) +
  roc_panel(roc_reg_symp_pred, roc_reg_symp_train, auc_reg_symp_pred, auc_reg_symp_train,
            color_palette[1], color_palette[2], c(0.85, 0.50), c(0.2, 0.90)) +
  plot_layout(tag_level = "new")

# Bottom row: pre-symptomatic models (other cohort in color 2, own test set in color 1)
presympt_row <-
  roc_panel(roc_rf_presymp_pred, roc_rf_presymp_train, auc_rf_presymp_pred, auc_rf_presymp_train,
            color_palette[2], color_palette[1], c(0.75, 0.50), c(0.2, 1)) +
  roc_panel(roc_svm_presymp_pred, roc_svm_presymp_train, auc_svm_presymp_pred, auc_svm_presymp_train,
            color_palette[2], color_palette[1], c(0.75, 0.50), c(0.25, 1)) +
  roc_panel(roc_reg_presymp_pred, roc_reg_presymp_train, auc_reg_presymp_pred, auc_reg_presymp_train,
            color_palette[2], color_palette[1], c(0.75, 0.40), c(0.20, 1)) +
  plot_layout(tag_level = "new")

roc_figure <- sympt_row / presympt_row + patchwork::plot_annotation(tag_levels = c("A", 1))
roc_figure

# Pass the figure explicitly instead of relying on the last printed plot
ggsave(filename = "Plots/ROC_plots.png", plot = roc_figure, width = 12, height = 7, bg = "transparent")

```