-
Notifications
You must be signed in to change notification settings - Fork 0
/
project.R
120 lines (87 loc) · 5.3 KB
/
project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Load Necessary Libraries
library(glmnet)
library(dplyr)
library(readr)
library(ggplot2)
library(tidyr)
library(readr)
# Load Datasets
load("~/Downloads/microbiome/trainMicrobiomeData.RData") # Load the dataset
load("~/Downloads/microbiome/trainMicrobiomeMetadata.RData") # Load the meta_datasets
# Data Transposing
trainMicrobiomeData_t <- as.data.frame(t(trainMicrobiomeData))
colnames(trainMicrobiomeData_t) <- rownames(trainMicrobiomeData)
rownames(trainMicrobiomeData_t) <- NULL
trainMicrobiomeData_t$sample.ID <- rownames(t(trainMicrobiomeData))
merged_data <- merge(trainMicrobiomeData_t, trainMicrobiomeMetadata, by = "sample.ID") # Merge the data
merged_data$Total_Microbiome <- rowSums(merged_data[,2:223], na.rm = TRUE) # Total Sum Scaling - proportional normalization
# TSS - Proportional Normalization
merged_data_ratio <- merged_data
for (i in 2:223) {merged_data_ratio[, i] <- merged_data[, i] / merged_data$Total_Microbiome}
merged_data_ratio$Total_Microbiome <- NULL # Remove the 'Total_Microbiome' column as it's no longer needed
sampled_rows <- sample(1:nrow(merged_data_ratio), size = floor(4 * nrow(merged_data_ratio) / 5)) # Random Sampling 4 out of 5
merged_ratio_sampled <- merged_data_ratio[sampled_rows, ] # Create new data frames based on the sampled rows
## Parameters for Elastic Model Fitting
afeatures <- c("Barnesiella", "Odoribacter", "Streptococcus", "Anaerostipes", "Intestinimonas","Ruminococcus",
"Allisonella", "Evtepia", "Faecalimonas","Anaerobutyricum","Negativibacillus","Escherichia")
X_train <- as.matrix(merged_ratio_sampled[, features, drop = FALSE]) # bring feature matrix to model
y_train <- merged_ratio_sampled$Age # bring target variable(age)
X_question <- as.matrix(merged_data_ratio[, features, drop = FALSE]) # Prepare the data for prediction
lasso_model <- glmnet(X_train, y_train, alpha = 0.75, nlambda = 100) # Fit the Lasso model
age_estimations <- predict(lasso_model, s = 0.001, newx = X_question) # Make age estimations. s=0.01 is convention
merged_data_ratio$Estimated_Age <- as.numeric(age_estimations) # Convert the matrix to a numeric vector
## Lasso Bootstrapping
n_bootstrap <- 300 # Initialize a matrix to s
age_estimations_lasso_bootstrap <- matrix(0, nrow = nrow(X_question), ncol = n_bootstrap) # tore bootstrap estimates
for (i in 1:n_bootstrap) {
bootstrap_indices <- sample(1:nrow(X_train), replace = TRUE) # Resample the indices
X_bootstrap <- X_train[bootstrap_indices, ] # feature matrix_ bring each Iterations
y_bootstrap <- y_train[bootstrap_indices] # bring target variable each iterations
lasso_model_bootstrap <- glmnet(X_bootstrap, y_bootstrap, alpha = 0.75, nlambda = 100) # Fit the Lasso model
age_estimations_lasso_bootstrap[, i] <- as.numeric(predict(lasso_model_bootstrap, s = 0.001, newx = X_question)) # Age estimations Iterations
}
# Average across all bootstrap estimates
merged_data_ratio$Estimated_Age_Lasso_Bootstrap <- rowMeans(age_estimations_lasso_bootstrap)
# Extracts the coefficients of the Lasso model that was trained on the original dataset
lasso_coefficients <- coef(lasso_model, s = 0.001)
print("Lasso Coefficients:")
print(lasso_coefficients)
# Calculates the average of the coefficients from the Bootstrap Lasso model
average_lasso_bootstrap_coefficients <- rowMeans(as.matrix(lasso_model_bootstrap$beta))
print("Average Lasso Bootstrap Coefficients:")
print(average_lasso_bootstrap_coefficients)
## Verification Process
# Establish the data to long format
df_long <- merged_data_ratio %>%
select(Age, Estimated_Age, Estimated_Age_Lasso_Bootstrap) %>%
gather(key = "Estimation_Type", value = "Estimated_Age_Value", -Age)
# Calculate R-squared values
r_squared_values <- df_long %>%
group_by(Estimation_Type) %>%
summarise(R_squared = summary(lm(Estimated_Age_Value ~ Age))$r.squared)
# Calculate R-squared and RMSE values
metrics_values <- df_long %>%
group_by(Estimation_Type) %>%
summarise(
R_squared = summary(lm(Estimated_Age_Value ~ Age))$r.squared,
RMSE = sqrt(mean((Estimated_Age_Value - Age)^2))
)
# Create the scatter plot with facet grid
p <- ggplot(df_long, aes(x = Age, y = Estimated_Age_Value)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + # y = x linear line for age reference line
facet_wrap(~ Estimation_Type, scales = "free") +
ggtitle("Age vs Estimated Age (Lasso and Lasso Bootstrap)") +
guides(color = FALSE)+ # Remove color legend
geom_label(fill = "lightblue", color = "black", alpha = 0.8, hjust = 0, vjust = 0, data = metrics_values,
aes(label = paste("R-squared = ", round(R_squared, 2), "\nRMSE = ", round(RMSE, 2)), x = 30, y = 60), size = 4) +
theme(plot.title = element_text(hjust = 0.5), # Center the plot title
axis.title.x = element_text(margin = margin(t = 10)), # Adjust x-axis title margin
axis.title.y = element_text(margin = margin(r = 10)), # Adjust y-axis title margin
strip.text = element_text(size = 12, face = "bold"), # Customize facet labels
strip.background = element_blank(), # Remove facet labels background
legend.position = "none") # Remove legend
# Add R-squared and RMSE values
metrics_values$Estimation_Type <- factor(metrics_values$Estimation_Type, levels = unique(df_long$Estimation_Type))
print(p) # Display the plot