
Commit 03092b3
Add files via upload
1 parent d211137
6 files changed: +325, -0 lines

LOOCV.R

+152
#-------------------------------------------------------------
# Leave-one-out Cross-validation (LOOCV) for regression models
# in R
#-------------------------------------------------------------

#--------------------------------------------------
# Example 1: Linear and Quadratic regression models
#--------------------------------------------------

# 1. generate artificial data
set.seed(2023)
n <- 300
x <- rnorm(n = n, mean = 5, sd = 3)
y <- x^2 + rnorm(n = n, mean = 0, sd = 8)
data <- data.frame(x = x, y = y)

# 2. plot the models that we want to fit
par(mfrow = c(1, 2), pty = "s")

model1 <- lm(y ~ x)
plot(x = x, y = y, main = 'Linear model', cex = 1.1, pch = 1, lwd = 1.2)
yhat1 <- model1$coef[1] + model1$coef[2] * x
lines(x, yhat1, lwd = 2.5, col = 'red')

model2 <- lm(y ~ x + I(x^2))
plot(x = x, y = y, main = 'Quadratic model', cex = 1.1, pch = 1, lwd = 1.2)
yhat2 <- model2$coef[1] + model2$coef[2] * x + model2$coef[3] * x^2
lines(x[order(x)], yhat2[order(x)], lwd = 2.5, col = 'red')

# 3. Cross-Validation
# fit the models on leave-one-out samples
pred.cv.mod1 <- pred.cv.mod2 <- numeric(n)

for(i in 1:n) {

  # linear model
  mod1 <- lm(y ~ x, data = data, subset = -i)
  pred.cv.mod1[i] <- predict(mod1, data[i, ])

  # quadratic model
  mod2 <- lm(y ~ x + I(x^2), data = data, subset = -i)
  pred.cv.mod2[i] <- predict(mod2, data[i, ])
}

MSE1 <- (1/n) * sum((y - pred.cv.mod1)^2)  # LOOCV estimate of prediction error
MSE2 <- (1/n) * sum((y - pred.cv.mod2)^2)

# Root Mean Squared Error (RMSE)
sqrt(c(MSE1, MSE2))
# [1] 15.68599 7.99332
# The second model (quadratic) has the lower RMSE and is therefore preferred.

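# Aside (not in the original script): for models fitted with lm(), the
# LOOCV residuals have a closed form, e_i / (1 - h_i), where h_i are the
# leverage (hat) values, so no refitting loop is needed. A minimal sketch,
# reusing model1 and model2 from step 2 above; it should reproduce the
# loop-based RMSEs up to rounding:
loocv.rmse <- function(fit) sqrt(mean((residuals(fit) / (1 - hatvalues(fit)))^2))
c(loocv.rmse(model1), loocv.rmse(model2))
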
#---------------------------------------
# Example 2: binomial regression models
#---------------------------------------

# 1. splitting the dataset into training and test sets
data("mtcars")
head(mtcars)
set.seed(2023)
ind <- sample(2, nrow(mtcars), replace = TRUE, prob = c(0.6, 0.4))
training <- mtcars[ind == 1, ]
testing  <- mtcars[ind == 2, ]

# save a copy of the entire dataset, training and testing sets in .csv
write.csv(mtcars,
          "C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars.csv",
          row.names = FALSE)
write.csv(training,
          "C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_training.csv",
          row.names = FALSE)
write.csv(testing,
          "C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_testing.csv",
          row.names = FALSE)

# 2. Cross-Validation
# fit the models on leave-one-out samples
# (each model is refit on mtcars with its i-th row dropped, then used to
# predict the i-th row of the test set; note that predict() returns
# values on the link scale here, not probabilities)
pred.cv.modl <- pred.cv.modp <- pred.cv.modc <- numeric(length = nrow(testing))

for(i in 1:nrow(testing)) {

  # logistic model
  modl <- glm(vs ~ mpg, data = mtcars,
              family = binomial(link = "logit"), subset = -i)
  pred.cv.modl[i] <- predict(modl, testing[i, ])

  # probit model
  modp <- glm(vs ~ mpg, data = mtcars,
              family = binomial(link = "probit"), subset = -i)
  pred.cv.modp[i] <- predict(modp, testing[i, ])

  # complementary log-log model
  modc <- glm(vs ~ mpg, data = mtcars,
              family = binomial(link = "cloglog"), subset = -i)
  pred.cv.modc[i] <- predict(modc, testing[i, ])
}

MSE1 <- (1/nrow(testing)) * sum((testing$vs - pred.cv.modl)^2)  # LOOCV estimate of prediction error
MSE2 <- (1/nrow(testing)) * sum((testing$vs - pred.cv.modp)^2)
MSE3 <- (1/nrow(testing)) * sum((testing$vs - pred.cv.modc)^2)

# Root Mean Squared Error (RMSE)
sqrt(c(MSE1, MSE2, MSE3))
# [1] 2.720540 1.485185 1.705154

# The second model (probit) has the lowest RMSE and is therefore preferred.

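# Aside (a sketch, not in the original script): because the predictions
# above are on the link scale, the RMSEs are not directly comparable with
# the 0/1 outcome. Assuming the intended metric is RMSE on predicted
# probabilities, the same comparison with type = "response" would be:
rmse.loocv <- function(link) {
  pred <- sapply(1:nrow(testing), function(i) {
    fit <- glm(vs ~ mpg, data = mtcars, subset = -i,
               family = binomial(link = link))
    predict(fit, testing[i, ], type = "response")
  })
  sqrt(mean((testing$vs - pred)^2))
}
sapply(c("logit", "probit", "cloglog"), rmse.loocv)
# (values will differ from those printed above)
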
# 3. Plot of the fit on the testing dataset, with annotation
# logistic fit
modl <- glm(vs ~ mpg, data = training,
            family = binomial(link = "logit"))

# probit fit
modp <- glm(vs ~ mpg, data = training,
            family = binomial(link = "probit"))

# complementary log-log fit
modc <- glm(vs ~ mpg, data = training,
            family = binomial(link = "cloglog"))

# complete the dataset with predictions for each model
testing$pred.modl <- predict(modl, type = 'response', newdata = testing)
testing$pred.modp <- predict(modp, type = 'response', newdata = testing)
testing$pred.modc <- predict(modc, type = 'response', newdata = testing)

# plot (requires ggplot2)
library(ggplot2)

# geom_line() draws points in order of x, so the predictions can be
# mapped directly without manual re-sorting
ggplot(testing, aes(x = mpg, y = vs)) +
  geom_point(size = 1.8) +
  geom_line(size = 1, aes(y = pred.modl, color = 'Logistic')) +
  geom_line(size = 1, aes(y = pred.modp, color = 'Probit')) +
  geom_line(size = 1, aes(y = pred.modc, color = 'Log-log')) +
  annotate('text', label = paste('Logistic RMSE = ', round(sqrt(MSE1), 2)), x = 25, y = 0.65, size = 3) +
  annotate('text', label = paste('Probit RMSE = ', round(sqrt(MSE2), 2)), x = 25, y = 0.58, size = 3) +
  annotate('text', label = paste('C Log-log RMSE = ', round(sqrt(MSE3), 2)), x = 25, y = 0.51, size = 3) +
  labs(title = 'Scatterplot - Fit of Logistic, Probit and Log-log models',
       subtitle = 'mtcars dataset', color = "Legend",
       y = "Engine (0 = V-shaped, 1 = straight)", x = "Miles/(US) gallon") +
  theme(axis.text = element_text(size = 8),
        axis.title = element_text(size = 8),
        plot.subtitle = element_text(size = 10, face = "italic", color = "darkred"),
        panel.background = element_rect(fill = "white", colour = "grey50"),
        panel.grid.major = element_line(colour = "grey90"))

#----
# end
#----

LOOCV_P.py

+106
#-------------------------------------------------------------
# Leave-one-out Cross-validation (LOOCV) for regression models
# in Python
#-------------------------------------------------------------

#--------------------------------------------------
# Example 1: Linear and Quadratic regression models
#--------------------------------------------------

# 1. generate artificial data
import numpy as np
np.random.seed(2023)

n = 300
x = np.random.normal(loc = 5, scale = 3, size = n)
y = x**2 + np.random.normal(loc = 0, scale = 8, size = n)

# 2. plot the models that we want to fit
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

px = 1/plt.rcParams['figure.dpi']
plt.figure(figsize=(850*px, 400*px))
plt.subplot(1, 2, 1)
plt.plot(x, y, 'o', fillstyle = 'none', color = 'black')
plt.title('Linear model', fontsize = 15)
plt.xlabel("x") ; plt.ylabel("y")
beta1, beta0 = np.polyfit(x, y, 1)
yhat1 = beta0 + beta1*x
plt.plot(x, yhat1, color = 'red')

plt.subplot(1, 2, 2)
plt.plot(x, y, 'o', fillstyle = 'none', color = 'black')
plt.xlabel('x') ; plt.ylabel('y') ; plt.title('Quadratic model', fontsize = 15)
beta2, beta1, beta0 = np.polyfit(x, y, 2)
yhat2 = beta0 + beta1*x + beta2*(x**2)
orders = np.argsort(x.ravel())
plt.plot(x[orders], yhat2[orders], color = 'red')

# 3. Cross-Validation
# fit the models on leave-one-out samples
import pandas as pd
data = pd.DataFrame({'x': x, 'y': y})
xn = data['x'].values.reshape(-1, 1)
yn = data['y'].values.reshape(-1, 1)

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import LeaveOneOut, cross_val_score

# linear model
mod1 = PolynomialFeatures(degree = 1, include_bias = False).fit_transform(xn)
mod11 = LinearRegression().fit(mod1, yn)

loocv1 = LeaveOneOut()
scoresmod1 = cross_val_score(mod11,
                             mod1,
                             yn,
                             scoring = 'neg_mean_squared_error',
                             cv = loocv1)

# quadratic model
mod2 = PolynomialFeatures(degree = 2, include_bias = False).fit_transform(xn)
mod22 = LinearRegression().fit(mod2, yn)

loocv2 = LeaveOneOut()
scoresmod2 = cross_val_score(mod22,
                             mod2,
                             yn,
                             scoring = 'neg_mean_squared_error',
                             cv = loocv2)

# Root Mean Squared Error (RMSE)
import statistics
import math

RMSE1 = math.sqrt(statistics.mean(abs(scoresmod1)))
RMSE2 = math.sqrt(statistics.mean(abs(scoresmod2)))
[RMSE1, RMSE2]
# [16.169293289892607, 7.873829105930071]
# The second model (quadratic) has the lower RMSE and is therefore preferred.

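# Aside (a sketch, not in the original script): the same pooled LOOCV
# RMSE can be obtained by collecting each left-out prediction with
# cross_val_predict instead of averaging per-fold scores:
from sklearn.model_selection import cross_val_predict
pred2 = cross_val_predict(mod22, mod2, yn, cv = LeaveOneOut())
math.sqrt(np.mean((yn - pred2.reshape(-1, 1))**2))  # should match RMSE2
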
#---------------------------------------
# Example 2: binomial regression models
#---------------------------------------

# 1. importing the data from .csv files
training = pd.read_csv("C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_training.csv")
testing = pd.read_csv("C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_testing.csv")
testing

# 2. Cross-Validation
# fit the models on leave-one-out samples
from sklearn.linear_model import LogisticRegression

# logistic model (fit on the training split; scikit-learn expects a
# 2-D feature matrix, hence the double brackets)
modl = LogisticRegression(random_state = 0)
modl = modl.fit(training[['mpg']], training['vs'])

#-----------
# unfinished
#-----------
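# Aside (a possible completion sketch, not from the original file):
# scikit-learn only offers the logit link, so probit and cloglog (as in
# the R version) would need another library such as statsmodels. A
# minimal LOOCV over the combined data, scoring predicted probabilities:
mtcars = pd.concat([training, testing], ignore_index = True)  # assumes the splits partition mtcars
X = mtcars[['mpg']].values
vs = mtcars['vs'].values

pred = np.zeros(len(vs))
for i in range(len(vs)):
    keep = np.arange(len(vs)) != i                    # leave observation i out
    fit = LogisticRegression().fit(X[keep], vs[keep])
    pred[i] = fit.predict_proba(X[[i]])[0, 1]         # P(vs = 1) for the held-out row

RMSE_logit = math.sqrt(np.mean((vs - pred)**2))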

LOOCV_Reg_R_Python.pdf

47.1 KB
Binary file not shown.

mtcars.csv

+33
"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
21,6,160,110,3.9,2.62,16.46,0,1,4,4
21,6,160,110,3.9,2.875,17.02,0,1,4,4
22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
10.4,8,460,215,3,5.424,17.82,0,0,3,4
14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
15,8,301,335,3.54,3.57,14.6,0,1,5,8
21.4,4,121,109,4.11,2.78,18.6,1,1,4,2

mtcars_testing.csv

+14
"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
15,8,301,335,3.54,3.57,14.6,0,1,5,8
21.4,4,121,109,4.11,2.78,18.6,1,1,4,2

mtcars_training.csv

+20
"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
21,6,160,110,3.9,2.62,16.46,0,1,4,4
21,6,160,110,3.9,2.875,17.02,0,1,4,4
22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
10.4,8,460,215,3,5.424,17.82,0,0,3,4
14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
