
Commit 03092b3
Add files via upload
1 parent d211137
6 files changed: +325, -0 lines

LOOCV.R

+152
#-------------------------------------------------------------
# Leave-one-out Cross-validation (LOOCV) for regression models
# in R
#-------------------------------------------------------------

#--------------------------------------------------
# Example 1: Linear and Quadratic regression models
#--------------------------------------------------

# 1. generate artificial data
set.seed(2023)
n <- 300
x <- rnorm(n = n, mean = 5, sd = 3)
y <- x^2 + rnorm(n = n, mean = 0, sd = 8)
data <- data.frame(x = x, y = y)

# 2. plot the models that we want to fit
par(mfrow = c(1, 2), pty = "s")

model1 <- lm(y ~ x)
plot(x = x, y = y, main = 'Linear model', cex = 1.1, pch = 1, lwd = 1.2)
yhat1 <- model1$coef[1] + model1$coef[2] * x
lines(x, yhat1, lwd = 2.5, col = 'red')

model2 <- lm(y ~ x + I(x^2))
plot(x = x, y = y, main = 'Quadratic model', cex = 1.1, pch = 1, lwd = 1.2)
yhat2 <- model2$coef[1] + model2$coef[2] * x + model2$coef[3] * x^2
lines(x[order(x)], yhat2[order(x)], lwd = 2.5, col = 'red')

# 3. Cross-Validation
# fit the models on leave-one-out samples
pred.cv.mod1 <- pred.cv.mod2 <- numeric(n)

for(i in 1:n) {

  # linear model
  mod1 <- lm(y ~ x, data = data, subset = -i)
  pred.cv.mod1[i] <- predict(mod1, data[i, ])

  # quadratic model
  mod2 <- lm(y ~ x + I(x^2), data = data, subset = -i)
  pred.cv.mod2[i] <- predict(mod2, data[i, ])
}

MSE1 <- (1/n) * sum((y - pred.cv.mod1)^2)  # LOOCV estimate of prediction error
MSE2 <- (1/n) * sum((y - pred.cv.mod2)^2)

# Root Mean Squared Error (RMSE)
sqrt(c(MSE1, MSE2))
# [1] 15.68599 7.99332
# The second model (quadratic) has the lower RMSE and is therefore preferred.

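# Aside (not in the original script): for models fitted with lm(), the
# LOOCV residuals have a closed form, e_i / (1 - h_i), where h_i are the
# leverage (hat) values, so no refitting loop is needed. A minimal sketch,
# reusing model1 and model2 from step 2 above; it should reproduce the
# loop-based RMSEs up to rounding:
loocv.rmse <- function(fit) sqrt(mean((residuals(fit) / (1 - hatvalues(fit)))^2))
c(loocv.rmse(model1), loocv.rmse(model2))
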
#---------------------------------------
# Example 2: binomial regression models
#---------------------------------------

# 1. splitting the dataset into training and test sets
data("mtcars")
head(mtcars)
set.seed(2023)
ind <- sample(2, nrow(mtcars), replace = TRUE, prob = c(0.6, 0.4))
training <- mtcars[ind == 1, ]
testing  <- mtcars[ind == 2, ]

# save a copy of the entire dataset, training and testing sets in .csv
write.csv(mtcars,
          "C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars.csv",
          row.names = FALSE)
write.csv(training,
          "C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_training.csv",
          row.names = FALSE)
write.csv(testing,
          "C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_testing.csv",
          row.names = FALSE)

# 2. Cross-Validation
# fit the models on leave-one-out samples
# (each model is refit on mtcars with its i-th row dropped, then used to
# predict the i-th row of the test set; note that predict() returns
# values on the link scale here, not probabilities)
pred.cv.modl <- pred.cv.modp <- pred.cv.modc <- numeric(length = nrow(testing))

for(i in 1:nrow(testing)) {

  # logistic model
  modl <- glm(vs ~ mpg, data = mtcars,
              family = binomial(link = "logit"), subset = -i)
  pred.cv.modl[i] <- predict(modl, testing[i, ])

  # probit model
  modp <- glm(vs ~ mpg, data = mtcars,
              family = binomial(link = "probit"), subset = -i)
  pred.cv.modp[i] <- predict(modp, testing[i, ])

  # complementary log-log model
  modc <- glm(vs ~ mpg, data = mtcars,
              family = binomial(link = "cloglog"), subset = -i)
  pred.cv.modc[i] <- predict(modc, testing[i, ])
}

MSE1 <- (1/nrow(testing)) * sum((testing$vs - pred.cv.modl)^2)  # LOOCV estimate of prediction error
MSE2 <- (1/nrow(testing)) * sum((testing$vs - pred.cv.modp)^2)
MSE3 <- (1/nrow(testing)) * sum((testing$vs - pred.cv.modc)^2)

# Root Mean Squared Error (RMSE)
sqrt(c(MSE1, MSE2, MSE3))
# [1] 2.720540 1.485185 1.705154

# The second model (probit) has the lowest RMSE and is therefore preferred.

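# Aside (a sketch, not in the original script): because the predictions
# above are on the link scale, the RMSEs are not directly comparable with
# the 0/1 outcome. Assuming the intended metric is RMSE on predicted
# probabilities, the same comparison with type = "response" would be:
rmse.loocv <- function(link) {
  pred <- sapply(1:nrow(testing), function(i) {
    fit <- glm(vs ~ mpg, data = mtcars, subset = -i,
               family = binomial(link = link))
    predict(fit, testing[i, ], type = "response")
  })
  sqrt(mean((testing$vs - pred)^2))
}
sapply(c("logit", "probit", "cloglog"), rmse.loocv)
# (values will differ from those printed above)
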
# 3. Plot of the fit on the testing dataset, with annotation
# logistic fit
modl <- glm(vs ~ mpg, data = training,
            family = binomial(link = "logit"))

# probit fit
modp <- glm(vs ~ mpg, data = training,
            family = binomial(link = "probit"))

# complementary log-log fit
modc <- glm(vs ~ mpg, data = training,
            family = binomial(link = "cloglog"))

# complete the dataset with predictions for each model
testing$pred.modl <- predict(modl, type = 'response', newdata = testing)
testing$pred.modp <- predict(modp, type = 'response', newdata = testing)
testing$pred.modc <- predict(modc, type = 'response', newdata = testing)

# plot (requires ggplot2)
library(ggplot2)

# geom_line() draws points in order of x, so the predictions can be
# mapped directly without manual re-sorting
ggplot(testing, aes(x = mpg, y = vs)) +
  geom_point(size = 1.8) +
  geom_line(size = 1, aes(y = pred.modl, color = 'Logistic')) +
  geom_line(size = 1, aes(y = pred.modp, color = 'Probit')) +
  geom_line(size = 1, aes(y = pred.modc, color = 'Log-log')) +
  annotate('text', label = paste('Logistic RMSE = ', round(sqrt(MSE1), 2)), x = 25, y = 0.65, size = 3) +
  annotate('text', label = paste('Probit RMSE = ', round(sqrt(MSE2), 2)), x = 25, y = 0.58, size = 3) +
  annotate('text', label = paste('C Log-log RMSE = ', round(sqrt(MSE3), 2)), x = 25, y = 0.51, size = 3) +
  labs(title = 'Scatterplot - Fit of Logistic, Probit and Log-log models',
       subtitle = 'mtcars dataset', color = "Legend",
       y = "Engine (0 = V-shaped, 1 = straight)", x = "Miles/(US) gallon") +
  theme(axis.text = element_text(size = 8),
        axis.title = element_text(size = 8),
        plot.subtitle = element_text(size = 10, face = "italic", color = "darkred"),
        panel.background = element_rect(fill = "white", colour = "grey50"),
        panel.grid.major = element_line(colour = "grey90"))

#----
# end
#----

LOOCV_P.py

+106
#-------------------------------------------------------------
# Leave-one-out Cross-validation (LOOCV) for regression models
# in Python
#-------------------------------------------------------------

#--------------------------------------------------
# Example 1: Linear and Quadratic regression models
#--------------------------------------------------

# 1. generate artificial data
import numpy as np
np.random.seed(2023)

n = 300
x = np.random.normal(loc = 5, scale = 3, size = n)
y = x**2 + np.random.normal(loc = 0, scale = 8, size = n)

# 2. plot the models that we want to fit
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

px = 1/plt.rcParams['figure.dpi']
plt.figure(figsize=(850*px, 400*px))
plt.subplot(1, 2, 1)
plt.plot(x, y, 'o', fillstyle = 'none', color = 'black')
plt.title('Linear model', fontsize = 15)
plt.xlabel("x") ; plt.ylabel("y")
beta1, beta0 = np.polyfit(x, y, 1)
yhat1 = beta0 + beta1*x
plt.plot(x, yhat1, color = 'red')

plt.subplot(1, 2, 2)
plt.plot(x, y, 'o', fillstyle = 'none', color = 'black')
plt.xlabel('x') ; plt.ylabel('y') ; plt.title('Quadratic model', fontsize = 15)
beta2, beta1, beta0 = np.polyfit(x, y, 2)
yhat2 = beta0 + beta1*x + beta2*(x**2)
orders = np.argsort(x.ravel())
plt.plot(x[orders], yhat2[orders], color = 'red')

# 3. Cross-Validation
# fit the models on leave-one-out samples
import pandas as pd
data = pd.DataFrame({'x': x, 'y': y})
xn = data['x'].values.reshape(-1, 1)
yn = data['y'].values.reshape(-1, 1)

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import LeaveOneOut, cross_val_score

# linear model
mod1 = PolynomialFeatures(degree = 1, include_bias = False).fit_transform(xn)
mod11 = LinearRegression().fit(mod1, yn)

loocv1 = LeaveOneOut()
scoresmod1 = cross_val_score(mod11,
                             mod1,
                             yn,
                             scoring = 'neg_mean_squared_error',
                             cv = loocv1)

# quadratic model
mod2 = PolynomialFeatures(degree = 2, include_bias = False).fit_transform(xn)
mod22 = LinearRegression().fit(mod2, yn)

loocv2 = LeaveOneOut()
scoresmod2 = cross_val_score(mod22,
                             mod2,
                             yn,
                             scoring = 'neg_mean_squared_error',
                             cv = loocv2)

# Root Mean Squared Error (RMSE)
import statistics
import math

RMSE1 = math.sqrt(statistics.mean(abs(scoresmod1)))
RMSE2 = math.sqrt(statistics.mean(abs(scoresmod2)))
[RMSE1, RMSE2]
# [16.169293289892607, 7.873829105930071]
# The second model (quadratic) has the lower RMSE and is therefore preferred.

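# Aside (a sketch, not in the original script): the same pooled LOOCV
# RMSE can be obtained by collecting each left-out prediction with
# cross_val_predict instead of averaging per-fold scores:
from sklearn.model_selection import cross_val_predict
pred2 = cross_val_predict(mod22, mod2, yn, cv = LeaveOneOut())
math.sqrt(np.mean((yn - pred2.reshape(-1, 1))**2))  # should match RMSE2
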
#---------------------------------------
# Example 2: binomial regression models
#---------------------------------------

# 1. importing the data from .csv files
training = pd.read_csv("C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_training.csv")
testing = pd.read_csv("C:/Users/julia/OneDrive/Desktop/github/16. Crossvalidation/mtcars_testing.csv")
testing

# 2. Cross-Validation
# fit the models on leave-one-out samples
from sklearn.linear_model import LogisticRegression

# logistic model (fit on the training split; scikit-learn expects a
# 2-D feature matrix, hence the double brackets)
modl = LogisticRegression(random_state = 0)
modl = modl.fit(training[['mpg']], training['vs'])

#-----------
# unfinished
#-----------
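# Aside (a possible completion sketch, not from the original file):
# scikit-learn only offers the logit link, so probit and cloglog (as in
# the R version) would need another library such as statsmodels. A
# minimal LOOCV over the combined data, scoring predicted probabilities:
mtcars = pd.concat([training, testing], ignore_index = True)  # assumes the splits partition mtcars
X = mtcars[['mpg']].values
vs = mtcars['vs'].values

pred = np.zeros(len(vs))
for i in range(len(vs)):
    keep = np.arange(len(vs)) != i                    # leave observation i out
    fit = LogisticRegression().fit(X[keep], vs[keep])
    pred[i] = fit.predict_proba(X[[i]])[0, 1]         # P(vs = 1) for the held-out row

RMSE_logit = math.sqrt(np.mean((vs - pred)**2))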

LOOCV_Reg_R_Python.pdf

47.1 KB
Binary file not shown.

mtcars.csv

+33
"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
21,6,160,110,3.9,2.62,16.46,0,1,4,4
21,6,160,110,3.9,2.875,17.02,0,1,4,4
22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
10.4,8,460,215,3,5.424,17.82,0,0,3,4
14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
15,8,301,335,3.54,3.57,14.6,0,1,5,8
21.4,4,121,109,4.11,2.78,18.6,1,1,4,2

mtcars_testing.csv

+14
"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
15,8,301,335,3.54,3.57,14.6,0,1,5,8
21.4,4,121,109,4.11,2.78,18.6,1,1,4,2

mtcars_training.csv

+20
"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
21,6,160,110,3.9,2.62,16.46,0,1,4,4
21,6,160,110,3.9,2.875,17.02,0,1,4,4
22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
10.4,8,460,215,3,5.424,17.82,0,0,3,4
14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
